diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..f456dca7 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,49 @@ +# Python +__pycache__ +*.py[cod] +*$py.class +*.so +.Python +env/ +venv/ +*.egg-info/ +dist/ +build/ + +# IDEs +.vscode +.idea +*.swp +*.swo +*~ + +# OS +.DS_Store +.env +.env.local + +# Docker +.dockerignore +Dockerfile + +# Git +.git +.gitignore + +# Frontend (don't need for backend build) +node_modules/ +plexe/ui/frontend/dist/ +npm-debug.log + +# Test & coverage +.pytest_cache +.coverage +htmlcov/ + +# MLflow +mlruns/ + +# Misc +*.log +.cache/ +workdir/ diff --git a/.env.example b/.env.example new file mode 100644 index 00000000..04a73402 --- /dev/null +++ b/.env.example @@ -0,0 +1,38 @@ +# Postgres +POSTGRES_USER=mlflow +POSTGRES_PASSWORD=mlflow +POSTGRES_DB=mlflow_db + +# MLflow backend store URI (used by mlflow server and backend if you want to track experiments) +# Example: postgresql+psycopg2://mlflow:mlflow@localhost:5432/mlflow_db +MLFLOW_BACKEND_STORE_URI=postgresql+psycopg2://mlflow:mlflow@postgres:5432/mlflow_db + +# MLflow tracking URI (URL where mlflow server is running) +MLFLOW_TRACKING_URI=http://mlflow:5000 +MLFLOW_SERVER_ALLOWED_HOSTS=mlflow,localhost,127.0.0.1,localhost:5000,127.0.0.1:5000,mlflow:5000 + +# Frontend environment +# API URL for React frontend to connect to backend (adjust for your deployment) +REACT_APP_API_URL=http://localhost:8000 + +# pgAdmin credentials +PGADMIN_EMAIL=admin@example.com +PGADMIN_PASSWORD=admin + +PLEXE_CONVERSATIONAL_MODEL="gemini/gemini-2.5-flash" +PLEXE_ORCHESTRATOR_MODEL="gemini/gemini-2.5-flash" +PLEXE_ML_RESEARCHER_MODEL="gemini/gemini-2.5-flash" +PLEXE_ML_ENGINEER_MODEL="gemini/gemini-2.5-flash" +PLEXE_ML_OPS_ENGINEER_MODEL="gemini/gemini-2.5-flash" +PLEXE_TOOL_MODEL="gemini/gemini-2.5-flash" +PLEXE_TOOL_FEATURE_GENERATOR_MODEL="gemini/gemini-2.5-flash" + +# External API Keys for HPO Search and Literature Review +# Semantic Scholar API: https://www.semanticscholar.org/product/api#Partner-Form +SEMANTIC_SCHOLAR_API_KEY=your_semantic_scholar_api_key_here + +# OpenML API: https://www.openml.org/auth/sign-in +OPENML_API_KEY=your_openml_api_key_here + +# Hugging Face Token: https://huggingface.co/settings/tokens +HF_TOKEN=??? \ No newline at end of file diff --git a/.gitignore b/.gitignore index 325f22e2..040ed6d7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +plexe-old/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -168,6 +169,34 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ +# VSCode +.vscode/ +*.code-workspace + +# Sublime Text +*.sublime-project +*.sublime-workspace + +# Vim +*.swp +*.swo +*~ +.vim/ + +# Emacs +*~ +\#*\# +.\#* + +# OS files +.DS_Store +.DS_Store? 
+._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db + # PyPI configuration file .pypirc @@ -198,4 +227,93 @@ examples/datasets/* experiments/ notebooks/ +# ============== DOCKER RELATED ============== +# Docker build artifacts +.docker/ +docker-compose.override.yml + +# ============== FRONTEND (NODE/REACT) ============== +# Dependencies +node_modules/ +plexe/ui/frontend/node_modules/ + +# Production build +plexe/ui/frontend/dist/ +plexe/ui/frontend/build/ + +# Misc frontend +plexe/ui/frontend/.env.local +plexe/ui/frontend/.env.*.local +npm-debug.log* +yarn-debug.log* +yarn-error.log* +lerna-debug.log* + +# Frontend IDE +plexe/ui/frontend/.idea/ +plexe/ui/frontend/.vscode/ +plexe/ui/frontend/*.swp + +# ============== BACKEND ARTIFACTS ============== +# Generated model files +*.pkl +*.joblib +*.h5 +*.pt +*.pth +*.onnx + +# Temporary files +*.tmp +*.temp +*.bak +*.backup + +# ============== DATABASE ============== +# SQLite databases +*.sqlite +*.sqlite3 +*.db + +# ============== CREDENTIALS & SECRETS ============== +# Environment files (but keep .env.example) +.env +.env.local +.env.*.local +.env.prod + +# API keys and secrets +.aws/ +.gcloud/ +credentials.json +secrets/ + +# ============== PYTHON VIRTUAL ENVIRONMENTS ============== +venv/ +env/ +ENV/ +.venv + +# ============== PACKAGE MANAGER LOCK FILES ============== +# poetry.lock is typically committed, but you can ignore if preferred +# poetry.lock + +# Keep poetry.lock for reproducible builds +# (uncomment below if you prefer to ignore it) +poetry.lock **/.claude/settings.local.json + +# Data folder +data/ +data/uploads/ +data/uploads/* + +# temporal_datasets/* +# temporal_datasets/ +# output_datasets/* +# output_datasets/ + +model-from-chat.tar.gz +model_output_from_chat/ + +.workdir/ \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 100644 index 9d6df48d..00000000 --- a/CLAUDE.md +++ /dev/null @@ -1,37 +0,0 @@ -# CLAUDE.md: Plexe Coding Reference - -## Project Overview -Plexe is a framework for building ML models using natural language. It employs a multi-agent architecture where -specialized AI agents collaborate to analyze requirements, generate solutions, and build functional ML models. - -The core architecture is as follows: agents go in `plexe/agents/*`, tools in `plexe/tools/*`, prompt templates in -`plexe/templates/prompts/*`, and the main model code in `plexe/models.py`. This structure must be followed. - -## Key Components -- `plexe/models.py`: Core `Model` class with build/predict functionality -- `plexe/agents/schema_resolver.py`: Agent inferring input/output schemas -- `plexe/internal/agents.py`: Multi-agent system implementation (`PlexeAgent` class) -- `plexe/tools/`: Tools for code generation, execution, validation -- `plexe/config.py`: Configuration management and prompt templates -- `plexe/internal/common/registries/objects.py`: Shared object registry for agents -- `plexe/datasets.py`: Dataset handling and synthetic data generation -- `docs/architecture/multi-agent-system.md`: Architectural documentation -- `plexe/templates/prompts/`: Prompt templates for agents and LLM calls - -## Build/Run Commands -- Install deps: `poetry install` -- Format code: `poetry run black .` -- Lint code: `poetry run ruff check . 
--fix` -- Run tests: `poetry run pytest tests/` -- Run with coverage: `poetry run pytest --cov=plexe tests/` - -## Code Style -- **Functions**: Max 50 lines (excluding docstrings) -- **Formatting**: Black with 120 char line length -- **Linting**: Ruff with E203/E501/E402 ignored -- **Typing**: Type hints and Pydantic models required -- **Imports**: ALWAYS at top level in order: stdlib, third-party, local; NEVER inside functions -- **__init__.py**: No implementation code except in `plexe/__init__.py` -- **Docstrings**: Required for public APIs; Sphinx style -- **Testing**: Write pytest tests for all new functionality -- **Elegance**: Write the simplest solution possible; avoid over-engineering; prefer deleting code over adding code diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md deleted file mode 100644 index 8d9f11dc..00000000 --- a/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,128 +0,0 @@ -# Contributor Covenant Code of Conduct - -## Our Pledge - -We as members, contributors, and leaders pledge to make participation in our -community a harassment-free experience for everyone, regardless of age, body -size, visible or invisible disability, ethnicity, sex characteristics, gender -identity and expression, level of experience, education, socio-economic status, -nationality, personal appearance, race, religion, or sexual identity -and orientation. - -We pledge to act and interact in ways that contribute to an open, welcoming, -diverse, inclusive, and healthy community. - -## Our Standards - -Examples of behavior that contributes to a positive environment for our -community include: - -* Demonstrating empathy and kindness toward other people -* Being respectful of differing opinions, viewpoints, and experiences -* Giving and gracefully accepting constructive feedback -* Accepting responsibility and apologizing to those affected by our mistakes, - and learning from the experience -* Focusing on what is best not just for us as individuals, but for the - overall community - -Examples of unacceptable behavior include: - -* The use of sexualized language or imagery, and sexual attention or - advances of any kind -* Trolling, insulting or derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or email - address, without their explicit permission -* Other conduct which could reasonably be considered inappropriate in a - professional setting - -## Enforcement Responsibilities - -Community leaders are responsible for clarifying and enforcing our standards of -acceptable behavior and will take appropriate and fair corrective action in -response to any behavior that they deem inappropriate, threatening, offensive, -or harmful. - -Community leaders have the right and responsibility to remove, edit, or reject -comments, commits, code, wiki edits, issues, and other contributions that are -not aligned to this Code of Conduct, and will communicate reasons for moderation -decisions when appropriate. - -## Scope - -This Code of Conduct applies within all community spaces, and also applies when -an individual is officially representing the community in public spaces. -Examples of representing our community include using an official e-mail address, -posting via an official social media account, or acting as an appointed -representative at an online or offline event. 
- -## Enforcement - -Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported to the community leaders responsible for enforcement at -info@plexe.ai. -All complaints will be reviewed and investigated promptly and fairly. - -All community leaders are obligated to respect the privacy and security of the -reporter of any incident. - -## Enforcement Guidelines - -Community leaders will follow these Community Impact Guidelines in determining -the consequences for any action they deem in violation of this Code of Conduct: - -### 1. Correction - -**Community Impact**: Use of inappropriate language or other behavior deemed -unprofessional or unwelcome in the community. - -**Consequence**: A private, written warning from community leaders, providing -clarity around the nature of the violation and an explanation of why the -behavior was inappropriate. A public apology may be requested. - -### 2. Warning - -**Community Impact**: A violation through a single incident or series -of actions. - -**Consequence**: A warning with consequences for continued behavior. No -interaction with the people involved, including unsolicited interaction with -those enforcing the Code of Conduct, for a specified period of time. This -includes avoiding interactions in community spaces as well as external channels -like social media. Violating these terms may lead to a temporary or -permanent ban. - -### 3. Temporary Ban - -**Community Impact**: A serious violation of community standards, including -sustained inappropriate behavior. - -**Consequence**: A temporary ban from any sort of interaction or public -communication with the community for a specified period of time. No public or -private interaction with the people involved, including unsolicited interaction -with those enforcing the Code of Conduct, is allowed during this period. -Violating these terms may lead to a permanent ban. - -### 4. Permanent Ban - -**Community Impact**: Demonstrating a pattern of violation of community -standards, including sustained inappropriate behavior, harassment of an -individual, or aggression toward or disparagement of classes of individuals. - -**Consequence**: A permanent ban from any sort of public interaction within -the community. - -## Attribution - -This Code of Conduct is adapted from the [Contributor Covenant][homepage], -version 2.0, available at -https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. - -Community Impact Guidelines were inspired by [Mozilla's code of conduct -enforcement ladder](https://github.com/mozilla/diversity). - -[homepage]: https://www.contributor-covenant.org - -For answers to common questions about this code of conduct, see the FAQ at -https://www.contributor-covenant.org/faq. Translations are available at -https://www.contributor-covenant.org/translations. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md deleted file mode 100644 index a8c77c44..00000000 --- a/CONTRIBUTING.md +++ /dev/null @@ -1,108 +0,0 @@ -# Contributing to plexe - -Thank you for considering contributing to plexe! Your contributions help improve this project for everyone. 
- -## Table of Contents - -- [Code of Conduct](#code-of-conduct) -- [How Can I Contribute?](#how-can-i-contribute) - - [Reporting Bugs](#reporting-bugs) - - [Suggesting Enhancements](#suggesting-enhancements) - - [Submitting Pull Requests](#submitting-pull-requests) -- [Development Setup](#development-setup) -- [Style Guides](#style-guides) - - [Coding Standards](#coding-standards) - - [Commit Messages](#commit-messages) - -## Code of Conduct - -By participating in this project, you agree to uphold our [Code of Conduct](link-to-code-of-conduct), which outlines expectations for respectful and inclusive interactions. - -## How Can I Contribute? - -### Reporting Bugs - -If you encounter a bug, please: - -1. **Search Existing Issues**: Check if the issue has already been reported. -2. **Open a New Issue**: If not found, create a new issue and include: - - A descriptive title. - - Steps to reproduce the bug. - - Expected and actual behavior. - - Screenshots or code snippets, if applicable. - -### Suggesting Enhancements - -To propose new features or improvements: - -1. **Search Existing Issues**: Ensure the suggestion hasn't been made. -2. **Open a New Issue**: Provide: - - A clear description of the enhancement. - - Rationale for the suggestion. - - Any relevant examples or references. - -### Submitting Pull Requests - -For code contributions: - -1. **Fork the Repository**: Create your own copy of the repo. -2. **Create a Branch**: Use a descriptive name (e.g., `feature/new-model` or `bugfix/issue-123`). -3. **Make Changes**: Implement your changes with clear and concise code. -4. **Write Tests**: Ensure new features or bug fixes are covered by tests. -5. **Commit Changes**: Follow our commit message guidelines. -6. **Push to Your Fork**: Upload your changes. -7. **Open a Pull Request**: Provide a detailed description of your changes and reference any related issues. - -## Development Setup - -To set up the development environment: - -1. **Clone the Repository**: - - ```bash - git clone https://github.com/plexe-ai/plexe.git - cd plexe - ``` - -2. **Install Dependencies**: - - ``` - -pip install poetry - -poetry env activate or poetry shell depending on the Poetry version being used - -python setup.py - ``` - - -3. **Run Tests**: - - ```bash - pytest - ``` - -Ensure all tests pass before making contributions. - -## Style Guides - -### Coding Standards - -Adhere to PEP 8 guidelines for Python code. Key points include: - -- Use 4 spaces per indentation level. -- Limit lines to 79 characters. -- Use meaningful variable and function names. -- Include docstrings for all public modules, classes, and functions. 
- -### Commit Messages - -Write clear and concise commit messages: - -- **Format**: `(): ` - - **Type**: `feat`, `fix`, `docs`, `style`, `refactor`, `test`, `chore` - - **Scope**: Optional, e.g., `data`, `model` - - **Subject**: Brief description (max 50 characters) - -- **Example**: - - ```bash - feat(model): add support for gemini - ``` diff --git a/Dockerfile.dev b/Dockerfile.dev new file mode 100644 index 00000000..0f008921 --- /dev/null +++ b/Dockerfile.dev @@ -0,0 +1,65 @@ +# Lightweight Backend Dockerfile (Development) +# Excludes torch and transformers for faster builds +# Use this during development with: docker compose -f docker-compose.dev.yml build backend + +FROM python:3.11-slim as base + +WORKDIR /app + +ENV PIP_DEFAULT_TIMEOUT=100 \ + PIP_RETRIES=5 \ + PYTHONUNBUFFERED=1 \ + POETRY_NO_INTERACTION=1 \ + POETRY_VIRTUALENVS_CREATE=false + +RUN apt-get update && apt-get install -y --no-install-recommends \ + git \ + curl \ + && rm -rf /var/lib/apt/lists/* + +FROM base as deps + +RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \ + pip install --no-cache-dir poetry + + +COPY pyproject.toml poetry.lock* README.md ./ + +# Install without torch (torch is optional for development) +# This runs much faster! +# Install chatui extras to ensure uvicorn has websocket support +RUN poetry config virtualenvs.create false && \ + poetry install --no-interaction --no-ansi --no-root --only main --extras chatui 2>&1 || \ + (echo "First attempt failed, retrying..." && sleep 10 && poetry install --no-interaction --no-ansi --no-root --only main --extras chatui) || \ + (echo "Second attempt failed, retrying with timeout increase..." && sleep 15 && poetry install --no-interaction --no-ansi --no-root --only main --extras chatui) + +# Force install torch CPU version to ensure compatibility with PyG dependencies +RUN pip install torch==2.3.1 --index-url https://download.pytorch.org/whl/cpu + +# Install PyG dependencies (pyg_lib, etc.) for GNN support +RUN pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.3.1+cpu.html + + +# RUN pip install --no-cache-dir python-multipart + +FROM deps as app + +COPY . . + +RUN poetry config virtualenvs.create false && \ + poetry install --no-interaction --no-ansi --only main --extras chatui 2>&1 || \ + (echo "Retrying after delay..." 
&& sleep 10 && poetry install --no-interaction --no-ansi --only main --extras chatui)
+
+# Force reinstall torch and extensions to ensure compatibility (fix for poetry overwriting versions)
+RUN pip install torch==2.3.1 --index-url https://download.pytorch.org/whl/cpu && \
+    pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.3.1+cpu.html
+
+# Install sentence-transformers for text embeddings in GNN training
+RUN pip install sentence-transformers
+
+EXPOSE 8000
+
+HEALTHCHECK --interval=10s --timeout=5s --retries=3 --start-period=30s \
+    CMD python -c "import requests; requests.get('http://localhost:8000/health', timeout=2)"
+
+CMD ["python", "-m", "uvicorn", "plexe.server:app", "--host", "0.0.0.0", "--port", "8000", "--reload", "--reload-dir", "plexe"]
\ No newline at end of file
diff --git a/Dockerfile.gpu b/Dockerfile.gpu
new file mode 100644
index 00000000..42d7e18d
--- /dev/null
+++ b/Dockerfile.gpu
@@ -0,0 +1,101 @@
+FROM pytorch/pytorch:2.7.0-cuda12.8-cudnn9-runtime
+ENV DEBIAN_FRONTEND=noninteractive
+WORKDIR /app
+
+# ----------------------------
+# System dependencies
+# ----------------------------
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3.11 \
+    python3.11-dev \
+    python3.11-venv \
+    python3-pip \
+    git \
+    curl \
+    build-essential \
+    ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+# ----------------------------
+# Install uv (fast Python package manager)
+# ----------------------------
+RUN pip install --no-cache-dir uv
+
+# ----------------------------
+# Environment variables
+# ----------------------------
+ENV PYTHONUNBUFFERED=1 \
+    PIP_DEFAULT_TIMEOUT=100 \
+    PIP_RETRIES=5 \
+    UV_SYSTEM_PYTHON=1 \
+    UV_NO_CACHE=1
+
+# ----------------------------
+# STEP 1: Copy dependency files
+# ----------------------------
+COPY pyproject.toml poetry.lock* README.md ./
+
+# ----------------------------
+# STEP 2: Install dependencies with uv (installed directly to make sure everything is present)
+# ----------------------------
+RUN uv pip install \
+    python-dotenv pandas imbalanced-learn pydantic scikit-learn seaborn \
+    dataclasses-json bandit joblib mlxtend xgboost tenacity pyarrow \
+    litellm statsmodels hypothesis "numpy<2.0.0" black jinja2 platformdirs \
+    "ray>=2.9.0" rich smolagents deprecated python-multipart psycopg2-binary \
+    featuretools sqlalchemy pytorch-frame pooch duckdb scipy \
+    fastapi "uvicorn[standard]" websockets langgraph langchain langchain-openai langchain-google-genai langchain-anthropic \
+    requests openml scholarly kaggle "mcp[all]" nest_asyncio
+
+# ----------------------------
+# STEP 3: Install torchvision and torchaudio (torch already ships with the base image)
+# ----------------------------
+RUN uv pip install torchvision torchaudio
+
+# ----------------------------
+# STEP 4: PyTorch Geometric dependencies (installed with pip because of the PyG wheel index)
+# Per the official guide: https://pytorch-geometric.readthedocs.io
+# ----------------------------
+RUN pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv \
+    -f https://data.pyg.org/whl/torch-2.7.0+cu128.html
+
+# ----------------------------
+# STEP 5: torch-geometric
+# ----------------------------
+RUN uv pip install torch-geometric torch-frame
+
+# ----------------------------
+# STEP 6: sentence-transformers (for text embeddings in GNN training)
+# ----------------------------
+RUN uv pip install sentence-transformers
+
+# ----------------------------
+# STEP 7: MLflow
+# ----------------------------
+RUN uv pip 
install "mlflow>=2.14.0,<3.0.0"
+
+# ----------------------------
+# Source code
+# ----------------------------
+COPY . .
+
+# ----------------------------
+# Sanity check (build-time - GPU not available during build)
+# ----------------------------
+RUN python - <
+# Plexe: Agentic ML Framework with MCP Integration
-# plexe ✨
+Plexe is a multi-agent framework built on top of **LangGraph**, designed to automate the entire process of building Machine Learning models from natural language.
-[![PyPI version](https://img.shields.io/pypi/v/plexe.svg)](https://pypi.org/project/plexe/)
-[![Discord](https://img.shields.io/discord/1300920499886358529?logo=discord&logoColor=white)](https://discord.gg/SefZDepGMv)
+The system has been upgraded with the **Model Context Protocol (MCP)** to connect to external academic and data tools in a standardized way.
-backed-by-yc
+## 🚀 Key features after this revision
+1. **LangGraph multi-agent architecture**: Orchestrates the workflow across specialized agents (EDA, Dataset Builder, Task Builder, GNN Specialist).
+2. **MCP (Model Context Protocol) integration**:
+   * **Google Scholar**: Search scientific papers and extract author information directly via MCP.
+   * **Kaggle**: Search and download datasets from the Kaggle API through an MCP server.
+   * **Extensibility**: Add new MCP servers simply by updating `mcp_config.json` (see the configuration sketch below).
+3. **GPU support**: Optimized for training Graph Neural Networks (GNNs) with CUDA.
-Build machine learning models using natural language.
+## 🏗️ MCP system structure
-[Quickstart](#1-quickstart) |
-[Features](#2-features) |
-[Installation](#3-installation) |
-[Documentation](#4-documentation)
+* `mcp_config.json`: Configures the list of MCP servers and their launch parameters.
+* `plexe/langgraph/mcp_manager.py`: Manages connections, discovers tools, and converts MCP tools into LangChain tools.
+* `plexe/langgraph/mcp_servers/`: Directory containing the custom MCP servers (Scholar, Kaggle).
-
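+
+For illustration, adding a server is a single entry in `mcp_config.json`, following the `mcpServers` layout the project already uses (the `kaggle` entry and script path below are placeholders; see `plexe/langgraph/mcp_servers/` for the actual server scripts):
+
+```json
+{
+  "mcpServers": {
+    "kaggle": {
+      "command": "python",
+      "args": ["plexe/langgraph/mcp_servers/kaggle_server.py"]
+    }
+  }
+}
+```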
+## 🛠️ Installation & Usage
-**plexe** lets you create machine learning models by describing them in plain language. Simply explain what you want,
-and the AI-powered system builds a fully functional model through an automated agentic approach. Also available as a
-[managed cloud service](https://plexe.ai).
+### 1. Configure environment variables
+Create a `.env` file or update `docker-compose.gpu.yml` with the following values:
+```env
+# LLM Keys
+OPENAI_API_KEY=your_key
+GOOGLE_API_KEY=your_key
-
-
-Watch the demo on YouTube:
-[![Building an ML model with Plexe](resources/demo-thumbnail.png)](https://www.youtube.com/watch?v=bUwCSglhcXY)
-
-
-## 1. Quickstart
-
-### Installation
-```bash
-pip install plexe
-```
-
-### Using plexe
-
-You can use plexe as a Python library to build and train machine learning models:
-
-```python
-import plexe
-
-# Define the model
-model = plexe.Model(
-    intent="Predict sentiment from news articles",
-    input_schema={"headline": str, "content": str},
-    output_schema={"sentiment": str}
-)
-
-# Build and train the model
-model.build(
-    datasets=[your_dataset],
-    provider="openai/gpt-4o-mini",
-    max_iterations=10
-)
-
-# Use the model
-prediction = model.predict({
-    "headline": "New breakthrough in renewable energy",
-    "content": "Scientists announced a major advancement..."
-})
-
-# Save for later use
-plexe.save_model(model, "sentiment-model")
-loaded_model = plexe.load_model("sentiment-model.tar.gz")
-```
-
-## 2. Features
-
-### 2.1. 💬 Natural Language Model Definition
-Define models using plain English descriptions:
-
-```python
-model = plexe.Model(
-    intent="Predict housing prices based on features like size, location, etc.",
-    input_schema={"square_feet": int, "bedrooms": int, "location": str},
-    output_schema={"price": float}
-)
+# Kaggle (Required for the Kaggle MCP tool)
+KAGGLE_USERNAME=your_username
+KAGGLE_KEY=your_api_key
```
-### 2.2. 🤖 Multi-Agent Architecture
-The system uses a team of specialized AI agents to:
-- Analyze your requirements and data
-- Plan the optimal model solution
-- Generate and improve model code
-- Test and evaluate performance
-- Package the model for deployment
-
-### 2.3. 🎯 Automated Model Building
-Build complete models with a single method call:
-
-```python
-model.build(
-    datasets=[dataset_a, dataset_b],
-    provider="openai/gpt-4o-mini", # LLM provider
-    max_iterations=10, # Max solutions to explore
-    timeout=1800 # Optional time limit in seconds
-)
-```
-
-### 2.4. 🚀 Distributed Training with Ray
-
-Plexe supports distributed model training and evaluation with Ray for faster parallel processing:
-
-```python
-from plexe import Model
-
-# Optional: Configure Ray cluster address if using remote Ray
-# from plexe import config
-# config.ray.address = "ray://10.1.2.3:10001"
-
-model = Model(
-    intent="Predict house prices based on various features",
-    distributed=True # Enable distributed execution
-)
-
-model.build(
-    datasets=[df],
-    provider="openai/gpt-4o-mini"
-)
-```
-
-Ray distributes your workload across available CPU cores, significantly speeding up model generation and evaluation when exploring multiple model variants.
-
-### 2.5. 🎲 Data Generation & Schema Inference
-Generate synthetic data or infer schemas automatically:
-
-```python
-# Generate synthetic data
-dataset = plexe.DatasetGenerator(
-    description="Example dataset with features and target",
-    provider="openai/gpt-4o-mini",
-    schema={"features": str, "target": int}
-)
-dataset.generate(500) # Generate 500 samples
-
-# Infer schema from intent
-model = plexe.Model(intent="Predict customer churn based on usage patterns")
-model.build(provider="openai/gpt-4o-mini") # Schema inferred automatically
-```
-
-### 2.6. 
🌐 Multi-Provider Support
-Use your preferred LLM provider, for example:
-```python
-model.build(provider="openai/gpt-4o-mini") # OpenAI
-model.build(provider="anthropic/claude-3-opus") # Anthropic
-model.build(provider="ollama/llama2") # Ollama
-model.build(provider="huggingface/meta-llama/...") # Hugging Face
-```
-See [LiteLLM providers](https://docs.litellm.ai/docs/providers) for instructions and available providers.
-
-> [!NOTE]
-> Plexe *should* work with most LiteLLM providers, but we actively test only with `openai/*` and `anthropic/*`
-> models. If you encounter issues with other providers, please let us know.
-
-
-## 3. Installation
-
-### 3.1. Installation Options
-```bash
-pip install plexe # Standard installation, minimal dependencies
-pip install plexe[transformers] # Support for transformers, tokenizers, etc
-pip install plexe[chatui] # Local chat UI for model interaction
-pip install plexe[all] # All optional dependencies
-```
-
-### 3.2. API Keys
+### 2. Run with Docker
+Use Docker Compose to start the whole stack (including MLflow, Postgres, and the Plexe backend):
```bash
-# Set your preferred provider's API key
-export OPENAI_API_KEY=
-export ANTHROPIC_API_KEY=
-export GEMINI_API_KEY=
+docker compose -f docker-compose.gpu.yml up -d
```
-See [LiteLLM providers](https://docs.litellm.ai/docs/providers) for environment variable names.
-
-## 4. Documentation
-For full documentation, visit [docs.plexe.ai](https://docs.plexe.ai).
-
-## 5. Contributing
-See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines. Join our [Discord](https://discord.gg/SefZDepGMv) to connect with the team.
-
-## 6. License
-[Apache-2.0 License](LICENSE)
-## 7. Citation
-If you use Plexe in your research, please cite it as follows:
+### 3. How the agents work
+Every agent that inherits from `BaseAgent` automatically loads tools from the MCP servers configured in `mcp_config.json`. You can ask an agent in the Chat UI, for example:
+- *"Find the latest papers on GNNs on Google Scholar"*
+- *"Download the Titanic dataset from Kaggle and analyze it"*
-```bibtex
-@software{plexe2025,
-  author = {De Bernardi, Marcello AND Dubey, Vaibhav},
-  title = {Plexe: Build machine learning models using natural language.},
-  year = {2025},
-  publisher = {GitHub},
-  howpublished = {\url{https://github.com/plexe-ai/plexe}},
-}
+## 📝 Docker notes
+The system uses `Dockerfile.gpu`, based on `pytorch/pytorch:2.7.0-cuda12.8`, to ensure model-training performance. Additional libraries such as `scholarly`, `kaggle`, and `mcp[all]` are baked into the image at build time.
diff --git a/SECURITY.md b/SECURITY.md
deleted file mode 100644
index bedc60b0..00000000
--- a/SECURITY.md
+++ /dev/null
@@ -1,11 +0,0 @@
-# Security Policy
-
-The Plexe AI team and community take security vulnerabilities seriously. We appreciate your efforts to responsibly disclose any issues you discover, and we will make every effort to acknowledge your contributions.
-
-## Reporting a Vulnerability
-
-To report a security issue, please use the GitHub Security Advisory **"Report a Vulnerability"** tab in the [plexe repository](https://github.com/plexe-ai/plexe).
-
-The Plexe AI team will review your report and send a response indicating the next steps in handling your disclosure. After the initial reply, we will keep you informed of our progress toward a fix and public announcement, and we may request additional information or guidance.
-
-Thank you for helping to make `plexe` secure and reliable for everyone.
diff --git a/automl-agent b/automl-agent new file mode 160000 index 00000000..9e0d8429 --- /dev/null +++ b/automl-agent @@ -0,0 +1 @@ +Subproject commit 9e0d842959a09d96c00768c14d73bd9df6d39023 diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml new file mode 100644 index 00000000..9f0528cc --- /dev/null +++ b/docker-compose.dev.yml @@ -0,0 +1,95 @@ +# docker-compose.dev.yml +# Lightweight compose file for development +# Use: docker compose -f docker-compose.dev.yml up -d +# This skips torch installation for faster dev cycles + +services: + postgres: + image: postgres:15 + # restart: "no" + environment: + POSTGRES_USER: ${POSTGRES_USER:-mlflow} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-mlflow} + POSTGRES_DB: ${POSTGRES_DB:-mlflow_db} + volumes: + - postgres_data:/var/lib/postgresql/data + ports: + - "5432:5432" + networks: + - plexe-net + + mlflow: + build: + context: . + dockerfile: Dockerfile.mlflow + # restart: "no" + depends_on: + - postgres + environment: + BACKEND_STORE_URI: ${MLFLOW_BACKEND_STORE_URI:-postgresql+psycopg2://mlflow:mlflow@postgres:5432/mlflow_db} + ARTIFACT_ROOT: /mlruns + MLFLOW_ALLOW_ORIGINS: ${MLFLOW_ALLOW_ORIGINS:-*} + MLFLOW_SERVER_ALLOWED_HOSTS: ${MLFLOW_SERVER_ALLOWED_HOSTS:-mlflow,localhost,127.0.0.1,localhost:5000,127.0.0.1:5000,mlflow:5000} + volumes: + - ./mlruns:/mlruns + ports: + - "5000:5000" + networks: + - plexe-net + + pgadmin: + image: dpage/pgadmin4:7 + # restart: "no" + environment: + PGADMIN_DEFAULT_EMAIL: ${PGADMIN_EMAIL:-admin@example.com} + PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_PASSWORD:-admin} + ports: + - "8080:80" + networks: + - plexe-net + + frontend: + build: + context: ./plexe/ui/frontend + dockerfile: Dockerfile.dev + # restart: "no" + volumes: + - ./plexe/ui/frontend/src:/app/src + - ./plexe/ui/frontend/index.html:/app/index.html + - ./plexe/ui/frontend/vite.config.js:/app/vite.config.js + - /app/node_modules + ports: + - "3000:3000" + environment: + VITE_BACKEND_URL: ${VITE_BACKEND_URL:-http://localhost:8000} + networks: + - plexe-net + + backend: + build: + context: . + dockerfile: Dockerfile.dev + # restart: no + depends_on: + postgres: + condition: service_started + environment: + MLFLOW_TRACKING_URI: ${MLFLOW_TRACKING_URI:-http://mlflow:5000} + MLFLOW_BACKEND_STORE_URI: ${MLFLOW_BACKEND_STORE_URI:-postgresql+psycopg2://mlflow:mlflow@postgres:5432/mlflow_db} + MLFLOW_HTTP_REQUEST_TIMEOUT: 300 + HF_TOKEN: ${HF_TOKEN} + HUGGINGFACE_API_KEY: ${HUGGINGFACE_API_KEY} + volumes: + - .:/app + ports: + - "8000:8000" + networks: + - plexe-net + + +volumes: + postgres_data: + +networks: + plexe-net: + driver: bridge diff --git a/docker-compose.gpu.yml b/docker-compose.gpu.yml new file mode 100644 index 00000000..51c7771c --- /dev/null +++ b/docker-compose.gpu.yml @@ -0,0 +1,117 @@ +# docker-compose.gpu.yml +# GPU-enabled compose file for training +# Use: docker compose -f docker-compose.gpu.yml up -d + +services: + postgres: + image: postgres:15 + environment: + POSTGRES_USER: ${POSTGRES_USER:-mlflow} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-mlflow} + POSTGRES_DB: ${POSTGRES_DB:-mlflow_db} + volumes: + - postgres_data:/var/lib/postgresql/data + ports: + - "5432:5432" + networks: + - plexe-net + + mlflow: + build: + context: . 
+ dockerfile: Dockerfile.mlflow + depends_on: + - postgres + environment: + BACKEND_STORE_URI: ${MLFLOW_BACKEND_STORE_URI:-postgresql+psycopg2://mlflow:mlflow@postgres:5432/mlflow_db} + ARTIFACT_ROOT: /mlruns + MLFLOW_ALLOW_ORIGINS: ${MLFLOW_ALLOW_ORIGINS:-*} + MLFLOW_SERVER_ALLOWED_HOSTS: ${MLFLOW_SERVER_ALLOWED_HOSTS:-mlflow,localhost,127.0.0.1,localhost:5000,127.0.0.1:5000,mlflow:5000} + volumes: + - ./mlruns:/mlruns + ports: + - "5000:5000" + networks: + - plexe-net + + pgadmin: + image: dpage/pgadmin4:7 + environment: + PGADMIN_DEFAULT_EMAIL: ${PGADMIN_EMAIL:-admin@example.com} + PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_PASSWORD:-admin} + ports: + - "8080:80" + networks: + - plexe-net + + frontend: + build: + context: ./plexe/ui/frontend + dockerfile: Dockerfile.dev + depends_on: + - backend + volumes: + - ./plexe/ui/frontend/src:/app/src + - ./plexe/ui/frontend/index.html:/app/index.html + - ./plexe/ui/frontend/vite.config.js:/app/vite.config.js + - /app/node_modules + ports: + - "3000:3000" + environment: + # Use 'backend' as the host for Docker inter-container communication + VITE_BACKEND_HOST: backend + # Don't set VITE_BACKEND_URL so frontend uses same-origin and goes through Vite proxy + networks: + - plexe-net + + backend: + build: + context: . + dockerfile: Dockerfile.gpu + depends_on: + postgres: + condition: service_started + environment: + MLFLOW_TRACKING_URI: ${MLFLOW_TRACKING_URI:-http://mlflow:5000} + MLFLOW_BACKEND_STORE_URI: ${MLFLOW_BACKEND_STORE_URI:-postgresql+psycopg2://mlflow:mlflow@postgres:5432/mlflow_db} + MLFLOW_HTTP_REQUEST_TIMEOUT: 300 + HF_TOKEN: ${HF_TOKEN} + HUGGINGFACE_API_KEY: ${HUGGINGFACE_API_KEY} + # API Keys for LLM providers + OPENAI_API_KEY: ${OPENAI_API_KEY} + GOOGLE_API_KEY: ${GOOGLE_API_KEY:-${GEMINI_API_KEY}} + ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY} + # Plexe Agent Model Configuration + PLEXE_ORCHESTRATOR_MODEL: ${PLEXE_ORCHESTRATOR_MODEL:-gemini/gemini-2.0-flash-exp} + PLEXE_CONVERSATIONAL_MODEL: ${PLEXE_CONVERSATIONAL_MODEL:-gemini/gemini-2.0-flash-exp} + PLEXE_EDA_MODEL: ${PLEXE_EDA_MODEL:-gemini/gemini-2.0-flash-exp} + PLEXE_DATASET_BUILDER_MODEL: ${PLEXE_DATASET_BUILDER_MODEL:-gemini/gemini-2.0-flash-exp} + PLEXE_TASK_BUILDER_MODEL: ${PLEXE_TASK_BUILDER_MODEL:-gemini/gemini-2.0-flash-exp} + PLEXE_GNN_SPECIALIST_MODEL: ${PLEXE_GNN_SPECIALIST_MODEL:-gemini/gemini-2.0-flash-exp} + PLEXE_OPERATION_MODEL: ${PLEXE_OPERATION_MODEL:-gemini/gemini-2.0-flash-exp} + PLEXE_AGENT_TEMPERATURE: ${PLEXE_AGENT_TEMPERATURE:-0.1} + PLEXE_VERBOSE: ${PLEXE_VERBOSE:-true} + # Kaggle Credentials for MCP + KAGGLE_USERNAME: ${KAGGLE_USERNAME} + KAGGLE_KEY: ${KAGGLE_KEY} + MCP_CONFIG_PATH: /app/mcp_config.json + volumes: + - .:/app + ports: + - "8100:8100" + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + networks: + - plexe-net + +volumes: + postgres_data: + +networks: + plexe-net: + driver: bridge diff --git a/docs/architecture/langgraph-multi-agent-system.md b/docs/architecture/langgraph-multi-agent-system.md new file mode 100644 index 00000000..438477ab --- /dev/null +++ b/docs/architecture/langgraph-multi-agent-system.md @@ -0,0 +1,474 @@ +# Plexe LangGraph Multi-Agent System + +This document describes the LangGraph-based multi-agent system for building ML models from relational databases. 
+ +## Architecture Overview + +The system consists of specialized agents coordinated by a LangGraph StateGraph workflow: + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ PlexeOrchestrator │ +│ (LangGraph StateGraph) │ +├─────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │Conversational│───▶│ Graph │───▶│ Dataset │ │ +│ │ Agent │ │ Architect │ │ Builder │ │ +│ └──────────────┘ └──────────────┘ └──────────────┘ │ +│ │ │ │ │ +│ │ │ ▼ │ +│ │ │ ┌──────────────┐ │ +│ │ │ │ Task │ │ +│ │ │ │ Builder │ │ +│ │ │ └──────────────┘ │ +│ │ │ │ │ +│ │ │ ▼ │ +│ │ │ ┌──────────────┐ │ +│ │ │ │ GNN │ │ +│ │ │ │ Specialist │ │ +│ │ │ └──────────────┘ │ +│ │ │ │ │ +│ │ │ ▼ │ +│ │ │ ┌──────────────┐ │ +│ └───────────────────┴───────────▶│ Operation │ │ +│ │ Agent │ │ +│ └──────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +## Agent Descriptions + +### 1. Conversational Agent +**Role**: User interaction and requirements gathering + +**Responsibilities**: +- Guide users through ML problem definition +- Validate data availability (CSV or database) +- Extract prediction targets and entity types +- Get explicit user confirmation before proceeding + +**Tools**: +- `get_dataset_preview`: Preview CSV data +- `validate_db_connection`: Test database connectivity + +### 2. Relational Graph Architect Agent +**Role**: Schema analysis and data export + +**Responsibilities**: +- Connect to databases and analyze schema +- Identify primary keys, foreign keys, and temporal columns +- Classify tables (Fact vs Dimension) +- Export all tables to CSV format + +**Tools**: +- `validate_db_connection`: Verify database access +- `export_tables_to_csv`: Export data to CSV +- `extract_schema_metadata`: Analyze schema structure + +### 3. Dataset Builder Agent +**Role**: Generate RelBench Dataset classes + +**Responsibilities**: +- Analyze CSV files and schema metadata +- Determine temporal split timestamps +- Generate complete Python Dataset class (GenDataset) +- Handle data cleaning and preprocessing + +**Tools**: +- `get_csv_files_info`: List CSV files +- `get_temporal_statistics`: Analyze timestamps +- `register_dataset_code`: Save generated code + +**Output**: `dataset.py` with GenDataset class + +### 4. Task Builder Agent +**Role**: Generate RelBench Task classes + +**Responsibilities**: +- Understand prediction requirements +- Design SQL queries for label computation +- Generate complete Python Task class (GenTask) +- Define metrics and evaluation + +**Tools**: +- `get_csv_files_info`: Understand data structure +- `test_sql_query`: Validate SQL queries +- `register_task_code`: Save generated code + +**Output**: `task.py` with GenTask class + +### 5. 
Relational GNN Specialist Agent +**Role**: GNN training script generation with Training-Free HPO + +**Responsibilities**: +- **Training-Free Hyperparameter Search**: Query external knowledge sources (literature, benchmarks) to find optimal hyperparameters without expensive trial-and-error +- Generate optimized training scripts using plexe.relbench.modeling +- Configure HeteroEncoder, HeteroTemporalEncoder, HeteroGraphSAGE +- Select hyperparameters based on dataset characteristics and task type +- **Handoff to Operation Agent**: Prepare training script for execution + +**Key Innovation**: Implements "Training-Free Model Search and HPO" approach from AutoML-Agent paper, leveraging: +- Academic literature search for best practices +- Benchmark database queries for proven configurations +- Heuristic-based selection using dataset characteristics +- Ensemble voting across multiple knowledge sources + +**Tools**: +- `search_optimal_hyperparameters`: Find hyperparameters based on dataset characteristics +- `search_literature_for_hyperparameters`: Query academic papers +- `get_benchmark_hyperparameters`: Retrieve from benchmark leaderboards +- `compare_hyperparameter_configs`: Ensemble multiple sources +- `generate_training_script`: Create train_script.py with optimal hyperparameters + +**Output**: +- `train_script.py` with optimized hyperparameters +- Hyperparameter selection reasoning +- Literature/benchmark references + +### 6. Operation Agent +**Role**: Training execution, monitoring, and finalization + +**Responsibilities**: +- **Execute training scripts** generated by GNN Specialist +- Monitor training progress and handle errors +- Process training results and metrics +- Handle timeouts and retries +- Generate inference code +- Package model artifacts for deployment + +**Tools**: +- `execute_training_script`: Run training scripts +- `save_artifact`: Save files + +**Output**: +- `best_model.pt`: Trained model +- `training_results.json`: Performance metrics +- Final pipeline summary + +## Pipeline Flow + +### Phase 1: Conversation +``` +User Message → Conversational Agent → Requirements Gathered + │ + ▼ + User Confirmation Required? 
+ │ + Yes ───┴─── No + │ │ + ▼ └──▶ Continue Conversation + Proceed to Pipeline +``` + +### Phase 2: Schema Analysis +``` +Database Connection → Graph Architect Agent → Schema Metadata + │ + ▼ + Export Tables to CSV + │ + ▼ + Register in ObjectRegistry +``` + +### Phase 3: Dataset Building +``` +CSV Files + Schema → Dataset Builder Agent → GenDataset Class + │ + ▼ + Analyze Temporal Data + │ + ▼ + Generate dataset.py +``` + +### Phase 4: Task Building +``` +Schema + User Intent → Task Builder Agent → GenTask Class + │ + ▼ + Design SQL Query + │ + ▼ + Test and Validate + │ + ▼ + Generate task.py +``` + +### Phase 5: GNN Training (Training-Free HPO) +``` +dataset.py + task.py → GNN Specialist Agent → Optimized Training Script + │ + ├──▶ Search Optimal Hyperparameters + │ (Training-Free HPO) + │ │ + │ ├─ Query Literature + │ ├─ Check Benchmarks + │ ├─ Apply Heuristics + │ └─ Ensemble Results + │ + ▼ + Generate train_script.py + (with optimal hyperparameters) + │ + ▼ + Handoff to Operation Agent +``` + +**Key Innovation**: Training-Free Hyperparameter Optimization +- No trial-and-error training runs +- Leverage external knowledge sources +- Intelligent hyperparameter selection based on: + * Dataset characteristics (size, temporal nature) + * Academic literature best practices + * Benchmark leaderboard results + * Task type requirements + +### Phase 6: Operation (Training Execution) +``` +Training Script → Operation Agent → Trained Model + Results + │ + ├──▶ Execute Training Script + │ │ + │ └─ Monitor Progress + │ + ├──▶ Process Results + │ │ + │ └─ Extract Metrics + │ + ▼ + Generate Inference Code + │ + ▼ + Package Artifacts + │ + ▼ + Package Artifacts +``` + +## Configuration + +### Environment Variables + +Configure each agent's model via `.env`: + +```bash +# API Keys +OPENAI_API_KEY=your-key +ANTHROPIC_API_KEY=your-key +GEMINI_API_KEY=your-key + +# Agent Models (format: provider/model-name) +PLEXE_ORCHESTRATOR_MODEL="openai/gpt-4o" +PLEXE_CONVERSATIONAL_MODEL="openai/gpt-4o" +PLEXE_GRAPH_ARCHITECT_MODEL="openai/gpt-4o" +PLEXE_DATASET_BUILDER_MODEL="openai/gpt-4o" +PLEXE_TASK_BUILDER_MODEL="openai/gpt-4o" +PLEXE_GNN_SPECIALIST_MODEL="openai/gpt-4o" +PLEXE_OPERATION_MODEL="openai/gpt-4o" + +# Agent Settings +PLEXE_AGENT_TEMPERATURE="0.1" +PLEXE_MAX_RETRIES="3" +PLEXE_VERBOSE="false" +``` + +### Supported Model Formats +- OpenAI: `openai/gpt-4o`, `openai/gpt-4-turbo` +- Anthropic: `anthropic/claude-sonnet-4-20250514`, `anthropic/claude-3-opus` +- Google: `gemini/gemini-2.5-flash`, `gemini/gemini-pro` + +## Usage + +### Basic Usage + +```python +from plexe import PlexeOrchestrator + +orchestrator = PlexeOrchestrator(verbose=True) + +result = orchestrator.run( + user_message="Build a model to predict user churn", + db_connection_string="postgresql://user:pass@localhost:5432/mydb", +) + +print(f"Status: {result['status']}") +print(f"Working directory: {result['working_dir']}") +``` + +### Interactive Chat + +```python +from plexe import PlexeOrchestrator + +orchestrator = PlexeOrchestrator() + +# Start session +result = orchestrator.run( + user_message="I want to predict customer churn", + session_id="my-session" +) + +# Continue conversation +response = orchestrator.chat( + message="The data is in a PostgreSQL database", + session_id="my-session" +) +``` + +### Custom Configuration + +```python +from plexe import PlexeOrchestrator, AgentConfig + +config = AgentConfig( + orchestrator_model="anthropic/claude-sonnet-4-20250514", + conversational_model="openai/gpt-4o", + 
gnn_specialist_model="openai/gpt-4o", + temperature=0.2, + max_retries=5, +) + +orchestrator = PlexeOrchestrator(config=config) +``` + +## State Management + +The pipeline uses a shared `PipelineState` that flows between agents: + +```python +class PipelineState(TypedDict): + session_id: str + working_dir: str + current_phase: str + messages: List[Message] + user_intent: str + db_connection_string: Optional[str] + csv_dir: Optional[str] + schema_info: Optional[SchemaInfo] + dataset_info: Optional[DatasetInfo] + task_info: Optional[TaskInfo] + training_result: Optional[TrainingResult] + generated_code: Dict[str, str] + artifacts: List[str] + errors: List[str] + warnings: List[str] +``` + +## Error Handling + +The system includes automatic error recovery: + +1. **Retry Mechanism**: Failed steps are retried up to `PLEXE_MAX_RETRIES` times +2. **Error Routing**: Errors route to the error handler node +3. **Graceful Degradation**: Pipeline can continue from last successful state + +## Generated Artifacts + +After successful execution, the working directory contains: + +``` +workdir/session-YYYYMMDD-HHMMSS/ +├── csv_files/ # Exported table data +│ ├── users.csv +│ ├── orders.csv +│ └── ... +├── cache/ # Graph cache +├── dataset.py # GenDataset class +├── task.py # GenTask class +├── train_script.py # GNN training script +├── best_model.pt # Trained model weights +├── training_results.json # Metrics and metadata +└── inference_code.py # Inference utilities +``` + +## Key Components + +### plexe.relbench.modeling Modules + +The GNN training uses these core modules: + +- **make_pkey_fkey_graph**: Converts Database to HeteroData graph +- **HeteroEncoder**: Encodes tabular features to embeddings +- **HeteroTemporalEncoder**: Encodes temporal information +- **HeteroGraphSAGE**: GNN message passing layers +- **NeighborLoader**: Temporal-aware batch sampling + +### ObjectRegistry + +Shared state storage for cross-agent communication: + +```python +from plexe.core.object_registry import ObjectRegistry + +registry = ObjectRegistry() +registry.register(str, "schema_metadata", data) +data = registry.get(dict, "schema_metadata") +``` + +## Extension Points + +### Adding Custom Tools + +```python +from langchain_core.tools import tool + +@tool +def my_custom_tool(param: str) -> dict: + """Description of what the tool does.""" + return {"result": "value"} + +# Add to agent +agent = DatasetBuilderAgent(additional_tools=[my_custom_tool]) +``` + +### Custom Agents + +```python +from plexe.langgraph.agents.base import BaseAgent + +class MyCustomAgent(BaseAgent): + def __init__(self, config=None): + super().__init__( + agent_type="custom", + config=config, + tools=[...], + ) + + @property + def system_prompt(self) -> str: + return "Your custom system prompt..." +``` + +## Performance Considerations + +- **Model Selection**: Use faster models (gpt-4o-mini) for simpler agents +- **Batch Size**: Adjust based on GPU memory (default: 512) +- **Epochs**: Start with 10 epochs, increase if needed +- **Caching**: Graph construction is cached for repeated runs + +## Troubleshooting + +### Common Issues + +1. **Database Connection Failed** + - Verify connection string format + - Check network access to database + - Ensure credentials are correct + +2. **Training OOM Error** + - Reduce batch_size + - Reduce num_neighbors + - Use CPU if GPU memory insufficient + +3. **SQL Query Errors** + - Verify column names (snake_case in PostgreSQL) + - Test queries with test_sql_query tool + - Check temporal window logic + +4. 
**Missing Dependencies** + - Run `poetry install` to install all dependencies + - Ensure LangGraph packages are installed diff --git a/docs/external-api-setup.md b/docs/external-api-setup.md new file mode 100644 index 00000000..75a8210a --- /dev/null +++ b/docs/external-api-setup.md @@ -0,0 +1,162 @@ +# External API Configuration Guide +## Overview + +Plexe's hyperparameter optimization tools use external APIs to search academic literature and benchmarks for optimal hyperparameters. This eliminates the need for expensive trial-and-error training runs. + +## Required API Keys + +### 1. Semantic Scholar API + +**Purpose**: Search academic papers for hyperparameter recommendations +**Cost**: Free for academic use (up to 100 requests/minute with API key, 1/minute without) + +**How to get it:** +1. Visit https://www.semanticscholar.org/product/api +2. Click "Apply for an API Key" or visit the Partner Form +3. Fill out the application form (usually approved within 1-2 business days) +4. You'll receive an API key via email + +**Configuration:** +```bash +export SEMANTIC_SCHOLAR_API_KEY=your_api_key_here +``` + +Or add to `.env` file: +``` +SEMANTIC_SCHOLAR_API_KEY=your_api_key_here +``` + +### 2. OpenML API Key + +**Purpose**: Access benchmark datasets and model performance data +**Cost**: Free + +**How to get it:** +1. Visit https://www.openml.org/auth/sign-in +2. Create an account (or sign in with GitHub) +3. Go to your profile settings +4. Click on "API authentication" section +5. Your API key will be displayed there + +**Configuration:** +```bash +export OPENML_API_KEY=your_api_key_here +``` + +Or add to `.env` file: +``` +OPENML_API_KEY=your_api_key_here +``` + +### 3. Hugging Face Token (Optional) + +**Purpose**: Access Hugging Face datasets and models +**Cost**: Free + +**How to get it:** +1. Visit https://huggingface.co/settings/tokens +2. Create an account if you don't have one +3. Click "New token" +4. Give it a name (e.g., "plexe-hpo") +5. Select "Read" permissions +6. Copy the generated token + +**Configuration:** +```bash +export HF_TOKEN=your_token_here +``` + +Or add to `.env` file: +``` +HF_TOKEN=your_token_here +``` + +## APIs That Don't Require Keys + +### arXiv API + +**Purpose**: Search preprint papers +**No API key required** - Rate limited to 1 request per 3 seconds +**Documentation**: https://info.arxiv.org/help/api/index.html + +### Papers With Code API + +**Purpose**: Access benchmark leaderboards and SOTA results +**No API key required** - Rate limited +**Documentation**: https://paperswithcode.com/api/v1/docs/ + +## Configuration File + +Create a `.env` file in the project root: + +```bash +# Copy from .env.example +cp .env.example .env +``` + +Then edit `.env` and add your API keys: + +```bash +# External API Keys for HPO Search +SEMANTIC_SCHOLAR_API_KEY=your_semantic_scholar_api_key_here +OPENML_API_KEY=your_openml_api_key_here +HF_TOKEN=your_huggingface_token_here +``` + +## Rate Limits + +Be aware of rate limits for each service: + +| Service | With API Key | Without API Key | +|---------|-------------|-----------------| +| Semantic Scholar | 100 req/min | 1 req/min | +| arXiv | N/A | 1 req/3s (~20/min) | +| Papers With Code | N/A | ~20 req/min | +| OpenML | 20 req/min | 10 req/min | + +The clients include automatic rate limiting to respect these limits. 
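+
+As a quick smoke test of the keyless arXiv endpoint, a minimal sketch (this is not the project's client; it only demonstrates the query endpoint and the one-request-per-3-seconds pacing):
+
+```python
+import time
+import xml.etree.ElementTree as ET
+
+import requests
+
+ATOM = "{http://www.w3.org/2005/Atom}"
+
+def arxiv_titles(query: str, max_results: int = 3) -> list[str]:
+    """Return the titles of the first few arXiv results for a search query."""
+    resp = requests.get(
+        "http://export.arxiv.org/api/query",
+        params={"search_query": f"all:{query}", "start": 0, "max_results": max_results},
+        timeout=30,
+    )
+    resp.raise_for_status()
+    time.sleep(3)  # respect the ~1 request / 3 seconds guideline before the next call
+    root = ET.fromstring(resp.text)
+    return [entry.findtext(f"{ATOM}title", "").strip() for entry in root.iter(f"{ATOM}entry")]
+
+print(arxiv_titles("relational graph neural network"))
+```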
+
+## Troubleshooting
+
+### "API key not found" error
+
+Make sure your `.env` file is in the project root and the key names match exactly:
+- `SEMANTIC_SCHOLAR_API_KEY`
+- `OPENML_API_KEY`
+- `HF_TOKEN`
+
+### "Rate limit exceeded" error
+
+The clients automatically handle rate limiting, but if you see this error:
+1. Wait a few minutes
+2. Reduce the number of concurrent requests
+3. Consider getting an API key if using a service without one
+
+### "Connection error" or "Timeout"
+
+1. Check your internet connection
+2. Verify the API service is online (check status pages)
+3. Try increasing the timeout in the client configuration
+
+## Using MCP (Model Context Protocol)
+
+**Note**: The current implementation uses direct REST API calls. To use MCP:
+
+1. Install the MCP client library (the same `mcp` package the project's Dockerfiles install):
+```bash
+pip install "mcp[all]"
+```
+
+2. Configure MCP servers in your MCP configuration file
+
+3. Update the API clients to use MCP instead of direct HTTP requests
+
+MCP provides a standardized way to access these services with better error handling, caching, and composability. See the MCP documentation for more details: https://modelcontextprotocol.io/
+
+## Security Best Practices
+
+1. **Never commit API keys** to version control
+2. Add `.env` to `.gitignore` (already done in this project)
+3. Use environment variables in production
+4. Rotate API keys periodically
+5. Use minimal permissions (e.g., read-only tokens for Hugging Face)
\ No newline at end of file
diff --git a/docs/mcp-hpo-explanation.md b/docs/mcp-hpo-explanation.md
new file mode 100644
index 00000000..23e930f2
--- /dev/null
+++ b/docs/mcp-hpo-explanation.md
@@ -0,0 +1,283 @@
+# MCP HPO Integration in Plexe
+
+## Overview
+
+Plexe uses **MCP (Model Context Protocol)** to implement **Training-Free Hyperparameter Optimization (HPO)**. Instead of running many training jobs to find optimal hyperparameters, the system queries external knowledge sources (papers, benchmarks) for configurations that have already been proven.
+
+## MCP Architecture
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│                   GNN Specialist Agent                       │
+│                 (needs hyperparameters)                      │
+└──────────────────────────┬──────────────────────────────────┘
+                           │
+                           ▼
+┌─────────────────────────────────────────────────────────────┐
+│                       MCP Manager                            │
+│           (converts MCP tools → LangChain tools)             │
+└──────────────────────────┬──────────────────────────────────┘
+                           │
+                           ▼
+┌─────────────────────────────────────────────────────────────┐
+│                      HPO MCP Server                          │
+│                 (hpo_server.py - FastMCP)                    │
+├─────────────────────────────────────────────────────────────┤
+│  Tools:                                                      │
+│  ✓ search_optimal_hyperparameters()                          │
+│  ✓ extract_hyperparameters_from_papers()                     │
+│  ✓ get_benchmark_hyperparameters()                           │
+│  ✓ compare_hyperparameter_configs()                          │
+└──────────────────────────┬──────────────────────────────────┘
+                           │
+           ┌───────────────┼───────────────┐
+           ▼               ▼               ▼
+    ┌──────────┐    ┌──────────┐    ┌──────────────┐
+    │  arXiv   │    │  Papers  │    │   Dataset    │
+    │   API    │    │   With   │    │  Heuristics  │
+    │          │    │   Code   │    │              │
+    └──────────┘    └──────────┘    └──────────────┘
+```
+
+## Components
+
+### 1. MCP Config ([mcp_config.json](../mcp_config.json))
+
+```json
+{
+  "mcpServers": {
+    "hpo-search": {
+      "command": "python",
+      "args": ["plexe/langgraph/mcp_servers/hpo_server.py"]
+    }
+  }
+}
+```
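+
+Each configured server is a standalone script that talks MCP over stdio. As a minimal sketch of what such a server looks like, assuming the `mcp` Python SDK's `FastMCP` helper (the tool below is a toy illustration, not the actual `hpo_server.py`):
+
+```python
+from mcp.server.fastmcp import FastMCP
+
+mcp = FastMCP("hpo-search-demo")
+
+@mcp.tool()
+def recommend_batch_size(num_nodes: int) -> dict:
+    """Toy tool: suggest a batch size from dataset scale."""
+    return {"batch_size": 256 if num_nodes < 10_000 else 512}
+
+if __name__ == "__main__":
+    mcp.run()  # stdio transport, so an MCP client can spawn this script as a subprocess
+```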
+### 2. HPO MCP Server ([plexe/langgraph/mcp_servers/hpo_server.py](../plexe/langgraph/mcp_servers/hpo_server.py))
+
+**The 4 main MCP tools:**
+
+#### `search_optimal_hyperparameters()`
+- **Input**: task_type, num_nodes, num_tables, is_temporal
+- **Output**: Hyperparameters based on heuristics and dataset scale
+- **Logic**: Applies rules from the literature (GraphSAGE, RelBench papers)
+- **Example**:
+  ```python
+  {
+      "hyperparameters": {
+          "hidden_channels": 128,
+          "batch_size": 512,
+          "learning_rate": 0.008,  # 0.01 * 0.8 for temporal
+          "num_gnn_layers": 2,
+          "epochs": 15
+      },
+      "reasoning": "Based on 15000 nodes, temporal task...",
+      "confidence": "high"
+  }
+  ```
+
+#### `extract_hyperparameters_from_papers()`
+- **Input**: paper_query, model_type, num_papers
+- **Output**: Hyperparameters extracted from real papers
+- **Logic**:
+  1. Query the arXiv API
+  2. Parse paper abstracts
+  3. Extract hyperparams with regex (learning rate, batch size, etc.)
+  4. Aggregate multiple papers
+- **Example**:
+  ```python
+  {
+      "papers_analyzed": 5,
+      "papers_with_hyperparams": 3,
+      "extracted_hyperparameters": [
+          {
+              "source_paper": "GraphSAGE: ...",
+              "hyperparameters": {
+                  "learning_rate": 0.01,
+                  "batch_size": 512
+              }
+          }
+      ],
+      "aggregated_hyperparameters": {
+          "learning_rate": 0.01,  # median across 3 papers
+          "batch_size": 512
+      }
+  }
+  ```
+
+#### `get_benchmark_hyperparameters()`
+- **Input**: task_type, dataset_domain, model_architecture
+- **Output**: Hyperparameters from benchmark leaderboards
+- **Logic**: Queries the Papers With Code API for SOTA results
+- **Example**:
+  ```python
+  {
+      "hyperparameters": {
+          "hidden_channels": 128,
+          "num_gnn_layers": 2,
+          "learning_rate": 0.005
+      },
+      "benchmarks_referenced": 2,
+      "benchmark_papers": [
+          {"paper_title": "...", "paper_url": "..."}
+      ]
+  }
+  ```
+
+#### `compare_hyperparameter_configs()`
+- **Input**: List of configs from different sources, strategy
+- **Output**: Ensemble recommendation
+- **Logic**: Ensemble voting (median for numeric, mode for categorical)
+- **Example**:
+  ```python
+  {
+      "recommended_hyperparameters": {
+          "learning_rate": 0.008,  # median of [0.01, 0.008, 0.005]
+          "batch_size": 512,       # mode
+          "hidden_channels": 128   # median of [128, 128, 256]
+      },
+      "num_sources": 3,
+      "confidence": "high"
+  }
+  ```
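+
+A condensed sketch of the ensemble rule described above (assuming the configs arrive as plain dicts; this is an illustration, not the server's actual implementation):
+
+```python
+from statistics import median, mode
+
+def ensemble_configs(configs: list[dict]) -> dict:
+    """Median for numeric hyperparameters, mode (majority vote) for everything else."""
+    merged = {}
+    for key in {k for cfg in configs for k in cfg}:
+        values = [cfg[key] for cfg in configs if key in cfg]
+        if all(isinstance(v, (int, float)) and not isinstance(v, bool) for v in values):
+            merged[key] = median(values)
+        else:
+            merged[key] = mode(values)
+    return merged
+
+# median([0.01, 0.008, 0.005]) -> 0.008, matching the example output above
+print(ensemble_configs([
+    {"learning_rate": 0.01, "batch_size": 512, "hidden_channels": 128},
+    {"learning_rate": 0.008, "batch_size": 512, "hidden_channels": 128},
+    {"learning_rate": 0.005, "batch_size": 256, "hidden_channels": 256},
+]))
+```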
+## Comparison: Before vs After + +### ❌ Before (MCP not used properly) + +``` +GNN Agent → tools/hpo_search.py → external_api_clients.py → Direct API calls + ↓ + Only returns paper metadata, no hyperparameters +``` + +**Problems:** +- The MCP servers (arxiv, semantic-scholar) only return paper info +- The HPO tools call the API clients directly, bypassing MCP +- No tool actually extracts hyperparameters + +### ✅ After (proper MCP integration) + +``` +GNN Agent → MCP Manager → HPO MCP Server → arXiv API + Regex extraction + ↓ + Returns concrete, ready-to-use hyperparameters +``` + +**Improvements:** +- A dedicated HPO MCP server for hyperparameter search +- Extracts hyperparameters from paper text (learning rate, batch size, etc.; see the sketch below) +- Aggregates multiple sources +- Returns a standardized format for the training script +
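+To make the extraction pipeline concrete, here is a minimal, self-contained sketch of "query arXiv → regex-extract hyperparameters → aggregate". It only illustrates the approach; it is not the `hpo_server.py` implementation, and the query string and regex patterns are simplified assumptions:
+
+```python
+# Sketch: fetch a few arXiv abstracts and scan them for reported hyperparameters.
+import re
+import urllib.request
+import xml.etree.ElementTree as ET
+from statistics import median
+
+ARXIV_URL = "http://export.arxiv.org/api/query?search_query=all:GraphSAGE&max_results=5"
+
+PATTERNS = {
+    "learning_rate": re.compile(r"learning rate (?:of |= ?)?(\d*\.\d+)", re.I),
+    "batch_size": re.compile(r"batch size (?:of |= ?)?(\d+)", re.I),
+}
+
+with urllib.request.urlopen(ARXIV_URL) as resp:
+    feed = ET.fromstring(resp.read())
+
+ns = {"atom": "http://www.w3.org/2005/Atom"}
+abstracts = [entry.text or "" for entry in feed.findall(".//atom:summary", ns)]
+
+hits = {name: [] for name in PATTERNS}
+for abstract in abstracts:
+    for name, pattern in PATTERNS.items():
+        hits[name].extend(float(m) for m in pattern.findall(abstract))
+
+# Aggregate across papers, mirroring the "aggregated_hyperparameters" output above.
+aggregated = {name: median(values) for name, values in hits.items() if values}
+print(aggregated)
+```
+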
+## Benefits + +### 1. **Training-Free HPO** +No need to run 10-20 training experiments to tune hyperparameters. Instead: +- Query 5 papers → extract their hyperparameters → aggregate +- Save hours of GPU time + +### 2. **Knowledge-Based Optimization** +Leverage knowledge from: +- Thousands of papers on arXiv +- Benchmark leaderboards (Papers With Code) +- Proven heuristics from the literature + +### 3. **Explainable Recommendations** +Each hyperparameter comes with its reasoning: +``` +"Learning rate = 0.008 because: + - Paper A used 0.01 for similar task + - Paper B used 0.005 + - Median = 0.008 + - Temporal adjustment: -20%" +``` + +### 4. **Scalable Architecture** +New MCP servers can be added easily: +- An OpenML server for dataset benchmarks +- A Kaggle server for competition winners +- Custom servers for an internal knowledge base + +## Testing + +Run the test to verify the integration: + +```bash +python test_mcp_hpo_integration.py +``` + +Expected output: +``` +1. Initializing MCP Manager... +✓ MCP Manager initialized successfully + Connected to 5 MCP servers + +2. Checking loaded MCP tools... +✓ Loaded 12 total MCP tools +✓ Found 4 HPO-related tools: + - search_optimal_hyperparameters + - extract_hyperparameters_from_papers + - get_benchmark_hyperparameters + - compare_hyperparameter_configs + +3. Testing HPO tools... +✓ All tools return hyperparameters successfully +``` + +## Future Enhancements + +1. **LLM-based extraction**: Replace regex with an LLM to extract hyperparameters more accurately +2. **Meta-learning**: Learn from past experiments to predict optimal hyperparameters +3. **Active learning**: Query specific papers based on dataset characteristics +4. **Benchmark database**: Cache benchmark results for faster queries + +## Conclusion + +MCP in Plexe is not just infrastructure for connecting to external APIs. It is the core component of the **Training-Free HPO strategy**, enabling: +- ✅ Finding optimal hyperparameters WITHOUT expensive training +- ✅ Leveraging the collective knowledge of the research community +- ✅ Explainable and reproducible recommendations +- ✅ An extensible architecture for future enhancements diff --git a/examples/csv_examples/badges.csv b/examples/csv_examples/badges.csv new file mode 100644 index 00000000..25fdc8cb --- /dev/null +++ b/examples/csv_examples/badges.csv @@ -0,0 +1,2001 @@ +id,user_id,class,name,tag_based,date +0,4,3,Teacher,False,2010-07-19 19:39:07.047 +8,4,3,Student,False,2010-07-19 19:39:07.890 +45,60,3,Critic,False,2010-07-19 19:39:09.280 +76,60,3,Autobiographer,False,2010-07-19 19:39:10.387 +100,82,3,Supporter,False,2010-07-19 19:49:07.210 +104,60,3,Teacher,False,2010-07-19 19:59:06.860 +111,22,3,Critic,False,2010-07-19 19:59:07.170 +113,22,3,Teacher,False,2010-07-19 20:04:06.817 +114,60,3,Editor,False,2010-07-19 20:04:06.927 +118,60,3,Supporter,False,2010-07-19 20:09:07.117 +121,4,3,Editor,False,2010-07-19 20:14:07.853 +129,4,3,Supporter,False,2010-07-19 20:19:06.937 +143,22,3,Supporter,False,2010-07-19 20:34:06.900 +185,114,3,Supporter,False,2010-07-19 21:39:07.203 +190,4,3,Self-Learner,False,2010-07-19 21:44:07.207 +191,4,3,Commentator,False,2010-07-19 21:44:07.270 +199,4,3,Mortarboard,False,2010-07-19 22:04:06.923 +201,114,3,Student,False,2010-07-19 22:14:07.037 +213,132,3,Autobiographer,False,2010-07-19 23:09:07.133 +214,132,3,Supporter,False,2010-07-19 23:14:07.220 +219,132,3,Organizer,False,2010-07-19 23:44:07.193 +224,132,3,Teacher,False,2010-07-20 00:04:07.293 +230,60,3,Student,False,2010-07-20 00:59:07.700 +248,155,3,Supporter,False,2010-07-20 02:59:08.163 +252,155,3,Teacher,False,2010-07-20 03:29:08.207 +253,155,3,Editor,False,2010-07-20 03:44:08.403 +265,132,3,Student,False,2010-07-20 05:09:08.837 +272,132,3,Editor,False,2010-07-20 05:29:08.990 +277,166,3,Supporter,False,2010-07-20 05:59:09.160 +279,166,3,Critic,False,2010-07-20 06:04:09.120 +283,65,3,Student,False,2010-07-20 06:59:10.320 +285,166,3,Student,False,2010-07-20 06:59:10.803 +287,169,3,Teacher,False,2010-07-20 07:04:08.233 +296,155,3,Student,False,2010-07-20 07:04:08.637 +306,169,3,Supporter,False,2010-07-20 07:39:08.397 +313,166,3,Scholar,False,2010-07-20 08:39:08.643 +316,114,3,Teacher,False,2010-07-20 08:54:10.213 +317,166,3,Editor,False,2010-07-20 08:54:10.387 +319,166,3,Teacher,False,2010-07-20 09:04:09.020 +320,169,3,Student,False,2010-07-20 09:09:08.743 +326,65,3,Supporter,False,2010-07-20 09:59:09.223 +333,132,3,Mortarboard,False,2010-07-20 10:29:09.383 +339,182,3,Editor,False,2010-07-20 11:39:11.587 +340,182,3,Autobiographer,False,2010-07-20 11:44:09.990 +345,182,3,Supporter,False,2010-07-20 12:04:12.897 +354,169,3,Editor,False,2010-07-20 13:59:12.243 +362,182,3,Student,False,2010-07-20 14:39:11.913 +373,4,3,Organizer,False,2010-07-20 15:54:11.770 +395,182,3,Teacher,False,2010-07-20 19:27:26.953 +403,60,3,Mortarboard,False,2010-07-20 22:07:27.690 +413,192,3,Editor,False,2010-07-21 00:37:32.570 +414,166,3,Commentator,False,2010-07-21 00:52:32.650 +424,190,3,Supporter,False,2010-07-21 06:03:13.123 +425,190,3,Teacher,False,2010-07-21 06:28:12.870 +427,190,3,Critic,False,2010-07-21 06:38:12.897 +432,190,3,Autobiographer,False,2010-07-21 07:13:12.837 +433,169,3,Scholar,False,2010-07-21 07:43:12.610 +435,211,3,Autobiographer,False,2010-07-21 07:58:12.557 +436,211,3,Supporter,False,2010-07-21 08:03:12.543 +438,192,3,Teacher,False,2010-07-21 08:13:12.473 +440,211,3,Teacher,False,2010-07-21 08:48:12.267
+443,4,3,Critic,False,2010-07-21 09:53:12.377 +455,186,3,Editor,False,2010-07-21 13:43:12.500 +468,114,3,Editor,False,2010-07-21 18:38:12.960 +475,60,3,Nice Question,False,2010-07-21 21:38:14.050 +490,190,3,Student,False,2010-07-22 09:58:18.857 +497,190,3,Commentator,False,2010-07-22 12:38:19.360 +498,190,3,Organizer,False,2010-07-22 12:58:19.647 +500,190,3,Editor,False,2010-07-22 13:08:19.393 +506,169,3,Critic,False,2010-07-22 14:58:19.490 +515,4,3,Nice Question,False,2010-07-22 17:58:19.657 +516,186,3,Student,False,2010-07-22 18:03:19.647 +523,132,3,Scholar,False,2010-07-23 01:48:25.107 +526,132,3,Critic,False,2010-07-23 09:53:25.727 +532,4,3,Nice Question,False,2010-07-23 15:48:28.130 +534,4,3,Scholar,False,2010-07-23 15:53:28.160 +540,132,3,Self-Learner,False,2010-07-23 22:18:29.410 +544,132,3,Commentator,False,2010-07-24 01:48:30.397 +559,4,3,Disciplined,False,2010-07-25 09:23:35.437 +565,155,3,Critic,False,2010-07-25 13:53:36.047 +568,22,3,Student,False,2010-07-25 20:23:37.090 +570,186,3,Cleanup,False,2010-07-26 10:38:41.317 +578,190,3,Scholar,False,2010-07-26 18:28:41.673 +579,60,3,Commentator,False,2010-07-26 18:38:41.700 +599,261,3,Supporter,False,2010-07-26 20:03:42.580 +602,261,3,Teacher,False,2010-07-26 20:08:42.490 +608,132,3,Nice Answer,False,2010-07-26 20:18:42.543 +618,232,3,Teacher,False,2010-07-26 21:18:42.687 +640,4,3,Nice Answer,False,2010-07-27 01:48:43.540 +660,166,3,Promoter,False,2010-07-27 05:18:44.860 +663,132,3,Cleanup,False,2010-07-27 06:43:44.823 +672,190,3,Self-Learner,False,2010-07-27 07:18:44.720 +678,4,2,Beta,False,2010-07-27 09:13:44.623 +684,22,2,Beta,False,2010-07-27 09:13:44.700 +699,60,2,Beta,False,2010-07-27 09:13:45.293 +711,132,2,Beta,False,2010-07-27 09:13:46.540 +714,155,2,Beta,False,2010-07-27 09:13:46.930 +718,166,2,Beta,False,2010-07-27 09:13:47.027 +720,169,2,Beta,False,2010-07-27 09:13:47.073 +722,182,2,Beta,False,2010-07-27 09:13:47.227 +724,190,2,Beta,False,2010-07-27 09:13:47.430 +725,192,2,Beta,False,2010-07-27 09:13:47.463 +729,211,2,Beta,False,2010-07-27 09:13:47.570 +742,346,3,Teacher,False,2010-07-27 10:33:44.457 +749,346,3,Editor,False,2010-07-27 12:33:44.297 +756,132,3,Nice Answer,False,2010-07-27 12:58:44.340 +771,346,3,Supporter,False,2010-07-27 14:18:44.997 +798,232,3,Supporter,False,2010-07-27 17:28:44.927 +801,261,3,Nice Answer,False,2010-07-27 17:58:45.043 +821,232,3,Critic,False,2010-07-27 22:08:47.057 +835,287,3,Teacher,False,2010-07-28 01:48:52.780 +873,346,3,Autobiographer,False,2010-07-28 15:28:54.083 +878,182,3,Scholar,False,2010-07-28 16:53:54.413 +884,261,3,Student,False,2010-07-28 18:18:55.040 +894,190,3,Nice Question,False,2010-07-28 22:23:56.027 +900,60,3,Nice Question,False,2010-07-29 01:58:59.820 +902,155,3,Scholar,False,2010-07-29 02:23:59.933 +905,436,3,Autobiographer,False,2010-07-29 06:39:01.130 +906,436,3,Supporter,False,2010-07-29 06:44:01.073 +913,436,3,Student,False,2010-07-29 11:54:05.790 +921,436,3,Scholar,False,2010-07-29 16:09:10.073 +938,22,3,Editor,False,2010-07-30 03:49:24.420 +948,22,3,Autobiographer,False,2010-07-30 19:24:32.767 +955,450,3,Teacher,False,2010-07-31 10:39:37.110 +956,287,3,Supporter,False,2010-07-31 10:39:37.297 +963,449,3,Teacher,False,2010-07-31 17:54:43.450 +977,450,3,Editor,False,2010-08-01 12:04:55.330 +1000,132,3,Nice Answer,False,2010-08-02 16:25:10.650 +1007,166,3,Nice Question,False,2010-08-02 20:30:11.287 +1008,211,3,Commentator,False,2010-08-02 20:30:11.367 +1019,211,3,Student,False,2010-08-03 04:45:13.027 +1029,287,3,Scholar,False,2010-08-03 12:40:16.270 
+1035,287,3,Student,False,2010-08-03 15:10:20.557 +1038,287,3,Editor,False,2010-08-03 15:50:20.980 +1065,399,3,Editor,False,2010-08-04 08:45:31.350 +1068,399,3,Teacher,False,2010-08-04 09:20:32.217 +1081,346,3,Critic,False,2010-08-04 12:25:34.597 +1110,211,3,Scholar,False,2010-08-05 01:45:54.253 +1120,503,3,Teacher,False,2010-08-05 14:01:35.130 +1121,449,3,Supporter,False,2010-08-05 15:01:35.043 +1126,450,3,Supporter,False,2010-08-05 17:11:35.030 +1144,399,3,Supporter,False,2010-08-06 01:06:37.163 +1147,399,3,Student,False,2010-08-06 03:06:36.990 +1150,436,3,Editor,False,2010-08-06 06:21:45.740 +1154,436,3,Teacher,False,2010-08-06 07:46:45.400 +1159,211,3,Editor,False,2010-08-06 11:06:45.187 +1165,190,3,Popular Question,False,2010-08-06 15:52:26.583 +1167,186,3,Supporter,False,2010-08-06 18:02:26.877 +1177,399,3,Self-Learner,False,2010-08-06 21:12:27.240 +1181,132,3,Nice Question,False,2010-08-07 00:52:28.963 +1197,132,2,Good Answer,False,2010-08-07 13:37:32.613 +1200,449,3,Editor,False,2010-08-07 18:42:37.887 +1205,132,3,Nice Answer,False,2010-08-08 02:24:15.547 +1206,190,3,Nice Answer,False,2010-08-08 02:24:15.593 +1214,594,3,Teacher,False,2010-08-09 01:39:21.223 +1215,594,3,Editor,False,2010-08-09 01:59:21.393 +1225,4,3,Nice Answer,False,2010-08-09 11:34:33.953 +1227,449,3,Commentator,False,2010-08-09 12:24:34.367 +1242,628,3,Student,False,2010-08-10 07:04:52.293 +1246,346,3,Commentator,False,2010-08-10 11:19:51.127 +1248,436,3,Commentator,False,2010-08-10 14:59:51.553 +1252,633,3,Supporter,False,2010-08-10 22:39:52.590 +1255,633,3,Editor,False,2010-08-10 23:34:52.800 +1287,155,3,Commentator,False,2010-08-12 06:09:58.213 +1300,651,3,Teacher,False,2010-08-12 16:49:57.873 +1303,658,3,Teacher,False,2010-08-12 20:59:58.640 +1307,658,3,Supporter,False,2010-08-12 22:29:58.973 +1323,658,3,Nice Answer,False,2010-08-13 12:55:01.363 +1325,668,3,Autobiographer,False,2010-08-13 15:35:01.603 +1328,232,3,Commentator,False,2010-08-13 17:55:01.773 +1333,674,3,Autobiographer,False,2010-08-13 20:55:02.020 +1341,651,3,Supporter,False,2010-08-14 07:30:09.780 +1342,155,3,Organizer,False,2010-08-14 08:05:09.617 +1348,668,3,Teacher,False,2010-08-14 13:25:43.697 +1358,132,3,Nice Question,False,2010-08-14 23:35:49.447 +1365,261,3,Scholar,False,2010-08-15 13:18:06.703 +1374,594,3,Supporter,False,2010-08-16 00:13:12.120 +1382,651,3,Editor,False,2010-08-16 09:28:32.470 +1395,450,3,Commentator,False,2010-08-16 19:43:42.540 +1400,132,2,Enthusiast,False,2010-08-17 00:23:43.680 +1402,60,2,Enthusiast,False,2010-08-17 02:18:44.200 +1404,232,3,Editor,False,2010-08-17 03:38:44.327 +1409,4,2,Enthusiast,False,2010-08-17 06:48:45.550 +1413,155,3,Nice Question,False,2010-08-17 12:53:45.197 +1420,668,3,Student,False,2010-08-17 17:43:50.383 +1422,668,3,Supporter,False,2010-08-17 18:43:50.607 +1425,449,3,Student,False,2010-08-17 19:28:50.650 +1427,668,3,Editor,False,2010-08-17 22:13:51.457 +1448,166,3,Nice Question,False,2010-08-18 15:49:06.530 +1452,668,3,Commentator,False,2010-08-18 18:54:06.750 +1454,674,3,Student,False,2010-08-18 21:34:20.153 +1456,132,3,Disciplined,False,2010-08-19 00:39:20.923 +1458,82,3,Teacher,False,2010-08-19 02:04:20.903 +1462,723,3,Student,False,2010-08-19 06:24:22.167 +1469,674,3,Teacher,False,2010-08-19 08:59:28.643 +1476,399,3,Commentator,False,2010-08-19 12:39:28.833 +1507,674,3,Supporter,False,2010-08-21 21:15:08.333 +1510,750,3,Teacher,False,2010-08-22 03:20:08.590 +1513,750,3,Supporter,False,2010-08-22 04:45:08.937 +1521,750,3,Autobiographer,False,2010-08-23 00:45:53.377 
+1523,155,2,Enthusiast,False,2010-08-23 03:35:53.690 +1526,674,3,Editor,False,2010-08-23 10:30:54.090 +1530,668,3,Organizer,False,2010-08-23 14:20:54.077 +1536,750,3,Editor,False,2010-08-23 16:30:54.777 +1548,346,3,Student,False,2010-08-24 11:15:57.413 +1551,287,3,Commentator,False,2010-08-24 14:30:57.537 +1556,261,3,Nice Question,False,2010-08-24 18:25:58.350 +1558,668,3,Scholar,False,2010-08-24 22:30:59.040 +1559,22,3,Scholar,False,2010-08-24 22:55:59.053 +1569,261,3,Commentator,False,2010-08-25 14:56:07.450 +1579,674,3,Critic,False,2010-08-25 18:21:09.667 +1581,674,3,Commentator,False,2010-08-25 21:11:09.870 +1587,346,3,Scholar,False,2010-08-26 07:21:17.900 +1595,232,3,Organizer,False,2010-08-26 13:11:17.513 +1597,346,3,Self-Learner,False,2010-08-26 13:46:17.490 +1599,786,3,Autobiographer,False,2010-08-26 17:21:17.967 +1603,449,2,Enthusiast,False,2010-08-27 03:26:27.097 +1604,60,3,Scholar,False,2010-08-27 03:31:27.123 +1616,132,3,Nice Answer,False,2010-08-27 18:51:29.587 +1617,182,3,Nice Answer,False,2010-08-27 18:51:29.647 +1618,786,3,Editor,False,2010-08-27 19:16:29.473 +1619,182,2,Enlightened,False,2010-08-27 19:36:29.697 +1621,750,3,Student,False,2010-08-27 20:26:29.613 +1622,793,3,Student,False,2010-08-27 22:26:49.470 +1625,786,3,Student,False,2010-08-27 23:11:49.593 +1629,793,3,Editor,False,2010-08-28 04:31:27.627 +1636,793,3,Scholar,False,2010-08-28 19:07:35.260 +1641,793,3,Supporter,False,2010-08-29 22:07:51.737 +1645,4,3,Nice Answer,False,2010-08-30 05:03:12.007 +1646,4,2,Enlightened,False,2010-08-30 06:13:12.090 +1647,346,3,Organizer,False,2010-08-30 15:03:12.960 +1649,155,3,Nice Answer,False,2010-08-30 16:13:12.930 +1661,803,3,Teacher,False,2010-08-31 10:33:35.123 +1666,786,3,Teacher,False,2010-08-31 17:18:35.663 +1667,4,2,Civic Duty,False,2010-08-31 17:18:35.867 +1671,674,3,Organizer,False,2010-08-31 21:53:36.180 +1677,190,2,Enthusiast,False,2010-09-01 06:18:39.853 +1680,750,3,Commentator,False,2010-09-01 12:18:39.963 +1699,4,3,Autobiographer,False,2010-09-02 13:49:06.790 +1701,261,3,Editor,False,2010-09-02 14:59:06.990 +1718,132,3,Popular Question,False,2010-09-03 06:34:15.763 +1720,155,3,Tag Editor,False,2010-09-03 07:29:15.473 +1738,855,3,Autobiographer,False,2010-09-03 17:04:16.083 +1740,750,3,Scholar,False,2010-09-03 17:59:16.187 +1742,4,3,Promoter,False,2010-09-03 18:04:16.143 +1745,651,3,Student,False,2010-09-03 18:29:15.973 +1751,651,3,Commentator,False,2010-09-04 08:54:33.440 +1754,793,3,Teacher,False,2010-09-05 08:39:01.690 +1767,793,3,Nice Question,False,2010-09-06 10:40:26.253 +1772,60,3,Nice Answer,False,2010-09-07 07:26:20.167 +1779,65,3,Teacher,False,2010-09-07 22:01:30.237 +1784,793,3,Self-Learner,False,2010-09-07 22:51:30.377 +1786,166,3,Self-Learner,False,2010-09-08 01:01:31.210 +1792,155,3,Promoter,False,2010-09-08 05:52:20.673 +1801,155,3,Citizen Patrol,False,2010-09-08 11:02:31.647 +1804,190,3,Citizen Patrol,False,2010-09-08 12:17:31.757 +1808,211,3,Critic,False,2010-09-08 14:17:31.993 +1809,211,3,Citizen Patrol,False,2010-09-08 14:17:32.043 +1822,132,3,Nice Answer,False,2010-09-09 03:47:36.160 +1826,803,3,Editor,False,2010-09-09 06:18:04.157 +1828,211,2,Enthusiast,False,2010-09-09 07:18:04.387 +1831,450,3,Student,False,2010-09-09 13:18:04.740 +1833,22,3,Commentator,False,2010-09-09 16:08:23.317 +1835,450,3,Scholar,False,2010-09-09 19:33:23.553 +1838,450,3,Critic,False,2010-09-10 12:34:18.340 +1841,674,2,Civic Duty,False,2010-09-10 13:19:18.543 +1843,4,3,Benefactor,False,2010-09-10 16:54:18.877 +1846,261,2,Good Answer,False,2010-09-10 19:19:19.077 
+1856,668,2,Enthusiast,False,2010-09-11 15:09:42.757 +1857,449,3,Nice Answer,False,2010-09-11 16:24:49.723 +1861,4,3,Nice Question,False,2010-09-12 03:40:07.640 +1863,155,3,Benefactor,False,2010-09-12 07:20:09.377 +1899,261,3,Nice Answer,False,2010-09-13 23:49:18.770 +1903,674,3,Citizen Patrol,False,2010-09-14 13:35:13.150 +1904,4,3,Citizen Patrol,False,2010-09-14 14:00:13.210 +1910,399,3,Organizer,False,2010-09-15 04:10:48.387 +1913,628,3,Supporter,False,2010-09-15 09:01:00.283 +1914,346,3,Citizen Patrol,False,2010-09-15 11:41:01.287 +1932,628,3,Nice Question,False,2010-09-16 01:51:36.007 +1947,628,3,Scholar,False,2010-09-16 07:31:51.970 +1952,628,3,Popular Question,False,2010-09-16 11:02:33.400 +1960,803,3,Student,False,2010-09-16 14:28:06.613 +1972,947,3,Teacher,False,2010-09-16 23:58:35.770 +1982,674,3,Mortarboard,False,2010-09-17 15:24:06.620 +1983,192,3,Student,False,2010-09-17 16:14:06.540 +1999,4,3,Announcer,False,2010-09-18 04:19:35.640 +2001,674,3,Nice Question,False,2010-09-18 11:29:41.843 +2012,674,3,Promoter,False,2010-09-18 21:44:51.020 +2014,947,3,Supporter,False,2010-09-19 00:34:59.823 +2016,674,2,Enthusiast,False,2010-09-19 07:28:28.433 +2018,155,2,Civic Duty,False,2010-09-19 10:08:30.363 +2019,803,3,Supporter,False,2010-09-19 10:38:30.493 +2020,803,3,Scholar,False,2010-09-19 10:38:30.527 +2027,947,3,Editor,False,2010-09-20 03:59:15.810 +2033,750,3,Critic,False,2010-09-20 12:19:30.897 +2038,155,3,Self-Learner,False,2010-09-20 15:14:31.673 +2043,211,2,Civic Duty,False,2010-09-20 21:29:59.453 +2045,947,3,Student,False,2010-09-20 22:59:59.343 +2046,166,3,Nice Question,False,2010-09-21 01:34:59.607 +2047,750,3,Organizer,False,2010-09-21 03:40:00.453 +2051,346,2,Enthusiast,False,2010-09-21 09:45:27.943 +2053,287,3,Critic,False,2010-09-21 15:05:40.913 +2059,947,3,Autobiographer,False,2010-09-21 22:35:43.863 +2064,155,3,Nice Answer,False,2010-09-22 11:12:04.833 +2065,155,2,Enlightened,False,2010-09-22 12:02:05.777 +2067,192,3,Scholar,False,2010-09-22 14:52:05.713 +2079,132,2,Civic Duty,False,2010-09-23 09:12:37.263 +2089,728,3,Student,False,2010-09-24 06:04:22.710 +2090,399,3,Critic,False,2010-09-24 06:04:22.973 +2091,728,3,Supporter,False,2010-09-24 06:54:22.950 +2092,728,3,Scholar,False,2010-09-24 07:29:22.803 +2094,4,3,Nice Question,False,2010-09-24 09:49:34.343 +2102,674,3,Nice Question,False,2010-09-25 15:10:55.693 +2103,750,2,Civic Duty,False,2010-09-25 16:00:55.837 +2104,947,3,Commentator,False,2010-09-25 16:10:55.853 +2108,674,3,Scholar,False,2010-09-25 19:46:14.803 +2109,674,3,Benefactor,False,2010-09-25 19:51:14.603 +2112,947,3,Scholar,False,2010-09-26 04:26:50.937 +2113,674,3,Tag Editor,False,2010-09-26 10:22:12.090 +2114,723,3,Nice Question,False,2010-09-26 11:02:13.293 +2115,190,2,Good Question,False,2010-09-26 11:57:14.400 +2127,155,3,Announcer,False,2010-09-27 01:47:44.797 +2133,287,3,Promoter,False,2010-09-27 18:39:07.257 +2134,287,3,Nice Question,False,2010-09-27 19:09:07.307 +2137,947,3,Critic,False,2010-09-28 03:59:29.410 +2138,750,2,Enthusiast,False,2010-09-28 05:14:29.577 +2156,674,3,Investor,False,2010-09-29 19:21:14.783 +2159,674,3,Self-Learner,False,2010-09-30 12:01:58.200 +2165,221,3,Student,False,2010-09-30 14:17:11.977 +2166,658,3,Editor,False,2010-09-30 14:22:11.927 +2167,658,3,Student,False,2010-09-30 14:27:12.063 +2173,728,3,Editor,False,2010-09-30 16:27:13.313 +2174,658,3,Scholar,False,2010-09-30 17:47:15.137 +2179,132,3,Nice Question,False,2010-09-30 19:22:15.477 +2190,4,2,Enlightened,False,2010-10-01 05:33:56.890 
+2192,132,2,Enlightened,False,2010-10-01 05:33:57.047 +2196,221,3,Editor,False,2010-10-01 12:00:06.173 +2197,221,3,Autobiographer,False,2010-10-01 12:15:09.497 +2199,436,3,Critic,False,2010-10-01 14:25:12.263 +2206,947,3,Cleanup,False,2010-10-01 19:20:24.043 +2218,450,2,Enthusiast,False,2010-10-03 03:40:55.370 +2221,1040,3,Teacher,False,2010-10-03 13:42:20.420 +2228,221,3,Supporter,False,2010-10-04 11:26:54.393 +2245,4,3,Nice Question,False,2010-10-05 13:50:24.583 +2253,674,3,Nice Answer,False,2010-10-06 11:31:06.407 +2257,674,3,Altruist,False,2010-10-06 15:56:08.773 +2270,855,3,Supporter,False,2010-10-07 15:27:34.060 +2289,668,2,Civic Duty,False,2010-10-08 20:34:35.233 +2296,114,3,Scholar,False,2010-10-09 14:58:50.427 +2297,450,3,Nice Answer,False,2010-10-09 15:03:50.963 +2298,450,2,Enlightened,False,2010-10-09 16:33:48.160 +2301,450,3,Nice Answer,False,2010-10-09 21:48:45.750 +2302,674,3,Nice Answer,False,2010-10-10 04:47:41.657 +2303,1073,3,Supporter,False,2010-10-10 06:27:44.490 +2306,668,3,Tag Editor,False,2010-10-10 16:11:28.833 +2310,668,3,Critic,False,2010-10-10 18:31:59.390 +2324,4,3,Nice Answer,False,2010-10-11 22:14:56.013 +2329,436,3,Nice Answer,False,2010-10-12 02:05:34.983 +2347,668,3,Nice Answer,False,2010-10-12 20:02:40.713 +2348,668,3,Mortarboard,False,2010-10-12 21:52:40.873 +2352,668,2,Necromancer,False,2010-10-13 00:07:40.943 +2354,155,3,Nice Question,False,2010-10-13 09:02:49.460 +2360,132,3,Nice Answer,False,2010-10-13 15:49:16.607 +2361,132,3,Nice Answer,False,2010-10-13 16:09:16.597 +2365,668,3,Nice Answer,False,2010-10-13 22:29:17.273 +2375,674,3,Nice Answer,False,2010-10-14 17:25:09.487 +2376,674,2,Enlightened,False,2010-10-14 18:15:24.917 +2409,674,3,Nice Answer,False,2010-10-16 01:13:49.127 +2412,182,3,Nice Answer,False,2010-10-16 21:26:32.000 +2413,450,3,Nice Answer,False,2010-10-17 04:21:34.007 +2416,450,2,Enlightened,False,2010-10-17 06:31:46.423 +2422,1073,3,Teacher,False,2010-10-17 12:16:59.500 +2453,190,3,Nice Answer,False,2010-10-19 07:54:05.100 +2455,4,3,Suffrage,False,2010-10-19 09:09:05.263 +2460,190,3,Suffrage,False,2010-10-19 09:09:05.467 +2462,399,3,Suffrage,False,2010-10-19 09:09:05.513 +2464,674,3,Suffrage,False,2010-10-19 09:09:05.543 +2472,346,3,Nice Answer,False,2010-10-19 15:19:22.023 +2482,674,2,Sportsmanship,False,2010-10-20 06:58:44.217 +2490,674,3,Nice Answer,False,2010-10-20 11:49:49.197 +2513,155,3,Nice Answer,False,2010-10-21 05:57:44.287 +2514,674,3,Cleanup,False,2010-10-21 06:07:44.130 +2515,155,2,Enlightened,False,2010-10-21 06:07:45.100 +2521,211,3,Nice Question,False,2010-10-21 13:15:07.183 +2532,371,3,Student,False,2010-10-21 16:32:10.070 +2535,371,3,Supporter,False,2010-10-21 20:12:39.557 +2536,674,3,Nice Answer,False,2010-10-21 21:37:41.133 +2542,371,3,Nice Question,False,2010-10-22 13:38:43.220 +2543,132,3,Announcer,False,2010-10-22 13:43:43.317 +2547,450,3,Nice Answer,False,2010-10-22 18:59:09.503 +2569,1145,3,Teacher,False,2010-10-24 04:49:01.187 +2571,1145,3,Supporter,False,2010-10-24 06:39:39.273 +2576,1150,3,Student,False,2010-10-24 21:03:16.787 +2579,371,3,Scholar,False,2010-10-25 08:50:06.587 +2580,1145,3,Student,False,2010-10-25 10:30:16.287 +2582,221,3,Teacher,False,2010-10-25 12:50:17.420 +2583,211,3,Nice Question,False,2010-10-25 13:40:17.543 +2584,1145,3,Nice Answer,False,2010-10-25 14:05:17.417 +2585,855,3,Teacher,False,2010-10-25 17:50:21.747 +2589,399,3,Nice Answer,False,2010-10-25 21:25:44.013 +2594,674,3,Nice Answer,False,2010-10-26 05:01:41.327 +2598,651,2,Enthusiast,False,2010-10-26 09:06:42.653 
+2603,4,1,Fanatic,False,2010-10-26 12:22:05.033 +2604,750,3,Nice Question,False,2010-10-26 13:57:12.597 +2610,668,3,Suffrage,False,2010-10-26 21:27:12.640 +2611,132,2,Enlightened,False,2010-10-27 02:37:16.787 +2620,132,3,Nice Answer,False,2010-10-27 13:43:23.523 +2621,132,2,Enlightened,False,2010-10-27 13:53:23.650 +2624,674,1,Electorate,False,2010-10-27 19:13:26.257 +2633,436,3,Citizen Patrol,False,2010-10-28 06:18:32.393 +2656,436,2,Enthusiast,False,2010-10-30 09:20:27.650 +2677,155,3,Nice Question,False,2010-11-01 06:10:44.643 +2681,728,3,Nice Question,False,2010-11-01 06:10:44.737 +2686,947,3,Nice Question,False,2010-11-01 06:10:44.940 +2696,674,2,Strunk & White,False,2010-11-01 13:05:47.647 +2697,668,2,Necromancer,False,2010-11-01 15:10:47.590 +2698,211,3,Nice Answer,False,2010-11-01 15:50:47.553 +2699,211,2,Enlightened,False,2010-11-01 15:55:47.667 +2706,82,3,Student,False,2010-11-02 00:35:48.427 +2709,4,2,Taxonomist,False,2010-11-02 03:15:49.003 +2710,132,3,Revival,False,2010-11-02 04:00:48.830 +2712,450,3,Revival,False,2010-11-02 04:00:49.283 +2713,668,3,Revival,False,2010-11-02 04:00:49.517 +2714,674,3,Revival,False,2010-11-02 04:00:49.737 +2715,786,3,Revival,False,2010-11-02 04:00:49.937 +2717,450,3,Revival,False,2010-11-02 04:05:48.750 +2718,668,3,Revival,False,2010-11-02 04:05:48.937 +2719,674,3,Revival,False,2010-11-02 04:05:48.967 +2735,82,3,Scholar,False,2010-11-03 04:47:23.487 +2758,668,3,Citizen Patrol,False,2010-11-04 04:17:26.713 +2763,674,3,Nice Question,False,2010-11-04 07:02:26.607 +2769,674,2,Necromancer,False,2010-11-04 09:52:26.937 +2771,221,3,Scholar,False,2010-11-04 14:57:26.377 +2775,221,3,Self-Learner,False,2010-11-04 19:22:32.647 +2782,211,2,Necromancer,False,2010-11-04 21:27:33.413 +2792,1073,3,Student,False,2010-11-05 07:17:33.300 +2794,449,1,Fanatic,False,2010-11-05 08:07:33.457 +2795,1073,3,Scholar,False,2010-11-05 08:57:33.563 +2798,436,2,Enlightened,False,2010-11-05 13:27:33.753 +2804,155,3,Nice Question,False,2010-11-05 13:47:33.820 +2844,668,3,Nice Answer,False,2010-11-06 20:12:42.500 +2857,1209,3,Teacher,False,2010-11-07 08:42:42.680 +2859,674,3,Nice Answer,False,2010-11-07 11:07:42.520 +2861,674,2,Enlightened,False,2010-11-07 11:22:42.680 +2865,1073,3,Critic,False,2010-11-07 15:47:48.580 +2866,192,3,Supporter,False,2010-11-07 16:47:53.543 +2867,192,3,Critic,False,2010-11-07 16:47:53.573 +2874,399,3,Nice Answer,False,2010-11-07 21:27:53.230 +2883,450,2,Necromancer,False,2010-11-08 09:22:53.783 +2884,4,3,Nice Answer,False,2010-11-08 09:52:53.493 +2900,1085,3,Editor,False,2010-11-09 07:53:43.907 +2903,1085,3,Teacher,False,2010-11-09 10:28:43.447 +2904,1085,3,Student,False,2010-11-09 10:28:43.573 +2905,1085,3,Scholar,False,2010-11-09 10:28:43.633 +2906,1085,3,Nice Question,False,2010-11-09 10:28:43.680 +2909,1085,3,Supporter,False,2010-11-09 15:28:43.677 +2916,211,3,Nice Question,False,2010-11-09 22:38:46.653 +2923,192,3,Nice Question,False,2010-11-10 09:23:46.957 +2938,346,3,Nice Answer,False,2010-11-10 17:19:02.647 +2944,674,3,Announcer,False,2010-11-10 20:14:03.853 +2952,1085,3,Critic,False,2010-11-11 10:04:03.397 +2970,674,3,r,True,2010-11-12 03:00:07.883 +2976,674,3,Disciplined,False,2010-11-12 12:39:10.407 +2983,155,3,Nice Question,False,2010-11-12 16:59:10.950 +2987,346,2,Civic Duty,False,2010-11-12 17:54:13.243 +2989,155,3,Mortarboard,False,2010-11-12 23:09:17.340 +2993,155,3,Nice Question,False,2010-11-13 06:34:18.180 +3007,1322,3,Student,False,2010-11-14 14:54:25.153 +3011,1322,3,Scholar,False,2010-11-14 21:04:25.293 +3015,132,3,Nice 
Answer,False,2010-11-15 10:39:25.670 +3022,211,3,Nice Answer,False,2010-11-15 20:39:32.140 +3023,449,3,Organizer,False,2010-11-15 20:54:32.060 +3026,287,3,Self-Learner,False,2010-11-15 21:54:32.080 +3030,1145,3,Editor,False,2010-11-16 07:44:36.993 +3032,1085,3,Commentator,False,2010-11-16 09:29:36.987 +3038,287,3,Nice Question,False,2010-11-16 12:45:15.273 +3060,1145,3,Nice Answer,False,2010-11-17 13:40:25.050 +3061,287,3,Nice Answer,False,2010-11-17 13:45:25.043 +3074,1145,3,Commentator,False,2010-11-18 02:45:25.240 +3114,169,3,Nice Question,False,2010-11-19 22:10:36.430 +3119,4,3,Popular Question,False,2010-11-20 08:50:38.257 +3137,1145,2,Enthusiast,False,2010-11-22 00:00:53.153 +3141,1073,3,Editor,False,2010-11-22 08:55:53.177 +3149,1359,3,Student,False,2010-11-23 21:01:03.167 +3151,1359,3,Editor,False,2010-11-23 21:31:03.037 +3155,1359,3,Supporter,False,2010-11-23 22:46:02.990 +3156,132,3,Nice Answer,False,2010-11-23 23:11:03.003 +3161,155,3,Nice Answer,False,2010-11-24 02:21:03.080 +3165,674,3,Nice Answer,False,2010-11-24 11:46:03.163 +3174,221,3,Commentator,False,2010-11-24 16:41:03.077 +3176,1411,3,Teacher,False,2010-11-24 20:56:03.210 +3179,633,3,Teacher,False,2010-11-25 03:06:02.990 +3181,1412,3,Teacher,False,2010-11-25 07:36:08.077 +3201,1428,3,Autobiographer,False,2010-11-26 13:21:09.080 +3202,1428,3,Teacher,False,2010-11-26 13:26:08.997 +3206,1428,3,Editor,False,2010-11-26 17:46:09.173 +3207,1428,3,Supporter,False,2010-11-26 17:56:09.067 +3215,182,3,Nice Answer,False,2010-11-27 09:46:13.027 +3216,22,3,Nice Answer,False,2010-11-27 10:11:12.993 +3217,211,2,Necromancer,False,2010-11-27 10:26:13.133 +3219,674,3,Nice Answer,False,2010-11-27 13:36:13.107 +3225,4,2,Good Question,False,2010-11-27 19:36:16.107 +3232,674,3,Nice Answer,False,2010-11-28 05:51:16.047 +3237,674,1,Fanatic,False,2010-11-28 10:36:16.163 +3240,674,2,Enlightened,False,2010-11-28 12:56:21.993 +3259,4,3,Nice Answer,False,2010-11-29 20:36:34.180 +3270,4,3,Nice Answer,False,2010-11-30 07:46:34.047 +3271,4,2,Enlightened,False,2010-11-30 08:11:34.233 +3303,1085,3,Promoter,False,2010-12-02 03:11:37.260 +3309,1085,3,Organizer,False,2010-12-02 06:31:36.980 +3325,4,3,Talkative,False,2010-12-03 02:04:05.280 +3327,132,3,Talkative,False,2010-12-03 02:04:05.343 +3329,674,3,Talkative,False,2010-12-03 02:04:05.373 +3331,668,3,Talkative,False,2010-12-03 02:04:05.403 +3335,190,2,Notable Question,False,2010-12-03 07:46:44.360 +3342,211,3,Nice Question,False,2010-12-03 18:26:48.113 +3349,1209,3,Student,False,2010-12-04 00:16:48.023 +3350,1209,3,Cleanup,False,2010-12-04 00:51:48.230 +3361,668,3,Nice Answer,False,2010-12-04 17:26:49.323 +3365,628,2,Good Question,False,2010-12-04 17:31:49.303 +3376,728,3,Commentator,False,2010-12-05 00:06:49.020 +3390,60,3,Nice Answer,False,2010-12-06 23:42:07.010 +3391,750,1,Fanatic,False,2010-12-07 02:32:07.303 +3397,651,3,Critic,False,2010-12-07 09:32:08.407 +3401,1406,3,Student,False,2010-12-07 12:12:08.207 +3402,221,3,Critic,False,2010-12-07 12:27:08.297 +3403,1406,3,Editor,False,2010-12-07 13:07:08.170 +3408,371,3,Teacher,False,2010-12-07 16:27:08.030 +3409,371,3,Editor,False,2010-12-07 18:32:09.203 +3413,1359,3,Scholar,False,2010-12-07 22:37:08.950 +3416,1406,3,Scholar,False,2010-12-08 09:32:09.063 +3419,1406,3,Teacher,False,2010-12-08 13:22:09.073 +3420,60,3,Popular Question,False,2010-12-08 14:32:09.157 +3427,1406,3,Supporter,False,2010-12-08 20:17:09.070 +3428,155,3,Nice Question,False,2010-12-08 20:17:09.147 +3431,674,3,Nice Answer,False,2010-12-08 23:27:18.713 
+3451,155,3,Precognitive,False,2010-12-09 01:22:23.360 +3464,4,3,Precognitive,False,2010-12-09 01:22:23.500 +3470,132,3,Precognitive,False,2010-12-09 01:22:23.577 +3487,22,3,Precognitive,False,2010-12-09 01:22:23.873 +3495,190,3,Precognitive,False,2010-12-09 01:22:24.030 +3499,1506,3,Teacher,False,2010-12-09 02:32:34.427 +3501,155,3,Nice Question,False,2010-12-09 05:57:36.933 +3507,1359,3,Teacher,False,2010-12-09 16:42:38.137 +3509,4,3,Nice Answer,False,2010-12-09 17:47:38.243 +3510,674,3,Nice Answer,False,2010-12-09 17:47:38.323 +3512,750,3,Suffrage,False,2010-12-09 21:17:38.310 +3519,1506,3,Editor,False,2010-12-10 00:27:38.167 +3529,1506,3,Supporter,False,2010-12-10 14:27:38.017 +3544,4,2,Convention,False,2010-12-11 00:22:29.403 +3547,132,2,Convention,False,2010-12-11 00:22:29.433 +3546,668,2,Convention,False,2010-12-11 00:22:29.433 +3548,1073,3,Quorum,False,2010-12-11 00:22:29.480 +3553,60,3,Quorum,False,2010-12-11 00:22:29.577 +3557,728,3,Quorum,False,2010-12-11 00:22:29.623 +3558,436,3,Quorum,False,2010-12-11 00:22:29.637 +3570,4,3,Quorum,False,2010-12-11 00:22:29.793 +3573,192,3,Quorum,False,2010-12-11 00:22:29.823 +3575,166,3,Quorum,False,2010-12-11 00:22:29.857 +3577,287,3,Quorum,False,2010-12-11 00:22:29.887 +3582,450,3,Quorum,False,2010-12-11 00:22:29.967 +3584,132,3,Quorum,False,2010-12-11 00:22:29.997 +3587,674,3,Quorum,False,2010-12-11 00:22:30.027 +3589,346,3,Quorum,False,2010-12-11 00:22:30.497 +3593,668,3,Quorum,False,2010-12-11 00:22:30.603 +3594,750,3,Quorum,False,2010-12-11 00:22:30.620 +3600,221,3,Quorum,False,2010-12-11 00:22:30.760 +3601,22,3,Quorum,False,2010-12-11 00:22:30.777 +3602,211,3,Quorum,False,2010-12-11 00:22:30.807 +3605,190,3,Quorum,False,2010-12-11 00:22:30.853 +3607,211,3,Self-Learner,False,2010-12-11 03:27:40.077 +3610,211,3,Nice Question,False,2010-12-11 12:52:40.150 +3611,674,3,Nice Answer,False,2010-12-11 12:57:40.050 +3622,1411,3,Supporter,False,2010-12-11 20:47:40.053 +3633,674,2,Enlightened,False,2010-12-12 20:22:43.237 +3646,155,3,Nice Question,False,2010-12-13 17:57:48.053 +3651,190,3,Nice Answer,False,2010-12-14 00:52:58.200 +3652,287,3,Nice Answer,False,2010-12-14 00:52:58.293 +3653,1209,3,Supporter,False,2010-12-14 02:07:58.107 +3655,60,3,Nice Answer,False,2010-12-14 02:47:58.633 +3657,1542,3,Scholar,False,2010-12-14 03:17:58.063 +3659,1506,3,Student,False,2010-12-14 04:02:58.087 +3662,1406,3,Commentator,False,2010-12-14 05:02:58.207 +3664,1542,3,Student,False,2010-12-14 07:07:58.050 +3672,668,3,Nice Answer,False,2010-12-14 16:32:59.180 +3675,1411,3,Nice Answer,False,2010-12-14 18:42:59.300 +3677,1411,2,Enlightened,False,2010-12-14 19:07:59.363 +3679,211,3,Nice Answer,False,2010-12-14 19:42:59.053 +3686,169,3,Nice Answer,False,2010-12-14 23:43:00.303 +3695,1411,3,Commentator,False,2010-12-15 16:43:01.173 +3712,1411,3,Editor,False,2010-12-16 16:46:45.997 +3728,449,3,Nice Answer,False,2010-12-17 15:51:47.140 +3729,668,3,Nice Answer,False,2010-12-17 16:26:47.377 +3730,1575,3,Autobiographer,False,2010-12-17 17:41:48.217 +3745,503,3,Supporter,False,2010-12-20 06:22:24.987 +3746,155,3,Cleanup,False,2010-12-20 06:57:25.053 +3748,1406,3,Revival,False,2010-12-20 14:12:25.060 +3754,1359,3,Critic,False,2010-12-20 19:02:25.070 +3766,668,2,Sportsmanship,False,2010-12-21 20:02:28.193 +3775,190,2,Favorite Question,False,2010-12-22 09:37:28.287 +3779,221,3,Revival,False,2010-12-22 17:37:29.133 +3780,1602,3,Student,False,2010-12-22 17:52:29.160 +3808,1209,3,Scholar,False,2010-12-24 18:37:30.063 +3816,668,3,Nice Answer,False,2010-12-26 14:12:35.063 
+3830,1636,3,Teacher,False,2010-12-28 15:02:37.090 +3837,211,3,Nice Answer,False,2010-12-28 19:42:38.353 +3847,1359,3,Commentator,False,2010-12-29 18:42:38.213 +3854,169,3,Nice Question,False,2010-12-30 00:07:38.433 +3870,287,3,Popular Question,False,2010-12-30 21:42:39.203 +3877,674,3,Nice Answer,False,2010-12-31 09:32:43.363 +3914,211,3,Nice Question,False,2011-01-03 15:25:19.093 +3920,668,3,Revival,False,2011-01-03 20:50:19.397 +3925,668,3,Revival,False,2011-01-03 22:25:19.480 +3940,1406,3,Nice Answer,False,2011-01-04 13:35:23.233 +3941,1691,3,Supporter,False,2011-01-04 14:00:23.343 +3942,1691,3,Autobiographer,False,2011-01-04 14:00:23.373 +3955,1406,2,Enthusiast,False,2011-01-05 05:40:23.467 +3985,1693,3,Teacher,False,2011-01-06 16:00:28.137 +3986,668,2,Strunk & White,False,2011-01-06 16:05:28.753 +3993,1717,3,Editor,False,2011-01-06 21:00:50.793 +4001,399,3,Popular Question,False,2011-01-07 05:00:51.193 +4003,132,2,Strunk & White,False,2011-01-07 09:50:53.437 +4006,1693,3,Supporter,False,2011-01-07 13:25:53.203 +4007,1406,3,Organizer,False,2011-01-07 13:50:53.170 +4011,1717,3,Supporter,False,2011-01-07 17:20:54.173 +4012,1717,3,Scholar,False,2011-01-07 17:20:54.203 +4023,503,3,Organizer,False,2011-01-08 11:30:56.197 +4024,436,1,Fanatic,False,2011-01-08 18:50:56.250 +4034,1412,3,Editor,False,2011-01-09 22:35:59.153 +4040,1406,3,Quorum,False,2011-01-10 14:33:25.140 +4042,1693,3,Revival,False,2011-01-10 17:40:59.290 +4044,1412,3,Supporter,False,2011-01-10 17:55:59.130 +4051,132,3,Tag Editor,False,2011-01-10 23:25:59.233 +4054,190,3,Nice Answer,False,2011-01-11 05:41:02.293 +4067,1412,3,Commentator,False,2011-01-11 20:46:02.180 +4070,668,2,Necromancer,False,2011-01-11 21:16:02.270 +4074,132,3,time-series,True,2011-01-12 03:00:06.743 +4087,261,3,Nice Answer,False,2011-01-12 22:36:06.140 +4090,1406,3,Nice Answer,False,2011-01-13 02:56:06.240 +4091,1406,2,Enlightened,False,2011-01-13 03:01:06.577 +4097,668,2,Necromancer,False,2011-01-13 08:06:06.397 +4099,947,3,Nice Question,False,2011-01-13 12:16:07.553 +4122,4,3,Nice Answer,False,2011-01-14 11:16:15.417 +4127,4,2,Enlightened,False,2011-01-14 14:46:18.430 +4132,4,3,Nice Answer,False,2011-01-14 15:46:18.043 +4134,4,2,Enlightened,False,2011-01-14 17:16:18.420 +4139,60,3,Nice Answer,False,2011-01-14 22:41:18.220 +4142,1411,3,Nice Answer,False,2011-01-15 09:16:19.267 +4143,1411,2,Enlightened,False,2011-01-15 10:11:19.307 +4151,1412,3,Organizer,False,2011-01-15 18:11:20.247 +4165,4,3,Nice Answer,False,2011-01-16 16:56:43.297 +4175,1741,3,Scholar,False,2011-01-17 11:01:45.157 +4178,287,2,Good Question,False,2011-01-17 13:31:45.777 +4181,1741,3,Student,False,2011-01-17 14:56:45.440 +4191,1406,3,Nice Answer,False,2011-01-17 22:01:45.373 +4195,155,3,Nice Answer,False,2011-01-18 00:01:45.733 +4196,1805,3,Student,False,2011-01-18 04:36:46.157 +4197,1804,3,Student,False,2011-01-18 07:51:35.323 +4201,1809,3,Student,False,2011-01-18 14:01:36.190 +4203,1809,3,Supporter,False,2011-01-18 14:41:36.423 +4204,1809,3,Scholar,False,2011-01-18 15:06:36.337 +4206,1804,3,Scholar,False,2011-01-18 16:36:36.393 +4207,1804,3,Supporter,False,2011-01-18 16:41:36.607 +4209,1805,3,Scholar,False,2011-01-18 19:16:37.627 +4221,668,3,Nice Answer,False,2011-01-19 05:21:38.133 +4222,668,2,Enlightened,False,2011-01-19 05:36:38.493 +4231,1805,3,Editor,False,2011-01-19 18:41:39.210 +4232,1805,3,Supporter,False,2011-01-19 18:46:39.377 +4233,651,3,Nice Question,False,2011-01-19 18:51:41.090 +4238,1805,3,Teacher,False,2011-01-19 21:46:41.403 +4248,628,3,Critic,False,2011-01-20 
11:36:03.100 +4257,750,3,Talkative,False,2011-01-21 17:50:11.490 +4259,1209,3,Commentator,False,2011-01-21 19:36:11.803 +4276,1406,3,Disciplined,False,2011-01-23 19:01:19.680 +4279,1693,3,Editor,False,2011-01-23 22:41:20.240 +4289,1406,3,Revival,False,2011-01-24 14:11:26.847 +4290,674,3,Nice Answer,False,2011-01-24 16:06:26.203 +4291,674,2,Convention,False,2011-01-24 16:50:11.187 +4294,628,2,Notable Question,False,2011-01-24 19:56:28.710 +4297,1693,3,Critic,False,2011-01-25 00:26:28.150 +4310,1693,3,Student,False,2011-01-25 15:21:53.670 +4321,1506,3,Scholar,False,2011-01-26 00:11:56.543 +4342,1790,3,Editor,False,2011-01-26 23:57:07.133 +4343,1790,3,Student,False,2011-01-27 00:07:07.327 +4350,1406,2,Civic Duty,False,2011-01-27 07:37:08.417 +4361,132,3,Nice Question,False,2011-01-27 17:52:12.280 +4374,132,2,Good Answer,False,2011-01-28 13:02:16.567 +4376,1790,3,Supporter,False,2011-01-28 13:37:16.530 +4384,190,3,Nice Answer,False,2011-01-28 18:07:16.827 +4385,399,3,Nice Answer,False,2011-01-28 18:07:16.857 +4390,1693,3,Scholar,False,2011-01-29 03:22:21.747 +4394,674,3,regression,True,2011-01-30 03:00:08.090 +4396,1895,3,Editor,False,2011-01-30 03:27:22.230 +4399,1693,3,Commentator,False,2011-01-30 05:12:22.547 +4404,1895,3,Teacher,False,2011-01-30 12:22:22.500 +4413,1895,3,Revival,False,2011-01-31 05:32:25.997 +4423,1575,3,Student,False,2011-01-31 14:17:25.287 +4431,651,3,Self-Learner,False,2011-01-31 18:07:25.527 +4438,1805,3,Commentator,False,2011-01-31 23:32:26.027 +4440,1889,3,Teacher,False,2011-02-01 02:02:25.927 +4447,1575,3,Supporter,False,2011-02-01 10:12:28.613 +4454,1575,3,Scholar,False,2011-02-01 17:02:28.300 +4462,674,3,data-visualization,True,2011-02-02 03:00:11.850 +4472,221,3,Organizer,False,2011-02-02 11:02:33.180 +4476,1923,3,Student,False,2011-02-02 14:52:33.647 +4477,1790,3,Scholar,False,2011-02-02 14:52:33.787 +4479,1923,3,Supporter,False,2011-02-02 15:57:33.980 +4481,1923,3,Editor,False,2011-02-02 16:22:33.617 +4483,1926,3,Autobiographer,False,2011-02-02 18:22:33.420 +4487,1895,3,Nice Answer,False,2011-02-02 21:27:33.587 +4488,1895,2,Enlightened,False,2011-02-02 23:07:33.783 +4490,1895,3,Supporter,False,2011-02-03 01:57:33.823 +4494,674,2,Necromancer,False,2011-02-03 04:17:33.467 +4496,1930,3,Student,False,2011-02-03 09:51:06.567 +4503,1927,3,Student,False,2011-02-03 13:11:06.393 +4504,668,3,Nice Answer,False,2011-02-03 16:51:06.617 +4521,1575,3,Editor,False,2011-02-04 12:51:08.660 +4526,1575,3,Teacher,False,2011-02-04 13:56:08.760 +4528,1406,3,Revival,False,2011-02-04 14:56:08.457 +4536,1945,3,Student,False,2011-02-04 22:01:08.503 +4537,1923,3,Teacher,False,2011-02-04 22:51:11.593 +4542,132,3,Nice Answer,False,2011-02-05 16:46:11.580 +4544,132,2,Enlightened,False,2011-02-05 17:36:11.893 +4546,1040,3,Student,False,2011-02-05 19:16:11.107 +4550,169,3,Nice Question,False,2011-02-05 20:51:11.180 +4554,1895,3,Commentator,False,2011-02-05 23:16:11.600 +4555,1145,2,Good Answer,False,2011-02-06 16:06:15.470 +4556,1927,3,Supporter,False,2011-02-06 16:56:15.350 +4557,1927,3,Scholar,False,2011-02-06 16:56:15.363 +4569,1895,3,Nice Answer,False,2011-02-07 11:11:39.420 +4570,1927,3,Editor,False,2011-02-07 12:36:39.737 +4583,1895,3,Mortarboard,False,2011-02-07 19:26:39.490 +4584,1406,3,Suffrage,False,2011-02-07 19:56:39.807 +4595,1895,3,Critic,False,2011-02-08 05:46:39.700 +4597,1691,3,Student,False,2011-02-08 09:21:39.687 +4603,1691,3,Editor,False,2011-02-08 14:21:39.380 +4606,4,3,Nice Answer,False,2011-02-08 16:41:39.527 +4620,1406,3,Critic,False,2011-02-09 14:16:39.247 
+4632,221,3,Citizen Patrol,False,2011-02-10 04:47:00.150 +4636,750,3,Citizen Patrol,False,2011-02-10 04:47:00.167 +4639,1085,3,Citizen Patrol,False,2011-02-10 04:47:00.183 +4640,1406,3,Citizen Patrol,False,2011-02-10 04:47:00.183 +4645,436,3,Organizer,False,2011-02-10 10:32:00.090 +4646,60,3,Self-Learner,False,2011-02-10 11:57:00.493 +4648,1923,3,Commentator,False,2011-02-10 13:07:00.090 +4652,503,3,Student,False,2011-02-10 16:37:01.267 +4656,1985,3,Student,False,2011-02-10 16:42:00.263 +4658,1895,3,Nice Answer,False,2011-02-10 17:32:00.390 +4662,261,3,Nice Question,False,2011-02-10 18:37:01.193 +4664,503,3,Scholar,False,2011-02-10 21:27:00.133 +4674,190,2,Civic Duty,False,2011-02-11 09:22:09.513 +4676,1831,3,Student,False,2011-02-11 14:52:09.250 +4680,1406,2,Enlightened,False,2011-02-11 15:27:09.407 +4683,668,1,Electorate,False,2011-02-11 19:27:09.577 +4692,1927,3,Commentator,False,2011-02-12 07:12:14.097 +4694,155,3,Nice Answer,False,2011-02-12 12:47:14.303 +4696,155,3,Nice Answer,False,2011-02-13 06:17:20.607 +4709,155,2,Enlightened,False,2011-02-13 14:37:22.333 +4718,190,2,Sportsmanship,False,2011-02-14 00:22:27.540 +4720,1691,3,Scholar,False,2011-02-14 00:57:27.310 +4727,2015,3,Editor,False,2011-02-14 05:12:27.453 +4728,2015,3,Student,False,2011-02-14 05:22:27.500 +4736,1895,3,Revival,False,2011-02-14 12:47:29.587 +4742,4,3,Popular Question,False,2011-02-14 22:17:57.283 +4771,1040,3,Nice Question,False,2011-02-15 03:57:57.233 +4794,1831,3,Supporter,False,2011-02-15 09:12:58.373 +4795,1831,3,Scholar,False,2011-02-15 09:12:58.403 +4810,2069,3,Student,False,2011-02-15 19:32:58.010 +4812,2069,3,Editor,False,2011-02-15 21:42:58.113 +4813,2069,3,Scholar,False,2011-02-15 21:42:58.207 +4816,2071,3,Supporter,False,2011-02-15 23:42:58.043 +4821,723,3,Scholar,False,2011-02-16 06:37:58.257 +4823,2075,3,Student,False,2011-02-16 08:57:58.287 +4827,346,3,Nice Answer,False,2011-02-16 11:22:58.363 +4842,1805,3,Critic,False,2011-02-16 19:12:58.983 +4843,2081,3,Student,False,2011-02-16 19:47:58.940 +4844,1895,3,Quorum,False,2011-02-16 20:01:48.330 +4846,1923,3,Quorum,False,2011-02-16 21:01:49.343 +4853,1927,3,Teacher,False,2011-02-17 01:03:00.277 +4854,1895,2,Enlightened,False,2011-02-17 02:58:00.240 +4855,2085,3,Autobiographer,False,2011-02-17 05:07:59.970 +4860,1923,3,Promoter,False,2011-02-17 10:48:00.330 +4864,190,2,Necromancer,False,2011-02-17 12:48:01.257 +4866,1691,3,Teacher,False,2011-02-17 16:13:01.037 +4886,503,3,Quorum,False,2011-02-18 17:02:09.167 +4888,1406,3,Tag Editor,False,2011-02-18 17:58:06.293 +4894,1831,3,Teacher,False,2011-02-19 11:38:08.373 +4895,1831,3,Editor,False,2011-02-19 13:03:08.460 +4896,1889,3,Editor,False,2011-02-19 19:18:08.203 +4901,1889,3,Supporter,False,2011-02-20 00:38:08.980 +4910,1691,3,Promoter,False,2011-02-20 16:53:18.457 +4912,2121,3,Precognitive,False,2011-02-20 17:02:09.227 +4923,155,2,Strunk & White,False,2011-02-21 03:33:40.633 +4927,1923,3,Scholar,False,2011-02-21 08:48:42.020 +4934,786,3,Supporter,False,2011-02-21 15:18:42.033 +4935,786,3,Scholar,False,2011-02-21 15:18:42.063 +4960,166,3,Popular Question,False,2011-02-22 19:33:56.980 +4974,132,3,Nice Answer,False,2011-02-23 09:13:58.697 +4978,132,2,Enlightened,False,2011-02-23 12:28:59.290 +4981,2149,3,Teacher,False,2011-02-23 13:38:58.953 +4984,1359,3,Nice Answer,False,2011-02-23 15:58:58.967 +4985,1691,3,Commentator,False,2011-02-23 16:13:58.917 +4986,1359,2,Enlightened,False,2011-02-23 17:08:59.097 +4995,211,3,Nice Answer,False,2011-02-23 21:14:00.360 +4997,211,2,Enlightened,False,2011-02-23 
22:29:01.453 +5019,2164,3,Autobiographer,False,2011-02-24 18:39:04.000 +5020,2164,3,Editor,False,2011-02-24 18:44:04.053 +5021,155,3,Popular Question,False,2011-02-24 18:44:04.197 +5043,1741,3,Teacher,False,2011-02-25 12:39:18.093 +5044,1741,3,Supporter,False,2011-02-25 12:44:18.880 +5045,674,3,Nice Answer,False,2011-02-25 12:49:19.093 +5046,674,2,Enlightened,False,2011-02-25 13:09:19.210 +5056,287,3,Nice Answer,False,2011-02-25 18:49:18.993 +5061,1889,3,Commentator,False,2011-02-26 01:54:18.790 +5075,1895,3,Nice Answer,False,2011-02-26 17:55:05.167 +5077,793,3,Critic,False,2011-02-26 21:55:09.007 +5078,1895,2,Enthusiast,False,2011-02-27 02:20:09.763 +5079,668,3,Nice Answer,False,2011-02-27 06:45:18.343 +5080,668,2,Enlightened,False,2011-02-27 06:45:18.763 +5085,186,3,Nice Question,False,2011-02-27 11:00:18.417 +5107,1809,3,Editor,False,2011-02-28 12:35:27.090 +5110,1809,3,Commentator,False,2011-02-28 14:25:27.120 +5111,2198,3,Teacher,False,2011-02-28 14:40:27.077 +5116,4,3,Popular Question,False,2011-02-28 18:55:27.007 +5126,1889,2,Enthusiast,False,2011-03-01 01:10:41.223 +5151,1209,3,Nice Question,False,2011-03-02 06:20:58.910 +5155,668,3,Nice Answer,False,2011-03-02 08:15:59.163 +5156,668,2,Enlightened,False,2011-03-02 09:15:59.377 +5164,1831,3,Organizer,False,2011-03-02 16:00:58.910 +5165,2217,3,Autobiographer,False,2011-03-02 16:30:59.003 +5169,1930,3,Nice Question,False,2011-03-03 10:46:13.210 +5177,2164,3,Teacher,False,2011-03-03 16:56:13.093 +5183,855,3,Editor,False,2011-03-03 18:06:12.923 +5184,2164,3,Supporter,False,2011-03-03 19:21:13.237 +5186,4,3,Nice Question,False,2011-03-03 20:21:12.957 +5197,287,3,Nice Question,False,2011-03-04 14:26:19.183 +5199,668,2,Pundit,False,2011-03-04 15:56:19.057 +5223,503,3,Editor,False,2011-03-05 11:31:26.267 +5231,668,3,Nice Answer,False,2011-03-06 11:52:10.737 +5234,668,2,Enlightened,False,2011-03-06 13:37:12.157 +5235,2069,3,Supporter,False,2011-03-06 16:42:13.150 +5238,674,3,Nice Answer,False,2011-03-06 21:17:13.153 +5242,503,3,Commentator,False,2011-03-07 01:02:12.953 +5261,668,3,Cleanup,False,2011-03-07 15:37:14.160 +5272,1209,3,Nice Question,False,2011-03-07 21:57:14.800 +5273,60,3,Nice Answer,False,2011-03-07 21:57:14.910 +5277,668,3,Nice Answer,False,2011-03-07 22:32:15.897 +5287,60,3,Nice Answer,False,2011-03-08 01:27:15.987 +5295,1406,3,Cleanup,False,2011-03-08 08:22:15.763 +5309,668,3,Nice Answer,False,2011-03-08 13:42:30.173 +5320,2149,3,Autobiographer,False,2011-03-08 21:38:02.150 +5346,346,3,Popular Question,False,2011-03-09 16:38:44.563 +5355,668,3,Nice Answer,False,2011-03-10 04:00:23.780 +5368,1831,3,Commentator,False,2011-03-10 15:45:42.193 +5370,1406,2,Strunk & White,False,2011-03-10 17:25:42.663 +5378,668,2,Enlightened,False,2011-03-10 22:00:43.377 +5405,793,3,Commentator,False,2011-03-12 02:05:58.303 +5445,449,3,Nice Question,False,2011-03-14 05:06:08.703 +5446,211,3,Nice Question,False,2011-03-14 08:11:08.250 +5455,2081,3,Teacher,False,2011-03-14 13:01:08.313 +5461,2081,3,Commentator,False,2011-03-14 17:21:08.423 +5466,2149,3,Supporter,False,2011-03-14 19:26:08.800 +5480,60,3,Nice Answer,False,2011-03-15 14:35:06.913 +5490,674,3,Nice Answer,False,2011-03-15 21:35:08.453 +5495,1406,1,Fanatic,False,2011-03-16 05:00:19.543 +5498,1406,3,Nice Answer,False,2011-03-16 08:00:21.710 +5501,2149,3,Editor,False,2011-03-16 11:10:21.500 +5511,4,2,Good Question,False,2011-03-16 22:15:22.330 +5516,2352,3,Teacher,False,2011-03-16 23:25:24.100 +5519,2352,3,Editor,False,2011-03-17 00:10:24.247 +5540,4,3,Nice Question,False,2011-03-17 
19:40:59.723 +5543,2352,3,Supporter,False,2011-03-17 23:50:59.523 +5546,2352,3,Student,False,2011-03-18 05:51:03.430 +5573,503,3,Nice Answer,False,2011-03-19 15:26:17.360 +5588,1889,3,Quorum,False,2011-03-19 22:32:46.573 +5599,1927,3,Critic,False,2011-03-20 15:21:26.827 +5618,2149,3,Revival,False,2011-03-21 13:41:26.537 +5626,2217,3,Supporter,False,2011-03-21 21:41:38.450 +5652,1889,3,Revival,False,2011-03-22 21:47:56.077 +5653,2149,3,Commentator,False,2011-03-22 21:57:55.850 +5655,1359,3,Promoter,False,2011-03-22 22:37:55.613 +5661,2149,3,Critic,False,2011-03-23 09:19:42.093 +5678,1831,3,Critic,False,2011-03-24 08:19:45.230 +5697,166,3,Popular Question,False,2011-03-25 08:29:52.623 +5725,1927,2,Enthusiast,False,2011-03-27 06:10:26.770 +5729,668,3,Nice Answer,False,2011-03-27 11:45:26.750 +5733,1085,3,Quorum,False,2011-03-28 01:38:19.057 +5734,2149,2,Enthusiast,False,2011-03-28 03:10:26.293 +5799,211,3,Popular Question,False,2011-03-31 19:24:23.473 +5804,2490,3,Student,False,2011-04-01 05:59:59.367 +5818,2490,3,Supporter,False,2011-04-01 21:28:53.403 +5819,2490,3,Scholar,False,2011-04-01 21:28:53.433 +5824,1889,3,Student,False,2011-04-02 02:58:54.167 +5830,1889,3,Scholar,False,2011-04-02 10:48:54.223 +5851,132,3,r,True,2011-04-04 03:00:05.207 +5861,2352,3,Commentator,False,2011-04-04 19:04:24.277 +5863,1741,3,Editor,False,2011-04-04 20:29:24.030 +5874,2420,3,Student,False,2011-04-05 12:55:30.277 +5886,2420,3,Supporter,False,2011-04-05 20:30:34.443 +5899,750,2,Convention,False,2011-04-06 05:39:01.183 +5900,2149,2,Necromancer,False,2011-04-06 06:25:35.510 +5920,1322,3,Popular Question,False,2011-04-06 20:45:42.460 +5924,346,2,Necromancer,False,2011-04-06 21:55:42.497 +5929,232,2,Enthusiast,False,2011-04-07 02:05:42.090 +5951,651,3,Nice Answer,False,2011-04-07 20:55:45.367 +5996,674,3,Popular Question,False,2011-04-10 09:35:48.730 +6001,793,3,Nice Question,False,2011-04-10 13:10:48.170 +6007,132,3,Nice Answer,False,2011-04-10 17:20:49.260 +6012,1406,3,Investor,False,2011-04-11 08:40:48.140 +6013,1895,3,Nice Answer,False,2011-04-11 10:10:48.340 +6015,1895,2,Enlightened,False,2011-04-11 10:25:48.320 +6022,211,2,Taxonomist,False,2011-04-11 13:05:48.607 +6025,668,3,Nice Answer,False,2011-04-11 13:15:48.360 +6033,436,3,Promoter,False,2011-04-11 17:00:50.200 +6034,190,3,Nice Answer,False,2011-04-11 17:05:50.117 +6045,1927,3,Nice Question,False,2011-04-12 01:00:52.357 +6046,1895,3,Citizen Patrol,False,2011-04-12 02:45:52.290 +6064,1741,3,Cleanup,False,2011-04-12 18:05:52.753 +6066,1741,3,Commentator,False,2011-04-12 20:15:54.043 +6150,855,3,Quorum,False,2011-04-16 16:39:12.067 +6158,182,3,Popular Question,False,2011-04-17 15:36:19.353 +6159,1040,3,Supporter,False,2011-04-17 18:41:23.150 +6160,1040,3,Scholar,False,2011-04-17 18:41:23.180 +6163,674,2,Good Question,False,2011-04-17 22:06:23.533 +6165,793,3,Popular Question,False,2011-04-18 01:41:23.503 +6175,793,3,Nice Question,False,2011-04-18 12:01:25.340 +6182,568,3,Supporter,False,2011-04-18 20:56:34.100 +6193,221,2,Enthusiast,False,2011-04-19 07:01:34.273 +6198,674,3,Nice Answer,False,2011-04-19 11:31:34.500 +6199,674,2,Enlightened,False,2011-04-19 11:31:34.670 +6203,1895,3,Organizer,False,2011-04-19 13:06:34.373 +6204,169,3,Popular Question,False,2011-04-19 15:11:34.390 +6205,1895,3,Nice Answer,False,2011-04-19 15:21:34.347 +6207,674,3,Popular Question,False,2011-04-19 16:01:34.420 +6214,674,3,Nice Answer,False,2011-04-19 22:21:34.363 +6215,674,2,Enlightened,False,2011-04-19 22:31:34.350 +6229,750,3,Revival,False,2011-04-20 12:36:41.710 
+6231,2666,3,Teacher,False,2011-04-20 14:11:41.187 +6239,793,3,Nice Question,False,2011-04-20 23:46:41.203 +6253,1927,3,Nice Answer,False,2011-04-21 14:21:44.417 +6259,1927,2,Enlightened,False,2011-04-21 15:56:44.487 +6265,1322,3,Editor,False,2011-04-22 03:41:44.537 +6271,2690,3,Student,False,2011-04-22 19:46:59.173 +6275,155,2,Taxonomist,False,2011-04-23 00:16:59.227 +6277,1322,3,Supporter,False,2011-04-23 01:36:59.423 +6309,2690,3,Supporter,False,2011-04-25 08:02:07.093 +6310,2690,3,Scholar,False,2011-04-25 08:02:07.140 +6327,674,2,r,True,2011-04-26 03:00:06.227 +6328,668,3,Nice Question,False,2011-04-26 03:42:08.170 +6329,1693,3,Organizer,False,2011-04-26 05:17:08.473 +6343,155,3,Nice Answer,False,2011-04-26 22:12:18.427 +6389,261,3,Nice Answer,False,2011-04-28 21:27:48.537 +6392,1406,3,regression,True,2011-04-29 03:00:04.867 +6399,221,2,Civic Duty,False,2011-04-29 09:33:06.280 +6406,155,3,Nice Answer,False,2011-04-29 15:03:12.200 +6421,132,3,Nice Answer,False,2011-04-30 06:08:14.543 +6423,2765,3,Teacher,False,2011-04-30 07:28:14.103 +6424,2420,3,Editor,False,2011-04-30 07:48:14.577 +6441,2802,3,Student,False,2011-05-01 13:33:20.077 +6444,2806,3,Supporter,False,2011-05-01 16:38:20.250 +6448,132,3,Popular Question,False,2011-05-01 19:13:21.743 +6450,2802,3,Supporter,False,2011-05-01 23:03:21.113 +6453,2802,3,Scholar,False,2011-05-01 23:38:24.127 +6459,2806,3,Scholar,False,2011-05-02 06:33:28.497 +6471,2806,3,Student,False,2011-05-02 17:18:31.937 +6472,4,3,Nice Question,False,2011-05-02 20:43:35.153 +6478,2765,3,Editor,False,2011-05-03 04:38:35.427 +6485,1575,3,Critic,False,2011-05-03 12:43:44.540 +6487,651,2,Necromancer,False,2011-05-03 13:08:44.690 +6492,1506,3,Commentator,False,2011-05-03 16:33:44.670 +6495,2765,3,Supporter,False,2011-05-03 17:48:46.743 +6496,1945,3,Supporter,False,2011-05-03 18:03:46.680 +6509,2666,3,Editor,False,2011-05-04 01:58:47.087 +6511,1895,3,Nice Answer,False,2011-05-04 03:43:47.113 +6514,2690,3,Editor,False,2011-05-04 06:13:48.673 +6541,793,2,Necromancer,False,2011-05-05 13:21:42.720 +6561,4,2,Notable Question,False,2011-05-05 21:01:47.267 +6562,1805,3,Autobiographer,False,2011-05-05 21:11:47.097 +6563,60,2,Good Question,False,2011-05-05 21:36:47.520 +6564,346,3,Nice Answer,False,2011-05-05 22:06:47.387 +6565,1805,3,Suffrage,False,2011-05-05 22:26:47.050 +6566,60,3,Popular Question,False,2011-05-05 23:16:47.050 +6588,132,3,Nice Question,False,2011-05-06 15:22:15.777 +6593,674,3,Nice Question,False,2011-05-06 15:52:19.297 +6604,1406,2,Convention,False,2011-05-06 17:41:16.243 +6626,2872,3,Student,False,2011-05-07 12:42:28.683 +6628,2105,3,Editor,False,2011-05-07 16:27:29.213 +6630,2105,3,Student,False,2011-05-07 18:47:29.083 +6632,1945,3,Tumbleweed,False,2011-05-07 19:22:29.897 +6639,2872,3,Nice Question,False,2011-05-07 22:57:31.953 +6642,1895,1,Fanatic,False,2011-05-08 00:32:32.693 +6646,190,3,Nice Question,False,2011-05-08 02:37:33.140 +6650,668,3,Nice Answer,False,2011-05-08 02:42:33.287 +6651,1805,2,Civic Duty,False,2011-05-08 02:47:33.483 +6654,2873,3,Teacher,False,2011-05-08 04:42:33.227 +6661,2666,3,Commentator,False,2011-05-08 14:27:34.047 +6664,2352,3,Organizer,False,2011-05-08 17:07:35.443 +6666,22,2,Taxonomist,False,2011-05-08 19:32:37.707 +6669,287,3,Nice Question,False,2011-05-08 21:12:40.837 +6679,2352,3,Nice Answer,False,2011-05-09 06:32:45.773 +6691,2873,3,Editor,False,2011-05-09 15:52:47.850 +6693,2690,3,Critic,False,2011-05-09 16:47:47.970 +6696,2873,3,Supporter,False,2011-05-09 17:52:48.213 +6709,166,2,Good Question,False,2011-05-10 
03:28:01.443 +6710,60,2,Notable Question,False,2011-05-10 03:33:01.367 +6715,1889,1,Fanatic,False,2011-05-10 09:08:04.190 +6721,1575,3,Commentator,False,2011-05-10 15:28:07.790 +6725,2915,3,Student,False,2011-05-10 18:43:22.473 +6729,2666,3,Supporter,False,2011-05-10 22:13:23.763 +6744,2915,3,Nice Question,False,2011-05-11 16:18:27.343 +6774,4,3,Nice Answer,False,2011-05-12 16:16:01.533 +6778,190,3,Nice Question,False,2011-05-12 17:01:02.903 +6783,211,3,Popular Question,False,2011-05-12 19:56:03.493 +6793,2765,3,Revival,False,2011-05-13 00:01:05.957 +6800,1693,3,Nice Answer,False,2011-05-13 12:46:15.643 +6801,1406,3,Nice Answer,False,2011-05-13 13:26:15.873 +6808,2765,3,Commentator,False,2011-05-13 18:11:18.613 +6809,2958,3,Editor,False,2011-05-13 18:21:18.330 +6826,2105,3,Scholar,False,2011-05-14 13:46:22.843 +6827,668,3,Nice Answer,False,2011-05-14 15:06:23.787 +6831,155,2,Good Question,False,2011-05-14 23:31:29.243 +6833,750,1,Electorate,False,2011-05-15 03:46:31.470 +6853,1406,3,Vox Populi,False,2011-05-16 11:16:47.540 +6865,2765,3,Critic,False,2011-05-16 16:51:57.100 +6873,1895,3,Nice Answer,False,2011-05-16 23:27:13.677 +6874,1895,2,Enlightened,False,2011-05-17 00:37:13.613 +6884,155,3,Investor,False,2011-05-17 06:02:15.170 +6885,1209,3,Popular Question,False,2011-05-17 06:17:14.710 +6901,1895,3,Vox Populi,False,2011-05-17 23:22:22.553 +6902,1895,3,Suffrage,False,2011-05-17 23:42:23.480 +6906,2765,3,Nice Answer,False,2011-05-18 07:42:33.183 +6909,1406,1,Electorate,False,2011-05-18 11:47:37.900 +6923,261,3,Nice Question,False,2011-05-19 05:47:45.293 +6930,2666,2,Enthusiast,False,2011-05-19 12:08:12.460 +6950,132,3,Citizen Patrol,False,2011-05-19 23:43:21.897 +6951,1805,3,Revival,False,2011-05-20 00:03:22.340 +6986,1927,3,Nice Answer,False,2011-05-21 01:53:36.720 +6997,2352,3,Critic,False,2011-05-21 17:53:40.680 +7004,651,3,Nice Answer,False,2011-05-22 01:49:02.447 +7018,3048,3,Student,False,2011-05-22 19:04:04.060 +7021,3048,3,Supporter,False,2011-05-22 20:24:04.490 +7022,3048,3,Scholar,False,2011-05-22 20:24:04.520 +7033,1322,3,Teacher,False,2011-05-23 08:59:12.607 +7039,155,1,Electorate,False,2011-05-23 12:59:16.220 +7063,166,3,Popular Question,False,2011-05-24 18:06:37.097 +7064,132,3,Popular Question,False,2011-05-24 18:16:37.240 +7074,1322,3,Commentator,False,2011-05-25 03:51:50.553 +7089,1945,3,Commentator,False,2011-05-25 22:52:08.653 +7098,728,3,Popular Question,False,2011-05-26 09:37:25.093 +7111,2873,3,Nice Answer,False,2011-05-26 19:47:52.217 +7114,2765,2,Enlightened,False,2011-05-26 20:52:54.450 +7118,668,3,Nice Answer,False,2011-05-26 22:47:57.120 +7128,4,3,Nice Answer,False,2011-05-27 10:38:04.963 +7130,190,2,Good Answer,False,2011-05-27 11:33:05.353 +7137,674,3,Revival,False,2011-05-27 15:38:06.500 +7147,1693,3,Promoter,False,2011-05-28 12:48:55.230 +7171,166,2,Taxonomist,False,2011-05-30 17:51:02.387 +7183,668,1,Fanatic,False,2011-05-31 04:11:52.053 +7188,132,3,Popular Question,False,2011-05-31 12:27:07.263 +7213,2873,3,Nice Answer,False,2011-06-01 10:58:29.863 +7222,674,3,Revival,False,2011-06-01 14:28:55.357 +7229,2873,3,Mortarboard,False,2011-06-01 18:28:59.740 +7230,1889,3,Nice Answer,False,2011-06-01 20:23:58.917 +7240,1889,2,Enlightened,False,2011-06-02 09:19:02.203 +7248,132,3,Nice Question,False,2011-06-02 13:59:15.210 +7265,1406,3,time-series,True,2011-06-03 03:00:06.227 +7270,3183,3,Teacher,False,2011-06-03 15:30:00.530 +7273,3183,3,Supporter,False,2011-06-03 18:50:39.937 +7274,3048,3,Editor,False,2011-06-03 19:10:40.940 
+7281,1693,2,Enthusiast,False,2011-06-04 02:40:56.813 +7282,668,3,hypothesis-testing,True,2011-06-04 03:00:07.020 +7314,211,3,Nice Question,False,2011-06-05 21:17:35.667 +7319,674,3,anova,True,2011-06-06 03:00:06.797 +7337,3185,3,Supporter,False,2011-06-06 23:23:43.900 +7352,169,3,Nice Question,False,2011-06-07 16:13:47.057 +7356,668,3,Nice Answer,False,2011-06-07 19:09:31.100 +7359,60,3,Nice Answer,False,2011-06-07 20:49:30.857 +7393,674,3,Revival,False,2011-06-08 16:49:48.180 +7399,3183,3,Critic,False,2011-06-08 22:55:29.187 +7400,166,3,Popular Question,False,2011-06-08 23:10:30.150 +7401,674,3,Revival,False,2011-06-09 01:55:32.290 +7409,155,3,Nice Answer,False,2011-06-09 12:20:51.407 +7410,155,2,Enlightened,False,2011-06-09 12:25:50.267 +7444,668,2,Enlightened,False,2011-06-10 21:38:15.423 +7446,668,3,Nice Answer,False,2011-06-11 06:28:24.113 +7448,1945,3,Editor,False,2011-06-11 14:18:33.407 +7460,674,3,Nice Answer,False,2011-06-12 06:20:43.457 +7461,3183,3,Student,False,2011-06-12 08:30:43.253 +7470,1945,3,Scholar,False,2011-06-12 16:45:44.707 +7475,1209,3,Popular Question,False,2011-06-13 03:00:57.340 +7480,2690,3,Commentator,False,2011-06-13 10:46:15.583 +7487,3183,3,Commentator,False,2011-06-13 22:46:44.210 +7491,3183,3,Editor,False,2011-06-14 03:31:46.070 +7492,3183,3,Promoter,False,2011-06-14 03:31:46.260 +7499,674,3,Nice Answer,False,2011-06-14 13:56:52.413 +7500,211,3,Disciplined,False,2011-06-14 13:56:52.507 +7504,60,3,Popular Question,False,2011-06-14 14:51:52.207 +7531,668,3,Nice Answer,False,2011-06-15 18:49:14.877 +7532,668,2,Enlightened,False,2011-06-15 18:49:15.187 +7558,2873,3,Commentator,False,2011-06-16 17:32:00.030 +7559,450,3,Cleanup,False,2011-06-16 17:37:00.077 +7587,855,3,Nice Answer,False,2011-06-18 01:48:22.037 +7602,287,2,Favorite Question,False,2011-06-19 06:24:34.213 +7609,2873,3,Nice Answer,False,2011-06-19 14:49:40.427 +7616,4,2,Good Answer,False,2011-06-19 21:35:12.317 +7618,2690,3,Tumbleweed,False,2011-06-20 06:05:43.320 +7632,3185,3,Editor,False,2011-06-20 16:45:50.170 +7663,3185,3,Student,False,2011-06-21 19:46:46.193 +7672,674,3,clustering,True,2011-06-22 03:00:07.687 +7685,1406,3,Nice Question,False,2011-06-22 09:17:45.803 +7687,60,2,Favorite Question,False,2011-06-22 10:12:46.487 +7688,155,2,Notable Question,False,2011-06-22 11:07:46.637 +7714,2873,3,Revival,False,2011-06-23 00:58:03.977 +7717,668,2,Good Answer,False,2011-06-23 08:22:50.440 +7721,651,3,Nice Answer,False,2011-06-23 14:17:53.537 +7731,668,3,Nice Answer,False,2011-06-24 03:12:59.537 +7732,668,2,Enlightened,False,2011-06-24 03:48:05.387 +7736,651,2,Enlightened,False,2011-06-24 08:53:37.607 +7754,1506,3,Critic,False,2011-06-25 16:04:49.193 +7758,4,2,Favorite Question,False,2011-06-26 04:45:24.567 +7759,2765,3,Student,False,2011-06-26 07:50:37.567 +7760,2081,3,Editor,False,2011-06-26 08:20:37.943 +7766,3185,3,Scholar,False,2011-06-26 14:15:54.793 +7771,2666,3,Revival,False,2011-06-26 22:31:55.570 +7773,1298,3,Student,False,2011-06-27 04:01:55.913 +7784,155,3,Popular Question,False,2011-06-27 15:07:05.890 +7804,668,2,Enlightened,False,2011-06-28 09:12:54.833 +7806,503,3,Autobiographer,False,2011-06-28 10:37:55.963 +7809,1298,3,Scholar,False,2011-06-28 13:48:30.253 +7815,1945,3,Promoter,False,2011-06-28 18:53:54.287 +7825,60,3,Nice Question,False,2011-06-29 03:39:02.547 +7830,261,3,Nice Question,False,2011-06-29 09:04:06.587 +7845,2857,3,Editor,False,2011-06-30 03:22:12.960 +7848,2857,3,Student,False,2011-06-30 06:52:46.330 +7884,2873,3,Critic,False,2011-07-01 16:23:21.613 +7893,651,3,Nice 
Answer,False,2011-07-02 11:19:07.513 +7906,155,3,Nice Answer,False,2011-07-03 14:10:18.263 +7937,2081,3,Organizer,False,2011-07-05 06:47:26.460 +7956,1805,3,Nice Question,False,2011-07-06 07:38:32.000 +7960,155,3,Nice Answer,False,2011-07-06 14:33:32.400 +7968,2121,3,Editor,False,2011-07-06 19:48:32.723 +7969,2121,3,Scholar,False,2011-07-06 19:58:32.637 +7970,2121,3,Student,False,2011-07-06 20:03:32.460 +7977,399,3,Nice Question,False,2011-07-06 23:13:33.437 +7980,2105,3,Supporter,False,2011-07-07 01:48:54.377 +7981,132,3,Nice Answer,False,2011-07-07 02:49:31.753 +7982,668,3,Nice Answer,False,2011-07-07 02:49:31.783 +7983,674,3,Nice Answer,False,2011-07-07 02:49:31.800 +7984,155,3,Nice Question,False,2011-07-07 02:49:32.017 +7985,211,3,Nice Question,False,2011-07-07 02:49:32.033 +7986,674,3,Revival,False,2011-07-07 03:04:30.723 +7990,668,2,Enlightened,False,2011-07-07 03:42:03.127 +8000,2873,2,Enthusiast,False,2011-07-07 18:32:39.387 +8007,449,3,Scholar,False,2011-07-07 20:52:39.017 +8010,1359,3,Nice Question,False,2011-07-07 21:37:38.867 +8012,503,2,Necromancer,False,2011-07-08 00:42:39.507 +8018,399,3,Nice Answer,False,2011-07-08 06:27:55.653 +8021,674,3,Synonymizer,False,2011-07-08 08:13:07.453 +8025,132,3,Synonymizer,False,2011-07-08 08:13:07.453 +8023,668,3,Synonymizer,False,2011-07-08 08:13:07.453 +8077,1428,2,Necromancer,False,2011-07-11 06:41:16.403 +8089,1927,3,Nice Answer,False,2011-07-11 17:21:32.463 +8129,60,3,Popular Question,False,2011-07-12 21:52:47.537 +8161,2149,3,time-series,True,2011-07-14 03:00:06.873 +8163,2081,3,Supporter,False,2011-07-14 06:12:33.607 +8175,668,3,Nice Answer,False,2011-07-14 16:57:51.103 +8185,3641,3,Student,False,2011-07-15 11:37:50.330 +8198,2352,3,Scholar,False,2011-07-15 18:02:50.630 +8224,3641,3,Editor,False,2011-07-17 14:15:05.183 +8229,1889,3,Critic,False,2011-07-17 22:55:06.050 +8252,132,3,Nice Answer,False,2011-07-18 20:26:09.383 +8267,3693,3,Student,False,2011-07-19 08:22:01.660 +8269,221,3,Promoter,False,2011-07-19 08:57:01.447 +8277,2081,3,Scholar,False,2011-07-19 12:47:13.180 +8278,287,3,Nice Answer,False,2011-07-19 12:52:13.180 +8280,287,2,Enlightened,False,2011-07-19 13:12:13.397 +8284,1636,3,Supporter,False,2011-07-19 15:17:13.823 +8304,4,2,Yearling,False,2011-07-20 00:02:13.703 +8326,22,2,Yearling,False,2011-07-20 00:02:13.703 +8317,60,2,Yearling,False,2011-07-20 00:02:13.703 +8311,114,2,Yearling,False,2011-07-20 00:02:13.703 +8314,132,2,Yearling,False,2011-07-20 00:02:13.703 +8350,3693,3,Supporter,False,2011-07-20 05:48:14.950 +8358,3662,3,Teacher,False,2011-07-20 17:53:24.797 +8379,155,2,Yearling,False,2011-07-21 00:03:32.127 +8385,169,2,Yearling,False,2011-07-21 00:03:32.127 +8371,182,2,Yearling,False,2011-07-21 00:03:32.127 +8369,190,2,Yearling,False,2011-07-21 00:03:32.127 +8368,192,2,Yearling,False,2011-07-21 00:03:32.127 +8383,166,2,Yearling,False,2011-07-21 00:03:32.127 +8396,668,3,Nice Answer,False,2011-07-21 10:39:39.233 +8402,2069,3,Commentator,False,2011-07-21 17:34:43.727 +8408,211,2,Yearling,False,2011-07-22 00:19:47.690 +8417,1927,3,Nice Answer,False,2011-07-22 06:15:32.057 +8421,793,3,Nice Question,False,2011-07-22 12:40:36.137 +8422,3731,3,Student,False,2011-07-22 14:10:36.543 +8423,3641,3,Tumbleweed,False,2011-07-22 15:15:36.750 +8438,2765,3,Promoter,False,2011-07-22 23:00:38.487 +8441,221,2,Yearling,False,2011-07-23 00:00:39.870 +8443,186,3,Popular Question,False,2011-07-23 02:00:40.507 +8446,3693,3,Editor,False,2011-07-23 13:32:06.420 +8448,3662,3,Editor,False,2011-07-23 17:03:12.900 +8470,1693,3,Citizen 
Patrol,False,2011-07-24 23:30:36.450 +8474,2958,3,Teacher,False,2011-07-25 01:15:40.547 +8477,2081,2,Enthusiast,False,2011-07-25 06:00:49.020 +8483,3733,3,Editor,False,2011-07-25 11:55:50.930 +8488,668,3,Nice Answer,False,2011-07-25 14:30:51.110 +8489,668,2,Enlightened,False,2011-07-25 14:50:51.070 +8494,3733,3,Student,False,2011-07-25 16:46:37.837 +8498,1406,2,Enlightened,False,2011-07-25 22:26:55.350 +8499,1406,3,Nice Answer,False,2011-07-25 22:26:55.413 +8501,793,2,Enthusiast,False,2011-07-25 23:06:55.283 +8520,3580,3,Supporter,False,2011-07-26 15:42:05.897 +8521,1636,3,Editor,False,2011-07-26 17:42:05.593 +8523,628,2,Favorite Question,False,2011-07-26 19:52:07.370 +8524,2765,3,Nice Question,False,2011-07-26 20:02:07.563 +8525,450,3,Revival,False,2011-07-26 20:12:07.790 +8543,261,2,Yearling,False,2011-07-27 00:02:09.043 +8542,232,2,Yearling,False,2011-07-27 00:02:09.043 +8532,287,2,Yearling,False,2011-07-27 00:02:09.043 +8558,855,3,Student,False,2011-07-27 14:42:30.920 +8565,2958,3,Supporter,False,2011-07-27 19:37:56.430 +8584,346,2,Yearling,False,2011-07-28 00:32:57.643 +8589,346,2,Notable Question,False,2011-07-28 05:07:58.557 +8602,155,3,Popular Question,False,2011-07-28 16:58:18.873 +8607,1790,3,Commentator,False,2011-07-28 21:08:41.587 +8611,155,2,Good Question,False,2011-07-28 23:43:45.257 +8617,399,2,Yearling,False,2011-07-29 00:03:45.333 +8618,449,3,Critic,False,2011-07-29 01:18:44.863 +8625,674,3,Nice Answer,False,2011-07-29 08:58:56.957 +8627,155,3,Revival,False,2011-07-29 09:28:58.207 +8636,503,2,Necromancer,False,2011-07-29 13:38:59.803 +8637,674,2,Necromancer,False,2011-07-29 13:38:59.817 +8644,450,2,Yearling,False,2011-07-30 00:04:25.280 +8643,449,2,Yearling,False,2011-07-30 00:04:25.280 +8647,436,2,Yearling,False,2011-07-30 00:04:25.280 +8651,674,1,Copy Editor,False,2011-07-30 11:14:31.863 +8668,3641,3,Commentator,False,2011-07-31 14:45:36.377 +8690,1895,2,Civic Duty,False,2011-08-01 17:56:19.977 +8710,651,3,Popular Question,False,2011-08-02 07:47:29.667 +8715,2958,3,Commentator,False,2011-08-02 12:37:34.237 +8731,668,3,Nice Answer,False,2011-08-02 19:02:38.673 +8738,169,3,Autobiographer,False,2011-08-02 21:12:41.137 +8739,674,3,Nice Answer,False,2011-08-02 21:22:41.393 +8771,1895,3,Nice Answer,False,2011-08-03 15:22:58.170 +8772,1895,2,Enlightened,False,2011-08-03 16:07:59.287 +8784,503,2,Yearling,False,2011-08-04 00:08:08.257 +8785,1930,2,Good Question,False,2011-08-04 00:38:07.370 +8794,1895,3,Nice Answer,False,2011-08-04 11:28:36.280 +8800,3868,3,Student,False,2011-08-04 14:58:45.950 +8801,1805,3,Organizer,False,2011-08-04 15:43:46.893 +8808,132,1,Constable,False,2011-08-04 20:31:50.113 +8828,60,3,Nice Answer,False,2011-08-05 09:14:18.137 +8829,651,3,Nice Answer,False,2011-08-05 09:14:18.137 +8841,3580,3,Teacher,False,2011-08-05 17:35:50.533 +8844,750,3,Announcer,False,2011-08-05 19:15:52.570 +8848,155,3,r,True,2011-08-06 03:00:07.567 +8850,346,2,Necromancer,False,2011-08-06 03:26:09.180 +8855,3662,3,Scholar,False,2011-08-06 09:46:10.553 +8856,3662,3,Supporter,False,2011-08-06 12:11:10.637 +8865,346,3,Nice Answer,False,2011-08-06 23:13:20.170 +8867,346,2,Enlightened,False,2011-08-06 23:18:23.347 +8874,3641,3,Scholar,False,2011-08-07 09:03:26.613 +8879,594,2,Yearling,False,2011-08-08 00:08:53.370 +8885,3894,3,Editor,False,2011-08-08 10:54:29.223 +8890,3922,3,Autobiographer,False,2011-08-08 17:34:31.927 +8898,668,3,Nice Answer,False,2011-08-08 21:59:41.460 +8900,3922,3,Teacher,False,2011-08-08 22:39:42.927 +8902,668,2,Enlightened,False,2011-08-09 00:44:46.607 
+8914,221,2,Necromancer,False,2011-08-09 13:25:26.423 +8924,750,3,Tag Editor,False,2011-08-09 18:40:32.837 +8930,3922,3,Commentator,False,2011-08-10 01:25:32.073 +8943,3641,3,Supporter,False,2011-08-10 12:30:47.527 +8946,793,3,Nice Question,False,2011-08-10 13:50:47.317 +8947,3662,3,Student,False,2011-08-10 13:55:47.577 +8951,3922,3,Supporter,False,2011-08-10 14:30:47.563 +8954,186,2,Taxonomist,False,2011-08-10 15:40:49.667 +8959,3922,3,Organizer,False,2011-08-10 17:05:49.770 +8971,4,1,Constable,False,2011-08-10 19:08:14.420 +8973,3922,3,Editor,False,2011-08-10 19:30:51.420 +8974,169,3,Nice Answer,False,2011-08-10 20:25:51.423 +8978,3733,3,Supporter,False,2011-08-10 21:10:53.550 +8981,3922,3,Tag Editor,False,2011-08-10 22:15:54.803 +8983,628,2,Yearling,False,2011-08-11 00:00:57.097 +8986,155,3,Nice Question,False,2011-08-11 02:00:59.557 +8990,436,2,Civic Duty,False,2011-08-11 06:31:17.043 +9008,3922,3,Nice Answer,False,2011-08-12 00:01:45.177 +9014,2873,3,Nice Answer,False,2011-08-12 05:27:23.647 +9016,2081,3,Critic,False,2011-08-12 07:42:29.450 +9020,2081,3,Nice Answer,False,2011-08-12 10:52:30.090 +9029,674,2,Necromancer,False,2011-08-12 14:32:35.883 +9037,211,3,Nice Question,False,2011-08-12 20:22:47.373 +9044,651,2,Yearling,False,2011-08-13 00:12:55.640 +9042,658,2,Yearling,False,2011-08-13 00:12:55.640 +9050,674,3,Nice Answer,False,2011-08-13 13:03:30.203 +9059,674,2,Yearling,False,2011-08-14 00:03:37.163 +9058,668,2,Yearling,False,2011-08-14 00:03:37.163 +9068,2081,3,Nice Answer,False,2011-08-14 14:23:49.747 +9071,668,2,Necromancer,False,2011-08-14 15:33:51.370 +9072,674,3,Vox Populi,False,2011-08-14 15:43:52.070 +9079,2666,3,Nice Answer,False,2011-08-14 23:04:18.823 +9088,1406,3,Excavator,False,2011-08-15 09:39:49.343 +9094,132,3,Excavator,False,2011-08-15 09:39:49.360 +9092,674,3,Excavator,False,2011-08-15 09:39:49.360 +9095,155,3,Excavator,False,2011-08-15 09:39:49.360 +9091,1693,3,Excavator,False,2011-08-15 09:39:49.360 +9103,668,3,Excavator,False,2011-08-15 09:39:49.377 +9104,3999,3,Teacher,False,2011-08-15 11:04:56.430 +9105,3999,3,Revival,False,2011-08-15 13:14:56.657 +9115,3999,3,Supporter,False,2011-08-15 21:10:24.530 +9116,3999,3,Citizen Patrol,False,2011-08-15 21:15:23.117 +9125,3999,3,Revival,False,2011-08-16 06:55:31.557 +9129,3999,3,Autobiographer,False,2011-08-16 08:15:31.070 +9132,3999,3,Student,False,2011-08-16 09:05:32.940 +9140,166,3,Nice Question,False,2011-08-16 13:45:49.067 +9144,2666,3,Nice Answer,False,2011-08-16 14:20:50.227 +9179,2666,3,Quorum,False,2011-08-17 13:25:20.233 +9185,3999,3,Editor,False,2011-08-17 16:56:23.973 +9188,155,3,Nice Question,False,2011-08-17 19:36:24.007 +9189,3733,3,Scholar,False,2011-08-17 20:16:24.823 +9193,3999,3,Critic,False,2011-08-17 23:56:26.977 +9194,3999,3,Commentator,False,2011-08-18 00:01:26.953 +9198,60,2,Necromancer,False,2011-08-18 01:36:27.143 +9202,1124,3,Student,False,2011-08-18 06:05:17.167 +9205,3999,3,Scholar,False,2011-08-18 08:30:47.350 +9213,1124,3,Scholar,False,2011-08-18 10:41:07.807 +9214,1124,3,Supporter,False,2011-08-18 10:41:07.933 +9218,1406,3,Revival,False,2011-08-18 12:36:13.440 +9219,232,3,Citizen Patrol,False,2011-08-18 14:21:14.137 +9228,1805,3,Nice Answer,False,2011-08-18 18:01:22.343 +9229,1805,2,Enlightened,False,2011-08-18 18:36:23.080 +9234,1790,3,Nice Question,False,2011-08-18 20:46:27.313 +9241,1322,3,Citizen Patrol,False,2011-08-19 06:56:35.830 +9245,1124,3,Nice Question,False,2011-08-19 14:22:22.520 +9257,728,2,Yearling,False,2011-08-20 00:02:35.730 +9262,169,3,Commentator,False,2011-08-20 
10:32:41.983 +9269,1790,3,Teacher,False,2011-08-20 20:38:00.603 +9277,155,3,Nice Answer,False,2011-08-21 01:53:04.443 +9279,793,2,Necromancer,False,2011-08-21 03:53:09.253 +9304,155,3,regression,True,2011-08-22 03:00:07.260 +9320,1691,3,Critic,False,2011-08-22 13:58:55.260 +9331,750,2,Yearling,False,2011-08-23 00:04:24.593 +9335,3922,3,Citizen Patrol,False,2011-08-23 03:09:24.207 +9346,793,2,Necromancer,False,2011-08-23 12:19:37.187 +9363,3733,3,Commentator,False,2011-08-23 22:24:54.697 +9368,3733,3,Teacher,False,2011-08-24 03:40:10.923 +9383,2666,3,Nice Answer,False,2011-08-24 14:46:39.507 +9385,450,3,Nice Answer,False,2011-08-24 15:16:40.747 +9388,450,2,Enlightened,False,2011-08-24 16:21:42.637 +9399,2121,3,Teacher,False,2011-08-24 21:36:49.617 +9414,182,3,Nice Answer,False,2011-08-25 17:17:16.153 +9415,668,3,Nice Answer,False,2011-08-25 17:17:16.153 +9417,668,3,Revival,False,2011-08-25 18:12:18.437 +9434,2765,3,Nice Answer,False,2011-08-26 16:38:02.180 +9438,750,2,Sportsmanship,False,2011-08-26 17:43:12.277 +9454,169,2,Good Question,False,2011-08-27 19:20:04.523 +9456,793,2,Yearling,False,2011-08-28 00:00:14.050 +9469,132,2,Notable Question,False,2011-08-28 18:56:01.613 +9480,2765,2,Enthusiast,False,2011-08-29 07:31:16.593 +9482,1406,3,Analytical,False,2011-08-29 10:06:17.573 +9514,668,3,Nice Answer,False,2011-08-30 12:52:04.063 +9517,2161,3,Student,False,2011-08-30 15:27:07.833 +9518,2161,3,Editor,False,2011-08-30 16:17:08.267 +9523,2071,3,Teacher,False,2011-08-30 18:02:12.963 +9543,1406,3,Autobiographer,False,2011-08-31 12:22:58.130 +9544,3999,3,Nice Question,False,2011-08-31 12:27:59.557 +9549,2161,3,Scholar,False,2011-08-31 12:53:00.063 +9551,155,3,Nice Answer,False,2011-08-31 13:28:00.160 +9573,3999,3,Organizer,False,2011-08-31 23:19:21.297 +9587,668,3,Nice Answer,False,2011-09-01 11:14:32.673 +9588,2161,3,Supporter,False,2011-09-01 11:44:33.857 +9601,4221,3,Student,False,2011-09-01 15:14:38.553 +9602,169,3,Nice Question,False,2011-09-01 15:24:37.833 +9606,1927,3,Nice Answer,False,2011-09-01 17:14:41.430 +9607,1927,2,Enlightened,False,2011-09-01 17:24:41.350 +9625,2161,3,Commentator,False,2011-09-02 16:02:47.420 +9629,1322,2,Notable Question,False,2011-09-02 19:37:52.233 +9644,3999,3,Revival,False,2011-09-03 11:03:16.520 +9653,399,2,Good Answer,False,2011-09-03 22:33:30.893 +9658,855,2,Yearling,False,2011-09-04 00:08:36.323 +9667,1693,2,Civic Duty,False,2011-09-04 16:49:11.353 +9675,3662,3,Revival,False,2011-09-05 08:53:28.823 +9677,3662,3,Commentator,False,2011-09-05 09:38:27.983 +9678,2121,3,Quorum,False,2011-09-05 10:42:59.313 +9685,22,3,Organizer,False,2011-09-05 13:28:31.460 +9688,1805,2,Necromancer,False,2011-09-05 15:33:31.817 +9702,449,3,Nice Answer,False,2011-09-06 13:19:37.110 +9711,2121,3,Commentator,False,2011-09-06 20:35:04.057 +9721,3662,3,Organizer,False,2011-09-07 08:45:44.040 +9725,3894,3,Teacher,False,2011-09-07 12:10:50.977 +9732,855,3,Scholar,False,2011-09-07 18:36:52.467 +9755,674,3,Analytical,False,2011-09-08 15:07:15.087 +9757,22,3,Self-Learner,False,2011-09-08 15:27:23.070 +9771,155,2,Necromancer,False,2011-09-09 05:48:29.840 +9773,261,3,Nice Answer,False,2011-09-09 06:48:31.303 +9776,60,3,Nice Answer,False,2011-09-09 06:53:31.280 +9780,503,3,Nice Answer,False,2011-09-09 12:23:56.850 +9782,3999,3,Quorum,False,2011-09-09 15:47:09.780 +9786,155,3,Nice Answer,False,2011-09-09 16:04:03.450 +9790,2873,3,Organizer,False,2011-09-09 17:59:07.303 +9801,668,3,normal-distribution,True,2011-09-10 03:00:07.650 +9804,449,3,Revival,False,2011-09-10 09:30:30.153 
+9817,503,3,Critic,False,2011-09-10 22:27:07.513 +9826,22,3,Promoter,False,2011-09-11 13:33:04.953 +9830,4320,3,Editor,False,2011-09-11 16:23:10.460 +9831,4320,3,Teacher,False,2011-09-11 17:23:12.753 +9844,3999,3,Revival,False,2011-09-12 06:43:35.337 +9870,82,3,Promoter,False,2011-09-13 18:50:35.853 +9875,1145,3,Nice Answer,False,2011-09-13 20:05:53.423 +9876,346,3,Talkative,False,2011-09-13 20:52:18.167 +9883,1895,2,Strunk & White,False,2011-09-14 05:01:04.517 +9888,232,3,Revival,False,2011-09-14 13:36:06.940 +9889,3999,3,Revival,False,2011-09-14 14:06:07.843 +9893,3999,3,Revival,False,2011-09-14 16:56:13.363 +9903,82,3,Editor,False,2011-09-14 22:51:18.460 +9905,155,1,Copy Editor,False,2011-09-15 01:01:23.227 +9940,82,3,Benefactor,False,2011-09-15 20:52:15.377 +9965,947,2,Yearling,False,2011-09-17 00:18:01.807 +9971,346,3,Popular Question,False,2011-09-17 05:20:00.190 +9988,22,3,Benefactor,False,2011-09-18 18:30:59.267 +10001,668,2,Good Answer,False,2011-09-19 11:37:29.087 +10034,182,3,Popular Question,False,2011-09-20 16:24:23.450 +10054,786,3,Commentator,False,2011-09-21 13:05:03.297 +10059,155,3,Quorum,False,2011-09-21 15:01:17.317 +10076,3999,2,Necromancer,False,2011-09-22 01:56:27.840 +10078,371,2,Necromancer,False,2011-09-22 04:42:08.017 +10080,22,2,Good Answer,False,2011-09-22 05:02:14.910 +10081,60,2,Good Answer,False,2011-09-22 05:02:14.927 +10082,211,2,Good Answer,False,2011-09-22 05:02:14.927 +10099,166,3,Popular Question,False,2011-09-22 13:43:13.637 +10108,2873,3,Nice Answer,False,2011-09-22 16:13:18.557 +10123,166,2,Notable Question,False,2011-09-22 18:53:23.800 +10128,1506,3,Nice Answer,False,2011-09-22 20:13:26.900 +10135,60,3,Excavator,False,2011-09-22 23:43:31.300 +10136,60,2,Good Answer,False,2011-09-22 23:48:31.247 +10138,155,3,Nice Answer,False,2011-09-23 02:18:34.307 +10146,190,3,Revival,False,2011-09-23 13:09:21.227 +10165,132,2,Taxonomist,False,2011-09-23 21:14:39.213 +10176,449,3,Cleanup,False,2011-09-24 22:56:24.773 +10179,166,2,Notable Question,False,2011-09-25 05:07:18.680 +10180,211,3,Popular Question,False,2011-09-25 08:17:24.600 +10182,82,3,Commentator,False,2011-09-25 13:42:27.077 +10192,346,3,Nice Answer,False,2011-09-26 03:57:46.430 +10204,4537,3,Autobiographer,False,2011-09-26 20:23:27.567 +10272,1506,3,Nice Question,False,2011-09-28 22:06:23.793 +10276,668,3,time-series,True,2011-09-29 03:00:07.690 +10284,155,3,Revival,False,2011-09-29 12:46:52.967 +10288,1805,3,Excavator,False,2011-09-29 13:26:54.423 +10290,1145,3,Critic,False,2011-09-29 13:56:55.277 +10301,4,2,Notable Question,False,2011-09-29 22:47:21.480 +10307,668,3,Investor,False,2011-09-30 03:17:26.617 +10340,668,3,Nice Answer,False,2011-10-01 02:23:07.847 +10342,668,2,Enlightened,False,2011-10-01 03:38:10.343 +10344,503,2,Enthusiast,False,2011-10-01 11:08:42.517 +10354,3922,3,Announcer,False,2011-10-01 16:54:14.203 +10368,2873,3,Nice Answer,False,2011-10-02 09:49:47.297 +10381,2081,1,Fanatic,False,2011-10-03 01:05:47.707 +10396,155,3,Nice Question,False,2011-10-03 16:56:32.633 +10397,4656,3,Teacher,False,2011-10-03 17:06:33.117 +10402,114,3,Popular Question,False,2011-10-03 23:07:15.920 +10417,4656,3,Editor,False,2011-10-04 12:11:46.523 +10425,674,3,Nice Answer,False,2011-10-04 15:06:52.897 +10429,674,2,Enlightened,False,2011-10-04 16:36:55.293 +10466,182,3,Nice Question,False,2011-10-05 17:32:58.793 +10470,674,3,Proofreader,False,2011-10-05 18:58:01.977 +10481,1889,3,Nice Answer,False,2011-10-06 02:19:33.670 +10483,221,3,Analytical,False,2011-10-06 06:39:35.030 
+10490,4656,3,Supporter,False,2011-10-06 12:04:41.743 +10491,60,3,Nice Answer,False,2011-10-06 13:14:41.240 +10493,4,2,Favorite Question,False,2011-10-06 14:39:41.410 +10508,668,3,Altruist,False,2011-10-06 21:40:06.050 +10509,668,3,Analytical,False,2011-10-06 21:55:07.040 +10531,1406,3,Nice Answer,False,2011-10-07 13:20:43.173 +10534,399,2,Notable Question,False,2011-10-07 15:30:52.540 +10538,155,3,Popular Question,False,2011-10-07 21:31:06.417 +10543,3662,3,Critic,False,2011-10-08 01:21:13.060 +10590,1073,2,Yearling,False,2011-10-10 06:19:08.727 +10607,2490,3,Editor,False,2011-10-11 00:19:47.177 +10614,2121,3,Supporter,False,2011-10-11 05:15:28.287 +10639,2352,3,Citizen Patrol,False,2011-10-12 00:26:20.680 +10641,155,3,correlation,True,2011-10-12 03:00:08.980 +10648,4779,3,Teacher,False,2011-10-12 09:52:25.093 +10652,4656,3,Commentator,False,2011-10-12 11:52:26.283 +10656,3999,3,Nice Answer,False,2011-10-12 14:52:29.483 +10662,155,3,Nice Answer,False,2011-10-12 19:07:32.843 +10682,261,3,Nice Answer,False,2011-10-12 23:22:49.570 +10695,651,3,Nice Answer,False,2011-10-13 05:45:31.797 +10697,1085,2,Yearling,False,2011-10-13 06:40:38.777 +10728,1693,3,Nice Question,False,2011-10-13 21:01:33.580 +10740,155,3,Nice Answer,False,2011-10-14 07:12:35.290 +10745,2765,3,Nice Answer,False,2011-10-14 13:32:45.867 +10746,4318,3,Supporter,False,2011-10-14 13:32:46.270 +10751,450,3,Popular Question,False,2011-10-14 17:32:54.807 +10760,169,3,Popular Question,False,2011-10-14 22:43:00.690 +10770,371,2,Good Question,False,2011-10-15 14:15:42.600 +10774,668,3,Nice Answer,False,2011-10-15 16:21:20.057 +10776,668,2,Enlightened,False,2011-10-15 17:36:48.630 +10805,4656,3,Citizen Patrol,False,2011-10-16 21:24:31.233 +10814,4871,3,Teacher,False,2011-10-17 09:49:49.937 +10851,3999,3,Analytical,False,2011-10-17 22:15:05.603 +10852,4890,3,Student,False,2011-10-17 23:20:10.807 +10855,3999,3,Nice Question,False,2011-10-18 00:05:15.083 +10863,728,3,Nice Question,False,2011-10-18 07:50:44.273 +10869,3999,3,Nice Answer,False,2011-10-18 10:10:54.363 +10875,4890,3,Editor,False,2011-10-18 14:26:05.037 +10897,4911,3,Student,False,2011-10-18 22:06:18.103 +10899,793,2,Good Question,False,2011-10-19 02:26:22.660 +10921,211,3,Popular Question,False,2011-10-19 19:56:50.993 +10927,1805,3,Analytical,False,2011-10-19 21:02:00.940 +10929,1805,3,Vox Populi,False,2011-10-19 21:02:01.910 +10936,668,3,correlation,True,2011-10-20 03:00:05.797 +10969,2806,3,Teacher,False,2011-10-20 22:44:11.817 +10975,190,3,Nice Question,False,2011-10-21 10:06:22.430 +10982,211,3,Popular Question,False,2011-10-21 13:51:30.877 +10984,668,3,Proofreader,False,2011-10-21 14:46:32.887 +10998,668,3,Nice Answer,False,2011-10-21 20:06:42.660 +11009,1895,3,Nice Answer,False,2011-10-22 10:34:45.767 +11011,3999,3,Nice Answer,False,2011-10-22 14:04:52.160 +11014,658,2,Good Answer,False,2011-10-22 15:39:55.643 +11028,1889,3,Revival,False,2011-10-22 21:50:08.523 +11037,1412,3,Student,False,2011-10-23 04:50:54.223 +11038,1145,2,Yearling,False,2011-10-23 06:01:18.980 +11043,750,2,Deputy,False,2011-10-23 08:50:30.870 +11049,4,3,Popular Question,False,2011-10-23 17:36:38.467 +11069,1895,3,Nice Answer,False,2011-10-24 13:37:53.073 +11071,2765,3,Nice Answer,False,2011-10-24 14:38:04.293 +11072,2765,2,Enlightened,False,2011-10-24 15:08:09.707 +11089,1406,2,Sportsmanship,False,2011-10-25 09:36:53.027 +11094,3641,3,Promoter,False,2011-10-25 11:41:40.560 +11095,155,3,Popular Question,False,2011-10-25 11:46:40.550 +11123,668,3,Disciplined,False,2011-10-25 22:30:17.133 
+11126,728,2,Taxonomist,False,2011-10-25 22:45:17.727 +11127,2806,3,Critic,False,2011-10-25 23:20:18.157 +11135,2806,3,Commentator,False,2011-10-26 03:40:31.107 +11136,1895,2,Enlightened,False,2011-10-26 05:45:32.090 +11137,1895,3,Nice Answer,False,2011-10-26 05:45:32.243 +11146,5038,3,Student,False,2011-10-26 09:55:40.780 +11150,5038,3,Scholar,False,2011-10-26 13:09:06.600 +11171,4911,3,Scholar,False,2011-10-27 00:36:48.927 +11184,2806,3,Nice Question,False,2011-10-27 13:56:12.557 +11196,503,3,Nice Answer,False,2011-10-27 19:27:04.923 +11223,1412,3,Critic,False,2011-10-28 17:09:01.660 +11245,155,2,Good Question,False,2011-10-29 16:44:14.963 +11246,3999,2,Good Question,False,2011-10-29 16:44:14.963 +11249,2806,3,Editor,False,2011-10-29 20:49:20.347 +11255,1805,3,Nice Question,False,2011-10-30 06:51:59.853 +11259,4582,3,Teacher,False,2011-10-30 10:47:46.143 +11262,1693,3,Revival,False,2011-10-30 12:32:46.513 +11271,4656,3,Critic,False,2011-10-31 00:52:55.180 +11279,1927,1,Fanatic,False,2011-10-31 06:38:00.787 +11293,2121,3,Tumbleweed,False,2011-10-31 17:09:55.297 +11295,1889,3,Nice Answer,False,2011-10-31 17:50:06.807 +11308,3999,2,Necromancer,False,2011-11-01 03:00:50.430 +11313,3999,2,Enthusiast,False,2011-11-01 05:00:51.233 +11344,503,3,Nice Answer,False,2011-11-02 03:01:59.883 +11356,793,2,Necromancer,False,2011-11-02 11:17:03.573 +11376,1889,2,Necromancer,False,2011-11-03 00:52:56.357 +11390,1985,3,Scholar,False,2011-11-03 16:03:33.863 +11392,166,3,Popular Question,False,2011-11-03 17:23:57.243 +11409,668,3,Revival,False,2011-11-04 09:05:30.500 +11418,1805,3,Nice Answer,False,2011-11-04 14:45:33.133 +11423,155,3,Popular Question,False,2011-11-04 16:15:35.070 +11425,674,3,Nice Answer,False,2011-11-04 17:25:48.103 +11426,674,2,Enlightened,False,2011-11-04 17:30:48.143 +11428,450,2,Necromancer,False,2011-11-04 18:51:05.360 +11433,1209,2,Yearling,False,2011-11-04 21:56:40.653 +11434,1805,3,Popular Question,False,2011-11-04 22:01:40.630 +11443,5179,3,Teacher,False,2011-11-05 10:30:02.510 +11457,651,3,machine-learning,True,2011-11-06 03:00:11.357 +11458,1805,3,Nice Question,False,2011-11-06 03:00:30.310 +11460,5196,3,Supporter,False,2011-11-06 09:05:46.790 +11461,5196,3,Scholar,False,2011-11-06 11:20:50.927 +11462,5196,3,Editor,False,2011-11-06 11:25:50.217 +11465,3999,2,Necromancer,False,2011-11-06 16:26:01.873 +11468,5179,3,Editor,False,2011-11-06 17:51:04.217 +11476,5179,3,Supporter,False,2011-11-07 06:36:35.957 +11480,5211,3,Student,False,2011-11-07 09:02:00.913 +11481,5211,3,Supporter,False,2011-11-07 09:02:00.943 +11483,5211,3,Scholar,False,2011-11-07 11:12:02.373 +11490,261,2,Good Answer,False,2011-11-07 17:57:28.457 +11501,5179,3,Critic,False,2011-11-07 21:43:30.623 +11511,1406,3,Nice Answer,False,2011-11-08 13:34:07.493 +11512,4582,3,Student,False,2011-11-08 13:39:07.940 +11520,1895,3,Nice Answer,False,2011-11-08 20:24:43.750 +11538,5234,3,Student,False,2011-11-09 06:20:33.413 +11540,4871,3,Student,False,2011-11-09 07:40:34.703 +11542,668,3,Nice Answer,False,2011-11-09 10:10:36.567 +11548,2666,3,Critic,False,2011-11-09 14:00:44.240 +11551,5237,3,Editor,False,2011-11-09 15:25:47.373 +11552,5237,3,Scholar,False,2011-11-09 15:30:47.740 +11554,668,1,Copy Editor,False,2011-11-09 16:20:49.223 +11557,5179,3,Commentator,False,2011-11-09 16:40:49.213 +11559,5179,3,Analytical,False,2011-11-09 16:55:49.977 +11560,5179,3,Autobiographer,False,2011-11-09 17:00:50.030 +11567,1895,3,Excavator,False,2011-11-09 18:45:54.567 +11578,5237,3,Teacher,False,2011-11-10 06:51:23.787 
+11579,5249,3,Editor,False,2011-11-10 07:11:23.950 +11584,5249,3,Student,False,2011-11-10 13:31:24.497 +11588,668,3,Nice Answer,False,2011-11-10 14:01:25.303 +11591,399,3,Nice Answer,False,2011-11-10 14:46:25.480 +11594,1805,2,Good Question,False,2011-11-10 16:41:35.557 +11595,2420,3,Scholar,False,2011-11-10 17:56:55.957 +11603,1805,3,Revival,False,2011-11-11 02:13:48.380 +11607,5249,3,Teacher,False,2011-11-11 06:35:29.233 +11614,5237,3,Supporter,False,2011-11-11 16:50:54.397 +11620,2806,3,Nice Question,False,2011-11-11 18:40:54.137 +11621,668,3,Nice Answer,False,2011-11-11 18:45:54.253 +11627,3693,3,Scholar,False,2011-11-12 00:26:08.520 +11634,5273,3,Student,False,2011-11-12 12:58:32.480 +11637,5273,3,Editor,False,2011-11-12 17:05:21.110 +11639,5273,3,Scholar,False,2011-11-12 17:25:28.917 +11645,155,2,Sportsmanship,False,2011-11-12 22:36:32.987 +11671,5237,3,Commentator,False,2011-11-14 00:18:36.443 +11672,1322,2,Yearling,False,2011-11-14 03:03:42.670 +11686,1895,2,Pundit,False,2011-11-14 13:33:52.810 +11687,750,2,Necromancer,False,2011-11-14 14:03:52.607 +11694,2198,3,Editor,False,2011-11-14 16:34:15.987 +11697,4871,3,Supporter,False,2011-11-14 17:54:40.257 +11699,5273,3,Supporter,False,2011-11-14 18:39:56.270 +11702,2765,3,Nice Answer,False,2011-11-14 19:20:12.697 +11722,750,3,Excavator,False,2011-11-15 14:45:06.710 +11723,674,3,Nice Answer,False,2011-11-15 15:30:06.907 +11724,674,2,Enlightened,False,2011-11-15 15:35:06.810 +11728,5213,3,Editor,False,2011-11-15 16:35:15.267 +11737,5213,3,Student,False,2011-11-15 17:30:15.943 +11740,674,3,Nice Answer,False,2011-11-15 17:55:15.527 +11742,2015,3,Scholar,False,2011-11-15 18:15:15.850 +11746,674,3,Nice Answer,False,2011-11-15 20:05:18.730 +11747,449,3,Nice Answer,False,2011-11-15 20:10:18.790 +11752,132,2,Notable Question,False,2011-11-15 23:00:19.910 +11753,668,3,Nice Answer,False,2011-11-15 23:35:20.923 +11754,674,2,Pundit,False,2011-11-15 23:45:21.200 +11760,5249,3,Supporter,False,2011-11-16 04:25:34.653 +11783,1428,3,Nice Answer,False,2011-11-16 19:25:56.967 +11784,3999,2,Necromancer,False,2011-11-16 19:30:56.527 +11786,5273,3,Promoter,False,2011-11-16 20:45:56.553 +11790,346,3,Nice Question,False,2011-11-17 01:11:00.577 +11793,5237,3,Analytical,False,2011-11-17 03:51:00.577 +11794,668,3,Nice Answer,False,2011-11-17 04:31:00.900 +11795,169,3,Nice Question,False,2011-11-17 07:26:05.953 +11816,1359,2,Yearling,False,2011-11-17 23:53:38.750 +11818,1506,3,Self-Learner,False,2011-11-18 02:03:39.523 +11840,155,3,Nice Question,False,2011-11-18 16:10:23.973 +11855,5374,3,Student,False,2011-11-18 23:10:32.393 +11863,4656,3,Student,False,2011-11-19 09:30:38.977 +11864,5196,3,Student,False,2011-11-19 10:25:39.950 +11867,2217,3,Editor,False,2011-11-19 14:06:04.057 +11870,2217,3,Teacher,False,2011-11-19 14:16:04.767 +11877,4911,3,Supporter,False,2011-11-19 21:21:25.347 +11881,4656,3,Scholar,False,2011-11-20 02:01:36.350 +11883,5086,3,Student,False,2011-11-20 03:56:40.680 +11884,5086,3,Supporter,False,2011-11-20 03:56:40.743 +11885,668,3,Nice Answer,False,2011-11-20 04:41:40.847 +11892,3662,3,Excavator,False,2011-11-20 10:26:57.427 +11896,793,3,Popular Question,False,2011-11-20 15:12:41.097 +11900,5273,3,Benefactor,False,2011-11-20 21:53:54.653 +11906,3728,3,Supporter,False,2011-11-21 04:39:12.557 +11924,436,3,Popular Question,False,2011-11-21 13:49:30.490 +11932,4871,3,Promoter,False,2011-11-21 19:21:34.107 +11934,5374,3,Supporter,False,2011-11-21 19:46:45.973 +11956,803,3,Nice Answer,False,2011-11-22 14:40:59.933 
+11961,803,2,Enlightened,False,2011-11-22 15:41:01.763 +11962,155,3,Nice Question,False,2011-11-22 16:16:02.977 +11978,3999,3,Self-Learner,False,2011-11-22 20:11:10.823 +11979,1805,3,Popular Question,False,2011-11-22 20:36:10.327 +11991,211,3,Popular Question,False,2011-11-23 00:36:44.363 +11994,1790,3,Nice Question,False,2011-11-23 02:42:07.613 +12003,3446,3,Teacher,False,2011-11-23 11:58:09.537 +12018,5448,3,Teacher,False,2011-11-23 18:58:15.097 +12024,4871,3,Benefactor,False,2011-11-24 00:48:29.063 +12033,1406,2,Yearling,False,2011-11-24 09:54:23.273 +12035,1805,2,Necromancer,False,2011-11-24 10:09:22.783 +12048,1411,2,Yearling,False,2011-11-24 20:49:35.553 +12051,5480,3,Autobiographer,False,2011-11-24 21:49:36.403 +12052,5479,3,Student,False,2011-11-24 22:09:38.423 +12053,5480,3,Student,False,2011-11-24 22:14:38.530 +12054,65,2,Yearling,False,2011-11-24 22:14:38.840 +12055,1412,2,Yearling,False,2011-11-25 00:09:46.010 +12057,3728,3,Editor,False,2011-11-25 01:50:26.087 +12058,3728,3,Excavator,False,2011-11-25 03:15:44.237 +12099,5211,3,Editor,False,2011-11-26 04:58:07.877 +12110,1428,2,Yearling,False,2011-11-26 13:29:04.930 +12114,4871,3,Scholar,False,2011-11-26 21:05:12.173 +12139,5448,3,Editor,False,2011-11-28 02:56:19.663 +12141,668,3,Nice Answer,False,2011-11-28 05:01:20.540 +12143,668,2,Enlightened,False,2011-11-28 05:56:21.560 +12151,169,2,Notable Question,False,2011-11-28 12:46:34.493 +12155,793,3,Nice Question,False,2011-11-28 14:06:35.890 +12163,65,3,Autobiographer,False,2011-11-28 16:56:57.337 +12164,5448,3,Revival,False,2011-11-28 17:01:57.770 +12169,668,2,Enlightened,False,2011-11-28 18:01:59.647 +12177,166,3,Popular Question,False,2011-11-28 20:32:02.273 +12212,668,2,Good Answer,False,2011-11-29 21:28:08.787 +12217,5556,3,Teacher,False,2011-11-29 22:48:09.950 +12218,5237,3,Critic,False,2011-11-30 01:08:13.737 +12220,5448,3,Supporter,False,2011-11-30 03:38:19.127 +12244,5237,3,Organizer,False,2011-11-30 19:00:34.443 +12247,155,3,Nice Answer,False,2011-11-30 20:00:51.623 +12266,60,2,Good Answer,False,2011-12-01 17:10:18.663 +12269,674,3,Nice Answer,False,2011-12-01 18:00:20.060 +12272,82,2,Yearling,False,2011-12-01 19:25:23.810 +12279,5448,3,Commentator,False,2011-12-01 20:55:25.827 +12312,786,2,Yearling,False,2011-12-02 17:37:32.223 +12317,4871,3,Editor,False,2011-12-02 22:17:40.680 +12319,1805,2,Necromancer,False,2011-12-03 00:27:46.233 +12321,5448,3,Autobiographer,False,2011-12-03 02:07:48.277 +12324,5211,3,Tumbleweed,False,2011-12-03 04:42:54.913 +12353,2075,3,Scholar,False,2011-12-04 20:31:34.560 +12356,4656,2,Enthusiast,False,2011-12-05 02:36:45.657 +12357,5643,3,Scholar,False,2011-12-05 03:11:47.543 +12358,1930,3,Scholar,False,2011-12-05 03:16:47.527 +12361,5643,3,Student,False,2011-12-05 08:32:00.980 +12364,3662,3,Promoter,False,2011-12-05 11:32:08.640 +12370,3728,3,Student,False,2011-12-05 15:07:17.757 +12399,5671,3,Supporter,False,2011-12-06 09:03:43.773 +12400,5671,3,Teacher,False,2011-12-06 09:38:45.770 +12401,5671,3,Editor,False,2011-12-06 10:33:45.963 +12402,5671,3,Organizer,False,2011-12-06 10:33:46.307 +12407,5643,3,Editor,False,2011-12-06 11:48:49.017 +12416,5045,3,Student,False,2011-12-06 16:54:27.677 +12427,5179,3,Organizer,False,2011-12-07 06:52:22.523 +12428,668,3,Nice Answer,False,2011-12-07 10:12:27.690 +12429,211,3,Popular Question,False,2011-12-07 10:42:29.033 +12439,5448,3,Revival,False,2011-12-07 14:37:37.593 +12443,5556,3,Editor,False,2011-12-07 15:22:38.540 +12457,1805,2,Favorite Question,False,2011-12-07 23:37:58.180 
+12459,5237,2,Enthusiast,False,2011-12-08 03:23:37.800 +12482,4,2,Notable Question,False,2011-12-08 20:58:06.943 +12485,1805,3,Revival,False,2011-12-08 21:28:16.390 +12489,674,3,Nice Answer,False,2011-12-08 22:13:33.957 +12497,1506,2,Yearling,False,2011-12-09 01:54:31.310 +12498,2806,3,Tumbleweed,False,2011-12-09 08:14:40.477 +12502,3728,3,Scholar,False,2011-12-09 11:39:42.890 +12517,1359,3,Investor,False,2011-12-09 21:39:59.363 +12520,674,3,Nice Answer,False,2011-12-09 23:30:00.357 +12521,674,2,Enlightened,False,2011-12-10 00:55:15.467 +12544,1805,3,Revival,False,2011-12-11 08:07:43.143 +12549,668,3,Nice Answer,False,2011-12-11 14:19:26.093 +12554,1895,3,Nice Answer,False,2011-12-11 17:40:27.103 +12557,1895,2,Enlightened,False,2011-12-11 18:25:46.287 +12561,2666,3,Revival,False,2011-12-11 23:46:19.170 +12566,3662,3,Benefactor,False,2011-12-12 05:26:28.180 +12569,1805,3,Revival,False,2011-12-12 12:46:45.720 +12580,5448,3,Nice Answer,False,2011-12-12 18:12:45.063 +12584,5448,2,Enlightened,False,2011-12-12 19:38:04.650 +12588,211,3,Nice Question,False,2011-12-12 20:38:22.607 +12602,132,3,Nice Answer,False,2011-12-13 01:54:23.427 +12603,132,2,Enlightened,False,2011-12-13 02:44:23.997 +12609,5237,3,Nice Answer,False,2011-12-13 07:24:29.760 +12611,5237,2,Enlightened,False,2011-12-13 08:54:30.513 +12624,5556,3,Supporter,False,2011-12-13 16:09:46.783 +12630,5045,3,Teacher,False,2011-12-13 18:45:09.533 +12634,3662,2,Enthusiast,False,2011-12-13 21:15:43.447 +12635,4735,3,Scholar,False,2011-12-13 21:20:43.897 +12645,1895,2,Enlightened,False,2011-12-14 03:35:51.707 +12646,1895,3,Nice Answer,False,2011-12-14 03:35:51.953 +12660,1895,2,Deputy,False,2011-12-14 11:22:03.603 +12680,1406,2,Good Answer,False,2011-12-14 18:36:09.017 +12687,5821,3,Teacher,False,2011-12-15 00:56:29.417 +12690,674,2,Favorite Question,False,2011-12-15 04:26:45.213 +12691,155,2,Favorite Question,False,2011-12-15 04:26:45.213 +12699,5821,3,Revival,False,2011-12-15 10:36:49.740 +12713,5821,3,Supporter,False,2011-12-15 18:52:13.900 +12721,5237,3,Nice Answer,False,2011-12-15 23:43:28.020 +12756,1575,2,Yearling,False,2011-12-17 17:45:21.063 +12767,5875,3,Teacher,False,2011-12-18 13:51:21.490 +12773,5875,3,Supporter,False,2011-12-18 17:51:29.463 +12788,5875,3,Editor,False,2011-12-19 07:49:23.103 +12796,668,3,Nice Answer,False,2011-12-19 15:41:09.863 +12811,346,3,Nice Answer,False,2011-12-19 17:51:49.480 +12813,750,2,Strunk & White,False,2011-12-19 18:06:50.047 +12814,5671,3,Commentator,False,2011-12-19 18:16:55.900 +12817,5898,3,Commentator,False,2011-12-19 18:51:57.533 +12818,5898,3,Critic,False,2011-12-19 18:51:57.547 +12819,5898,3,Editor,False,2011-12-19 18:51:57.593 +12820,5898,3,Scholar,False,2011-12-19 18:51:57.860 +12821,5898,3,Student,False,2011-12-19 18:51:57.907 +12822,5898,3,Supporter,False,2011-12-19 18:51:57.923 +12823,5898,3,Teacher,False,2011-12-19 18:51:57.970 +12834,5898,3,Tumbleweed,False,2011-12-20 03:02:05.927 +12835,1959,3,Editor,False,2011-12-20 09:57:19.420 +12837,450,3,Citizen Patrol,False,2011-12-20 10:22:19.227 +12838,1959,3,Supporter,False,2011-12-20 10:27:19.617 +12840,1959,3,Scholar,False,2011-12-20 10:52:21.480 +12848,1959,3,Student,False,2011-12-20 13:57:53.673 +12851,5906,3,Editor,False,2011-12-20 14:57:53.690 +12852,2081,3,Nice Answer,False,2011-12-20 15:27:53.823 +12858,5906,3,Commentator,False,2011-12-20 16:52:53.880 +12865,1895,3,Nice Answer,False,2011-12-20 18:12:55.873 +12866,1895,2,Enlightened,False,2011-12-20 18:27:56.423 +12871,5821,3,Editor,False,2011-12-20 19:52:56.410 
+12884,633,3,Excavator,False,2011-12-21 11:03:53.187 +12889,5875,3,Nice Answer,False,2011-12-21 13:39:26.550 +12890,5917,3,Teacher,False,2011-12-21 14:09:34.903 +12903,2666,3,Revival,False,2011-12-21 17:50:41.100 +12913,5875,3,Commentator,False,2011-12-21 21:01:04.620 +12917,5821,3,Student,False,2011-12-21 23:21:04.740 +12918,182,2,Notable Question,False,2011-12-22 00:06:04.910 +12928,651,3,Nice Answer,False,2011-12-22 11:36:19.980 +12930,1412,3,Nice Answer,False,2011-12-22 14:41:41.407 +12947,2666,2,Good Answer,False,2011-12-23 10:52:21.670 +12952,5911,3,Student,False,2011-12-24 09:48:40.260 +12958,5821,3,Commentator,False,2011-12-24 23:29:04.513 +12959,2666,3,Revival,False,2011-12-25 01:54:10.860 +12970,132,3,Nice Answer,False,2011-12-25 23:00:55.257 +12971,5448,2,Enthusiast,False,2011-12-26 01:50:59.697 +12980,5448,3,Organizer,False,2011-12-26 22:27:50.880 +12981,1085,3,Popular Question,False,2011-12-26 22:57:50.590 +12997,4318,3,Student,False,2011-12-27 15:28:39.127 +12998,668,3,Nice Answer,False,2011-12-27 15:38:38.583 +13013,5875,3,Organizer,False,2011-12-28 09:24:19.990 +13014,211,3,Popular Question,False,2011-12-28 11:29:24.130 +13015,5875,3,Critic,False,2011-12-28 12:04:24.747 +13019,1636,2,Yearling,False,2011-12-28 15:00:03.053 +13020,155,2,Good Question,False,2011-12-28 15:35:04.763 +13029,651,3,Quorum,False,2011-12-28 20:23:02.330 +13037,1150,3,Supporter,False,2011-12-29 07:59:14.270 +13038,5637,3,Teacher,False,2011-12-29 08:29:15.340 +13040,5637,3,Editor,False,2011-12-29 13:09:23.550 +13052,5671,3,Autobiographer,False,2011-12-29 18:09:33.717 +13056,3662,3,Quorum,False,2011-12-30 00:23:12.580 +13059,287,2,Notable Question,False,2011-12-30 03:24:49.310 +13063,793,3,Nice Answer,False,2011-12-30 10:34:58.063 +13068,728,2,Enthusiast,False,2011-12-30 16:30:16.743 +13071,5637,3,Commentator,False,2011-12-30 16:55:16.657 +13076,2873,3,Nice Answer,False,2011-12-30 17:10:18.160 +13077,2873,2,Enlightened,False,2011-12-30 17:15:17.787 +13081,5911,3,Scholar,False,2011-12-30 19:15:18.753 +13107,132,2,Notable Question,False,2012-01-01 00:04:42.223 +13108,674,2,Good Question,False,2012-01-01 06:31:04.620 +13155,1150,3,Scholar,False,2012-01-03 05:10:54.107 +13161,5875,3,Citizen Patrol,False,2012-01-03 10:26:00.783 +13169,3922,3,Student,False,2012-01-03 15:36:05.533 +13171,3922,3,Scholar,False,2012-01-03 17:56:10.347 +13181,1805,1,Electorate,False,2012-01-03 21:36:22.287 +13184,155,2,Good Answer,False,2012-01-03 22:41:32.273 +13185,2666,3,Nice Answer,False,2012-01-03 23:21:34.830 +13190,5448,3,Nice Answer,False,2012-01-04 03:07:06.263 +13191,1406,2,Good Answer,False,2012-01-04 05:32:33.563 +13192,674,3,Nice Answer,False,2012-01-04 05:37:32.300 +13196,5637,3,Critic,False,2012-01-04 07:22:38.867 +13201,1691,2,Yearling,False,2012-01-04 13:57:40.163 +13209,1693,2,Yearling,False,2012-01-04 17:47:44.187 +13210,5637,3,Analytical,False,2012-01-04 18:52:44.393 +13211,132,3,Nice Answer,False,2012-01-04 18:52:44.860 +13219,651,3,Nice Answer,False,2012-01-05 00:37:57.120 +13222,651,2,Enlightened,False,2012-01-05 01:17:57.977 +13225,668,3,sampling,True,2012-01-05 03:00:15.523 +13228,166,2,Favorite Question,False,2012-01-05 05:13:19.013 +13231,2958,3,Student,False,2012-01-05 08:58:19.830 +13234,4318,3,Scholar,False,2012-01-05 12:43:24.190 +13237,5237,3,Nice Answer,False,2012-01-05 13:18:24.933 +13244,5987,3,Teacher,False,2012-01-05 15:28:41.947 +13246,114,3,Popular Question,False,2012-01-05 16:13:49.633 +13248,5661,3,Editor,False,2012-01-05 17:14:03.037 +13250,5661,3,Supporter,False,2012-01-05 17:19:11.167 
+13252,5448,3,Revival,False,2012-01-05 19:04:35.683 +13253,5984,3,Editor,False,2012-01-05 19:34:42.710 +13254,674,2,Favorite Question,False,2012-01-05 19:34:42.897 +13258,5984,3,Teacher,False,2012-01-05 22:55:10.910 +13266,5045,3,Scholar,False,2012-01-06 00:45:17.413 +13273,166,3,Nice Question,False,2012-01-06 11:30:49.587 +13274,1927,3,Citizen Patrol,False,2012-01-06 11:40:49.197 +13281,5448,2,Good Answer,False,2012-01-06 15:40:52.677 +13282,6136,3,Student,False,2012-01-06 15:55:53.930 +13283,3048,3,Promoter,False,2012-01-06 18:16:10.640 +13284,155,3,Popular Question,False,2012-01-06 19:06:20.657 +13287,22,3,Popular Question,False,2012-01-06 21:41:49.660 +13288,668,3,Nice Answer,False,2012-01-06 23:27:14.417 +13289,1359,3,Altruist,False,2012-01-07 00:27:15.247 +13290,5448,3,Mortarboard,False,2012-01-07 00:47:15.907 +13297,6136,3,Supporter,False,2012-01-07 12:07:52.487 +13298,5917,3,Supporter,False,2012-01-07 15:13:00.000 +13305,449,3,Nice Answer,False,2012-01-07 20:53:12.597 +13306,5875,3,Nice Answer,False,2012-01-07 21:33:15.447 +13307,449,2,Enlightened,False,2012-01-07 21:43:15.217 +13308,668,2,Enlightened,False,2012-01-07 21:43:15.217 +13309,5875,2,Enlightened,False,2012-01-07 21:43:15.217 +13317,6162,3,Editor,False,2012-01-08 11:01:11.733 +13318,6162,3,Student,False,2012-01-08 11:16:13.063 +13322,6162,3,Teacher,False,2012-01-08 13:01:16.743 +13325,4,3,Nice Question,False,2012-01-08 15:46:22.173 +13326,5480,3,Supporter,False,2012-01-08 16:21:23.573 +13330,6162,3,Supporter,False,2012-01-08 17:51:27.983 +13331,2069,3,Popular Question,False,2012-01-08 18:16:28.367 +13340,5984,3,Student,False,2012-01-08 21:01:34.307 +13350,1209,2,Notable Question,False,2012-01-08 22:51:38.507 +13357,668,2,Good Answer,False,2012-01-09 02:31:46.057 +13361,5637,3,Supporter,False,2012-01-09 08:57:14.573 +13362,132,3,Nice Answer,False,2012-01-09 09:47:22.280 +13386,1428,3,Nice Answer,False,2012-01-09 20:34:37.253 +13390,503,2,Necromancer,False,2012-01-09 21:55:01.600 +13399,1741,2,Yearling,False,2012-01-10 09:56:34.183 +13404,5179,3,Student,False,2012-01-10 12:56:39.827 +13405,5637,3,Autobiographer,False,2012-01-10 13:01:39.350 +13406,132,3,Announcer,False,2012-01-10 13:56:39.320 +13409,668,3,Announcer,False,2012-01-10 13:56:39.333 +13408,4,3,Announcer,False,2012-01-10 13:56:39.333 +13410,750,3,Announcer,False,2012-01-10 13:56:39.333 +13416,155,3,Announcer,False,2012-01-10 13:56:39.333 +13419,155,3,Announcer,False,2012-01-10 13:56:39.333 +13417,4,3,Announcer,False,2012-01-10 13:56:39.333 +13411,155,3,Announcer,False,2012-01-10 13:56:39.333 +13412,674,3,Announcer,False,2012-01-10 13:56:39.333 +13429,750,3,Announcer,False,2012-01-10 13:56:39.350 +13427,674,3,Announcer,False,2012-01-10 13:56:39.350 +13423,155,3,Announcer,False,2012-01-10 13:56:39.350 +13433,155,3,Announcer,False,2012-01-10 13:56:39.350 +13432,674,3,Announcer,False,2012-01-10 13:56:39.350 +13437,132,3,Announcer,False,2012-01-10 13:56:39.367 +13439,5179,3,Announcer,False,2012-01-10 13:56:39.367 +13435,5179,3,Announcer,False,2012-01-10 13:56:39.367 +13440,155,3,Announcer,False,2012-01-10 13:56:39.367 +13436,674,3,Announcer,False,2012-01-10 13:56:39.367 +13448,5179,3,Scholar,False,2012-01-10 15:46:40.600 +13458,674,3,Nice Answer,False,2012-01-10 19:36:44.043 +13460,674,2,Enlightened,False,2012-01-10 20:31:44.593 +13462,5661,3,Teacher,False,2012-01-10 22:46:49.757 +13471,5199,3,Student,False,2012-01-11 04:22:31.797 +13476,1691,3,Quorum,False,2012-01-11 14:23:57.090 +13486,4,3,Nice Answer,False,2012-01-11 20:43:25.353 
+13489,5208,3,Student,False,2012-01-11 22:43:50.367 +13496,3999,3,Promoter,False,2012-01-12 04:40:55.533 +13502,5208,3,Supporter,False,2012-01-12 09:52:18.540 +13503,5208,3,Editor,False,2012-01-12 09:57:23.167 +13526,5237,3,Nice Answer,False,2012-01-13 00:56:19.417 +13536,5208,3,Scholar,False,2012-01-13 11:38:58.400 +13546,155,3,Nice Question,False,2012-01-13 16:29:56.703 +13547,1411,3,Citizen Patrol,False,2012-01-13 16:35:02.267 +13583,1411,3,Critic,False,2012-01-15 00:02:37.363 +13585,503,2,Good Answer,False,2012-01-15 01:07:49.453 +13595,166,3,Nice Question,False,2012-01-15 13:44:50.390 +13598,450,3,Peer Pressure,False,2012-01-15 15:19:53.107 +13609,1790,2,Yearling,False,2012-01-16 02:20:22.113 +13612,5875,2,Enthusiast,False,2012-01-16 07:46:04.877 +13619,5875,3,Student,False,2012-01-16 13:02:15.080 +13632,633,3,Student,False,2012-01-16 19:34:12.640 +13643,633,3,Scholar,False,2012-01-17 03:24:25.170 +13660,155,3,Popular Question,False,2012-01-17 13:44:44.577 +13670,651,3,Nice Answer,False,2012-01-17 20:41:16.447 +13672,182,2,Taxonomist,False,2012-01-17 21:01:16.947 +13684,4656,3,Quorum,False,2012-01-18 01:24:36.880 +13685,5917,2,Enthusiast,False,2012-01-18 01:51:18.793 +13687,1805,2,Yearling,False,2012-01-18 02:21:19.363 +13703,5643,3,Supporter,False,2012-01-18 14:21:31.650 +13721,503,3,Nice Question,False,2012-01-19 07:17:31.137 +13736,2105,3,Commentator,False,2012-01-19 18:23:56.630 +13738,5045,3,Commentator,False,2012-01-19 18:59:01.327 +13743,5875,3,Revival,False,2012-01-19 22:39:29.137 +13758,5906,3,Student,False,2012-01-20 14:44:54.817 +13771,4735,3,Student,False,2012-01-20 17:30:09.293 +13775,192,3,Nice Answer,False,2012-01-20 22:26:31.523 +13778,5045,3,Editor,False,2012-01-20 23:41:43.363 +13792,211,3,Nice Question,False,2012-01-21 07:18:39.260 +13802,1831,2,Yearling,False,2012-01-21 15:09:10.260 +13803,1412,3,Citizen Patrol,False,2012-01-21 15:39:09.773 +13805,5179,3,Nice Answer,False,2012-01-21 16:24:11.110 +13806,5179,2,Enlightened,False,2012-01-21 16:59:11.607 +13812,6384,3,Student,False,2012-01-21 19:39:13.053 +13814,22,3,Nice Answer,False,2012-01-21 20:59:13.327 +13815,132,2,Good Question,False,2012-01-21 21:14:13.320 +13816,132,3,Nice Answer,False,2012-01-21 21:19:13.310 +13821,5203,3,Editor,False,2012-01-22 02:34:46.200 +13822,6384,3,Scholar,False,2012-01-22 02:44:46.620 +13823,6384,3,Editor,False,2012-01-22 02:49:46.143 +13827,5203,3,Teacher,False,2012-01-22 05:55:27.713 +13834,6384,3,Supporter,False,2012-01-22 13:32:39.700 +13838,5875,3,Scholar,False,2012-01-22 20:03:47.890 +13841,674,3,Nice Answer,False,2012-01-22 21:23:49.010 +13842,674,2,Enlightened,False,2012-01-22 21:23:49.320 +13852,2666,3,Revival,False,2012-01-23 10:24:16.860 +13854,674,3,Announcer,False,2012-01-23 11:44:38.450 +13871,5045,3,Supporter,False,2012-01-23 15:35:34.513 +13874,3048,3,Commentator,False,2012-01-23 18:56:34.630 +13880,674,3,Nice Answer,False,2012-01-23 23:02:48.193 +13912,6404,3,Scholar,False,2012-01-24 20:13:04.287 +13913,6404,3,Student,False,2012-01-24 20:13:04.347 +13915,371,2,Yearling,False,2012-01-24 20:13:04.470 +13928,169,2,Favorite Question,False,2012-01-25 04:38:16.130 +13929,211,2,Necromancer,False,2012-01-25 04:43:16.217 +13935,371,3,Commentator,False,2012-01-25 10:18:27.060 +13946,1809,2,Yearling,False,2012-01-25 16:30:08.010 +13964,674,3,Announcer,False,2012-01-25 23:22:03.663 +13971,5906,3,Scholar,False,2012-01-26 11:12:29.083 +13972,5906,3,Supporter,False,2012-01-26 11:12:29.240 +13987,1359,3,Benefactor,False,2012-01-26 18:42:37.310 +13989,5448,3,Revival,False,2012-01-26 
18:52:38.360 +13990,6404,3,Commentator,False,2012-01-26 19:02:38.233 +13999,3728,3,Teacher,False,2012-01-27 00:32:58.997 +14002,503,3,regression,True,2012-01-27 03:00:13.297 +14007,5911,3,Supporter,False,2012-01-27 12:36:10.353 +14008,211,3,Popular Question,False,2012-01-27 13:36:27.277 +14010,3922,3,Nice Answer,False,2012-01-27 13:56:32.500 +14022,155,3,Nice Answer,False,2012-01-27 21:18:04.493 +14027,5203,3,Supporter,False,2012-01-28 02:18:30.187 +14030,5086,3,Investor,False,2012-01-28 06:38:39.063 +14031,1889,2,Yearling,False,2012-01-28 08:03:42.753 +14046,728,2,Notable Question,False,2012-01-29 02:30:26.963 +14053,5038,3,Editor,False,2012-01-29 16:01:00.897 +14058,6162,3,Tumbleweed,False,2012-01-29 19:06:29.750 diff --git a/examples/csv_examples/comments.csv b/examples/csv_examples/comments.csv new file mode 100644 index 00000000..ca69a1b5 --- /dev/null +++ b/examples/csv_examples/comments.csv @@ -0,0 +1,2125 @@ +id,post_id,user_id,content_license,user_display_name,text,creation_date +82,143,,CC BY-SA 2.5,user88,I think this is the first candidate to be moved to Stack Overflow.,2010-07-19 21:38:27.890 +101,143,,CC BY-SA 2.5,,"Possibly, but it'd need a lot more explanation on SO.",2010-07-20 01:29:42.683 +127,143,,CC BY-SA 2.5,,"Most programmers know ""median"". (sort(array))[length/2] is a big enough hint for those who forgot. Also at its most basic for each new point you only need to do a bisection/insert on one half of the array...",2010-07-20 09:04:20.020 +172,143,,CC BY-SA 2.5,,@walkytalky I don't think it would require any more explanation than any other algorithm question on SO. Probably less as the median is a relatively basic concept.,2010-07-20 18:15:39.900 +194,143,,CC BY-SA 2.5,,@Sharpie Perhaps not more than other SO algo questions. But certainly more than what it actually says here!,2010-07-21 02:58:14.737 +254,414,4.0,CC BY-SA 2.5,,"@sharpie: are jokes out? We obviously don't want the entire site to be humor, but everyone benefits from a little educational humor in small doses.",2010-07-22 05:15:40.057 +273,414,,CC BY-SA 2.5,,"@Sharpie, feel free to close or reopen according to your feelings! I agree with Shane, a bit is ok, but not too much. For example, this question already included a funny cartoon. The jokes question not really a funny joke....",2010-07-22 13:58:03.300 +274,414,4.0,CC BY-SA 2.5,,These cartoons are useful too; they can be included in a lecture on a particular topic where you are trying to explain a concept (e.g. correlation/causation above). A little humor can help to keep an audience engaged.,2010-07-22 14:22:11.213 +306,541,190.0,CC BY-SA 2.5,,"If I am not mistaken, + +linear regression is the estimation of coefficients that define a good linear map from X to Y. + + ANOVA is a test to know if there is significant differences in X when Y take two different values. + +Can you explain us why you think they are the same?",2010-07-23 15:29:16.043 +309,543,,CC BY-SA 2.5,user28,"Thanks for the Gelman reference. I will read his paper. But, can't we analyze multilevel models using classical maximum likelihood? I agree that OLS is inefficient/inappropriate for multi-level models.",2010-07-23 15:50:47.240 +311,543,182.0,CC BY-SA 2.5,,"@Srikant - there any many ways to deal with multilevel data and Gelman is ""the king"" of this field. His point is that ANOVA is a simple/clear method of capturing the key features of complex and hierarchical data structures or study designs and ANOVA is a simple/clear way of presenting the key results. 
In this sense it's role is complementary or exploratory.",2010-07-23 16:30:00.160 +1023,1248,132.0,CC BY-SA 2.5,,I made this community wiki as there is no correct answer.,2010-08-06 02:44:34.817 +1077,1248,155.0,CC BY-SA 2.5,,It probably makes sense to leave cartoons in this question: http://stats.stackexchange.com/questions/423/what-is-your-favorite-data-analysis-cartoon,2010-08-08 11:49:27.500 +1121,28,4.0,CC BY-SA 2.5,,"Thanks @robin; made CW. Although I don't entirely see this as ""argumentative""; there are two fields which have informed each other (this is a fact), and the question is how much they have evolved together over the last decade.",2010-08-09 14:17:51.890 +1319,143,132.0,CC BY-SA 2.5,,Re-opened following discussion at http://meta.stats.stackexchange.com/questions/276/should-we-unclose-computing-questions,2010-08-13 00:31:38.923 +2603,2509,60.0,CC BY-SA 2.5,,"Good question. I agree with the quote as well. I believe there are many people in statistics and mathematics who are highly intelligent, and can get very deep into their work, but don't deeply understand what they are working on. Or they do, but are incapable of explaining it to others.I go out of my way to provide answers here in plain English, and ask questions demanding plan English answers.",2010-09-15 21:43:29.863 +2612,2509,,CC BY-SA 2.5,user1108,I had imagined a lengthy demo with a bunch of graphs and explanations when I stumbled across [this](http://www.youtube.com/watch?v=BfTMmoDFXyE).,2010-09-16 02:18:11.883 +2615,2509,668.0,CC BY-SA 2.5,,"This was asked on the Mathematics site in July, but not as well and it didn't get many answers (not surprising, given the different focus there). http://math.stackexchange.com/questions/1146/intuitive-way-to-understand-principal-component-analysis",2010-09-16 05:03:44.287 +2618,2509,,CC BY-SA 2.5,,Similar to explanation by Zuur et al in Analyzing ecological data where they talk about projecting your hand on an overhead projector. You keep rotating your hand so that the projection on the wall looks pretty similar to what you think a hand should look like.,2010-09-16 09:00:49.823 +3809,541,,CC BY-SA 2.5,,"ANOVA can be seen as ""syntactic sugar"" for a special subgroup of linear regression models. ANOVA is regularly used by researchers who are not statisticians by training. They are now ""institutionalized"" and its hard to convert them back to using the more general representation ;-)",2010-10-14 16:52:51.193 +4419,2509,,CC BY-SA 2.5,,"Here is the link to ""Analysing ecological data"" by Alain F. Zuur, Elena N. Ieno, Graham M. Smith, where the example with the overhead-projector and the hand is given: http://books.google.de/books?id=mmPvf-l7xFEC&lpg=PA15&ots=b_5iizOr3p&dq=Zuur%20et%20al%20in%20Analyzing%20ecological%20data&hl=en&pg=PA194#v=onepage&q&f=false",2010-10-26 06:25:04.147 +4636,3649,211.0,CC BY-SA 2.5,,"Thanks chl for the answer, I accepted it for the sheer scope of it. Best, Tal",2010-10-31 13:34:32.593 +5192,4187,287.0,CC BY-SA 2.5,,"I'm aware that ""sin"" is possibly inflammatory and that that some aspects of statistical analysis are not black-and-white. My intention is to solicit cases where a given commonly-taught practice is pretty clearly inappropriate.",2010-11-15 18:53:14.270 +5194,4187,436.0,CC BY-SA 2.5,,You can also add biology/life sciences students to the mix if you like ;),2010-11-15 19:03:17.187 +5196,4187,449.0,CC BY-SA 2.5,,maybe retitle it life science statistical sins?... 
or something else more specific...,2010-11-15 19:27:28.377 +5972,4705,,CC BY-SA 2.5,user88,Converted to community wiki.,2010-12-04 00:14:08.750 +5973,4705,1209.0,CC BY-SA 2.5,,what is community wiki?,2010-12-04 00:53:59.723 +5975,4705,,CC BY-SA 2.5,,@Mariana: http://www.sharepointoverflow.com/questions/432/what-is-community-wiki,2010-12-04 04:32:47.227 +5990,4705,,CC BY-SA 2.5,user88,@Mariana The idea is that pools and list-ofs are converted to a form in which they can be easily managed (due to lower rep req to edit) and voted up/down without hurting participants' reputation (votes on CW posts does not give/take reputation).,2010-12-04 16:01:50.720 +6372,5015,,CC BY-SA 2.5,,"you could probably get a better answer if you provided more details about your experimental design, research question, and statistical model.",2010-12-13 23:52:15.460 +6373,5015,1542.0,CC BY-SA 2.5,,"I have survey data, v1 and v2 predict the outcome, as I expected; however, the interaction between v1 (dichotomous) and v2 (5 groups) is not significant -- and (my question) it makes my v1 and v2 direct effects non-significant too. I can't find an example on reporting this in the literature.",2010-12-14 00:03:33.020 +6374,5015,,CC BY-SA 2.5,,"If the v1:v2 interaction is not significant, do you need to have it included in the model?",2010-12-14 01:40:08.247 +6379,5015,1506.0,CC BY-SA 2.5,,Maybe this question is relevant? http://stats.stackexchange.com/questions/5184/removing-factors-from-a-3-way-anova-table,2010-12-14 03:00:55.870 +6637,4714,,CC BY-SA 2.5,,But was he _really_ a statistician?,2010-12-19 22:27:36.527 +6656,4714,60.0,CC BY-SA 2.5,,That's a tough one...when did statistics become a real field? Many of the fathers of stats were not statisticians.,2010-12-20 18:23:52.967 +6702,4705,668.0,CC BY-SA 2.5,,If it weren't CW it would have to be closed as subjective and argumentative!,2010-12-21 19:37:00.287 +6803,4714,1209.0,CC BY-SA 2.5,,"I know my choice is kind of arbitrary, because many are important but this is my favourite one, and his method allowed me to do lots of things.",2010-12-24 18:35:22.593 +8089,1760,,CC BY-SA 2.5,,This is related to a past question: http://stats.stackexchange.com/q/2275/495,2011-01-25 20:19:06.560 +8736,4187,,CC BY-SA 2.5,user88,"@whuber There was some good answers, so I've merged them both.",2011-02-06 11:02:41.973 +9263,1787,723.0,CC BY-SA 2.5,,"Hmm - my question is not very well defined. The only thing I can do is pick *some* model for q() that permits setting parameters, and maximise the goodness of fit by fiddling with those parameters. That is - no matter what I do I will have to make some assumptions about what q() basically looks like.",2011-02-16 06:35:44.297 +10996,543,,CC BY-SA 2.5,,"+1 for a nice clear answer. 
Paragraph 3 is essentially what I was taught as a biology undergraduate, with emphasis laid on the ease of combining continuous and categorical independent variables in an ANOVA framework.",2011-03-14 15:43:11.747 +11730,7965,1691.0,CC BY-SA 2.5,,"I realize that these are two slightly different questions, but I want them to be together to give some background to what I am using the results of the `scale()` function.",2011-03-24 14:52:09.670 +11731,7965,674.0,CC BY-SA 2.5,,"@celenius `center=true` just means remove the mean, and `scale=TRUE` stands for divide by SD; in other words, with both options active, you're getting standardized variables (with mean 0, unit variance, and values expressed in SD units).",2011-03-24 14:58:31.323 +11733,7965,1691.0,CC BY-SA 2.5,,"@chl Ah! Thank you. That is a much clearer explanation to me than the help file. Does it make a difference how a variable is standardized for clustering with kmeans though? I would assume not, but don't know for sure.",2011-03-24 15:01:49.490 +11736,7965,674.0,CC BY-SA 2.5,,"@celenius I can suggest this article, [Standardizing Variables in K-means Clustering](http://www.springerlink.com/content/l438152634256311/) (Steinley, 2004) to get an idea of the effects of different kind of transformations. The utility of scaling depends on the data you have, but usually one reason to use standardized (scaled) variables is to avoid obtaining clusters that are dominated by variables having the largest amount of variation.",2011-03-24 15:18:09.887 +11745,7965,668.0,CC BY-SA 2.5,,@celenius Why is collinearity a problem? It shouldn't affect k-means at all.,2011-03-24 17:49:39.270 +11749,7965,1691.0,CC BY-SA 2.5,,"@whuber I assumed that the more collinear variables I include, the more that it biases the clustering towards those collinear dimensions.",2011-03-24 18:18:28.917 +11751,7965,668.0,CC BY-SA 2.5,,"@celenius It shouldn't introduce bias. Take, for instance, the extreme of perfect collinearity in 2D: all observations are of the form $(a+t x, b+t y)$ for constants $a,b,x,y$. K-means uses Euclidean distances and the distance between two such points given by $t$ and $s$ is just $|s-t|$: it's really a 1D problem and there's no special weighting given to any of the data.",2011-03-24 19:31:46.477 +12756,8529,1895.0,CC BY-SA 3.0,,Do you have some general interests that you'd like to list? That might help guide suggestions. Applications of statistics have become pretty pervasive in a remarkably broad array of fields.,2011-04-09 01:58:29.253 +12889,8529,793.0,CC BY-SA 3.0,,"@cardinal, nope, no particular interests -- the purpose was to branch out from the stuff I typically read, so I'm trying not to limit any answers. (This does maybe make the question a bit too broad, but I guess I'm looking for people's personal ""best of"" lists.)",2011-04-11 21:40:41.700 +12963,2509,,CC BY-SA 3.0,,"A two pages article explaining PCA for biologists: Ringnér. [What is principal component analysis?](http://www.nature.com/nbt/journal/v26/n3/full/nbt0308-303.html). Nature Biotechnology 26, 303-304 (2008)",2011-04-13 01:55:06.840 +14478,9524,221.0,CC BY-SA 3.0,,"nice idea for a collection ! @mods: Beside cw maybe renaming to ""movies every statistician should have seen"" or something like that ?",2011-05-07 12:28:31.583 +14480,9524,674.0,CC BY-SA 3.0,,@steffen Thx. 
Better to flag the question for mods attention in the future (we aren't notified with @mods).,2011-05-07 12:37:52.090 +14485,9524,2872.0,CC BY-SA 3.0,,"By the way, I found this page, which might add some movies to the collection: http://world.std.com/~reinhold/mathmovies.html",2011-05-07 14:41:51.013 +14487,9524,,CC BY-SA 3.0,,Nothing that will inspire anybody to take up mathematics. Stick to books.,2011-05-07 15:21:49.440 +14497,9524,221.0,CC BY-SA 3.0,,@Emre Maybe. But nevertheless it is entertaining for those who already deal with math (even more if the movie is presenting things wrong or with exaggeration).,2011-05-07 18:38:36.787 +14507,9529,1693.0,CC BY-SA 3.0,,"An absorbing story and one I'd recommend, but it referred to math rather than truly involding math. I'd say the same thing about the tv show Numb3rs.",2011-05-07 22:43:12.617 +14519,4705,1209.0,CC BY-SA 3.0,,"Thank you very much for doing that, but I am sorry to tell you that I can not see anything i the first link.",2011-05-08 05:28:34.270 +15357,10008,2352.0,CC BY-SA 3.0,,"My philosophy is run lots of models, check their predictions, compare, explain, run more models.",2011-05-20 03:43:22.080 +15413,10008,,CC BY-SA 3.0,,"If the interactions are only significant when the main effects are NOT in the model, it may be that the main effects are significant and the interactions not. Consider one highly significant main effect with variance on the order of 100 and another insignificant main effect for which all values are approximately one with very low variance. Their interaction is not significant, but the interaction effect will appear to be significant if the main effects are removed from the model.",2011-05-20 16:26:35.877 +16234,10541,674.0,CC BY-SA 3.0,,"Please, don't cross-post on [SO](http://stackoverflow.com/questions/6241181/gap-statistics-matlab-implementation) and [MatlabCentral](http://www.mathworks.fr/matlabcentral/answers/8811-gap-staticstics-implementation)!",2011-06-05 08:48:49.270 +16249,10541,1895.0,CC BY-SA 3.0,,The gap statistic takes 20 lines of code or less to implement. It may be faster just to write it yourself.,2011-06-05 14:45:59.767 +16253,10541,2690.0,CC BY-SA 3.0,,@chi ok will not repeat the same.,2011-06-05 15:14:57.170 +16955,10911,,CC BY-SA 3.0,,"Not an answer, just an intuitive observation. The CI for the pooled data mean (all nine obs) is $(39.7 \pm 2.13)$, CI based on the means only is $(39.7\pm 12.83)$. Not sure what your CI is doing (typo? 17 not 27, and 51 not 61?), I get $2.98$ for std err of three means, and $4.30$ as $0.975$ quantile of T dist with 2 df. I would think that the CI you seek would lie somewhere in between these two - as you have partial pooling. Could also think in terms of variance formula $V(Y)=E[V(Y|Y_g)]+V[E(Y|Y_g)]$, each CI uses half of the formula",2011-06-18 04:58:50.833 +16992,10911,22.0,CC BY-SA 3.0,,"@probabilityislogic: The SEM of the three experiment means is 5.168 (not 2.98 as you wrote), and the confidence interval I gave in the original post (17.4 to 61.9) is correct. The SEM is computed from the SD (8.95) by dividing by the square root of n (square root of 3). You divided by n (3) instead.",2011-06-19 13:42:39.630 +17011,10911,,CC BY-SA 3.0,,"my mistake, should also replace $2.13$ by $6.40$ in the pooled interval (same mistake there)",2011-06-20 05:10:22.410 +18283,412,,CC BY-SA 3.0,,"Could you be a little more precise? 
What type of analysis, in what context, etc.",2011-07-13 05:38:35.730 +18284,412,,CC BY-SA 3.0,,"Well, I'm talking about the basics, an overview of as much as is possible.",2011-07-13 07:08:38.587 +19728,4714,,CC BY-SA 3.0,,That this was selected indicates a biased prior.,2011-08-06 00:20:15.883 +20768,13058,668.0,CC BY-SA 3.0,,"For one solution, see the comments to [this reply](http://stats.stackexchange.com/questions/14351/forecasting-time-series-based-on-a-behavior-of-other-one/14366#14366). Open source solutions would include image processing or raster GIS software ([GRASS](http://grass.fbk.eu/) is a likely candidate) or, perhaps, [GNU Octave](http://www.gnu.org/software/octave/). I'm mentioning these as a comment because I haven't used either for this specific purpose, so please take them as possibilities, not as definite solutions.",2011-08-18 04:20:43.743 +20769,13058,1124.0,CC BY-SA 3.0,,"I'm hoping for code/software specifically for scraping graphs, and I remember such packages existed, at least they did 10 yrs ago, but I can't remember their names now, and don't know if they work on current operating systems.",2011-08-18 04:56:35.977 +20772,13058,,CC BY-SA 3.0,,"@Alex, try googling [""Graph Digitizer Open Source""](http://www.google.com/search?sourceid=chrome&ie=UTF-8&q=graph+digitizer+open+source)",2011-08-18 05:52:45.057 +21254,13060,,CC BY-SA 3.0,,"There is a nice article / tutorial in [R Journal, June 2011](http://journal.r-project.org/archive/2011-1/RJournal_2011-1_Poisot.pdf)",2011-08-23 22:56:28.507 +21943,13631,,CC BY-SA 3.0,,"see if this helps +http://www.math.bme.hu/~morvai/publications/papers/MorvaiWeissActApplMath2003ARX.pdf good day",2011-09-02 03:22:09.713 +21945,13631,5898.0,CC BY-SA 3.0,,Have you considered a hidden markov model?,2011-09-02 03:25:56.700 +21996,13631,,CC BY-SA 3.0,user6145,"Thanks for the answers. But there is any software package already available with some implementations? I have searched in R, but I only found the VLMC package. Thanks, +Ricardo Bessa",2011-09-02 18:17:36.373 +21997,13631,,CC BY-SA 3.0,,"Ricardo, you should edit your question with this additional information instead of adding it as an answer. Thanks, and welcome to the site!",2011-09-02 18:26:26.157 +22002,13631,,CC BY-SA 3.0,,"Are there really two types of 1's in your data? That is, 1 meaning the car could be moving but is not versus 1 meaning that your car really could not be moving at this time. That would be called one-inflation (it's usually zero-inflation). If so, you need to model when the car can be moving or not versus when it might be moving but is not.",2011-09-02 19:13:09.537 +22022,13631,4221.0,CC BY-SA 3.0,,"In this case there is only 1, meaning that the car is not moving. Perhaps hidden markov models can be a good option.",2011-09-03 00:13:46.983 +23827,14729,2081.0,CC BY-SA 3.0,,"Your matrix is positive semi-definite, it's not positive definite though, for it is singular.",2011-10-01 17:53:04.460 +23828,14729,,CC BY-SA 3.0,,What are the dimensions (no. variables; no. samples)?,2011-10-01 17:59:46.217 +23830,14729,5898.0,CC BY-SA 3.0,,"Number of columns = 480. # of rows for each time series = 502. In general you find that the larger the time series the sample covariance matrix tends to be positive definite. However, there are many cases where you'd like to use a substantially smaller value of T (or exponentially weight) to reflect recent market conditions.",2011-10-01 19:24:35.453 +23835,14729,2765.0,CC BY-SA 3.0,,"The question is ill-posed. 
If your data matrix is 480 by 502 then saying that the matrix has rank $q < 480$ (the column space of the matrix has dimension $q < 480$) is mathematically equivalent to saying that some column is a linear combination of the others, but you can't pick out one column and say that this is the column that is linearly dependent. So there is no procedure for doing this, and the suggested procedure will pick a quite arbitrary security depending on the order they are included.",2011-10-01 21:34:07.430 +23846,14729,5898.0,CC BY-SA 3.0,,The covariance matrix is symmetric. It is generated by transpose(A) * A. The matrix A has dimensions 480x502. However the covariance matrix is 480x480,2011-10-01 23:26:30.863 +24702,15281,668.0,CC BY-SA 3.0,,We have a specialist in this subject who is a frequent contributor. Consult almost any of [his replies](http://stats.stackexchange.com/users/3382/irishstat) for an account of his favorite methods. Here is a [typical one](http://stats.stackexchange.com/questions/6033/detect-changes-in-time-series/8549#8549).,2011-10-13 17:37:36.160 +24706,15281,3641.0,CC BY-SA 3.0,,"@whuber, I didn't find methods on his post.",2011-10-13 19:44:42.753 +24707,15281,668.0,CC BY-SA 3.0,,He provides a link to documents with references and equations.,2011-10-13 20:05:47.283 +24724,14790,,CC BY-SA 3.0,,"Isn't this the reduced row echelon form? If so, aren't there packages / functions available in R?",2011-10-14 00:39:40.743 +24731,14790,2081.0,CC BY-SA 3.0,,"@Arun, I'm not R user so can't know.",2011-10-14 06:42:25.397 +27748,17000,5479.0,CC BY-SA 3.0,,But how could I download them in .txt,2011-11-24 22:44:25.457 +28379,4714,436.0,CC BY-SA 3.0,,Does one *discover* a theorem? Shouldn't it be *postulating* or *theorizing*?,2011-12-04 13:21:30.270 +28490,28,5671.0,CC BY-SA 3.0,,"Add a third culture: **data mining**. Machine learners and data miners speak quite different languages. Usually, the machine learners don't even understand what is different in data mining. To them, it's just unsupervised learning; they ignore the data management aspects and apply the *buzzword* data mining to machine learning, too, adding further to the confusion.",2011-12-06 12:05:25.373 +30338,4714,5237.0,CC BY-SA 3.0,,"Sigh. I would have hoped that one could be an enthusiast of the Bayesian approach without actually believing that Bayes the man was the greatest statistician of all time. Shouldn't we honor someone who contributed important work to its development, like, e.g., [Jaynes](http://en.wikipedia.org/wiki/Edwin_Thompson_Jaynes)?",2012-01-07 03:26:45.850 +31657,1248,668.0,CC BY-SA 3.0,,"This is a popular and much-loved thread, even though it does not (on the face of it) seem to conform to SE standards for content. (Just what practical question is being asked here? :-) Some rules benefit from being ... bent ... once in a while. However, please don't use the existence of this thread to justify creating new ones that fall outside our guidelines unless you think there is a very good reason to do so! 
Questions about site policy are always appropriate in [Meta](http://meta.stats.stackexchange.com/) and debate is warmly welcomed in [chat](http://chat.stackexchange.com/).",2012-01-24 15:34:51.807 +33305,20240,,CC BY-SA 3.0,vzn,"re ""numerical optimization methods"", ""discrete optimization methods"", it seems many ML techniques could be proven to be within a constant factor of the true optimum if their ""initial search space"" is forced to be large, but I havent seen a ref on this.",2012-02-10 22:10:31.230 +33337,20240,,CC BY-SA 3.0,,"I disagree. + +* for numerical optimization you can get into local minimum (of course you can also apply procedures that make this unprobrable). + +* The same is for Neural Networks (at least it can happen during training of perceptron). + +* Genetic algorithms can also get into local minimum, moreover if you choose to big mutation rates you will get no sensible evolution! + +II also strongly suspect that there are datasets that will always make certain models have arbitralily big errors.",2012-02-11 13:21:38.393 +33589,20240,,CC BY-SA 3.0,,"@vzn many people choose models for which the optimal solution can be found. This is because the use convex loss functions, as SVMs do. Finding the true optimum here means ""finding the optimal solution in your search space"", so that has nothing to do with how the search space looks like. +As jb said, for general loss functions, finding the true optimum is usually impossible / infeasible.",2012-02-14 08:44:30.200 +34278,20667,221.0,CC BY-SA 3.0,,I vote for cw ;),2012-02-20 12:53:13.573 +37169,28,,CC BY-SA 3.0,,There's a similar question on [data mining and statistics](http://stats.stackexchange.com/questions/1521/data-mining-and-statistical-analysis),2012-03-22 23:51:48.623 +38536,23019,,CC BY-SA 3.0,,"It seems to me model b makes more sense, since you could imagine fixed developmental effects occurring at particular ages in all mice, but occasion specific conditions (captured by the random intercept, grouped by time) may be more thought of as random perturbations.",2012-04-06 01:48:46.123 +38693,23087,5179.0,CC BY-SA 3.0,,"No, I think you are confusing the definition of an MA(n) model, where the regression is only in terms of the $e_{t-i}$'s, with its estimation, where the $e_{t-i}$'s are estimated from the data.",2012-04-07 17:25:28.287 +38713,23087,1406.0,CC BY-SA 3.0,,"The main problem in your question is that you say that MA model is basically a linear regression. This is simply not true, since we do not observe error terms.",2012-04-07 20:18:02.153 +38863,23087,5643.0,CC BY-SA 3.0,,"I think the error term *is* actually $Y_t - \hat{Y_t}$, where $\hat{Y}$ is $E(Y|Y_{t,...,t-n})$ or simply $Y_t - Y_{t-1}$. That is why an MA model parameter estimate is derived from a recurring pattern in the $Y$ partial autocorrelation function, that is the behavior of the residuals. The AR parameter estimation instead, is based on a recurring pattern of the acf(Y).",2012-04-09 15:23:42.327 +41789,24506,,CC BY-SA 3.0,,It's not totally clear what kind of model you're fitting - is this a linear regression model?,2012-05-06 00:40:29.510 +42014,24506,7341.0,CC BY-SA 3.0,,"Yes, it's linear regression.",2012-05-07 21:52:37.023 +42176,24602,7341.0,CC BY-SA 3.0,,"Thank you very much - I haven't been able to get with my coworker who knows much more R than me, but I'm sure this is the ticket.",2012-05-08 19:46:30.480 +42863,25087,7421.0,CC BY-SA 3.0,,"Thank you. Just to make sure on your thoughts this is the quote, P. 
117: ""Principal factors as normally extracted are based on equations for which it is assumed that the population correlation matrix is being factored. In maximum like hood procedures it is explicitly recognized that a sample is being analyzed. Maximum like hood procedures are defined as those that best reproduce the population values (the meaning of ""best reproduce"" is further discussed in Chapter 7). Any factor solution that best reproduces the population values is a maximum like hood analysis.",2012-05-14 16:09:20.817 +42870,25087,8363.0,CC BY-SA 3.0,,"The terminology is bad. We don't say that maximum likelihood estimates best reproduce the population values. The maximum likelihood estimates are obtained for population parameters by selecting the values that maximize the likelihood function. This means that the exact parametric model using the fitted parameter(s), in a way best describes the observed data. Maybe this is what he intended to say but phrased it poorly.",2012-05-14 16:39:30.570 +42872,25087,8363.0,CC BY-SA 3.0,,"The population correlation matrix cannot be factored since it is unknown. PCA does what I described for the sample data. I don't even think that means factoring the sample correlation matrix. I am assuming he means principal components when he says ""principal factors"" but it could mean something else and really have something to do with the sample correlation matrix.I think you should look at other books on this topic that are much better written (e.g. books on multivariate analysis such as Gnandesikan or Jolliffee's book on PCA).",2012-05-14 16:46:43.573 +42887,25087,7421.0,CC BY-SA 3.0,,Thank you Michael for answering this question 2. Hopefully I can get an answer for the others too!,2012-05-14 18:22:52.870 +45977,16998,668.0,CC BY-SA 3.0,,This thread appears to be off topic. See http://meta.stats.stackexchange.com/questions/1032/data-sourcing-we-need-to-make-up-our-mind/1033#comment2001_1033.,2012-06-07 21:39:07.407 +47052,412,,CC BY-SA 3.0,,[Statistics explained](http://www.amazon.com/Statistics-Explained-Introductory-Guide-Scientists/dp/0521183286) covers the basics using examples from the life sciences. The answers to [this question](http://stats.stackexchange.com/questions/29380/a-statistics-book-that-explains-using-more-images-than-equations/) may also contain recommendations that you'll find useful.,2012-06-15 13:59:49.403 +47139,28,,CC BY-SA 3.0,user10525,An interesting discussion in [Wasserman's blog](http://normaldeviate.wordpress.com/2012/06/12/statistics-versus-machine-learning-5-2/).,2012-06-16 10:43:07.713 +47631,27120,2081.0,CC BY-SA 3.0,,"Just a side note. ""Error"" in abbreviations like MSE usually actual mean ""residuals"". Practically, residuals and errors frequently are treated as synonims, MSE=MSR. But theoretically there's important [distinction](http://en.wikipedia.org/wiki/Errors_and_residuals_in_statistics) between the two terms",2012-06-20 17:55:33.763 +49899,5020,449.0,CC BY-SA 3.0,,"There's no moral hazard. The calculation of the main effects with the interaction included is quite different from the calculation without it. You have to do the additive model to report the main effects and then include the interaction in a separate model anyway. 
You ignore the main effects in the model that includes the interaction because they're not really main effects, they're effects at specific levels of the other predictors (including the interaction).",2012-07-10 14:08:28.770 +55166,30434,5249.0,CC BY-SA 3.0,,Thanks for dredging up this question! Wallach et al also have a paper on topic model evaluations: [Evaluation methods for topic models](http://doi.acm.org/10.1145/1553374.1553515),2012-08-22 08:27:14.353 +55325,30434,9605.0,CC BY-SA 3.0,,"No worries. I've found there's some code for Wallach's left-to-right method in the MALLET topic modelling toolbox, if you're happy to use their LDA implementation it's an easy win although it doesn't seem super easy to run it on a set of topics learned elsewhere from a different variant of LDA, which is what I'm looking to do. I ended up implementing the Chib-style estimator from their paper using the matlab code they supply as a guide although had to fix a couple of issues in doing that, let me know if you want the code.",2012-08-23 11:28:40.423 +56111,30957,8363.0,CC BY-SA 3.0,,"Just curious, why do you want to initialize the model with a different time series?",2012-08-29 15:31:12.027 +56118,30957,9446.0,CC BY-SA 3.0,,@MichaelChernick I can think of few reasons why one might want to initialize the simulation with a different time-series. 1). One might have a population that is very similar to the one modeled but few time points to build a new model from (of course the big assumption would be the new population exhibits population dynamics just as the original one did which is probably not the best assumption but could still be informative about what to expect from the new population).,2012-08-29 16:16:21.897 +62533,34166,,CC BY-SA 3.0,,"It would be good if you could describe the experiment a bit clearer. Without reading the original post, it is really hard to understand what the paradox is about.",2012-10-25 22:51:17.727 +62693,34166,,CC BY-SA 3.0,,My comment wasn't meant to be rude btw. I realized later it might have come across a bit harsh. Hope you didn't take it the wrong way.,2012-10-28 00:42:19.703 +56120,30957,9446.0,CC BY-SA 3.0,,"2). Also, one might want to simulate potential impact of a rare but extreme short-term event that causes high mortality in the population (i.e., hurricane, chemical spill, disease outbreak, etc). Of course one would have make some assumptions about how those events might change the population’s growth rate but could use the simulation results to inform wildlife managers as to potential effects by magnitude and duration of declines in population growth rates on the population’s trajectory. Those results could then be used to help inform planning and management decisions for the population.",2012-08-29 16:17:42.660 +56136,30960,9446.0,CC BY-SA 3.0,,Wouldn't using a non-seasonal ARIMA model with higher AR orders (p > 1) results in the cyclic pattern being dampened as the forecasts increase a cycle or two beyond the last data point?,2012-08-29 19:02:41.150 +56162,30960,132.0,CC BY-SA 3.0,,"Yes, but that's because the cycles are not strictly periodic. They are almost periodic, and as you go further out in the forecast horizon, it becomes harder to predict which part of the cycle you will be in. The point forecasts are means, and so they will naturally flatten out, indicating this uncertainty.",2012-08-30 04:24:43.993 +56184,30960,8363.0,CC BY-SA 3.0,,I still think this is not a very sensible thing to do. 
It is a rather strong assumption to think that a different series will have the same model form and coefficients as another series. When you don't have sufficient data to forecast I think it is best not to.,2012-08-30 12:19:07.520 +56195,30960,9446.0,CC BY-SA 3.0,,@RobHyndman Thank you for your insight. Is the link you were refering to [Research tips - Cyclic and seasonal time series](http://robjhyndman.com/researchtips/cyclicts/)?,2012-08-30 14:05:03.150 +56248,30960,132.0,CC BY-SA 3.0,,"That's relevant, but I was referring to papers such as http://www.publish.csiro.au/?paper=ZO9530163",2012-08-30 20:08:29.103 +57490,31575,7007.0,CC BY-SA 3.0,,"You have $4$ states: $S=\{1:=A,2:=B,3:=C,4:=D\}$. Let $n_{ij}$ be the number of times the chain made a transition from state $i$ to state $j$, for $ij,=1,2,3,4$. Compute the $n_{ij}$'s from your sample and estimate the transition matrix $(p_{ij})$ by maximum likelihood using the estimates $\hat{p}_{ij}=n_{ij}/\sum_{j=1}^4 n_{ij}$.",2012-09-11 16:29:52.470 +57491,31575,7007.0,CC BY-SA 3.0,,These notes derive the MLE estimates: http://www.stat.cmu.edu/~cshalizi/462/lectures/06/markov-mle.pdf,2012-09-11 16:30:57.337 +57500,31587,6404.0,CC BY-SA 3.0,,Looks great! I'm not sure what the 3rd line does in your code though (mainly because I'm familiar with Matlab). Any chance you could write it in matlab or pseudo-code? I'd be much obliged.,2012-09-11 17:22:15.067 +57524,31587,7007.0,CC BY-SA 3.0,,"The third line does this: the chain values are $x_1,\dots,x_n$. For $t=1,\dots,n-1$, increment $p_{x_t,x_{t+1}}$.",2012-09-11 19:32:17.700 +57525,31587,7007.0,CC BY-SA 3.0,,The fourth line normalizes each line of the matrix $(p_{ij})$.,2012-09-11 19:34:22.347 +57535,31575,1359.0,CC BY-SA 3.0,,Similar question:http://stats.stackexchange.com/questions/26722/calculate-transition-matrix-markov-in-r,2012-09-11 20:18:13.553 +57542,31587,6404.0,CC BY-SA 3.0,,Bare with my slowness here. I do appreciate the MATLAB code translation although I still can't see what it's attempting to do in your first `for` loop. The 3rd line from the original code is counting the number of times $x$ goes from state $x_i$ to state $x_j$? If you could say it in words I'd appreciate that a lot. Cheers,2012-09-11 20:35:23.493 +57551,31575,6404.0,CC BY-SA 3.0,,@B_Miner could you write your code in pseudo-code form for me? Or explain it in lay terms... However I see it works in my R console.,2012-09-11 21:41:01.917 +57553,31587,6404.0,CC BY-SA 3.0,,I have implemented your R code and it works just as you explained. Which is great! My query is now with regards to the $x$ vector... My observation sequences are of uneven lengths. Eg the matrix I provided above. How would I change your code to handle this?,2012-09-11 21:44:50.053 +57558,10911,,CC BY-SA 3.0,user14015,"does the following link answers' this? +http://www.talkstats.com/showthread.php/11554-mean-of-means",2012-09-11 21:55:13.197 +57572,31587,7007.0,CC BY-SA 3.0,,"A realization of the chain is something like $1, 1, 2, 1, 2, 1, 2, 4, 3, 1$. By inspection, it jumped from $1$ to $1$ one time, from $1$ to $2$ three times, and so on. What exactly you don't understand?",2012-09-12 00:41:29.340 +57583,10911,674.0,CC BY-SA 3.0,,"@TST, There appears to be nothing but a link to Wikipedia on [Pooled variance](http://en.wikipedia.org/wiki/Pooled_variance). Care to elaborate?",2012-09-12 07:45:38.580 +57973,31587,6404.0,CC BY-SA 3.0,,Can you clarify for me that your vector x is the same as concatenating my matrix rows? 
Ie one long sequence instead of many short ones?,2012-09-15 22:41:23.583 +57974,31587,7007.0,CC BY-SA 3.0,,"No, $x$ is just one row. Don't concatenate because you will introduce ""false"" transitions: last state of one line $\to$ first state of the next line. You have to change the code to loop through the lines of your matrix and count the transitions. At the end, normalize each line of the transition matrix.",2012-09-15 23:57:51.537 +59038,32317,,CC BY-SA 3.0,,"I believe Apache Mahout has some clustering algorithms, but I don't know much about it (other than that is runs over Hadoop).",2012-09-25 20:59:31.763 +59207,32388,,CC BY-SA 3.0,user88,"IMO the main problem is that you have so little data here that any model makes (no) sense; remember that in contrary to you R doesn't understand variable names and can't infer from that. BTW you can get ""right"" result simply by reversing the order of columns -- this even better shows that you are torturing this poor algorithm stranded in a no-information regime.",2012-09-26 23:25:18.097 +59212,32388,633.0,CC BY-SA 3.0,,"In any case, the two dependence structures are identical (as long as you do not have interventions). Also, I think you are confusing the direction of reasoning with the directions of the arrows. Finally, all variables in a Bayesian network can be a response variable and can have arcs to other variables.",2012-09-27 00:01:10.453 +59303,32317,8208.0,CC BY-SA 3.0,,Actually I will have to cluster data in the form of sets of features. In another question I have been told that I should try using some set-based distance metrics (Such as Jaccard or Tanimoto) but they're not implemented in Weka. Do you recommend me other libraries supporting these distances?,2012-09-27 14:07:10.020 +62043,4187,,CC BY-SA 3.0,user16110,I just gave a talk on this subject... A link to the video follows if you are interested. http://www.youtube.com/watch?v=1SNQQvY1ESo&feature=g-upl,2012-10-21 01:13:53.057 +62045,4187,,CC BY-SA 3.0,,"Hi @Amanda, could you give some indication here of what's in the talk? No-one likes the possibility of being rick-rolled.",2012-10-21 02:17:50.833 +62455,10069,3733.0,CC BY-SA 3.0,,"This means that we should start deleting the terms from y ~ x1 * x2 * x3 * x4, starting deleting the highest-order terms, i.e. the normal deletion method, right?",2012-10-25 08:11:04.330 +62515,10069,2666.0,CC BY-SA 3.0,,"Deletion of terms is not recommended unless you can test entire classes of terms as a ""chunk"". For example it may be reasonable to either keep or delete all interaction terms, or to keep or delete all interactions that are 3rd or 4th order.",2012-10-25 19:13:21.093 +62520,34166,668.0,CC BY-SA 3.0,,I was moved to post this as a separate question based on comments at http://stats.stackexchange.com/questions/23779.,2012-10-25 20:10:56.277 +104671,57730,594.0,CC BY-SA 3.0,,How did the continuous variables became discrete (multinomial)?,2013-10-17 20:12:26.730 +63093,34166,,CC BY-SA 3.0,user16414,"You might be interested in the (now large) literature in philosophy on this paradox. Here is a fairly complete bibliography (with links): +http://philpapers.org/browse/sleeping-beauty",2012-10-31 14:13:08.683 +64170,35097,1506.0,CC BY-SA 3.0,,Discussion on Gelman's blog: http://andrewgelman.com/2012/11/16808/,2012-11-11 16:27:06.533 +64176,35097,,CC BY-SA 3.0,,"I think a lot is wrong, both from the frequentist and Bayesian point of view. 
My biggest criticism each: First, P values are ultimately heuristics and are properties of a number of things including the statistical problem, data and experiment. Here, all three are grossly misrepresented for that particular question. Second, the ""Bayesian"" uses a decision theoretic approach which need not be Bayesian. It's funny, though.",2012-11-11 17:33:54.870 +64262,35097,,CC BY-SA 3.0,,"To take it out of the statistics realm....the sun isn't massive enough to go nova. QED, the Bayesian is right. ([The Sun will instead become a Red Giant](http://simple.wikipedia.org/wiki/Sun#The_fate_of_the_Sun))",2012-11-12 17:15:25.793 +64268,35160,,CC BY-SA 3.0,user10525,"(+1) A nice reference on this *strong and crucial* assumption of repeatability in frequentism is [Statistical Inference in Science (2000)](http://www.amazon.com/Statistical-Inference-Science-D-Sprott/dp/0387950192), chapter 1. (Although there are so many issues that it is difficult to tell which one is *the main* one)",2012-11-12 17:41:32.140 +64270,35097,,CC BY-SA 3.0,,"@Glen et alii, in particular, note Randall Munroe's response to Gelman: http://andrewgelman.com/2012/11/16808/#comment-109366",2012-11-12 17:48:10.677 +64271,35097,,CC BY-SA 3.0,user10525,"I believe the comic confounds **estimation** and **hypothesis testing** (Basic mistake!). + +The machine *estimates*,using a decision-theoretic approach, the probability of an event. The outcome is 0 or 1, based on the decision rule. + +The frequentist statistician relates this with a p-value (why? Just for fun). He/she should have related this value with a point estimator.",2012-11-12 18:00:20.307 +64356,35160,,CC BY-SA 3.0,,"Not so fast with the repeatability argument... First, the experiment that is repeatable is the querying of the machine not the sun going nova The truth of *that* is the fixed but unknown object of inference. The querying experiment can certainly be repeated, and if it were for a few more times the frequentist strategy could easily seem reasonable.",2012-11-13 09:42:33.477 +64357,35160,,CC BY-SA 3.0,,"Second, one should not be too stringent on the repeatability business anyway, lest frequentists be stuck not being able to infer anything at all in non-experimental situations. Assume for a moment that 'sun goes nova' was the candidate event. I'm no physicist, but I'm told that the event 'sun goes nova' happens rather often (just not so much around here) so this sounds to me like a repeat. In any case, folk like David Cox (in 'Foundations of Statistics') cheerfully say things like: ""the repetitions contemplated are *almost always hypothetical*. This by itself seems no drawback"".",2012-11-13 09:51:26.513 +64392,35160,651.0,CC BY-SA 3.0,,We could view the sun as a random sample from a population of suns in parallel universes in which we could in principle repeat the experiment if only we had a quantum mirror! ;o),2012-11-13 17:46:25.117 +64393,35097,,CC BY-SA 3.0,,Discussion and response on Larry's blog: [http://normaldeviate.wordpress.com/2012/11/09/anti-xkcd](http://normaldeviate.wordpress.com/2012/11/09/anti-xkcd),2012-11-13 17:58:18.583 +64445,35249,668.0,CC BY-SA 3.0,,"How, *exactly,* are the samples obtained and measured? This matters because if the samples represent averages within each layer at their locations, then the varying layer thickness will change the distributions of the values and thereby suggest one set of approaches. 
Otherwise, if the results are from subsampling each layer (which often happens in the lab), then another set of approaches might be favored.",2012-11-13 22:43:26.443 +64447,35249,11884.0,CC BY-SA 3.0,,"Thanks @whuber for the fast reply: Layers are (re)calculated with weighted average, from sampling layers. So they do not represent the actual sampling and lab measured samples (every profile has a different layer divison for sampling). And after recalculating the layering is uniform for every sample for every property.",2012-11-13 22:49:11.590 +64448,35249,668.0,CC BY-SA 3.0,,"You may have a hard time, then, interpreting the results: they could tell you more about your interpolation (averaging) method than about what's really going on. Is there a reason not to do the PCA with the original data?",2012-11-13 22:50:58.050 +64451,35249,11884.0,CC BY-SA 3.0,,"Sampling is really diverse for every single point. It means sometimes the first layer is 0-1cm sometimes 0-100cms. The goal would be clustering, on a uniform layering, but I'd like to get rid of correlating properties.",2012-11-13 22:54:32.360 +64453,35249,11884.0,CC BY-SA 3.0,,"@whuber I was considering also splines, not weighted average but in some cases it would have result to misleading values if NAs are present in a profile.",2012-11-13 23:05:45.187 +66501,20234,12900.0,CC BY-SA 3.0,,on further thought on this question it seems the related/relevant area of research is called _global optimization_ methods/variants on top of local-type algorithms eg gradient descent...,2012-12-05 06:29:31.657 +66502,20240,12900.0,CC BY-SA 3.0,,"accepting this answer as a description of current state of affairs & general categories of applications but still think there are some bridge thms that exist & remain to be proven that link up the separate areas. the proof that NNs can model or _""approximate""_ any continuous mathematical fn to arbitrary degree of accuracy is closely related... ie [kolmogorovs thm](http://neuron.eng.wayne.edu/tarek/MITbook/chap2/2_3.html)",2012-12-05 06:42:15.477 +66503,20234,12900.0,CC BY-SA 3.0,,"eg [""global optimization for neural network training""](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.42.189) by shang & wah",2012-12-05 06:43:57.550 +68498,20667,4890.0,CC BY-SA 3.0,,A similar question in lines of specific datasets has been closed here : http://stats.stackexchange.com/questions/38928/high-dimensional-regression-datasets,2012-12-22 18:09:39.020 +69088,37981,,CC BY-SA 3.0,user10525,"Indeed, the measures of kurtosis are typically applied to symmetric distributions. You can calculate it for skewed ones as well but the interpretation changes since this value varies when the asymmetry is introduced. In fact, these two concepts are difficult to separate. Recently, a skewness-invariant measure of kurtosis was proposed in [this paper](http://amstat.tandfonline.com/doi/abs/10.1198/tast.2011.10194).",2013-01-03 16:22:03.980 +69148,37981,594.0,CC BY-SA 3.0,,"High kurtosis is associated with peakedness and with heavy tailedness (it's also characterized as 'lack of shoulders'). One of the volumes of Kendall and Stuart discuss these issues at some length. But such interpretations, are, as you note, generally given in the situation of near-symmetry. In nonsymmetric cases, the standardized 4th moment is usually highly correlated with the square of the standardized third moment, so they're mostly measuring much the same kind of thing.",2013-01-03 23:44:05.083 +70509,34166,668.0,CC BY-SA 3.0,,"Thank you, @jpust! 
That is a wonderful resource. It's going to take a while to get through it. :-)",2013-01-17 01:53:36.057 +100470,55576,2149.0,CC BY-SA 3.0,,@whuber the partial acf is the conditional acf. THe relationship between the pacf and the acf is the same as that between partial regression coefficients and regression coefficients. It is a matrix of partial (auto) correlations.,2013-09-18 15:15:45.967 +72633,40104,594.0,CC BY-SA 3.0,,"Some *particular* kinds of combinations have names (e.g. [the Skellam distribution](http://en.wikipedia.org/wiki/Skellam_distribution)), but the general case its just compound Poisson, I think. There will be no closed form expression that's simpler than the original sum in general, though I suppose you can just write it as a convolution. When you say 'generate' do you mean 'simulate random values from', or something else?",2013-02-08 02:56:43.813 +72636,40104,668.0,CC BY-SA 3.0,,"A closely related special case--that of the average of Poisson variates, binned by integers--is treated at http://stats.stackexchange.com/questions/35042/what-is-the-distribution-of-an-average-of-poisson-random-variables.",2013-02-08 03:17:58.850 +72744,40104,,CC BY-SA 3.0,,"Why are you scaling the summands with $a_1$ and $a_2$? The sum is just another Poisson distribution without this. The variables take values in the positive integers, so something like $1$ times the first plus $\sqrt{2}$ times the second is usually quite unnatural, and would let you recover the values of both variables.",2013-02-09 19:57:47.617 +72751,40104,4656.0,CC BY-SA 3.0,,"The difficulty here is that unless both $a_1$ and $a_2$ are integers, one cannot be sure that $S_2$ takes on integer values only. Thus, you need to find not just $P(S_2 = k)$ for integer values of $k$ but also $P(S_2 = \alpha)$ for each $\alpha$ that can be expressed as $a_1m + a_2n$ for nonnegative integers $m$ and $n$.",2013-02-09 21:30:33.927 +72753,40104,14684.0,CC BY-SA 3.0,,@DilipSarwate Is that possible? Is there an other approach to do this?,2013-02-09 21:43:26.710 +72754,40104,14684.0,CC BY-SA 3.0,,@DouglasZare I have to do this... Maybe I have to turn to some kind of bootstrapping method.,2013-02-09 21:45:04.850 +72755,40104,4656.0,CC BY-SA 3.0,,"I don't think you can do much better than a brute-force approach which finds the possible values that $S_2$ can take on and then for each $\alpha$, use $$P\{S_2 = \alpha\} = \sum_{a_1m + a_2n = \alpha}P\{X_1=m\}P\{X_2=n\} = \sum_{a_1m + a_2n = \alpha} \exp(-\lambda_1m)\frac{\lambda_1^m}{m!}\exp(-\lambda_2n)\frac{\lambda_2^n}{n!}.$$ For most choices of $a_1$ and $a_2$, I would expect that most sums will reduce to a single term. I expect you know that for $a_1=a_2=1$, $S_2$ is a Poisson random variable with parameter $\lambda_1+\lambda_2$.",2013-02-09 22:01:52.060 +72762,40104,,CC BY-SA 3.0,,"You can't provide more context than that you ""have to do this?"" Is it homework, possibly miscopied?",2013-02-09 23:51:40.077 +72772,40104,14684.0,CC BY-SA 3.0,,"@DouglasZare Yes, I don't like to end up doing monte carlo simulations (bootstrapping), so I would really like to obtain at least an approximation.",2013-02-10 03:23:31.960 +72790,40121,,CC BY-SA 3.0,user88,What kind of software is that?,2013-02-10 12:17:02.557 +72806,40121,14728.0,CC BY-SA 3.0,,"@mbq, This is JMP. But is this comparison circle a general concept, not specific to the software?",2013-02-10 17:13:24.430 +72816,40121,,CC BY-SA 3.0,user88,"This is a first time I see such plot, so I think it is native to JMP. 
BTW I bet SAS would sue for even thinking to implement this in other applications (-;",2013-02-10 19:53:01.127 +72823,40121,14728.0,CC BY-SA 3.0,,"@mbq, I'm sure they'll do. But what applications are these plots for? Or what other criteria I can use to compare data? Basically I want to compare several data sets and find out which set is _significantly different_ from others, so I can exclude them in my future development.",2013-02-10 20:41:46.593 +72828,40121,,CC BY-SA 3.0,user88,I suspect the red ones are outliers due to various methods and radii somewhat correspond to confidence intervals; but honestly I have no idea what this mean. However the question is now properly tagged so I hope some JMP expert will give you a satisfying answer soon.,2013-02-10 21:18:19.477 +74195,40859,,CC BY-SA 3.0,,"The Frank Harrel quote is out of context. Please provide the source. Otherwise, splitting your data in the way you described seems appropriate. You may consider a split into training, test, and validation set to strengthen the validity of your model.",2013-02-22 09:36:04.627 +74201,40859,15044.0,CC BY-SA 3.0,,Thanks @Bernhard. Sorry if it was out of context. I thought the example discussed in that thread was very similar to mine. Why do you think that was different?,2013-02-22 11:39:19.347 +74213,40859,2666.0,CC BY-SA 3.0,,"No, the context would not help. Data splitting for your sample size is not reliable. I.e. you get different model and different test data performance if you split again, and the mean squared error of your performance metric is high.",2013-02-22 13:18:21.483 +74218,40859,13037.0,CC BY-SA 3.0,,"You could split your data and do the testing many times: split into train and test, fit model, do diagnostics, and then repeat many times (make sure your new samples are different!) I think thats called like cross-validation or something",2013-02-22 13:37:34.623 +74229,40859,15044.0,CC BY-SA 3.0,,"@FrankHarrell Thanks! What alternative to ""Data splitting"" would you suggest?",2013-02-22 16:29:05.890 +74231,40859,2666.0,CC BY-SA 3.0,,"The bootstrap, as in the `validate` and `calibrate` functions in the R `rms` package.",2013-02-22 17:14:43.933 +74924,41244,166.0,CC BY-SA 3.0,,Is this homework? If so then please tag it as such.,2013-03-01 05:23:11.897 +74925,41244,166.0,CC BY-SA 3.0,,"Also please tell us your current thinking on this question, c.f.: http://stats.stackexchange.com/faq",2013-03-01 05:24:24.830 +74940,41244,15330.0,CC BY-SA 3.0,,"This is not homework. +I've found some approaches on wikipedia (http://en.wikipedia.org/wiki/Checking_whether_a_coin_is_fair). +The bayesian approach depends on an arbitrary prior distribution, which seems bizarre. He recommends using a distribution centered on 0.5 but uses an uniform. The choice of the prior distribution is not so important? How can I trust a distribution that was chosen arbitrarily? +I don't understand how the frequentist approach might give me a distribution. +A friend said that bootstrapping might give a good answer, is it true? How would that work?",2013-03-01 11:10:20.163 +74943,41244,15330.0,CC BY-SA 3.0,,"Reading about priors, the article on wikipedia (http://en.wikipedia.org/wiki/Prior_probability) seems to recommend Jeffrey's prior (http://en.wikipedia.org/wiki/Jeffreys_prior#Bernoulli_trial) which is 1/sqrt[p(1-p)], although I didnt understand the explanation of why.",2013-03-01 11:43:03.910 +74956,41244,15330.0,CC BY-SA 3.0,,The distribution function cannot be estimated from the coin results alone? 
It will always require a prior distribution? This is counterintuitive.,2013-03-01 15:12:47.037 +74958,41244,594.0,CC BY-SA 3.0,,@LeoHMArruda Can you explain the context in which such a question arises (how did you arrive at this question?),2013-03-01 15:20:51.333 +74960,41244,166.0,CC BY-SA 3.0,,The question you have posed is not directly about the fairness of the coin.,2013-03-01 15:47:24.230 +74990,41244,15330.0,CC BY-SA 3.0,,Well it is basically the problem of finding out if a Bernoulli trial is biased and how much by repeating it many times. I was discussing it as a simplified model of hypothesis testing in a class.,2013-03-01 20:20:27.067 +76103,41914,2164.0,CC BY-SA 3.0,,There is no published or described process of how the Lewandowski method works. It is a mystery.,2013-03-12 12:53:22.320 +94842,52567,594.0,CC BY-SA 3.0,,"The origin of the term is pretty old, and I believe goes back to the origins of inferential statistics in analysis of experiments; in particular, I think it referred to the way that the X-matrix related to the actual experimental design (the specific settings of the $x$-values). If I can find a specific reference I'll post an answer.",2013-08-04 22:48:37.623 +77195,42513,1741.0,CC BY-SA 3.0,,"Why not. Just to add a comment. The mean is a summary value as the histogram is. You can vary the degree of information provided varying the bucket size of the histogram for example. However, usually the histogram provides more information than just the mean. You can actually approximate the mean value from an histogram. I think that is why they are not usually provided together.",2013-03-19 22:53:58.093 +77393,42513,,CC BY-SA 3.0,,"One sometimes sees histograms with an overlaid distribution (e.g. most commonly in my experience, the normal distribution plotted using the sample mean and standard deviation.) Which is doing the same thing (and a bit more) as drawing a vertical line (indicating where sample mean is with the peak of the curve.)",2013-03-20 23:27:38.200 +78776,43458,,CC BY-SA 3.0,,"As @EngrStudent mentions below, 5 data points isn't much. But, could you perhaps describe your data a bit better?",2013-04-02 17:30:04.907 +79611,42517,5237.0,CC BY-SA 3.0,,"+1, these are nice; care to add the code? `abline(v=mean(Davis2[,2]))` & `rug(Davis2[,2])` I would guess, but how did you wedge the boxplot in there?",2013-04-10 02:45:13.830 +79613,42517,594.0,CC BY-SA 3.0,,"@gung See the edit for brief details, including a reproducible example similar to the one with the boxplot. It's really doing nothing more clever than making use of several of the arguments to the `boxplot` function. Between `boxplot` and `boxp` you can do some rather nifty things with little effort.",2013-04-10 03:29:02.947 +79614,42517,5237.0,CC BY-SA 3.0,,"Wisdom for the ages: ""If you don't write an intelligent function to take care of everything, it's necessary to pay attention to what everything does to make sure it's doing what you want"" ;-).",2013-04-10 03:45:13.107 +79615,42517,594.0,CC BY-SA 3.0,,"Yep. I even contemplated writing something clever to set `at` and `boxwex` and so on... but at best I only do a few plots like that a year, and it takes a few seconds each time to type ?boxplot and set the right options. I figured it's easier to just pay attention to what I am doing.",2013-04-10 04:14:13.197 +79654,42517,594.0,CC BY-SA 3.0,,@gung I edited to give code to create the Davis2 data I was using. 
Hope that helps.,2013-04-10 08:55:36.807 +80250,2509,,CC BY-SA 3.0,,"This question lead me to a good paper, and even though I think that is a great quote it is not from Einstein. This is a common misattribution, and the more likely original quote is probably this one from Ernest Rutherford who said, ""If you can't explain your physics to a barmaid it is probably not very good physics."" All the same thanks for starting this thread.",2013-04-15 15:03:37.020 +80264,44370,,CC BY-SA 3.0,,Why would you look at the entire genome when you have a control for the region that you know you are interested in?,2013-04-15 16:20:08.277 +80274,44370,8063.0,CC BY-SA 3.0,,"The main reason I think is that my modification may also affect distant genomic regions (some trans effect, I imagine if I hit a transcription factor, many genes elsewhere may be messed up). So while I expect local effect, I cannot exclude distal ones... Also but less importantly, I think it would make my statement about local effect much stronger if I could show that only that region is affected in the whole genome.",2013-04-15 21:22:22.763 +80315,44370,,CC BY-SA 3.0,,Your idea is good. You might save a little effort and see what happens when you normalize your DE count by gene density; i.e. $\frac{DE_i}{GD_i}$ before attempting the permutation testing.,2013-04-16 12:20:22.480 +80659,44635,668.0,CC BY-SA 3.0,,"A more robust method would be to test for symmetry around the *median*: after all, when a distribution is symmetric about its mean, its median must coincide with its median. Robustness is desirable because a single outlier would cause the empirical distribution to look highly asymmetric around the sample mean but would barely affect the symmetry around the sample median. But precisely how do you propose to apply the Wilcoxon test here?",2013-04-18 18:44:47.707 +80660,44635,728.0,CC BY-SA 3.0,,"@whuber: (1) We can test the symmetry of a distribution around 0 by Wilcoxon sign rank test, based on its sample. This can be done, by letting $x_{1i} = 0, \forall i$ in [wikipedia](http://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test). (2) How do people tell if a testing procedure is ""valid"" for a task?",2013-04-18 18:49:02.607 +80916,44772,,CC BY-SA 3.0,,"For clarity, did the same two raters see all 18 participants for all four time points? Did all 18 participants experience the same four-conditions? Finally, what is your research question: rater difference, condition difference, or time difference?",2013-04-20 21:16:40.430 +80978,44772,17076.0,CC BY-SA 3.0,,"Yes. Both raters reviewed all 18 participants at the 4 points. I am interested in condition difference (perhaps controlling for practice effect over time), but using the raters to confirm scoring. THANKS!!!",2013-04-22 00:24:37.880 +81019,44772,,CC BY-SA 3.0,,"Try to understanding your data before you do an ANOVA. First, plot your data to ensure that it is normally distributed. Then plot total score by rater to determine if there is a difference between raters. Estimate mean and 95% CI for the mean for each rater. Then estimate mean and 95% CI for the mean for the average total score of the raters for each of the four conditions. Check and see conditions don't have overlapping of the confidence intervals. Try doing boxplots with notches to visualize overlap. 
ANOVA will tell you if one of the conditions is different, but not which.",2013-04-22 12:55:44.710 +81612,45280,503.0,CC BY-SA 3.0,,"With only 6 data points, each with noise, no statistical test is going to give significant results and there will possibly be many shapes that fit.",2013-04-26 18:08:51.113 +81655,45280,594.0,CC BY-SA 3.0,,"If you had 6000 data points, identifying fairly general kinds of association should be possible. If you had 600 observations and made some weak-to-moderate assumptions about the general forms of association, you might get somewhere. If you had 60 observations and made fairly strong assumptions about particular forms of association it might be doable. On six observations with large error? I doubt there's any way you're going to achieve much; even something as explicit as monotonic association is going to be nigh impossible to show if the errors are large.",2013-04-27 03:07:55.383 +81900,45457,17179.0,CC BY-SA 3.0,,R for statistical computing,2013-04-29 17:46:41.690 +82042,45536,17447.0,CC BY-SA 3.0,,"Could you explain the stats behind it? (im a first year so im not the best) + +I understand theres a pdf, f(c) which has a CDF F(c) + +and then they go onto talk about F(L) + +what is the function F in F(L) here? is this a cdf? + +and more importantly, the coursework question is, ""what does it imply when F(L) < 0.4""",2013-04-30 15:46:12.873 +82044,45536,15663.0,CC BY-SA 3.0,,"I d'ont use these words but yes, i think pdf = distribution and cdf = cumulative function. F is still the cumulative function for C. As $F(infinity) = 1$, $F(L) < 1$ strictly means that $P(c>L)>0$, that there is cost superior to the fine. This does NOT strictly means that the fine is lower to the compliance cost. But as far as I don't have the question I'am not sure wether it's an hypothesis you want to make or something you wnat to show.",2013-04-30 16:01:48.870 +82048,45536,15663.0,CC BY-SA 3.0,,"$F(L) < 0.4$ means that c has <40% chance to be below L and >60 to be above. Express what does it mean for the expected profit of the company, the environmental damage.",2013-04-30 16:27:55.377 +103589,57200,594.0,CC BY-SA 3.0,,"You (almost) always will do a good deal better on the training data than the test data, since you optimize the fit on the training data.",2013-10-10 08:52:07.413 +82051,45536,17447.0,CC BY-SA 3.0,,"hey imorin, ive attached the pdf file here +[link](http://www.scribd.com/doc/138719032/Anthony-Heyes-and-Neil-Rickman) + +could you help me figure this out. there are 2 equations i do not understand how they were derived. none of my coursemates have chosen this topic so i cant get help anywhere else. + + +could you explain equation (3) and (4) to me? + +and could you tell me how i should structure my answer with regards to what F(L) < 0.4 implies?",2013-04-30 16:39:50.690 +82064,45543,668.0,CC BY-SA 3.0,,@Andre Silva http://en.wikipedia.org/wiki/Metric_%28mathematics%29.,2013-04-30 18:37:13.613 +82103,45534,17447.0,CC BY-SA 3.0,,"@Glen_b alright i signed up for this website today so i wasnt aware of it, my other question which was legit just get downvoted and i was wondering why, makes sense now, thanks for the heads up",2013-05-01 01:47:35.990 +82105,45534,594.0,CC BY-SA 3.0,,You can edit your questions to better follow the guidelines. That may help.,2013-05-01 01:55:23.953 +82273,45536,15663.0,CC BY-SA 3.0,,I'm sorry but I can reach your pdf. 
Probably due to my internet proxy,2013-05-02 14:24:07.263 +82294,45279,,CC BY-SA 3.0,,"If you find a solution that satisfies you, you can also post your code as an answer",2013-05-02 16:31:09.013 +82303,45279,750.0,CC BY-SA 3.0,,This is turning into something like [parsets](https://code.google.com/p/parsets/). See [ggparallel](http://cran.r-project.org/web/packages/ggparallel/index.html) for an R implementation.,2013-05-02 17:23:19.523 +82393,45279,674.0,CC BY-SA 3.0,,"Until I noticed @Andy's comment, I was thinking of something like a [clustergram](http://www.r-statistics.com/2010/06/clustergram-visualization-and-diagnostics-for-cluster-analysis-r-code/) (with subjects' ID vs. no. clusters) or maybe a [streamgraph](http://en.wikipedia.org/wiki/Streamgraph) (probably less appealing if you have few clusters). This, of course, assume you are willing to work at the individual level.",2013-05-03 11:39:42.770 +85258,47497,18382.0,CC BY-SA 3.0,,"in the first line I don`t write ""family=""yjPower"".",2013-05-28 08:56:46.570 +85261,47497,,CC BY-SA 3.0,,"Hi Daniel, welcome to the site. You don't have to code it yourself. The Yeo-Johnson transformations are implemented in the [`car` package](http://cran.r-project.org/web/packages/car/car.pdf) with the function `yjPower`. So just use `yjPower(datc$plot, lambda=lambda.max, jacobian.adjusted=FALSE)`. I think that should work.",2013-05-28 09:12:01.920 +85262,47497,18382.0,CC BY-SA 3.0,,"Thanks @ COOLSerdash I tried, but the problem is in the fist line:",2013-05-28 09:24:43.480 +85263,47497,18382.0,CC BY-SA 3.0,,"lambda.fm1 <- boxcox(datc$plot, ... doesn´t work because datc$plot contains zeros",2013-05-28 09:26:30.017 +85264,47497,,CC BY-SA 3.0,,"Okay, then try to use the function `boxCox` from the `car` package and use it with the option `family=""yjPower""`.",2013-05-28 09:26:34.570 +85290,47497,594.0,CC BY-SA 3.0,,See the `yeo.johnson` [function](http://hosho.ees.hokudai.ac.jp/~kubo/Rdoc/library/VGAM/html/yeo.johnson.html) in the package `VGAM` as well. It is on CRAN.,2013-05-28 12:07:01.123 +86035,34166,,CC BY-SA 3.0,,"This is nice because it's a really easy way to explain the different interpretations of probability (objective vs. personal) to someone with no experience - i.e. is the coin fair, or how many possible ways to wake are there? Obviously there's tonnes of ways to make nuanced arguments either way... Somehow this seems to me more like a rabbit hole than a paradox :D",2013-06-03 01:52:11.707 +86204,47981,15827.0,CC BY-SA 3.0,,"0.04993 < 0.05, so it's just lower. Your instinct is good that no P-value can be trusted to several decimal places, but if the program says less than 0.05, people generally take it as delivered. The real issue here is making a fetish of fixed-level significance testing so that < 0.05 means ""real"", ""publishable"", ""cause for happiness"" and the opposite means ""illusory"", ""not publishable"", ""cause for misery"". Most good introductory texts on statistics discuss this to some extent. One good one is Freedman, Pisani, Purves, _Statistics_. New York: W.W. Norton, any edition.",2013-06-04 09:35:35.103 +86205,47981,,CC BY-SA 3.0,,You have to ask yourself what would be your decision if the p-value is 0.051? what if it is 0.049? Would you make different decisions? Why?,2013-06-04 09:43:12.570 +86207,47981,16990.0,CC BY-SA 3.0,,"Thank you for your comments. In our case we are not pondering whether the data is publishable or not, etc... 
We are simply considering making a statement in the paper about the statistical significance of this result, and we want to make sure our statement is not incorrect or inaccurate.",2013-06-04 10:52:23.860 +86210,47981,15827.0,CC BY-SA 3.0,,"Reporting P=0.04993 is what springs to mind. It's difficult to predict reviewers' or editors' comments. If you want to round, specifying a consistent rounding convention is always a good idea and widely acceptable. Some people would round to 3 d.p. and might also use some kind of starring convention so reporting 0.050 (3 d.p.) and starring it as <0.05 are consistent.",2013-06-04 11:27:28.397 +86219,47981,,CC BY-SA 3.0,,What is the W value of the statistic? What would the corresponding critical value of your p-value be? How far are those apart?,2013-06-04 13:58:51.567 +86229,47981,,CC BY-SA 3.0,,"@IslamEl-Nabarawy since the significance level is arbitrary anyway, if you have defined it at 5% then yes the p-value is significant by your definition of it. At the same time, I think AlefSin makes a good point.",2013-06-04 16:01:59.183 +86391,48103,,CC BY-SA 3.0,,You're trying to fit a sine wave to the data or are you trying to fit some kind of a harmonic model with a sine and a cosine component? There is a harmonic function in the TSA package in R that you might want to check out. Fit your model using that and see what kind of results you get.,2013-06-05 18:31:29.100 +86392,48103,16992.0,CC BY-SA 3.0,,"Have you tried different starting values? Your loss function is non-convex, so different starting values can lead to different solutions.",2013-06-05 18:34:11.083 +86394,48103,15827.0,CC BY-SA 3.0,,"Tell us more about the data. Usually there is a known periodicity, so that need not be estimated from the data. Is this a time series or something else? It is much easier if you can fit separate sine and cosine terms by a linear model.",2013-06-05 18:46:34.350 +86411,47981,16990.0,CC BY-SA 3.0,,"@NickCox: We reported all the results to 4 d.p., and in the text we noted that while it is lower than 0.05, it was only by a very narrow margin.",2013-06-05 21:45:33.733 +86426,48103,594.0,CC BY-SA 3.0,,"Having an unknown period makes your model nonlinear (such an event is alluded to in the selected answer at the linked post). The given that, the other parameters are conditionally linear; for some nonlinear LS routines that information is important and can improve the behaviour. One option might be to use spectral methods to get the period and condition on that; another would be to update the period and the other parameters via a nonlinear and linear optimization respectively in an iterative fashion.",2013-06-05 23:13:14.360 +86427,48103,594.0,CC BY-SA 3.0,,(I just edited the answer there to make the particular case of unknown period an explicit example of what can make it nonlinear.),2013-06-05 23:19:06.557 +94847,52567,728.0,CC BY-SA 3.0,,"@Glen_b: Thanks! Does ""design"" have something to do with choosing a transform on the input variable, so that the output variable is also linear in the transformed input variable? 
For example, the design matrix in polynomial regression?",2013-08-05 00:04:27.643 +94854,52567,668.0,CC BY-SA 3.0,,When you design an experiment you specify the values of $X$.,2013-08-05 00:49:16.323 +86428,48103,594.0,CC BY-SA 3.0,,"Because the other parameters can be estimated linearly, you can plot $S$, the sum of squares of residual (SSE), against $\omega$ for a wide range of $\omega$ (for each $\omega$ over some set - say a grid or whatever around a sensible start value - use LS to estimate the other parameters and hence $S$); refine the detail in the more interesting areas of $\omega$ (better values of $S$). This lets you optimize $\omega$ without worrying about $-S$ being unimodal.",2013-06-05 23:24:17.103 +86455,48133,,CC BY-SA 3.0,,(+1) nice answer. I tried to fit the linear model with `lm(y~sin(2*pi*t)+cos(2*pi*t)` but this didn't work (`cos` term was always 1). Just out of curiosity: what do the first two lines do (I know that `spectrum` estimates the spectral density)?,2013-06-06 08:44:28.790 +86458,48133,594.0,CC BY-SA 3.0,,"@COOLSerdash Yeah, you have to have the units of $t$ being the period (as it was in the linked question) for `2*pi*t` to work. I should go back and emphasize that in the other answer. (ctd)",2013-06-06 10:06:39.880 +86459,48133,594.0,CC BY-SA 3.0,,"@COOLSerdash (ctd)- The 2nd line finds the frequency associated with the biggest peak in the spectrum and inverts to identify the period. At least in this case (but I suspect more widely), the defaults on it essentially identifies the period that maximizes the likelihood so closely that I deleted the steps I had in to maximize the profile likelihood in the region around that period. The function `spec` in TSA may be better (it seems to have more options, one of which may be important sometimes), but in this case the main peak was in exactly the same place as with `spectrum` so I didn't bother.",2013-06-06 10:07:13.630 +87234,48597,5237.0,CC BY-SA 3.0,,"How many variables did you measure / test? Do you think of them as related to each other, or are they independent?",2013-06-13 05:13:56.340 +87235,48597,594.0,CC BY-SA 3.0,,"With the 'probability test', do you think he might be referring to [Fisher's method](http://en.wikipedia.org/wiki/Fisher%27s_method)? If so, you'd need the various response variables to be independent, which I would usually doubt.",2013-06-13 05:39:14.840 +87236,48597,594.0,CC BY-SA 3.0,,"You say ""T-test"" in your title but ""t-test"" in your body text. Did you do several univariate two-sample t-tests, or did you do a single multivariate [T-test](http://projecteuclid.org/DPubS?service=UI&version=1.0&verb=Display&handle=euclid.bsmsp/1200500217)? (See also [this](http://en.wikipedia.org/wiki/Hotelling%27s_T-squared_distribution#Hotelling.27s_two-sample_T-squared_statistic) and [this](http://faculty.smu.edu/kyler/courses/7314/Hotellings_T.pdf) and [this](http://www.psych.yorku.ca/lab/psy6140/lectures/HotellingT2-2x2.pdf))",2013-06-13 05:40:27.923 +87673,48103,11200.0,CC BY-SA 3.0,,"Ok guys, it's been a while but I solved the problem (at least to my satisfaction). What I did is to take the linearization (see linked thread) and the to just loop through all the possible periods and calculate the goodness of fit. By that value I now chose the period which fitted the data best. 
It's a real low level solution but it worked fine for my purpose.",2013-06-17 12:36:50.907 +87845,37981,594.0,CC BY-SA 3.0,,"Indeed, given the particular way I phrased it in my earlier comment, it's true even of symmetric distributions - the square of the sample standardized third moment (squared moment skewness) is highly correlated with the sample standardized fourth moment ('kurtosis'), even at say the normal.",2013-06-19 00:27:14.723 +88040,2509,15827.0,CC BY-SA 3.0,,"Alice Calaprice, _The ultimate quotable Einstein_, Princeton U.P. 2011 flags the quotation here as one of many ""Probably not by Einstein"". See p.482.",2013-06-20 10:01:41.333 +89173,31587,6404.0,CC BY-SA 3.0,,I'd really appreciate it if you could comment on whether you think it is possible to create a confidence interval for the transition probabilities?,2013-06-30 08:38:16.763 +90150,49879,594.0,CC BY-SA 3.0,,"That appearance might be a consequence of the intervals that were chosen, or of something else. There's really very little to go on here. When you say ""my estimates"" you suggest the depicted estimates are yours, but your question earlier suggests the slides belong to someone else. Are the estimates yours, and if not, why call them yours?",2013-07-07 04:48:11.837 +91311,49879,19492.0,CC BY-SA 3.0,,"@Glen_b no, I just were talking from my point of view, so these are not my estimates.",2013-07-13 07:54:31.493 +91413,50739,503.0,CC BY-SA 3.0,,"If A, B and C are separate groups (not levels of a continuous variable) then you can't really draw lines between them, or place them equidistant on the x-axis.",2013-07-14 11:13:58.673 +92286,49906,15473.0,CC BY-SA 3.0,,"It is interesting! The two methods should have similar results according to the theory. If I understand the question correctly, the null hypothesis of McNemar's test is $p_b=p_c$, while the null hypothesis of the test of conditional logistic regression is odds ratio $ad/bc=1$ within a stratum.",2013-07-18 21:29:58.033 +92328,49906,5821.0,CC BY-SA 3.0,,"I seem to recall that one can parameterize the McNemar's test as a test of an odds ratio, so I wonder how one would write out the likelihood (conditional likelihood?) for that test.",2013-07-19 04:51:41.013 +92517,49906,15473.0,CC BY-SA 3.0,,"I am not sure if you mean the exact version of McNemar's Test. [Breslow and Day (1980)](http://w2.iarc.fr/en/publications/pdfs-online/stat/sp32/SP32.pdf), p. 164-166 and [package](http://cran.r-project.org/web/packages/exact2x2/vignettes/exactMcNemar.pdf) `exact2x2` may be references.",2013-07-19 22:10:19.587 +92675,50982,,CC BY-SA 3.0,,"I think the term *bias-variance tradeoff* does not apply here, because you are not decomposing a mean squared error into a bias and a variance, and you are not talking about the variance of an estimator but about the variance of a score.",2013-07-21 10:48:21.457 +92699,50982,1790.0,CC BY-SA 3.0,,"Thanks @Lucas. I am trying to estimate the score of my classifiers $A$ and $B$ on *unseen* data. For this, I thought I could take the mean of scores on *seen* data as my estimators (i.e. $E(P_A)$ and $E(P_B)$ for $A$ and $B$ respectively). Is the variance of these estimators different from the variance of the scores $P_A$ and $P_B$ ?",2013-07-21 16:29:30.840 +92756,50982,,CC BY-SA 3.0,,"@user815423426 I think the comparison depends on the loss function you have. Diebold and Mariano (2002) have a nice paper studying your question. They proposed some statistical tests comparing the ""generalization"" performance. 
I don't know how to set up a link in comments. The paper is: Diebold, Francis X., and Robert S. Mariano. ""Comparing Predictive Accuracy."" Journal of Business & Economic Statistics 20.1 (2002): 134-144.",2013-07-22 03:24:43.083 +93226,51644,9049.0,CC BY-SA 3.0,,"I guess you want to say ""from model `m3` it is 0.0011713"" instead of `m2`.",2013-07-24 12:57:34.287 +93229,51644,3733.0,CC BY-SA 3.0,,"I am sorry @user11852, yes you are correct, thanks. (BTW, for `m2` it is valid also (which is subject of [another question](http://stats.stackexchange.com/q/65386/5509)).",2013-07-24 13:17:10.070 +94049,2509,2081.0,CC BY-SA 3.0,,A [link](http://stats.stackexchange.com/a/65817/3277) to a geometrical account of PCA vs regression vs canonical correlation.,2013-07-29 20:43:09.140 +94400,43458,,CC BY-SA 3.0,,"@JasonMorgan i suspect that with datapoint, he refers to a single time they measured the sample. Not to the number of observations within a single measuring session.",2013-08-01 10:55:30.050 +103394,57086,1805.0,CC BY-SA 3.0,,"Make a prediction equation for each tree (it will be simple split points), and then average the predictions from each equation? You'll get one monster equation, but it will fully represent the forest.",2013-10-08 18:56:02.367 +95452,52910,668.0,CC BY-SA 3.0,,"It would appear that condition numbers are used here merely as a surrogate for the sample size, so why don't you just focus on sample sizes? I am baffled by two aspects of this question. The first concerns the nature of your asymptotics: what are you doing that assures the condition numbers won't grow without bound? The second is why a ""typical"" condition number in any field would even matter, since there is no general connection between it and the inferences one would like to draw.",2013-08-09 14:29:11.350 +95464,52910,3731.0,CC BY-SA 3.0,,"@whuber Thanks for the comment. (1) My goal is understanding sample size via condition numbers. (2) In a simple 2-level hierarchical model, with indep. groups and equal correlation within groups, then bounding the size of the largest group bounds the condition number (e.g. adding more ""schools"", but not more ""students per school""). (3) From my (limited) understanding, condition numbers are proxies for the expected numerical error. My naive assumption is that different fields will have orders of magnitude differences (e.g. IQ tests v. diameters of tree trunks). Do you believe that to be false?",2013-08-09 15:10:14.397 +95471,52910,3922.0,CC BY-SA 3.0,,"I have seen everything from 10 to 10,000. It might be true that in well designed experiments, the condition numbers will be close to 1. It is definitely true that there is no single ""social science"" number to talk about.",2013-08-09 15:48:35.570 +95475,52910,668.0,CC BY-SA 3.0,,"@StasK I have seen *infinite* values but they didn't matter: they arise when there are collinearities among variables that don't affect the parameters of interest. That's the basis of my concerns about this approach: although it's true that high condition numbers (CN) create numerical instability (a practical issue) and large standard errors for *some* coefficients (a theoretical and practical issue), what matters is whether those inflated SEs are pertinent to the investigation objective. 
I therefore don't see how it would be possible to establish any meaningful kind of ""typical"" CN threshold.",2013-08-09 16:15:14.267 +95488,52910,3922.0,CC BY-SA 3.0,,"@whuber, I agree that there is little in the ways of providing the single best number; I assumed that the OP has weeded out perfect multicollinearities. Also, there may be condition numbers for the raw data, but then you can start adding nonlinear terms (interactions; polynomials, splines, etc.) that would affect the CN actually encountered. For non-linear models like logit, the CNs on the parameter estimates would not be the same as the CN for the regressors. Finally, there are also multilevel models in which information set differ for different parameters, producing really weird CN patterns.",2013-08-09 18:17:17.227 +95510,52910,3731.0,CC BY-SA 3.0,,"@StasK So then the short answer to my question is: No. People do not normally compile these things because they are specific to both the particular model chosen and the particular data onto which the model is fit. To talk of a ""typical"" number is not well posed.",2013-08-09 20:34:34.693 +96075,53264,449.0,CC BY-SA 3.0,,Given that no one else has come up with an alternative I'll mark this correct. However Patrick it's entirely possible this is an (misapplied) $\omega^2$. The $\eta^2$ better suits a typo I think (upside down $\mu$?).,2013-08-14 21:25:20.470 +96227,53384,,CC BY-SA 3.0,,It will be clearer if you show the code that you have done so far.,2013-08-15 23:01:21.997 +96228,53384,20838.0,CC BY-SA 3.0,,"I actually haven't coded anything so far in terms of bootstrapping. The code for my model is pretty complex, I don't thank that would help. As an example, we can assume that the model is a smoothing procedure like a moving average, with the moving window as the only model parameter. I have a series of (synthetic) measurements over time and add an error (not necessarily homoskedastic and normally distributed) to that. I then want to estimate the moving window which comes closest to the underlying ""true"" I know and want to assess uncertainty by bootstrapping my synthetic error. Does that help?",2013-08-15 23:09:27.523 +96230,53384,20838.0,CC BY-SA 3.0,,"Here's some very bad MATLAB-style pseudo code, maybe it helps understand what I'd like to do: http://pastebin.com/yTRahzr5",2013-08-15 23:24:48.637 +96231,53384,,CC BY-SA 3.0,,"Sorry Fred, I don't know Matlab.Please tag as Matlab to get inputs from users.",2013-08-15 23:27:55.787 +96232,53384,20838.0,CC BY-SA 3.0,,"Oh my question really isn't limited to MATLAB (and that isn't really MATLAB code, it's just some pseudo-code based on MATLABs syntax for for-loops and comments that wouldn't work anyway). But I can tag it just in case.",2013-08-15 23:32:52.950 +96268,53404,166.0,CC BY-SA 3.0,,Some questions to guide your thinking... What does a very negative t statistic mean? Is a negative F statistic possible? What does a very low F statistic mean? What does a high F statistic mean?,2013-08-16 06:41:29.950 +96271,53404,,CC BY-SA 3.0,,Why are you under the impression that a one-tailed test has to be an F-Test? To answer your question: The F-Test allows to test a hypothesis with more than one linear combination of parameters.,2013-08-16 07:17:40.957 +96279,53404,19125.0,CC BY-SA 3.0,,Do you want to know why one would use a one-tailed instead of a two-tailed test?,2013-08-16 07:53:59.867 +98029,31587,,CC BY-SA 3.0,,"@Zen Suppose instead of finding state transition matrix for 1 history step I wish to calculate for 2 history step. 
+P(State1|(State1, State2)): Probability of getting state1 given t-1 was state1 and t-2 was state2. How do I calculate these probabilities?",2013-08-30 07:09:58.817 +98046,31587,7007.0,CC BY-SA 3.0,,"If it is a Markov chain, then $P(1\mid 1,2)=P(1\mid 2)$. So, there is nothing new. If it is not a Markov chain, keep a record of the $n^3$ triple of three consecutive transitions, simulate the process and compute the fraction of each triple.",2013-08-30 11:47:56.480 +98500,54637,,CC BY-SA 3.0,,You might want to check out information about p-value meta-analysis. One good starting point: http://en.wikipedia.org/wiki/Fisher%27s_method,2013-09-04 07:00:37.703 +98533,54624,,CC BY-SA 3.0,,"The R commands are either lme() or lmer(). Both are mixed effects models. lmer() is newer and preferred, although lme() seems to work fine for most applications. Pinheiro and Bates - Mixed Effects Models in S and S-Plus covers both the theory and applications of these models in R pretty well.",2013-09-04 12:12:46.180 +98603,54724,17740.0,CC BY-SA 3.0,,Generalization error is far from trivial. Unfortunately rules of thumb don't help much in this regard.,2013-09-04 20:28:43.960 +99036,54915,5237.0,CC BY-SA 3.0,,"I don't think this question needs to be migrated to SO necessarily, b/c the OP wants to know if his code matches the how the test is supposed to work. Ie, the OP is asking about the *ideas* primarily. It's a subtle distinction, & admittedly, someone will have to know both R & the Pettitt test to answer, but I think this is a stats question, not a coding question.",2013-09-08 14:08:47.190 +99055,54915,668.0,CC BY-SA 3.0,,"@gung OK, I won't vote to close, but I must ask Raz_Lobo: please explain how you know your code is giving incorrect output. (Warning to readers: the two pdf links in the question appear to have very long load times.)",2013-09-08 16:22:50.893 +99059,54915,668.0,CC BY-SA 3.0,,"Raz_Lobo, your second link does not load (at least not for me) and the formulas in the first one on pp 5 and 6 are clearly incorrect. For a more accurate account of the Pettitt test (published in a reputable journal), please see Equation 5 in http://www.stat.purdue.edu/~zhanghao/ASS/Reference%20Papers/Temporal%20and%20Spatial%20variability%20of%20annual%20water%20level.pdf.",2013-09-08 16:30:23.080 +99145,54915,21523.0,CC BY-SA 3.0,,"@whuber I see these other equation you appoint early in other paper, but, really, I don't know how to implement it, where is a max in a oscilating function between -1 < x < 1? You can see the equations I implemented in this other paper: ""http://www.homogenisation.org/files/private/WG1/Bibliography/Applications/Applications (P-S)/sahin_etal_2010.pdf"" (I think that this is reputable, only links other because them reflects level of significance equation) In order to check code is giving incorrect output, you can see table IX in the paper I link.",2013-09-09 09:34:46.857 +99165,54915,,CC BY-SA 3.0,anon,"Are you purely interested in implementing the Pettitt test in R or do you have some applications in mind, also? If your aim is to produce an implementation of the test, could you consider getting the original article from JSTOR (http://www.jstor.org/discover/10.2307/2346729?uid=2129&uid=2&uid=70&uid=4&sid=21102620240847) which would probably describe the test in more details. 
If you are more interested in applying the test in some research setting, have you checked the cpm package that contains an implementation of the Mann-Whitney test (similar to Pettitt test) using a simulation approach?",2013-09-09 13:39:28.897 +99286,35249,,CC BY-SA 3.0,,"Slightly off-topic, but there's a [geoscience proposal on area51](http://area51.stackexchange.com/proposals/36296/geoscience) that's currently in the commitment phase. While this is certainly a stats question, the fact that it's so field specific means you might get better help there or pointers to solutions that CV users might not be aware of. So go and sign up! :D",2013-09-10 08:23:13.240 +99288,54915,21523.0,CC BY-SA 3.0,,"@JTT I'm trying to apply Sahin et al metodology to my own data, then yes, I interested in implement Pettitt test and, yes, in R. Finally, a friend I ask, can found original article: [link] ftp://oceane.obs-vlfr.fr/pub/irisson/papers/Pettitt1979-A%20non-parametric%20approach%20to%20the%20change-point%20problem00.pdf I think that, with original article, I'll could implement it. At respect of cpm package, I tryed it but I can't compile it.",2013-09-10 08:42:39.803 +99315,55043,,CC BY-SA 3.0,,Your intuition seems related to “parallel analysis”.,2013-09-10 13:28:20.927 +99441,35249,503.0,CC BY-SA 3.0,,"Have you looked into functional data analysis? I am very far from an expert in that, but the little I know suggests it might be useful here. See e.g. [this book](http://www.textbooks.com/BooksDescription.php?BKN=773525&SBC=ME3&kpid=9780387400808U&network=GoogleShopping&tracking_id=9780387400808U&utm_medium=cpc&utm_term=9780387400808U&utm_source=googleshopping&kenshu=2275d976-eb91-6b88-5311-00004cb31ff0&gclid=CIGl0ceWw7kCFaYDOgodFxIA6A)",2013-09-11 10:52:36.547 +99559,55150,9175.0,CC BY-SA 3.0,,"You said that $E(XE(Y|X))=E(E(XY|X))=E(XY)$. I believe this is wrong. E(Y|X) is a constant. Therefore, $E(XE(Y|X))$ is equal to $E(Y|X)E(X)$. Another point, $E(Y|X)=b0+b1*X$ comes from the simple linear regression model.",2013-09-12 01:36:15.333 +99561,55150,21630.0,CC BY-SA 3.0,,"Let E(Y|X) = b, where b is a constant. Then take expectations of both sides. One finds that E(E(Y|X)) = E(b) = b. By law of iterated expectations, E(E(Y|X)) = E(Y). Therefore, if E(Y|X) is constant, it must be equal to E(Y).",2013-09-12 01:38:27.553 +99578,55150,,CC BY-SA 3.0,,"If E(Y/X)=b, that's implies Y does not depend on X, and E(Y)=b, you are confusing yourself.",2013-09-12 05:39:57.397 +99588,55150,11489.0,CC BY-SA 3.0,,"I don't understand why ""this makes no sense"". You are starting off with a definition of causality that is I think equivalent to definition of independence in statistics. And independent variables have zero covariance, where is the story?",2013-09-12 06:17:54.650 +99589,55150,21630.0,CC BY-SA 3.0,,"January, no, they are not the same thing! X and Y are independent if the joint distribution factors into the product of the marginals, and this is definitely not the same thing. I don't see what your point is? Azeem, aside from restating what I previously said, do you have anything to contribute? Instead of saying I am wrong, can you explain WHY I am wrong?",2013-09-12 06:41:03.020 +99592,55150,16474.0,CC BY-SA 3.0,,"In your set up you ignore the possibility of confounding variables etc., which is an OK first step if you are trying to get your head around a complicated concept like causality. 
In that case, I don't see a problem with the statement that no causal relation between $X$ and $Y$ implies no correlation between $X$ and $Y$. In your world there are only two variables, and they are either related or not. If they are not related then, in your world, there is no other mechanism that could result in a correlation between the two.",2013-09-12 07:41:33.470 +99603,55150,21630.0,CC BY-SA 3.0,,"Thank you Maarten Buis! That helps a lot! However, I am interested in something more mathematically rigorous. Let Z be another variable. Consider $E(Y|X,Z)$. Now, X is causally unrelated to Y iff X does not influence $E(Y|X,Z)$, i.e., $E(Y|X,Z) = E(Y|Z)$ (using the same logic). As before, $E(XY) = E(E(XY|X,Z)) = E(XE(Y|X,Z))$ which is only slightly more complicated. But now the assumption $E(Y|X,Z) = E(Y|Z)$ means that $E(XY) = E(XE(Y|Z))$, which is not, in general, equal to $E(X)E(Y)$. Therefore, in the presence of possible confounding factors, no causation does not imply no correlation.",2013-09-12 10:52:51.493 +99605,55150,21630.0,CC BY-SA 3.0,,I'm still not entirely happy with this argument. What do you think?,2013-09-12 10:55:43.037 +99606,55150,11489.0,CC BY-SA 3.0,,"@Christian, if X and Y are independent, then $E(Y|X)=E(Y)$, this is simple to prove (I'm not sure about the other way, though, meaning that independence is a special case of not having causal relationship as per your definition). Why do you expect that two variables for which $E(Y|X)=E(Y)$ have $cov(X,Y) \ne 0$? What is so unexpected about them having zero covariance?",2013-09-12 10:58:45.620 +99608,55150,11489.0,CC BY-SA 3.0,,"OK, wait a sec. Actually, $E(Y|X)=E(Y)$ iff $cov( X, Y ) = 0$, right? So I think that you just found a fancy name for a non-zero covariance...",2013-09-12 11:36:26.467 +99621,55182,21630.0,CC BY-SA 3.0,,Thank you sincerely. I will have a read of his work and get back to you when I have time.,2013-09-12 13:45:24.580 +99668,55182,5045.0,CC BY-SA 3.0,,"Excellent answer. The [Morgan & Winship book](http://books.google.com/books?id=lFTP49c5wJoC&lpg=PP1&pg=PP1#v=onepage&q&f=false) is quite a bit easier than Pearl, with a focus on social science problems.",2013-09-12 16:53:16.847 +99762,55150,21630.0,CC BY-SA 3.0,,"@January, I don't know about the converse. Can you please provide a mathematical proof that $Cov(X,Y) = 0$ implies $E(Y|X) = E(Y)$?",2013-09-13 11:10:20.757 +99769,55150,11489.0,CC BY-SA 3.0,,"@Christian I think it is straightforward and it feels correct, but I don't have time to work it out right now; do you have a counter-example?",2013-09-13 11:28:23.717 +99868,55150,21630.0,CC BY-SA 3.0,,I (confidently) suspect it is not true.,2013-09-14 09:57:43.040 +100433,55576,,CC BY-SA 3.0,,This is a cross-post http://stackoverflow.com/questions/18871792/generating-random-data-based-on-partial-correlation Please decide whether it belongs here (as I'd say) or on SO and ask a moderator to migrate or perhaps close it.,2013-09-18 12:40:38.503 +100451,55576,668.0,CC BY-SA 3.0,,"Could you please explain precisely what you mean by a ""partial correlation matrix""? Is this a correlation matrix, a correlation matrix with missing entries, or a matrix of partial correlations?",2013-09-18 13:54:16.090 +103395,57086,22034.0,CC BY-SA 3.0,,"Good idea @Zach. But unfortunately I'm trying to avoid anything ""monster.""",2013-10-08 18:57:21.623 +100478,55576,668.0,CC BY-SA 3.0,,"@Irish Thank you. 
Your interpretation might be correct or it might not: it assumes this is a question about time series, even though time series have not been mentioned or tagged. (The value of 168 = 7*24 certainly is suggestive.) I want to hear from the *original poster* concerning his question rather than guesses (no matter how intelligent or well-meaning) from others.",2013-09-18 16:07:36.940 +100527,55617,,CC BY-SA 3.0,,You could convert the ranks into preference scores that come from a normal distribution: http://www.ats.ucla.edu/stat/stata/faq/prank.htm,2013-09-18 20:08:59.427 +100750,55722,,CC BY-SA 3.0,user25658,What level of text are you looking for? I think that Degroot book is aimed more at undergraduate students. A good book for graduate level studies is Statistical Infernece by Casella and Berger.,2013-09-19 22:17:19.853 +100776,55722,668.0,CC BY-SA 3.0,,"This definition of ""self sufficient"" is subjective, because your ability to ""understand the book"" depends on your background.",2013-09-20 01:44:27.580 +100806,55576,21833.0,CC BY-SA 3.0,,@Momo Apologies for cross posting. Will look into that.,2013-09-20 06:11:44.113 +100807,55576,21833.0,CC BY-SA 3.0,,"@whuber What I meant by a partial correlation matrix is a matrix that has partial correlations in it (calculated for any two pairs of entries by partialling out all other pairs. In your words ""a matrix of partial correlations"". Yes, this is regarding time series as you have rightly pointed out. It is on the lines of back calculating a time series (168*12) if I have a pre-defined matrix having partial correlation data.",2013-09-20 06:12:14.773 +100821,55722,,CC BY-SA 3.0,,I'm guessing that there is no book that you will find completely satisfactory.,2013-09-20 09:18:09.120 +100831,55617,21846.0,CC BY-SA 3.0,,"This seems reasonable to me. However, I am a bit confused by their example. They say that ""The z-scores will be normally distributed with mean equal to zero and a standard deviation of one."" but the inverse normal transformation they apply actually results in scores with a standard deviation of 1.486. Am I missing something or is there an error in the example?",2013-09-20 10:37:47.500 +100836,55722,21885.0,CC BY-SA 3.0,,Self sufficient given the knowledge that you have after obtaining a bachelor in mathematics. With regards to the topics Degroot is what I am looking for but I don't like books in which core results (e.g. chi square distribution of the test statistics given the null hypothesis is true for the likelihood ratio test) are not derived. I will have a look at Statistical Inference by Casella and Berger.,2013-09-20 11:25:44.337 +100927,55722,10448.0,CC BY-SA 3.0,,[Here](http://bayesianthink.blogspot.com/2012/12/the-best-books-to-learn-probability.html#.Ujzq9BVx05k) is a good list of books for to learn probability and statistics. There may be German versions for these books but I'm not sure. HTH,2013-09-21 00:41:50.377 +100935,55722,594.0,CC BY-SA 3.0,,"How can a book on probability and statistics ever be *complete*? Even huge multi-volume tomes (Kendall and Stuart's .. etc's *Advanced theory of Statistics* in its latest incarnations, for example, come to thousands of pages if I recall correctly) aren't remotely complete.",2013-09-21 01:36:14.747 +101555,541,,CC BY-SA 3.0,,"Upvoted your comment, but experimentalists are even crazier than I thought if this is syntactic sugar for them! Which version is more intuitive.... 
ANOVA hypothesis test on $\beta$: is the ratio of explained variance to the unexplained variance sufficiently high? T-test on the $\beta$ term of a regression model: is the effect of $\beta$ sufficiently different from zero? And, with the latter formulation you also get the direction of change. And, if you had to transform the data, you can back-transform the parameter estimate into a physically meaningful quantity. Unlike SS.",2013-09-25 20:44:50.790 +101769,56273,503.0,CC BY-SA 3.0,,"Welcome to the site. You can't *prove* this formula. It's a guideline. It can be wrong or right, and whether you need more or less classes than it suggests is (at least in part) a matter of opinion.",2013-09-27 11:57:24.887 +101770,56273,22126.0,CC BY-SA 3.0,,so from where comes this formula?,2013-09-27 12:04:05.967 +101773,56273,503.0,CC BY-SA 3.0,,"I am not sure where that specific formula comes from, but probably someone who had run a lot of histograms thought that it generally gave good results.",2013-09-27 12:18:28.583 +102144,56273,594.0,CC BY-SA 3.0,,see here: http://www.robjhyndman.com/papers/sturges.pdf,2013-09-30 12:52:01.127 +102488,56684,,CC BY-SA 3.0,Willemien,"I Don't understand the problem, what do you want to test?",2013-10-02 22:25:12.987 +102575,56684,,CC BY-SA 3.0,Ben,"@Willemien I want to test difference of in proportions of the two samples. One sample will be ""true"" 1.21% of the time, and the other will be ""true"" 1.33% of the time, for instance. Is this difference significant? My main problem stems from the heavy skew.",2013-10-03 12:41:03.603 +102716,56783,15280.0,CC BY-SA 3.0,,"Thanks for you answer. It helps clarifying my second question. As I tried to convey in the title of the question, my main issue (the first one in the post) was more about the proof mechanism. My main concern is about my understanding of the proof I presented in the question. As I explained, my understanding of the proof leads me to blatantly problematic statement. So I would like to understand were my mistake is as it might reveal some deeper misunderstandings about concepts of expectaction and conditional expectation. Any thoughts about this?",2013-10-04 04:00:08.607 +102740,56783,20473.0,CC BY-SA 3.0,,"I added some explanation on the ""add and subtract"" approach to proof.",2013-10-04 10:27:24.233 +102789,56783,15280.0,CC BY-SA 3.0,,"Took me some time to understand it, but I finally got my basic mistake : true enough $E \Big[ - 2 \big(Y - h(X) \big) \big(h(X) - g(X)\big) + \big(h(X) - g(X)\big)^2\Big] = 0 $ when $g(X) = h(X)$, but by no means does it imply that $h(X)$ minimizes the expression. There is no reason which the bracketed expression could not be lower than zero. Because of the minus sign in front of $\big(Y - h(X) \big) \big(h(X) - g(X)\big)$ one could find some $g(X)$ such that $E \Big[ - 2 \big(Y - h(X) \big) \big(h(X) - g(X)\big) + \big(h(X) - g(X)\big)^2\Big] < 0$.",2013-10-04 16:24:22.190 +102832,56783,20473.0,CC BY-SA 3.0,,Hmmm... the minus sign in the expression you refer to is a mistake - it should be a plus sign. You could of course then rearrange the terms to obtain again a minus sign... does this hurt the intuition you gained?,2013-10-04 21:03:11.077 +102845,56783,15280.0,CC BY-SA 3.0,,"Thanks for keeping up with the question. I edited the initial post to correct for this mistake. Fortunately, I think it does not hurt the gained intuition. 
Actually it helps me understand yet another mistake : I was assuming that the minus sign was important to guarantee that $0$ was not necessarily the minimum of $E[−2(Y−h(X))(h(X)−g(X))+(h(X)−g(X))^2]$. But I realize this is not just about the sign before the 2. (Hopefully) What I really needed to understand is that, in general (i.e. for arbitrary $h(X)$) $E[2(Y−h(X))(h(X)−g(X))]$ needs not be minimized when $g(X)=h(X)$ (right?).",2013-10-04 23:36:23.880 +102846,56783,20473.0,CC BY-SA 3.0,,"Right. Just think of it as any other minimization. Taking the first derivative w.r.t $g$ for this expression you get the necessary condition $-2E(Y-h) = 0$, so it has nothing to do with $g=h$.",2013-10-04 23:53:20.463 +102881,56860,20120.0,CC BY-SA 3.0,,How is it justified to call t or F *scores* (rather than e.g. t-*tests*) inferential statistics?,2013-10-05 13:41:54.423 +104153,57444,22262.0,CC BY-SA 3.0,,@zkurtz I had this in an old version -- looking for solutions other than logistic regression.,2013-10-14 16:36:57.877 +102882,56859,4656.0,CC BY-SA 3.0,,"Descriptive statistics: A coin was tossed ten times and came down heads six times. Statistical inference: The maximum likelihood estimate of the probability of Heads is $0.6$, or, This information is insufficient to reject the hypothesis that the coin is a fair coin.",2013-10-05 13:44:39.663 +102883,56860,155.0,CC BY-SA 3.0,,"@jona The t-score is the ""statistic"" that is used in the t-test, therefore one could describe the t-score as an inferential statistic when used as part of such an inferential process. I guess I have started with the assumption that a statistic is a function of the data. But perhaps you are alluding to the point that we often think of inferential statistics as the broader set of techniques used to do inference?",2013-10-05 13:47:04.867 +102884,56860,20120.0,CC BY-SA 3.0,,"Let me phrase it differently - isn't a t-statistic a description of a sample, rather than an inferential statement (such as a p-value)?",2013-10-05 13:56:31.233 +102885,56860,155.0,CC BY-SA 3.0,,"Well yes, a function of the data is equivalent to a description of a sample. I guess I was thinking that such statistics are used in an inferential process (e.g., researchers relate the t-statistic to a t-distribution to get a p-value and then relate p to alpha to draw an inference). I've often seen textbooks use these examples. But I suppose the p-value and the binary inference itself could be seen as statistics (i.e., functions of the sample data). And the binary inference itself could be seen as most clearly aligned to the inference. Is that what you are getting at?",2013-10-05 14:06:14.737 +102887,56860,20120.0,CC BY-SA 3.0,,"The definition of the p-value (probability of sample given some population) refers to the population (or alternatively, long-run frequencies), so I'd file it under inferential. The definition of *t* is phrased only in reference to the sample, isn't it?",2013-10-05 14:20:57.633 +102891,56860,155.0,CC BY-SA 3.0,,"So for example, you use the data to get to *t* which is related to a distribution, which gives you *p*, which in turn yields a binary inference about a population parameter. So from a frequentist perspective, t, p, and the binary inference are all random variables. All were involved in the inferential process. 
I'm not sure what the pros and cons are of labelling all or only some such statistics as inferential.",2013-10-05 14:36:52.690 +102892,56860,155.0,CC BY-SA 3.0,,"There are also many other ways of doing inference (e.g., bootstrapped confidence intervals, cut-offs on Bayesian posterior densities). So perhaps in those cases the above definitions would need to be tweaked to focus more on the final inference. That said, once I go outside the traditional frequentist test statistics, I tend to think more in terms of inferential procedures rather than needing to clearly distinguish descriptive from inferential statistics.",2013-10-05 14:37:43.160 +102913,56859,21762.0,CC BY-SA 3.0,,"Inference without the concept of ""population"": Assume your data are generated by some (partially) unknown random mechanism/rule. Inferential methods allow to assess properties of this mechanism based on the data. Example: You want to verify an electro-physical formula based on outcomes that can be measured only approximately or under imperfect conditions.",2013-10-05 17:09:41.500 +102914,56859,12683.0,CC BY-SA 3.0,,@Michael: Yes; or indeed *make* your data be generated by a known random mechanism - random assignment of experimental treatments.,2013-10-05 17:23:25.437 +102944,56875,947.0,CC BY-SA 3.0,,"Is it appropriate to focus solely on the 2008-2009 period (learning sample of the model) to develop an econometrics model earmarked for estimating what would happen in adverse economic scenarios? I gather I am simply repeating my question. As is, I think it is clear enough.",2013-10-06 00:28:09.963 +102981,56911,503.0,CC BY-SA 3.0,,"Unless ""observation"" has some order (which it usually does not) then you should not draw a line plot like this. It implies that ""observation"" isn't just a label.",2013-10-06 10:28:52.367 +102991,56911,5237.0,CC BY-SA 3.0,,"How is this different from *outlier detection* (applied to residuals)? If you want this to be done non-visually, what does the *plot* of the residuals have to do with anything?",2013-10-06 14:05:51.327 +102995,56928,16046.0,CC BY-SA 3.0,,"Maybe I should move the question to math exchange, any suggestions?",2013-10-06 15:42:08.587 +102996,56875,947.0,CC BY-SA 3.0,,"You can answer it in a precise way. The answer is either Yes, No, or it depends. The key is to support one's answer most rigorously.",2013-10-06 16:05:01.650 +103009,56911,1506.0,CC BY-SA 3.0,,The plot was just to help with the explanation. The observations are ordered.,2013-10-06 18:35:27.720 +103010,56911,5237.0,CC BY-SA 3.0,,"The plot does help with the explanation. So, I gather the plot is (ultimately) irrelevant and does not have to do with your real issue beyond its communicative role here; this is all well & good. How does what you want differ from outlier detection?",2013-10-06 18:42:30.477 +103013,56928,6162.0,CC BY-SA 3.0,,"I totally agree with your opinion about Gelman et al's book. Try *The Bayesian choice*, by C. Robert.",2013-10-06 20:01:03.637 +103017,56911,2958.0,CC BY-SA 3.0,,Would the calculations behing [Process Control Charts](http://en.wikipedia.org/wiki/Control_chart) help?,2013-10-06 20:34:52.580 +103018,56928,16046.0,CC BY-SA 3.0,,@StéphaneLaurent pretty happy that got the same opinion from somebody else. Will give it a try. Thanks.,2013-10-06 20:36:23.937 +103058,56768,,CC BY-SA 3.0,,"In econometrics, there is a systematic approach called BLP model studying your problem. 
A good reference about BLP model is this paper ""A Practitioner's Guide to Estimation of Random Coefficients Logit Models of Demand"" by Nevo(200).",2013-10-07 03:26:03.317 +103060,56911,1506.0,CC BY-SA 3.0,,@gung It is similar to outlier detection and can certainly be used here. However this does not take into account the independence violation. Plus the standard deviation may be so inflated in some situations that the points do not seem like outliers.,2013-10-07 04:15:32.383 +103061,56928,3183.0,CC BY-SA 3.0,,"For what it's worth, there's a [new edition of Bayesian Data Analysis](http://www.amazon.com/Bayesian-Analysis-Edition-Chapman-Statistical/dp/1439840954) coming out soon. I don't know if it would be better for your taste, but it looks like it will have a lot of other improvements.",2013-10-07 04:24:50.827 +103066,56928,6162.0,CC BY-SA 3.0,,"@DavidJ.Harris This book will always be nubearable for me. Too much text, not enough mathematics. There's more information in one half page of *The Bayesian Choice* than in 5 pages of Gelman et al's book. This book provides some cooking recipes for Bayesian analysis: some recommendations without any theoretical justification. I've also started to read [Gelman & Hill's book](http://www.stat.columbia.edu/~gelman/arm/). Nice to learn R & WinBUGS, but the practical approach is grisly, for example the authors fit Gaussian models to data far, far , far to be Gaussian, without worrying about that.",2013-10-07 06:18:31.290 +103125,56768,4910.0,CC BY-SA 3.0,,"Do you have more information, like a model you are assuming or something? Or are you actually going to use more than one data point? Otherwise, in your case, if the shares at t-1 is A, B, C and A looses say 10 % then B would become B + B / (B+C) * 10 % and C would become C + C / (B + C) * 10 %. Right?",2013-10-07 16:02:51.150 +103397,57086,7155.0,CC BY-SA 3.0,,"Would you mind restating why random forests pose implementation issues for you? They're not particularly intensive unless you have thousands of features. You could prune it down, but it's unlikely you'll have an analytic form that's digestible.",2013-10-08 19:14:17.357 +103173,56768,11490.0,CC BY-SA 3.0,,"Hi Rasmus. I don't have any model at the moment. The only information that I can use is the market share of each company in each year $1, ..., T$. For example in year 1 we might have $A_1 = 0.2$, $B_1 = 0.5$ and $C_1 = 0.3$, which could become $A_2 = 0.4$, $B_2 = 0.1$ and $C_2 = 0.5$. I have this kind of data for T years. What I want to estimate is, for example, given that $A_2 = 0.4$ how much how that market share comes from $A_1$ (""loyal costumers""), $B_1$ and $C_1$ (""stolen costumers"")?",2013-10-07 21:07:15.300 +103178,57015,668.0,CC BY-SA 3.0,,"I have a high degree of belief that exactly one of your hypotheses is true :-). Your data might be able to cast doubt on one of them. But what is ""small""? To some people that means $N\le 2$; to others, it might mean $N\le 10^6$.",2013-10-07 21:34:19.693 +103181,57015,22454.0,CC BY-SA 3.0,,My sample size is $N = 10$.,2013-10-07 21:39:30.730 +103202,57015,594.0,CC BY-SA 3.0,,"Hypotheses are about *populations*, not samples. You can check whether $\bar x < \bar y$ at a glance - no need for p-values or anything.",2013-10-07 23:20:35.337 +103209,57015,503.0,CC BY-SA 3.0,,"Given your small sample and lack of knowledge about the distributions, I'd suggest a permutation test.",2013-10-07 23:53:50.697 +103210,57015,503.0,CC BY-SA 3.0,,"@whuber ""N le 2""? 
:-)",2013-10-07 23:54:21.417 +103221,56955,2121.0,CC BY-SA 3.0,,"You mentioned each survey was already weighted. Are they weighted using similar methods? Or, did the particulars of the surveys necessitate using different weighting methods?",2013-10-08 01:20:35.267 +103225,56955,22423.0,CC BY-SA 3.0,,"hi @Jonathan, sampling method is the same as mentioned in the example, but of course the values of the calculated weights, sample size, and subject particulars are different for the 5 surveys",2013-10-08 01:51:10.383 +103228,57026,5237.0,CC BY-SA 3.0,,"What does ""C"" stand for in this context?",2013-10-08 02:07:03.430 +103229,57026,22458.0,CC BY-SA 3.0,,"Umm, the hinge loss coefficient? I've mostly found it called C, some people use gamma but that confuses it with the radial basis coefficient. Basically the coefficient associated with the slack variables in the objective function.",2013-10-08 02:32:26.233 +103232,57015,668.0,CC BY-SA 3.0,,"@Peter See http://stats.stackexchange.com/a/1836/919 for why $N=1$ can work. $N=2$ is usually needed in order to estimate the variability. In applications where observations are sufficiently expensive, $N \gt 2$ may be considered extremely *large*. (I work in a field where (a) private parties pay for observations which are (b) required by government regulations that (c) are viewed as a burden and, in the worst situations, (d) an observation (actually a monitoring station) can cost \$100K or more. If you want to tell my clients they need a larger $N$, you had better have a *great* reason!)",2013-10-08 03:10:44.680 +103287,57015,21762.0,CC BY-SA 3.0,,"@whuber: Maybe there is an additional binary grouping variable ""condition"" involved which is expected to change the order of the true means(?).",2013-10-08 10:27:23.497 +103306,57053,,CC BY-SA 3.0,,47% of observations are coded as 1?,2013-10-08 13:10:25.643 +103309,57053,10060.0,CC BY-SA 3.0,,"Just a side tip, don't name dichotomous variables this way. Instead of calling it ""gender,"" call it ""male"" or ""female."" That way you'd know what 1 (aka Yes) stands for. If this is ""male,"" then you can easily figure out 0.47 is the fraction of males.",2013-10-08 13:20:54.527 +103337,57015,668.0,CC BY-SA 3.0,,@Michael Thank you! Your interpretation sheds new light on the question.,2013-10-08 14:46:04.770 +103346,57065,9049.0,CC BY-SA 3.0,,"Good question: I can probably argue pro and against the removal of outliers. Why not use medians if you worried about outliers and what you are looking for is just a ""central tendency""? Given that money-related variables often have highly skewed distribution (eg. Pareto) that might not unreasonable in the first place.",2013-10-08 15:14:30.707 +103354,57065,668.0,CC BY-SA 3.0,,"@user11852 Medians tell you little about the mean, which is what is relevant to revenue. 
It would be interesting to see your argument in favor of removing the ""outliers,"" especially when these are likely the major contributors to the total revenue.",2013-10-08 15:33:57.610 +103355,57065,22477.0,CC BY-SA 3.0,,"Unfortunately median would always be zero, as < 10% of users spend at all",2013-10-08 15:35:54.210 +103357,57065,20286.0,CC BY-SA 3.0,,"Also, consider whether in practice you make more profit from the ""lots of small spenders"" or the ""very few big spenders."" If you make your profit from those ""outliers"" you probably don't want to remove them--maybe you want to analyze them separately.",2013-10-08 15:38:51.637 +103359,57012,20286.0,CC BY-SA 3.0,,Your examples all have measurements from one or the other Labs but no samples analyzed by both labs. Is that always the case? The best way to proceed will depend on that.,2013-10-08 15:44:43.340 +103362,57065,9049.0,CC BY-SA 3.0,,"@whuber: Let me stress, it was a comment, not an answer. I definitely not an expert on bootstrapping; I would generally argue that outliers are *legitimate* data, if they are not obviously corrupted observations. Nevertheless if one bootstraps a somewhat small sample that has obvious outliers I would worry that he could end up ""amplifying"" their influences or overlooking sample heterogeneity.",2013-10-08 15:46:52.757 +103364,57065,668.0,CC BY-SA 3.0,,"@user11852 Your general argument that outliers are legitimate is helpful. But, concerning the possibility of amplification, it seems to me that the contrary is true: bootstrapping has a chance of working only if the full sample is used. Otherwise it presents a fairy tale, telling us how things would be if outliers didn't exist--but obviously they do. The larger problem is that bootstrapping has little theoretical justification when applied to small samples: the theory is an *asymptotic* one.",2013-10-08 15:53:41.447 +103368,57065,9049.0,CC BY-SA 3.0,,"@whuber: I agree with what you say. Regarding your median comment: I guess an issue is that I treated revenue as roughly equivalent to income. Usually a debate between using mean or median does arise there; eg. in household income cases. (In retrospect, having just <10% of the user data generating revenue definitely it is not a good assumption.) Also I didn't mean to imply that a median ""is the mean"" or something like that. I specifically mentioned it *as a ""central tendency""* value.",2013-10-08 16:17:03.443 +103370,57065,668.0,CC BY-SA 3.0,,"@user11852 And that's the crux of the matter: central tendency is not terribly meaningful when tracking revenue; only the sum (or equivalently, the mean) is. As far as the less than 10% goes, that depends on the business. Plenty rely on just covering costs with routine transactions and making profits on a small number of very large or high-profit sales. *E.g.*, one model for how an airline could make a profit is from the outsized margins reaped from the very small number of first-class passengers.",2013-10-08 16:24:05.757 +103371,57012,20773.0,CC BY-SA 3.0,,"@EdM yes, that is precisely the problem. I am sorry I was not more clear about this.",2013-10-08 16:24:38.037 +103380,57065,9049.0,CC BY-SA 3.0,,"@whuber: Cool, thank you for the insight on the matter!",2013-10-08 16:59:15.470 +103383,57065,450.0,CC BY-SA 3.0,,This is an important question (+1). Can you add a small sample of your dataset or a simulated sample resembling it to the question? 
I think providing an illustration will be more fruitful in this case.,2013-10-08 17:31:08.450 +103588,57198,1406.0,CC BY-SA 3.0,,Does the wikipedia answer your question: http://en.wikipedia.org/wiki/Linear_regression?,2013-10-10 08:28:21.573 +103402,57086,22034.0,CC BY-SA 3.0,,"@Jacob--The problem is that RF has lots of decision trees. I'd love to report a single formula (< a few lines long if possible) that predicts nearly as accurately as RF. Since I'm publishing my work to an audience of modest statistical sophistication, I think exporting pages upon pages of trees would severely limit the probability of my findings becoming implemented in clinical settings.",2013-10-08 19:38:44.853 +103427,57012,594.0,CC BY-SA 3.0,,Let's assume the sensitivity/detection limit problem was solvable. What is the question of interest? Are you trying to compare means across labs or something?,2013-10-08 23:15:16.167 +103446,57110,668.0,CC BY-SA 3.0,,What units are your dates measured in? The lags in the plots are enormous--they look like seconds. Taking first differences at lags of seven *seconds* won't fix a weekly cycle!,2013-10-09 04:14:57.447 +103454,5015,,CC BY-SA 3.0,,Another possibility is paradoxical confounding: Example 1: http://epm.sagepub.com/content/56/3/430.abstract Example 2: http://optimalprediction.com/files/pdf/V1A19.pdf,2013-10-09 07:06:50.557 +103461,57015,21762.0,CC BY-SA 3.0,,"Okay. To provide help, we would need to know much more about $x$ and $y$. What do they measure? Are they paired or unpaired?",2013-10-09 08:01:54.570 +103480,57137,,CC BY-SA 3.0,,"since you are right with your suspicion, why not answer your question yourself? I means you basically state the answer in the question anyways.",2013-10-09 12:48:05.200 +103485,57012,20773.0,CC BY-SA 3.0,,"Thanks, @Glen_b. Upon looking back, I can see how unclear my question is. The question is how to combine information from Lab A and Lab B such that they are on the same scale. I've gone back to edit my original post, so hopefully that clears things up.",2013-10-09 13:29:54.123 +103488,57126,3922.0,CC BY-SA 3.0,,"Your client should just concentrate on pursuing these wealthy guys to buy/contribute, and this is a PR question, not a statistics question.",2013-10-09 13:53:41.350 +103534,57110,22494.0,CC BY-SA 3.0,,"my data is daily data measured in MWs, I tried first (and second) differences at lags of 7, 14, 365,364,366, but there is still seasonality.",2013-10-09 19:02:16.607 +103547,57167,13037.0,CC BY-SA 3.0,,Which variables do you actually observe/have data on?,2013-10-09 20:47:33.067 +103551,57167,21952.0,CC BY-SA 3.0,,I have observations on all the variables.,2013-10-09 21:07:09.263 +103554,57167,13037.0,CC BY-SA 3.0,,"All? As in you know values of $Y$, $X$s, and $Z$s? A simple approach is just to do linear regression of $Y$ with $Z_1, Z_2, Z_3,$ and $Z_4$.",2013-10-09 21:12:06.590 +103558,57167,21952.0,CC BY-SA 3.0,,But the problem here is with endogeneity. the values of Z's are determined by Y in some way (as explained above). Also changing any value of Z's will have an impact of changing everything else in the equation. so everything is being determined simultaneously.,2013-10-09 21:33:13.723 +103559,57164,594.0,CC BY-SA 3.0,,Why would such a property be of any value in deciding which would be an appropriate model?,2013-10-09 21:57:54.357 +103560,57164,19264.0,CC BY-SA 3.0,,"@Glen_b I'm still a beginner when it comes to statistics so my knowledge is pretty basic. 
Looking at the plots of gamma and lognormal distributions, qualitatively they look very similar. I'm looking for quantitative differences between the two. For instance, what are some examples of physical applications where gamma or lognormal distributions occur?",2013-10-09 22:06:30.377 +103563,57164,594.0,CC BY-SA 3.0,,"In reality, likely neither ever actually occurs; they're extraordinarily simple models which are sometimes useful (if rough) approximations of reality. I will post an answer that discusses some qualitative differences.",2013-10-09 22:31:39.953 +103564,57167,5045.0,CC BY-SA 3.0,,"I don't think this system of equations is identified. It it was, however, one might use 3SLS (Three Stage Least Squares).",2013-10-09 22:38:40.917 +103566,57156,1741.0,CC BY-SA 3.0,,Can you elaborate further? If want to predict the classes you have to use supervised classification.,2013-10-09 23:19:24.733 +103569,57175,594.0,CC BY-SA 3.0,,"""ML"" is not an algorithm but a criterion. MLE doesn't find a minimum, it's the value of (a monotonic function) of the thing you try to optimize. Minimizing $J$ probably\* *is* ML (in that its argmin will correspond to ML. \*(I haven't done more than taken a quick glance at the paper, but you should have at least mentioned you were doing some form of nearest-neighbor modelling here, and better still you should write more details of your model in your question.)",2013-10-10 00:34:40.740 +103571,57175,20320.0,CC BY-SA 3.0,,"I have added the nearest neighbor point, my mistake sorry for that. So, the thing is EM solution for minimizing the objective function.",2013-10-10 00:45:27.693 +103572,57183,13037.0,CC BY-SA 3.0,,Mcnemars is great and all...but why not just use logistic regression?,2013-10-10 00:53:41.957 +103576,57186,12544.0,CC BY-SA 3.0,,"Not sure what you mean by a difference of 20%. There are three schools, so there are two differences (because once you know the first two, you know the third). In addition, power in this analysis depends on the proportions. You have more power if the proportion is around 50% than if it is around 1%.",2013-10-10 03:52:55.920 +103577,57186,12544.0,CC BY-SA 3.0,,"Can you give an example of the effect you expect, e.g. 20%, 30%, 50%. Also, how do you plan to analyze?",2013-10-10 03:53:59.877 +103578,57186,22542.0,CC BY-SA 3.0,,"Hello - thank you for your response. I will try to clarify. I will be asking students at each school if, for example, there was another school they could have attended, and the using chi2 tests to compare proportions at each. Within each school, I will be using logistic regression to look at association between factors examined (e.g. option for other school) and school of choice. I do not know what effect to expect.",2013-10-10 04:06:05.980 +103579,57186,22542.0,CC BY-SA 3.0,,"In my calculations above, I simply assumed that 50% of population could have chosen a different school. I had been told,I thought, that this would give a good sample size where such proportions are unknown?",2013-10-10 04:15:12.213 +103580,57189,5237.0,CC BY-SA 3.0,,"What would happen if $X$ were a *standard normal*? (Ie, $X\sim\mathcal N(0,1)$.) What would happen then?",2013-10-10 04:29:46.530 +103581,57198,594.0,CC BY-SA 3.0,,"""*is it Gaussian noise or random error*"" -- yes, that random error term is usually taken to be Gaussian noise, though if you're only estimating the coefficients (rather than computing intervals or doing hypothesis tests) it doesn't have to be Gaussian. 
If it's Gaussian then LS is optimal in several different senses at once. If it's not Gaussian then you still have that it's best linear unbiased.",2013-10-10 06:43:58.463 +103582,57198,22548.0,CC BY-SA 3.0,,"Thanks a lot for reply :). the model is going to test hypotheses that are made before, then it's Gaussian noise. would you please give some reasons why we add this term to the model?",2013-10-10 06:56:00.780 +103584,57195,10450.0,CC BY-SA 3.0,,"[Describing Temporal Correlation Spatially in a Visual Analytics Environment,"" Abish Malik et al.][1] + + + [1]: http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=2&cad=rja&ved=0CEkQFjAB&url=http://www.purdue.edu/discoverypark/vaccine/assets/pdfs/publications/pdf/Describing%2520Temporal%2520Correlation%2520Spatially.pdf&ei=elhWUsriM8eZiAKh7oDYCg&usg=AFQjCNHAL8nhgTwNQTfWEoy6VNdgO4mbdQ&bvm=bv.53899372,d.cGE",2013-10-10 07:35:51.723 +103586,57198,594.0,CC BY-SA 3.0,,Because the observations don't actually lie on a line/plane/hyperplane.,2013-10-10 07:50:41.407 +104672,57730,13037.0,CC BY-SA 3.0,,"I thought she said speaker was discrete (1,2,or 3)",2013-10-17 20:13:26.633 +103591,57198,22548.0,CC BY-SA 3.0,,"this is wikipedia's definition: ε is called the error term, disturbance term, or noise. This variable captures all other factors which influence the dependent variable yi other than the regressors xi. The relationship between the error term and the regressors, for example whether they are correlated, is a crucial step in formulating a linear regression model, as it will determine the method to use for estimation. I'm writing a research paper, is this definition good enough to bring in research paper?",2013-10-10 08:59:00.677 +103592,57200,15563.0,CC BY-SA 3.0,,"Yes, that makes sense. I guess I did not separate the Train data into two sets. Thank you.",2013-10-10 09:01:14.910 +103593,57185,15827.0,CC BY-SA 3.0,,"Your question title is about measurement error; your final question is about bias. The two are not the same. You also ask simultaneously: what is the difference, and is there a difference? That said, here goes. The central idea of bias is that a procedure is wrong on average, yielding a mean that is higher or lower than the true or correct value, known somehow. That could be true of a measurement procedure, but it can apply where measurement error is not the question. That's what I understand from statistics; psychometrics may have a different notion, but I doubt it.",2013-10-10 09:02:10.390 +103594,57164,633.0,CC BY-SA 3.0,,"@glen_b: the reason is that if you're measuring only those statistics then the minimum assumptive distribution is uniquely the exponential family distribution with those sufficient statistics. Whereas any distribution might be a poor model of reality, if one is not free to choose which measurements are taken, then this is an excellent way of choosing a model.",2013-10-10 09:04:18.900 +103595,57164,6162.0,CC BY-SA 3.0,,@Glen_b I guess the lognormal distribution should appear in some physical situations because of the CLT.,2013-10-10 09:06:39.173 +103596,57186,12544.0,CC BY-SA 3.0,,So you're planning to do three separate logistic regressions? (Chi-square tests on contingency tables and logistic regression give the same answer). I'm still not sure what you are going to be testing (but maybe that's me). 
Can you give an example?,2013-10-10 09:12:21.053 +103598,57186,21762.0,CC BY-SA 3.0,,How are you dealing with multiple testing?,2013-10-10 09:39:07.073 +103599,57203,6162.0,CC BY-SA 3.0,,"No need of ""many"" exponential variates to be Gamma.",2013-10-10 09:40:37.467 +103601,57186,503.0,CC BY-SA 3.0,,What software do you have access to?,2013-10-10 10:08:37.793 +103602,57205,,CC BY-SA 3.0,,"Welcome @Fabio: Could you please give more explanation to what you mean with ""not touch the data""? Do you mean not to remove anything? It would also help to introduce the content you linked to and explain why this helps the OP.",2013-10-10 10:22:03.277 +103604,57212,503.0,CC BY-SA 3.0,,Welcome to the site. What sort of model did you develop?,2013-10-10 11:00:26.270 +103605,57212,22190.0,CC BY-SA 3.0,,I am developing a logistic model.,2013-10-10 11:01:25.997 +103606,57210,21762.0,CC BY-SA 3.0,,Is this about PCA or FA?,2013-10-10 11:11:28.233 +103608,57214,13889.0,CC BY-SA 3.0,,Thanks for the answer but I'm afraid the AUC statistics are expected to vary depending on the benchmark. Sorry I didn't make that clear.,2013-10-10 12:19:53.817 +103609,57218,21398.0,CC BY-SA 3.0,,I mean running my analysis with the missing value dataset.,2013-10-10 12:48:21.707 +103611,57195,750.0,CC BY-SA 3.0,,"Your approach sounds very similar to estimating the [variogram](http://en.wikipedia.org/wiki/Variogram) - which suggests a non-map based scatterplot of the semi-variogram. If you can settle on a spatial weights matrix, you could also plot a Moran Scatterplot, which is $y$ on the y-axis and $W\cdot y$ on the x-axis.",2013-10-10 13:33:26.207 +103612,57216,674.0,CC BY-SA 3.0,,"Could you provide a reference for the above screenshot? It is good practice to properly cite the work of others. By the way, this should be amended to your previous answer, not posted as a new one.",2013-10-10 13:34:37.753 +103613,57221,21762.0,CC BY-SA 3.0,,What is $x$? Is it an additional independent observation? Why do you think $(x-\text{true mean})/s$ would follow a $t$?,2013-10-10 14:01:30.037 +103614,57221,503.0,CC BY-SA 3.0,,Do you mean to ask what distribution it would follow *if* something about $x$ was true? (i.e. some null hypothesis).,2013-10-10 14:12:00.540 +103616,57221,594.0,CC BY-SA 3.0,,"If you had the true mean there, why would that be distributed as a $t$?",2013-10-10 14:19:59.793 +103618,5015,5237.0,CC BY-SA 3.0,,"It is not clear how this addresses the OP's question (especially w/o any discussion), & in general, Simpson's paradox isn't the best way to think about the relationship b/t an interaction & its composite variables.",2013-10-10 14:44:43.233 +103619,57226,674.0,CC BY-SA 3.0,,"Please, register your original account and merge it with this one; follow the instructions on our [Help Center](http://stats.stackexchange.com/help/merging-accounts). Next, I would advise to merge your three different replies into one single answer.",2013-10-10 15:04:10.173 +103620,57160,2164.0,CC BY-SA 3.0,,"David, Post your data to dropbox.com so we can take a look. I'm not sure why you a reaching for logs. Transformations (like drugs) can have nasty side effects that are masking the real issue (like stress :) ). Instead of a log, you might need to consider other things like level shifts, changes in parameters, etc.",2013-10-10 15:23:25.743 +103625,57230,668.0,CC BY-SA 3.0,,"It is difficult to understand this description. Could you perhaps post a simple example of your dataset? 
And please clarify what you mean by ""represent"": does that mean a logical file format, a graphical visualization, a statistical summary, or perhaps something else? What is the intended purpose of this representation?",2013-10-10 15:33:58.047 +103627,57230,22569.0,CC BY-SA 3.0,,"Hey, updated the question",2013-10-10 15:39:42.840 +103628,57223,346.0,CC BY-SA 3.0,,Can you post the data via `dput()`?,2013-10-10 15:50:32.060 +103629,57223,22564.0,CC BY-SA 3.0,,"@Henrik. I don't have the simulation data used to generate that figure any longer. Do you want an example of the simulation result or the actual data? Also, is there some advantage to using dput() rather than simply posting a table?",2013-10-10 15:59:03.010 +103630,57233,22567.0,CC BY-SA 3.0,,"I do have a common intercept $\beta_0$, but I have a number of factors in the model. I was just showing one factor in the example. Is it possible to suppress the intercept with more than one factor? I thought that you could do it for one, but then it broke down with multiple. That would fix my problem, as it would allocate the overall intercept to each factor.",2013-10-10 16:02:07.313 +103631,57228,668.0,CC BY-SA 3.0,,"There are several people on [gis.se] actively using `R` for their work: consider [searching the site](http://gis.stackexchange.com/questions/tagged/r?sort=votes). Certainly you will want to take a look at [""How to make beautiful maps in R""](http://gis.stackexchange.com/questions/48828).",2013-10-10 16:06:41.993 +103632,52871,,CC BY-SA 3.0,,"any chance you can provide a slimmed down data set, otherwise it is difficult for one to replicate your error and help you debug?",2013-10-10 16:07:34.920 +103661,57193,1717.0,CC BY-SA 3.0,,"If you can't solve the ML because the derivatives are too complicated, then EM is a good alternative. If $u$'s are observations and follow a gaussian distribution, then you need to include latent variables in that distribution. For example, what is the probability of a signal $u_{t}$ given some other variable $z_{t}$ provided you can calculate $p(z_{t}|\theta)$. If you can do that, follow the procedure I described and it will give you an estimate of $\theta$.",2013-10-10 19:12:37.063 +103633,57228,5237.0,CC BY-SA 3.0,,"Welcome to the site, @mikeLdub. This question seems to be *only* about how to do this in R. As such, it may be off-topic for CV (see our [help page](http://stats.stackexchange.com/help)); but could be on-topic on [Stack Overflow](http://stackoverflow.com/). If you have a question about the substantive statistical / visualization issues, please edit to clarify; if not we will migrate it for you (*please don't cross-post, though*). If it does go to SO, it will need a [reproducible example](http://stackoverflow.com/questions/5963269/) to be on-topic there; can you add a `dput()` of your data?",2013-10-10 16:08:33.253 +103634,57228,5237.0,CC BY-SA 3.0,,This question appears to be off-topic because it is only about how to use R.,2013-10-10 16:09:37.697 +103635,57230,668.0,CC BY-SA 3.0,,"Thanks. But what are the data? The quiz answers? Scores and subscores? Counts of questions attempted? I notice your edit does not address the last two questions I asked in my previous comment--please clarify those points, too.",2013-10-10 16:12:30.330 +103636,57233,5237.0,CC BY-SA 3.0,,"You can use your entire regression equation, ie w/ the intercept & the other factors & continuous covariates, etc. 
The only difference is that you are multiplying the parameters related to the unknown factor by the marginal probabilities that the observation is each possible level of the unknown factor.",2013-10-10 16:14:00.057 +103637,57236,594.0,CC BY-SA 3.0,,"so if you simply choose the right definitions ('death' in this case is 'of immobility') ... it would seem to be very sensible; any change of state works really, it doesn't have to be death. The only thing that worries me is that while people don't generally recover from death after a few months (once you have entered the state for more than a few minutes, you remain there) ... but people can potentially become non-mobile again. If that can't/won't happen in your circumstances, there should be no problem; if it can, then you might look at models where people can move between states over time.",2013-10-10 16:19:03.420 +103638,57236,10060.0,CC BY-SA 3.0,,"This is more about how you define an ""event."" It's called ""survival"" but the event does not have to be lacking ability to survive. Turning the idea around, you can use 1 to represent ""revival.""",2013-10-10 16:19:53.440 +103639,57236,22572.0,CC BY-SA 3.0,,"@Glen_b, thats exactly my problem. during recovery you can certainly move back and forth between states. I'm envisioning something like this: [link](http://imgur.com/1KkvScI) but with the ability to have increasing/decreasing mobility, and the ""event"" being reaching your baseline",2013-10-10 16:25:28.857 +103640,57128,1693.0,CC BY-SA 3.0,,"I'm curious - how, mechanically, would you go about combining the 1000 sets of results using spss?",2013-10-10 16:29:08.773 +103641,57236,594.0,CC BY-SA 3.0,,"There are already models for sickness and disability of varying levels of sophistication, often based on Markov chains (a small number of states, often continuous time), or related to them. It's quite possible to have several different levels of mobility. [e.g.](http://www.google.com/search?q=markov+chain+sickness|disability+models)",2013-10-10 16:34:31.747 +103643,57237,668.0,CC BY-SA 3.0,,"An interesting question. However, the second ""obviously"" (about not respecting the marginal distribution) is not at all clear to me. Why is it obvious? The distribution of $(v,a)$, as reflected by your ""two-dimensional histogram,"" depends on how you have sampled these variables; I wonder whether this might explain possible differences. What kind of data are represented by this histogram and how exactly do you ""draw values"" from it?",2013-10-10 16:44:04.057 +103646,57156,19822.0,CC BY-SA 3.0,,"Let's just say that i have two datasets with same number of variables and samples. the first data set contain the class information(A and B) while second dataset does not have any class info. Using RF code, first dataset was classified in to the two classes with very good accuracy. The run parameters given above are for that particular classification where the class info is required in the dataset. Now my question is how to classify the second datset in to two class?",2013-10-10 16:54:34.700 +103647,57230,22569.0,CC BY-SA 3.0,,Count of questions attempted.,2013-10-10 16:57:16.037 +103648,57241,8414.0,CC BY-SA 3.0,,"gung, thanks for your answer. I suppose my question might be a little ambiguous. What I want is not a relationship between x and y in model 3 (which is what you've done), but in model 1 (Y = b11 + b12 * X + e1). 
I have clarified my question to this effect.",2013-10-10 17:18:27.297 +103649,57223,346.0,CC BY-SA 3.0,,@Flask I am interested in the actual data. And the `dput()` of the data makes it the easiest to read it into R.,2013-10-10 17:31:04.020 +103651,57241,8414.0,CC BY-SA 3.0,,Thanks for the edit. Is it possible to directly specify the size of the population effect for coefficient b12?,2013-10-10 17:33:19.510 +103653,57223,346.0,CC BY-SA 3.0,,"Difference between which two groups actually interests you (given the original question, I expect you are only interested in two groups)?",2013-10-10 17:39:51.300 +103655,57241,5237.0,CC BY-SA 3.0,,"Your question at this point is what would be: what is the population correlation between $x$ & $y$ in general. I wonder if that might be best asked as a new question, as I'm not sure off the top of my head. In the simplest case, where all 3 variables ($x$, $med$, $y$) are normally distributed, & the relationship b/t $x$ & $y$ is *fully* mediated, then $\rho_{x,y} = \rho_{x,med}*\rho_{med,y}$. However, it's more complex if the distributions aren't normal (eg, your $x$ is equal frequencies of $-.5$ & $+.5$), or w/ more complex mediational situations.",2013-10-10 17:47:37.873 +103656,57223,22564.0,CC BY-SA 3.0,,"@Henrik. I am interested in all comparisons. The example of two groups was just a simplification. As noted in the question I am not interested only in this specific data. There is data in publications that was generated via the same process that I would like to judge the reliability of given they performed t-tests. Actually, that is also a simplification. What has actually been done previously varies including two-way anova, one-way anova followed by newman-keuls, ""SAS glm"". I am most interested in the accuracy of the newman-keuls method.",2013-10-10 17:55:01.463 +103657,57193,20320.0,CC BY-SA 3.0,,Thank you for a good general introduction. My question was I have observations u whose distribution is gaussian. I need to minimize/optimize using ML and EM.In my case u's are higher dimensional signal which has the parameters embedded in it.I need to formulate u's so that I can apply EM.Assuming u's follow a Gaussian distribution & u is a 2*N dimensional signal where N represents no.of samples.How do I formulate since I cannot do a derivative of u's?Is it possbile to find the parameters assuming gaussian distribution and unknown signal generating model?,2013-10-10 18:04:35.810 +103658,57223,6162.0,CC BY-SA 3.0,,"Have you tried to fit a Gaussian model with a data transformation ? Such as `gls(f(Value) ~ Group, data=dat, na.action=na.omit, + correlation=corSymm(form= ~ 1 | Subject), + weights=varIdent(form = ~1 | Group))` (with `nlme` package)",2013-10-10 18:41:07.590 +103659,57225,21398.0,CC BY-SA 3.0,,Thanks. I'm doing a multilevel logistic analysis with 5 imputed datasets and I'm going to combine them manually in a pooled version,2013-10-10 18:49:04.077 +103689,57223,346.0,CC BY-SA 3.0,,"@StéphaneLaurent I see, makes sense. And the residuals are relatively similar to the ones reported below. 
But only if you use `lme` they are really identical.",2013-10-10 21:06:23.107 +103690,57193,20320.0,CC BY-SA 3.0,,Is there any implementation for this kind of thing or do I follow the matlab link for implementation?,2013-10-10 21:07:18.417 +103662,57244,674.0,CC BY-SA 3.0,,"Some relevant threads: [How to convert molecular categorical variables to dummy variables for cluster analysis?](http://stats.stackexchange.com/q/22210/930), [What is the difference between independence.test in R and Cochrane and Armitage trend test?](http://stats.stackexchange.com/q/8774/930) This [response of mine](http://stats.stackexchange.com/a/9394/930) has some references [1,2,4] that might be useful, but [Introduction to Genetic Association Studies](http://cshprotocols.cshlp.org/content/2012/3/pdb.top068163.full), by Cathryn M. Lewis and Jo Knight, is probably more recent.",2013-10-10 19:26:46.907 +103663,57242,668.0,CC BY-SA 3.0,,"Whether it ""makes sense"" or not may depend on the costs and constraints on your data collection process. Conceivably, it is easy to collect lots of data along individual lines but expensive to set up each line: such a circumstance would suggest an approach like this one. What is optimal, though, depends on details of the costs and the specific constraints. Could you perhaps share this kind of information with us or, more generally, explain why you are contemplating such an approach?",2013-10-10 19:34:54.857 +103664,57245,22564.0,CC BY-SA 3.0,,Thank you for your response. I have run the code and duplicated your results. I will need to examine the code of these functions and run some simulations to understand what is occurring. I added some further questions in the original post.,2013-10-10 19:51:36.890 +103666,57248,,CC BY-SA 3.0,,"It should be the other way around. If you're assuming a gene-dosage effect you have only one parameter and it's a one degree of freedom test. If you dpn't assume the gene-dosage effect, you have 2 parameters and if you want to test them jointly it's a 2 dof test.",2013-10-10 19:55:23.267 +103667,57193,20320.0,CC BY-SA 3.0,,Is the process similar to EM of gaussian mixture models and its implementation http://www.mathworks.com/matlabcentral/fileexchange/26184-em-algorithm-for-gaussian-mixture-model. In my case what is z_t?After I find out the distribution how do I find the probability?By this time u must have know that I am really weak in this area.From your procedure which steps should I follow to minimize u_t using EM?It will be really nice if you can edit your answer adding how to apply EM on u's such that u's can be minimized.,2013-10-10 19:58:24.427 +103668,57245,6162.0,CC BY-SA 3.0,,"I'm wondering why `gls(f(Value) ~ Group, data=dat, na.action=na.omit, correlation=corSymm(form= ~ 1 | Subject))` provides quite different residuals. Isn't it the same model ?",2013-10-10 20:02:43.097 +103670,57223,22564.0,CC BY-SA 3.0,,"@Stéphane. I have done this using the sqrt transformation as suggested by Henrik, but do not understand the output.",2013-10-10 20:14:14.830 +103671,57253,668.0,CC BY-SA 3.0,,"Welcome to our site, Jen. You would open your question up to many more (knowledgeable) people by explaining what these parameters 'lambda' and 'lambda2' mean: many of us don't want to go to the trouble of looking up the documentation for your package in order to find out (and that's something that surely you have already done).",2013-10-10 20:18:27.730 +103672,57252,12683.0,CC BY-SA 3.0,,Welcome to Cross Validated! 
Please have a look at the possible duplicate & if you're still in doubt edit your question to explain how.,2013-10-10 20:19:45.630 +103673,57193,1717.0,CC BY-SA 3.0,,"This process is exactly the same as in mixture models. $z_{t}$ depends on your problem. As I said, in mixture models $z_{t}$ are the mixture coefficients, so it's quite easy to define a joint distribution $p(X,Z|\theta)$. Obviously, I don't know the dependencies of your observations $u_{t}$, that is something you should know, but basically the idea is to find such joint distribution. Furthermore, in EM you don't minimize $u$. Instead, maximize the likelihood of $p(U|\theta)$ using $Q$ as a proxy. I recommend you to read chapter 9 of Bishop's book.",2013-10-10 20:20:37.147 +103674,57252,2857.0,CC BY-SA 3.0,,"""standard errors and p-values of interactions of standardized models are not reliable""? That is not true as far as I know. SE and p-values are equivalent between standardized and un-standardized models.",2013-10-10 20:25:15.777 +103675,57248,10278.0,CC BY-SA 3.0,,@andrea you are right I will correct my answer.,2013-10-10 20:37:27.703 +103676,57245,346.0,CC BY-SA 3.0,,"@StéphaneLaurent I think the same model can only be obtained with `lme` as you need to take the nested structure of the data into account. The following model gives the same F and p value for the effect of `Group` and the same residuals: `lme(Value ~ Group, random = ~ 1|Subject, dat)`",2013-10-10 20:37:55.473 +103677,57223,346.0,CC BY-SA 3.0,,"Taken from `?aov`: ""`aov` is designed for balanced designs"". So your second edit doesn't provide a reasonable model I guess.",2013-10-10 20:39:40.670 +103678,57245,6162.0,CC BY-SA 3.0,,"Sorry, the equivalent `gls` model is `gls(f(Value) ~ Group, data=dat, na.action=na.omit, correlation=corCompSymm(form= ~ 1 | Subject))` (exchangeable repeated measures). But it also provides different residuals.",2013-10-10 20:41:26.763 +103679,57223,6162.0,CC BY-SA 3.0,,"Flask, see also the comments below @Henrik's answer. The advantage of `gls` is that one can specifiy different variances per group with the `weights` argument. But one inconvenient is that Kenwards-Rogers degrees of freedom provided by the `pbkrtest` package are not available for `gls` models.",2013-10-10 20:44:52.277 +103680,57245,346.0,CC BY-SA 3.0,,"@StéphaneLaurent Nah, I don't think so. Just look at the dfs, the denominator dfs are 105. `gls` cannot handle the nested structure (i.e., replicates for participants).",2013-10-10 20:46:49.893 +103681,57223,346.0,CC BY-SA 3.0,,"@StéphaneLaurent If you want variance weights and nested structure (i.e., replicates per participants) you can simply use `lme`instead of `gls`. Or what sepaks against `lme`?",2013-10-10 20:48:17.403 +103682,57245,6162.0,CC BY-SA 3.0,,"Yes, degrees of freedom given in the output are particular, but estimates (fixed effects and variances) are the same.",2013-10-10 20:50:38.863 +103684,57193,20320.0,CC BY-SA 3.0,,"So for my case it will theta=arg max E [log p(U|theta)] ?This means the probability of u's given theta, but I do not know theta as I need to estimate theta ! Also, will this formulation give me multiple parameters theta_1, theta_2 etc as my model has multiple parameters to be estimated.",2013-10-10 20:51:15.413 +103685,57223,6162.0,CC BY-SA 3.0,,I have nothing against `lme`. I'm just trying with `gls`. 
Considering the subject as a random effect is equivalent to consider an exchangeable correlation structure.,2013-10-10 20:56:23.140 +103686,57193,1717.0,CC BY-SA 3.0,,"In your case, $E[\log p(U,Z|\theta)]$. Remember that this is a recursive procedure, so you start by guessing $\theta$. After several iterations, the estimate for $\theta$ will converge to the ML estimate. As for your second question, yes. In the same way $X$ represents the set of observations, $\theta$ represents a set of parameters, although in this case I assumed there was only one parameter to be estimated.",2013-10-10 20:56:46.410 +103687,57215,22548.0,CC BY-SA 3.0,,"Thanks a lot for your answer. it is really good. and another question is: in my paper, I introduced some hypotheses and a regression model to determine the effect on each independent variable on dependent variable, I tested hypotheses by running multiple linear regression in SPSS software. in this case, do I need to specify what the noise should be?",2013-10-10 21:02:52.770 +103688,57251,5448.0,CC BY-SA 3.0,,"That doesn't look like a convergence theorem to me... convergence theorems generally say something like ""as $t \to \infty$, some function of $t$ (e.g., a probability distribution) approaches some other function"".",2013-10-10 21:02:54.867 +103691,57254,668.0,CC BY-SA 3.0,,"You need to know (or assume) more about what's going on. In some cases, the units within a packet might tend to have very similar weights; in other cases, they might often be wildly different. Would it be possible to destroy a small representative sample of the packets in order to assess this?",2013-10-10 21:11:16.907 +103692,57215,449.0,CC BY-SA 3.0,,"Generally, no, but I don't know nearly enough details about your paper. The shape of the noise is an assumption of your regression model and usually not something you need to specify in the paper.",2013-10-10 21:12:16.147 +103693,57193,20320.0,CC BY-SA 3.0,,let us [continue this discussion in chat](http://chat.stackexchange.com/rooms/11008/discussion-between-srishti-m-and-robert-smith),2013-10-10 21:12:40.647 +103694,57193,20320.0,CC BY-SA 3.0,,"Lastly, in some papers I have read that if the objective is to minimize then I take the negative of the log likelihood, apply kalman filters. Is this correct? Do I need to apply kalman filter? Or as u said maximizing the probability yields the parameters. thank you so much for continued help",2013-10-10 21:13:18.730 +103695,57206,668.0,CC BY-SA 3.0,,"A lot of this material can work well for large datasets and I agree with your initial assessment that normality testing can be limited or questionable with small datasets. But given the *huge* variability of skewness and kurtosis, it would seem that any effort to identify the type of underlying distribution based on these statistics would be even more questionable and less certain. Consequently, wouldn't this approach be (at best) misleading even as a preliminary check?",2013-10-10 21:16:55.807 +103697,57251,594.0,CC BY-SA 3.0,,"Just because you have a Gibbs sampler doesn't guarantee convergence (it's quite easy to construct a situation where sampling the full conditionals won't converge). Convergence of a particular implementation of Gibbs sampling (that is, with a particular model), and for MCMC implementations in general, is shown by establishing that the sampling scheme satisfies the conditions for convergence of a Markov Chain (usually fairly easy). 
If you look at what conditions have to apply for a Markov Chain to converge to its stationary distribution, you can see what you need to hold.",2013-10-10 21:29:23.927 +103698,57251,22578.0,CC BY-SA 3.0,,I added the quotation from the book I am using,2013-10-10 21:36:44.770 +103700,57258,22564.0,CC BY-SA 3.0,,"Do you know why this output is different in R 2.14.2 vs 3.0.1? It also does not say the ""['lmerMod'] in line 4 of your first code panel.",2013-10-10 21:58:39.493 +103701,57260,5237.0,CC BY-SA 3.0,,"At present, this is more of a comment than an answer. Would you mind expanding it a little bit to make it more answer-ish?",2013-10-10 21:58:50.500 +103702,57258,6162.0,CC BY-SA 3.0,,@Flask Probably an update of the `lme4` package.,2013-10-10 22:01:39.113 +103706,57259,22564.0,CC BY-SA 3.0,,"So in this case, if I had **not transformed** using square root I get pvalues of .9124, .0099, and .0046 respectively. Also, I still do not understand how it is possible to ignore the within-subject variance when comparing group means. The confidence intervals for the means are the same as well?",2013-10-10 22:14:58.587 +103708,57223,6162.0,CC BY-SA 3.0,,"As said in my second answer, taking the subject means is a correct approach. The group means are sufficient statistics if you don't want to estimate variance components.",2013-10-10 22:19:47.927 +103709,57259,6162.0,CC BY-SA 3.0,,"Yes, as long as you are interested in means only, you don't loose information by only looking at the subjects means. You don't ignore within-variance, you only ignore the decomposition of variance.",2013-10-10 22:23:44.257 +103710,57259,6162.0,CC BY-SA 3.0,,"@Flask A colleague of mine, which is not mathematician but which has a very strong intuition in statistics, would say that the subject is the ""unit of observation"", and then only his mean value plays a role.",2013-10-10 22:25:21.123 +103711,57259,22564.0,CC BY-SA 3.0,,mmm..This does not meet my intuition. If I am less sure about the individual means it should decrease my confidence in the estimate for group means. [see second to last post in this thread](http://www.physicsforums.com/showthread.php?t=608932&page=2),2013-10-10 22:35:27.267 +103712,57259,6162.0,CC BY-SA 3.0,,It took me one year of practice to understand :) I'm going to bed now.,2013-10-10 22:41:50.287 +103714,57259,22564.0,CC BY-SA 3.0,,"I am sure you are correct, but if that is true then this procedure must be answering the wrong question for my purposes. If I have no idea what is the cause of this large within-subject variance (subgroups amongst the measurements, etc) this must cause me to doubt the ""external validity"" of the conclusions due to the significance test more so than if the measurements were precise and consistent.",2013-10-10 22:52:46.453 +103715,52871,18040.0,CC BY-SA 3.0,,"Just as an FYI you shouldn't use ""t"" as a variable name since it's the name of an internal R function ?t.",2013-10-10 22:58:01.883 +103717,57263,503.0,CC BY-SA 3.0,,What's the treatment? (I don't see any mention of one).,2013-10-10 23:10:28.477 +103718,57261,5448.0,CC BY-SA 3.0,,"If $x_i = 0$, then $\log(p_i^{x_i}) = 0$. I'd deal with it by dropping the corresponding $i$s from your model, setting the corresponding $\hat{p_i} = 0$ (the MLE), and fitting on the rest of the data. (Then of course adding back in the elements you removed for fitting purposes.)",2013-10-10 23:18:13.720 +103719,57261,16703.0,CC BY-SA 3.0,,"@jbowman I'm not sure if I understand what you mean precisely, could you elaborate? 
Do you mean, in this example, dropping $x_1$ for both $sample_1$ and $sample_2$, or only for $sample_1$? In the former case, I have many samples and for almost all $i$ at least one $p_i^{x_i}$ is 0, so then I don't have many left over. If it's the latter case I don't see how that would be done, but I'll know where to look.",2013-10-10 23:26:26.627 +103720,57261,5448.0,CC BY-SA 3.0,,"Hmmm, I may have misunderstood the problem. Are you assuming the $i^{th}$ element of $p$ has the same value across all the samples? If so, it seems to me you can just add up the $x_i$ across your samples and use that total instead of doing a sample-by-sample calculation.",2013-10-10 23:30:04.633 +103721,57230,10060.0,CC BY-SA 3.0,,"Are you thinking about a graph like a report card for each student? Or are you thinking of visualizing a lot of students on a graph? If it's the latter, how many?",2013-10-10 23:35:56.483 +103722,57268,3183.0,CC BY-SA 3.0,,I don't understand what you mean about telling the models apart. They have different coefficients. They have different variances in the random effects. So they're different. I'm not sure what I'm missing.,2013-10-11 00:02:51.807 +103723,57268,9049.0,CC BY-SA 3.0,,"In addition to David's comment for extra clarifications. While I *really* appreciate the fact you went into the trouble of giving R-code unless you specify the random seed (eg. `set.seed(0)`) one will not be able to replicate your results/graphs. (In general, code comments would be a plus too.)",2013-10-11 00:20:03.413 +103724,412,,CC BY-SA 3.0,user31367,[Here](http://bayesianthink.blogspot.in/2012/12/the-best-books-to-learn-probability.html#.UldEJdJHJBE) is good list of books for non-statistician scientists. Most of them are probability related and some are readable for ones with a non-statistics background. HTH,2013-10-11 00:23:27.370 +103765,57278,1895.0,CC BY-SA 3.0,,"(+1) Nicely written question. Welcome to the site. Just to clarify: In one instance you know the parameters and want to sample from the distribution, but in the other you have *data* and want to find the associated parameters?",2013-10-11 12:00:20.583 +103725,57268,18040.0,CC BY-SA 3.0,,"@DavidJ.Harris True, but I feel like I'm not fitting the right model in the first scenario. I don't think it's the right model in the first scenario, because intuitively, all those lines seem to have the same slope and the same intercept. So I guess I'm wondering, what's the 3rd parameter that says ""There's variation between groups in the slope, the intercept and the x-values of the groups"". Perhaps my question is more, is there a better model to answer the preceding question?",2013-10-11 00:39:55.483 +103726,57268,3183.0,CC BY-SA 3.0,,"If you want all the differently-colored groups to have the same slopes and same intercepts, then you can enforce that by removing the random effect.",2013-10-11 00:43:20.520 +103727,57268,3183.0,CC BY-SA 3.0,,"In general, linear regression techniques don't have much to say about $x$, just about $y|x$. There's nothing in either model that says orange has to be on the left and blue has to be on the right.",2013-10-11 00:44:34.993 +103730,57268,9049.0,CC BY-SA 3.0,,"+1 to David on this. You are fitting the *right* model. An LME model assumes your data follow a distribution like $y \sim N(X\beta, Z \Sigma_{\gamma} Z^T + \Sigma_{\epsilon})$. To quote: Pinheiro & Bates (2000) : *""random effects (...) can be regarded as additional error terms to account for correlation among observations within the group""*, ie. 
it has nothing to do with your $X$, only your $Z$. Now if your feel certain values of $X$ might be less reliable or something like that, that's another question not answered directly by an LME though.",2013-10-11 01:05:50.353 +103733,57274,9049.0,CC BY-SA 3.0,,I would recommend using `nlme`'s `lme` or `lme4`'s `lmer` function. They are better documented and with cleaner syntax. In the long run their flexibility will definitely prove beneficial. I think what you are doing appears sensible. Just be weary of *multiple testing* and *data dredging* issues.,2013-10-11 02:42:26.963 +103735,56784,5237.0,CC BY-SA 3.0,,"Interesting question. I don't know the answer, but the idea that some degrees of freedom should be lost makes sense. If you hadn't seen it already, this answer by @whuber should be thought-provoking: [how-to-understand-degrees-of-freedom](http://stats.stackexchange.com/questions/16921//17148#17148). It seems to me that some simulation studies should enable you to get a toehold here, at least for some specific cases.",2013-10-11 04:06:00.847 +103736,57206,22555.0,CC BY-SA 3.0,,"Perhaps it is best to qualify the method further: Hahn and Shapiro (as referenced above) advise that caution should be exercised, especially when the sample size is less than 200 - and recommend that this be followed by further verification, such as a frequency table that compares the fitted distribution with the actual data. But in my view it is a useful method that *can suggest* where the data might lay within a spectrum of possibilities. I have used it on data sets not smaller than roughly 3000 and have built it into computer simulation software where it has proved useful.",2013-10-11 04:38:29.383 +103738,57269,19822.0,CC BY-SA 3.0,,"Thanks Simone for the comments... As you mentioned in your comments, my objective is to predict the class labels of records in the unlabeled dataset using RF. However, i would like to know whether this is possible using the RF code of Brieman and Cutler..??",2013-10-11 05:48:15.413 +103739,57269,1741.0,CC BY-SA 3.0,,"I am not that familiar with that code. It might also be a bit outdated. Try WEKA or R, they provide recent random forest implementations.",2013-10-11 05:54:42.780 +103740,57213,22190.0,CC BY-SA 3.0,,"I read your paper, but I am not able to understand what is the meaning of getting concordance and discordance as 25.0 and how to know by this whether the model is good or not",2013-10-11 05:57:39.060 +103741,56784,,CC BY-SA 3.0,,"Not sure how helpful this is, but there is a similar problem in the field of robust estimation. Specifically, a method of robust estimation (eg trimmed mean) often requires a parameterized input (eg parameter defining how much to trim). This parameter can be chosen by a data-driven method (eg see how fat the tails are before choosing the trimming parameter). But pre-selecting the trimming parameter does affect the distribution of the trimmed mean, versus, say, a fixed parameter rule. The usual way it is dealt with in that literature is via a bootstrap.",2013-10-11 06:10:11.600 +103742,57259,6162.0,CC BY-SA 3.0,,"i don't understand what you say. If you want to compare group means, the procedure is correct.",2013-10-11 06:21:29.663 +103743,56784,594.0,CC BY-SA 3.0,,"@ColinTBowers -- potentially somewhat helpful, thanks. 
Didn't think about the possibility of bootstrapping.",2013-10-11 06:59:39.877 +103745,57263,21762.0,CC BY-SA 3.0,,"You cannot distinguish the ""online program"" effect from any sort of time/placebo effect",2013-10-11 07:45:50.070 +103746,57281,21762.0,CC BY-SA 3.0,,Just one pint out of 20 is defective.,2013-10-11 07:53:49.630 +103747,57281,9074.0,CC BY-SA 3.0,,"Oh, sorry, didn't notice that. Will have to update the answer when I get home. Thanks for noticing.",2013-10-11 08:02:27.790 +103748,57265,21762.0,CC BY-SA 3.0,,30 tests will need a huge sample size to keep the experiment wise error rate low.,2013-10-11 08:09:13.900 +103753,57284,12683.0,CC BY-SA 3.0,,What are you trying to do - perform a test or estimate something with an associated confidence interval?,2013-10-11 09:13:05.887 +103754,57186,22542.0,CC BY-SA 3.0,,"It is not you, jeremy - I struggle to explain my thoughts in English :) I am looking, for example, at a school in an area that has changed significantly in recent decades. I am looking at people who enrolled in that school at three different times and asking them why they chose that school: for example, they wanted to go elsewhere but could not afford, school was their first preference, etc.",2013-10-11 09:55:49.540 +103755,57186,22542.0,CC BY-SA 3.0,,"All analysis is in STATA, though I think I can access SPSS",2013-10-11 09:56:14.713 +103756,57220,12683.0,CC BY-SA 3.0,,"Look [here](http://stats.stackexchange.com/questions/5450/what-if-interaction-wipes-out-my-direct-effects-in-regression), & at Ray's answer in particular. **There is no sense at all in worrying about the significance or otherwise of main effects if you have an interaction term in the model.**",2013-10-11 10:23:28.123 +103757,57222,12683.0,CC BY-SA 3.0,,"What do you mean by ""center *earnings* before taking the logarithm""? It can't be to subtract the sample mean earnings or you'd be trying to take logs of negative numbers.",2013-10-11 10:37:31.057 +103761,57288,21762.0,CC BY-SA 3.0,,"Not completely sure about what you mean by ""reversing"": Obviously $|X - \mu|$ is the same as $|\mu - X|$, where $\mu$ denotes true mean of $X$. In this way, the answer is ""yes"".",2013-10-11 11:12:47.790 +103762,57287,21762.0,CC BY-SA 3.0,,"If you already have the population at hand, why would you use bootstrap samples to make inference about this population?",2013-10-11 11:14:19.080 +103763,57230,22569.0,CC BY-SA 3.0,,"Kind of. However the graph will only be there for one 'student' or 'user'. It will go up on their profile page, each user has their own set of analytics.",2013-10-11 11:16:47.063 +103764,57288,,CC BY-SA 3.0,,"Very interesting, an extremely subtle question. I think the point he is getting at @MichaelMayer is that although $|X-\mu|$ is numerically $|\mu-X|$ the distribution of $X|\mu$ is not the same as $\mu|X$. My feeling is this either leads us into Bayesian statistics, where it will all fall out fine once you specify priors etc., or into those tortured lawyer-esque frequentist statements about exactly what a confidence intervals is.",2013-10-11 11:51:31.193 +103766,57291,,CC BY-SA 3.0,,This question appears to be off-topic because it is has no statistical content other than finding a function/specific R package. 
You can try http:/www.rseek.org ,2013-10-11 12:08:11.700 +103767,57288,21762.0,CC BY-SA 3.0,,"@Corone: You are right, I didn't realize the Bayesian flavor of the question :-)",2013-10-11 12:09:57.120 +103768,57278,20144.0,CC BY-SA 3.0,,"@cardinal well, ideally the process would be find the parameters -> make a prediction given $E(x)$ with those parameters by sampling from the distribution. The idea is that through my experiments $E(x)$ can change, albeit not randomly, but $T$ can't (it is a thermodynamical property). But I don't know $T$ and I have some data. Is this approach incorrect? Just as a comment, we can consider $E(x) = \mathbf{E}\cdot\mathbf{x}$, but as I said it is not random, I want to predict the most likely state given such $\mathbf{E}$",2013-10-11 12:13:32.717 +103769,57278,20144.0,CC BY-SA 3.0,,"And finally, in the data I have I know $\mathbf{E}$ and the marginal distributions of each $x_{i}$.",2013-10-11 12:14:33.283 +103772,57293,21762.0,CC BY-SA 3.0,,Is it possible to include all PCs?,2013-10-11 12:33:54.143 +103773,57287,21624.0,CC BY-SA 3.0,,"@MichaelMayer Yes, I could. The problem is there are many populations, and it could be better if I have an uniform method to do so. :)",2013-10-11 12:35:43.703 +103775,57242,22573.0,CC BY-SA 3.0,,Thanks. I updated my question. Hope this makes it more clear.,2013-10-11 12:40:18.537 +103777,57298,21762.0,CC BY-SA 3.0,,The first integral should start at 1 (maybe its just a typo),2013-10-11 12:43:41.530 +103778,57288,19436.0,CC BY-SA 3.0,,"@Corone Yes, the question is can we argue about $\mu|X$ and what may be a suitable rigorous mathematical framework. I'll not accept ""Bayesian statistics"" as an answer, that's too cheap :)",2013-10-11 12:54:28.643 +103780,57300,19395.0,CC BY-SA 3.0,,Thank you so much for you answer!It's funny how Stata does not allow to choose an exact result (I am assuming the program assumes normality). I guess I have to dust off my R knowledge...,2013-10-11 13:32:37.647 +103781,57288,1889.0,CC BY-SA 3.0,,"Suppose I observe the value $19.21886$ from an unknown distribution (which might be a Cauchy distribution of unknown centre and spread, or from something completely different). I do not even know if the population distribution has a mean or standard deviation, so it is difficult to say much in general about how far the mean is from my observation.",2013-10-11 13:42:36.290 +103782,57242,668.0,CC BY-SA 3.0,,"It does not sound like computing a covariance matrix would address your problem. Instead, it seems that you may have one or more measurements of the distance of something (along with its orientation), made with a known amount of radial error, and you wish to estimate the location and quantify the uncertainty in that location. Would this be an accurate interpretation?",2013-10-11 13:58:37.167 +103783,57206,668.0,CC BY-SA 3.0,,"I can see your method giving useful information with datasets of 3000 or greater. 
However, then there is no need to perform distributional testing to assess the applicability of a t-test of the mean.",2013-10-11 14:00:52.337 +103784,57300,15827.0,CC BY-SA 3.0,,"@asdir What you tried in Stata is not clear, but see the help on `permute` for an example of use with `ranksum` or `search somersd` for more general approaches.",2013-10-11 14:03:53.980 +103785,57242,22573.0,CC BY-SA 3.0,,"yes, that's exactly what I'm trying to do...",2013-10-11 14:08:11.717 +103786,57259,22564.0,CC BY-SA 3.0,,Well is there a procedure I can use to do this that will give me a wider estimate for group mean if the measurements are less precise? Regardless of any math theory such a procedure is bound to be closer to what I have in mind. I messed around with rjags and the credible intervals for group level means seem to scale with the within-subject variance. Is this what I want to do?,2013-10-11 14:08:56.487 +103787,57285,21108.0,CC BY-SA 3.0,,"(BTW: I was using the formula as you defined it, I mixed A and B when writing the question. Good you notice!)",2013-10-11 14:09:31.303 +103788,57259,6162.0,CC BY-SA 3.0,,"@Flask, as I said, the within-variance is not ignored. Try some simulations, use ""my"" method, increase the within-variance and you will see that the intervals become larger (because by increasing the within-variance you increase the total variance).",2013-10-11 14:12:57.213 +103789,57297,668.0,CC BY-SA 3.0,,"Could you please clarify whether you want to test equality of *eigenvalues* (as stated in the first sentence), *eigenvectors* (as in the penultimate paragraph), or the full *""eigendecomposition""* (as in the title)? (The latter would be equivalent to testing equality of the correlation matrices.)",2013-10-11 14:13:38.863 +103790,57288,668.0,CC BY-SA 3.0,,"If you do not assume a prior distribution for (mean, SD), then it is hard even to make sense of this question. (What does it mean to be ""close""? What does it mean to talk about the ""probability"" of the mean?) So I do not think you can reject a Bayesian solution as ""too cheap.""",2013-10-11 14:28:23.990 +103791,57305,22143.0,CC BY-SA 3.0,,$\int f(x)^2 dx$ is also $\mathbb{E}[f(x)]$ where the expectation is with respect to $X \sim f(x)$. Does this help?,2013-10-11 14:28:31.940 +103792,57220,,CC BY-SA 3.0,,The relevant concept here is 'marginal effect'. Ask instead whether (i.e. within what range or setting) *that* is significant.,2013-10-11 14:34:47.123 +103793,57305,1895.0,CC BY-SA 3.0,,**Related**: http://stats.stackexchange.com/questions/9926,2013-10-11 14:37:48.093 +103794,57293,1895.0,CC BY-SA 3.0,,Have you (already) read the relevant section on this in *Elements of Statistical Learning*?,2013-10-11 14:44:19.050 +103795,57306,9522.0,CC BY-SA 3.0,,"Thanks a lot for your answer. I also though that Fisher´exact test could be a good method for the analysis. I have not any statistic software to perform the results for other functional classes I would like to test too. Do you know any ""online"" tool to obtain the pvalues with all the decimals?",2013-10-11 14:55:33.103 +103796,57305,1895.0,CC BY-SA 3.0,,"Tom, Do the comments and the link answer your question or are you looking for something further?",2013-10-11 15:14:07.703 +103797,57306,15827.0,CC BY-SA 3.0,,You can download R for free. See http://www.r-project.org/ So having no software is soluble (and thinking that you need a way of calculating online is incorrect). But please do a little searching to find out these things for yourself. 
See advice at http://stats.stackexchange.com/help/how-to-ask on asking a good question.,2013-10-11 15:16:11.793 +103798,57307,15827.0,CC BY-SA 3.0,,"See answer and comments at http://stats.stackexchange.com/questions/72553/which-statistic-test-to-used which already give implicit and explicit answers, namely download R.",2013-10-11 15:18:07.647 +103799,57288,20473.0,CC BY-SA 3.0,,"You need to clarify whether you are looking at $\mu|X$ (@Corone style and a comment of yours) or $\mu|x$ (@Henry style and the main body of the question)... and I won't accept ""both"" as an answer, that's too expensive (to answer). :)",2013-10-11 15:32:26.183 +103800,57308,22262.0,CC BY-SA 3.0,,My data is not of the form to trust a test on the Pearson correlation. In fact it would violate almost every assumption of such a test. Is this still the approach you would take?,2013-10-11 15:37:06.073 +103828,57302,750.0,CC BY-SA 3.0,,"Yes you are correct about the last typo (I meant slopes not intercepts). I read the updates, and besides my answer and comment (and David Harris's comments) I don't see what else you are confused about exactly.",2013-10-11 17:19:38.033 +104262,57477,594.0,CC BY-SA 3.0,,"I have gone into more detail. If that doesn't suffice, please indicate clearly where the problem lies.",2013-10-15 02:47:33.973 +103801,57259,22564.0,CC BY-SA 3.0,,"This function does not include any information regarding within-subject variance: `dd <- aggregate(tvalue~Group+Subject, data=tdat, FUN=mean)`. I have run simulations where I sample each individual value from `dnorm(Subject_mean, sd)` and with sd from `seq(.001, 10, by=.001)`. It is true that larger sd widens the distribution of pvalues, but this is only because the means differ from the actual data. If the means are the same I will get the same pvalue.",2013-10-11 15:37:29.833 +103802,57308,22507.0,CC BY-SA 3.0,,How did you produce the data? Why you cannot trust a test on the Pearson correlation? Please elaborate.,2013-10-11 15:42:57.830 +103803,57265,2490.0,CC BY-SA 3.0,,"Assuming you mean family-wise or experiment-wise error rate, I don't see how this is 30 tests. So I don't think that applies here, but I would be interested in hearing why you think it is. It's one test/experiment, with a single hypothesis and a single variable with two levels. One of the levels could be considered the control.",2013-10-11 15:44:15.237 +103804,57259,22564.0,CC BY-SA 3.0,,I have added the code and result to the question,2013-10-11 15:49:04.647 +103805,57308,22262.0,CC BY-SA 3.0,,"Heteroscedasticity, non-normality, autocorrelation, non stationarity. Pearson correlation test unbiasedness is known to be extremely susceptible to violations of its assumptions.",2013-10-11 15:51:28.810 +103806,57206,22555.0,CC BY-SA 3.0,,"Whether one views this as a useful technique, as I do, or otherwise, as appears to be your view, it nonetheless is a quick and long-established (by Pearson) alternative to testing for normality (and Students-t application) in the context of this thread. Please don't get me wrong, I acknowledge and do agree with your concerns. 
But we would both agree, would we not, that without prior information, trying to establish whether an entire population can be modelled on a Gaussian from a very small data sample is a shot in the dark at best with any method, and at worst is dangerous.",2013-10-11 15:52:57.690 +103807,57254,22583.0,CC BY-SA 3.0,,Dr Huber - I do agree with you that the best way would be to destroy a small sample of the packets to assess the intra-packet variability. I was just hoping to to use mathematical statistics to come to a conclusion of some sort without destroying packets. I gave the question some more thought I have updated my original post. Your thoughts?,2013-10-11 16:03:09.577 +103808,57206,668.0,CC BY-SA 3.0,,"That's right. All I am saying is that if it is dangerous to try, from a small sample, to test whether the population is Gaussian, then it must be at least as dangerous to use the skewness and kurtosis to identify what the underlying distribution might be! In fact, it seems like such an attempt would actually be worse because it relies on unstable statistics like the kurtosis. Although Pearson's system can be a powerful guide to help people identify possible distributions, it provides less insight than even limited graphical displays like histograms.",2013-10-11 16:15:50.073 +103809,57307,668.0,CC BY-SA 3.0,,"@Nick I deleted the first part of your comment in the spirit of http://stats.stackexchange.com/help/behavior. I know you were helping, but we should take care to consider the feelings of newcomers who are not used to the site and the necessarily abbreviated conversations that occur in comments.",2013-10-11 16:17:54.363 +103810,57206,22555.0,CC BY-SA 3.0,,"It can be argued that the Student's-t should not be applied unless there is prior information that the population is in fact Gaussian, as it is designed for less than 30 samples anyway. The Students-t is in essence a way to 'narrow the spread' of a predicted population as a small number of samples is increased, but the assumption is that the population must be Gaussian to start with.",2013-10-11 16:19:33.527 +103812,52871,18447.0,CC BY-SA 3.0,,@gjabel it's a huge data set.I'm not sure how I can provide a slim data.Do you the problem due to data?,2013-10-11 16:25:03.543 +103813,57206,668.0,CC BY-SA 3.0,,"That argument continues to take place. Going back at least to Box 50 years ago, many have pointed out that the t-test actually is an approximation to a non-parametric (permutation) test: it does not require Normality of the population, but only approximate Normality of the *sampling distribution* of the mean. Simulations (and some theory) indicate the t-test may break down with skewed distributions, but even then it can be surprisingly robust.",2013-10-11 16:25:34.213 +103814,52871,18447.0,CC BY-SA 3.0,,@DistribEcology I've tried `th` instead of `t` as variable name but it did resolve the problem,2013-10-11 16:27:09.267 +103815,57206,22555.0,CC BY-SA 3.0,,"@whuber, I surrender. You've just exceeded my depth of knowledge on this subject ;-}",2013-10-11 16:28:59.410 +103816,57302,18040.0,CC BY-SA 3.0,,This has been helpful in me restating the question. What I'm getting at is how could I tell apart the following scenarios.,2013-10-11 16:29:30.040 +103817,57266,18447.0,CC BY-SA 3.0,,"How can I narrow down priors of variances while I'm using Wishart Dist.? The `cov[1:2,1:2]` is an identity matrix and I set df of Wishart Dist. as 2 because of the dimensions of `th`. 
I completely agree with you about trickiness of BUGS models.",2013-10-11 16:32:05.323 +103818,57288,19436.0,CC BY-SA 3.0,,"@whuber I'm not rejecting a Baysian solution at all. I'm rejecting just ""Bayesian statistics solves this"" as too unspecific for an answer.",2013-10-11 16:48:08.690 +103819,57284,,CC BY-SA 3.0,user14650,p is significant if it is smaller than the significance level. significance level must be chosen to fit the research topic. it is not a fixed value.,2013-10-11 16:59:51.547 +103820,57307,17249.0,CC BY-SA 3.0,,"Why do you need a p-value more exact than <.0001? I could have seen why you might have wanted an exact p-value if it had been > .001, but is there a reason why you want to be more precise than ""<0.0001"" ?",2013-10-11 17:02:53.247 +103821,57286,,CC BY-SA 3.0,user14650,you cannot interpret p without knowing the conf. level alpha,2013-10-11 17:04:54.080 +103822,57288,19436.0,CC BY-SA 3.0,,"@AlecosPapadopoulos in Vapnik-Chervonenkis theory it's $\mu|x$ (that is, actual values, data). Also, it is not Chebyshev's inqequality, but a large deviation type inequality that is inverted. If you happen to have one of their books around, any of the results that starts with: ""With probability $1-\delta$"" I would call unfounded.",2013-10-11 17:08:25.550 +103823,57310,17249.0,CC BY-SA 3.0,,(+1) I think this is better than suggesting to the OP to download R. I was not aware of this online resource.,2013-10-11 17:09:40.097 +103824,57302,750.0,CC BY-SA 3.0,,"@DistribEcology, are you asking about whether `x` significantly varies between `g`? That might be best answered by boxplots of `x ~ g` or an anova. If you are asking about whether `x` has a differing effect in different `g`, then you can use the example in my answer as an inferential test between the models (one with random effects and random intercepts and one with only a random intercept).",2013-10-11 17:09:51.210 +103825,57259,6162.0,CC BY-SA 3.0,,@Flask This discussion is too long. Maybe you should isolate this new specific question and open a new thread.,2013-10-11 17:17:36.347 +103826,57302,18040.0,CC BY-SA 3.0,,"I tried to elucidate the question by adding significant edits to the question, which maybe you can check out. But I'm not sure I follow that adding random intercepts adds nothing. Do you mean slopes? Clearly there's lot's of variance between intercepts, but none between slopes.",2013-10-11 17:18:02.863 +103890,57286,,CC BY-SA 3.0,user14650,why then does the function wilcox.test expect an argument conf.level and set a default for it?,2013-10-12 08:48:47.137 +103829,57302,18040.0,CC BY-SA 3.0,,"Well, I suppose @DavidJ.Harris comments not withstanding, I wonder what is the appropriate model to make inferences that there are grouping is correlated with both X and Y. Is there a cohesive model formulation I'm missing, or just to to fit the mixed effects model and throw in a secondary ANOVA with `x ~ g` That's what I'm trying to get at.",2013-10-11 17:26:40.743 +103830,57298,3446.0,CC BY-SA 3.0,,"There's no such thing as a cumulative density function. The word ""cumulative"" contradicts the word ""density"". See this disambiguation page on Wikipedia: http://en.wikipedia.org/wiki/Cumulative_density_function",2013-10-11 17:27:57.960 +103831,57302,750.0,CC BY-SA 3.0,,"Like David said in the comment, it seems your asking about the between group variation for `x` within `g`, which doesn't have anything to do with a model of `y = x|g`. 
`x` varying within `g` is neither a necessary nor sufficient condition for intercepts or slopes to vary when predicting `y`. So a second test seems appropriate to answer that question - although your example graphic sort of says it all in this example.",2013-10-11 17:30:41.850 +103832,57263,22585.0,CC BY-SA 3.0,,"Peter, sorry, should have been clearer. This site has coaching tools and online content to help people manage these mental health conditions, so the ""treatment"" is the use of these resources.",2013-10-11 17:32:48.320 +103833,57306,668.0,CC BY-SA 3.0,,"@Nick Your advice is good, but please do not couch it as a characterization of the poster: such phrasing is all too easily misunderstood as an attack, which I doubt you intended. Therefore I removed the preliminary phrase in your comment (which added no information to it).",2013-10-11 17:48:25.727 +103835,57302,18040.0,CC BY-SA 3.0,,I guess I was hoping there would be a nice elegant modeling framework that I was missing. But I take your and David's point.,2013-10-11 17:57:14.757 +103836,57311,668.0,CC BY-SA 3.0,,"+1 It's a good question. Note that it is pertinent even to the case of ""continuous"" dependent variables: in many situations it is not the case that numerical differences have the same meaning or interpretation regardless of the levels of the original values.",2013-10-11 18:03:30.937 +103837,57314,668.0,CC BY-SA 3.0,,"Comparisons to the ""real world"" are next to impossible except in contrived situations: after all, we compute the CIs precisely because we do *not* know the values they target. However, there are a huge number of simulation studies of coverage of CIs: it is practically unimaginable that there exists any CI in existence that has not been studied in that way. Having said that, there are some notable exceptions, such as a study of the [CIs for the speed of light provided by physicists](http://books.google.com/books?id=ajd1V305PgQC&pg=PA58#v=onepage&f=false).",2013-10-11 18:06:00.040 +103838,57259,22564.0,CC BY-SA 3.0,,Hopefully the answers to this question will help me: http://stats.stackexchange.com/questions/72573/when-making-inferences-about-group-means-are-credible-intervals-sensitive-to-wi,2013-10-11 18:06:51.590 +103839,57297,18198.0,CC BY-SA 3.0,,"Sorry I do mean test the full eigen-decomposition, however not that you mention it, being able to test the eigenvectors seperately would be nice so I can try to understand where any differences are coming from.",2013-10-11 18:07:42.517 +103840,57293,18198.0,CC BY-SA 3.0,,"Sorry Micheal can't use all the PC's , I'm trying to reduce the multi-collinearity by removing the smallest PC's. Will Look at Elements of statistical learning!",2013-10-11 18:11:29.377 +103844,57302,3183.0,CC BY-SA 3.0,,"You can evaluate whether ""Part of that difference between districts is because of the underlying relationship between spending and test scores"" by seeing how much of the variance associated with your district grouping factor disappears when you remove (or randomize) spending.",2013-10-11 18:46:24.837 +103845,57266,18040.0,CC BY-SA 3.0,,I'd try modifying the inits you draw from the rgamma in your init function as a place to start.,2013-10-11 18:56:03.643 +103846,57315,16474.0,CC BY-SA 3.0,,"One thing I noticed that the variances in your original example are very small 6.5e-6 = 0.00000065. 
It depends a bit on how you scaled your variables, but to me this suggest that either you need to rethink the scale of your variables or your variance is _de facto_ zero",2013-10-11 19:19:44.163 +103847,57314,3446.0,CC BY-SA 3.0,,"We do not know the true values at the time we form confidence intervals, but in some cases one learns them later.",2013-10-11 19:49:46.893 +103848,57319,2081.0,CC BY-SA 3.0,,"It sounds in your text that you equate ""reversing the sign of the coefficient"" with ""suppressor effect"". But actually these two are different phenomena. Suppressing can exist without sign reversal, and vice versa.",2013-10-11 19:51:37.463 +103849,57314,668.0,CC BY-SA 3.0,,"Yes; that is the case with the speed of light experiments, which cover 90 years and ultimately are compared to a consensus value obtained 25 years after that. But it took a long time to pin down even this fundamental physical constant. In other fields (economics, for instance), finding true values typically is impossible, yet their estimates likely are subject to much more unexpected and unmodeled error. We should be cautious in generalizing CI coverage results from one field to another or even from one kind of experiment to another within a field.",2013-10-11 20:00:14.623 +103850,57316,503.0,CC BY-SA 3.0,,What are the counts? Are they all small numbers or do they vary over a wide range?,2013-10-11 20:13:02.473 +103852,57321,22564.0,CC BY-SA 3.0,,"That model makes sense, yet the code does not seem to incorporate that information and I guess this is what I would like to understand. Am I doing this right for parameter estimation?: `lmer(Value~Group -1 + (1|Subject), dat)` `lmer(Value~Group -1 + (1|Subject), dat2)`, where dat is the original data and dat2 is the simulated with small within-subject variance. I get the same standard errors.",2013-10-11 20:18:35.103 +103854,57321,6162.0,CC BY-SA 3.0,,"I have not tried, but that sounds strange, you remove the fixed intercept but there is a random intercept by subject. From the theoretical point of view I don't see any problem but I don't exactly know how `lmer` deals with models without interecept. Keep the intercept to be sure.",2013-10-11 20:31:10.460 +103855,57321,22564.0,CC BY-SA 3.0,,I followed [this instruction](https://stat.ethz.ch/pipermail/r-help/2008-April/160074.html) as I could not otherwise figure out how to get an interval estimate. My understanding of the R formula syntax is low so maybe it makes no sense.,2013-10-11 20:34:51.587 +103856,57315,9049.0,CC BY-SA 3.0,,I don't seem to be able to replicate your results. Which version of `lme4` are you using? If you are not using version 1.0-4 or newer I would recommend upgrading before anything else. Currently I get `a failure to converge in 10000 evaluations` message.,2013-10-11 20:39:46.030 +103857,57261,,CC BY-SA 3.0,,Can't you just add a very tiny number to each element and renormalize?,2013-10-11 20:56:36.737 +103858,57265,21762.0,CC BY-SA 3.0,,You are mentioning multiple Likert-type items. I thought you would want to compare them between groups.,2013-10-11 21:08:51.470 +103859,57321,6162.0,CC BY-SA 3.0,,"@Flask AFAIK there's currently no package in R providing a way to get ""correct"" confidence intervals for `lmer` models. For your model in the particular case of a balanced design there exist some exact least-squares methods, but I don't know whether they are available in some package.",2013-10-11 21:37:44.797 +104123,57436,1693.0,CC BY-SA 3.0,,I'll study this. 
Did you intend 'x.and.z' instead of 'x.and.y' in lines 4 and 5 of your R code?,2013-10-14 14:02:09.670 +103860,57328,668.0,CC BY-SA 3.0,,"Note that in removing the means you have made your sequences not quite iid: values now have a slight negative correlation. In standardizing the covariance matrix you will exacerbate that somewhat. There is an inevitable trade-off between maintaining independence *within* each sequence and no correlation *between* sequences. If that's ok with you, then the next question is one of computational efficiency: although there are readily available solutions (e.g., SVD), they may start to founder as $M$ and $N$ grow large.",2013-10-11 21:39:58.693 +103861,57321,6162.0,CC BY-SA 3.0,,Though I wonder whether the `lsmeans` package together with the `pbkrtest` package could provide good confidence intervals.,2013-10-11 21:40:26.713 +103862,57305,22607.0,CC BY-SA 3.0,,"Yes I think they're good, & enough to answer my question. Thanks cardinal (not sure how to mark this as answered).",2013-10-11 21:43:28.013 +103863,57273,594.0,CC BY-SA 3.0,,"Are you after a basic but general discussion of MCMC overall, a description of a specific kind of MCMC, or a specific discussion of it in relation to your problem?",2013-10-11 21:43:52.313 +103864,47981,6162.0,CC BY-SA 3.0,,Do you think that this statistical significance has a practical significance ? Be aware of the meaning of statistical significance before claiming it.,2013-10-11 22:32:41.417 +103865,47981,5821.0,CC BY-SA 3.0,,"I dunno... maybe we should run a double bootstrap and calculate a confidence interval for the $p$-value! In all honesty, I would report: ""The findings were borderline significant, $0.049 < p < 0.050$."" At that point, you're splitting hairs, and everyone suddenly remembers that 1/20 odds of a false positive is a completely arbitrary way to run science.",2013-10-11 22:56:51.217 +103866,57329,594.0,CC BY-SA 3.0,,"(1) I'm not sure you can necessarily compare a GLM and a time series model via BIC. (2) In any case which you used depends on what you want to do well at; even when BIC's are comparable, BIC is no guarantee of out of sample performance. *Why* do you want to optimize on one or the other?",2013-10-12 00:09:41.507 +103867,57284,12683.0,CC BY-SA 3.0,,You seem to be confusing the confidence level of a confidence interval - in this case for the pseudomedian - with a p-value for a test. I'll post something later.,2013-10-12 00:11:34.483 +103868,57336,594.0,CC BY-SA 3.0,,"With large enough samples, even trivial differences may be statistically significant. Is a difference in the third significant figure of any *practical* importance (how much difference could it make to you, really)? If there's no practical difference, there's no point testing for statistical significance.",2013-10-12 00:13:59.900 +103869,57314,594.0,CC BY-SA 3.0,,"One major problem is the issue of biased intervals (e.g. caused by a model that misses a possibly small but important effect). Perhaps counterintuitively (until you understand [what's happening](http://stats.stackexchange.com/questions/66473/in-what-settings-would-confidence-intervals-not-get-better-as-sample-size-increa/66475#66475), at least), in real world problems coverage tends to get *worse* as sample sizes increase. For example, a 90% interval might have 88% actual coverage at $n=20$ and say 25% actual coverage at $n=10000$... 
and one that keeps decreasing with larger $n$.",2013-10-12 00:20:36.260 +103870,57329,10135.0,CC BY-SA 3.0,,"Do you have any reference showing that we cannot compare GLM and time series using BIC? Because to me, it is possible since BIC just depends on estimated log likelihood and number of parameter and number of observations. These models can be used to price some products and you want your price to be unique. So at the end you need to pick up one.",2013-10-12 00:24:19.637 +103871,57329,594.0,CC BY-SA 3.0,,"Having seen the particular assumptions under which BIC was derived, I don't see how the comparisons implied by that derivation applies to your situation; the [onus would be yours](http://en.wikipedia.org/wiki/Philosophic_burden_of_proof#Holder_of_the_burden) to show that what you're doing makes sense. [In fact I have one reference that says you can't compare *likelihoods* across models with different error distributions, which if it were correct would wipe out a lot more than just BIC. I don't know that the claim of the reference is correct, though.]",2013-10-12 01:02:07.340 +103872,57317,22564.0,CC BY-SA 3.0,,Well I just found this question. No answer was accepted: http://stats.stackexchange.com/questions/12002/how-to-calculate-the-confidence-interval-of-the-mean-of-means?rq=1,2013-10-12 01:03:28.597 +103873,57329,594.0,CC BY-SA 3.0,,"Some related questions: [1](http://stats.stackexchange.com/questions/65455/can-you-test-likelihood-ratio-between-different-models) [2](http://stats.stackexchange.com/questions/43312/can-i-use-a-likelihood-ratio-test-when-the-error-distributions-differ); there are a number of others as well. As you see from [2], even if you can compare the likelihoods a problem comes up; this problem would apply to a comparison of BICs (the variance difficulty would translate to a shift-issue in difference of BIC's - if one BIC involves an unknown constant not present in the other, what does one do?)",2013-10-12 01:05:22.633 +103876,57273,22596.0,CC BY-SA 3.0,,"Well, everything I find in terms of explanations seems to reference systems that already follow some distribution, like Monopoly. I was hoping for an explanation in terms of trying to fit a model to a preexisting set of data, which I cannot seem to find an example of. So, I suppose a discussion of how it relates to my specific _type_ of problem?",2013-10-12 04:51:13.130 +103877,57273,594.0,CC BY-SA 3.0,,Then you'll need to explain your problem better.,2013-10-12 05:11:47.650 +103878,57286,594.0,CC BY-SA 3.0,,"@what *You* choose your $\alpha$. The program doesn't need to know what you choose. I might choose a different significance level, but we both compare it to the same p-value.",2013-10-12 05:41:19.820 +103880,57308,22507.0,CC BY-SA 3.0,,What do you mean by autocorrelation and non-stationarity? Your dependent variable and predictors are time series?,2013-10-12 06:04:26.677 +103881,57343,21243.0,CC BY-SA 3.0,,"What exactly do you mean by ""not nearest to any point"", exactly? It would seem that by definition, if there are centroids living in a real or other metric space with points, a given point must necessarily be nearest to some centroid.",2013-10-12 06:21:32.687 +103882,57343,22629.0,CC BY-SA 3.0,,For example if we pick 3 centroids and all the datapoints are nearest to either centroid 1 or 2 .In such case all the points would be assigned to centroid 1 or 2 and centroid 3 would not have any points assigned,2013-10-12 06:30:42.827 +103883,57343,21243.0,CC BY-SA 3.0,,"Aha. 
In that case, the points can be assigned to one arbitrarily without any real issue. + +Additionally, you might also find interesting some of the methods of choosing the initial points, such as [K-Means++](http://en.wikipedia.org/wiki/K-means%2B%2B).",2013-10-12 06:39:13.097 +103884,57343,436.0,CC BY-SA 3.0,,@LCialdella: I think what he means is the situation where one of the centroids has no points assigned to it.,2013-10-12 06:55:46.783 +103885,57343,14799.0,CC BY-SA 3.0,,Try [K-means++](http://en.wikipedia.org/wiki/K-means%2B%2B).,2013-10-12 07:08:19.033 +103886,57343,22629.0,CC BY-SA 3.0,,@LCialdella .You got me wrong.I might not have explained clearly .I meant what 'nico' has mentioned.,2013-10-12 07:13:49.123 +103887,57343,22629.0,CC BY-SA 3.0,,@nico could you please let me know what needs to be done in such scenarios,2013-10-12 07:14:31.033 +103888,57308,22262.0,CC BY-SA 3.0,,Yes these are time series which display autocorrelation and non-stationarity. e.g. in the post I talk about pass bands and denoising.,2013-10-12 08:14:08.087 +103891,57271,11490.0,CC BY-SA 3.0,,"Thanks for your answer Alecos. I started with a similar idea, but then I was discouraged by the number of unknown parameters. One question: even if you estimate all the $\gamma$s you still don't have an estimate for the transfers $G^{A -> B}$ etc, right?",2013-10-12 09:57:35.030 +103893,57317,6162.0,CC BY-SA 3.0,,"It's curious that nobody here seems to know my ""trick"". I have just answered this question.",2013-10-12 10:05:43.587 +103894,57271,20473.0,CC BY-SA 3.0,,"No, you do (an estimate that is), because, say $\hat G_{t-1}^{A -> B} = \hat \gamma_{21}A_{t-1}$. Etc. This is the whole point.",2013-10-12 10:13:52.430 +103897,57343,2081.0,CC BY-SA 3.0,,"I see no problem. A cluster may stay empty, after all. I checked your situation - with one initial center far away - in SPSS, which uses Hartigan (1975) algorithm. There comes out an empty cluster without any error message.",2013-10-12 10:38:55.440 +103900,57350,594.0,CC BY-SA 3.0,,"No doubt someone will chime in with formal definitions, informally, all expectations are expectations over the distribution of (/expectation with respect to) some (possibly multivariate) random variable, whether it has been explicitly specified or left implied. In many cases it's obvious ($\text{E}(X)$ implies $\text{E}_X(X)$ rather than $\text{E}_W(X)$). Other times, it's necessary to distinguish; consider the law of total variance for example: $\text{Var}[Y] = \text{E}_X\left[\text{Var}[Y\mid X]\right] + \text{Var}_X\left[\text{E}[Y\mid X]\right]$.",2013-10-12 11:32:52.627 +103901,57349,22630.0,CC BY-SA 3.0,,"@NickCox I've added the link. In first senetence it asked us to subtract mean of each pixel overall images but standard deviation is on over all pixels and all images, so, in SD formula which mean should I use is it mean of that pixel position or mean of all pixels of all images? More importantly, should I take means and sds differently for r,g,b domains or combine rgb as one value and calculate this.",2013-10-12 11:54:03.867 +103902,57349,22630.0,CC BY-SA 3.0,,"@NickCox Thank you very much!, if possible consider adding an answer. More importantly, should I take means and sds differently for r,g,b domains or combine rgb as one value and calculate this?. 
In general what is preferred?",2013-10-12 12:03:35.697 +103903,57349,15827.0,CC BY-SA 3.0,,"Glad that helped, but now this is a morphing into a quite different new question in image processing, and (1) you should pose that in a new thread (2) it's not clear to me that it is essentially a statistical question that belongs here (3) sorry, but I am not experienced enough in that field to advise you.",2013-10-12 12:07:32.573 +103904,57349,22630.0,CC BY-SA 3.0,,@NickCox I mean if you don't mind please add answer to this thread question so that I can mark as accepted. I don't need an answer for the question in the comment. Sorry if I'm troubling you.,2013-10-12 12:11:42.293 +103905,57349,15827.0,CC BY-SA 3.0,,OK; I combined my earlier comments into an answer (and deleted the corresponding comments).,2013-10-12 12:16:56.620 +103906,57319,1693.0,CC BY-SA 3.0,,"Hi ttnphns. A reversal is one way a suppressor effect can work, would you say that's right?",2013-10-12 12:41:27.780 +103907,57319,2081.0,CC BY-SA 3.0,,"Yes, I think. Adding a suppressor can reverse the sign of a coefficient. As well as not. So what is your question about - a suppressing phenomenon or a changing of a sign phenomenon?",2013-10-12 13:05:23.213 +103908,57286,12683.0,CC BY-SA 3.0,,@what: It doesn't expect it - it's optional - & it's for something other than the Wilcoxon signed-rank test itself: the computation of a confidence interval for the pseudomedian. It's explained in the help page.,2013-10-12 13:09:02.730 +103909,57308,22507.0,CC BY-SA 3.0,,"Then I would still use the same approach, each time adding the most correlative predictor, but instead of the statistical significance, I would use the number of predictors as a metavariable. Note however that the autocorrelation makes predictors more, not less, significant, so you never want to add insignificant predictors. To determine metaparameters, I'd use a validation by rolling.",2013-10-12 13:26:01.770 +103910,57317,6162.0,CC BY-SA 3.0,,I have just taken a quick look at your JAGS model. It is different than the frequentist model because you assume a different variance for each subject (nested in group).,2013-10-12 13:26:29.833 +103913,57296,21398.0,CC BY-SA 3.0,,I will also be happy when it is well done :-D,2013-10-12 13:44:55.203 +103914,57318,6630.0,CC BY-SA 3.0,,What's VIF? What's MC?,2013-10-12 14:07:48.923 +103915,57356,8671.0,CC BY-SA 3.0,,"thanks for your reply. But how can I find the dependency between (c=1, k=1) and (c=2, k=1) ?",2013-10-12 14:12:01.880 +103916,57356,6630.0,CC BY-SA 3.0,,"what do you mean by (c=1,k=1)? is that a random quantity?",2013-10-12 14:15:00.643 +103917,57356,8671.0,CC BY-SA 3.0,,For example I want to find the dependency between the cluster 1 (k=1) and a class (c=1).,2013-10-12 14:21:35.523 +103918,57329,20473.0,CC BY-SA 3.0,,"@Glen_b I believe that this paper of Vuong (1989), http://www.jstor.org/discover/10.2307/1912557 provides a general framework for non-nested models.",2013-10-12 14:26:02.840 +103919,57355,9446.0,CC BY-SA 3.0,,Perhaps your question is answered [here](http://stats.stackexchange.com/questions/60383/bonferroni-adjustment-in-spss-what-does-it-do).,2013-10-12 14:28:09.760 +103920,57356,6630.0,CC BY-SA 3.0,,what is random in $k=1$? It's always $k=1$. You can't quantify a statistical dependence if there's no randomness.,2013-10-12 14:40:31.563 +103921,57356,8671.0,CC BY-SA 3.0,,K and c are random variables here. I want to calculate the dependecy between a particular k and particular c. 
how to do it with MI?,2013-10-12 14:41:38.773 +103922,57313,22611.0,CC BY-SA 3.0,,"Thank you for the great response. However, I am confused by this: + +""You should always correlate exogenous variables ..."" + +Everything I've read on the subject (e.g., Byrne's book on SEM with AMOS, etc.) says to correlate variables only if doing so is supported by theory and empirical results (like modification indices). Have I misinterpreted the literature?",2013-10-12 14:51:48.507 +103923,57356,6630.0,CC BY-SA 3.0,,"Once you fix k = 1, it's no longer random!!",2013-10-12 15:03:59.870 +103924,57318,5448.0,CC BY-SA 3.0,,"@Memming - VIF = Variance Inflation Factor, MC is multicollinearity. - Hugh - please don't use acronyms unless they are really, really standard and widely known, like ""BIC"" or ""GLM"".",2013-10-12 15:33:20.460 +103925,57356,8671.0,CC BY-SA 3.0,,e.g. http://en.wikipedia.org/wiki/Cluster_labeling#Mutual_Information,2013-10-12 15:53:53.693 +103926,57355,20927.0,CC BY-SA 3.0,,"thank you!might just opt for a one-way ANOVA because SPSS does not seem to do the Bonferroni with multiple t-tests, so would need to do that manually, best to do that i should think.Thanks!",2013-10-12 16:13:04.097 +103927,56911,2149.0,CC BY-SA 3.0,,Glen .... this is precisely what I pointed out in my response.,2013-10-12 16:20:07.000 +103928,57356,6630.0,CC BY-SA 3.0,,"@user570593 I see. In that wikipedia page, the random variables are indicator functions of k=1 and c=1. That's the confusion.",2013-10-12 16:29:30.053 +103929,57317,6162.0,CC BY-SA 3.0,,"... and your JAGS model also assumes a different between-variance for each group (because you run the model separately for each group, as I understand)",2013-10-12 16:34:56.677 +103930,57356,6630.0,CC BY-SA 3.0,,I updated the answer. Take a look.,2013-10-12 16:35:40.480 +103931,57271,11490.0,CC BY-SA 3.0,,"Yes, but $\hat{\gamma}_{21}A_{t-1}$, $\hat{\gamma}_{22}B_{t-1}$ $\hat{\gamma}_{23}C_{t-1}$ don't sum to $B_t$. I need 3 numbers that sum up to $B_t$, maybe that could be achieved with a model with time-varying coefficients?",2013-10-12 16:35:40.503 +103932,57317,22564.0,CC BY-SA 3.0,,"@Stéphane what was the trick? Also the variance does appear to be different for each subject. Also I am interested in your last comment. I do run it separately for each group, you think I should compare to running it on all groups at once?",2013-10-12 16:49:00.217 +103933,57356,8671.0,CC BY-SA 3.0,,Thank you very much for your answeres. Still I have some problems. I want to calculate the dependancy between a class and a cluster. How can I find it using MI?,2013-10-12 16:55:54.477 +103934,57356,6630.0,CC BY-SA 3.0,,"@user570593 You want to know if ""being in this cluster or not"" tells you anything about ""having this class label or not"". That's why you use the indicator function. And that's all good. If you had more than 2 clusters, the answer would be different. I think your problem is conceptual.",2013-10-12 16:59:57.247 +103935,57321,22564.0,CC BY-SA 3.0,,I accepted earlier on accident. Let me think on this for a bit.,2013-10-12 17:01:28.767 +103936,57271,20473.0,CC BY-SA 3.0,,"Indeed, and this is why the error terms exist. In general $\hat A_t$ never equals $A_t$, this is a core fact of any estimation procedure. 
If for some reasons you _need_ to obtain this equality, then one interesting avenue is to devise a way to _allocate_ the error of each period over the estimated coefficients (that's one way to obtain ""time-varying"" coefficients).",2013-10-12 17:08:30.663 +103937,57358,10135.0,CC BY-SA 3.0,,"Hint: Always in these type of problems, draw a diagram, here a unit square ( i.e. $0z)$ and then $P(Z\leq z)=1-P(Z>z)$.",2013-10-12 17:24:39.227 +103941,57356,6630.0,CC BY-SA 3.0,,"@user570593 If you know it is not cluster 2, you know immediately it is cluster 1. That's why it doesn't matter. It sounds like you have more than 2 clusters in reality in which case MI would work fine. Do you see what I mean?",2013-10-12 17:27:24.413 +103942,57356,8671.0,CC BY-SA 3.0,,"Lets say I have 50 clusters. The problem I have is when I calculate the MI(c=1, k=1) and MI(c=2, k=1) I am getting the same value. (e.g. from the table I have 3 clusters when I calculate MI(c=1, k=1) and MI(c=2, k=1) getting same values). If i get different values for MI(c=1, k=1) and MI(c=2, k=2) s.t. MI(c=1,k=1)>MI(c=2,k=1) I can conclude that the cluster k=1 has high dependency with the class c=1. But for all k=1,...K I am getting MI(c=1, k) = MI(c=2, k)",2013-10-12 17:30:55.207 +103943,57358,22637.0,CC BY-SA 3.0,,"I have drawn a figure and while I understand what I need to do, I still cannot explain the lower limit of integration of x, that is z-1.",2013-10-12 17:39:08.240 +103944,57361,22381.0,CC BY-SA 3.0,,I'm not sure how this answers my question?Can you explain?Is it possible for a volatile series to be defined as stationary?,2013-10-12 17:46:40.870 +103945,57317,6162.0,CC BY-SA 3.0,,"The ""trick"" is to reduce the mixed model to a simple model by taking for observations the subjects means in your case and the groups means in the other question. I don't know what you should do but I claimed that the sampling distribution of your Bayesian model is not the same as the one of the frequentist model.",2013-10-12 17:51:30.947 +103946,57321,6162.0,CC BY-SA 3.0,,"This is a ""toy example"": http://stats.stackexchange.com/a/72610/8402",2013-10-12 17:53:14.627 +103947,57356,6630.0,CC BY-SA 3.0,,"Don't you mean MI(I(c=1), I(k=1)) vs MI(I(c=1), I(k=2))? Why are you changing c? There are only 2 values for c. Your notation of c and k are not consistent in the previous comment, I think.",2013-10-12 17:58:41.443 +103948,57317,22564.0,CC BY-SA 3.0,,"I see what you mean that the model is different. However, the bayesian model displays the behavior I desire in that uncertainty is propagated from individual to group level. As I said if this does not occur then I consider something to be wrong regardless of what can be proven by math. There is some logical issue at play. Either the model that does not propagate the error is misspecified or I am asking the wrong question. 
The jags model has other issues but the best way to do that would be a different question I suppose.",2013-10-12 18:02:12.557 +103949,57321,22564.0,CC BY-SA 3.0,,I have the problem that I can understand in code and images better than generalized equations.,2013-10-12 18:17:48.917 +103950,57362,22381.0,CC BY-SA 3.0,,How can a series be stationary if it exhibits volatility?How do you define stationarity when applying a GARCH model?,2013-10-12 18:24:10.870 +103951,57361,22381.0,CC BY-SA 3.0,,If a time series exhibit volatility clustering doesn't that mean that the series in non-stationary and GARCH cannot be applied to it(if it's non-stationary)?,2013-10-12 18:32:10.400 +103952,57317,6162.0,CC BY-SA 3.0,,"Frankly I don't have the courage to try to understand your R code. It is long and hard to read. With the notations of my answer, the length of the frequentist interval is proportional to $\hat\sigma^2$, hence it increases when $\sigma^2_w$ increases. If you don't see this behavior with your simulations, you have simulated something else.",2013-10-12 18:37:17.147 +103953,57362,22381.0,CC BY-SA 3.0,,Would it be okay if I include AR and MA terms in my mean equation?If the return series exhibit some autocorrelation at short lags.,2013-10-12 18:43:26.197 +103954,57307,9522.0,CC BY-SA 3.0,,I need to obtain the exact pvalue to be able to compare different functional groups of genes.,2013-10-12 18:48:11.620 +103955,57366,22381.0,CC BY-SA 3.0,,"I was thinking of fitting the ARMA first, then fitting the residuals to a GARCH model. Is this wrong?How can I ""check the residuals for any linear time series properties which can then be modelled using ARMA processes.""?Can the ljung-box test be used to detect ARCH effect?",2013-10-12 18:48:59.930 +103956,57310,9522.0,CC BY-SA 3.0,,Thanks a lot Jamie and Patrick. The Microsoft tool works perfectly and now I can calculate all the pvalues :),2013-10-12 18:50:53.570 +103957,57366,306.0,CC BY-SA 3.0,,"simplest way is to look for the auto correlation function of the squared series. if it is significant then try out the GARCH model. if the autocorrelation of the square of the residuals gets removed, then the GARCH does help to model the dependence in the squared series.",2013-10-12 18:52:35.680 +103958,57359,19681.0,CC BY-SA 3.0,,"These issues sound like stylistic concerns. That's not to say that the questions are unimportant, but that the answers may depend more on your precise goals for the analysis. I don't see how any of the approaches that you mention would be ""generally bad"". It might be easier to get the answer you're looking for with a little more background on the scientific problem, and specifically what kind of interpretative statement you want to be able to draw from the model.",2013-10-12 18:54:51.893 +103959,57367,674.0,CC BY-SA 3.0,,"Two-way comparisons with paired samples are not restricted to matching. Siblings, twins, pre/post measurement, or responses to a question asked to both wife and husband might all be tested using, e.g., a t-test for paired samples. 
Could you clarify what your situation actually look like?",2013-10-12 18:57:54.153 +103960,57366,22381.0,CC BY-SA 3.0,,"If I do that my mean return will be 0 right?I want to be able to get a mean that will not be a straight line, like a mean function that will depend on AR and MA terms + the GARCH error.",2013-10-12 18:59:38.167 +103961,57366,306.0,CC BY-SA 3.0,,"there are three things : one is the decision of whether there are GARCH effects present, the other is a justification of using ARMA and GARCH and the third is to actually fit the model when the above two are affirmative. the fitting is not so simple as do it in two different stages. you have to fit both the ARMA and the GARCH parts simultaneously. There are methods available for this.",2013-10-12 19:06:18.897 +103962,57317,22564.0,CC BY-SA 3.0,,It's mostly just code for plots :/. I think I must be using incorrect terminology and it is leading to confusion. Hopefully someone will comment on your answer to the linked question and it will clarify for me.,2013-10-12 19:12:59.990 +103963,57361,20473.0,CC BY-SA 3.0,,"I take it that by ""volatility clustering"" you mean that it appears that the time series is characterized by different variance in different intervals. First, this is just an indication of possible non-stationarity, not proof. Second, the ARCH model and its extensions attempt to explain this ""volatility clustering"" by modelling the _conditional_ variance as time-changing, while maintaining the assumption of a constant _unconditional_ variance (and hence, the assumption of 2nd-order stationarity).",2013-10-12 19:13:17.750 +103964,57363,306.0,CC BY-SA 3.0,,search for recommender systems. this looks like a problem that can be adopted to that. start here http://en.wikipedia.org/wiki/Recommender_system,2013-10-12 19:21:42.567 +103965,57356,8671.0,CC BY-SA 3.0,,"No.. I need MI(I(c=1), I(k=1)) vs MI(I(c=2), I(k=1))",2013-10-12 19:24:35.467 +103966,57362,1406.0,CC BY-SA 3.0,,"Stationary means constant mean, variance and correlation depending only on lag. AR and MA terms can be included in the mean equation. The key in GARCH processes is conditional volatility. Note that volatility is not variance. The mean volatility is series variance.",2013-10-12 19:28:50.913 +103967,57366,22381.0,CC BY-SA 3.0,,Would the use of ARMA be justified if there are correlations in the return series?I think there are packages in R that does the fitting. I only need to know when to apply an ARMA-GARCH or simply a GARCH. Can I use ljung-box test to test for GARCH effects?,2013-10-12 19:29:34.540 +103968,57361,22381.0,CC BY-SA 3.0,,Well lets assume that there is indeed volatility clustering. The series itself would be non-stationary so how can I apply a GARCH model to a non-stationary series as mpiktas did say that GARCH should be applied to stationary series.,2013-10-12 19:33:44.197 +103969,57361,20473.0,CC BY-SA 3.0,,"No, volatility clustering does _not_ necessarily imply non-stationarity. So if it can be ""explained"" by GARCH modelling, then you can operate on the assumption of unconditional stationarity. Indeed, this appears a bit circular - but then again, we can almost never be sure that an actual observed stochastic process is, or is not, stationary.",2013-10-12 19:38:54.117 +103970,57362,22381.0,CC BY-SA 3.0,,"As reference take for example the SP500 data in R, the return data seems to be constant in its mean but exhibit blatant conditional heteroskedasticity. 
So it is possible to apply a GARCH model on it despite having non constant variance?",2013-10-12 19:40:09.887 +103971,57361,22381.0,CC BY-SA 3.0,,If the variance is varying through time doesn't that mean that the series is not stationary?,2013-10-12 19:42:22.333 +103972,57361,20473.0,CC BY-SA 3.0,,"The variance is an _unknown_ moment of the theoretical _unconditional_ distribution that your process follows. It is either time-varying (so the process is non-stationary), or it is unconditionally constant but conditionally time-varying (GARCH-like). These two different scenarios can have the same result as regards the actual evolution of the process through time (i.e. on what you actually observe). So implement GARCH, and if it performs well, you can maintain the hypothesis of stationarity. If it doesn't, then you can entertain the possibility of unconditional non-stationarity.",2013-10-12 20:00:39.623 +103973,57356,6630.0,CC BY-SA 3.0,,"@user570593 so you have only 2 clusters, if 'c' means clusters.",2013-10-12 20:10:03.237 +103974,57361,22381.0,CC BY-SA 3.0,,So there is no way to know if the series is stationary?We just apply the GARCH model like that?,2013-10-12 20:12:23.997 +103975,57361,20473.0,CC BY-SA 3.0,,"Yes, but we scrutinize its performance. Essentially, the concept of GARCH modeling itself has made the unconditional 2nd-order stationarity issue a bit moot (assuming we have mean-stationarity), when it comes to applied modelling.",2013-10-12 20:15:14.547 +103976,57362,22381.0,CC BY-SA 3.0,,"usually can I apply the GARCH model to any log return series that exhibits volatility clustering?I am asking this because I saw in a dissertation that the ADF test was applied to test for stationarity, so I thought that stationarity was necessary before applying the GARCH model.",2013-10-12 20:17:15.447 +103977,57361,22381.0,CC BY-SA 3.0,,let us [continue this discussion in chat](http://chat.stackexchange.com/rooms/11032/discussion-between-andy-and-alecos-papadopoulos),2013-10-12 20:19:12.610 +103978,57329,594.0,CC BY-SA 3.0,,"Thanks Alecos; it's an important reference, and one I had forgotten about since it came out. I'm going to take a close look now. (My recollection was I didn't follow it in 1989, but I've learned quite a few things since then.) -- that may well give a way of doing what is needed here.",2013-10-12 20:48:01.487 +103979,57372,594.0,CC BY-SA 3.0,,"This may be over-relying on a particular choice of phrasing; you're assuming rather a lot from what might be simply a poor choice of words - not everyone here has English as a first language. It's definitely worth raising as a potential problem, but to simply state things so baldly (""absolutely not"") implies you know more than we can tell from what's here. (Further, the reference to a 'lab notebook' implies the OP is doing work in a lab. I doubt this is the case. Again, you imply you know more than we have here.)",2013-10-12 20:56:39.273 +103981,57368,4656.0,CC BY-SA 3.0,,"This looks a lot like a homework problem, and if it is so, please add the `homework` or `self-study` tag as appropriate. With regard to parts (i) and (ii), first draw a diagram of the $x$-$y$ plane, indicate on it the region where $f_{X,Y}(x,y)$ is nonzero as well as the regions where $X/Y \leq t$ etc. You will have different diagrams depending on the numerical value of $t$. 
Integrate the joint pdf over the regions to get the answers you need.",2013-10-12 21:50:39.493 +103982,57371,594.0,CC BY-SA 3.0,,"Correlation is useless for this purpose, even with single-predictors. Consider X on 1,2,3,4,5 and fits f and g being (12, 15, 17, 19, 23) and (101.12, 101.15, 101.17, 101.19 and 101.23) respectively. They're *perfectly* correlated, but they're nowhere near each other.",2013-10-12 22:01:15.430 +103983,57356,8671.0,CC BY-SA 3.0,,No C means class and K means clusters. I want to find out the dependancy of a particular cluster with a class.,2013-10-12 22:10:03.037 +103984,57372,436.0,CC BY-SA 3.0,,As a non-native English speaker I definitely did not read the OP's sentence in this sense...,2013-10-12 22:13:07.747 +103985,57334,436.0,CC BY-SA 3.0,,"But, again, you can use very similar arguments to not reject null. There is nothing special about 0.05, if you had chosen 0.06 as your limit you would probably not be asking the question, but the situation would not be that much different... Rather in these situations I would ask: ""what is the real-life meaning of this result?"". For instance if this was a biological experiment I would look for the biological significance of the specific result, report the p-value as it is and rather comment on the biology.",2013-10-12 22:15:30.820 +103986,45804,594.0,CC BY-SA 3.0,,"Would something like summing the squares of the differences between F and G (or the absolute value of their differences) over all the values for which you have outputs for both suffice as a measure? With 6+ dimensions, I'd suggest not using a lattice-type grid of values, by the way, but random draws from the space of values (though you might use quasi-random sequences instead I suppose).",2013-10-12 22:16:43.557 +103987,57334,594.0,CC BY-SA 3.0,,@nico this was already the point of my item (2); it argues against over-reliance on the formal approach in (1),2013-10-12 22:18:21.067 +103988,57368,4656.0,CC BY-SA 3.0,,"@Glen_b I have noticed some people (here as well as on math.SE) positively bristling at the suggestion that the `homework` tag be added. ""It is _not_ homework,"" they insist, ""I am studying this stuff on my own."" These people are much happier tagging the problem as `self-study`. My responses and/or answer treat both tags the same....",2013-10-12 22:28:29.447 +103989,57358,668.0,CC BY-SA 3.0,,"A few of the ways one might go about computing the distribution of a sum of uniform variates are described in my answer at http://stats.stackexchange.com/a/43075/919. That question might even be a duplicate of this one, depending on what you mean by ""that specific form."" What do you mean?",2013-10-12 22:32:58.067 +103990,57356,6630.0,CC BY-SA 3.0,,let us [continue this discussion in chat](http://chat.stackexchange.com/rooms/11033/discussion-between-memming-and-user570593),2013-10-12 22:52:49.917 +103991,57195,7229.0,CC BY-SA 3.0,,What if you try to increase the plotting threshold (0.5) and to use more than 4 color steps? Or to use thinner-thicker lines instead of colors.,2013-10-12 23:07:05.000 +103992,57359,594.0,CC BY-SA 3.0,,I'd suggest using orthogonal polynomials.,2013-10-12 23:11:59.743 +103994,57367,22641.0,CC BY-SA 3.0,,"Example -- a sample is partitioned into two groups using a clustering algorithm. Those groups are then dependent. However, different subjects reside in each group and are not matched in any way. So, a paired t-test can't be used since no pairing is possible. 
However, the dependence between the two groups precludes using methods that require sample independence.",2013-10-12 23:22:13.693 +103995,57359,346.0,CC BY-SA 3.0,,@Glen_b Can you give some more details?,2013-10-12 23:32:00.083 +103996,57357,668.0,CC BY-SA 3.0,,+1 This is an interesting and (seemingly) powerful approach. Thank you for sharing this idea.,2013-10-12 23:35:00.303 +103997,57368,668.0,CC BY-SA 3.0,,"Notice that the event $X/Y\le t$ can be written both as $X\le tY$ and $Y\ge (\frac{1}{t})X.$ The chance of the latter is $1$ minus the chance of $Y\lt (\frac{1}{t})X.$ Because the distribution of $(X,Y)$ is identical to that of $(Y,X)$, this latter chance is the same as that of $X\lt(\frac{1}{t})Y$ which (because the distributions are continuous) is that of $X\le(\frac{1}{t})Y.$ When $t\gt 1,$ $\frac{1}{t}\lt 1,$ which reduces the calculation to the one you have already successfully performed.",2013-10-12 23:40:49.440 +103998,57367,16588.0,CC BY-SA 3.0,,This probably shouldn't be tagged with 'paired-data',2013-10-12 23:50:25.220 +103999,57371,22507.0,CC BY-SA 3.0,,"acbart writes: _I want to find some statistical measure of how ""similar"" the two functions are_. The two functions are ""similar"".",2013-10-13 00:12:27.173 +104000,57367,594.0,CC BY-SA 3.0,,"@ndoogan I added it, but it's possible I misunderstood the situation. I will remove it. The OP should clarify the nature of the dependence.",2013-10-13 00:13:14.443 +104001,57372,16990.0,CC BY-SA 3.0,,"Mike McCoy, thank you for your answer, but I'm afraid in this case Glen_b is correct. I am not a native English speaker, and while I strive to write and speak as fluently as my skills allow, usage and connotation continue to elude me. So, in this particular case, we didn't try different things until we found something that was significant. Actually, what we were trying to prove is that there were no statistically significant increase in some error value, and in one particular case we found that the error was actually reduced, and when we ran the W test, this is where we got the 0.0499.",2013-10-13 00:25:38.173 +104002,57334,16990.0,CC BY-SA 3.0,,"Thank you Glen and nico. This part of the data was secondary to our experiments, so we just ended up reporting the value as is. In any case, I am marking this as the accepted answer. Thanks again to everyone who participated with answers or comments.",2013-10-13 00:32:04.810 +104005,57353,22638.0,CC BY-SA 3.0,,@Elvis If there are 0's in an array .. how are we supposed to handle that situation?,2013-10-13 03:34:06.977 +104006,57373,594.0,CC BY-SA 3.0,,Is this for some subject?,2013-10-13 03:58:47.257 +104007,57195,22547.0,CC BY-SA 3.0,,"@nadya - I've been thinking about increasing the plotting threshold, and i think it's a good idea. More than 6 colors and the eye will have difficulty recognizing the different levels, though. I could potentially plot just the $n$ highest correlations at each site. But, I wish there were a way to avoid having to calculate and plot $\textrm{order}((n^2)/2)$ correlations for each month's worth of data. There might be something I can use from network / graph theory to reduce the number of pairs.",2013-10-13 04:16:01.267 +104008,57373,594.0,CC BY-SA 3.0,,"As it stands the question seems to be underspecified, but maybe I missed something.",2013-10-13 04:18:38.660 +104009,57366,306.0,CC BY-SA 3.0,,do the ljung-box test on the square of the series. 
ARMA is supposed to model the autocorrelations in the series but not for the autocorrelations in the square of the series for which GARCH exists.,2013-10-13 04:36:24.410 +104010,57357,1506.0,CC BY-SA 3.0,,"Thanks, great answer. I have been looking into signal processing methods with the robFilter package but I was not aware of this technique.",2013-10-13 05:05:49.510 +104013,57374,594.0,CC BY-SA 3.0,,"Under random sampling, yes; this is simply a confidence interval for a binomial proportion, with the usual caveats about the assumptions of and interpretation of confidence intervals.",2013-10-13 06:24:21.550 +104014,57374,594.0,CC BY-SA 3.0,,"As long as $np(1-p)$ is not small (bigger than 10 is usually plenty), you can use the normal interval described [here](http://stats.stackexchange.com/questions/30281/sample-size-for-binomial-confidence-interval/30306#30306). If the $n$ is small and $p$ is very near 0 or 1, you may need to consider one of the [other binomial approximate confidence intervals](http://stats.stackexchange.com/questions/28316/confidence-interval-for-a-proportion-when-sample-proportion-is-almost-1-or-0). Many other posts here cover aspects of CIs for binomial proportions; the search bar turns many up.",2013-10-13 06:36:35.060 +104015,57374,594.0,CC BY-SA 3.0,,"Further examples of previous posts on this topic: [e.g. 1](http://stats.stackexchange.com/questions/4756/confidence-interval-for-bernoulli-sampling), [e.g. 2](http://stats.stackexchange.com/questions/4756/confidence-interval-for-bernoulli-sampling)",2013-10-13 06:39:19.323 +104016,57374,594.0,CC BY-SA 3.0,,"Just as a side note - if you want to control your confidence level, then the size of the interval also depends on that.",2013-10-13 06:45:24.270 +104017,57372,21586.0,CC BY-SA 3.0,,"Mike, I also did not see a problem in the phrasing of the question. And it seems nobody else saw signs of data snooping, mining, dredging, whatsoever here ... And it definitely lies in the eye of the beholder. There is no mathematical fact but a decision rule chosen by the statistician. Re-read what AlefSin, Glen in his point (2) and I wrote.",2013-10-13 06:56:43.053 +104018,57338,,CC BY-SA 3.0,,"I suggest you to read Lecun, Efficient Backprop, 1986 (http://scholar.google.it/scholar?cluster=15983004533596008350&hl=en&as_sdt=0,5) where the author proposes and discuss some tricks and tuning about NNs. I always normalize input features between -1 and 1.",2013-10-13 08:25:31.243 +104019,57185,,CC BY-SA 3.0,user10619,"@Nick The psychometry presumes that generally there is an error in Scale for measurement of say, fear. And hence, pursues the analysis. For example, error (measurement) variance is deducted from observed variance for arriving at true variance. The statistician presumes inertia of large numbers for mean and variance. There is no need for dealing with measurement error.",2013-10-13 09:44:00.070 +104020,57185,15827.0,CC BY-SA 3.0,,"I am at a loss to know what kind of answer you seek. But the implication that statisticians ignore measurement error is contradicted by a substantial literature by statisticians. Go to www.amazon.com and search for ""measurement error models"" to bring up several major works.",2013-10-13 10:13:25.520 +104021,57380,15827.0,CC BY-SA 3.0,,"You can ensure positive predictions by using a generalized linear model with logarithmic link function. By the way, although your $R^2$ value is quite encouraging, a better check of whether the model follows the main shape of the data is a plot of residual vs predicted. 
Plots of observed vs predicted may also help illuminate your problem.",2013-10-13 10:24:29.107 +104022,57372,,CC BY-SA 3.0,,"@IslamEl-Nabarawy If you wanted to establish equivalence/lack of difference, you have many other problems than how to interpret a value close to the threshold or potential data snooping. Just finding a *p*-value slightly over .05 (or whatever error level you choose) is definitely not enough. Look up “testing for equivalence” here and elsewhere or ask a question specifically about that because it's an entirely different problem.",2013-10-13 10:32:39.763 +104024,57379,15827.0,CC BY-SA 3.0,,Spelling out which R function you used is always good practice.,2013-10-13 11:41:54.743 +104025,56784,19681.0,CC BY-SA 3.0,,"It could be interesting to break the problem down into a simplest case. Imagine something like just 5 observations from your favorite distribution, and put a single divider in the data to form just two bins.",2013-10-13 11:53:08.290 +104026,57384,5001.0,CC BY-SA 3.0,,"You wrote ""if you think that"" and ""if the confidence level that you want"". I believe the interpretation is easier to understand if it is stated objectively; independent of what I think. I believe the interpretation will be easier to understand if it is independent of my wants. Please see that my example is free of my wants and my subjective beliefs.",2013-10-13 12:51:21.257 +104027,57384,306.0,CC BY-SA 3.0,,"a 90% confidence level means that you are fine if the estimate is right 9 out of 10 times. 99.73% confidence level means that you are fine if the estimate is right 9973 out of 10000 times. Generally used values are 90%, 95% and 99%. But it is a subjective decision that has to be decided by the researcher.",2013-10-13 13:08:16.190 +104028,57382,503.0,CC BY-SA 3.0,,"Where are you getting the idea about ""gross sampling error""? What is ""MSE""?",2013-10-13 13:25:05.313 +104029,57384,5001.0,CC BY-SA 3.0,,"That's sweet but you probably don't know me well enough to say ""you are fine"". Please rephrase your answer if you care to.",2013-10-13 13:25:59.023 +104030,57380,503.0,CC BY-SA 3.0,,"@NickCox gave one suggestion. I would plot the data in more ways than just residual vs. predicted. However, you can certainly rescale money variables. One common method is to take log(cost) as the dependent variable. (I think this winds up equivalent to the log link function, but might be easier to comprehend). Log(cost) can, of course, be negative. And logs of money variables are often sensible because, e.g. a difference between 0.01 and 0.02 per click is important, but difference between 1.01 and 10.2 per click is not.",2013-10-13 13:38:39.820 +104031,57389,5001.0,CC BY-SA 3.0,,"Ok Thanks. I can simply remove the mention of the 0.514 but I will not be offended if anybody offers to do a, superior in several respects, rewrite.",2013-10-13 14:17:36.463 +104032,57386,436.0,CC BY-SA 3.0,,"Also, if one just wanted to see the linear regression: `lines(x, fitted(lm(y~x)))` would do the trick.",2013-10-13 14:52:36.763 +104033,57373,5448.0,CC BY-SA 3.0,,"Is this homework or self-study? If so, please add the appropriate tag... Try deriving the PDF using $p(x_1, x_2, x_3) = p(x_1)p(x_2|x_1)p(x_3|x_1,x_2)$.",2013-10-13 15:17:12.077 +104035,57373,668.0,CC BY-SA 3.0,,The joint distribution for which the pdf is a constant $1/6$ where $0\le X_i$ and $X_1+X_2+X_3\le 1$ does *not* have uniform marginals. 
@jbowman How are these conditional probabilities to be obtained given that only the *marginal* distributions are specified?,2013-10-13 15:52:42.193 +104036,55209,20222.0,CC BY-SA 3.0,,"@Moderator, thank you for migrating this question as a duplicate to CrossValidated. There are many smart people here, hopefully one or more will offer a reply to this question as I still don't have answers to this.",2013-10-13 15:56:46.467 +104037,57380,15827.0,CC BY-SA 3.0,,@Peter Flom I think meant 1.02 not 10.2.,2013-10-13 16:06:59.233 +104038,57393,668.0,CC BY-SA 3.0,,"Didn't you mean to type ""two-sided"" instead of ""one-sided""?",2013-10-13 16:25:13.920 +104039,57378,668.0,CC BY-SA 3.0,,"Yes, you are being specific, but your specific question is covered by all the answers to the duplicate: you are testing the hypothesis that the intercept equals zero. That is well defined and specific.",2013-10-13 17:10:06.297 +104041,57381,22547.0,CC BY-SA 3.0,,How well do these functions deal with missing data? I quite often have gaps in the time series.,2013-10-13 17:11:42.890 +104042,57394,668.0,CC BY-SA 3.0,,"What does ""do"" mean in your final formula?",2013-10-13 17:12:03.273 +104043,57394,16046.0,CC BY-SA 3.0,,@whuber edited.,2013-10-13 17:16:11.757 +104044,57394,668.0,CC BY-SA 3.0,,"Thanks, but questions need to be understandable on their own, so if you can, please describe what this means rather than just providing links. Otherwise you may severely limit the potential audience for this question and reduce your chances of getting great answers.",2013-10-13 17:19:29.760 +104045,57393,14850.0,CC BY-SA 3.0,,"Yes I did, thanks, well spotted. @whuber, you really pop up everywhere with good responses",2013-10-13 17:22:07.990 +104046,57397,668.0,CC BY-SA 3.0,,"I have upvoted this because it's a legitimate solution, but it is risky in practice. After all, the solution is arbitrarily sensitive to values of $\mathbf Z$: a single high-leverage value will steer the estimates far from a decent fit merely to enforce the constraint. Thus, at a minimum, this procedure *must* be accompanied by a careful goodness-of-fit test to the data.",2013-10-13 17:24:05.110 +104047,57396,668.0,CC BY-SA 3.0,,There is a substantial difference between the answer to this question for a *single* value of $c$ and an answer that is valid for more than one value. Which application do you have in mind?,2013-10-13 17:27:25.360 +104048,57396,22656.0,CC BY-SA 3.0,,a single value of c. I edited the question.,2013-10-13 17:28:01.300 +104049,57397,20473.0,CC BY-SA 3.0,,"@whuber You are right. So, OP, tread carefully here.",2013-10-13 17:30:37.653 +104050,57396,668.0,CC BY-SA 3.0,,"OK, that's easy. For the record, the solution for an arbitrary number of unspecified $c$ is given at http://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test#Setting_confidence_limits_for_the_shape_of_a_distribution_function.",2013-10-13 17:30:58.187 +104051,57394,16046.0,CC BY-SA 3.0,,"Well I am gonna be honest about this. My understanding of causal framework is very primitive and naive and as stated in the paper I linked to is: ""expressions of the form $P(Y = y|\text{set}(X = x))$ or $P(Y = y|\text{do}(X = x))$ to denote the probability (or frequency) that event $(Y = y)$ would occur if treatment condition $X = x$ were enforced uniformly over the population."" + +You are right, but I thought new causal framework is a well known one among statisticians. 
What I am implying is the ""statistician"" above, should be a dumb to treat $X_2=x_2$ as an observational variable.",2013-10-13 17:35:24.610 +104052,57381,8074.0,CC BY-SA 3.0,,"There are EOF methods that are designed for the special case of ""gappy data"" that you describe. Here is a link to a paper that reviews these methods: +http://dx.doi.org/10.6084/m9.figshare.732650 . You'll see that the RSEOF and DINEOF methods are the most accurate for deriving EOFs from gappy data sets. The DINEOF interpolation algorithm can be found here: http://menugget.blogspot.de/2012/10/dineof-data-interpolating-empirical.html",2013-10-13 17:41:05.680 +104053,57396,13037.0,CC BY-SA 3.0,,Are you sure you want $P(X\leq c)$ and not a statistic of your sample such as $P(\bar{X}\leq c)$?,2013-10-13 17:48:52.873 +104054,57396,22656.0,CC BY-SA 3.0,,Yes I want to find $P(X≤c)$,2013-10-13 17:54:32.420 +104055,57389,306.0,CC BY-SA 3.0,,"You understand this answer and not the one I gave, Wow! Yep I think I got your question completely wrong then. My apologies.",2013-10-13 17:55:58.970 +104056,57286,,CC BY-SA 3.0,user14650,"I see. Nevertheless it's not optional. It has a default value, if you don't set it. So it ""expects"" it.",2013-10-13 18:16:18.910 +104057,57396,6162.0,CC BY-SA 3.0,,Count the sample values $x_i$ such that $x_i \leq c$ and consider a confidence interval about the binomial propotion $\theta=\Pr(X \leq c)$.,2013-10-13 18:16:41.220 +104058,57313,12544.0,CC BY-SA 3.0,,"I'm not familiar with that book. If you're doing something analogous to regression, you should correlate predictors or you'll get incorrect estimates.",2013-10-13 18:28:12.573 +104059,57393,15827.0,CC BY-SA 3.0,,The model fitted is not logistic. It grows from 0 and approaches an upper asymptote exponentially.,2013-10-13 18:28:54.200 +104060,57386,22651.0,CC BY-SA 3.0,,"Thanks Peter, You have put it right, i actually want to have a justification of using regression analysis to predict waste generation, im sorry for that ambiguity.one of the prerequisites for regression analysis is linear relationship among variables.so im i safe to proceed and assume a linear relationship in this data?",2013-10-13 18:37:04.970 +104061,57396,22656.0,CC BY-SA 3.0,,"@StéphaneLaurent I am a little confused. My question is how many samples, i.e. $x_i's$, I should choose. What is the binomial propotion? Would you please explain a little more?",2013-10-13 18:43:57.220 +104062,57396,22656.0,CC BY-SA 3.0,,"@whuber Thanks for your help. Knowing the value c, how can I find the required number of sample?",2013-10-13 18:45:54.120 +104063,57393,14850.0,CC BY-SA 3.0,,Is it not? I labelled it logistic because it is a specific version of the [generalised logistic function](http://en.wikipedia.org/wiki/Generalised_logistic_function),2013-10-13 18:50:27.670 +104064,57393,15827.0,CC BY-SA 3.0,,"Really? A glance at the algebra suggests otherwise. In any case, I'd assert that as usually understood in growth modelling a logistic curve has an inflexion.",2013-10-13 19:07:59.810 +104065,57386,503.0,CC BY-SA 3.0,,"""Linear regression"" does demand a linear relationship among variables, but that can be surprisingly nonlinear. e.g. you can include quadratic and cubic terms, for instance. In addition, there are non-linear regression models.",2013-10-13 19:34:47.123 +104066,57219,21952.0,CC BY-SA 3.0,,I made some edits to the problem. 
Also I have data for the previous year and the lagged variables for the Z's could be used as instruments.,2013-10-13 19:47:05.313 +104067,57393,14850.0,CC BY-SA 3.0,,"You are right, there is no division, I'm going blind",2013-10-13 20:01:27.850 +104068,57401,21762.0,CC BY-SA 3.0,,"Two normal distributions centered around 0, one with larger variance (compare 0.1 and 0.9 quantile)?",2013-10-13 20:02:04.353 +104069,57390,20473.0,CC BY-SA 3.0,,The likelihood function of the sample is the joint density. Here the variables are not independent. Do you know how to apply the chain rule to decompose the joint density into conditional densities?,2013-10-13 20:22:07.870 +104070,57401,15377.0,CC BY-SA 3.0,,"I am not sure if this can happen. Quantiles for Normal distributions are basically the scaled up version of the corresponding SD. Therefore if for the 1st Normal, 0.1 quantile is higher then 0.9 quantile will also be higher. What I am looking for is opposite. I am looking for this scenario: 0.1 quantile is higher for 1st r.v. than 2nd r.v. but, 0.9 quantile is lower for 1st than 2nd.",2013-10-13 20:26:28.333 +104071,57401,668.0,CC BY-SA 3.0,,@Michael Mayer is correct. It might help to draw a picture of the two CDFs overlaid on one another.,2013-10-13 21:03:50.783 +104072,57405,19681.0,CC BY-SA 3.0,,"It may lead somewhere to play with the algebra here. Notice that the two logs in the definition of $f$ can be combined (sum of logs is the log of a product). Subsequently, plugging this simplified version of $f$ into 13.1 might lead to an interpretable expression. The Law of Total Probability comes to mind.",2013-10-13 21:24:11.873 +104073,57353,5875.0,CC BY-SA 3.0,,"If two discrete distributions with same support are both 0 at some point of this support, you can just remove this point, as I did above.",2013-10-13 21:45:22.553 +104074,57368,21840.0,CC BY-SA 3.0,,"@whuber I understood your explanation, the only part I am struggling with is why are we using the fact that $(X,Y)$ and $(Y,X)$ have same distribution.",2013-10-13 22:03:53.663 +104076,57380,594.0,CC BY-SA 3.0,,A little sample data would help people illustrate potential solutions.,2013-10-13 22:21:10.770 +104078,57368,668.0,CC BY-SA 3.0,,Because it's simpler than doing the same calculation twice!,2013-10-13 23:20:32.307 +104079,57402,22547.0,CC BY-SA 3.0,,"This is also an interesting idea. Because some of the domains can be quite large, I'd probably group the data into $x \times x$ km cells rather than $x^\circ$ latitude-by-longitude.",2013-10-13 23:41:41.510 +104080,57195,22547.0,CC BY-SA 3.0,,There are some interesting suggestions arriving. I'm in the process of speeding up the basic data processing and am starting to try some of these ideas. I will provide comments later.,2013-10-13 23:42:41.657 +104081,57402,7229.0,CC BY-SA 3.0,,"Yes, to project the coordinates is a good idea. Good luck!",2013-10-13 23:58:41.930 +104083,57228,22547.0,CC BY-SA 3.0,,"If you want, post a question to Stackoverflow and I'll answer there. It's only a few lines.",2013-10-14 00:28:09.860 +104084,57413,594.0,CC BY-SA 3.0,,"This may simply be my own ignorance, but I am unfamiliar with the definition of ""*time persistent process*"". Googling didn't turn up anything obvious. Could you define your term please?",2013-10-14 01:28:44.937 +104085,57414,594.0,CC BY-SA 3.0,,"Your phrasing here: ""* [4,0,0,0...] should be as far away as [32,0,0,0,..] but with lower significance*"" suggests that your notion of distance or significance will have to change for it to make sense. 
I don't see how the two can be consistent as it stands.",2013-10-14 01:40:42.627 +104086,57385,594.0,CC BY-SA 3.0,,Is this for some subject?,2013-10-14 01:44:00.987 +104087,57379,594.0,CC BY-SA 3.0,,More information would help. Would you understand the meaning of the column headings if it were a linear regression model rather than a nonlinear model? Would you understand some but not others?,2013-10-14 02:26:02.990 +104088,57404,22507.0,CC BY-SA 3.0,,"What do you mean by apply to real life? Do you mean science, or technical stuff, or everyday life decisions, or business & administration?",2013-10-14 02:26:27.700 +104089,57388,,CC BY-SA 3.0,,"Hi, I'll give a more detailed answer later, but simply put they are not the same. You should first be measuring the treatment effect within a study and then pooling across studies. The other way is known as the 'naive method' and is dangerous as it can give invalid results. Have a look at the wikipedia page for Sampson's paradox for some examples (http://en.wikipedia.org/wiki/Simpson%27s_paradox).",2013-10-14 02:47:46.493 +104090,57417,594.0,CC BY-SA 3.0,,"You can't *prove* independence from a sample. You might find that your data are consistent with independence, but they'd also be consistent with mild dependence. Showing that they're inconsistent with being iid should be easier.",2013-10-14 03:31:46.377 +104091,57416,5237.0,CC BY-SA 3.0,,"This post is being automatically flagged as 'low quality' because of its length. At present, it's sort of a comment. Can you expand it a little bit to make it more of an answer?",2013-10-14 03:31:48.230 +104092,57417,5237.0,CC BY-SA 3.0,,In what sense do you want a *proof*? Are you just trying to understand the ideas? Is this a class assignment? What would having such a proof help you achieve?,2013-10-14 03:34:00.057 +104093,57417,594.0,CC BY-SA 3.0,,More details/context might help,2013-10-14 03:41:18.997 +104094,57389,5001.0,CC BY-SA 3.0,,"My suggested interpretation, informed by the above, becomes: Under the assumption that the true value of the y-intercept is zero, random sampling of the same number of (x,y) pairs, specifically 90, produced by the same process, would result in a least squares best fit line with a y-intercept at least as extreme as +0.00087, with a probability of 0.0027, and equal to or greater than +0.00087, with a probability of 0.00135.",2013-10-14 04:16:35.180 +104095,57417,22425.0,CC BY-SA 3.0,,"@gung: I'm working on a machine learning problem. When I assumed the data is independent but not identically distributed, I got better results than assuming IID. Hence I would like to prove the data is independent but not identically distributed.",2013-10-14 05:31:45.797 +104096,57418,594.0,CC BY-SA 3.0,,"If they're specified in advance, specific comparisons are usually called planned contrasts.",2013-10-14 05:40:16.690 +104097,57390,21985.0,CC BY-SA 3.0,,I thought about that. But I don't get it... Do you have a more concrete hint?,2013-10-14 05:51:31.597 +104098,57421,594.0,CC BY-SA 3.0,,Can you be more specific about the 'trend'?,2013-10-14 06:11:29.363 +104099,57237,22570.0,CC BY-SA 3.0,,"Well, to me its kind of obvious, because the $\bf{a}$ distributions are pretty much symmetric around zero. So when generate the $a_i$ there's no dependency on $v$. When the current $v$ is at the upper edge of the marginal $\bf{v}$ distribution, you'd assume that there should be a bias towards negative $a_i$. 
""draw values"" refers to: take the 1-dim probability distrbution, built the cumulative distribution, throw a random number $r$ between 0 and 1, find the $x$ where the cum. distribution has the value $r$. This $x$ is my ""drawn value""",2013-10-14 06:17:50.200 +104100,57237,22570.0,CC BY-SA 3.0,,"For completeness: the data originates from gps-logging. I have a set of logged trips in cars, which log speed with 1Hz. So their's a pair of $v$ and $a$ for every datapoint. These are filled into the histogram.",2013-10-14 06:28:09.850 +104101,57420,22425.0,CC BY-SA 3.0,,Thank you for your reply. However I don't understand 'We group together each of the variables separately and then calculate the correlation coefficients for each pair of the groups'. Can you please explain it ?,2013-10-14 06:28:55.460 +104102,57420,306.0,CC BY-SA 3.0,,"have edited the answer, please do read it again. sorry for the earlier ambiguity.",2013-10-14 07:13:47.637 +104103,57413,1406.0,CC BY-SA 3.0,,@Glen_b time persistence is simply a fancy term describing non- stationary data.,2013-10-14 07:23:02.780 +104104,57413,594.0,CC BY-SA 3.0,,"The number of observations doesn't change whether or not something is stationary, though the number of observations might affect your ability to detect it.",2013-10-14 07:38:08.980 +104105,57425,594.0,CC BY-SA 3.0,,"Define 'strong sources', please. We could certainly measure the observation (though it could also be described other ways), but we estimate the parameter.",2013-10-14 07:46:55.707 +104107,57420,22425.0,CC BY-SA 3.0,,May I know the reason behind saying that they have different probability distributions if we cannot reject the hypothesis of these correlation coefficients being zero?,2013-10-14 07:49:12.830 +104108,57420,10547.0,CC BY-SA 3.0,,This would be the definition of random variables which supposed to be independend (this only holds for the normal distribution or spheric distributions) but not iid.,2013-10-14 07:55:11.487 +104110,57431,11117.0,CC BY-SA 3.0,,"In practice, I work on a model corresponding the first case you mention but the corresponding community uses the second case-terminology and I am very uncomfortable with that. I guess that following the community is the best thing to do.",2013-10-14 09:53:33.637 +104113,57287,20120.0,CC BY-SA 3.0,,"I think @MichaelMayer is asking why you are trying to infer a population parameter when you have data from the entire population available, so you don't need to infer, you may simply measure. And my guess is that you did not mean to imply you have the entire population sampled; your 10,000 is actually a sample.",2013-10-14 10:36:39.233 +104114,57428,503.0,CC BY-SA 3.0,,"What is the ""ID3 algorithm""? If it requires you to discretize continuous variables, it may be better to use some other method. Certainly there are classification tree methods (from your tag) that do not require binning.",2013-10-14 11:09:23.413 +104115,57427,21985.0,CC BY-SA 3.0,,"Thanks a lot! Did not expect such a detailed explanation, but it's great!",2013-10-14 11:51:15.533 +104117,57383,8063.0,CC BY-SA 3.0,,"The only slight problem with this approach is that there are no chiptype specific annotation packages for Agilent drosophila chips. But it can be overcome, as the Agilent output chip itself maps the probes to genomic positions. Or I can do it the hard way and go through biomaRt and annotate the genes (not probes) to cytobands on my own.",2013-10-14 12:44:00.773 +104118,57427,20473.0,CC BY-SA 3.0,,You're welcome Michael. 
I remember you are a biologist taking a stats course. How is it going so far?,2013-10-14 13:15:32.487 +104119,57423,12683.0,CC BY-SA 3.0,,"(+1) For the OP's benefit, stationarity of the original series can be checked visually by examining whether the sample auto-correlation function falls off to noise quickly (exponentially).",2013-10-14 13:30:23.733 +104120,57435,594.0,CC BY-SA 3.0,,One starting point - the paper mentioned in the help for that function is [here](http://dss.ucsd.edu/~hwhite/pub_files/hwcv-041.pdf). The paper defines the term on its second page (p270). Someone who knows the stuff better than me should probably write an answer though.,2013-10-14 13:43:16.020 +104122,57421,12683.0,CC BY-SA 3.0,,"Perhaps you could post a graph of the time series, its ACF, & its PACF. How long is the time series anyway? I'm sure @vinux is right about over-differencing. Did the error variance increase when you differenced?",2013-10-14 13:57:34.813 +104263,57483,22659.0,CC BY-SA 3.0,,"Thank you Dougal, I will keep that in mind for the future.",2013-10-15 03:05:40.080 +104124,57237,668.0,CC BY-SA 3.0,,"Your comments indicate you are assuming that $\mathbf{a}$ and $\mathbf{v}$ are independent. That cannot possibly be, because there are physical limitations to speeds: that means many accelerations will not be experienced at the most extreme speeds. However, it's not easy to provide more detailed advice because you haven't articulated what you're trying to accomplish; instead, you have described an approach to solving an unstated problem. Why don't you change this question and ask instead about the problem you need to solve rather than how to implement a solution that looks invalid?",2013-10-14 14:14:52.903 +104125,57436,7949.0,CC BY-SA 3.0,,"Yes, thanks. At first I always mistakenly used x.and.y then I noticed the mistake and fixed it in just the first line; the R code continued working for me since I had not cleared my workspace. Fixed now.",2013-10-14 14:18:07.023 +104127,57427,21985.0,CC BY-SA 3.0,,Hey Alecos! It's very interessting! But I am laking many mathematical tools. That is why I ask so many questions here. But the good thing is that I understand the stuff after solving the exercises ;-),2013-10-14 14:25:24.927 +104128,57421,2149.0,CC BY-SA 3.0,,"The data would be useful as the ACF is merely a descriptive summary statistic whose form can arise from a number of possible ""causes"". Please post your data so an informed analysis can proceed.",2013-10-14 14:29:55.240 +104129,57423,2149.0,CC BY-SA 3.0,,"@Scortchi Whereas the acf can suggest non-stationarity, non-stationarity is a symptom and can have multiple causes.",2013-10-14 14:32:14.530 +104130,57436,1693.0,CC BY-SA 3.0,,"I’d like others' help in assessing whether your example constitutes a sound litmus test. But I‘m seeing how your example supports your conclusion. I’ve tried out your simulation about 30 times, using a variety of sd.error values from 10 to 50. The sd.error and the Z coefficient are correlated at 0.18 with p = .3.",2013-10-14 14:57:45.360 +104131,57436,1693.0,CC BY-SA 3.0,,"Still, doesn't it trouble you to think that nearly ineffectual control for covariates would have the same expected effect on a focal coefficient as very thorough control would?",2013-10-14 15:00:43.153 +104132,57438,5237.0,CC BY-SA 3.0,,"This isn't really clear. Can you state your situation in simple English w/ a concrete example? What is your response variable? Is it a count of success & failures, or a utility? 
How is the experiment set up?",2013-10-14 15:02:18.937 +104133,57436,1693.0,CC BY-SA 3.0,,I think a stumbling point is that changing your sd.error exerts an equal effect on both X's and Z's squared correlation with Y. I've edited my question very slightly to reflect my interest in what happens when X's connection to Y (and not Z's) gets stronger.,2013-10-14 15:22:31.163 +104134,57418,19043.0,CC BY-SA 3.0,,Thanks. Now I can do a better job searching this out on my own.,2013-10-14 15:26:22.343 +104135,57439,2149.0,CC BY-SA 3.0,,"If the series has one or more shifts in the mean , I believe some if not most of your recommended tests will falsely conclude about the need for differencing as a remedy . Differencing is one from of remedy for non-stationarity but ny no means (play on words) is not the only form. See the following article on the flaws of differencing http://www.autobox.com/makridakis.pdf. Other possible causes for apparent non-stationarity that do not require differencing are 1:) time-varying parameters and 2) time varying error variance",2013-10-14 15:27:09.863 +104136,57421,5637.0,CC BY-SA 3.0,,"@thigger, you don't need to difference a series for a deterministic trend. I guess in your case stationary model with trend ($X_t=a + bt + Z_t$, where $Z_t$ is stationary series) would fit your data.",2013-10-14 15:28:51.677 +104137,57440,10147.0,CC BY-SA 3.0,,Thanks so much. Can you please give a reference to the graphical methods used to choose among the 4 models?,2013-10-14 15:29:08.860 +104138,57442,5637.0,CC BY-SA 3.0,,This may help. http://stats.stackexchange.com/questions/56538/how-to-test-heteroskedasticity-of-a-time-series-in-r,2013-10-14 15:36:17.177 +104139,57426,9716.0,CC BY-SA 3.0,,"You can split your data to build a model (say 0.7), then test your model on the remaining data. Just a thought, i'm not a specialist on this area.",2013-10-14 15:41:27.993 +104141,57438,5237.0,CC BY-SA 3.0,,"I gather your experiment will be *adaptive*, the nature of the next problem that a student tries will be dependent on the set of problems the student has not yet done, or done correctly, & for those that were missed before: how long it has been since they tried & how easy they thought it was. What you are trying to figure out here is how to program the experiment with respect to how long / how many intervening problems the student should see before you show then the same problem again. Is that correct?",2013-10-14 15:45:47.370 +104142,57440,2666.0,CC BY-SA 3.0,,"Stratify by a top predictor and within each stratum compute the inverse transformation (logit, etc.) of the cumulative proportion $Y \geq y$ for all $y$ above $min(Y)$. Look for parallelism.",2013-10-14 15:55:54.847 +104143,57438,13385.0,CC BY-SA 3.0,,"@gung: Yes, that's right. But the dependency of the next problem on the ""current"" one is weak. New problems are randomly drawn from a bank. The next one seen is only dependent because it might have been seen and scheduled already.",2013-10-14 15:56:56.357 +104144,57426,20470.0,CC BY-SA 3.0,,"Yes, thank you. 
It is more the suitability of HMMs for the task that I am unsure about.",2013-10-14 15:59:25.960 +104145,57443,15827.0,CC BY-SA 3.0,,Does AUTOBOX do ARCH/GARCH as well as ARIMA?,2013-10-14 15:59:40.610 +104146,57421,12683.0,CC BY-SA 3.0,,"@vinux: Good point, & I think that might, if the error variance is small compared to the trend, explain the apparent need for a big MA term after differencing - you'd be introducing negative auto-correlation.",2013-10-14 16:03:13.817 +104147,57437,15827.0,CC BY-SA 3.0,,"It's almost axiomatic that statistical people don't have good data on anything like uses of statistical methods, statistical software, etc. But you could search your favourite citation database to get some indications. But relative use doesn't tell you much about desirability, at least not much more than newspaper circulation or book sales figures tell you what's worth reading.",2013-10-14 16:03:21.570 +104148,57444,22143.0,CC BY-SA 3.0,,A couple of clarifications: 1) Is indexing by $i$ important to the question? 2) You want $P(Y \leq y | X = x)$ but do you know the conditional distribution $P(Y=y|X = x)$?,2013-10-14 16:09:44.317 +104149,57435,1889.0,CC BY-SA 3.0,,Presumabbly it is when you use a linear model when the underlying process is not linear,2013-10-14 16:28:13.880 +104150,57429,20473.0,CC BY-SA 3.0,,"It would help to clarify by defining explicitly and in mathematical terms what $Y$ is. ""Payback rate"" is a very general term. Is it measured as a ratio of monetary values? Does it count instances of payments-non payments? Is it a weighted average of the behavior of old and new customers? And if yes, what are the weights used? Etc. How one models the RHS has obviously a great deal to do with what exactly does this RHS attempt to explain (the LHS of the regression specification). Otherwise, it would be blind mechanical search for a good fit.",2013-10-14 16:34:38.473 +104151,57444,19681.0,CC BY-SA 3.0,,Why not use logistic regression? You have a binary outcome: greater than 0.1 or not.,2013-10-14 16:35:32.293 +104152,57444,22262.0,CC BY-SA 3.0,,"@Theja I'm not sure I follow. If I know the latter then I automatically know the former with integration. Also I think the notation is confused -- in the former you use $P$ to mean probability, but in the latter you use it to refer to a density function.",2013-10-14 16:35:32.620 +104154,57444,19681.0,CC BY-SA 3.0,,OK. Maybe it would help clarify your problem if you explain why logistic regression is not ideal here.,2013-10-14 16:38:24.027 +104155,57444,22262.0,CC BY-SA 3.0,,"@zkurtz I'm fielding alternatives; I will definitely try logistic regression. I just don't want it as the answer because I already know that approach. Just wondering; don't I want to avoid discretizing a continuous response (here, $Y$) and avoid throwing away the information if I can help it?",2013-10-14 16:43:11.390 +104156,57447,22262.0,CC BY-SA 3.0,,"Could you please clarify ""it doesn't appear that $Y$ is binary"". In the question I said it was continuously normally distributed so this comment is a bit confusing to me.",2013-10-14 16:44:02.567 +104157,57444,19681.0,CC BY-SA 3.0,,"@user2763361, I guess the question is whether that information is relevant. You seem to have asked a binary question: is Y greater than 0.1 or not? 
If you had asked ""what is the distribution of Y"", then reducing to binary would certainly be silly and quantile regression would be obviously advantageous.",2013-10-14 16:53:15.433 +104158,57442,2149.0,CC BY-SA 3.0,,@vinux The tests you recommend all require an error process that is free of pulses/level shifts/seasonal pulses/local time trends and has time invariant parameters and no points in time where the error varince changes deterministically.,2013-10-14 17:02:33.903 +104159,57444,22143.0,CC BY-SA 3.0,,"@user2763361, sorry about the abuse of $P()$. I kind of agree with what zkurtz is suggesting. If the only aim is to find P(Y \leq 0.1), then create a dependent variable Y' = 1 if $Y \leq .1$ and 0 otherwise. Then, run logistic regression. From that you can get the probability directly since logistic function's output $\in [0,1]$.",2013-10-14 17:02:50.413 +104160,57421,2149.0,CC BY-SA 3.0,,This series could have three local trends plus 2 anomalous data points. Only the data can correctly speak ! Please post the actual data .,2013-10-14 17:22:39.190 +104161,57407,15377.0,CC BY-SA 3.0,,"Thanks everyone for your quick pointer. However at this point, I realize the question I asked for is not properly specified. I should mention a caveat that, the quantiles should be of the same side of Origin (assuming location parameters for both r.v.s are zero). For example, let say 5th and 1st quantile of 2 random variables. My question is that, given 2 real valued r.v.s, if 5th quantile of a r.v. is smaller than 5th quantile of 2nd r.v. then 1st quantiles also will be of similar order? Is it a necessary property? Or there exist random variables for which such rule may not follow?",2013-10-14 17:29:16.163 +104162,57443,15827.0,CC BY-SA 3.0,,"Thanks; I take that as ""Not directly, and we think justifiably"".",2013-10-14 17:29:46.707 +104163,57444,12282.0,CC BY-SA 3.0,,"I've read this question several times, and I can't figure out what you're asking. Looking at some of the other replies I think I'm not alone. You bring up a design matrix, so I assume there's some explanatory variables which it gets multiplied by. But then you say Y is distributed $N(0, \sigma^2)$, with no mention of X - if we know the distribution of Y, why care about X at all? Is Y actually the noise? And what does ""...my model has estimated the $\tau$-th quantile to be 0.1, which is point on the x-axis in Y's pdf."" mean?",2013-10-14 17:39:34.350 +104164,57444,12282.0,CC BY-SA 3.0,,"...(continued) I think you need to edit your question and add in a section where you define all the variables, explicitly state their relationships, and say exactly what you're trying to find. Give us too much information. I think the answers you get will get better then.",2013-10-14 17:40:54.013 +104165,57443,2149.0,CC BY-SA 3.0,,@Nick Correct ! We just use the language here as we really don't know it .,2013-10-14 17:48:39.977 +104166,57421,12683.0,CC BY-SA 3.0,,Really looks like @vinux is right & you've created a non-invertible series by differencing a series with deterministic trend. Try de-trending as suggested & then see what you've got left to deal with (@IrishStat is also right that there seem to be a couple of outliers there). 
Note as well you've a very small sample size for ARIMA - if the purpose is to forecast I'm not sure I wouldn't be using exponential smoothing unless I'd some background knowledge to go on.,2013-10-14 18:02:12.820 +104167,57383,,CC BY-SA 3.0,anon,"You're right, of course, Bioconductor project does not distribute the annotations for the Drosophila array. Just in case you decide to try the cytoband approach, I have produced an annotation package using the refseq information from Agilent, and it is shared on my Google Drive at https://docs.google.com/file/d/0B5F_KFI2_sBKZzZXR3ZsV2pvQ1k/edit?usp=sharing. I just updated it today.",2013-10-14 18:25:12.977 +104168,57442,5637.0,CC BY-SA 3.0,,"I agree with you @IrishStat. But, usually financial time series are free from mean level pulses or the volatility part is dominated than conditional expectation. Anyway I was trying to give an option for the tests in R.",2013-10-14 18:30:20.563 +104169,57421,22669.0,CC BY-SA 3.0,,@Scortchi - if I'm honest the main purpose here is to teach myself more about time series analysis - the data were collected to demonstrate that there's a change (which is pretty definite on any analysis); I just saw the opportunity to play with it and try to examine the properties of the change. I have a version averaged weekly (with consequently 4x the data points) but it's much noisier. I'm itching to have a go with vinux's model suggestion now!,2013-10-14 18:35:54.037 +104170,57451,668.0,CC BY-SA 3.0,,"Your integrals make no sense, because $U$ and $u$ have different meanings (one is a random variable, the other is a dummy variable of integration) and $W$ and $w$ have different meanings. What solution did you get using double integrals and why is it important to obtain one using triple integration? (I obtain $5/36 + \log(2)/6 \approx 0.254413$.)",2013-10-14 18:37:12.347 +104171,57351,,CC BY-SA 3.0,,"I have two questions. 1) Not sure if I understand this properly, can I interpret the expectation as one of the first two equations, if either X or Y has been fixed? 2) Can you give an example for EQ 4 and EQ 5? I have a hard time interpreting them and I think concrete examples would help. Thanks!",2013-10-14 18:50:23.860 +104172,57451,21840.0,CC BY-SA 3.0,,"@whuber Using double integral, I obtained 0.2545. I want to see how can we solve using triple integral.",2013-10-14 18:51:01.600 +104173,57442,22677.0,CC BY-SA 3.0,,"@vinux just out of curiosity, how do you know that i was referring to financial time series on this question instead of other field of science, does ARCH/GARCH and ARIMA model only exist on financial studies?",2013-10-14 18:56:17.833 +104174,57451,668.0,CC BY-SA 3.0,,"Perhaps you should show your demonstration using double integrals, because your triple integrals still make no sense.",2013-10-14 18:57:14.797 +104175,57421,22669.0,CC BY-SA 3.0,,"@vinux - I feel an idiot for ignoring the simplest solution, thanks! A simple ""reg y D_date_MY"" already gives a better model than the ARIMA one where the MA term was desperately trying to eject itself. I'll have a look to see if it can be improved by adding a stationary model (though I'm suspicious not!)",2013-10-14 18:59:58.547 +104221,57462,22143.0,CC BY-SA 3.0,,"@whuber, though I did put forth the idea of sampling uniformly from the surface, I am not sure if it leaves gaps as you mention. 
In fact, I think one way to achieve uniform sampling is to fit a grid to your planar surface and pick some of these points uniformly at random with replacement.",2013-10-14 21:33:32.957 +104261,57484,22698.0,CC BY-SA 3.0,,"Yes I mean that rho=cor(x,y)=cor(x,z)=cor(y,z) and what are the limits for rho. Dilip, can you extend that to say that rho must be non-negative, ie >= 0?",2013-10-15 02:44:22.660 +104176,57453,5237.0,CC BY-SA 3.0,,"Welcome to the site, @Spy_Lord. This question seems to be *only* about how to do this in R. Thus, it may be off-topic for CV (see our [help page](http://stats.stackexchange.com/help)); but could be on-topic on [Stack Overflow](http://stackoverflow.com/). If you have a statistical question about RF, please edit to clarify; if not, we could migrate it for you (*please don't cross-post*). However, it will need a [reproducible example](http://stackoverflow.com/questions/5963269/) to be on-topic there; so you'll need to show what you've tried so far & add a `dput()` of your data.",2013-10-14 19:03:10.937 +104177,57453,5237.0,CC BY-SA 3.0,,This question does not appear to be about statistics within the scope defined in the help center.,2013-10-14 19:04:06.067 +104178,57396,6162.0,CC BY-SA 3.0,,What is your criterion ? What I had in mind is for example to find $n$ such that the length of the confidence interval is below a prespecified maximal length.,2013-10-14 19:09:37.893 +104179,57372,21947.0,CC BY-SA 3.0,,"@IslamEl-Nabarawy If I understand correctly, you found that there was no significance for several statistics except for one specific test. Assuming you are testing against the null hypothesis that ""one or more of these statistics is significant"", but that each $p$-value was determined independently, then your $p$ values are too small. (Unfortunately, I doubt that there are good tools to control for this; standard methods such as the Bonferroni correction are not applicable to this test.) Nevertheless, this *bolsters* your hypothesis that there are no significant relationships.",2013-10-14 19:10:31.990 +104180,57351,20473.0,CC BY-SA 3.0,,"@ceiling cat 1) $E[h(X,\bar y)] = \int_{-\infty}^{\infty} h(x,\bar y) f_X(x)dx$ is correct because essentially you do _not_ have _two_ random variables any more. Likewise for fixing $X$ to $\bar x$.",2013-10-14 19:10:33.973 +104181,57451,21840.0,CC BY-SA 3.0,,@whuber I have posted the solution using double integral.,2013-10-14 19:14:16.933 +104182,57453,22682.0,CC BY-SA 3.0,,"Ah OK, my apologies. I would be alright with a solution outside R, so I suppose it overlaps slightly between CV and Stack Overflow. However user31264 looks like he's given me a workable solution anyway.",2013-10-14 19:15:22.533 +104183,57351,20473.0,CC BY-SA 3.0,,"@ceiling cat 2)-EQ5 : Consider $Z = X^2(Y-(Y+2)^3) = h(X,Y)$. $Z$ is a random variable alright (for an appropriate support). Then using the specific meaning for the short hand notation, $E_X(Z)=E_X[(h(X,Y)] = \int_{-\infty}^{\infty} x^2(y-(y+2)^2) f_{X}(x)dx$ where $f_{X}(x)$ is the density of $X$ (whatever that is). Obviously $Y$ is not integrated, and it will stay intact. But the result you will obtain won't be a number (as in my previous comment), but a random variable (a function of $Y$), since $Y$ here is _not_ fixed, just not-integrated out.",2013-10-14 19:18:29.690 +104184,57351,20473.0,CC BY-SA 3.0,,"@ceiling cat In both cases in my two previous comments, the ""mechanics"" of mathematical calculations will be the same. 
The end results though have different interpretations.",2013-10-14 19:22:14.193 +104185,57351,20473.0,CC BY-SA 3.0,,"@ceiling cat 2)-EQ4: Consider the same random variable $Z$. Its expected value conditional on $X$ is (using the other meaning for the shorthand notation) $E_X[Z] = E(Z\mid X) = \int_{-\infty}^{\infty} z f_{Z|X}(z\mid x)dz$. Note that here the $x$'s and $y$'s do not appear directly in the integrand -they are ""condensed"" in the $z$ symbol.",2013-10-14 19:29:14.960 +104187,57455,22143.0,CC BY-SA 3.0,,"What is the feedback? Is it updating the algorithm only if it makes error or is it updating the algorithm with the new data irrespective of how the algorithm performed? Depending on the situation, the effect may be different.",2013-10-14 20:01:10.150 +104188,57448,22656.0,CC BY-SA 3.0,,Thanks a lot! Would you please introduce a reference for this inequality?,2013-10-14 20:04:57.153 +104189,57452,22143.0,CC BY-SA 3.0,,Can you clarify: Do we have a 1-D distribution $D$ from which we get $N_s$ examples $\{x_i\}_{i=1}^{N_s}$? By population size $N_p$ do you mean that the values which random variable takes is finite and the size of this set of values is $N_p$?,2013-10-14 20:13:38.957 +104190,57396,6162.0,CC BY-SA 3.0,,"About your update: yes, this is what I said, if you count the sample values $x_i$ such that $x_i \leq c$ then this count has a binomial distribution.",2013-10-14 20:14:29.053 +104191,57412,,CC BY-SA 3.0,,I asked a similar question at http://stats.stackexchange.com/questions/72570/simulating-from-posterior-predictive-over-many-periods but it looks like yours has received more up votes so far.,2013-10-14 20:15:12.887 +104192,57418,594.0,CC BY-SA 3.0,,Also sometimes '*a priori* contrasts' or 'planned comparisons'.,2013-10-14 20:19:04.030 +104193,57421,,CC BY-SA 3.0,,"@thigger It's worth bearing in mind that, as a rule, one should have at least N=50 observations in order to build an ARIMA model. If I understand correctly, the series to be modeled contains N=31 obs, which is a good deal less than N=50. Furthermore, one generally only considers N/4 partial- and auto-correlation coefficients, so, in this case, only the first 8 lags in the ACF and PACF are relevant for identifying tentative models. For y; both the ACF and PACF have two significant spikes at lags 1 and 2. For D.y; both the ACF and PACF have one significant spike at lag 1.",2013-10-14 20:25:23.803 +104195,57421,,CC BY-SA 3.0,,"@thigger Essentially, I'd like to emphasize the point made by Scortchi that your sample size is very small. Hopefully my other comments provide you with some useful help too.",2013-10-14 20:28:43.740 +104196,57452,22627.0,CC BY-SA 3.0,,"@Theja: yes, the population distribution is 1-D, and $N_p$ is the number of values from the distribution. Here's the background: the shape of a manufactured part is deemed acceptable iff the maximal deviation $X_p$ from the desired shape, as measured over $10^9$-ish locations, is small enough. In fact, $X_p$ became the measure of the shape's fitness in the industry. It's infeasible to measure (manufactured - desired) at every of the $10^9$ location, so only a sample of $10^5$-ish points are measured. Thus the question: how to estimate $X_p$ given $X_s$ of the sample and the sample's momenta.",2013-10-14 20:42:10.310 +104197,57458,5237.0,CC BY-SA 3.0,,"Your latter strategy is [winsorising](http://en.wikipedia.org/wiki/Winsorising). Statisticians are usually leery of this, & prefer to use [robust analyses](http://en.wikipedia.org/wiki/Robust_statistics). 
As far as what the instructor had wanted students to do, I can't say for sure; your best bet would be to see if you can find it in the materials that are online, or send them an email. Regarding how to speed up these algorithms in `R`, that's off-topic for CV (see our [help page](http://stats.stackexchange.com/help)), but should be on-topic on [Stack Overflow](http://stackoverflow.com/).",2013-10-14 20:45:19.070 +104199,57461,22685.0,CC BY-SA 3.0,,"I think i may know the answer to this question. But I'm not 100%. If X and Y have a statistical distance that is e(k)-close then depending on the value of ""k"", X and Y could be ""statistically indistinguishable"", which is a good thing in cryptography because you don't want an adversary to be able to easily guess which distribution you sampled a variable from.",2013-10-14 20:49:55.787 +104258,57477,594.0,CC BY-SA 3.0,,*Every* part of the correct formula is covered by my discussion above. Can you more clearly explain what you don't get and I can point to where it's already covered by my answer. I can try expanding my explanation further.,2013-10-15 02:17:20.650 +104259,57484,22507.0,CC BY-SA 3.0,,"Do you mean that rho=cor(x,y)=cor(x,z)=cor(y,z), and what are the limits for rho?",2013-10-15 02:23:44.780 +104200,57460,1693.0,CC BY-SA 3.0,,"To me, the relative strength of coeff.'s matters a lot, as I tried to say in a recent comment and edits. And I've just learned about Leamer, E. E., A Result on the Sign of Restricted Least Squares Estimates,"" Journal of Econometrics, 3 (1975), 387-390. See a brief summary at http://davegiles.blogspot.com/2013/05/when-can-regression-coefficients-change.html. Apparently in OLS there is a minimum predictive power required of 1 variable (relative to that of another) in order for its inclusion to cause a sign change for the other. I'd like to know the rule for groups of covariates, in logit.",2013-10-14 20:55:01.523 +104201,57396,,CC BY-SA 3.0,,"I suggest you take a look at _inverse binomial sampling_. This is a sequential method that adaptively selects sample size to _guarantee_ a certain confidence level for a prescribed _relative_ confidence interval. So, for example, this method can assure that the estimated probability does not deviate from the true probability by more than, say, 10% with 95% confidence. Take a look at an explanation here (see especially the last reference): http://stats.stackexchange.com/questions/71164/monte-carlo-estimation-of-probabilities/71228#71228",2013-10-14 20:55:45.960 +104202,57455,14748.0,CC BY-SA 3.0,,"Feedback is used irrespectively, right or wrong.",2013-10-14 20:58:49.580 +104203,57461,668.0,CC BY-SA 3.0,,"Have you consulted the [Wikipedia article on ""statistical distance""](http://en.wikipedia.org/wiki/Statistical_distance), which points out there are *many* different distances? Which definition does your reference use?",2013-10-14 20:59:22.113 +104204,57456,19265.0,CC BY-SA 3.0,,"I know, that they are typically nonzero when the 2-class data is non-separable. +I know, that we are trying to minimize their sum. +But I don't know, what is the loss function they are calculated with. 
Is it a step function or hinge loss or something else?",2013-10-14 20:59:52.853 +104205,57457,668.0,CC BY-SA 3.0,,"Although you did not use integral notation, you indeed did compute a triple integral through a process of three successive integrations (in a somewhat mysterious way)--and that's usually what somebody means when they request a ""triple integral.""",2013-10-14 21:01:45.703 +104206,57459,12683.0,CC BY-SA 3.0,,How do you know?,2013-10-14 21:06:34.853 +104207,57319,668.0,CC BY-SA 3.0,,"You might get more serious attention if you were to explain your undefined terms and acronyms, in particular ""RSQ"" and ""B."" Although many readers will make educated guesses, the more experienced of them will know that there are multiple possible correct guesses. For example, the meaning of ""B"" depends on how both `RA` and `HHS` are encoded, so even your statements about its sign are ambiguous.",2013-10-14 21:08:57.393 +104208,57459,22143.0,CC BY-SA 3.0,,"There are two options: 'thresholding' may mean remove those rows/observations where any of the values is not in $[20,16000]$ or it could mean what you just described. I think the latter is used in statistics since it generates censored data, thus my answer.",2013-10-14 21:12:33.207 +104209,57456,22143.0,CC BY-SA 3.0,,"Yes, it is the hinge loss. The hinge loss has been removed from the objective and made into a bunch of constraints (their number equalt to the number of examples, $l$ in your notation). In particular, $\max[0,1-y_iw^Tx_i]$ is the loss on example $i$.",2013-10-14 21:14:39.170 +104210,57462,668.0,CC BY-SA 3.0,,"(1) I am curious why MaxEnt might apply here, because it seems to me that the more long-tailed the underlying distribution becomes, the more uncertain any sample-based estimate of its maximum will be. That suggests this principle might not even be relevant to the question. (2) Of what value is an inequality relating the maximum of a sample to its expectation when the concern is about the maximum of the *population*? This inequality seems to ignore the potentially huge negative bias in using the maximum of a sample to estimate the population max.",2013-10-14 21:16:18.430 +104211,57462,668.0,CC BY-SA 3.0,,"(3) Why make measurements uniformly at random, which is known to leave fairly large spatial gaps with high probability, when other procedures--such as gridded samples--will surely leave smaller gaps?",2013-10-14 21:17:22.107 +104212,57457,6162.0,CC BY-SA 3.0,,"@whuber Why mysterious ? I only use the formula $E[f(X,Y)]=E[E[f(X,Y) \mid Y]]$ at each step.",2013-10-14 21:18:33.090 +104213,57458,12683.0,CC BY-SA 3.0,,Not sure - I've not heard of 'thresholding' before. I'd guess it means 'Winsorizing' just because if he'd meant 'discard data less than 20 or more than 16k' it would have been straightforward to say just that. But people don't always like to be straightforward. On the other hand you cross thresholds rather than piling stuff up on them.,2013-10-14 21:21:06.817 +104214,57463,668.0,CC BY-SA 3.0,,"As explained in comments at http://stats.stackexchange.com/questions/57847/formula-to-calculate-a-t-distribution, there are trade-offs between computation time, storage, and programming complexity. What are your preferences concerning those? What is the anticipated range of degrees of freedom? How accurate do the calculations need to be?",2013-10-14 21:21:19.300 +104215,57457,668.0,CC BY-SA 3.0,,"The mysteries lie primarily in the details, which are carried out wholly without explanation. 
A look at the question shows the O.P.'s effort fell apart in not recognizing the importance of tracking the domains of the RVs, as evidenced by the appearances of ""$\min$"" and ""$\max$"" in this answer. Thus, although you have provided a *correct* answer, it does little to reveal what went wrong or to explain the methods you have used to break up the integrals and identify the proper ranges to use in each one.",2013-10-14 21:24:24.487 +104216,57458,22507.0,CC BY-SA 3.0,,"What is the meaning of your data? For example, what is U58516_at? Is it a gene? a specie? an animal? What is X1, X2, etc.? What the positive and negative numbers mean? From the question one can only understand that there are columns starting with U and ending with _at, there are rows starting with X, there are numbers corresponding to each column/row pair, and all this is somehow connected to genetics. Also, why shouldn't you ask your professor what he or she means by thresholding?",2013-10-14 21:25:00.470 +104217,57463,22687.0,CC BY-SA 3.0,,"@whuber I'm afraid I don't see any implementations of the actual formula in that question. There is a link to Wikipedia's article on the Student's T, but I can't find the formula for the density function there. There is also a link to an R builtin, but my software package is written de novo in embedded C. +Basically I just need a pointer at somewhere that clearly describes the math and I can handle over the programming from there.",2013-10-14 21:26:47.753 +104218,57462,22143.0,CC BY-SA 3.0,,"I am not applying MaxEnt per se. I chose uniform distribution over an interval as my data model. GIven that, the estimator for the upper interval bound is the answer given in my answer. I alluded to maxEnt because it says (I think) when you have no information, use the distribution with the maximum entropy.",2013-10-14 21:28:40.640 +104219,57462,668.0,CC BY-SA 3.0,,"I don't believe MaxEnt says anything like that at all: you must always bear in mind the *purpose* of the distributional assumption. I can't decipher what else you said in that comment--too many ""estimators"" appear in one sentence--but nevertheless I still see nothing in this answer that directly relates properties of the sample to the maximum of the *population.*",2013-10-14 21:31:43.027 +104220,57461,22685.0,CC BY-SA 3.0,,@whuber - I think Kolmogorov–Smirnov statistical distance. But I'm not 100% sure.,2013-10-14 21:32:02.537 +104222,57463,668.0,CC BY-SA 3.0,,"I did not refer you to that thread for its formulas--you are correct, it unfortunately lacks any--but to point out that there are *myriad* ways to compute the t distribution. It's not really a math question, but one of scientific programming. For instance, in your situation an attractive solution might be to store a few tables and interpolate within them, because then you won't have to program any kind of numerical integration routines. If you don't disclose your engineering constraints and objectives, you will reduce the opportunity to learn about such options.",2013-10-14 21:35:20.047 +104223,57462,22143.0,CC BY-SA 3.0,,"I am claiming that the $max_{i=1,..,N_s}x_i$ is the estimator of the maximum of the *population* here. This is motivated from the fact that if the unknown distribution was uniform, the estimator would make sense. 
In other cases, I do not know how to construct the estimator from the sample which can tell me the maximum of the population.",2013-10-14 21:37:37.823 +104224,57462,668.0,CC BY-SA 3.0,,let us [continue this discussion in chat](http://chat.stackexchange.com/rooms/11061/discussion-between-whuber-and-theja),2013-10-14 21:38:02.153 +104225,57447,2666.0,CC BY-SA 3.0,,Someone suggested using binary logistic regression but that would not be the best choice if $Y$ is not binary.,2013-10-14 21:39:50.870 +104227,57416,22507.0,CC BY-SA 3.0,,It *is* an answer. The topic starter asked for a name of a statistical test.,2013-10-14 21:45:05.577 +104228,57416,5237.0,CC BY-SA 3.0,,"I agree that it is an answer, that's why I didn't vote to delete it when SE's software automatically flagged it. However, it would be nice if you could expand it a little.",2013-10-14 21:47:29.183 +104229,57463,22687.0,CC BY-SA 3.0,,"@whuber Well, that brings me back to the question title. If I were to build a table offline that I then cubic-interpolate at runtime, how can I build that table?",2013-10-14 21:49:47.730 +104230,57407,594.0,CC BY-SA 3.0,,"Ron, I have added your change to your question and responded to it. Please note that the way you worded it in your comment doesn't make sense (what's the ""fifth quantile""? What's the ""first quantile""?), but with that part left out, we can get somewhere.",2013-10-14 21:52:15.027 +104231,57457,6162.0,CC BY-SA 3.0,,"@whuber Yes, I do not pretend this is the expected answer :) (though my answer could help to perform the ""true"" triple integral calculation).",2013-10-14 21:54:59.367 +104232,57467,503.0,CC BY-SA 3.0,,Look into equivalence testing.,2013-10-14 22:00:46.957 +104234,57467,594.0,CC BY-SA 3.0,,"You can't *show* that it's 0, since it can be arbitrarily close to zero while being unequal to it. e.g. if $b_1 = 0.0000001$ then $b_1\neq 0$ - and you'd ideally reject that point null, yet with reasonable values for and moderate sample size (and for the disposition of the $x$'s I guess), you can't. Peter's suggestion to consider equivalence testing is a good one (but it's showing something a bit different from what you're asking).",2013-10-14 22:04:06.460 +104235,57468,5045.0,CC BY-SA 3.0,,"Have you tried $y^{\frac{1}{2}}$ or $y^{\frac{1}{3}}$? These can be applied to zero (and negative values for the cube root), which is probably why you are loosing data. It might help if you post some summary statistics of your raw data (min, max, mean, median, kurtosis, skewness).",2013-10-14 22:09:08.430 +104236,57463,594.0,CC BY-SA 3.0,,"For small integer d.f. you can do integration by parts. For larger d.f. you might do it by numerical integration, or by identifying a suitable approximation for the cdf (or some equivalent), some of which are in published algorithms.",2013-10-14 22:09:54.417 +104237,57463,22687.0,CC BY-SA 3.0,,@Glen_b Where can I find one of those published algorithms? I lack the necessary statistics background to know the right words to punch into Google Scholar.,2013-10-14 22:15:00.820 +104238,57467,16469.0,CC BY-SA 3.0,,"Thanks @PeterFlom, equivalence testing is what I was looking for. If you write it as an answer I will gladly accept it.",2013-10-14 22:49:54.187 +104239,57463,594.0,CC BY-SA 3.0,,"Some algorithms for the cdf of the t are based on the incomplete beta function (which is a commonly used function in various parts of mathematics or physics). 
Plain googling on *algorithm cdf|""distribution function"" student t* turns up plenty of references within the pages linked (e.g. [here](http://devdoc.madlib.net/v0.2beta/student_8cpp.html)), such as Abramowitz and Stegun's *Handbook of Mathematical Functions* (which gives some small-d.f.-exact and approximate calculations), and various other books and papers.",2013-10-14 23:02:04.970 +104240,57463,594.0,CC BY-SA 3.0,,"If you want the noncentral t (e.g. for power calculations) a standard reference is Lenth, R. V. 1989. ""Algorithm AS 243: Cumulative distribution function of the noncentral t distribution"". *Applied Statistics*, 38, 185-189.",2013-10-14 23:02:43.397 +104242,57468,594.0,CC BY-SA 3.0,,"What kind of data do you have (e.g. is it count data, measurements)? Why did you transform it?",2013-10-14 23:06:07.333 +104243,57449,594.0,CC BY-SA 3.0,,"Along the lines of $\sum_{j=1}^{n}(X_{j}-\bar X)(X_{j}-\bar X)^T +=\sum_{j=1}^{n} (X_{j}X_{j}^T-X_{j}\bar X^T-\bar X X_{j}^T+\bar X \bar X^T)$ +and $\sum_{j=1}^{n}\bar X X_{j}^T=\bar X\sum_{j=1}^{n} X_{j}^T=n\bar X\bar X^T = \frac{1}{n} X^T1 (X^T1)^T$ etc",2013-10-14 23:27:24.997 +104245,57465,14748.0,CC BY-SA 3.0,,"Thanks for your response. Just to clarify, what do you mean when you say ""influence the data source""? Once the algorithm goes live, it only gets fed examples where it had predicted 1's, since those are the only ones being checked. Does that ""influence the data source""?",2013-10-14 23:34:33.053 +104246,57473,594.0,CC BY-SA 3.0,,"If you want standard errors, your second formula shouldn't have squared terms, and it's not the standard error of the distributions but the standard error of the sample means you're talking about there.",2013-10-14 23:37:55.263 +104247,57473,5643.0,CC BY-SA 3.0,,"Yes sample means, not distribution means. Fixed the question.",2013-10-15 00:21:30.703 +104249,57474,10060.0,CC BY-SA 3.0,,Check **the other tables** of the outputs and look for the coding schemes. These two modules might have coded your sex variable differently; one might have used male as reference while the other one used female.,2013-10-15 01:10:33.537 +104250,57473,594.0,CC BY-SA 3.0,,You still have the first error I mentioned. I will fix it for you.,2013-10-15 01:22:57.853 +104251,57483,7483.0,CC BY-SA 3.0,,You might get more informative answers about the scikit-learn specifics if you ask at https://github.com/scikit-learn/scikit-learn/issues.,2013-10-15 01:31:28.460 +104254,57477,5643.0,CC BY-SA 3.0,,"I'm ok with the variance algebra, what I am asking is why the square root goes to the sum of the two variances divided into the sample sizes. Notice that in the second (wrong) formula the variance is already squared, that is we use the standard deviation.",2013-10-15 01:48:16.380 +104255,57484,5237.0,CC BY-SA 3.0,,"Presumably by ""pho"", you mean *rho* ($\rho$). However, your question is not clear. What do you mean by ""What’s the tightest bound you can give""?",2013-10-15 01:57:19.573 +104256,57273,22596.0,CC BY-SA 3.0,,"The problem is the simple part: how do you use MCMC Hammer to generate a model, and then fit that model to a pre-existing data set, when the model has a large amount of parameters?",2013-10-15 02:01:42.340 +104257,57484,22698.0,CC BY-SA 3.0,,"Well the name of the variable is just a dummy. By tightest bound, I mean something like [-1, 1] for a correlation, but this clearly isn't the tightest possible bound.",2013-10-15 02:08:41.217 +104260,57442,5637.0,CC BY-SA 3.0,,@FirhatNawfanH. Yes. 
Usually ARCH/GARCH mainly used in financial time series.,2013-10-15 02:42:06.040 +104264,57485,22659.0,CC BY-SA 3.0,,"Thanks Jacob. Theta can indeed be a vector, however the elements of the vector represent that theta parameter in D-dimensional space. For example, if I wanted a hyperparam $\sigma$ for that SE kernel to be its variance, then that works. However, say for some reason I also needed another hyperparameter $A$ that scaled the entire thing, i.e. I had A*exp(\sigma*d**2), then it doesn't work. What if I wanted this kernel with those two hyperparameters but with two features? So we have $\mathbf{\theta} = \{A,\sigma\}$, and in the 1-D case $A$ and $\sigma$ are both scalars, but in a 2-D case they",2013-10-15 03:10:32.340 +104265,57485,22659.0,CC BY-SA 3.0,,"must be 2-dimensional vectors themselves. So in terms of scikits code, see the final return value for the SE kernel, it reshapes theta just to make sure it has the proper formatting for a single row vector, where each element is a feature (but that kernel restricts $|\mathbf{\theta}|$ to 1). In a 2-D case I would expect $\mathbf{\theta}$ to be 2x2 matrix, where the first row corresponds to $A$ and the second row corresponds to $\sigma$, where the columns are the dimensions/features. Anyway, it's not actually like that, and it screams at me. I don't know the correct way to do this. Any insight?",2013-10-15 03:13:35.713 +104266,57485,7155.0,CC BY-SA 3.0,,"Write the function for corr with the parameters hardcoded. From a programmatic view you probably want to do it as a partial application. From a statistics view you lose the ability to specify upper and lower bounds of theta, which would require a rewrite.",2013-10-15 03:37:12.963 +104267,57485,22659.0,CC BY-SA 3.0,,"OK, that will probably be really tedious but maybe do-able for proof of concept at least. I really just wanted to make sure there wasn't another way and I was just not seeing it. Thanks.",2013-10-15 04:03:38.677 +104269,57473,594.0,CC BY-SA 3.0,,I have made the thing you called a sum of standard errors into a sum of standard errors.,2013-10-15 04:37:47.040 +104270,57487,668.0,CC BY-SA 3.0,,A derivation is sketched at http://stats.stackexchange.com/questions/72262/how-to-average-variables-having-given-standard-deviations.,2013-10-15 04:57:18.850 +104271,57492,668.0,CC BY-SA 3.0,,"For recommendations to have some objective support, they need to respond to a more specific question than this. Please consider indicating the intended audience (and their background) and the purpose of the text. Also, a ""handbook"" is so close to a ""textbook"" that it does not appear sufficiently to distinguish this question from the predecessor, at least not without further elaboration of your aims.",2013-10-15 05:02:34.957 +104272,57484,668.0,CC BY-SA 3.0,,"**Closely related**: http://stats.stackexchange.com/questions/5747. It's not quite a duplicate, but its answers provide some results for the general trivariate correlation matrix that are easy to specialize to this case. The two points they do not cover are sufficiency--namely, showing that any matrix whose coefficients satisfy the conditions actually is a correlation matrix for some distribution--and tightness (there are no better bounds).",2013-10-15 05:06:02.347 +104273,57473,668.0,CC BY-SA 3.0,,"Perhaps the most famous--and likely the earliest--completely intuitive demonstration of this relation was obtained by Galton with his [quincunx](http://www1.kcn.ne.jp/~h-uchii/quinc.html). 
(Don't visit Wikipedia or any of the top hits on this term, because they explain only its most trivial uses: the link I gave is the first I could find that reproduces one of Galton's illustrations and hints at the revelations this machine makes possible.)",2013-10-15 05:15:01.360 +104274,57473,668.0,CC BY-SA 3.0,,"The intuition is one of *cancellation of independent errors.* It took scientists rather a long time--approximately 150 years from the beginning of probability theory in the mid 17th century to the work of Gauss and Laplace around 1800 as applied to combining astronomical observations--to realize that this actually happens with measurements! Because such cancellation is at the heart of the Central Limit Theorem, [intuitions for that](http://stats.stackexchange.com/questions/3734) are relevant here, too.",2013-10-15 05:21:36.373 +104275,57474,22163.0,CC BY-SA 3.0,,"No, that is not it. I have been very careful to get the same reference for every variable in both logistic and Genlin. In every respect (since I have left out the repeated subcommand) the models should be the same.",2013-10-15 06:10:28.813 +104276,57469,16469.0,CC BY-SA 3.0,,I agree with that. But that is from the perspective of significance testing. I was looking for a technique specifically designed to test the hypothesis of whether two groups are equal (or a coefficient is zero).,2013-10-15 06:13:39.253 +104278,57422,22668.0,CC BY-SA 3.0,,"Thanks Gilbert. That was exactly the kind of answer that I was looking for -clear and precise. Do you have, or know any reference that I can include in my document concerning these comments? I have spent a couple of weeks looking for one -I have not been able to find anything discussing these points in the (clear) way that you just did.",2013-10-15 06:31:22.893 +104279,57436,7949.0,CC BY-SA 3.0,,@rolando2 This is a reply to your comment on ineffectual control: your question still needs a bit more refinement regarding your definition of reversal. Are you interested in a reversal of the expectations of the coefficients? The answer above and the one below adress that. Or do you want to consider the power to statistically *detect* reversal (e.g. reject the null hypothesis that the coefficients of both the full and reduced model have the same sign)? The later depends on the residual variance and the magnitude of the expectations of the coefficient (thus indirectly also on R squared).,2013-10-15 06:55:09.097 +104280,57446,20470.0,CC BY-SA 3.0,,"@ alto, thank you. As you say, I will be looking at $p = log(P(O|hmm))$, and values like $p_1 =-2504, p_2 = -2403, p_3= -2450$, etc. so spotting a significant increase in $p$ may be problematic. In the meantime, I think training HMM2 will be hard. The number of points I have for HMM2 (no event) will be much higher and there may be no patter but only noise. What do you think? **P.S**: I chose 5 in as my window size arbitrarily, it is likely to be longer than that in an actual implementation.",2013-10-15 07:13:31.163 +104281,57498,15827.0,CC BY-SA 3.0,,"There are many dedicated texts that give you the details. At the opposite extreme is this one paragraph. Data come in different forms (e.g. counts can only be zero or positive integers, some measurements can be positive only, some are less restricted). Also, data come in different shapes: experience with data shows that. 
So, statisticians and others have proposed many different models, some with mathematical derivations and underlying ideas about generating processes, and others just proposed more or less empirically as shapes that might be fair fits for at least some distributions.",2013-10-15 08:16:55.943 +104282,57492,20130.0,CC BY-SA 3.0,,"@whuber Thank you for your commentary. I described the details. The difference between ""handbook"" and ""textbook,"" as I see it, is in the depth of details and scope of topics. Handbooks in this case exclude discussions and explanation and concern upper-level topics usually omitted in textbooks.",2013-10-15 08:40:24.730 +104284,57503,12683.0,CC BY-SA 3.0,,"Shape is associated not just with skew, but with anything that isn't to do with central tendency or dispersion; a shape parameter might well affect higher moments. It's any parameter that isn't a location or scale parameter. And cross out 'mostly' in the second sentence unless you can think of an exception.",2013-10-15 09:20:55.697 +104285,57503,15827.0,CC BY-SA 3.0,,"@Scortchi Agreed on ""shape"". Vinux: A simple example is the Poisson. Change the mean and you change variance, skewness, kurtosis, ... It is not clear whether you regard such linked changes as usual or exceptional.",2013-10-15 09:33:13.637 +104286,57503,12683.0,CC BY-SA 3.0,,@Nick: But would you call the mean of the Poisson a location parameter? I wouldn't. Same goes for the mean of the exponential distribution - it's a scale parameter.,2013-10-15 09:36:55.283 +104288,57503,15827.0,CC BY-SA 3.0,,"I guess that mathematical statisticians have invariance and equivariance criteria for different kinds of parameter. I am reacting to @Vinux's line that location indicates central tendency and the idea that location and central tendency are the same ballpark. That's probably loose at best by mathematical standards, but it's a data analytic view.",2013-10-15 09:46:20.217 +104289,57503,12683.0,CC BY-SA 3.0,,Would it be right to say that for a location parameter $\theta$ the density $f(x;\theta)=f(x-\theta;0)$ and changing it only changes the mean; while for a scale parameter $\phi$ the density $f(x;\phi)=\frac{f(\frac{x}{\phi};1)}{\phi}$ and changing it only changes the variance & perhaps the mean?,2013-10-15 09:54:47.213 +104291,57503,5637.0,CC BY-SA 3.0,,I think the question is very broad. I am not sure about any standard definition for shape in distribution context. I was trying to give a layman's picture.,2013-10-15 09:59:38.470 +104292,57469,21762.0,CC BY-SA 3.0,,As user31264 and also glen_b said: You can't. Equivalence tests are only able to show that a parameter is *close* to a certain value but not that it has exactly this value.,2013-10-15 10:05:42.527 +104293,57513,21762.0,CC BY-SA 3.0,,How large is the reference group?,2013-10-15 10:07:02.830 +104294,57513,10594.0,CC BY-SA 3.0,,Each treatment group had equal number of $n=20$,2013-10-15 10:11:54.957 +104295,57423,22669.0,CC BY-SA 3.0,,"@vinux - just thought I'd add the other component to your answer here - It works well as (Xt=a+bt+Zt, where Zt is stationary series) - and the stationary series appears to be pretty much white noise. Thanks! 
+My underlying error was a failure to appreciate the difference between trend removal by differencing and trend removal by subtraction of a linear trend - the -1 MA(1) term seems to have been trying to convert a random walk back into white noise.",2013-10-15 10:18:22.473 +104296,57513,21762.0,CC BY-SA 3.0,,"Maximum-likelihood estimates cannot be calculated in case of quasi-complete separation (only 0 in control, almost only 1 in test groups). This is confirmed by the huge standard errors.",2013-10-15 10:35:06.683 +104297,57503,12683.0,CC BY-SA 3.0,,"Fair enough - but 'skewness' is even less a layman's term than 'shape'. Location parameters *shift*, scale parameters *stretch*, and shape parameters change the *shape*.",2013-10-15 10:53:03.027 +104298,57505,7860.0,CC BY-SA 3.0,,"No idea how I should apply such a method to my issue, sorry. Upvote for pointing me to `scikit-learn`, I hadn't heard about that package, thanks.",2013-10-15 10:57:21.287 +104299,57503,5637.0,CC BY-SA 3.0,,@Scortchi You are right. I should not associate shape with skewness. I will edit my answer.,2013-10-15 10:58:22.017 +104300,57513,10594.0,CC BY-SA 3.0,,thanks @Michael Mayer. Indeed my data has quasi-complete separation problem. The estimated coefficients and the SE tend to be too large. Do you have any suggestions about how to deal with it? Present the result descriptively?,2013-10-15 11:05:01.797 +104301,57503,15827.0,CC BY-SA 3.0,,"@Scortchi The lay use of ""skewed"" is increasingly as meaning ""biased"", as in ""the results are skewed by including too many people of type X"". It may have been around for some time, but I've noticed it more recently. I haven't noticed ""skewness"" as a synonym for ""bias"".",2013-10-15 11:07:29.013 +104305,57504,2666.0,CC BY-SA 3.0,,"There are different brands of nonparametric regression, some better called semiparametric. Semiparametric models use only the rank of $Y$ so don't dependent on having a proper transformation of $Y$. Such models include the proportional odds and proportional hazard models.",2013-10-15 12:07:02.523 +104306,57436,1693.0,CC BY-SA 3.0,,"Thanks @Erik! Of your 2, it's close to the former. I'm asking about ""what conditions are necessary in order to obtain the theoretically expected reversal of a coefficient.""",2013-10-15 12:09:52.630 +104307,57519,21884.0,CC BY-SA 3.0,,Wow... that software seems extremely powerful. I guess that answers the question that it is indeed $O(1/n)$ too. Thank you very much.,2013-10-15 12:14:59.737 +104308,57519,21884.0,CC BY-SA 3.0,,"wolfies, is your software available for users who don't have mathematica installed?",2013-10-15 12:40:34.743 +104309,57523,22410.0,CC BY-SA 3.0,,"thanks. But how does that relate to ""If the (colored) clusters look separated in at least some of the plots. They won’t be very separated in all of the plots.""",2013-10-15 13:03:15.033 +104310,57519,17328.0,CC BY-SA 3.0,,"mathStatica is built on top of _Mathematica_, so it requires _Mathematica_. We have thought of porting it to other computer algebra systems, but it is unfortunately a lot of work to do that.",2013-10-15 13:07:02.977 +104311,57517,14874.0,CC BY-SA 3.0,,"Ok, my formulation was awkward. What I wanted to say is, that the dependence is nonlinear and we could assume something like $E(x_i|Y) = \sqrt{x_i}$. I hope its clear now?!",2013-10-15 13:18:32.270 +104312,57296,21398.0,CC BY-SA 3.0,,No one knows. No survey specialists here :'-(.,2013-10-15 13:23:38.960 +104313,57523,10278.0,CC BY-SA 3.0,,"Well take a look at the red cluster. 
It's very well separated if you consider `Petal.Width` against `Petal.Length` but less well separated if you consider `Sepal.Width` against `Sepal.Length`. You can tell this by looking at the univariate density plots, the red curve overlaps much more the blue and green curves when you consider `Sepal.Width` then when you consider `Petal.Width`.",2013-10-15 13:28:54.390 +104314,57526,594.0,CC BY-SA 3.0,,"You *might* choose MMSE, its a fine criterion, but that doesn't mean you have to use it.",2013-10-15 13:33:14.030 +104315,57526,594.0,CC BY-SA 3.0,,"For the normal it gives a divisor of $n+1$, but one problem is you don't actually know what distribution you really have. Yet the $n-1$ form is unbiased for every distribution. I often just use ML, but I'm generally as happy with $n-1$, and not averse to $n+1$, even though I rarely use it. It's only a hard choice when $n$ is small.",2013-10-15 13:39:02.147 +104316,57527,,CC BY-SA 3.0,,"Hey, it was not really about the inclusion of seasonal component to explain the response variable, but how to handle the seasonality in the regressors.",2013-10-15 13:42:53.573 +104317,57528,2149.0,CC BY-SA 3.0,,"Does Proc Ucm deal with multiple level shifts , multiple time trends , changes in seasonality, lead and lag effects around known events while detecting parameter transience and variance heterogeneity ? or does it have embedded assumptions about the non-existence of these characteristics ? It might be interesting for us to share some results offline. If you are interested please contact me. Alternatively if we can get the OP to post his actual data we could have a public bakeoff/comparison",2013-10-15 13:58:40.613 +104318,57528,22705.0,CC BY-SA 3.0,,"The trend, seasonality components are estimated using Kalman filter (random walk, EM approach). So, while I've not encountered the problems you're raising, i'm guessing it should. Yes, running it on OP's data-set will help determine. To answer your question in another way, I've read that any ARIMA model can be expressed as a state space equation and thus be modeled using UCM.",2013-10-15 14:13:55.060 +104319,57527,22705.0,CC BY-SA 3.0,,"Yes, JohnnyB. Maybe I wasn't clear. PROC UCM helps estimate trend & seasonality components from your response variable pattern. You don't have to explicitly estimate the trend/seasonality using indicator variables/proc timeseries etc. http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_ucm_sect001.htm Hope I could explain better.",2013-10-15 14:20:20.397 +104320,57528,15827.0,CC BY-SA 3.0,,"What language or package is this please? The mention of proc (?PROC) leads me to guess SAS, but please spell it out. See http://meta.stats.stackexchange.com/questions/1479/how-to-ask-a-good-question-on-crossvalidated for the advice ""Say what programming language you're using"". (It applies to answers as well as questions.)",2013-10-15 14:28:15.277 +104322,57505,22678.0,CC BY-SA 3.0,,"Well, here is an example: http://scikit-learn.org/stable/auto_examples/svm/plot_oneclass.html#example-svm-plot-oneclass-py",2013-10-15 14:42:10.060 +104323,57532,20304.0,CC BY-SA 3.0,,Thanks. That makes sense. I guess that due to the nature of subscription-based services the curve will get fuller as time goes on. There must be some limiting time where all accounts cancel but I guess we don't know what it is yet...,2013-10-15 14:42:21.483 +104324,57528,22705.0,CC BY-SA 3.0,,"Yes. it is SAS. Sorry, didn't know the conventions. 
But, I've bumped into presentations of stata which has implemented ucm as well.",2013-10-15 14:44:38.520 +104325,57532,3999.0,CC BY-SA 3.0,,"@user1893354 The day you go out of business ;) And the curve should never get ""fuller"" (by which I assume you mean go up) because your time variable is not calendar time, but time since the account was opened.",2013-10-15 14:45:19.007 +104326,57533,2081.0,CC BY-SA 3.0,,"I'm not fluent in deciphering formulae, but what I may say for sure is that discriminant analysis (see what's written in its tag) first extracts latent dimensions (discriminants), like PCA does. Only then it classifies - with those. As far as I can see, your question is about classification only. It is unclear if you are speaking of the original variables or the discriminant latent variables here.",2013-10-15 14:47:46.497 +104328,57492,3922.0,CC BY-SA 3.0,,possible duplicate of [Good econometrics textbooks?](http://stats.stackexchange.com/questions/4612/good-econometrics-textbooks),2013-10-15 14:50:18.687 +104329,57535,668.0,CC BY-SA 3.0,,"My reply at http://stats.stackexchange.com/a/13317 addresses the negative part of this question: namely, why $R^2$ does not of itself tell you anything about linearity, especially when the independent variables in the data sets are different. Although you did not ask it, this question begs for a positive response, too (and perhaps you should mention this explicitly): given that your proposed methods don't do the job, what *does* work for assessing linearity?",2013-10-15 14:56:01.060 +104330,57465,22143.0,CC BY-SA 3.0,,"No that does not influence the data source. By influence, I mean, if the output of the algorithm (lets say an ad) affects the data source (say user behavior such as click/no click) which is then fed back to update the algorithm.",2013-10-15 14:57:52.003 +104331,57464,22143.0,CC BY-SA 3.0,,the algorithm is receiving 0s and 1s according to the OP. It is only that it is seeing these 0s and 1s when the algorithm output is 1.,2013-10-15 14:59:00.947 +104332,57532,20304.0,CC BY-SA 3.0,,"Sorry, by ""fuller"" I meant that the curve should get closer to zero (as older accounts eventually start cancelling)",2013-10-15 15:00:48.500 +104333,57535,22713.0,CC BY-SA 3.0,,"Thank you for your answer. I am trying to show how linear the data increases by time, and their difference. If my methods don't work, what else method can I use to address this?",2013-10-15 15:07:35.917 +104334,57389,166.0,CC BY-SA 3.0,,"@broiyan: Sounds close. I think technically the precise value obtained, if re-obtained, would be included in the probability. Therefore, I would go to the expense of extra words to make that clear by saying ""as extreme or more extreme"". Also, because you are phrasing it in a frequentist framework, I wouldn't talk about the probability of that event, but the proportion of times you would expect it to occur if the experiment were repeated over and over.",2013-10-15 15:08:03.567 +104335,57448,22656.0,CC BY-SA 3.0,,"Thanks for the reference, but I cannot find α in the formula.",2013-10-15 15:11:37.880 +104336,57536,21804.0,CC BY-SA 3.0,,"There are several different models you could build based on this data. Using more than one predictor would rersult in a regression plane though, so the task seems to imply that you choose one. 
But the dataset itself does not necessitate a specific choice, so some clarification is indeed missing...",2013-10-15 15:22:01.200 +104337,57535,668.0,CC BY-SA 3.0,,"Please edit your question to include the clarification you just made. It would also help to explain more fully what you mean by ""more"" or ""less"" linear and precisely what ""difference"" you refer to.",2013-10-15 15:23:35.403 +104338,57541,22714.0,CC BY-SA 3.0,,"Sorry, I don't understand your approach. What is nob value? Could you explain your approach in more details? Can I run a hierachical clustering? Thank you.",2013-10-15 15:23:56.837 +104340,57528,15827.0,CC BY-SA 3.0,,Indeed; Stata has a `ucm` command too. There will be other implementations too.,2013-10-15 15:29:52.367 +104341,57504,668.0,CC BY-SA 3.0,,"I think this answer may be misleading because it is not sufficiently clear about what ""function type"" means: the distinction between *linear* and *nonlinear* lies in how the *parameters* enter the functional formula, not the variables. For instance, the second bullet is *not* an example of a nonlinear model; indeed (assuming $(x_1,x_2,x_4)$ are the variables), is has no parameters at all!",2013-10-15 15:31:47.237 +104342,57533,19265.0,CC BY-SA 3.0,,There is only one dimension. What discriminants can we extract here?,2013-10-15 15:38:35.083 +104343,57541,15827.0,CC BY-SA 3.0,,"As usual, @IrishStat, your inclination is to treat everything as a time series! If the data are not a time series, I think the validity of your method depends on whether the same classes (breakpoints) would be identified from a series and itself reversed, i.e. the method depends on past and future being interchangeable.",2013-10-15 15:43:29.297 +104344,57541,22714.0,CC BY-SA 3.0,,I am not working with time series... What methods can I use? Can I use multiple classification methods (like hierachical ascending) by only one variable as a classifier?,2013-10-15 15:49:27.217 +104346,57542,5203.0,CC BY-SA 3.0,,"Thanks, @whuber, I somehow mix up row and column all the time (and English is my native language!)",2013-10-15 16:04:47.167 +104347,57541,2149.0,CC BY-SA 3.0,,"@Nick The trick when using Interevention Detection for non-time series data (whis is what we have) (successful I might add) is to disable ARIMA identification, seasonal pulse identification and trend detection. These constraints eliminate any and all unwarranted/unwanted time series structure. All that is left to identify is level/step shifts (group classification) and pulse detection (one time anomalies). Send me your email address and I will forward you the results of any set of values that you wish to send to me.",2013-10-15 16:20:31.597 +104349,57541,15827.0,CC BY-SA 3.0,,"@IrishStat Thanks, but I already have code that does something similar to my satisfaction (I wrote the answer to the thread cited as duplicate to this). I was just curious about any hidden assumptions behind your suggestion.",2013-10-15 16:36:37.500 +104350,57547,15827.0,CC BY-SA 3.0,,"The first half of this is missing, i.e. what your scientific problem is, how Date features in your analysis, why you are treating it as a factor, etc.",2013-10-15 16:42:56.187 +104351,57541,2149.0,CC BY-SA 3.0,,@jos nob is the number of observations in the set. 
The procedure to do Intervention Detection can be found in a number of places including http://www.unc.edu/~jbhill/tsay.pdf,2013-10-15 16:55:23.337 +104352,57464,22507.0,CC BY-SA 3.0,,Then it will be biased another way (toward 0's).,2013-10-15 16:58:34.527 +104353,57547,2857.0,CC BY-SA 3.0,,Probably [related](http://stats.stackexchange.com/questions/9751/do-we-need-a-global-test-before-post-hoc-tests),2013-10-15 17:05:57.397 +104355,57485,7155.0,CC BY-SA 3.0,,"The most obvious way to me, would be write the two separate corr functions that are partially applied on theta. E.g. def square_exp(d), then write a function to combine the results via multiplication or addition with the argument def function(theta, d), where theta doesn't do anything in the function.",2013-10-15 17:21:52.837 +104356,57398,5821.0,CC BY-SA 3.0,,You're saying clogit isn't a two tailed test?,2013-10-15 17:25:25.877 +104358,57545,668.0,CC BY-SA 3.0,,"**This question is not answerable** because the domain of the pmf has not been specified. (I suspect there is a typographical error and that the pmf might be $(1-\pi)^{1-x}\pi^x$ for $x\in\{0,1\}$.)",2013-10-15 17:47:37.777 +104359,57553,9716.0,CC BY-SA 3.0,,"Sorry for the mistake, i edited the question.",2013-10-15 17:56:21.573 +104360,57545,21985.0,CC BY-SA 3.0,,"The exercise IS realy like this... I was also puzzled by this, but I can not say why it looks odd. There is no solution for sure?",2013-10-15 17:57:40.953 +104361,57545,5448.0,CC BY-SA 3.0,,"A lot of exercises like this are really more designed (in my experience) to emphasize or test recognition of the pmf. So we'd assume that $p(x;\pi)$ is some slight specialization of a standard pmf, perhaps by a simple transform of the random variable, try to figure out which one it is, then get the sample space from the definition of the (standard) pmf. In this case, the random variable $x$ can be easily transformed to another random variable, let us say $y$, for which $p(y;\pi)$ is a well-known distribution, and everything else follows from that.",2013-10-15 18:07:10.543 +104362,57554,15827.0,CC BY-SA 3.0,,"This may be clear to some, but I can't understand the distinction you are making. Empirical CDFs are necessarily calculated from the data. What is the modified CDF? Can you give an example of what the result would look like? Or an accessible reference for the term ""modified CDF""?",2013-10-15 18:21:22.150 +104363,57553,9483.0,CC BY-SA 3.0,,"The issue is that the search space of your GA, viz. the polynomials of degree 10, is too far away from your target function `f(x) = 1 /(1 + (5*x)^2)`, hence the poor fitness value.",2013-10-15 18:22:00.740 +104364,57553,9716.0,CC BY-SA 3.0,,"So theoretically, if i increase this search i should get better fits?",2013-10-15 18:23:33.113 +104365,57553,9483.0,CC BY-SA 3.0,,"Yes but since your target function cannot be approximated by a polynomial, increasing the degree is useless. In evolutionary computation such task is called symbolic regression and we use genetic programming to optimize, not GA (GA typically requires to have a pretty good knowledge on the structure of your target function).",2013-10-15 18:30:03.400 +104366,57555,15827.0,CC BY-SA 3.0,,"This seems akin to a longstanding joke in which one person says ""Suppose that there are $s$ sheep"" and somebody asks ""But what if the number of sheep is different?"". If you denote some probability $p$, that is your notation: no more. 
Clearly what its magnitude is numerically and whether you can estimate it accurately are key questions; in abstraction all we can say is that some probabilities are more difficult to estimate than others. That can be said independently of any speculation about ""true randomness"".",2013-10-15 18:31:06.587 +104367,57448,16644.0,CC BY-SA 3.0,,"The 3rd equation on the reference page says that with a sample of size $b$ the probability that you will be more than $\epsilon$ away from the true cdf is $\leq 2e^{-2b \epsilon^2}.$ For your application just set this probability to be $\leq \alpha$ (where your $\alpha$ will be very small, since you want to be highly likely to be close to the true cdf) and then solve for $b.$",2013-10-15 18:46:47.240 +104368,57561,15827.0,CC BY-SA 3.0,,"Not so. I've never met your first usage, even as a mistake. If you were right, and I don't think you are, it would be much flagged that a minute difference in wording was associated with such a big difference in meaning: textbook writers would be obliged to explain at length and there would be campaigns to change the terminology. I challenge you to find even one explanation of your definition in the literature. @whuber already gave an excellent answer that remains definitive.",2013-10-15 18:55:11.040 +104369,57558,12544.0,CC BY-SA 3.0,,"I'm not sure why, but breaking it into two steps works for me: +AccLog <- as.logical(as.numeric(Accuracy)) then run m2 with AccLog.",2013-10-15 18:57:09.953 +104370,57558,12544.0,CC BY-SA 3.0,,"You get the same result with glm, not just glmer.",2013-10-15 19:00:23.033 +104371,57562,21958.0,CC BY-SA 3.0,,"For example, a model could look like this: +Y_t=alpha + beta_1*X_t*Y_(t-1) + beta_2*X_(t-1)*Y_(t-2) +sigma_t*epsilon_t + +, where the X_t and X_(t-1) are two latent variables either 1 or 0, alpha,beta's,sigma are parameters, and epsilon_t is standard normally distributed. +So this is an autoregressive model, with lag=2.",2013-10-15 19:04:02.800 +104373,57562,21958.0,CC BY-SA 3.0,,"If I wanted to get the residuals, I could choose the mean of the posterior distributions of the alpha,beta_1,beta_2 and sigma_t and get something like: +r_t= (Y_observed - Y_predicted)/sigma_t = standard normally distributed, but I would have to choose some value for the latent variables. And this makes no sense.",2013-10-15 19:15:30.850 +104374,57555,16588.0,CC BY-SA 3.0,,"It might be worth expanding on what you mean by ""true randomness seems impossible."" Or more generally, give a concrete example of what you're trying to describe/ask.",2013-10-15 19:18:13.447 +104375,57533,2081.0,CC BY-SA 3.0,,"Ah, well (I didn't see it). Then term ""discriminant analysis"" _proper_ is inapplicable, although word ""discrimination"" can retained, as it is close to ""classification"" or ""distinguishing"".",2013-10-15 19:26:05.770 +104376,57566,1895.0,CC BY-SA 3.0,,"Symmetry and the law of the unconscious statistician. But a word of caution: ""...like a N(0,1) random variable..."" is sloppy, imprecise wording on the part of your professor. Such a statement is not, in general, true for any old symmetric distribution. 
The requisite moments must exist (be well-defined) in the first place.",2013-10-15 19:39:02.050 +104377,57550,12683.0,CC BY-SA 3.0,,"For logistic regression [it's fine](http://stats.stackexchange.com/questions/67903/does-down-sampling-change-logistic-regression-coefficients/68726), though you'll have to be careful that the sample of NS firms is not biased with respect to any predictors.",2013-10-15 19:41:29.893 +104378,57554,15363.0,CC BY-SA 3.0,,"So, the empirical cumulative is a step function that gives the mapping of the number of data points in in the population that are ≤ X. But, in R, the same is not done by ecdf. You can see this anamoly here - http://stats.stackexchange.com/questions/51607/strange-behavior-of-r-function-ecdf",2013-10-15 20:16:04.507 +104379,57286,12683.0,CC BY-SA 3.0,,Doesn't do anything when `conf.int = FALSE` which is also the default. Try `?wilcox.test`.,2013-10-15 20:28:53.687 +104410,57587,15539.0,CC BY-SA 3.0,,"I have one additional question for you. Given all 3 of these models, the X1X2 model I posted above and two separate ones for X1 and X2, how can you tell which is best, given the R^2 and MSE values for each?",2013-10-16 02:19:14.097 +104380,57554,15827.0,CC BY-SA 3.0,,I don't see that thread supporting your report of an anomaly. It seems a matter of convention to use $\le$ rather than $<$; the same difference can be found in literature on survival functions (or the same beasts otherwise named). Elsewhere I am the author of a program for plotting CDFs that supports different conventions (`distplot` in Stata).,2013-10-15 20:31:37.960 +104381,57566,668.0,CC BY-SA 3.0,,"This assertion is false unless a particularly narrow interpretation of ""symmetric"" is adopted (and even then @Cardinal's admonition must be heeded): see http://stats.stackexchange.com/a/29010 for more about this. For *familiar* counterexamples in Cardinal's spirit you may contemplate what happens with any Student $t$ distribution with $\nu$ degrees of freedom and you use any odd power greater than or equal to $\nu$.",2013-10-15 20:55:41.677 +104382,57565,668.0,CC BY-SA 3.0,,"Could you explain the reasoning that justifies your assertion that ""there shouldn't be any difference""?",2013-10-15 20:59:52.493 +104383,57565,22727.0,CC BY-SA 3.0,,"Assuming his sampling is truly random and that he already has a list of every single person with heart disease in the world (as implied by his 'Or is it better to only look at the people who have heart disease.' And since he is only interested in finding ten people with heart disease, then it is just as random and more efficient to select from the group you are interested in studying rather than selecting from the billions of people on earth over and over until all 10 have heart disease.",2013-10-15 21:11:54.900 +104385,57573,,CC BY-SA 3.0,user30490,"I am reading through the paper but am stuck, do you think you can elaborate on these calculations?",2013-10-15 21:57:52.180 +104386,57573,,CC BY-SA 3.0,user30490,Or rather is it obvious why this result holds?,2013-10-15 22:12:42.023 +104387,57521,594.0,CC BY-SA 3.0,,There's a fairly good introduction [here](http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval),2013-10-15 22:17:46.620 +104388,57578,15827.0,CC BY-SA 3.0,,No problem in principle. 
Plot the data and the fitted curve as well as looking at the numeric results to see how well it works.,2013-10-15 22:23:32.070 +104389,57578,594.0,CC BY-SA 3.0,,"Well, as you already note, you can get multicollinearity (though this can be avoided with the use of orthogonal polynomials).",2013-10-15 22:42:47.413 +104390,57554,594.0,CC BY-SA 3.0,,It doesn't give the proportion $\leq X$ but the proportion $\leq x$.,2013-10-15 22:43:45.327 +104392,57554,594.0,CC BY-SA 3.0,,"I'm looking at a plot of an [ECDF in R right now](http://i.imgur.com/kmWSwaY.png), it's definitely giving $\leq x$, and I'm not sure I understand the problem you have with it. It's an empirical cdf so obviously it has to be defined in terms of proportions, since the probabilities are unknown. What's the issue?",2013-10-15 23:02:13.677 +104393,57538,594.0,CC BY-SA 3.0,,"Generally, the gist of these kind of problems go: combine and expand the exponents, collect like terms, complete the square, write quadratic as $(x-g(y,\mu,...))^2+S$, spot the density.",2013-10-15 23:06:19.433 +104394,57506,22639.0,CC BY-SA 3.0,,this seems closest to what I was looking for.,2013-10-15 23:28:51.703 +104395,57578,16588.0,CC BY-SA 3.0,,"An alternative to the suggestion of @Glen_b is to first center X, then create a squared version of the centered X variable. Include them both and multicollinearity will be substantially reduced. As an added bonus, model interpretation is often nicer with centered IVs.",2013-10-16 00:12:18.833 +104396,57578,15827.0,CC BY-SA 3.0,,"In this case, what most makes the model interpretable is, I suggest, to plot data and model fit. Centering X does no harm, but the most interesting level for X is often that at which the quadratic has a turning point, whenever that occurs within the range of the data. (It may lie way outside that range.) Numerical stability given correlated predictors is much less of a difficulty with modern statistical software than it was a few decades ago when regression programs did not always use good algorithms. But an X and its square can't be interpreted as having separate effects, regardless.",2013-10-16 00:35:14.263 +104397,57583,594.0,CC BY-SA 3.0,,Is this for some subject?,2013-10-16 00:36:00.143 +104398,57572,19681.0,CC BY-SA 3.0,,"Not sure why you say the dependent variable is not repeatedly measured. For the subject with ID = S_1, it looks like there are 4 measurements.",2013-10-16 00:42:01.537 +104399,57572,22732.0,CC BY-SA 3.0,,"Because I'm trying to predict whether an individual will ever get cancer. So the true outcome would be all Ys if the individual ever gets cancer and all Ns otherwise. In the table above, this would translate into all Ys for S_1, all Ns for S_2, all Ys for S_3.",2013-10-16 00:52:03.660 +104401,57564,1411.0,CC BY-SA 3.0,,"it might be worth submitting this as an issue at https://github.com/lme4/lme4/issues -- I think it *should* work correctly with a factor, as `glm()` does ...",2013-10-16 00:54:03.350 +104402,57583,15827.0,CC BY-SA 3.0,,"The answer would probably be a publishable paper in some literatures, e.g. statistical hydrology. By the same token, an answer seems unlikely unless it already has been published.",2013-10-16 00:58:02.527 +104403,57577,14888.0,CC BY-SA 3.0,,"How about in the case where the two separate measures of the dependent variable are to measure an experimental effect that takes place in between measures? This is more akin to a paired t-test, but in this example there are two factors (or more). 
Most discussions/introductions to repeated measures ANOVA that I have found are about cases in which a dependent variable is predicted by a few to several independent variables that are measured on the same set of individuals (so the values are correlated within individuals).",2013-10-16 01:07:14.310 +104404,57584,17249.0,CC BY-SA 3.0,,I cannot imagine doing this being possible.,2013-10-16 01:27:53.150 +104405,57584,15539.0,CC BY-SA 3.0,,Neither can I.. could some type of technique involving dummy/indicator variables be the key? I'm just completely throwing that out there...,2013-10-16 01:37:39.587 +104406,57572,19681.0,CC BY-SA 3.0,,"Ah, OK. I'll venture a guess that mixed models are appropriate. Your primary concern seems to be that the response is fixed, within-subject. One way of looking at this is that the within-subject variance is zero. I suspect that this fits in with mixed models as a special case, but here is where my knowledge ends.",2013-10-16 01:39:14.637 +104407,57587,15539.0,CC BY-SA 3.0,,"Ok! Great.. I actually do have the ANOVA tables for X1 and X2 separately, so I have 2 additional tables. How could I go about completing the partial F-test for X1, given X2 is already in the model if I have the ANOVA tables for X1 and X2 separately?",2013-10-16 01:45:05.427 +104408,57587,5237.0,CC BY-SA 3.0,,"That's more or less in the other answers I link to, but you might have to read between the lines. But I'll edit so that it's explicit here.",2013-10-16 01:47:55.610 +104409,57480,22423.0,CC BY-SA 3.0,,"Thanks @Jonathan, this is what I wished to know. Just a final question before I accept this as answer: if in 4 surveys the weights were summed to total respondents, and 1 survey was weighted to reflect Total US population. If I multiply (Respondent size/Total US pop) to the weights of that 1 survey, and union the 5 datasets together with their weights. +Would it be right to say that I have achieved final weights where each individual record do not have any special importance relative to another dataset?",2013-10-16 01:48:56.320 +104411,57570,4656.0,CC BY-SA 3.0,,I like the way your answer skirts around the answer that most people would arrive at.,2013-10-16 02:22:56.087 +104412,57587,5237.0,CC BY-SA 3.0,,"You could use theoretical knowledge that you have of the subject matter, but to a first approximation there isn't a ""best"", at least that we will be able to discern. Note, eg, that the MSE *must* go down as you add variables, & R2 *must* go up.",2013-10-16 02:26:43.897 +104413,57582,16588.0,CC BY-SA 3.0,,This could be a helpful start: http://stats.stackexchange.com/q/3463/24000,2013-10-16 02:48:27.377 +104414,57570,668.0,CC BY-SA 3.0,,@Dilip I have no clue what that might mean. What answer to you believe most people would offer?,2013-10-16 02:52:38.580 +104415,57582,16588.0,CC BY-SA 3.0,,Also: http://stats.stackexchange.com/a/26846/24000,2013-10-16 02:55:24.420 +104416,57570,4656.0,CC BY-SA 3.0,,"I would have said that $X$ is a geometric random variable with parameter $\pi$, where $\pi$ does not have the usual meaning that it has in mathematical circles, pun intended, but is a number in $(0,1)$, and $x$ takes on all nonnegative integer values. 
Using the hint, the experiment would consist of tossing a biased coin with $P(\text{Heads})=\pi$ until a Head occurs, the sample space would be the set $$\Omega=\{H,TH,TTH,\ldots\}$$ with outcomes having probabilities $\pi,(1-\pi)\pi,\ldots$, and $X$ being the number of Tails in the outcome.",2013-10-16 03:07:02.060 +104417,57587,15539.0,CC BY-SA 3.0,,"From the values I calculated, I have: for X1: R2 = 8%, MSE=5900; for X2: R2 = 80.1%, MSE = 1880, for X1X2: R2=80.6%, MSE=1940. Which model do you feel is best? It appears to me as if it's a clear race between X2 and X1X2(combined), but which?",2013-10-16 03:43:55.373 +104418,53439,22705.0,CC BY-SA 3.0,,"This looks like a very exciting problem to solve. but, i think there are too many problems you can solve. +so, the first step is to fix on one or two key problems. and, then determine the factors which will be essential to solve that problem. i don't think we should talk about any model till then.",2013-10-16 05:10:07.357 +104419,57529,15563.0,CC BY-SA 3.0,,"Thank you @JTT. So if I now use newdat to create a SVM model, I suppose my model takes input in this new rotated universe, which means I will need to also rotate my Test data before applying it to the model. Is this correct? And if yes, how do you rotate a test data.frame with the same rotation?",2013-10-16 05:24:35.300 +104420,57530,15563.0,CC BY-SA 3.0,,Thank you for providing so much details. Unfortunately the example code is too cryptic for me. I see you are using predict. Where is the manual for prcomp predict? is it here: http://stat.ethz.ch/R-manual/R-patched/library/stats/html/prcomp.html ?,2013-10-16 05:36:26.843 +104421,57529,,CC BY-SA 3.0,anon,"The easiest way is to use the `predict()` method for the test data. Using the example above, `predict(pr, USArrests)` will return the same matrix as `pr$x`. For test data, replace the USarrests with the name of the test data. You can do the same thing by hand, but this is easier, since the predict methods takes automatically care of the correct scaling of the test data set.",2013-10-16 05:36:43.183 +104422,57529,15563.0,CC BY-SA 3.0,,How does predict work? Does it use all Principal Compenents. In your answer you had chosen only 2 Components to cover 80% of variance. What does predict do?,2013-10-16 05:44:23.613 +104423,57592,15539.0,CC BY-SA 3.0,,"Just on a purely theoretical basis, would it not be possible to be exclusionary for some of the results? As in, exclude Roku results where Roku 2 is mentioned Roku 3 is not, but include results with both Roku 2 and Roku 3 mentioned. +However, I think on a broad, automated scale this is a very difficult task. I think this explains why so many search engines have difficulty with the concept of 'relevance'. I'd love to hear your feedback on this suggestion.",2013-10-16 05:56:31.893 +104424,57592,15539.0,CC BY-SA 3.0,,"I think that part of the reason that this may be difficult is that you would need sets of related data. Using automated methods, I would think it would be quite difficult for an engine to determine that that are indeed two different Roku, 2 and 3 (and perhaps the original).",2013-10-16 06:01:17.543 +104425,57529,,CC BY-SA 3.0,anon,"The function `predict()` uses by default all the components. However, you can limit the number of components that are returned, e.g., `predict(pr, USArrests)[,1:2]. Would that work for you?",2013-10-16 06:33:40.360 +104426,57579,2081.0,CC BY-SA 3.0,,"The really important matter is the meaning of it. 
Whether a missing is ""not known value"" (and thence NA*0=0) or ""unprovided entry"" (and thence NA*0=NA).",2013-10-16 06:55:26.923 +104428,57530,8074.0,CC BY-SA 3.0,,"I have now added more explanation to my answer. Hopefully it is clearer to you now. Yes, you were correct in your link to the `predict.prcomp` help.",2013-10-16 07:32:36.150 +104429,57594,15539.0,CC BY-SA 3.0,,"Adjusted R^2 is not to be used for this problem. From the values I calculated, I have: for X1: R2 = 8%, MSE=5900; for X2: R2 = 80.1%, MSE = 1880, for X1X2: R2=80.6%, MSE=1940. Which model do you feel is best? It appears to me as if it's a clear race between X2 and X1X2(combined), but which?",2013-10-16 07:52:09.740 +104430,57594,22705.0,CC BY-SA 3.0,,"I would not take a stance between the two options, till I understand the problem and look at the interpretation offered by the two models. May be a model with x1, x2 and x1x2 will give you some insights. You can then possibly articulate interaction effects better. And, how are your residuals? Are you sure you have no information left there? Finally, why not adj r sq?",2013-10-16 08:11:02.780 +104431,57554,15363.0,CC BY-SA 3.0,,"Sorry people, I think I misunderstood the statement. Thanks for stepping in !",2013-10-16 08:44:20.747 +104432,57596,17328.0,CC BY-SA 3.0,,So ummm what is the question? Find t such that ... what? That the amount raised = amount spent? Or find t to maximise something subject to a budget constraint? Or something else?,2013-10-16 08:46:28.083 +104433,57448,22656.0,CC BY-SA 3.0,,"Thanks. Sorry I am not good at statistic, I have another question; Can we say ϵ is confidence interval?",2013-10-16 08:47:18.880 +104435,57588,,CC BY-SA 3.0,,"From your comment to @Learnerbeaver's reply below, I gather that this is homework/self study, which is why I added the relevant tag. So a few hints: what have you learned about the effect on R^2 when adding a variable to a model? How do R^2 and MSE relate? What does this imply about how useful R^2 and MSE are in selecting models (and why Learnerbeaver recommends adjusted R^2)? Finally, look at your specific values of R^2 and MSE for the three models in the comment below. You may be able to figure out the answer yourself by now.",2013-10-16 09:30:22.927 +104436,57603,22750.0,CC BY-SA 3.0,,I generate P(L) and P(D) from two different models. I do not know which is best.,2013-10-16 10:00:48.117 +104437,57577,503.0,CC BY-SA 3.0,,That case isn't really different than the one I described. RM ANOVA is a generalization of a paired t-test but it makes some unrealistic assumptions such as sphericity.,2013-10-16 10:09:13.940 +104438,57588,503.0,CC BY-SA 3.0,,"If you've studied AIC and BIC and so on in class, that might be what is being looked for.",2013-10-16 10:25:35.127 +104439,57588,15539.0,CC BY-SA 3.0,,"Yes, it appears option 2 with X2 is the option, as MSE goes up when X1 is added to the model in the third option (X1,X2), so X2 would have to be it. 
Can you perhaps offer me a theoretical comment on why this is so?",2013-10-16 10:42:43.390 +104542,57662,15827.0,CC BY-SA 3.0,,"The view that Poisson regression is for counts only is widely rebutted: for one informal account with Stata flavour see http://blog.stata.com/2011/08/22/use-poisson-rather-than-regress-tell-a-friend/ Also, for Stata ""non-integer frequency weights"" just can't be frequency weights, but they can be analytic weights.",2013-10-16 22:48:59.660 +104440,57601,12282.0,CC BY-SA 3.0,,"The question as posed is nice and straightforward to read, but some other info it would be useful to add in at the end: 1) What kind of variables are L and D? Continuous? Categorical? What values can they take? 2) Given know P(L|D) and P(D|L) exactly, could you tell us what they are? 3) What models have you used to estimate P(L) and P(D)? Why? 4) What data do you have access to? It sounds like you have samples from P(L) and samples from P(D), to which you have fitted a model. If so, how many samples from each? 5) What are you going to do with P(L,D) once you know it?",2013-10-16 11:41:13.200 +104441,57601,12282.0,CC BY-SA 3.0,,"And why I would like to know: 1) To get an idea of how much missing information there is to infer, given you knowledge of the two conditional distributions. 2) Similar reasons to 1. 3) If you have good reason to believe one model's estimates are closer to the 'true' distribution, you should favour using it . 4) If you have more samples of L than D, for example, estimates of P(L) may be more accurate. 5) Depending on the intended application, it may be best to keep *both* estimates of P(L,D).",2013-10-16 11:49:35.207 +104442,57595,1895.0,CC BY-SA 3.0,,"Some intuition: In going from $Y_k$ to $Y_k^2$, do you lose any information? If so, about what? Now, consider the example of $X_n$ being a simple random walk. Is it a martingale with respect to $\sigma(X_0,\ldots,X_{n-1})$? What about $\sigma(X_0^2,\ldots,X_{n-1}^2)$?",2013-10-16 12:07:53.623 +104443,57607,22752.0,CC BY-SA 3.0,,Thank you for your help. That was exactly what I was looking for. Could you please explain to me why you could treat $\beta$ as a constant (I'm referring to the second equality of the last block of derivations)?,2013-10-16 12:22:19.957 +104444,57607,22143.0,CC BY-SA 3.0,,"$V(const + W) = Var(W)$ is a property of variance operator. Basically, constant terms do not appear since $V(const + W) = E[(const + w - E[const + W])^2] = E[(W - E[W])^2] = V(W)$.",2013-10-16 12:38:52.347 +104445,57601,22750.0,CC BY-SA 3.0,,"1) What kind of variables are L and D? Continuous? Categorical? What values can they take? + +They are continuous probabilities, so can take values between 0 amd 1? + +2) Given know P(L|D) and P(D|L) exactly, could you tell us what they are? + +P(L|D) = 0.385, P(D|L) = 0.735 + +3) What models have you used to estimate P(L) and P(D)? Why?",2013-10-16 12:48:13.370 +104446,57601,22750.0,CC BY-SA 3.0,,"I have two linear regression models to estimate each value. I want to estimate whether things and L and D are the same time. I could build a third model but wondered if I could use the two I have. + +4) What data do you have access to? It sounds like you have samples from P(L) and samples from P(D), to which you have fitted a model. If so, how many samples from each? + +I have the same set of samples for each model. That's the historical data to calculate the regression values.",2013-10-16 12:48:51.873 +104447,57601,22750.0,CC BY-SA 3.0,,"5) What are you going to do with P(L,D) once you know it? 
– Pat 52 mins ago + +I am going to select those with the highest probablity from a current set of data where neither L or D are known. I want to select the things which are most likely to be L and D at the same time. + +I am inclined to keep both estimates and average them.",2013-10-16 12:49:49.363 +104448,57607,22752.0,CC BY-SA 3.0,,"@Theja Thanks but that was actually not what I meant. I know about that property, but what I tried to ask is why $\beta$ *can be seen as a constant*. Isn't it some kind of random variable?",2013-10-16 12:50:38.133 +104449,57607,22752.0,CC BY-SA 3.0,,"Oh wait, I think I am confusing $\beta$ with $\hat{\beta}$ which *is* a random variable. Never mind then :).",2013-10-16 12:55:07.817 +104450,57607,22143.0,CC BY-SA 3.0,,$\beta$ is the true unknown model which is *assumed* to be fixed and not random (see point 1. in the answer above).,2013-10-16 12:55:28.257 +104451,57607,17573.0,CC BY-SA 3.0,,"@rbm Well, if you are a Bayesian, then you are going to see that step as a mistake (I think---but ask a Bayesian to be sure). If you are a Frequentist, parameters are always constants.",2013-10-16 13:12:00.577 +104452,57611,503.0,CC BY-SA 3.0,,Seems sensible to me,2013-10-16 13:21:41.877 +104454,57448,16644.0,CC BY-SA 3.0,,"Say it this way: $\epsilon$ is the half-width of the confidence interval. Since your empirical cdf will be within $\epsilon$ with probability $1 - \alpha,$ the confidence interval has width $2 \epsilon.$ Here is another way to look at it: If $G(c)$ is your empirical cdf at $c,$ the confidence interval is $[G(c) - \epsilon, G(c) + \epsilon].$",2013-10-16 13:34:42.047 +104455,57420,306.0,CC BY-SA 3.0,,i have said that different cases are to be handled separately and this is just one particular case. and different normally distributed random variables having different parameters of normal distribution are said to be of different distributions if that is the confusion here.,2013-10-16 13:39:51.890 +104456,57564,1411.0,CC BY-SA 3.0,,"oops. Reading this more carefully I see that it isn't a `glmer` issue at all. I will say that at least the development version of `lme4` gives an error `Response is constant - cannot fit the model` , which at least gives a clue ...",2013-10-16 13:43:02.467 +104457,57600,1790.0,CC BY-SA 3.0,,"Thanks Dikran. When you say `""average over the parameter values""` I think understand how to do this through an ensemble method (e.g. building the ensemble output as the average of the the classifier outputs), but I am not sure how do this with a Bayesian approach when working with a discriminative model. I understand the theory of a fully Bayesian approach (i.e. avoid point estimates, and marginalize out the parameters to build the final posterior), but, assuming that my prior on the parameters is uniform, wouldn't this be equivalent to building the averaging ensemble?",2013-10-16 13:54:55.483 +104458,57458,22684.0,CC BY-SA 3.0,,"Thanks a lot. The label of the first line indicates the name of gene, but I do not know what the numbers mean either.",2013-10-16 14:03:24.647 +104459,57446,4320.0,CC BY-SA 3.0,,"@Berkan I don't think either issue you mention (more no event sequences than event sequences and just noise for no event) should rule out the 2 HMM approach. If you took the prior $P(HMM1)$ into account (I've update my original answer in this regard) then you may need to adjust for the unbalanced class distribution (more no events than events), but there are lots of ways to deal with this. 
See [this](http://stats.stackexchange.com/questions/64163/how-to-deal-with-low-frequency-examples-in-classification/64165#64165) answer I gave for example.",2013-10-16 14:03:47.033 +104460,57458,22684.0,CC BY-SA 3.0,,"This is from a online course material and I did not take this course. Anyway, I may try to email that professor to seek more information. Thanks",2013-10-16 14:05:01.767 +104461,57601,12282.0,CC BY-SA 3.0,,"Ok. There's a few things I don't understand in your answers. But one thing you said leapt out at me: ""I have the same set of samples for each model. That's the historical data to calculate the regression values. "" So, does this mean you have paired data? As in, your dataset has samples $(l_1,d_1), (l_2,d_2), \ldots$ where each pair $(l_i, d_i)$ tells you the value of L and D at the same time? If so, and if you're able to, I strongly advise building a third model using these pairs to directly find out what you want to know. It's almost certainly the best way to estimate P(L,D) from your data.",2013-10-16 14:11:50.133 +104462,57446,4320.0,CC BY-SA 3.0,,"@Berkan As for the window size, based on my own personal experience I expect what I've said in this matter will hold for any **fixed** window size. Obviously all of the things I've said will need to be tested empirically for your particular problem.",2013-10-16 14:14:55.540 +104464,57577,14888.0,CC BY-SA 3.0,,"What I don't understand yet is this: In the paired t-test, it is an outcome variable that is ""paired"" or ""correlated."" In the case I described, it is the outcome variable that is correlated. In RM ANOVA, the outcome variable is predicted by independent variables, which are correlated. That is a different situation, no?",2013-10-16 14:15:20.923 +104465,57577,14888.0,CC BY-SA 3.0,,"I do see the parallel between the paired t-test and RM ANOVA, that error variance is minimized. It is that aspect that generalizes. However, they differ in that paired t-test focuses on the differences between an outcome variable, whereas RM ANOVA focuses on repeated measures of predictor variables.",2013-10-16 14:15:51.223 +104466,57613,9716.0,CC BY-SA 3.0,,check out the *forecast* package - it's great.,2013-10-16 14:17:17.500 +104467,57446,20470.0,CC BY-SA 3.0,,"thanks for updating your answer, it is a lot clearer now. Since I will be working with logarithms, I will be making the comparison: $log(P(HMM1))+log(P(O|HMM1)) >? log(P(HMM2))+log(P(O|HMM2)) $. Now, $log(P(HMM1))$ is calculated using the forward algorithm, how do I calculate $log(P(HMM1))$? Is ti just a prior that I appoint?",2013-10-16 14:17:52.563 +104468,57577,503.0,CC BY-SA 3.0,,RM ANOVA can certainly be used when the DV is repeated. It generalizes the paired t because it is about more than 2 repeats. In the paired t-test the difference in the two outcomes is predicted by a single two-level categorical variable.,2013-10-16 14:18:11.960 +104469,57577,14888.0,CC BY-SA 3.0,,"The RM ANOVA method that I am familiar with (based on Andy Field's Statistics book for R) only ""works"" when the IVs are repeated and the outcome is a single measure. So, for my needs, I did the following: For each individual, I subtracted the two DV measures, then ran a two-way ANOVA using the DV difference as the outcome. 1) Is that valid? 2) Is that still a RM ANOVA? I think the answers to those questions will clear things up for me.",2013-10-16 14:26:08.950 +104470,57575,19359.0,CC BY-SA 3.0,,"Thanks for the advice, that's very helpful! 
And I appreciate your suggestion for double-checking the null deviance estimates. As for the degrees of freedom, that is not a typo-- although there are 7 (probabilistically) independent variables, the rank of the matrix of observations is only 5 (i.e. the observations are linearly dependent). Thanks!",2013-10-16 14:29:10.437 +104471,57599,18296.0,CC BY-SA 3.0,,It seems that it is not a simple question. I am surprised that this Topic is not treated or explicitly treated in the books.,2013-10-16 14:38:13.710 +104472,57617,20062.0,CC BY-SA 3.0,,"Boxplot for each group with brackets above looks great, and you also display variability of data...(or instead of brackets you can use notches). If you have basic skills with ""R"" I can provide you working code.",2013-10-16 14:57:07.793 +104473,57544,20740.0,CC BY-SA 3.0,,Interesting hadn't thought of that with the variance. New customers have a higher variance so more news means not only worse performance but higher variance and it is definitely heteroskedastic...,2013-10-16 15:02:24.943 +104474,57617,22399.0,CC BY-SA 3.0,,Two issues with a boxplot. (a) It can be a challenge to interpret the plot for a non-technical audience (especially for someone who has never seen a boxplot) (b) it does not scale well if I have lots of such mulltiple-comparisons test. Imagine doing the above for 20 such tests in which case a table with 20 rows with suitable emphasis/visualization seems compact relative to 20 boxplots.,2013-10-16 15:02:31.173 +104476,57617,7700.0,CC BY-SA 3.0,,"Two questions: 1-What exactly do you want to show, basic data, the multiple comparisons, the comparison's differences, all of the above? 2-What is the visualizations purpose and audience? Data exploration for you or explanation for a non-tech audience (if both you probably need two viz's). Also, you mention that the mean for D is different than A & B, what about C?",2013-10-16 15:12:04.253 +104477,57577,503.0,CC BY-SA 3.0,,"1) Yes, even if not ideal for reasons in my first answer. 2) I don't think so, but terminology isn't key.",2013-10-16 15:16:18.820 +104478,57617,22399.0,CC BY-SA 3.0,,1. Right now my goal is to show the means and just draw the attention to the ones that are different. 2. The audience is not statistically aware and showing them a table of means with an emphasis on the ones that are different seems to be the right approach to me.,2013-10-16 15:19:04.303 +104479,57446,20470.0,CC BY-SA 3.0,,"thanks for updating your answer, it is a lot clearer now. Since I will be working with logarithms, I will be making the comparison: $log(P(HMM1))+log(P(O|HMM1))>?log(P(HMM2))+log(P(O|HMM2))$. Now, $log(P(HMM1))$ is calculated using the forward algorithm. Do I calculate log(P(HMM1)) using simple MLE based on frequencies? i.e. for the given case, $HMM1 = (5 * 2,000) / 108,000$ where the numerator is the number of points that fall under HMM1 and denominators is the size of the data set.",2013-10-16 15:23:39.693 +104480,57615,22369.0,CC BY-SA 3.0,,"We've actually counteracted the issue of weeks not lining up by munging the weeks to line up. For example, this year there would actually be an additional week so the first week's information includes 8 weekdays instead of 5. It's not a perfect system but the holiday weeks should match up, the final week should roughly correspond every year, and so on. 
+ +What I'm interested in is, is there a good method to build a model based on the trends from previous years to apply to this year's data?",2013-10-16 15:28:35.953 +104481,57617,7700.0,CC BY-SA 3.0,,"How many points do you have? With just 4, it's going to be difficult to show why D is different but C is not. For this, I'd almost do a simple dot-plot with a different symbol for D since it is statistically different.",2013-10-16 15:29:13.217 +104482,57586,1693.0,CC BY-SA 3.0,,"I'm not the swiftest; are you arguing that with large samples there is never a sound basis for expecting a sign to reverse, theory or no theory?",2013-10-16 15:31:45.530 +104483,57623,22262.0,CC BY-SA 3.0,,"This is very useful, but not quite what I'm after. Up voted anyway.",2013-10-16 15:38:09.437 +104484,57611,21762.0,CC BY-SA 3.0,,You are assuming that the hazard ratio associated with treatment does not substantially depend on the propensity score (i.e. on the variables used for matching).,2013-10-16 15:38:11.190 +104485,57617,20062.0,CC BY-SA 3.0,,"Maybe you can try to make a little explanation how to look at it. It is not necessary to be a ""rocket scientist"" to understand boxplot. Do not pamper you audience :)",2013-10-16 15:38:12.277 +104486,55609,1805.0,CC BY-SA 3.0,,"In excel, your best bet is probably to assume that sales tomorrow will be the same as sales today. This is called the ""naive"" forecast, and is usually a great place to start.",2013-10-16 16:09:11.677 +104488,57586,20473.0,CC BY-SA 3.0,,"Yes, this is the result that emerges (for the particular model specification examined) - I don't know whether it is a known result, it was new to me -but it is correct. But you shouldn't be surprised: each specific model has a ""technical identity"" that creates certain rigidities that do not permit the model to accommodate all possible aspects of a real phenomenon (such as a sign reversal in your case).",2013-10-16 16:35:54.987 +104489,57586,1693.0,CC BY-SA 3.0,,It sounds as if you are consigning statistical control to a much more limited role than many people believe it can play. And that as a consequence of your conclusion many authors (and contributors to this site) would have to completely revise what they've written about sign reversals in regression.,2013-10-16 16:41:09.857 +104543,57662,15827.0,CC BY-SA 3.0,,"Stata commands relevant (here names are self-explanatory): `zip`, `zinb`, `nbreg`.",2013-10-16 22:51:41.157 +104490,57570,668.0,CC BY-SA 3.0,,"@Dilip I see, thank you. It is of note that you had to supply several assumptions that, although they may be obvious to a trained statistician, are certainly not explicit in the question. I have instead attempted to take this question at face value--much as a neophyte would--and answer it on its merits (which are few indeed :-).",2013-10-16 16:46:41.163 +104491,57586,20473.0,CC BY-SA 3.0,,"You are generalizing the result, while I don't. It is not I that I conclude - it is the mathematics of the model that lead to the conclusion, that holds for the exact specific assumptions of this particular model: a logit model. One Binary regressor of interest. No Constant Term. The case of dropping _all_ control variables at once. Perhaps if you change any one of these assumptions, the result may not hold (or perhaps it will).",2013-10-16 16:49:14.840 +104492,57586,1693.0,CC BY-SA 3.0,,Thank you for your responses. I'm puzzled by all of them :-),2013-10-16 16:55:25.273 +104493,57572,22732.0,CC BY-SA 3.0,,"That's what I figured. 
I'm trying to piece together what the correlation structure should look like. It's not going to be autocorrelated as the within subject prediction is always the same, but what is it then... Thanks a lot for the feedback.",2013-10-16 16:59:17.430 +104494,57504,668.0,CC BY-SA 3.0,,"Thanks for editing, but the same potential to be misleading still exists. In the second bullet, despite all appearances, there really is only one effective parameter, equal to $a_1 \frac{a_2}{a_4}$ (because $a_2/a_2=1$ in the argument to the exponential makes $a_2$ disappear) and it enters *linearly* into the formula, not in a nonlinear fashion. The point is that linearity is a *mathematical* property of the function and not a mere *syntactic* property of how it is written down.",2013-10-16 17:18:27.220 +104495,57626,5237.0,CC BY-SA 3.0,,"Welcome to the site, @Roman. Asking for R packages is off-topic for CV (see our [help page](http://stats.stackexchange.com/help)). Moreover, this Q would be off-topic on [Stack Overflow](http://stackoverflow.com/) as well. You might try the r-help listserv.",2013-10-16 17:34:04.223 +104496,57626,5237.0,CC BY-SA 3.0,,This question appears to be off-topic because it is about asking for R packages.,2013-10-16 17:34:19.270 +104497,57629,21762.0,CC BY-SA 3.0,,Do you know anything about the shape of the distribution of the $Y$-values?,2013-10-16 17:34:31.560 +104498,57629,21864.0,CC BY-SA 3.0,,"actually, I don't have this luxury. The application should work for any distribution.",2013-10-16 17:36:15.240 +104499,57632,16046.0,CC BY-SA 3.0,,Can I know a source that I could see how this formulas are achieved?,2013-10-16 17:38:06.123 +104500,57629,21762.0,CC BY-SA 3.0,,"Then you might go with Chebychev's inequality (the finite sample version) http://en.wikipedia.org/wiki/Chebyshev%27s_inequality It gives bounds for the probability to deviate more than $k$ standard deviations from the mean. If you know mean and standard deviation, you can derive the range based on this",2013-10-16 17:39:10.647 +104501,57626,22762.0,CC BY-SA 3.0,,could I ask for the algorithm for panel VAR estimation?,2013-10-16 17:40:38.280 +104502,57632,3580.0,CC BY-SA 3.0,,"@Naji the CRP is a member of the exponential family and the number of tables is the sufficient statistic. If $P$ is the number of tables and $\mathcal P$ is the partition, then $f(\mathcal P) = h(\mathcal P) \exp\left\{Pa - [\log \Gamma(e^a + N) - \log \Gamma(e^a)]\right\}$ where $a = \log \alpha$. It follows from the properties of exponential families that $E[P]$ and $\mbox{Var}(P)$ are related to the derivatives of the function $\log \Gamma(e^a + N) - \log \Gamma(e^a)$ with respect to $a$. Those are just the formulas that pop out when you do the calculation.",2013-10-16 17:47:37.833 +104503,57626,5237.0,CC BY-SA 3.0,,"Sure, you can ask about how to deal w/ this situation, & in the process of answering someone might be able to provide some helpful R code (or not...). It's just asking 'what package will do X' that's off-topic. If you want the question to stay here (& stay open), just edit your Q to make it on-topic. 
It may help you to read the [relevant section of the help page](http://stats.stackexchange.com/help/on-topic) & our [guide to asking questions](http://meta.stats.stackexchange.com/questions/1479/how-to-ask-a-good-question-on-crossvalidated) in reformulating your Q.",2013-10-16 17:51:02.503 +104504,57632,3580.0,CC BY-SA 3.0,,"@Naji (continued) For a source, [this paper](https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=2&cad=rja&ved=0CDsQFjAB&url=http%3A%2F%2Fwww.fas.harvard.edu%2F~junliu%2FTechRept%2F96folder%2Fnonpbayes.pdf&ei=MtJeUvepCoP49QTH24G4BA&usg=AFQjCNEosiOI_kGroCnu_PbMMiEINc6XdQ&sig2=oN7LojuVmm7FdrrYyT6uvA&bvm=bv.54176721,d.eWU) has them, but no derivations as far as I know.",2013-10-16 17:52:30.277 +104505,57611,20456.0,CC BY-SA 3.0,,"Actually, the HR may substantially depend on age: older people have more comorbidities and this could influence treatment they had been given. How should I proceed? I thought to add the variable age as a covariate in the Cox regression. Correct?",2013-10-16 18:01:12.467 +104506,57608,20473.0,CC BY-SA 3.0,,"You may want to look up ""choice-based modeling"".",2013-10-16 18:04:49.757 +104507,57633,2081.0,CC BY-SA 3.0,,[This](http://stats.stackexchange.com/a/36158/3277) might be of some relevance.,2013-10-16 18:09:23.460 +104508,57446,4320.0,CC BY-SA 3.0,,"I think you mean you can calculate $P(O|HMM1)$ using the forward algorithm. If so, then yes $P(HMM1)$ would just be the number of sequences where the event you're trying to predict occurs divided by the total number of sequences. You should have $P(HMM1) + P(HMM2) = 1$ since you would be assuming each each sequence is either generated by HMM1 or HMM2.",2013-10-16 18:10:21.637 +104509,57637,,CC BY-SA 3.0,user30490,I am not sure I follow how to do this. Do you think you could just show the last step of what the likelihood should be?,2013-10-16 18:10:48.570 +104510,57626,5237.0,CC BY-SA 3.0,,"I edited this in the hopes that it might lead to more productive answers for you. Please make sure it is still asking what you want to know & see if you like it. If not, click ""rollback"" to return it to your last edit with my apologies.",2013-10-16 18:18:14.827 +104511,57586,20473.0,CC BY-SA 3.0,,"""Puzzled"" is the best start to learn and really understand anything - at least from my personal experience!",2013-10-16 18:26:51.197 +104512,57570,4656.0,CC BY-SA 3.0,,"Alas, I am not a trained statistician, and yes, I had to supply several assumptions based on the `Hint: for example. a Bernoulli random variable can be used to model a coin with probability of success` $p \in ]0,1[$, that is in the problem statement. To me, the natural extension was to a geometric random variable. I agree with you about the merits of the question; indeed, even the notation is quite dreadful. $\pi_0$ and $\pi_1$ as the prior probabilities of two hypotheses is common usage, but just $\pi$? with all the confusion likely to ensue when normal distributions are encountered?",2013-10-16 18:30:04.127 +104513,57626,5237.0,CC BY-SA 3.0,,"I don't know the answer to your question, but I think you may be able to get helpful responses now. GL",2013-10-16 18:30:14.360 +104514,57636,20062.0,CC BY-SA 3.0,,don't you want ...to determine which VARIABLE give the greatest segregation between the yes and no samples? (instead of correlation),2013-10-16 18:34:58.897 +104515,57636,20062.0,CC BY-SA 3.0,,Data frame with 98 observations and 107 variables sounds like simple table (no correlation matrix). 
Please clarify!,2013-10-16 18:37:26.767 +104516,57632,16046.0,CC BY-SA 3.0,,what is $f$ exactly? If it is the pdf I don't understand how can its domain be a vector of variable size of random variables.,2013-10-16 18:41:39.093 +104517,57640,668.0,CC BY-SA 3.0,,"You seem to state that you know the vectors $(a_k)$ and $(b_k)$ up to permutation. (To maximize their inner product you put both sequences in ascending order; to minimize their inner product you put one in ascending order and the other in descending order.) But I'm not sure that's what you're saying, because if you *do* have this kind of information then you can easily compute both the means and the SDs, yet you specifically remark that you have the means and yet cannot find the SDs. Could you please edit the question to clarify exactly what information you really have?",2013-10-16 18:57:24.100 +104518,57632,3580.0,CC BY-SA 3.0,,"@Naji it is the pmf of $\mathcal P$, i.e. a mass function on the space of partitions. The domain is the set of **partitions of** $\{1, 2, ..., N\}$. In probability-speak, $\mathcal P$ is a random element (rather than a random variable or vector).",2013-10-16 19:02:31.827 +104519,57633,16746.0,CC BY-SA 3.0,,Thanks! Still it is not clear to me why Euclidean distance can not be used as similarity measure instead of cosine of angle between two vectors and vice versa?,2013-10-16 19:08:03.713 +104520,57641,21243.0,CC BY-SA 3.0,,Would you mind linking to the paper? It might help us better understand the context.,2013-10-16 19:12:31.880 +104521,57640,22767.0,CC BY-SA 3.0,,"I think I can not put the sequences ascending or descending, because although I know for instance 80% of $a_k$ have the same vale of $a_1$, still I don't know the value of $a_1$. Do you think I can calculate SD with this condition?",2013-10-16 19:28:19.770 +104522,57578,16144.0,CC BY-SA 3.0,,@Glen_b How do you orthogonalize these predictors?,2013-10-16 19:58:55.343 +104523,57616,5448.0,CC BY-SA 3.0,,"Do you want to be guided on a journey to figure out the answer or would you prefer to just be given the answer, along with an explanation of why it's the answer?",2013-10-16 20:05:48.023 +104524,57640,668.0,CC BY-SA 3.0,,"Given that you do not know the values of the $a_k$ nor of the $b_k,$ you will have to find extreme values that are consistent with what you do know. Your information includes the mean and the fact that the $a_k$ are non-negative, but you haven't given us any information about the possible values of the $b_k$ nor about the possible upper bounds on the $a_k$: both of these are essential for bounding the inner product.",2013-10-16 20:39:20.010 +104525,57644,668.0,CC BY-SA 3.0,,"Your distribution does not look Normal: it is clearly left-skewed. Without information about the details of the calculations giving the results, it really is not possible to answer this question.",2013-10-16 20:47:24.663 +104526,57653,,CC BY-SA 3.0,,"Is TP true positive, FP false positive, Tn true negative and FN false negative? If so please edit your question, abbreviations are typically not uniquely defined.",2013-10-16 20:55:19.263 +104527,57596,668.0,CC BY-SA 3.0,,"Have you noted the nice symmetry of this distribution? 
Because the height at $20-y$ is the same as the height at $20 + 2y$ for $0 \le y \le 20,$ then for small $dy$ the number of families with incomes between $20-y$ and $20-(y+dy)$ for $0\lt y\le 20$ is one-half the number of families with incomes between $20 + 2y$ and $20 + 2y + 2 dy.$ Thus the contribution from each of the latter families, which is proportional to $2 y t,$ must exactly balance the transfer to the former families, who receive an amount proportional to $y/2.$ That makes it obvious what $t$ should be.",2013-10-16 20:56:47.213 +104528,57655,449.0,CC BY-SA 3.0,,"In addition, given the covariates are covarying with X it's best that's it's the response variable and not a predictor.",2013-10-16 20:58:12.440 +104529,57656,5448.0,CC BY-SA 3.0,,Do you know the functional form of $f$?,2013-10-16 21:00:35.193 +104530,57595,22746.0,CC BY-SA 3.0,,"Intuitively yes I would say you lose information. $g (x) = x^2$ is not one-to-one (ie it has 2 values on the domain for each range element). We know that if we are given $Y_0 = k_0, \ldots, Y_n= k_n$, we can get X_n. However, give $Y_0^2 = m_0, \ldots, Y_n^2= m_n$, we cannot derive $Y_0$ as it is not one-to-one. So therefore we cannot get $X_n$. I think this is the answer to the first part. Thanks for the tip.",2013-10-16 21:05:36.253 +104531,57616,22756.0,CC BY-SA 3.0,,"A journey sounds nice. This isn't for a class and the answer is given at the end of the question. I don't care to just know the answer - I already know it! + +I've taken a stats course many years ago, but I didn't appreciate it enough then. I'm trying to remedy that now and really begin to understand the underlying patterns. + +I'd appreciate the help. This particular problem doesn't seem to fit with the others from this section and a proper approach isn't clearly demonstrated (to me) from the text's information on the binomial distribution nor its examples given.",2013-10-16 21:11:19.340 +104532,57635,21864.0,CC BY-SA 3.0,,"In my case, I cannot assume that my distribution is normal or is not, and the outlier detection should work for all cases. In fact, I only have an idea about what data set to run the outlier detection on just after the user interaction. The variance test was the most general idea that doesn't require me to have previous knowledge of the distribution of my data. What I understand from your explanation is that I should replace comparing the distance with comparing the statistic, is that what you mean? and what is ""med"" and ""mad""",2013-10-16 21:19:47.480 +104534,57662,5237.0,CC BY-SA 3.0,,"If you think there are too many zeros, you may want to run a zero-inflated Poisson (zip) model, instead of negative binomial, which is more for overdispersion.",2013-10-16 21:47:19.693 +104535,57644,12140.0,CC BY-SA 3.0,,"Right, the distribution doesn't look exactly like Normal. Perhaps, the number of samples (100) is not sufficient. It takes several hours to obtain a single sample. But I certainly see the trend - the more samples, the more bell-shaped the distribution becomes.",2013-10-16 22:17:38.957 +104536,57664,16043.0,CC BY-SA 3.0,,"Why Beta(1/2, 1/2)? This is a bimodal distribution with the weight concentrated around 0 and 1. OP, and future readers, will be interested in explanation about why this is the prior, rather than simply a prior among many alternatives. Some coins could be 1 with probability 0.75, so a prior of Beta(3,1) could represent that... 
or Beta(30,10)...",2013-10-16 22:24:38.273 +104537,57664,633.0,CC BY-SA 3.0,,"@user777: The distribution is over the coin's probability $p$ — it is not binomial, but Beta. OP already said he was familiar with the Jeffreys' prior, but I'll mention it as you suggest. I did say that ""assuming you know nothing to start"", so the Jeffreys' prior is IMO the most reasonable choice.",2013-10-16 22:26:31.577 +104538,57662,503.0,CC BY-SA 3.0,,"More precisely, `SPSS` is worrying about enough to not let the model run, while 'Stata` is telling the user it's a problem but letting the user deal with that. Also, as @gung points out, for too many zeroes, a ZIP model seems more apropos; for overdispersion, negative binomial; for both - ZINB.",2013-10-16 22:31:57.880 +104539,57659,594.0,CC BY-SA 3.0,,When you say there are many zeroes - are there more than would be expected for a Poisson model? Or is it just that the Poisson mean is quite small?,2013-10-16 22:37:08.863 +104540,57653,503.0,CC BY-SA 3.0,,"Any measure that did *not* give your classifier a 0 would be highly suspect. Your classifier is not doing anything. In a sense, no estimation of your classifier is needed other than the fact that it is not predicting any positives.",2013-10-16 22:38:26.050 +104541,57659,15827.0,CC BY-SA 3.0,,Just adding 0.5 strikes me as a fudge at best; do you have literature or theoretical support for that?,2013-10-16 22:45:27.210 +104545,57644,668.0,CC BY-SA 3.0,,You already have enough samples to demonstrate a significant departure from normality. It is unlikely additional samples will change the shape of the distribution appreciably.,2013-10-16 22:57:56.263 +104546,57656,16039.0,CC BY-SA 3.0,,"Well.. I don't know for sure, but I was planning on fitting either a linear or quadratic function of the fitted values.",2013-10-16 23:02:17.213 +104547,57665,503.0,CC BY-SA 3.0,,It is a good idea to spell out acronyms and abbreviations.,2013-10-16 23:09:59.053 +104548,57668,4656.0,CC BY-SA 3.0,,What does $Y|X$ mean? There is a _conditional_ distribution of $Y$ given the value of $X$ that uses notation like $f_{Y|X}(y|x)$ but $Y|X$ is not a random variable.,2013-10-16 23:20:27.863 +104549,57668,9175.0,CC BY-SA 3.0,,I am sorry that was supposed to mean $f_{Y|X}(y|x)$,2013-10-16 23:25:03.473 +104550,57659,5045.0,CC BY-SA 3.0,,Some [useful references](http://stats.stackexchange.com/a/38588/7071) on the why.,2013-10-16 23:32:43.723 +104551,57661,15827.0,CC BY-SA 3.0,,"Is this mainly about what makes sense statistically or do you seek Stata support? If the latter, it is arguably off-topic here (and would not be well received on SO, but you do not show a programming problem). In either case, please explain ADL and please don't assume that abbreviations used in your field make sense to all readers.",2013-10-16 23:33:10.800 +104552,57664,633.0,CC BY-SA 3.0,,"Added the proof for the likelihood. For the proof of the prior, you'll have to read Jeffreys' paper.",2013-10-17 00:01:49.237 +104553,57669,594.0,CC BY-SA 3.0,,In what context does this arise?,2013-10-17 00:03:17.307 +104554,57665,19325.0,CC BY-SA 3.0,,"@PeterFlom Thanks for the suggestion, I've added the full spelling of some terms.",2013-10-17 00:06:17.080 +104555,57653,5203.0,CC BY-SA 3.0,,"@PeterFlom, well said! 
I think the OP got hung up on precision/recall definition of F measure, which gives you an undefined (0/0) answer.",2013-10-17 00:06:21.953 +104556,57669,1145.0,CC BY-SA 3.0,,I'm playing around with the idea an index of 'stability' or 'solidity' of experimental evidence that might be based on how susceptible a result is to change upon the addition of an extra datum.,2013-10-17 00:11:57.130 +104557,57671,1145.0,CC BY-SA 3.0,,"Thank you very much. I can now screw up the many pages of scribbles that I have made. (Some bits are similar to yours, but certainly not all of them!)",2013-10-17 00:13:20.247 +104558,57644,12140.0,CC BY-SA 3.0,,I'm less concerned about getting exact Normal distribution. What's puzzling is general bell-like shapes that I'm getting each and every time. And its coming from a process that is unlike any random distribution. So the question is really if I can disprove that the process obeys CLT.,2013-10-17 00:16:20.033 +104559,57671,503.0,CC BY-SA 3.0,,Wow. That is a lot of LaTeX! :-),2013-10-17 00:31:49.720 +104560,57668,10684.0,CC BY-SA 3.0,,"I think the OP probably means ""Prove that $Y/X$ and $X$ are independent"" rather than ""Prove that $Y|X$ and $X$ are independent.""",2013-10-17 00:43:33.723 +104561,57480,2121.0,CC BY-SA 3.0,,I think that would be appropriate - you would be essentially adjusting your weighting scheme so you fall into situation #1.,2013-10-17 01:12:18.440 +104562,57673,449.0,CC BY-SA 3.0,,"What have you done to try to address the problem? What you're asking implies you don't know what *t*, *p*, or the test are. It's hard to tell exactly where you're confused. Please expand your question.",2013-10-17 01:28:22.170 +104563,57662,5045.0,CC BY-SA 3.0,,Here's a [simulation paper](http://personal.lse.ac.uk/TENREYRO/ppml-fsr.pdf) where the Poisson quasi-MLE does very well when there are lots of zeros.,2013-10-17 01:50:10.727 +104564,57669,594.0,CC BY-SA 3.0,,Sounds a bit like an influence function or an empirical influence function,2013-10-17 01:50:56.050 +104565,57676,594.0,CC BY-SA 3.0,,"The effort involved in your approach is far greater (and unless special care is taken, it's more likely to suffer numerical stability issues).",2013-10-17 02:50:06.297 +104566,57674,594.0,CC BY-SA 3.0,,"+1 Good answer, though I'd add the minor point that in both cases, *equality* corresponds to rejection as well -- and while it's not an issue with continuously-distributed test statistics, it matters when they're discrete.",2013-10-17 02:51:44.447 +104567,57679,,CC BY-SA 3.0,user30490,But $v\sim\text{Inverse-Gamma}$? I have $$v^{-p/}=v^{-p/2+1-1}=v^{-(p/2-1)}-1$$,2013-10-17 03:39:01.900 +104568,57685,22507.0,CC BY-SA 3.0,,The problem you discuss (representing 4d points as 2d points) is called **dimensionality reduction**. Google or read wikipedia articles about the topic.,2013-10-17 04:25:04.057 +104570,57359,594.0,CC BY-SA 3.0,,Details included. Sorry it took some days.,2013-10-17 06:05:43.990 +104571,57672,17328.0,CC BY-SA 3.0,,"There are many competing ways of expressing Inverse distributions. Accordingly, if you fail to provide the functional forms you are using, there is nothing 'clear' about the above. 
The definition I use is that if $X$~$Gamma(a,b)$ with pdf $$f(x) = \frac{x^{a-1} e^{-\frac{x}{b}}}{b^a \Gamma (a)}$$ then $1/X$ ~ $InverseGamma(a,b)$.",2013-10-17 06:06:43.790 +104572,57578,594.0,CC BY-SA 3.0,,Outline discussion [here](http://stats.stackexchange.com/questions/72626/how-to-include-a-linear-and-quadratic-term-when-also-including-interaction-with/73042#73042),2013-10-17 06:08:16.740 +104574,57688,594.0,CC BY-SA 3.0,,"One explanation you don't seem to have ruled out is random variation. If you observed a new data set for each, might your decisions have gone the other way? You might like to consider cross-validation",2013-10-17 07:16:10.963 +104575,30862,,CC BY-SA 3.0,,"They usually have sparse representations - you don't need to store $mn$ numbers for a low rank approximation. For example, a rank 1 approximation requires $n+m$ numbers.",2013-10-17 07:26:48.173 +104576,57647,22570.0,CC BY-SA 3.0,,"I think my problem is rather clear - I have the measured distribution of $\bf{v}$ and $\bf{a}$ and from this I'd like to sample a pseudo-random $\bf{v_{rand}}$, that ultimately reproduces the input. I'm well aware of your point on whether what comes out of it is realistic, but that's a different question...",2013-10-17 07:50:10.537 +104577,57635,450.0,CC BY-SA 3.0,,"med is median and mad is the median absolute deviation. Even then, you will find the the quantile of the normal are tilted to the conservative side (for symmetric distributions)",2013-10-17 07:53:15.917 +104578,57647,22555.0,CC BY-SA 3.0,,"At the very least, as indicated in the equation above, this would not be a stationary effect. I would think that a first step would be to bin the readings according to time interval and then compare them. I don't know how many readings you have but this comparison could be run through something like [Pearson's Distribution](http://stats.stackexchange.com/a/72434/31323) as a starting point - to try to classify the nature of the distribution.",2013-10-17 08:09:13.697 +104579,57446,20470.0,CC BY-SA 3.0,,"Yes, thanks I will give it a try and see how it goes. *P.S:* I calculate $log(P(O|HMM1))$ since I use scaling factors and $P(O|HMM1)$ is out of the dynamic range of the machine.",2013-10-17 08:13:42.407 +104580,57389,5001.0,CC BY-SA 3.0,,"Informed by the above, it becomes: Under the assumption that the true value of the y-intercept is zero, random sampling of the same number of (x,y) pairs, specifically 90, produced by the same process, would result in a least squares best fit line with a y-intercept as extreme as or more extreme than +0.00087, with an expectation of 27 times out of 10000 trials, and equal to or greater than +0.00087, with an expectation of 135 times out of 100000 trials.",2013-10-17 08:25:31.893 +104581,57378,5001.0,CC BY-SA 3.0,,"I think there's an overlooked distinction between the most up voted answer in the purported duplicate, and the answer sought here. The ""dup"" provides an understanding of the p-value, whereas this question seeks an answer that is an example of precise wording of the interpretation of the p-value. A concise answer, such as that in the comments here might be valuable to some practitioners.",2013-10-17 08:35:34.800 +104582,57693,8719.0,CC BY-SA 3.0,,Thank you for your answer! I was surprised to find out that I did a kind-of power analysis without knowing. 
:-) Statistics is more intuitive than I thought.,2013-10-17 08:43:36.630 +104583,57693,4910.0,CC BY-SA 3.0,,"I would say it is often intuitive, but unfortunately hidden by a thick coating of arithmetics and mathematical notation... :)",2013-10-17 08:46:10.767 +104584,57601,22750.0,CC BY-SA 3.0,,"Thanks. It's not quite that simple. I do have l and d pairs as you say, but they are derived from other variables which are integers (I am using a probit model). So l is 1 if g is < 3 (i.e. g=[0,1,2]) otherwise 0, and d is 1 if f=a, otherwise 0). But there are 4 possible combinations of l and g so I could probably do a probit model based on that new variable.",2013-10-17 08:48:18.917 +104585,57550,12683.0,CC BY-SA 3.0,,"What I mean is that ideally you'd pick firms from country A at random until you have enough doing business in country B for the NS group in your sample. If you were to start, say, looking at larger firms first then it should be no surprise to find when you compare NS with S that larger firms appear more likely not to have a subsidiary in country B. If you can't do a random sample for some reason then the compellingness of your conclusions is negatively correlated to the plausibility of such selection biases.",2013-10-17 08:52:55.540 +104586,57692,20062.0,CC BY-SA 3.0,,There are plenty of sources how to do simple correlation of two columns if you just google it...show some research effort,2013-10-17 09:00:20.837 +104587,57692,22784.0,CC BY-SA 3.0,,Correlation would be best in my case ? What about T-student ?,2013-10-17 09:02:34.207 +104588,57507,4910.0,CC BY-SA 3.0,,"Well, the interpretation would rather be that a 95% credibility intervals includes with 95% probability the true parameter (given the model assuptions, of course...). I would say that the 95% credibility interval is extremely useful information, and often the reported end result of a Bayesian analysis (If you do not choose to look at the complete posterior distribution, that is).",2013-10-17 09:07:58.597 +104589,57692,20062.0,CC BY-SA 3.0,,If you mean the student's t.test it compares the means of two groups to look for possible difference. Not your case.,2013-10-17 09:10:29.200 +104590,57651,22762.0,CC BY-SA 3.0,,thank you @fredrikhs for your comments. actually {vars} is good for time-series. how to use this package for the purpose of panels? direct applying doesn't work...,2013-10-17 09:21:39.090 +104591,57698,12282.0,CC BY-SA 3.0,,"Both methods will give exactly the same answer, I'm afraid.",2013-10-17 10:07:43.710 +104592,57640,22767.0,CC BY-SA 3.0,,"My experimental data are sparse, but I might be able to find the standard deviation from the error bars of the experiments. So, I will change the question to see how this expression could be calculated knowing also the SDs. The reply to that would help me too.",2013-10-17 10:08:25.633 +104593,57698,8386.0,CC BY-SA 3.0,,"With this sort of question it is important to consider whether balls are replaced after drawing. Would you a) replace each ball before drawing the next one, or b) for approach 2, replace the balls after each sample of 50, or c) not replace at all?",2013-10-17 10:11:54.710 +104594,57698,22788.0,CC BY-SA 3.0,,"I ran an experiment with both approaches. In approach 2, each sample of 50 is drawn without replacement. But I draw the sample of the next 50 with replacement. Like Pat mentions I found that both estimations yield similar results. 
My follow up question is will it make more sense to draw the sample of 50 with replacement?",2013-10-17 10:17:37.747 +104595,57696,503.0,CC BY-SA 3.0,,"Given that you have 5 dichotomous variables, I am not sure what graphs you created. Can you tell us? Also, even though it's a 5 way interaction, it's only $2^5 = 32$ combinations, so you could look at the predicted value for every combination without it being too overwhelming.",2013-10-17 10:17:51.880 +104596,57662,22775.0,CC BY-SA 3.0,,"Thank you all, I'm running the regressions for the two genders separately. In males the frequency of zeros is 11%, in the females 35%. Adding .5 was suggested to me because when I run the model with up to 3-way interactions in the se s of the estimates for the 2-way interaction terms are larger than the se s of the respective 2-way interaction terms when the model is run with only up to 2-way interactions in. I thought that was to be expected, but to the person who pointed it out it seemed odd because it happens when the higher-order interaction term is significant but also when it is not.",2013-10-17 10:19:30.107 +104597,57698,12282.0,CC BY-SA 3.0,,"Ohh, I hadn't thought about sampling with/without replacement. I just assumed the former. Disregard my earlier comment.",2013-10-17 10:24:52.100 +104598,57698,,CC BY-SA 3.0,,"@ryk I edited your question for consistent notation and terminology, please check if it is still what you wanted to ask (or roll back with my apologies).",2013-10-17 10:39:06.413 +104599,57699,20473.0,CC BY-SA 3.0,,"First you need to carefully define _what the ""effectiveness"" metric will be_.",2013-10-17 10:40:10.783 +104600,57699,,CC BY-SA 3.0,,What is 80' and 90'? Does this stand for minutes?,2013-10-17 10:40:32.033 +104601,57698,12282.0,CC BY-SA 3.0,,"So, at the risk of getting things wrong again, my gut feeling is you should use method 1. My reasoning is if we imagined for a moment that there were only 500 balls in the jar, method 1 is **guaranteed** to get you the exact proportion, while method 2 has a non-zero chance of giving a wrong result. If we increase the number of balls then the maximum error of method 1 will increase (e.g. with 501 balls it might be wrong by $\pm$ 1/501), but unless there's some variance improvement inherent to method 2 I'm not seeing, I suspect it will still retain a slight advantage.",2013-10-17 10:49:31.730 +104602,57696,22787.0,CC BY-SA 3.0,,Thank you for your response. The graphs that I've created were based on the est. marginal means. Not sure if that answers your question but the scores was ranged from 0-2. What do you mean by predicted values though?,2013-10-17 10:50:55.300 +104603,57662,22775.0,CC BY-SA 3.0,,"I should add that this is preliminary data and we're exploring, plus we're not expecting a good fit as the predictors are 6 but we're only fitting up to 3-way interactions. Also, estat gof does not run after glm. I am not running Poisson regression as such (ie poisson), I am running a Poisson under a generalized linear model (ie glm).",2013-10-17 10:59:08.777 +104604,57699,22790.0,CC BY-SA 3.0,,"@AlecosPapadopoulos, patients were followed for several years. I want to compare survival times.",2013-10-17 11:01:23.743 +104605,57699,22790.0,CC BY-SA 3.0,,"@Momo, patients were operated in 1980's and 1990's.",2013-10-17 11:01:56.403 +104606,57696,503.0,CC BY-SA 3.0,,The predicted value is the value that the ANOVA predicts for each combination of values. 
What software are you using?,2013-10-17 11:08:50.757 +104608,57696,22787.0,CC BY-SA 3.0,,"Ah, got it. I'm using SPSS version 21",2013-10-17 11:29:47.450 +104609,57696,503.0,CC BY-SA 3.0,,"Unfortunately, I don't know `SPSS`.",2013-10-17 11:43:35.723 +104668,57729,14806.0,CC BY-SA 3.0,,"Sorry, I just assumed everyone would look at the other post. I should probably have posted it here anyway. Thanks for the reply.",2013-10-17 20:09:35.363 +104610,57644,668.0,CC BY-SA 3.0,,"And the answer is yes, there are ways to disprove the process is exhibiting CLT-like behavior: apply a distribution test to the data you have already collected. Getting ""general bell-like shapes"" is common and often has little to do with the CLT. But, once again, please note that you have not supplied any of the information about your process that would be needed for readers here to give you objective, informed, or relevant advice.",2013-10-17 11:48:07.463 +104611,57696,22787.0,CC BY-SA 3.0,,"No worries, thank you for your help though!",2013-10-17 11:49:51.010 +104612,57616,20470.0,CC BY-SA 3.0,,I would be very interested in reading a detailed answer (with pointers to further reading where necessary) to this question.,2013-10-17 12:00:22.810 +104613,57688,19681.0,CC BY-SA 3.0,,"Each of the link functions is an actual function whose value can be computed. Try looking up the formula for each of your links (for example, the logit is clearly defined on wikipedia), and then graph each function to see its ""physical"" meaning.",2013-10-17 12:15:35.570 +104614,57696,10060.0,CC BY-SA 3.0,,"To get the predicted value, in SPSS Mixed Model panel, after you have specified all the information, Click `Save` button, and check the box `Predicted values`. You should then obtain a new variable after the model is completed. Then you can proceed to see the 32 means, and compare them without treading into the mess of 4- or 5-way interaction terms.",2013-10-17 12:22:02.317 +104615,57477,5643.0,CC BY-SA 3.0,,"The intuition I can't capture is when you say ""...the sum will always be too large"". I need to read through @whuber links, if you can elaborate a bit more it would help. Thank you.",2013-10-17 12:30:57.047 +104616,57705,21762.0,CC BY-SA 3.0,,"If you take differences, you treat the variables as being at least interval scaled. So, if the sample size is not too small, you could as well consider a linear (mixed effects) regression.",2013-10-17 12:31:32.937 +104617,57699,20473.0,CC BY-SA 3.0,,"No it is not a problem that method B was not available. This fact does not affect any inference related to survival times. You are not trying to explain ""why they chose A over B"", but _given the choice_ (for whatever reason), what was the survival time.",2013-10-17 12:37:36.787 +104618,57707,668.0,CC BY-SA 3.0,,"Whenever you are implementing a well-known statistical procedure, it is a *great* idea to compare your results to those produced by working software. When you run your data through a stats package, what does it report? Does it have the same problem or not?",2013-10-17 12:41:07.070 +104619,57696,22787.0,CC BY-SA 3.0,,Just ran the SPSS and have gotten the predicted means. But am still confused as to how I'd compare the means that way.. tq for the help,2013-10-17 12:50:45.413 +104620,57707,21918.0,CC BY-SA 3.0,,"@whuber, yes, that's a good idea.",2013-10-17 12:55:54.880 +104621,57710,21762.0,CC BY-SA 3.0,,Cramérs V is for two nominals. What is bad about regression? 
Take the numeric variable as response and regress it to the nominal (using dummies). Look at the $R^2$ and the associated global F-test.,2013-10-17 13:14:31.763 +104623,57601,12358.0,CC BY-SA 3.0,,"Why can't you just count up the fraction of instances for each of the four cases, i.e. the joint distribution, and then compute the marginal/conditional probabilities when you need them? If you do that, your two ""different"" estimates are identitical.",2013-10-17 13:27:32.103 +104625,57565,668.0,CC BY-SA 3.0,,"Thanks, but that's not an explanation, it's just a reiteration of your assertion (which is nevertheless correct: but repeating it does not make it any more convincing). Because intuitions about randomness are often incorrect, even the most ""obvious"" statements about randomness and probability ought to be justified by appeals to axiomatic principles or established theorems.",2013-10-17 13:33:50.707 +104626,57651,15183.0,CC BY-SA 3.0,,"Can you give an example, what does the data look like?",2013-10-17 14:04:00.433 +104627,57709,22787.0,CC BY-SA 3.0,,"Thank you for your response thaq. Really appreciate it! + +In terms of splitting the data (going with the sex example), I have gotten to the stage where I had split the data to male/female Caucasians/Asians. At this stage though, some main effects were significant, some were not, the interaction effects were mostly non-significant and at times both main and interaction effects was not significant as well. However, some the graphs clearly shows that there is a interaction effect but only at one level of the 3rdvariable. For example: + +Lineup ethnicity x Participant sex at two levels of Lineup Sex",2013-10-17 14:10:38.283 +104628,40104,668.0,CC BY-SA 3.0,,"Provided $\lambda_i$ are not terribly small, continuous approximations to this linear combination ought to work well for computing the CDF (such as a Cornish-Fisher expansion). What can you tell us about the possible values of the $\lambda_i$ and $a_i$?",2013-10-17 14:11:18.140 +104629,57709,22787.0,CC BY-SA 3.0,,"Male Caucasians Lineup: Lineup ethnicity x Participant Sex + - All main effects & interactions are non sig, + +Female Caucasian Lineup: Lineup ethnicity x Participant Sex + - All main effects & interactions are non.sig + +But the graph indicates an interaction effect only for Female Caucasians. Would I then mention, ""There were no reported sig. effects. However the graph indicates an interaction effect for .. (..then explain with the study theory)""?",2013-10-17 14:12:04.293 +104630,57709,22787.0,CC BY-SA 3.0,,"If I'm understanding your response correctly then, at this stage, if I do not have any significant interaction effects. I would calculate the ANOVA and report it (even when there are sig. main effects). The same applies if everything was not sig. (main effects + interactions)?",2013-10-17 14:16:24.180 +104631,57699,22790.0,CC BY-SA 3.0,,"@AlecosPapadopoulos, thanks. That partly confirms my intuition. However, isn't it a problem for estimating propensity scores (PS)? Given that this is an observational study, the goal of PS is to statistically reconostruct the assignment process to methods A and B. And for the first group, all where assigned to A because B was not available. Would not that distort the propensity scores?",2013-10-17 14:22:23.730 +104632,57594,22705.0,CC BY-SA 3.0,,"Let me try expressing my discomfort in a different way. If you only have 2 variables as regressors, you really don't need to do regression. You could just define business rules? 
Its kind of odd to use regression if you have only 1 or 2 variables in the model.",2013-10-17 14:27:12.927 +104633,57710,22795.0,CC BY-SA 3.0,,"Nothing wrong with regression, but as we have already that measure we would like to check it in another way just as double check with a correlation coefficient....thanks for the answer",2013-10-17 14:29:49.223 +104634,57699,20473.0,CC BY-SA 3.0,,"Indeed you are right on that, but now I am confused: I thought that you wanted to compare the two methods in terms of their effectiveness. I understand now that ""effectiveness"" will be an _explanatory_ variable in order to see how it affects the propensity to assign A or B? In that case, the whole first sample should not be considered, in my opinion, because it is irrelevant to the object of study, if the object of study is the comparative construction of PS -""faced with two options how do we decide which one?"". For the first group of patients there was no ""dilemma"".",2013-10-17 14:38:05.323 +104669,57711,5237.0,CC BY-SA 3.0,,"That's true, @Tal, but it's simply due to tradition. That is, it's what people do because it's what people do. There is a legitimate question about whether it is the best way to go about things. Even if you believe the appropriate answer here is ""no"" (which is undoubtedly a defensible position), this is certainly a question worth asking.",2013-10-17 20:11:12.327 +104635,57707,,CC BY-SA 3.0,,"Have you considered the modified newton's method? It was a method built specifically (as far as I know) to deal with running into singular matrix. You simply make an $n$ by $n$ identidy matrix which you subtract from your Hessian to avoid a matrix that does not invert. As for your second question, Newton's method can run into problems that it finds local and not global maximum, so you should try different starting points and see what you get. It can also start moving in one direction (the wrong one) and simply never converge, so starting point selection is very important.",2013-10-17 14:55:11.003 +104636,57711,5237.0,CC BY-SA 3.0,,"Just out of curiosity, are you also the person behind this question: [assessing-approximate-distribution-of-data-based-on-a-histogram](http://stats.stackexchange.com/q/51718/), & this question: [what-is-the-intuition-behind-conditional-gaussian-distributions](http://stats.stackexchange.com/q/71260/)? (The usernames seem similar to me.) If so, would you mind [registering](http://stats.stackexchange.com/help/creating-accounts) your account, & then [merging](http://stats.stackexchange.com/help/merging-accounts) these into 1 account? We're happy to help you, but this makes the site run smoother.",2013-10-17 15:01:44.867 +104638,57707,,CC BY-SA 3.0,,"I found this. I believe that the method that I was suggesting here is the one that they're calling the Levenberg-Marquadt method. Discovery - Unconstrained Optimization 24 So that • ˆH(x) is symmetric p.d. • ˆH(x) is not too close to singular, i.e., its smallest eigenvalue is bounded below by a constant bigger than zero. Popular methods: • Greenstadt’s method: Modify eigenvalues. • Levenberg-Marquardt method: Add a scaled identity matrix • Modified Cholesky Stratigies: Perform Choleskey factorization of the Hessian and modify the diagonal elements",2013-10-17 15:04:10.283 +104640,57616,5448.0,CC BY-SA 3.0,,"Let's consider a concrete, simple example; you have 5 slides from a person who has the pathogen. What is the probability that you fail to correctly identify this person as having the pathogen? 
A hidden assumption is that the presence / absence of the pathogen on a slide is independent of the presence / absence of the pathogen on other slides taken from the same specimen.",2013-10-17 15:28:07.410 +104641,57656,5448.0,CC BY-SA 3.0,,"Is your target variable $y > 0$ or perhaps $\geq 0$? If so, that opens up some options...",2013-10-17 15:30:19.310 +104642,57712,12544.0,CC BY-SA 3.0,,I would add the regression line to them.,2013-10-17 15:36:46.067 +104644,57694,18848.0,CC BY-SA 3.0,,"Thanks Ladislav, I ran the approach and I also confirmed it with the plots and it worked well. Thanks again",2013-10-17 16:29:54.493 +104645,57713,5448.0,CC BY-SA 3.0,,"Nice use of an often-forgotten tool... and, of course, for either of $\lambda_1$ or $\lambda_2 \leq 5$ or so, the brute force convolution method won't be all that painful.",2013-10-17 16:30:04.337 +104646,57616,22756.0,CC BY-SA 3.0,,That would be the probability of obtaining 5 false negatives in a row:,2013-10-17 16:35:23.437 +104647,57720,5237.0,CC BY-SA 3.0,,"When working w/ novices, it's helpful to keep explanations simpler even if you sacrifice a little nuance & accuracy; however, describing what p-values are as telling ""you if there is actually a relationship between the independent and dependent variables"" may be a bit too far in that direction for comfort.",2013-10-17 16:43:32.020 +104648,57720,13549.0,CC BY-SA 3.0,,I appreciate the comment gung. I haven't ever answered a question before on here before. I only ventured to help on this one as I have taught simple linear regression to about 150 biology students in the last year but perhaps I've gotten too used to toning it down. I'll see if there's a way to delete answers.,2013-10-17 16:48:12.447 +104651,57720,22798.0,CC BY-SA 3.0,,What is the difference between the r^2 value and the adjusted r^2 value? Also if you look at the blue chart on the bottom this includes a p-value and adjusted r^2,2013-10-17 17:40:52.457 +104652,57719,5448.0,CC BY-SA 3.0,,"Yes, sometimes the easiest problems are the hardest!",2013-10-17 17:45:20.027 +104654,57641,22631.0,CC BY-SA 3.0,,@LCialdella Alright Ill add a link.,2013-10-17 17:54:39.140 +104655,57641,22631.0,CC BY-SA 3.0,,http://personal.ee.surrey.ac.uk/Personal/P.Jackson/pub/avsp09/HaqJackson_AVSP09.pdf,2013-10-17 17:55:24.957 +104656,57660,5203.0,CC BY-SA 3.0,,Glad I could help. Keep @PeterFlom's advice in mind though--the problem here isn't the evaluation method; it's the classifier.,2013-10-17 18:26:32.757 +104657,57668,9175.0,CC BY-SA 3.0,,@Flounderer was right. The question was about proving $\frac{Y}{X}$ and $X$ are independent which was straightforward.,2013-10-17 18:35:51.660 +104658,57595,17573.0,CC BY-SA 3.0,,"$F=G$ is probably not what you want, not least because it's not true. You do want the law of iterated expectations, though, that much you have right. Something about sub sigma algebras . . .",2013-10-17 18:36:25.960 +104659,57675,18767.0,CC BY-SA 3.0,,"That makes sense now, thanks! I don't know why I was so sure that he was using Bayes' theorem...",2013-10-17 18:47:45.863 +104660,57628,17573.0,CC BY-SA 3.0,,I tried to resist but could not. Have you read Arthur Goldberger's wonderful chapter on micronumerosity? 
It's quoted in full in this blog post: http://davegiles.blogspot.com/2011/09/micronumerosity.html,2013-10-17 19:14:32.700 +104661,57718,9245.0,CC BY-SA 3.0,,"If you have a good grasp of what the different foods and environmental conditions in the""universe"" are you can use techniques for ""Positive and Unlabeled Examples."" They do tend to have more of a machine learning than a statistical motivation, though. I believe Charles Elkan has a good paper on the subject.",2013-10-17 19:14:44.023 +104662,57726,5237.0,CC BY-SA 3.0,,"Welcome to the site, @alexhli. If this question were *only* searching for a function or library to do this in Python, it would be off-topic for CV (see our [help page](http://stats.stackexchange.com/help)). However, it's not clear to me whether that's what you are asking (eg, ""*preferably* in Python""). If you have a substantive statistical question about these methods beyond looking for a function, would you edit to clarify it?",2013-10-17 19:15:03.827 +104663,57711,,CC BY-SA 3.0,,"Why do you want to do that? In all of the scientific literature I know, different main and interaction effects of an ANOVA are *not* considered as members of the same family of comparisons. In other words, alpha is controlled for each effect by itself. Multiple comparisons are corrected for only when simple effects are tested.",2013-10-17 19:47:03.590 +104664,57661,,CC BY-SA 3.0,user31629,It is on the latter so I apologize for posting off-topic. By ADL I meant Autoregressive Distributed Lag.,2013-10-17 19:53:03.050 +104665,57729,13037.0,CC BY-SA 3.0,,"You should always include a sample of your data, the exact commands you ran, and the exact error you got. Otherwise it is hard to know what you did (or what you did wrong). NVM - I see you posted the info in the other post.",2013-10-17 20:03:30.553 +104666,57717,668.0,CC BY-SA 3.0,,"Because the ""reflection"" is straightforward to interpret, this question effectively is a [duplicate of questions](http://stats.stackexchange.com/search?q=interpretation+log+add) about interpreting the ""started log"" $\log(1 + Z)$.",2013-10-17 20:03:51.617 +104667,57477,594.0,CC BY-SA 3.0,,"Do you see that $\sqrt{a^2+b^2}\leq a+b$ (with $a, b\geq0$), and you only have equality if one of them *is* $0$? (This is just that the hypotenuse of a right angled triangle is smaller than the sum of the other two sides)",2013-10-17 20:04:40.207 +104670,57729,594.0,CC BY-SA 3.0,,CoG? Could you please expand your abbreviation in-question?,2013-10-17 20:11:37.163 +104673,57715,8414.0,CC BY-SA 3.0,,Could you add some more context to your description? What other questions do you want to ask of your data?,2013-10-17 20:17:48.447 +104674,57727,5448.0,CC BY-SA 3.0,,"Is this self-study or homework? If so, please add the appropriate tag - we'll guide you to an answer, rather than just providing one outright.",2013-10-17 20:19:16.173 +104675,57730,594.0,CC BY-SA 3.0,,"Ah, I get it. You're reading it as ""who is speaking"" is the response? I didn't read it like that.",2013-10-17 20:20:14.643 +104676,57729,14806.0,CC BY-SA 3.0,,Center of Gravity. It's measure of where the greatest energy concentration occurs over a given frequency in speech.,2013-10-17 20:22:58.407 +104677,57726,594.0,CC BY-SA 3.0,,"If you were to modify the question to something like ""What is a way or ways to what I want, and is there a python implementation?"" the first part should be sufficiently on topic. 
But then your question would require clarification (you end by asking about exploring, not testing -- those are very different exercises)",2013-10-17 20:24:39.380 +104678,57730,14806.0,CC BY-SA 3.0,,"Just to clarify, the end result ('p' in this case) is going to be a list of p-values for each level, right?",2013-10-17 20:29:47.463 +104679,57730,14806.0,CC BY-SA 3.0,,"Also, this leads to another question: So if we know that CoG and Kurtosis are significant in predicting differences between speakers, does this also mean that there is significant difference between speakers based on these variables?",2013-10-17 20:40:33.583 +104680,57731,,CC BY-SA 3.0,,Do you need an analytical solution or a piece of code?,2013-10-17 20:53:04.240 +104681,57656,16039.0,CC BY-SA 3.0,,"Hmm, no unfortunately not. Out of curiosity, what would those options be?",2013-10-17 20:59:57.487 +104682,57718,13549.0,CC BY-SA 3.0,,"Alex - thanks for this, I've filed it away for future reference. The hope with this is to eventually make a predictive model and the Elkan paper seems like it might help with that, in the later stages",2013-10-17 21:00:17.047 +104683,57730,13037.0,CC BY-SA 3.0,,"@Shakesbeery it is important to note that multinomial regression fits pairwise models (like speaker 3 compared to speaker 1, speaker 2 compared to speaker 1) I highly recommend going through the examples on the link I provided to gain a greater understanding of the output.",2013-10-17 21:02:02.490 +104684,57725,13549.0,CC BY-SA 3.0,,Unfortunately I know little about Bayesian analysis beyond what it can be used for. I can always start doing some reading though! I'll keep this open for a few more days to see if I get other suggestions,2013-10-17 21:03:52.427 +104685,57731,20473.0,CC BY-SA 3.0,,"You write ""then the distance between the _first_ and the _second_ point is found"". So even though the points are dispersed in space, still they are indexed $1,...,K$, _before they are dispersed_, and you consider distances only following the index sequentially? Also is $Dth$ one symbol, or a product that we should know something about?",2013-10-17 21:06:12.080 +104686,57656,5448.0,CC BY-SA 3.0,,"The ""family"" parameter in `gam` specifies a link function, as with a generalized linear model; part of that is a specification of a mean-variance relationship. For example, for the Poisson family, variance = mean (with the default link function); for the Gamma, the standard deviation = the mean (with the default link function.) You may still be able to make use of this approach by transforming `y` mildly; the nonparametric nature of the right hand side (at least in your example) means you don't have to worry (much) about functional forms being changed by doing so.",2013-10-17 21:18:22.500 +104687,57718,16588.0,CC BY-SA 3.0,,You're not allowed to sample non-event cases because there could be error in those samples? Is there no possibility of error in the observed events? There's always error. That's why we bother with statistical inference.,2013-10-17 21:28:11.460 +104688,57685,18268.0,CC BY-SA 3.0,,"Thank you, user31264. I did my research, but I was wondering if there is a way to do this without losing information (i.e. having all 100% variance preserved in the modified dataset).",2013-10-17 21:35:59.587 +104689,57718,13549.0,CC BY-SA 3.0,,"No, it was for non-statistical reasons that I was asked to look for another method. Although if nothing else appears to work as well as what I had planned then that may be persuasive enough to let me get my samples. 
I need to find out first though.",2013-10-17 21:40:11.953 +104690,57696,14799.0,CC BY-SA 3.0,,"First, try redefining the levels of the within-subject factors, from absolute to relative: same/different as the subject. This should interchange some main effects and interactions. Second, if the only possible values of the dependent variable are 0/1/2 then some of the interactions might be due to floor or ceiling effects, in which case you might try an ordinal logistic analysis.",2013-10-17 21:44:46.567 +104692,57477,594.0,CC BY-SA 3.0,,"Looked at another way, consider the implied variance. If $a+b$ is the standard error of the difference, the implied variance is $a^2+b^2+2ab$, which for $a,b>0$ is bigger than the sum of the variances. If you accept that $\text{Var}(X-Y)=\text{Var}(X)+\text{Var}(Y)$ then if you say ""add the standard errors"", clearly this results in an additional positive term in the variance that shouldn't be there. i.e. making it 'too large'",2013-10-17 21:57:25.660 +104693,57731,22806.0,CC BY-SA 3.0,,"Thank you both for responding. Momo, I need an analytical solution. Alecos, yes I index the points and after that I model their locations using 2D-Poisson. The distances has no relation with the index. Dth is one symbol denoting the threshold distance.",2013-10-17 21:57:41.010 +104696,57695,594.0,CC BY-SA 3.0,,What are you trying to achieve (i.e. what is the actual problem you seek to ask of your data)?,2013-10-17 22:08:46.223 +104697,57656,16039.0,CC BY-SA 3.0,,"Thanks for your suggestion. That does sound awfully fiddly though, and I feel like I might be better off manually calculating regression weights and iterating.",2013-10-17 22:09:41.830 +104698,57712,594.0,CC BY-SA 3.0,,"""*The negative slope indicates that the values are decreasing together negatively*"". If they 'decrease together' (i.e. one decreases when the other one decreases) they'd have a *positive slope*. You mean that one *decreases* as the other *increases*, which is the opposite of any sense of 'together'. Adding 'negatively' to the end of that doesn't serve to make it less confusing.",2013-10-17 22:27:26.800 +104699,57656,5448.0,CC BY-SA 3.0,,"Yes, that's another idea. It's just that if you can do it easily with the ""family"" parameter, the iteration is handled for you. But if it's going to be work, it might well be better to do it all by hand, well inside a loop at any rate, and get more flexibility.",2013-10-17 22:50:22.950 +104701,57685,22507.0,CC BY-SA 3.0,,I afraid that is not possible. Suppose you have 4 points in vertices of a regular tetrahedron. Their mutual distances will be the same. You cannot reproduce it at a plane.,2013-10-17 23:07:03.190 +104702,57734,10060.0,CC BY-SA 3.0,,"The F-statistics you found there should be useful. Try to calculate the p-value of this ANOVA test. The resultant p-value should also be the p-value of the interaction term in your model 5. Now, you have the p-value and the point estimate, and you know that regression coefficients are tested with t-statistics, and you also know the df is 1... you should be able to deduct what the standard error of the interaction term is.",2013-10-17 23:17:48.530 +104703,57656,16039.0,CC BY-SA 3.0,,"I will have a play around with the 'family parameter' and see if I can get it to work easily for me, but I'm comfortable programming, so it might be easier to get the flexibility like you say. 
Thanks for the help.",2013-10-17 23:27:44.723 +104704,57734,16588.0,CC BY-SA 3.0,,"why not look at the output of `summary(lm(y ~ x1 + x2 + I(x1*x2)))` for the standard error? Also, the `I()` is unnecessary.",2013-10-17 23:48:39.800 +104705,57730,594.0,CC BY-SA 3.0,,@Benjamin I see from a clarification on the identical question posted to reddit that the OP does appear to intend it the way you interpret it. My apologies.,2013-10-18 00:03:16.843 +104706,57729,594.0,CC BY-SA 3.0,,"As I already asked please clarify *in the question*. That is, please edit your question to make the question clear (and even better, fully define 'center of gravity'). Hardly any of us are speech researchers, and not everyone reads all the comments in order to understand the questions, which should stand alone.",2013-10-18 00:05:29.260 +104707,57730,13037.0,CC BY-SA 3.0,,@Glen_b man...this is probably first time that I have been correct and a mod has been wrong. My life has gotten so much better!,2013-10-18 00:07:03.233 +104709,57731,20473.0,CC BY-SA 3.0,,"It seems there is an internal consistency problem with your formulation. Denote $d_{ij}$ the distance between points $i$ and $j$. Assume we are at point 3. Assume first that points 1 and 2 are in different groups. It may be the case that both $d_{13}$ and $d_{23}$ exceed the threshold. In which group should point 3 be placed, since it is eligible for both existing? Maybe the answer is that we measure the distance of each point only from its predecessor? (i.e. we measure only $d_{23}$? (CONTINUED)",2013-10-18 00:29:51.707 +104710,57731,20473.0,CC BY-SA 3.0,,"(CONTD)...But if this were the case, then assume that points 1 and 2 are in the same group: in order to determine whether point 3 should join them or be placed in a separate group, we must measure both $d_{13}$ and $d_{23}$. So we arrive at the internal consistency problem: if for each point we must measure the distance from all its predecessors, then indeterminacies may arise (case in previous comment). If we are to measure the distance only from its predecessor, then we cannot apply the criterion stated for joining a group. What are your thoughts on that?",2013-10-18 00:33:43.480 +104711,57685,18268.0,CC BY-SA 3.0,,"Okay, a little more reading have clarified this. Thank you.",2013-10-18 00:36:54.490 +104712,57717,22800.0,CC BY-SA 3.0,,"I'm not sure the question referenced as 'answered' was sufficient.Suggestions were made to use other transformations or a GLM framework, but the author specifically asked about the interpretation of 'beta' in terms of percentages.I did not find a specific yes or no confirmation in the comments regarding the correctness of interpretation. My question differs in adding a reflection, and specifically I'm interested in a reverse transform of 'b1'to get a 'units' interpretation (via exponentiation)which is a different question.I apologize if it is glaring before me but I have not found an answer.",2013-10-18 00:57:24.423 +104713,57666,15583.0,CC BY-SA 3.0,,Any suggestion of how to initialize the `lbfgs_malloced` array?,2013-10-18 01:02:27.167 +104714,57730,594.0,CC BY-SA 3.0,,"Sorry, but that's still to come -- I'm not a mod.",2013-10-18 01:10:43.510 +104715,57738,594.0,CC BY-SA 3.0,,"Analyses in R should be able to manage just fine with variables coded M/F and L/R; it has probably already made them factors (check by something like `is.factor(gender)` or even just use `str()` on them); if not, try `as.factor()`. 
You could analyze this data using ANOVA, yes",2013-10-18 01:20:16.507 +104716,57738,5237.0,CC BY-SA 3.0,,"*Note, although this question mentions using R, I see no reason to think it is off-topic. The main question seems to be ""is an ANOVA appropriate"", which is clearly on-topic & has nothing to do with R.*",2013-10-18 01:25:28.937 +104717,57738,594.0,CC BY-SA 3.0,,@gung I agree -- and have highlighted the main question to make it clear that an on-topic question is being asked.,2013-10-18 01:27:16.880 +104718,57733,2081.0,CC BY-SA 3.0,,"I don't understand your notation $\rho_y$ and $\rho_y$. The formula you give should probably be meant $\beta_x = b_x(\sigma_x/\sigma_y)$. This formula of converting _b_ in _beta_ is valid also when there is no intercept; but then $\sigma_x$ and $\sigma_y$ must be standard deviations not from the _means_ but from 0, - they are _root mean squares_ then.",2013-10-18 01:27:33.247 +104719,57738,5237.0,CC BY-SA 3.0,,"Because each individual contributes two data, you may want to use repeated measures ANOVA here. If you are less familiar with that, you could try using difference scores.",2013-10-18 01:31:45.077 +104720,57664,633.0,CC BY-SA 3.0,,"@user777: I think that for any particular distribution $d$ of a continuous exponential family, there is a sample space transformation that makes it so that $d$ is uniform over the new sample space. In other words, this intuition about something being uniform, multimodal or unimodal is probably irrelevant. Specifically, $\textrm{Beta}(\frac12, \frac12)$ is unimodal if you were to transform the sample space to be the space of log-odds.",2013-10-18 01:40:39.850 +104721,57741,,CC BY-SA 3.0,,"I have voted to close as this question solicits either opinions (person A's good writer need to be everyone's good writer), or list like answers; there is no true answer. Such questions are not a good fit for *any* [se] site. Hopefully this explains why people are voting to close?",2013-10-18 02:06:49.557 +104722,57734,594.0,CC BY-SA 3.0,,"@ndoogan +1, but the `*` operator in R formulas includes the main effects, while `:` represents the interaction-only. That is the formula `y ~ x1*x2` is equivalent to `y ~ x1 + x2 + x1:x2`",2013-10-18 02:25:38.440 +104723,57666,22143.0,CC BY-SA 3.0,,"Since the documentation notes *A user does not have to use this function for libLBFGS built without SSE/SSE2 optimization*, I assume you want to have SSE optimization? Hmm, I have not explored it.",2013-10-18 02:26:31.223 +104724,57733,22340.0,CC BY-SA 3.0,,@ttnphns Please see edits for better clarification,2013-10-18 02:27:32.197 +104725,57734,594.0,CC BY-SA 3.0,,"`?confint` will generate one from the model (though it's easy by hand from the output, as ndoogan suggests).",2013-10-18 02:27:49.497 +104726,57733,2081.0,CC BY-SA 3.0,,It'd be nice to follow more standard notation: $b$ = regr. coef; $\beta$= standardized regr. coef; $r$ = empirical correlation etc.,2013-10-18 02:36:06.213 +104727,57729,14806.0,CC BY-SA 3.0,,"@Glen_b Well you don't need to be a speech researcher to know the difference between _in question_ and _in the question_... Hehe, but jokes aside, I will clarify my post. Thanks.",2013-10-18 03:11:24.543 +104728,57746,594.0,CC BY-SA 3.0,,"You might like to draw a plot of f vs t first, to see what's going on, though for the second one you'll have to look at it as a process approaching a limit.",2013-10-18 03:56:37.897 +104729,57745,2081.0,CC BY-SA 3.0,,P.S. 
There is a good short overview about ordinal variable approaches in Jeromy Anglim's blog http://jeromyanglim.blogspot.ru/2009/10/analysing-ordinal-variables.html,2013-10-18 03:58:37.860 +104730,57628,18198.0,CC BY-SA 3.0,,"Your initial comment about resisting commenting seems to suggest that you think my research into ridge and other regularization methods is misguided? I have read about micronumerosity in the course of my research. Unfortunately I can't get any more data for a given day, but I can add data in the bayesian sense by building priors based on previous days observations. Could this be used to address micronumerosity?",2013-10-18 04:03:32.283 +104731,57710,2081.0,CC BY-SA 3.0,,"You haven't said anything specific about your ""numeric/ordinal"" variable. _What_ makes you to pose it ordinal? numeric?",2013-10-18 04:15:18.160 +104733,57746,594.0,CC BY-SA 3.0,,Is this for some subject?,2013-10-18 04:48:01.220 +104734,57737,13846.0,CC BY-SA 3.0,,My main questions (the two in bold) cannot be addressed using `anova()`. I want to see whether the log-odds in each of the conditions is significantly different from 0.,2013-10-18 04:51:29.987 +104735,57683,22763.0,CC BY-SA 3.0,,"You're right, I've edited my question. The real question though, is can I use the new $\mu_\text{sum}$ and $\sigma_\text{sum}$ to compute a CDF using the sum of three specific samples. I don't think this works but I wanted to be sure.",2013-10-18 05:37:16.847 +104736,57683,594.0,CC BY-SA 3.0,,@mikepk are you asking whether you can use *sample* means and standard deviations to compute the *population* distribution function? Or are you trying to find the ECDF of the summed sample values? Or something else? What are you actually trying to achieve in the end?,2013-10-18 05:40:35.200 +104737,57744,12140.0,CC BY-SA 3.0,,"I certainly didn't expect to see that results have bell-shaped form. I discovered this property accidentally, and now want to explore it further, because it's very helpful with what I'm doing.",2013-10-18 05:46:55.213 +104738,57331,594.0,CC BY-SA 3.0,,"Your notation is now confusing and contradictory. I suggest saying ""Let $S=X+Y+Z$,"" replacing ""$\text{sum}$"" with ""$s$"" in subscripts and $F_x(x_x + x_y + x_z)$ with $F_S(x + y + z)$. If you agree that is what you're asking, then your question would make sense, and the short answer is ""yes, that's what you do"".",2013-10-18 06:02:08.937 +104739,57723,,CC BY-SA 3.0,,Great answer with added bonus of all references outside of the paywall!,2013-10-18 06:04:39.687 +104740,57710,22795.0,CC BY-SA 3.0,,"ordinal beacuse I have a variable coming from a survey test so its range is -4,4, you can also think it as interval but this kind of survey variable are considered mostly as ordinal and the others are numeric, in specific continuous as they are features extracted.",2013-10-18 07:49:03.560 +104741,57738,21762.0,CC BY-SA 3.0,,"@gung: Since it is about two trials, its probably not a paired design. This should be clarified by the OP.",2013-10-18 07:51:19.850 +104742,57711,21762.0,CC BY-SA 3.0,,"You are doing three tests, so its $\alpha/3$. (I'd prefer the less conservative Bonferroni-Holm correction, which is almost as simple to apply). @Tal: Setting a global error in modelling would successfully avoid people doing p-value based variable selection...",2013-10-18 07:55:53.477 +104744,57727,21762.0,CC BY-SA 3.0,,The answer to the second question is yes.,2013-10-18 08:39:15.730 +104745,57753,13846.0,CC BY-SA 3.0,,"This is a great trick! 
It actually (I think) does exactly what changing the reference level does, but without the hassle of having to change the reference levels multiple times for each condition/category. Thanks!",2013-10-18 08:40:41.387 +104746,57741,,CC BY-SA 3.0,,"I like this question, and am disappointed that we won't be seeing any answers to it.",2013-10-18 08:53:26.230 +104747,57745,21762.0,CC BY-SA 3.0,,"One measure of association between an ordinal and a nominal is called ""Freeman's $\theta$"". Unfortunately, I don't have any open access reference at hand.",2013-10-18 09:02:21.013 +104748,57759,594.0,CC BY-SA 3.0,,"In what way(s) were the scores you computed inadequate for your purposes? Without knowing that, we'd likely suggest things with the same inadequacies.",2013-10-18 09:31:41.093 +104749,57758,1927.0,CC BY-SA 3.0,,The trace is invariant under cyclic permutations.,2013-10-18 09:40:12.897 +104750,57758,594.0,CC BY-SA 3.0,,[$tr(AB)=tr(BA)$](http://en.wikipedia.org/wiki/Trace_%28linear_algebra%29#Trace_of_a_product),2013-10-18 09:41:02.940 +104751,57760,22824.0,CC BY-SA 3.0,,Thank you for your help :)! That's exactly what I needed to know.,2013-10-18 09:52:42.727 +104752,57759,14525.0,CC BY-SA 3.0,,I have edited my post to address your questions,2013-10-18 09:53:42.613 +104753,57745,2081.0,CC BY-SA 3.0,,"@Michael thanks, here I found a paper ""A further note on freeman's measure of association"" http://moreno.ss.uci.edu/22.pdf",2013-10-18 10:30:45.237 +104754,57731,22806.0,CC BY-SA 3.0,,"That is why I'm indexing the points before modeling their locations. This is a sequential process, so in your example point 3 will be put in group 1 because it has be tested with point 1 before point 2 regardless to the positions of the points.",2013-10-18 10:51:55.067 +104755,57731,22806.0,CC BY-SA 3.0,,"I find the distance of a certain point with all other points following the indexes. If the distance between the point of concern and another point is higher than the threshold, then I test it with all the other points in the same group of that point. If all the distances are higher than the threshold then I put that point in that group.",2013-10-18 10:53:56.593 +104756,57757,2081.0,CC BY-SA 3.0,,"I don't quite get what you mean by ""maps"", but if what you realy want is to _compare visually_ several frequency (or other contingency) tables, here is two choices among a few: (1) multiple (3-way) **correspondence analysis**; (2) individual-scaling model of **multidimensional unfolding**. Both are quite advanced statistical techniques, so you might prefer easier ways, such as mosaic, paneled, etc. charts showing frequencies.",2013-10-18 11:03:40.767 +104757,57764,,CC BY-SA 3.0,,Related: http://en.wikipedia.org/wiki/No_free_lunch_theorem and http://stats.stackexchange.com/questions/17066/what-is-a-good-resource-that-includes-a-comparison-of-the-pros-and-cons-of-diffe,2013-10-18 11:04:53.450 +104758,57765,15827.0,CC BY-SA 3.0,,"Correlation only makes sense if values are paired. If one or more of your values in sample 1 is not paired with a value in sample 2, then those values can't be used in a correlation. Most of all, if no values are paired, correlation does not apply.",2013-10-18 11:11:08.070 +104759,57765,2081.0,CC BY-SA 3.0,,"@Tania, you have _one_ sample, two variables.",2013-10-18 11:14:13.463 +104760,57763,,CC BY-SA 3.0,,"+1 I have also frequently seen (and perhaps used) “correlational” as a synonym for “observational” or “non-experimental”. 
At this stage, I don't think there is much hope for another, more specific definition to take hold.",2013-10-18 11:21:59.370 +104761,57765,15827.0,CC BY-SA 3.0,,"As @ttnphns points out, ""sample"" is not the right term here. What are paired are values of variables.",2013-10-18 11:22:55.723 +104762,57765,22830.0,CC BY-SA 3.0,,Yes Nick! I am looking for correlation between 2 scales (psy tests). The n of one is a little higher than the n of another. I mean not ALL the respondents who filled up one form (scale) have filled up the other. There are some (very few though) missing...,2013-10-18 11:23:55.653 +104763,57765,15827.0,CC BY-SA 3.0,,"So, you can't use any cases (observations, records) with missing values. Your software should take care of that somehow. (Depending on quite what your scale is, correlation might not be best, but that's another story.)",2013-10-18 11:29:05.127 +104764,57764,22827.0,CC BY-SA 3.0,,"I am using the rapid miner tool, I'm using 10 fold cross validation with stratified sampling. for Naive bayes I check the laplacian correction and for SVM I use a dot kernel and other parameters are all in their defaults, but when I change the parameters and try again, I get same result; naive bayes outperforms SVM",2013-10-18 11:34:57.897 +104765,57765,22830.0,CC BY-SA 3.0,,"That is what I exactly thought: software should have taken care of it!I have already computed the correlation by using SPSS, but I wanted to be sure that it is ok if a few observations are missing in one of the variables, and n (s) are not matching exactly.",2013-10-18 11:43:07.847 +104944,57826,1412.0,CC BY-SA 3.0,,"I used to scratch my head about ""sth"" and eventually realized it is tweeter-speak for ""something"". The other keyboardism I would like to eradicated is ""wanna"" for ""want to"".",2013-10-19 18:40:25.463 +104766,57765,15827.0,CC BY-SA 3.0,,"I don't know what you mean by ""ok"". You have some missing values; whether that implies a bias is impossible for us to say. Sample size $n$ as far as the correlation is concerned is the number of pairs of values included in the correlation; it can only be unclear or misleading to others if you think of this or report it as using different sample sizes.",2013-10-18 11:59:14.103 +104767,57765,15827.0,CC BY-SA 3.0,,"Why remove mathematical formatting? It is cosmetic here, but it does no harm. (We are wasting time trying to make small improvements to your presentation if you keep reversing them.)",2013-10-18 12:01:32.287 +104768,57628,17573.0,CC BY-SA 3.0,,"No, the point of Goldberger's chapter is that multicollinearity is not a problem to be solved via statistical technique. Just like micronumerosity is not a problem to be solved via statistical technique. The standard errors are big because your data don't reveal the thing you are interested in, not because you analyzed them incorrectly. Of course, you can always make the standard errors smaller by bringing in outside information, but then it's the outside information not the data which are identifying the parameter(s) of interest.",2013-10-18 12:02:18.087 +104769,57765,22830.0,CC BY-SA 3.0,,"Nick, would you suggest I rerun the analysis by taking care of the missing values, and make the n (s) equal?",2013-10-18 12:04:04.900 +104770,57600,651.0,CC BY-SA 3.0,,"In the Bayesian approach, the models would be weighted by their marginal likelihood (i.e. 
Bayesian evidence) and any prior placed over the hyper-parameters, so it would be a special case of averaging over an ensemble with a particular method for weighting the models.",2013-10-18 12:20:09.750 +104771,57765,15827.0,CC BY-SA 3.0,,"Sorry, but I am at a loss to know what you did that needs correcting. No statistical software worthy of the name will calculate a correlation from differing numbers of values for the two variables. Perhaps you should show us what commands you used and what output you got.",2013-10-18 12:23:20.933 +104772,57767,15827.0,CC BY-SA 3.0,,"For others I will add that in climatology ""anomaly"" just means deviation from a reference level; there is no implication of anything pathological or very unusual. In terms of the question, who is claiming that this is ""optimal""? What is optimal will depend at least tacitly on a model for the time series, quite apart from any other considerations?",2013-10-18 12:26:35.777 +104773,57752,15563.0,CC BY-SA 3.0,,This page might help: http://stackoverflow.com/questions/6782070/display-correlation-tables-as-descending-list?rq=1,2013-10-18 12:30:48.703 +104774,57763,17573.0,CC BY-SA 3.0,,"That is a very strict definition of ""true experiment."" The populations in medical trials are generally convenience samples, for example, who are then randomly assigned. So, they would be quasi-experiments. Most people seem to mean just ""random assignment"" by experimental.",2013-10-18 12:31:54.270 +104775,57756,17573.0,CC BY-SA 3.0,,"Wikipedia has good definitions of quasi-experiment, natural experiment, cohort study, and observational study. Your study is an observational, longitudinal (or cohort) study. Your study doesn't look quasi-experimental to me---it would be if students were randomly assigned to G1/G2 somehow by nature. A picture of the ""pyramid of evidence:"" http://sourcesandmethods.blogspot.com/2011/05/evaluating-analytic-methods-what-counts.html",2013-10-18 12:33:47.810 +104776,57734,16588.0,CC BY-SA 3.0,,"@Glen_b you are exactly right. However, if the main effects are already present in the formula, the x1*x2 notation will not re-add them.",2013-10-18 13:05:07.263 +104777,57765,22830.0,CC BY-SA 3.0,,Many thanks Nick and @ttnphns! I will have to go back and check where did I go wrong:-) I might bother you again. TC,2013-10-18 13:05:20.420 +104778,57763,503.0,CC BY-SA 3.0,,"@Bill Indeed. Both ""quasi-experiment"" and ""experiment"" are used differently by different people. But this masks problems of external validity with convenience samples.",2013-10-18 13:21:01.520 +104780,57767,8629.0,CC BY-SA 3.0,,"True, my choice of the word ""optimal"" was not optimal. Replaced with ""reasonable"".",2013-10-18 13:28:21.470 +104781,57769,8629.0,CC BY-SA 3.0,,"Thanks, this makes sense for purely additive processes. However, I often see people use this approach when the underlying process is clearly *not* purely additive. See my updated question.",2013-10-18 13:35:26.563 +104782,57477,5643.0,CC BY-SA 3.0,,"ok, I see all the numbers, although the intuition probably lies in geometry, rather than statistics. I get that adding variances would overestimate the standard error, because of the extra $2ab$ term, but why squaring the sum of the variances would give the ""right"" estimate? Yes, this is really a geometry question.",2013-10-18 13:42:38.600 +104783,57762,22601.0,CC BY-SA 3.0,,"Thank you so much for this elaborate response, but I'm afraid that I've stated my problem wrong. 
I am very sure that your post is of use for others and thus vote it up! Thanks!",2013-10-18 13:43:18.037 +104784,57767,22507.0,CC BY-SA 3.0,,"(1) Is the multiplicative part negligibly small? After all, the base level changed by maybe 1 degree of Kelvin, while the base temperature is about 300 K (roughly). (2) Is there any statistical evidence that the seasonal variation shows long term trends, or there is a long term change in seasonal patterns?",2013-10-18 13:47:43.397 +104785,57770,22833.0,CC BY-SA 3.0,,So you just to the 0.56 ignoring the constant + 1 there?,2013-10-18 14:02:46.050 +104786,57586,1693.0,CC BY-SA 3.0,,"I can't find a mistake with your math. I have run a model, though, that fits all the conditions you've described (except that the control variables are *added* all at once, not dropped). This arrangement, the first I've run through the origin, is actually the only one I've used that does create a sign reversal. N is 5k.",2013-10-18 14:18:45.417 +104787,57767,8629.0,CC BY-SA 3.0,,"In my case (tropospheric NO2 pollution levels), the amplitude of the seasonal variation is clearly changing with time (clearly meaning by visual inspection). Which in my opinion shows that the multiplicative part cannot be neglected. As to (2), I'm not sure how to provide this statistical evidence. Any suggestions?",2013-10-18 14:18:47.010 +104788,57680,306.0,CC BY-SA 3.0,,"to my wisdom, this is based on the problem at hand and it is dangerous to use such thumbrules in practice.",2013-10-18 14:21:27.623 +104789,57752,2081.0,CC BY-SA 3.0,,That now looks somewhat more a graph theory than a statistical question (because correlations are not seen as interdependent anymore). Maybe StackOverflow can yield better answers. Some sort of constrained minimal spanning tree...,2013-10-18 14:21:47.040 +104790,57773,2081.0,CC BY-SA 3.0,,"Very nice snapshots, gladden the eye! And the explanation. But - I ask you - please tell @Tania about pairwise and listwise deletion of missings and under what button it is found in SPSS.",2013-10-18 14:29:59.307 +104792,57772,17328.0,CC BY-SA 3.0,,"So what conclusion did you reach based on your simulations? Why not show a diagram of the resulting plot rather than this plethora of code? In fact, I would suggest that you delete everything in your question after the line: `Is Z distributed as well as a Beta-Binomial (with parameters n1+n2, a and b?`",2013-10-18 14:34:16.557 +104817,57509,22703.0,CC BY-SA 3.0,,"Sir, the answer is simpler when we talk about the binomial experiment. The above mentioned texts as we move on never provide the motivation for the distribution. I intend to ask this question on many distributions, individually in fact.",2013-10-18 18:17:28.877 +104818,57586,20473.0,CC BY-SA 3.0,,That's interesting. Can you calculate the two empirical probabilities of eq [6] in the model with the control variables present?,2013-10-18 18:30:55.553 +104793,57699,22790.0,CC BY-SA 3.0,,"@AlecosPapadopoulos, the goal is to compare methods A and B in terms of patient survival. I updated my question. I want to use propensity score matching to deal with the problem that somwhat different patients (in terms of age, gender and so on) are treated by the two methods. For example the newer method B is used on older patients which were not operate at all earlier. 
The question is then, whether balancing data with PS I can use the first group of patients, who were operated some time ago and only with method A.",2013-10-18 14:37:07.730 +104794,57776,21762.0,CC BY-SA 3.0,,Sorry for deleting the former comment. But there was a mistake in it I couldn't correct anymore ;),2013-10-18 14:45:47.540 +104795,57773,10060.0,CC BY-SA 3.0,,"@ttnphns, no problem. Revised.",2013-10-18 14:50:47.387 +104796,57683,22763.0,CC BY-SA 3.0,,"Thanks Glen, I'm probably not asking the right questions. I have a collection of three separate scores on three separate normally distributed ranges. Usually I compare the individual scores against all scores using a percentile (for which I'm using the CDF). + +I'd like to compare the collection of all three scores against all the other possible collections of those three scores. Effectively how does this collection compare to the population of all collections. Like I said in my original question, my stats memory is dim, but I don't think this works.",2013-10-18 14:53:59.933 +104797,57773,2081.0,CC BY-SA 3.0,,"Excellent (yes, really). Those shades, too... I feel like dragging it all to Flickr photostream.",2013-10-18 14:56:15.110 +104798,57773,10060.0,CC BY-SA 3.0,,"@ttnphns, you're too kind. I use a screen capture software called [Snagit](http://www.techsmith.com/snagit.html) to do capture and post-capture touch up (like adding circles and arrows.) It also makes screen video, too. Pretty handy. (Disclaimer: I am not affiliate with this software's maker.)",2013-10-18 15:02:07.507 +104799,57772,,CC BY-SA 3.0,,"I edited your question, please check if it is still correct. If not, you can rollback.",2013-10-18 15:03:51.780 +104800,57699,17740.0,CC BY-SA 3.0,,"The fact most of the A surgeries occurred decades earlier than the B surgeries could imply that you can't really compare the two. The fact the time period is so radically different can cause significant confounding. Other seemingly unrelated treatments and life in general has changed a lot over such a long period, which will also reflect on your survival results. **Don't underestimate this hurdle.**",2013-10-18 15:05:25.407 +104801,57683,22763.0,CC BY-SA 3.0,,"Reading over the original question I think I've even confused myself :). So I have a single sample $x = 300$ that I want the CDF of, so I get a value back like $F_X(300) = 0.6$. So the probability of all random samples being <= 300 is less than or equal is 0.6. Now I have three samples lets say (300, 900, 100) from three *different* data sets (all normally distributed) and I want to do something similar, the probability of three samples being less than or equal to that particular collection of three samples. The more I think about this the more it doesn't quite make sense to me.",2013-10-18 15:21:43.057 +104802,57628,18198.0,CC BY-SA 3.0,,Understood but its not just that the standard errors are large the coefficients themselves tend to unfeasibly large and offsetting. Although these numbers provide the BLUE fit they are pretty much non-nonsensical in terms of real life values there are supposed to represent + I would not trust and prediction made using these extreme values. I do take your point though that the problem would probably disappear if I could just get more data.,2013-10-18 15:41:46.980 +104804,57780,2081.0,CC BY-SA 3.0,,Ordinal predictors? You may try for example: A) Use the predictors as polynomial sets. B) Quantify ordinal into interval via CATREG (categorical regression). 
(_But_: are you really sure your variables must be treated as ordinal and not interval?),2013-10-18 15:46:41.133 +104805,57752,22601.0,CC BY-SA 3.0,,"@ttnphs: a minimal spanning tree is just the thing I don't want, since pairwise correlations imply a complete graph. Nevertheless, you're right that this question might fit the mathematics-site better. Thanks!",2013-10-18 16:09:29.463 +104806,57773,22830.0,CC BY-SA 3.0,,A THOUSAND thanks @Penguin_Knight!!! You are just amazing...you have saved my life:-) I wish ALL teachers/ mentors are like you! And look at these beautiful & amazing snapshots. Awesome...,2013-10-18 16:14:07.997 +104807,57772,21476.0,CC BY-SA 3.0,,"Sorry for not describing the results of the simulations. I could not find a combination of the free parameters for which the two variables $z$ (sum of beta-binomials) and $z1$ (beta-binomial with adjusted $a$ and $b$) have different densities (given the same mean and variance). I edited the text to include this. @wolfies: you are indeed right, the question could stop after the line you mention, I just wanted to point out that I have strong suspicions that the sum is distributed as a Beta-Binomial, but I am looking for a proof.",2013-10-18 16:22:17.270 +104808,57744,1150.0,CC BY-SA 3.0,,@OutputLogic which is great! I'm not sure how much the second part of that question helps answer your question ... it is hard to explain how deterministic and non-deterministic processes interact. Can you talk a bit more about what is confusing for you? (I'm actually working on a paper that tries to explain this to psychology and CS people right now!),2013-10-18 16:24:28.490 +104809,57727,22341.0,CC BY-SA 3.0,,I an sorry I do not know how to add the appropriate tag,2013-10-18 16:53:01.350 +104810,57628,17573.0,CC BY-SA 3.0,,"I understand now. When you have multicollinearity, usually you can make pretty precise comparisons and good predictions as long as you make them ""with the grain"" of the multicollinearity. For example, suppose that X3 is always pretty close to equal to X4 in the data. Predictions at points where X3 and X4 are pretty close to equal will usually have pretty low variance and be reasonable. Similarly, if you want to measure the effect of X3 and X4 rising together by one point, you will get precise estimates. It's only if you try to ask a question the data can't answer that there are problems.",2013-10-18 17:08:04.230 +104811,57628,17573.0,CC BY-SA 3.0,,"To use an example I always use in class, suppose you have a sample of shoe factories with variables for total production cost, number of right shoes, number of left shoes. You can get excellent, plausible estimates of the extra cost from producing an extra pair of shoes and excellent forecasts of how much a shoe factory producing 10000 pairs of shoes will cost to run. But, your estimates of how much it would cost to produce 1000 left shoes and 6000 right shoes will be terrible, because nothing like that ever happens in the data.",2013-10-18 17:10:58.263 +104812,57779,5448.0,CC BY-SA 3.0,,"Just out of curiosity, why are you constrained not to use Stirling's formula?",2013-10-18 17:30:42.150 +104813,57779,,CC BY-SA 3.0,user30602,"I am not constrained - I know how to use it, but I would like to see if there is a simpler solution.",2013-10-18 17:33:54.500 +104815,57503,22703.0,CC BY-SA 3.0,,Thanks for clearing the geometric way of explaining the doubts. I wish to have clarity on the statistical idea. A standard reference with a detailed explanation would be of great help. 
A confusion regarding the scale and the shape exists though !,2013-10-18 18:15:19.307 +104816,57704,22793.0,CC BY-SA 3.0,,That prooved to be an interesting idea but the interleaving zone is still quite problematic (in fact it gives worst results than just applying the ridge classifier directly on the dataset without any modification).,2013-10-18 18:17:22.260 +105034,57895,503.0,CC BY-SA 3.0,,See the `effects` package in `R`,2013-10-20 22:09:20.420 +104819,57758,594.0,CC BY-SA 3.0,,"student; ocram beat me to the punch, with an even more general result (though it's a consequence of the one I mentioned, since you can replace either $A$ or $B$ with arbitrary products and get the result that it's true for cyclic permutations)",2013-10-18 19:00:22.363 +104820,57500,5237.0,CC BY-SA 3.0,,"Here are some links to related threads that should be helpful to read: for **linear** means: [What does linear stand for in linear regression?](http://stats.stackexchange.com/questions/8689/); for a simple example of a **non-linear regression**: [Linear regression best polynomial (or better approach to use)?](http://stats.stackexchange.com/questions/70153/); for what **parameters** are: [Is any quantitative property of the population a ""parameter""?](http://stats.stackexchange.com/questions/63386/)",2013-10-18 19:00:26.273 +104821,57477,594.0,CC BY-SA 3.0,,Sorry to be dense for so long about what you're after; I will add some more detail to my answer.,2013-10-18 19:04:13.100 +104822,57762,2081.0,CC BY-SA 3.0,,"@Ray, thank you for being attentive to spot a lapse.",2013-10-18 19:04:27.970 +104823,57787,594.0,CC BY-SA 3.0,,Which three parameter gamma? I've seen more than one. Do you mean the one that's simply a shifted two parameter gamma?,2013-10-18 19:16:41.623 +104824,57683,594.0,CC BY-SA 3.0,,"@mikepk How do you come to know $\mu_X$ and $\sigma_X$? If you do know them and can assume independence, then I don't see any difficulty with doing what you suggest.",2013-10-18 19:17:28.550 +104825,57790,2081.0,CC BY-SA 3.0,,"This appears technical question, you must be doing something wrong via syntax or menu. I think the question should be moved to StackOverflow.",2013-10-18 19:21:10.373 +104826,57792,2081.0,CC BY-SA 3.0,,Are you speaking of **median** absolute deviation?,2013-10-18 19:24:46.293 +104827,57792,21884.0,CC BY-SA 3.0,,@ttnphns No. **Mean** absolute deviation error.,2013-10-18 19:28:07.663 +104828,57788,15827.0,CC BY-SA 3.0,,I don't think it's a definition or characterisation of descriptive statistics that they aim for minimum loss of information. It's entirely possible to have descriptive statistics that leave out really important detail and that's often a problem.,2013-10-18 19:43:13.417 +104829,57785,12683.0,CC BY-SA 3.0,,"Who considers the first order statistic to be a good initial estimate of a location parameter, & in what context? It would certainly be a very odd one for most situations.",2013-10-18 19:43:34.890 +104830,57749,22817.0,CC BY-SA 3.0,,"Yeah, I could see that it is the median. But I am not being able to prove it",2013-10-18 19:44:47.230 +104831,57749,594.0,CC BY-SA 3.0,,Did you do what I suggested with the derivative? What does it give you?,2013-10-18 19:59:56.197 +104832,57793,1895.0,CC BY-SA 3.0,,"Use Fubini. Then, pause. 
Then, ask yourself why the argument doesn't work, in general, if $X$ can take both positive and negative values.",2013-10-18 20:01:12.267 +104833,57793,594.0,CC BY-SA 3.0,,See discussion of the result [here](http://stats.stackexchange.com/questions/18438/does-a-univariate-random-variables-mean-always-equal-the-integral-of-its-quanti) and [here](http://math.stackexchange.com/questions/64186/intuition-behind-using-complementary-cdf-to-compute-expectation-for-nonnegative) and [here](http://math.stackexchange.com/questions/64186/intuition-behind-using-complementary-cdf-to-compute-expectation-for-nonnegative). I've only heard it expressed as 'the expectation is the integral of the survival function' ... rather than with any particular name.,2013-10-18 20:04:02.450 +104834,57793,594.0,CC BY-SA 3.0,,"Incidentally, for continuous random variables, you can show it in two lines using integration by parts.",2013-10-18 20:14:56.983 +104835,57791,674.0,CC BY-SA 3.0,,For readers who will read this response to the end I would suggest to add a brief take-away message (and to provide appropriate citation if it applies).,2013-10-18 20:30:04.420 +104837,57790,5237.0,CC BY-SA 3.0,,"My first suspicion is that this is not a technical problem with SPSS, but a less frequent result of stepwise selection algorithms (see my answer below). I believe this question is on-topic on CV.",2013-10-18 20:34:57.023 +104838,57586,1693.0,CC BY-SA 3.0,,"I wish I could follow it. With laypeople I say I'm a statistician, but with statisticians I say I'm a...researcher.",2013-10-18 20:37:19.347 +104839,57800,22843.0,CC BY-SA 3.0,,"Good point, but how do we know that the Cov(X,Y) is less than or equal to the product of the standard deviations of X and Y?",2013-10-18 20:38:47.910 +104840,57744,12140.0,CC BY-SA 3.0,,"The purpose of a random seed is to initialize chip place & route algorithm. However, the algorithm itself is deterministic. That means if you run it multiple times with the same seed, you'd get the same result. Meaning of the result is not the efficiency, but the measure of how close it is to the given constraints. So the upper bound means that the algorithm meets or even exceeds given constraints. The lower bound can theoretically any negative value, but practically there is some.",2013-10-18 20:42:10.963 +104841,57790,503.0,CC BY-SA 3.0,,I agree with @gung. This question should stay open.,2013-10-18 20:47:58.973 +104842,57800,19752.0,CC BY-SA 3.0,,"I've never actually proven myself, but some Googling brought up this page: http://www2.math.umd.edu/~ddarmon/teaching/stat400/correlation-proof.pdf",2013-10-18 20:51:04.827 +104844,57779,668.0,CC BY-SA 3.0,,Stirling's approximation follows easily from taking the logarithms of the binomial coefficients and so is perhaps one of the simplest and most natural solutions possible.,2013-10-18 21:03:28.323 +104845,57721,,CC BY-SA 3.0,user31676,"The book, Discrete Multivariate Analysis, by Bishop, Holland, and others, has some techniques for finding patterns in sequences.",2013-10-18 21:06:56.417 +104846,57791,5661.0,CC BY-SA 3.0,,"With -2 votes so far, I think there's not much I can do to save it :) I think the ending, where they all agree with each other, and admit they can use each others methods without worry about each others philosophy, is a 'take-away message'.",2013-10-18 21:45:23.860 +104847,57791,5661.0,CC BY-SA 3.0,,"No citation required. I just made it up myself. 
It's probably not very well informed, it's based on my own (mis)-interpretations of arguments I've had with a small number of colleagues over the years.",2013-10-18 21:46:57.273 +104848,57778,20473.0,CC BY-SA 3.0,,Look up the answer to this question : http://stats.stackexchange.com/questions/72857/derivation-of-conditional-distribution-from-other-two-distributions/72870#72870,2013-10-18 21:56:33.543 +104849,57778,10547.0,CC BY-SA 3.0,,Thanks. This might be stuipid but would it not help to center $Y$ w.r.t $W$ then $Z:=Y-W$ will have expectation $0$ and variance $\sigma_y^2$,2013-10-18 22:09:51.457 +104850,57802,6162.0,CC BY-SA 3.0,,"With `lm()`, you are using $\sqrt{\hat\sigma}$ instead of $\hat\sigma$.",2013-10-18 22:40:00.533 +104851,57778,20473.0,CC BY-SA 3.0,,"Then you should also eliminate $W$ from the second exp which would make $Z$ appear in there too, having again your new integrating variable ($z$) present in the two exp's. You don't gain anything, really.",2013-10-18 22:54:36.663 +104852,57778,10547.0,CC BY-SA 3.0,,"Ah sure, thats what I missed. I already presumed that this would not work... Why should centering solve a integral...",2013-10-18 23:01:59.887 +104853,57779,,CC BY-SA 3.0,user30602,"Ok. I used Stirling's formula, but not by taking the logarithms of the binomial coefficients. Could you show me how?",2013-10-18 23:07:11.503 +104854,57790,22841.0,CC BY-SA 3.0,,"Yes, I am using enter. The reference category is droped the 2nd level becomes the reference category.",2013-10-18 23:21:39.647 +104855,57799,22841.0,CC BY-SA 3.0,,"That was my first guess, but is not the case… The crosstabs look good, I have missing cases but I have 17% of the cases in the level that is dropped. I also tried to change the reference category, the result is the same. I tried to change the measure (ordinal, nominal, scale), the same result...",2013-10-18 23:41:53.497 +104856,57787,5448.0,CC BY-SA 3.0,,"The meaning of ""location parameter"" and ""scale parameter"" is independent of the distribution. Also, what do you mean by ""structure of its density""?",2013-10-19 00:00:53.580 +104857,57796,20991.0,CC BY-SA 3.0,,"thank's a lot for your answer, I want to understand what you did, so the cumulative distribution is the y1, or what is this line y1 <- cumsum(y)*diff(x)[1]. So if my question is stupid but I just start to study r 1 week ago,",2013-10-19 00:26:56.890 +104859,57790,10060.0,CC BY-SA 3.0,,"I need to know more... did you use the `categorical` button to assign a variable to be categorical or did you make a series of dichotomous indicator by yourself and feed them into the model? If it's the earlier, pay attention after assigning the categorical variable, you can actually set your reference group to either the first or last of the coding scheme (if your desirable ref. group is at the middle then you'll need to recode). If it's the latter, then you should not feed a group of dichotomous indicators into stepwise... bad idea. use `block` button to test the whole categorical variable.",2013-10-19 00:39:56.783 +104860,57807,10570.0,CC BY-SA 3.0,,"Replace the nodes of your graph with $Heat_t$, $PlateArea_t$, and $Friction_t$, where $t$ is the time step (0...N). The graph is now acyclic: $Heat_t$ points to $PlateArea_{t+1}$, $PlateArea_t$ points to $Friction_{t+1}$, and so on. The trick that makes this infinitely long network possibly tractable is the assumption that the parameters are the same across time, so all the lines connecting $Heat$ nodes to $PlateArea$ nodes have the same params (and so forth). 
[Page 430 of this](http://www.stat.cmu.edu/~cshalizi/uADA/12/lectures/ch21.pdf) gives a good diagram for such a graph.",2013-10-19 00:43:56.230 +104861,57749,22817.0,CC BY-SA 3.0,,I got the derivative something like this $\sum_{i=1}^{n}(-\frac{y_i-t}{|y_i-t|})$. I can't equate it to zero,2013-10-19 01:03:46.100 +104862,57749,22817.0,CC BY-SA 3.0,,Also for the second question it is $\sum_{i=1}^{n}({|y_i-t|^{\infty}})$ not what you have written,2013-10-19 01:17:26.913 +104863,57749,594.0,CC BY-SA 3.0,,"(i) *you don't equate it to zero*, as I already explained. (ii) I discussed the difference in my answer. I suggest you think carefully about what I wrote, and answer every question I have asked, in comments or in my answer that has not already been responded to. You don't seem to be putting much thought into working with what information you've already been given.",2013-10-19 01:25:33.733 +104864,57746,594.0,CC BY-SA 3.0,,"Note that CV is NOT a 'do my homework' site. Please add the self-study tag, and read its tag wiki info to what your responsibilities include.",2013-10-19 01:26:03.737 +104865,57749,22817.0,CC BY-SA 3.0,,The answer is given by minimum subgradient the one with the least slope which is at the median,2013-10-19 01:29:47.470 +104867,57749,22817.0,CC BY-SA 3.0,,The answer is given by minimum subgradient the one with the least slope which is at the median,2013-10-19 01:35:56.980 +104868,57810,5237.0,CC BY-SA 3.0,,"This is not quite right. A *parametric* regression model need not be a ""straight"" line it simply needs to be a function of a finite number of parameters.",2013-10-19 02:05:46.487 +104871,57810,22705.0,CC BY-SA 3.0,,"Got it, what I meant to explain was that - in parametric regression, we specify the functional form in terms of the # of parameters, which parameter. The simple straight line was an example. Did I get it right?",2013-10-19 02:38:31.123 +104872,57744,1150.0,CC BY-SA 3.0,,"So, why are you assuming that the place & route algorithms have anything to do with the CLT? It has to do with the random sampling of locations and how many starting positions will exceed your output constraints, whatever they happen to be.",2013-10-19 02:50:51.660 +104873,57796,5875.0,CC BY-SA 3.0,,"Yes, y1 is the cdf!",2013-10-19 03:12:10.527 +104874,57815,5237.0,CC BY-SA 3.0,,"Is this supposed to be an answer or a question? Moreover, here are a couple of thoughts: mixed models can be thought of as a limited version of SEM, & I'm not sure you couldn't satisfactorily deal w/ your situation w/ a hierarchical model.",2013-10-19 03:12:55.873 +104875,57815,22705.0,CC BY-SA 3.0,,"I'm getting used to this place. it is an answer - highlighting what Bayesian can do which mixed cant. Yes, mixed is a limited version of SEM. (@gung, second statement- I didn't understand)",2013-10-19 03:15:59.697 +104876,57755,,CC BY-SA 3.0,,"I think ""average dip"" is good enough. 
It doesn't have the dimensions of acceleration, so it's certainly not anything to do with that.",2013-10-19 03:49:02.607 +104877,57787,22703.0,CC BY-SA 3.0,,It refers to the probability density function,2013-10-19 04:42:47.020 +104878,57785,22703.0,CC BY-SA 3.0,,"@Scortchi Whenever an iterative procedure is being used for parameter estimation, as the initial value for the location parameter is generally taken to be either the first order statistic, or a linear combination of the first few order statistic",2013-10-19 04:45:57.787 +104879,57564,19559.0,CC BY-SA 3.0,,"@BenBolker: ha, yes, an error message like that would have been helpful!",2013-10-19 05:31:30.730 +104880,57749,594.0,CC BY-SA 3.0,,"Think about simplifying $\sum_{i=1}^{n}(-\frac{y_i-t}{|y_i-t|})$, (hint: what is $\text{sign}(x)|x|$?) and look again at the plot and my discussion. The slope is actually undefined at the median.",2013-10-19 05:42:26.080 +104882,57812,594.0,CC BY-SA 3.0,,What is the best approach depends on the conditional distribution of $y$; you might be looking at either nonlinear least squares (weighted or unweighted) or generalized nonlinear models. What are the y-values?,2013-10-19 05:53:12.507 +104883,57791,674.0,CC BY-SA 3.0,,"I've seen such dialogue (shorter, though) in the past, and I find them interesting. I was also concerned by the downvotes, hence my suggestion to put a brief summary at the top so as to motivate readers to read the rest of your post.",2013-10-19 05:59:13.547 +104884,57744,12140.0,CC BY-SA 3.0,,"On the contrary, I'm observing certain behavior that looks like CLT, and trying to disprove it. If I cannot disprove it, it has far reaching implication to what I'm trying to do.",2013-10-19 06:44:19.320 +104885,57811,2081.0,CC BY-SA 3.0,,"It is unclear if you want to implement Discriminant analysis or Bayes classifier. DA first extracts the discriminants. Then it classifies (in a manner of a Bayes classifier) using _those_. If you need DA you ought to read more pages (including this site) about it, to stop being so `new to this field`.",2013-10-19 07:11:53.347 +104887,57783,436.0,CC BY-SA 3.0,,"Thank you Enrique, TraMineR seems very interesting I will try it on Monday!",2013-10-19 07:23:37.097 +104888,57798,2081.0,CC BY-SA 3.0,,"You may read [here](http://stats.stackexchange.com/a/30724/3277) that this formula reduces to the formula of the [cosine](http://stats.stackexchange.com/a/36158/3277) similarity, and _r_ [is the cosine](http://stats.stackexchange.com/a/22520/3277) for centered data.",2013-10-19 07:25:25.143 +104889,57784,436.0,CC BY-SA 3.0,,"That is interesting Andy, thank you. I guess it would need some tweaking to be generalized to multiple groups, but I will see if I can come up with something. By the way, if you register a JSTOR account you can read the paper online for free.",2013-10-19 07:28:22.763 +104890,57812,15827.0,CC BY-SA 3.0,,"Much overlap with http://stats.stackexchange.com/questions/59784/regression-for-a-model-of-form-y-axk Your data may be different, but in my experience additive errors are usually implausible for power functions. As in the thread cited, a power function is not best described as even a special case of of a polynomial.",2013-10-19 07:49:41.327 +104891,57812,21762.0,CC BY-SA 3.0,,How do you handle the potentially strong overfitting? By external validation?,2013-10-19 08:20:41.567 +104892,57816,,CC BY-SA 3.0,,The statement for |t|>2 will only be true if the degrees of freedom are large enough. 
Can you provide examples of papers that make a statement like that? Is it possible for you to use the `nlme` package instead of `lme4`?,2013-10-19 08:24:12.387 +104893,57776,22833.0,CC BY-SA 3.0,,what if the item is not income and has small values!,2013-10-19 08:44:39.440 +104894,57816,20120.0,CC BY-SA 3.0,,Is there a way to present confidence intervals (e.g. of the slopes)? Confidence intervals not encompassing 0 => rejection of nil-null.,2013-10-19 08:44:56.793 +104895,57460,2081.0,CC BY-SA 3.0,,"@Ray, your picture is a viable explanation of the sign of a coefficient; it is like picture [here](http://stats.stackexchange.com/a/70910/3277), only 2D. But I don't see how it can explain _suppression_. To show suppression you must show error term because suppression is defined wrt it.",2013-10-19 08:50:26.363 +104896,57319,2081.0,CC BY-SA 3.0,,"@rolando2, let me repeat it again, that coefficient's sign reversal in response to adding a new predictor does not necessarily makes that predictor a suppressor. And vice versa, adding a clear suppressor does not necessarily changes the sign. The title of your question remains ambiguous. Choose: either you ask about sign reversal or about suppressing effect. Or about when suppressing and sign reversal will coincide.",2013-10-19 09:20:52.163 +104897,57791,5661.0,CC BY-SA 3.0,,"@chi, Interesting. Maybe I'll stop worrying about the downvotes, and just work on improving it! But not just yet.",2013-10-19 09:20:55.303 +104898,57752,14799.0,CC BY-SA 3.0,,"I'm not clear on what you want. If you were to check all $\binom{n}{k}$ subsets, would you pick the subset with the smallest sum of squared correlations, where the sum is over the $k(k-1)/2$ within-subset correlations? Do the $k(n-k)$ correlations with the remaining $n-k$ items matter?",2013-10-19 09:21:51.867 +104899,57802,22848.0,CC BY-SA 3.0,,Thanks Stéphane for the correction but it still doesn't seem to work,2013-10-19 09:31:51.913 +104900,57802,,CC BY-SA 3.0,,try looking at the source code: `stats:::logLik.glm`,2013-10-19 09:45:54.467 +104901,57790,22841.0,CC BY-SA 3.0,,"Hi, All my variables were recode to the reference category be 1 and most of my variables have three categories. I am using Enter with the following syntax. LOGISTIC REGRESSION VAR= ""DV"" +/METHOD=ENTER ""IV's"" +/CONTRAST (""IV"")=Indicator (1) [...] +/PRINT=GOODFIT CI(95) + /CRITERIA PIN(.05) POUT(.10) ITERATE(20) CUT(.5). Do you have idea what is going wrong?",2013-10-19 10:05:11.427 +104902,57819,22705.0,CC BY-SA 3.0,,"Thanks for the response @Nick. From my edited question above, it might start to appear why a multiplicative log-log model wont satisfy my requirements. It cannot isolate the interaction effects between each pair of x (which can be isolated in an additive model). Further, a log-log model necessitates that y increases at a slower rate as x tends to infinity (large values).",2013-10-19 10:10:31.763 +104903,57819,15827.0,CC BY-SA 3.0,,"Your original model doesn't (obviously) have interaction terms either; the answer in both cases is to add further terms to the model if justified. Your comment on ""diminishing impacts"" is cryptic, but powers necessarily being positive is not assumed or implied by either model.",2013-10-19 10:13:36.630 +104904,57819,22705.0,CC BY-SA 3.0,,"I am not sure how adding an interaction term to a multiplicative model will help isolate the pair-wise interaction effects. 
Because, its already a multiplicative (has all pairwise multiplicative effects) model.",2013-10-19 10:17:08.653 +104905,57802,22848.0,CC BY-SA 3.0,,I did this but this function just reverse the aic slot from the glm object to find back the log-likelihood. And I don't see anything about aic in the glm function...,2013-10-19 10:20:25.350 +104906,57819,15827.0,CC BY-SA 3.0,,See edits above.,2013-10-19 10:22:34.977 +104907,57819,15827.0,CC BY-SA 3.0,,"A power function can have powers $>1$, so your comment (if I understand your wprding) that $y$ necessarily increases more slowly with $x$ is quite incorrect.",2013-10-19 10:33:24.337 +104908,57812,22705.0,CC BY-SA 3.0,,"How I'm handling over-fitting: Build a model with 75% of data. And, then fit the model with the same set of variables for each additional week of data. And, see how the coefficients for each variable vary across weeks. If the variance of coefficients is not very high, I conclude that the model is stable and responds well to new data/rejecting any fear of over-fitting.",2013-10-19 10:33:32.773 +104909,57812,594.0,CC BY-SA 3.0,,"Sales may tend to be fairly skew (I mean the conditional distribution); you might want to consider a GLM (perhaps a gamma family), with a log-link. Alternatively, transformation may make the distribution less skew.",2013-10-19 10:40:08.717 +104910,57819,22705.0,CC BY-SA 3.0,,"ok, I got it. The diminishing impact is not a concern in the log-log model. But, adding pairwise interaction terms (non log) as indicated in your edited equation, will not solve isolation of the pairwise interaction effects. Because, the product term (∑ J j= 1 b j lnx j) already has the interaction. Intuitively, adding more pair wise products doesn't seem to help isolate the interaction effects.",2013-10-19 10:44:00.027 +104911,57776,21762.0,CC BY-SA 3.0,,Updated the response in this direction.,2013-10-19 10:53:09.070 +104912,57819,15827.0,CC BY-SA 3.0,,"I can't comment in detail on how best to model interaction effects for your data. As before, your question doesn't clearly spell out how you think interaction works or how you would model interaction yourself. If you could be more explicit on that it would help any further comment.",2013-10-19 10:59:58.190 +104913,57819,15827.0,CC BY-SA 3.0,,I should emphasise that I mentioned the multiplicative power function model because it is similar in spirit to what you were thinking about but much simpler to work with. In principle and in practice I have no way of knowing whether quite different models would work as well or better for your data. I support the suggestion from @Glen_b to consider a generalized linear model with log link.,2013-10-19 11:13:22.193 +104914,57790,10060.0,CC BY-SA 3.0,,"The syntax looks fine. One possible reason is that the suspect variable has a very low count in level 1 to begin with. When other predictors were also added, the extra information might have caused the level 1 of the suspect variable to be perfectly predicted (aka perfect collinearity.)",2013-10-19 11:59:22.703 +104915,57802,,CC BY-SA 3.0,,"I suspect this has something to do with LogLik and AIC (which are tied together at the hip) assuming that three parameters are being estimated (the slope, intercept, and dispersion/residual standard error) whereas the dispersion/residual standard error is calculated assuming two parameters are estimated (slope and intercept).",2013-10-19 12:04:37.267 +104943,57834,1412.0,CC BY-SA 3.0,,"Wish I had more ability to upvote. 
One feature not discussed is ""blinding"" and the need to assess the success of randomization and stability of results under resampling.",2013-10-19 18:13:32.363 +105035,57270,674.0,CC BY-SA 3.0,,"Julien, please visit our [Help Center](http://stats.stackexchange.com/help/merging-accounts) to merge your two unregistered accounts.",2013-10-20 22:11:08.787 +104916,57709,22716.0,CC BY-SA 3.0,,"yes, of course your graph might look as if there might be some effects, but if they do not become significant, then they are not reliable (== they are 'not really there'). Often this becomes understandable if you plot the standard deviation/error around your data points. Large variance will prevent something that looks like an effect/interaction from becoming significant. In such a case, it is incorrect to say that 'there was an interaction in the plot'. Rather, you can say that what looks like an interaction was not significant (and should not be interpreted).",2013-10-19 12:04:47.207 +104917,57823,22381.0,CC BY-SA 3.0,,is it possible to have loss less than 0?,2013-10-19 12:35:37.983 +104918,57785,12683.0,CC BY-SA 3.0,,Certainly not true in general. Are you thinking of say a three-parameter Weibull model?,2013-10-19 12:36:49.273 +104919,57823,15827.0,CC BY-SA 3.0,,"This is more a matter of convention than of logic. In practice, at least in my experience, loss functions are functions with zero or positive value. I don't think anything stops anyone calling something a loss function when that something might be negative. It's just like thinking that you want to minimise expenditure, but if you get some net income that is better yet than zero expenditure.",2013-10-19 12:45:35.977 +104920,57749,22817.0,CC BY-SA 3.0,,The answer is when it is at minimum subgradient which is the median point at the point where sign(x)|x| changes sign,2013-10-19 13:26:53.450 +104921,57578,16144.0,CC BY-SA 3.0,,"I used 'orthog' in Stata to orthogonalize X and X^2. I have a couple of questions about the orthogonalized variables: 1) How do you decide if you want to orthogonalize X with respect to X^2, or vice versa? 2) The centered variables have a different effect in my model than the orthogonalized variables. How do I decide which one to chose? 3) After orthogonalization, is there any interpretation possible of the beta values of these variables?",2013-10-19 13:38:13.813 +104922,57749,594.0,CC BY-SA 3.0,,"Wrong both times, I'm afraid. Forget about your problem and actually *evaluate* sign(x)|x| for various values of $x$. Then explain how to simplify your $f'$ equation from a few comments up. Then draw it for a small sample.",2013-10-19 13:57:37.003 +104923,57822,594.0,CC BY-SA 3.0,,"Loss *could* be monetary, but is far more general. You could think of it as something akin to utility, but it doesn't have to be actual utility, or even very much like it. It's just some specifies measure of the 'badness' of the outcome in some sense.",2013-10-19 14:03:26.127 +104924,57749,22817.0,CC BY-SA 3.0,,The gradient is given by sign(x)|x| itself which is undefined at the points where t= the given points.,2013-10-19 14:18:20.963 +104925,57287,5875.0,CC BY-SA 3.0,,I simply don't understand the question...,2013-10-19 14:28:27.773 +104926,57749,594.0,CC BY-SA 3.0,,"Like I said, **actually evaluate sign(x)|x| for various values of x**. What do you notice about sign(x)|x|? Hence, what is x/|x|?",2013-10-19 14:39:35.500 +104927,57826,5237.0,CC BY-SA 3.0,,"Welcome to the site, @teddypicker. 
I have taken the liberty of formatting your question with the $\LaTeX$ that the site affords. Please ensure it still says what you want it to. Also, could you edit your last paragraph? I cannot quite parse those sentences (& what does ""sth"" mean?).",2013-10-19 14:48:29.480 +104928,57826,503.0,CC BY-SA 3.0,,@gung You are a little faster than I am!,2013-10-19 14:56:49.557 +104930,57688,,CC BY-SA 3.0,,Closely related: http://stats.stackexchange.com/questions/20523/difference-between-logit-and-probit-models and,2013-10-19 15:25:44.153 +104931,57790,22841.0,CC BY-SA 3.0,,"True, ""very low count in level 1"". In this analysis I had quasi-complete separation with other variables (very large S.E.). I recoded them and add this ""suspect variable"" as a proxy of one that I excluded. Can I use the odd ratios as they are (without the same reference category that I have in other countries?) and report that in a footnote? If not, Is it too sloppy to drop the variable just for the two countries here I am having problems?Thanks for your help!",2013-10-19 15:30:40.327 +104932,57821,19559.0,CC BY-SA 3.0,,"thanks! This is useful. In my study I have ~1500 observations per condition, so I don't know if I would fit Baayen's criteria. I would like to use confidence intervals if I can, but I usually get my confidence intervals from the output of lmer (calculated through MCMC also) and that is again not an option when my mixed-effects model has the random correlation paramenters between intercept and slopes",2013-10-19 15:34:23.787 +104933,57757,22823.0,CC BY-SA 3.0,,"By ""maps"" I mean 2D data that have geographic interpretation. I'm not a statistician but I think that what you're proposing will help to find some patterns inside these maps and not help me to compare one with another to find their (dis)similarity. Maybe I was not clear enough in my post so I'll update it with some more info.",2013-10-19 15:37:00.157 +104934,57816,19559.0,CC BY-SA 3.0,,"@mark999: sure, for example, see [Vasishth, Brüssow, Lewis, Drenhaus, (Cognitive Science, 2008] (http://www.ncbi.nlm.nih.gov/pubmed/21635350). They only present t-values in the tables and say below Table 5: ""T scores with absolute values greater than 2 are statistically significant."" (page 704).",2013-10-19 15:48:08.580 +104935,57815,3922.0,CC BY-SA 3.0,,"I think this is wrong. Anything Bayesian is just a fancy version of likelihood. If likelihood fails, Bayesian fails. If correlation between errors leads to biased frequentist estimates, no amount of Bayesian computational trickery will rectify that. On the other hand, Bayesian versions of instrumental variables that can deal with correlated measurement error issues are difficult to set up, unless you want to do non-parametric modeling of the error term distribution.",2013-10-19 15:52:09.443 +104936,57821,20120.0,CC BY-SA 3.0,,"@SolLago: I've update the quote, it seems you're in the clear. Do read the paper by Baayen et al. though.",2013-10-19 16:01:53.283 +104937,57828,16174.0,CC BY-SA 3.0,,"@SolLago, I just edited the answer. Its author is Jeff Long (say thanks to him). :)",2013-10-19 16:06:45.403 +104938,57816,20120.0,CC BY-SA 3.0,,"In Table 5 of that paper, they also give the HPD, and it's obvious that **iff** the upper and lower edge of the HPD have the same sign, |t| > 2. (In fact, |t| > 1.) .. So basically, report HPDs.",2013-10-19 16:07:44.050 +104939,57816,19559.0,CC BY-SA 3.0,,"@jona: yes, you are right. 
The problem is that the functions that would calculate HPD intervals (mcmcsamp, pvals.fnc) are all not implemented in R for models with random correlation parameters. So I can't use those and I don't know how to get them otherwise. Maybe I should ask this as a separate question in case anyone has suggestions?",2013-10-19 16:28:11.237 +104940,57615,2149.0,CC BY-SA 3.0,,Yes there is a good method to develop this structure. It is a combibation or ARIMA and regressor variables which can include all that I mentioned above. If tou wish you can contact me at my email address and I will try and give you more details. Just click on my name and you can get my email address. Alyternatively post your data and I will analyse it and show you precisely what can be done. Do not post the cumulative but do post the actual daily values.,2013-10-19 16:37:34.733 +104941,57838,21762.0,CC BY-SA 3.0,,"Thanks@GregSnow for the nice answer. There is one point that needs some attention: According to Gauss-Markov, equal variance is required for BLUE.",2013-10-19 17:20:45.633 +104942,57839,7155.0,CC BY-SA 3.0,,It's not clear what you're asking. If you correct the formatting you're more likely to get an answer.,2013-10-19 17:58:23.697 +105072,57875,21762.0,CC BY-SA 3.0,,Hint: think about t-test's assumpion of independent observations.,2013-10-21 05:49:32.933 +104945,57818,1412.0,CC BY-SA 3.0,,Please don't capitalize R package names if that is not their correct spelling. You have not addressed the key point (sample size) in whuber's response to the question you linked to.,2013-10-19 18:42:11.473 +104946,57775,1412.0,CC BY-SA 3.0,,Forgot to include the beta coeff.,2013-10-19 18:48:10.110 +104947,57842,6162.0,CC BY-SA 3.0,,By the way you have to similarly be careful with the REML/ML option for lme/lmer models.,2013-10-19 18:51:14.350 +104949,57811,22836.0,CC BY-SA 3.0,,"I see, thanks, I was not sure how to convert sentences into numbers...",2013-10-19 19:45:50.083 +104951,57807,22792.0,CC BY-SA 3.0,,"(1) Does this imply that causal systems always do evolve in time somehow? Because a Bayesian Network represents a joint probability distribution over the variables corresponding to its vertices, for example $P(Heat,PlateArea,Friction)$ if we do not take the time $t$ into account and this does not tell us anything about the chronological order. Let's assume that we start this hypothetical machine, wait for $t$ amounts of time and at time $t$ we instantaneously sample the system and get measurements for Heat, PlateArea and Friction.",2013-10-19 20:07:20.547 +104952,57842,17249.0,CC BY-SA 3.0,,(+1) Is it n-1 or is it indeed n-2 in the denominator of $\hat\sigma$ ?,2013-10-19 20:07:29.477 +104953,57807,22792.0,CC BY-SA 3.0,,"(2)Lets assume again that we repeat this experiment (wait $t$ amount of time and sample instantaneously) many times more and we obtain a list of (Heat,PlateArea,Friction) measurements. Assuming all this experiments have been conducted independently, how we can set the causality relationships then? For just at an instant of time, where no chronological ordering exists, I cannot think of a causality relationship to build a Bayesian Network. So, what I am trying to understand is, do we need ""time"" to build Bayesian Networks, then? @StumpyJoePete",2013-10-19 20:14:37.593 +104954,57814,22792.0,CC BY-SA 3.0,,"I have some questions about the usage of time for building a Bayesian Network, I added these as extra comments under my first question. 
@Learnerbeaver",2013-10-19 20:21:45.207 +104955,57725,13549.0,CC BY-SA 3.0,,"Looking into it more, your suggestion would have worked well Greg if I had better prior information. Thanks.",2013-10-19 20:23:37.837 +104956,57842,6162.0,CC BY-SA 3.0,,@PatrickCoulombe No : intercept + slope,2013-10-19 20:28:45.437 +104957,57839,594.0,CC BY-SA 3.0,,"Please check and edit your question. The answer to your title question is 'yes' (if A and B are the right shapes for the products to be square), but the body contains errors and is inconsistent with the title. Which standard references? I expect you've made a copying error in one of those terms. For $n\times 1$ vectors, a common manipulation would go $x^Tx = \text{tr}(x^Tx) = \text{tr}(xx^T)$ or something along those lines. [In my notation $\text{tr}$ is trace.]",2013-10-19 20:52:01.437 +104958,57846,6162.0,CC BY-SA 3.0,,What is the deterministic example you have in mind ? I don't see what could be interpreted as a deterministic analogous of a divergence between two distributions.,2013-10-19 20:57:34.680 +104959,57848,22864.0,CC BY-SA 3.0,,Thanks for the response. That makes sense. I'm looking for a way then to identify the variables that best predict a certain vulnerability score. For example if I know a fisherman fish with a partner is that a strong predictor of where they lie on a vulnerability scale? I find in my data when I go over the raw data that on average fishermen who fish with a partner have lower vulnerability score. Is there an analysis I can do to show that across all variables?,2013-10-19 20:57:58.057 +104960,57848,503.0,CC BY-SA 3.0,,"But you already used that variable in creating the index, so you already know how good a predictor it is. It is as good a predictor as you made it.",2013-10-19 20:59:58.207 +104961,57830,594.0,CC BY-SA 3.0,,"Using the notation of that page, $D_n$ is often regarded as 'the test statistic' (as shown [in this image on the same page](http://upload.wikimedia.org/wikipedia/commons/thumb/c/cf/KS_Example.png/300px-KS_Example.png)), but in the discussion you quote, $\sqrt n D_n$ is being used as a test statistic. Obviously the two statistics would be equivalent (lead to rejection/non-rejection of exactly the same cases).",2013-10-19 21:00:54.413 +104962,57846,22863.0,CC BY-SA 3.0,,"In the deterministic case, I'm thinking of having two non-stochastic vectors of finite dimension and I use any norm to measure how far away they are from each other. Then divide this difference by the norm of either one to get a relative difference. I know I can't think of distributions in the same manner (finite dimensional vectors) but is there an analogous relative divergence.",2013-10-19 21:14:00.237 +104963,57835,594.0,CC BY-SA 3.0,,"You can still estimate the coefficients that way; the parameter estimates are still consistent, but their standard errors *aren't*. Any hypothesis tests, confidence intervals and prediction intervals are not 'valid'.",2013-10-19 21:23:27.223 +104964,57831,594.0,CC BY-SA 3.0,,One rule of thumb: use enough points that it looks like a smooth curve when you plot it.,2013-10-19 21:25:53.143 +104965,57842,22848.0,CC BY-SA 3.0,,"Ok, perfectly clear now. Thanks a lot ! But what do you mean with REML/ML (something to do with my last post on GuR I guess) ? Please explain (there maybe). I want to learn !",2013-10-19 21:26:55.657 +104966,57460,14799.0,CC BY-SA 3.0,,"@ttnphns My definition of suppression needs only the betas. 
However interesting it would be to discuss what definitions of suppression might be most useful for what purposes in what situations, it would also be against Stack Exchange policy, so I guess we're going to have to agree to disagree here.",2013-10-19 21:31:52.047 +104967,57842,6162.0,CC BY-SA 3.0,,"The REML estimates of the variance components in a mixed models are like the ""corrected for bias"" ML estimates. I have not seen your post on GuR yet :)",2013-10-19 22:17:31.163 +104968,57851,594.0,CC BY-SA 3.0,,"The population of 'coins in circulation' and 'coins that appear in my change' would need to be the same, at the right relative frequencies. Why would this be the case?",2013-10-19 22:24:03.710 +104969,57854,21762.0,CC BY-SA 3.0,,"I guess its about 'first order' approximations of something, not about order statistics.",2013-10-19 22:37:45.027 +104970,57785,668.0,CC BY-SA 3.0,,"I wonder, Vani, whether we might have a communication problem here. Most people would understand a ""first order statistic"" to be the minimum (or sometimes the maximum) of a dataset. I hope it's obvious that is a terrible estimate of a location parameter except under very special assumptions such as those described in Scortchi's answer. Would you perhaps have something else in mind here, such as the median? Could you clarify this point?",2013-10-19 22:38:28.783 +104971,57851,1895.0,CC BY-SA 3.0,,Related: [Canadian mint circulation currency](http://www.mint.ca/store/mint/learn/circulation-currency-1100028). You can click through to get the quantity minted for each coin by year.,2013-10-19 22:40:00.603 +104972,57838,2873.0,CC BY-SA 3.0,,"@MichaelMayer, you are right, I remembered wrong. will fix.",2013-10-19 22:42:21.203 +104973,57854,12683.0,CC BY-SA 3.0,,"@Michael: Might be, but something in the phrasing of the comment triggers old memories of reliability analysis & shifted failure-time distributions.",2013-10-19 22:53:40.177 +104974,57785,594.0,CC BY-SA 3.0,,OP: please clarify the meaning of your question. It seems like you must either be misusing some terms or that you refer to a very particular context; in either case you can't hope for a very satisfying answer until the miscommunication is dealt with.,2013-10-20 00:03:55.427 +104975,57855,22865.0,CC BY-SA 3.0,,I believe the 'discrete' description refers to the fact that draws from a Dirichlet process are discrete with probability one (it follows from the stick breaking representation of the DP).,2013-10-20 00:20:01.463 +104976,57855,594.0,CC BY-SA 3.0,,"You're going to have to elaborate. If I break a stick into $k$ pieces in some fashion, the distributions of the stick lengths are continuous.",2013-10-20 00:22:27.917 +104977,57595,5448.0,CC BY-SA 3.0,,"For part 2, does $\text{E}f(x) = f(\text{E}x)$ in general? Also consider the case of a simple random walk on the integers which is currently located at 0.",2013-10-20 01:54:39.177 +104978,57849,5448.0,CC BY-SA 3.0,,Consider any $t < \infty$. Is $\text{Var}(y_{t+1}) = \text{Var}(y_t)$?,2013-10-20 02:04:34.323 +104980,57855,3183.0,CC BY-SA 3.0,,"@Glen_b: Your intuition matches mine, but the paper ankit linked to says ""that draws from a DP are discrete (with probability one)"". I can't follow their argument, but I respect the authors.",2013-10-20 03:02:19.153 +104981,57821,9049.0,CC BY-SA 3.0,,+1 to your answer jona. As a somewhat general comment: I would question the realistic difference of a $t$-distribution with 600+ d.f. and a Gaussian... 
Those $t$-values should look awful lot like $z$-values.,2013-10-20 04:50:19.583 +104982,57744,1150.0,CC BY-SA 3.0,,"I should have been more precise, it has to do with your *initial random seed*. Your routing algorithms, the distribution of solutions, and the limits of the problem space are primarily reflected in the overall shape of the distribution (which happens to be normal in your case).",2013-10-20 04:54:54.363 +104983,57858,22372.0,CC BY-SA 3.0,,"Thanks for the answer, but, another question rises, should the residuals be closer or far to each other? Should I try some manipulation in my variable to fix something I saw, like combining variables?",2013-10-20 05:33:22.490 +104984,57855,594.0,CC BY-SA 3.0,,"@DavidJ.Harris yes, reading up about it, it seems - inconsistently with the way the word 'process' is more usually associated with distributions - to be referring to what I'd have called something like a 'multinomial process' or 'multinomial mixture', since the output is the category. (This naming scheme would be kind of like referring to inter-event times as a 'Poisson process', rather than the count of the number of events as is normally the case, or perhaps referring to a Bernoulli process as a 'beta process' because there was a beta prior on the Bernoulli probability.)",2013-10-20 06:09:09.943 +104985,57861,594.0,CC BY-SA 3.0,,"The use of the phrase 'relative to' there is critical to the meaning. Because of that, it doesn't refer to the population mean but to the difference between the population and sample mean. Beware, however - I see at least one error in that article.",2013-10-20 06:22:19.783 +104986,57648,6204.0,CC BY-SA 3.0,,I'm a little confused about what you are looking for in an answer. A citation to a paper that uses a conditional multinomial logit in a recommender setting?,2013-10-20 06:30:25.223 +104987,57814,22705.0,CC BY-SA 3.0,,"Whether to use time or not should be dictated by your business hypothesis. Just to get some additional clarity, you could view the chapter on probabilistic graphical models by daphne koller in coursera.org. Might help.",2013-10-20 07:05:44.097 +104988,57815,22705.0,CC BY-SA 3.0,,"Interesting point @StasK. Have never thought of correlating prediction errors in a Bayesian hierarchical model. But, am not sure if it's right - to correlate residuals determined by the Bayesian approach and determine if error correlation exists.",2013-10-20 07:13:48.683 +104989,57855,,CC BY-SA 3.0,,"It depends on whether you think a ""countably infinite"" number of real numbers is representative of the real numbers. I would have thought that it is, thus providing an argument against the above claim.",2013-10-20 09:04:02.507 +104990,57856,,CC BY-SA 3.0,,Why is it bad to estimate a density by a discrete distribution? Does this mean quadrature is also bad and inappropriate?,2013-10-20 09:05:14.777 +104991,57833,15827.0,CC BY-SA 3.0,,"This makes out ANOVA to be a testing procedure and regression to be a modelling procedure in which you can carry out tests. But ANOVA also has an underlying model, regardless of whether this is emphasised in all introductory treatments. So, this answer does not capture any difference between them. 
Nor is it addressed at the question, which is why they are taught as different regardless of strong similarities.",2013-10-20 09:45:39.777 +104992,57818,22524.0,CC BY-SA 3.0,,I have edited the question.,2013-10-20 11:56:06.347 +104993,57775,306.0,CC BY-SA 3.0,,extremely sorry for the error have updated the value.,2013-10-20 13:20:05.780 +104994,57784,750.0,CC BY-SA 3.0,,"@nico - I'm not sure what you mean by generalizing to multiple groups. I gave an example for runs in three groups, but the logic applies to more. (The typical runs test in most software I've seen only allows two, but the cited papers establishes the test statistic for multiple groups. That is why I elaborated with an example with 3 groups.)",2013-10-20 13:29:22.973 +104995,57784,436.0,CC BY-SA 3.0,,"Sorry, what I meant is that the test analyses runs of binary events (success/failure), while I have more than two levels in my group variable.",2013-10-20 13:40:29.013 +104996,57870,17740.0,CC BY-SA 3.0,,It's not entirely clear what you are asking. Surely you had a reason to use PCA? What are you trying to learn? Did you use PCA without knowing what it does?,2013-10-20 13:40:37.077 +104997,57870,22872.0,CC BY-SA 3.0,,"Honestly, I was hoping that there would be two or three components and that certain industry sectors would correlated with each other. I don't have a pressing reason to do a PCA, other than curiosity.",2013-10-20 13:46:57.597 +104998,57836,20120.0,CC BY-SA 3.0,,"Note that a confidence interval/CI is not the same as a Highest Posterior Density/HPD interval, or a Bayesian Credible Interval.",2013-10-20 13:50:29.420 +104999,57873,22872.0,CC BY-SA 3.0,,"Can you please tell me more about the red lines on the biplot. Some of them appear to be grouped together. Does this mean anything. For example, energy, resources and materials appear to be going in a similar direction.",2013-10-20 14:06:15.987 +105000,57875,503.0,CC BY-SA 3.0,,"Welcome to the site. If this is a homework question, please add the ""self-study"" tag. See [homework questions](http://meta.stackexchange.com/questions/10811/how-do-i-ask-and-answer-homework-questions/10812#10812)",2013-10-20 14:13:43.400 +105001,57856,7007.0,CC BY-SA 3.0,,"I didn't say it is ""bad"". But suppose that you have good prior information about the smoothness of the random density. You can't use this prior information if you are modelling with the plain DP. That's the kind of thing that I have in mind.",2013-10-20 14:14:13.493 +105002,57874,503.0,CC BY-SA 3.0,,"This certainly looks like a homework question. Please add the ""self-study"" tag. See [homework questions](http://meta.stackexchange.com/questions/10811/how-do-i-ask-and-answer-homework-questions/10812#10812)",2013-10-20 14:14:44.530 +105003,57874,22752.0,CC BY-SA 3.0,,@PeterFlom I've added the tag. It's not homework but part of my exam-preparation.,2013-10-20 14:20:10.577 +105004,57872,9049.0,CC BY-SA 3.0,,"jona, your train of thought is correct but take notice two things: 1. What you described in $non$-parametric bootstrap, not ""simple"" bootstrap; there is an inherent bias-variance trade-off between the two. 2. You need to be a bit careful how you resample your sample. You may accidentally end up missing a grouping especially if you have a lot of clusters. 
That is not ""the end of the world"" and would asymptotically ""not happen"" but this might mess-up your calculation procedures slightly.",2013-10-20 14:34:37.663 +105005,57872,20120.0,CC BY-SA 3.0,,"Regarding 2, I agree - it's just an example that would need to be adapted to the individual model. (You might get by simply sampling from the grouping variables too.) Regarding 1 - I don't understand, can you elaborate?",2013-10-20 14:41:42.403 +105006,57876,503.0,CC BY-SA 3.0,,"Welcome to the site. Since this looks like a class assignment, please add the ""self-study"" tag. See [homework questions](http://meta.stackexchange.com/questions/10811/how-do-i-ask-and-answer-homework-questions/10812#10812)",2013-10-20 14:46:22.830 +105007,57873,12808.0,CC BY-SA 3.0,,I am not very familiar with R and never have I used the vector representation. The R documentation tells the following about the direction of arrows: http://cc.oulu.fi/~jarioksa/softhelp/vegan/html/biplot.rda.html,2013-10-20 14:56:51.470 +105008,57872,9049.0,CC BY-SA 3.0,,"With any bootstrapping technique the simulations get processed just like the real data. With non-parametric bootstrapping (what you described) you resample your original data. With parametric bootstrapping you simulate a new sample based on the original model you fitted. Non-parametric btsp. makes less assumptions but usually has more variance. Parametric btsp. assumes that the model you fit is ""correct"", so it makes more assumptions, but it usually has less variance. Param. btsp. also eliminate issues regarding the resampling. (Cont.)",2013-10-20 15:01:44.250 +105009,57548,10547.0,CC BY-SA 3.0,,"For a more general case: $Y|X=x \sim N(x,\sigma_x^2)$ and $X\sim N(\mu_x,\sigma^2_x)$ I've $\beta = \frac{2\sigma_x^2 + 2\sigma_y^2}{\sigma_y^2\sigma_x^2}$ and $\gamma = \frac{-4\sigma_x^2 y - 4\sigma^2_y\mu_x}{4\sigma_y^2\sigma_x^2}$. It seems that $\int_{-\infty}^0 \text{exp}\{-\frac{x^2}{4\beta}-\gamma x\}dx = \int_{0}^\infty \text{exp}\{-\frac{x^2}{4\beta}-\gamma x\}dx$. With this I'll get smth. like a density multiplied by the $\text{erf}$. But since $\text{erf}$ integrates over $\gamma\sqrt{\beta}$ which depends upon $y$ how am I supposed to find the expected value and variance of $y$?",2013-10-20 15:03:13.473 +105010,57872,9049.0,CC BY-SA 3.0,,"Given the fact you are making parametric assumptions to start with, when you fit your original model you might as well use them and get a better estimate (ie. if you don't believe the model why bootstrap it anyway). You have the correct idea; I just want to highlight though that there is a trade-off between the non-parametric bootstrap you outlined and the parametric bootstrap that `lme4`'s native `bootMer()` function offers.",2013-10-20 15:04:28.200 +105011,57548,10547.0,CC BY-SA 3.0,,"Short: $\text{erf}(x) = \frac{2}{\sqrt{\pi}}\int_0^x \text{exp}\{-\tau^2\}d\tau$. Because $\tau = \gamma\sqrt{\beta} = \text{f}(y,\mu_x,\sigma_x^2,\sigma_y^2)$ the $\text{erf}$ part seems rather complicated.",2013-10-20 15:13:05.240 +105012,57876,,CC BY-SA 3.0,,"Asking if some data “are significant” is not a well-defined problem. If you are doing a test at all (it's not the only way to analyze quantitative data), you would typically test a specific hypothesis, perhaps compare several subgroups or test if there is a particular relationship between several variables. 
You need to tell us what you are trying to find out in this study.",2013-10-20 15:13:41.787 +105013,57850,22092.0,CC BY-SA 3.0,,Thank you for your kind help.I add a figure got from the book Numerical Recipes.There are questions remained.Please take a look at my updated post and forgive my retard questions.,2013-10-20 15:32:59.477 +105014,57876,22874.0,CC BY-SA 3.0,,I am trying to find out which group of teachers uses social media in their classrooms - private versus public,2013-10-20 16:06:24.913 +105015,57879,22874.0,CC BY-SA 3.0,,""" Is it a typo of ""t-test or ANOVA""?"" yes - this was a typo - my apologies - as you suspected it should read ""t-test or ANOVA""",2013-10-20 16:07:43.497 +105016,57879,22874.0,CC BY-SA 3.0,,I am trying to find out if there is a difference in teachers who use social media in their classrooms when comparing public and private school settings; Also am interested to see the relationship between teachers who use social media personally and in their classroom ; In reviewing my notes for this class I believe the professor shared the T test and ANOVA as suggestions to use in determining statistical significance - I am hoping to find a tool which will help me evaluate the data I have collected with regard to accuracy and sample size,2013-10-20 16:14:23.627 +105017,57876,22874.0,CC BY-SA 3.0,,what other tools are used to analyze quantitative data?,2013-10-20 16:15:35.023 +105018,57872,19559.0,CC BY-SA 3.0,,"@jona: Thanks! And thanks for taking the time to explain this as well. I am comfortable with bootstrapping (much more than with mixed effects models anyway) and I usually use the **boot** package in R. And thanks to user11852, now I know that I can also use bootMer() as well. Thanks both!",2013-10-20 16:41:09.277 +105019,57870,2081.0,CC BY-SA 3.0,,"@CuriousCat, Be curious and search this site for ""pca biplot"". There is also tags `pca` and `biplot`.",2013-10-20 17:03:13.397 +105020,57548,20473.0,CC BY-SA 3.0,,"$\int_{-\infty}^0 \text{exp}\{-\frac{x^2}{4\beta}-\gamma x\}dx = \int_{0}^\infty \text{exp}\{-\frac{x^2}{4\beta}+\gamma x\}dx$. Use the relation of erf with the standard normal cdf, and we'll take it from there.",2013-10-20 17:19:52.123 +105021,57879,22874.0,CC BY-SA 3.0,,is the 2x2 table of results the same as a chi square?,2013-10-20 18:22:36.123 +105022,57879,22874.0,CC BY-SA 3.0,,"I came across information about using fishers exact test when analyzing categorical(nominal) variables - this seems to fit my data, but as the tool is new to me wanted to gain some support for this idea",2013-10-20 18:24:26.227 +105023,57849,,CC BY-SA 3.0,user30490,Awesome! That makes perfect sense :),2013-10-20 18:38:22.287 +105024,57883,14799.0,CC BY-SA 3.0,,You want the Fisher-Irwin test. See Ian Campbell's [website](http://www.iancampbell.co.uk/twobytwo/twobytwo.htm) for the details and data to support the recommendation.,2013-10-20 19:41:57.500 +105025,57889,22877.0,CC BY-SA 3.0,,observed OR is 2 statistically significant different than the null value(using 2 sided type 1 error rate of 0.05),2013-10-20 20:02:43.120 +105026,57887,594.0,CC BY-SA 3.0,,"*Both* variables are ordered. An ordinary chi-square will potentially be throwing away a lot of power, though it depends on the exact hypotheses you're most interested in.",2013-10-20 20:24:46.710 +105027,57864,22868.0,CC BY-SA 3.0,,"Thanks. 
What about the bit that says ""the t-distribution can be used to estimate how likely it is that the true mean lies in any given range."" This makes it sound like we should be able to take a sample, then for any range [A,B] we choose, calculate Prob(A$",2013-10-21 02:54:16.353 +105058,57911,5237.0,CC BY-SA 3.0,,"What you are doing, in essence, is discrete time survival analysis. [Here](http://www.ats.ucla.edu/stat/mplus/seminars/DiscreteTimeSurvival/default.htm) is some information from UCLA's stats help site. I haven't seen these slideshows, but the UCLA site is of uniformly high-quality.",2013-10-21 02:54:25.530 +105059,57863,9049.0,CC BY-SA 3.0,,How do you calculate your derivatives? What software package are you using?,2013-10-21 02:55:57.573 +105060,57905,22843.0,CC BY-SA 3.0,,I'm guessing you're saying that $x$ and $y$ have the same mean?,2013-10-21 02:56:38.710 +105061,57905,3993.0,CC BY-SA 3.0,,"@Guest If a variable has a mean of 0 (this is what it means for a variable to be ""centered"" about its mean), then its sum must also be 0. After all, the mean of a variable is just computed as its sum divided by something. So if the mean is 0, the sum must also be 0. Make sense?",2013-10-21 02:57:07.780 +105062,57893,594.0,CC BY-SA 3.0,,"It seems like you're maybe confusing together the cdf with its inverse. First, try drawing a picture of the pdf, the cdf, and the inverse of the cdf (the quantile function). Once you can get the cdf right, it's easier to get its inverse right.",2013-10-21 03:02:46.263 +105063,57875,22611.0,CC BY-SA 3.0,,"Just wanted to add - my main question is the appropriateness of using N = 15 for each group. I'm aware you can conduct a t-test with only summary statistics from each group (e.g., mean, SD, and sample size), but the sample size is usually the number of *individuals* contributing to the data - not the number of *questions*",2013-10-21 03:07:11.847 +105064,57895,594.0,CC BY-SA 3.0,,"I guess I should have asked for this clarification first: do you mean linear regression with multiple predictors (x's, IVs) - that is multiple regression, or do you mean linear regression with multiple responses (y's, DVs) - that is, *multivariate* regression?",2013-10-21 03:07:20.243 +105065,57858,10135.0,CC BY-SA 3.0,,"Have a look at [this](http://stats.stackexchange.com/questions/29271/interpreting-residual-diagnostic-plots-for-glm-models) and the links in the answer. It is much more comprehensive than what I was going to say here. Depending on the problem, you need to revised your model, sometimes removing a variable, sometimes transforming it. I cannot provide a single remedy, it is depends to the problem.",2013-10-21 03:45:46.760 +105066,57381,22547.0,CC BY-SA 3.0,,I think this is the best answer for what is a terrible question (in hindsight).,2013-10-21 03:57:08.600 +105067,57195,22547.0,CC BY-SA 3.0,,"I've realized from this that I have a lot of work to do on pre-processing the data before I start the analysis I've outlined here. Reading the response from @nadya I think it's clear I need to look at some kind of spatial aggregation, but that will be challenging as it's wrong to aggregate land and ocean data. Then I need to look at gap-filling strategies. Then (and only then) can I start to look at the mapping / visualization work.",2013-10-21 03:59:37.990 +105068,57914,594.0,CC BY-SA 3.0,,"This looks like pretty standard bookwork. 
Is this for some subject, or even just for the purposes of your own study?",2013-10-21 04:22:39.190 +105069,57319,5237.0,CC BY-SA 3.0,,"I haven't really had a chance to get to this, although I've wanted to. It's worth noting @ttnphns' point, though. The sign reversal has to do with endogeneity, not suppression. For reference, I have discussed endogeneity [here](http://stats.stackexchange.com/q/58709//58712#58712), & suppression [here](http://stats.stackexchange.com/q/33888//34016#34016).",2013-10-21 04:45:55.887 +105070,57651,22762.0,CC BY-SA 3.0,,The data is in ordinary format as for the {plm} package purpose. Vars: ID country year REER GDP FinalConsumpExpend DimesticDemand ...(21 vars in total) over 1994Q1:2003Q1 period of time,2013-10-21 05:23:26.677 +105071,57914,,CC BY-SA 3.0,,Hint: What's the probability of getting *anything but* a 2 on a single throw of the die?,2013-10-21 05:32:31.887 +105073,57887,169.0,CC BY-SA 3.0,,Of course! They are both ordered. Is there such a thing as an ordered chi square? I never heard of such a thing,2013-10-21 05:55:27.560 +105074,57319,2081.0,CC BY-SA 3.0,,"@gung, you link to useful answers of yours. (Though I would doubt that sign reversal is always due to endogeneity.) If you like you might post a question about suppression etc and people, including yourself, might give their answers.",2013-10-21 06:06:01.413 +105075,57893,22884.0,CC BY-SA 3.0,,"The way I was trying to do this is as follows: calculate the CDF (by integrating), then find the inverse function. Now I was just confused how to ""stick them together"" since I only draw p in [0,1]. +@Glen_b When I am drawing random numbers accordingly to a given pdf I always use its quantile function. I think this is correct since it is related to the inverse of the pdf but please correct me if I am wrong! +That is of course, correct, I was thinking about calculating only x >0 first, then mirroring it! Would that work?",2013-10-21 06:07:41.430 +105076,57864,6204.0,CC BY-SA 3.0,,"You can, but it usually goes in the other direction. The typical process is to pick a probability (usually 90%, 95%, or 99%) and determine the (symmetrical) range about the observed sample mean which encompasses this probability of finding the true mean. That's what a confidence interval is.",2013-10-21 06:24:21.793 +105077,57893,594.0,CC BY-SA 3.0,,"Take some care. One reason for suggesting you write the cdf was that I was hoping you'd see that your *density* is probably not correctly specified (I suspect you're aiming to have continuity at $\pm x_0$ - is that the case? If so, your $f$ is wrong. As I said, somewhere earlier, draw it.). If you don't get that right you're wasting your time jumping several steps further along.",2013-10-21 06:46:25.227 +105078,57913,15321.0,CC BY-SA 3.0,,"Just to avoid any confusion, I will be using the same colors with the same number of colors in same proportion in both the cases. The only difference between the dataset would be the configuration and labels of the graph vertices.",2013-10-21 06:48:01.643 +105079,57887,594.0,CC BY-SA 3.0,,"There are actually several possible ways to analyze such data, some of which would correspond to some form of chi-square. However, with this data, I'd probably be looking at modelling it as something like a cumulative logit model on the quantile groups with time as a independent variable. 
It kind of depends on what kinds of things you want to test.",2013-10-21 07:06:04.277 +105080,57914,20470.0,CC BY-SA 3.0,,A 2-second Google search would have saved you from a 15-second question typing effort: http://math.stackexchange.com/questions/337689/if-you-roll-5-standard-six-sided-dice-whats-the-probability-that-you-get-at,2013-10-21 08:03:53.973 +105081,57906,20473.0,CC BY-SA 3.0,,"The size of the sample $n$ is used when we are _estimating_ the expected value from a sample. When we _define_ the expected value the ""weight"" factor is the pmf itself. Indeed, the expected value is a ""weighted"" average, while the sample mean is an ""unweighted"" average (nevertheless, the latter is a consistent estimator of the former). As for the second issue, you are confused because in other scientific fields, the symbol $< >$ is used _instead_ of the symbol $E$ -it means exactly the same thing.",2013-10-21 08:57:34.193 +105082,57919,22752.0,CC BY-SA 3.0,,"Note: this is not a homework question, but I'm studying for exams.",2013-10-21 09:12:22.147 +105084,57916,3993.0,CC BY-SA 3.0,,"This is not a ""problem"" and does not need to be ""solved."" As you already noted yourself, this apparent multicollinearity is a natural consequence of using dummy codes. If you use non-orthogonal codes, you get non-orthogonal parameter estimates. My advice: ignore it.",2013-10-21 10:16:01.180 +105085,57901,6162.0,CC BY-SA 3.0,,"Maybe I'm missing something, but you have only derived the distribution of $Y$, whereas the OP requires the conditional distribution of $Y$ given $Y \leq W$. Moreover there's no need to do all these calculations in order to derive the distribution of $Y$.",2013-10-21 10:33:53.223 +105086,57926,22677.0,CC BY-SA 3.0,,"@`mpiktas`, suppose i wanted to fit with a mean model of p,d,q 2,3,4 what function do you suggest `rugarch` or `garchFit` or is there others? or do i need to do further differencing until reaching zero `d` before fitting?",2013-10-21 11:58:12.080 +105087,57926,1406.0,CC BY-SA 3.0,,Does your data really conform to d=3? Processes with $d=3$ behave very wildly and usually are not examples of financial data.,2013-10-21 12:03:59.440 +105088,57901,20473.0,CC BY-SA 3.0,,"@Stephane Laurent In the OP's question the solution to the specific integral is requested _at the end_ of OP' post, where the OP clearly indicates that the solution to this integral is what the OP needs. It would be very useful to everybody if you would post an answer with the alternative and shorter way to derive the solution to this integral.",2013-10-21 12:05:39.920 +105089,57916,18914.0,CC BY-SA 3.0,,"Ok, Thanks a lot! I thought that it should be ok, but I just want to be sure:-)",2013-10-21 12:10:46.940 +105090,57919,4656.0,CC BY-SA 3.0,,"Most likely by `the change-of-variables technique` is meant the usual method involving Jacobians. However, the result that you are asked to prove is false unless the univariate standard normal random variables are **independent** random variables. See [this answer](http://stats.stackexchange.com/a/30205/6633) for a great description of how two normal random variables can fail to have a bivariate normal distribution.",2013-10-21 12:16:18.943 +105091,57930,503.0,CC BY-SA 3.0,,"First, a test can't show ""no significance"" - it gives a particular p value. Second I don't understand exactly what you mean by ""subsetting on varX"", nor exactly what you are bootstrapping. 
What is it you are trying to do?",2013-10-21 12:17:06.847 +105092,57926,22677.0,CC BY-SA 3.0,,"of course not it's only hypothetical, the biggest 'd' in my data are the index of financial sector of greece 'fin.gre' with arima order (1,2,1) while the log return of it 're.fin.gre' is (1,1,3)",2013-10-21 12:17:23.597 +105093,57930,4499.0,CC BY-SA 3.0,,"After testing for all cases 5K vs all controls 5K pvalue is >0.05, not significant. I am trying to find out if there is a significance for the subset of cases `dat[ dat$caco==0 | dat$varX==""A"",]`, i.e. all controls vs cases with varX==""A"", here I have pvalue <0.05.",2013-10-21 12:23:07.950 +105094,57870,5671.0,CC BY-SA 3.0,,"Since you have outliers in the data set, they might be dominating your variances. Judging from your plot, the data is a single large blob only, and there is no structure to be seen in here. You may need to do more manual preprocessing and data cleaning.",2013-10-21 12:28:03.823 +105095,57930,503.0,CC BY-SA 3.0,,"In that case, you don't need to do anything else, just say what you did when you report results. However, beware that with 5000 vs. 1000 even very small differences can be significant.",2013-10-21 12:29:41.680 +105096,57919,22752.0,CC BY-SA 3.0,,@DilipSarwate Thank you. I assume though that that is probably what the professor meant. Could you show me how the method with the Jacobians works assuming that that is so?,2013-10-21 12:30:48.447 +105097,57894,22885.0,CC BY-SA 3.0,,@ttnphns Thanks for pointing it out. I was really using too much of terminology. Hope now it's better. I tried to make the problem description clearer. Though I understand that it's still messy. I'm working on it.,2013-10-21 12:32:53.020 +105098,57930,4499.0,CC BY-SA 3.0,,"*5000 vs. 1000 even very small differences can be significant.* - that is my worry, hence I came up with above ""method"" - as a way of validation. (Apologies, I'm not a statistician.)",2013-10-21 12:41:20.833 +105099,57931,17670.0,CC BY-SA 3.0,,"Thank you, is there a way to achieve this while sticking to logistic regression (i.e. without touching the likelihood function)?",2013-10-21 12:41:33.093 +105100,57931,2666.0,CC BY-SA 3.0,,"It depends on what ""this"" is. What is the ultimate goal and how will the model be used?",2013-10-21 13:00:04.627 +105101,57919,4656.0,CC BY-SA 3.0,,See [this document](http://courses.engr.illinois.edu/ece313/fa2000/ppt/Lecture39.pdf) for how the Jacobian method applies to the special case of linear transformations of normal random variables.,2013-10-21 13:01:55.557 +105102,57880,22643.0,CC BY-SA 3.0,,"Thank you, jbowman. I have edited the question to include a fictitious use case, and an explanation why I think duration is important.",2013-10-21 13:07:55.397 +105103,57928,17740.0,CC BY-SA 3.0,,"Whether you interpret it as a vector or a matrix doesn't really change anything. I would not worry about 2500 features, that's not *big* by current standards.",2013-10-21 13:11:15.553 +105104,57926,1406.0,CC BY-SA 3.0,,"Hm so you have probably monthly data? I suggest looking at other packages then, or apply the ugarch on differenced data.",2013-10-21 13:17:11.090 +105105,57930,503.0,CC BY-SA 3.0,,"Your method doesn't fix that and isn't necessary. 
Just look at the effect sizes - here, the proportions in each cell of your table.",2013-10-21 13:18:13.787 +105106,57894,2081.0,CC BY-SA 3.0,,"`I'm doing eigen-decomposition of a covariance matrix, then using eigenvectors to make an orthogonal transformation of the data that is not mean-centered` Are you saying that you compute the PC scores by multiplying raw_data*eigenvectors, not centered_data*eigenvectors? (That gives the PCs which are completely correlated with the ""true"", centered PCs.)",2013-10-21 13:29:40.660 +105107,57890,4320.0,CC BY-SA 3.0,,"If you estimate the parameters and don't want to use a prior (different then using an ""uninformative prior"") then you essentially just want to do [Maximum likelihood estimation](http://en.wikipedia.org/wiki/Maximum_likelihood), i.e., $\theta^* = \arg\max_\theta f(X|\theta)$.",2013-10-21 13:34:42.073 +105108,57901,6162.0,CC BY-SA 3.0,,"Ok sorry, I'm at the office and the LaTeX rendering does not work. The shorter way I had in mind is the one given by @RayKoopman, without integral calculations. If you really want to calculate an integral, you don't need to calculate the normalization constant.",2013-10-21 13:34:48.337 +105109,57928,22901.0,CC BY-SA 3.0,,"thanks marc, I guess its fair enough to use the 2500 points as individual features, I just want to make sure that the machine learning captures the 'shape/relationship of matrix elements' of the matrix vs just concentrating on which features are important and weighting them. But perhaps that is the same thing",2013-10-21 13:38:35.500 +105110,57898,668.0,CC BY-SA 3.0,,"John Tukey, who invented the approach from which this method appears derived, used *two* multipliers: He set one ""fence"" at 1.5 times (an analog of) the IQR away from each quartile and another fence at 3 times the IQR from each quartile. Values beyond the first fence were ""out"" and values beyond the second were considered ""far out"" (those who remember the '60s will understand this terminology). If you think you need more extreme fences, then most likely you should consider *re-expressing* your data rather than changing the fences.",2013-10-21 13:42:38.017 +105111,57901,10547.0,CC BY-SA 3.0,,"If u've $f_Y(y)$ its not a big effort to derive (5). Nevertheless is there not a small error in the calculation? Since $\int_{-\infty}^\infty \text{exp}\{-\beta w^2 + \alpha w\}dw$ = $\int_{\infty}^0 \text{exp}\{-\beta w^2 - \alpha w\}dw$ + $\int_{0}^\infty \text{exp}\{-\beta w^2 + \alpha w\}dw$ = $-\int_0^{\infty} \text{exp}\{-\beta w^2 - \alpha w\}dw + \int_{0}^\infty \text{exp}\{-\beta w^2 + \alpha w\}dw$. Then I would use Gradshteyn & Ryzhik (2007), ""Table of Integrals, Series and Products"", 7th ed., p. 336, eq. 3.322(2) with $\alpha = -\gamma$ for the 1st part.",2013-10-21 13:43:11.810 +105112,57914,668.0,CC BY-SA 3.0,,Useful information is available on our site through [a search](http://stats.stackexchange.com/search?q=binomial+dice).,2013-10-21 13:45:11.027 +105113,57894,22885.0,CC BY-SA 3.0,,"@ttnphns Exactly. I plotted toy data in 2d and checked the rotation. It works the same as if data was mean-centered, but just a pivot point of rotation is not located in the mean (honestly, I don't know where it is). My concern now is if it can affect optimization results in some way. +After receiving optimized coefficients, i just multiply them by transposed eigenvectors.",2013-10-21 13:50:51.147 +105114,57933,20498.0,CC BY-SA 3.0,,Thank you very much for the response. 
Rapidminer is very good...very simple and a gateway to R..which is what im trying to practice this on. I have purchased the book. It may give me some ideas,2013-10-21 13:56:53.007 +105115,57901,20473.0,CC BY-SA 3.0,,"The formula I used (which makes use of the cosh function) is quicker than what I had proposed to my other answer. Mistake, there isn't, in either ways. Be careful with how signs change/change not when swapping integral limits etc.",2013-10-21 14:02:04.663 +105116,57935,668.0,CC BY-SA 3.0,,"What exactly does `points` measure and what relationship does it have (if any) to a player's ""score""? (Or is the score a separate variable altogether?) Note, too, that your ability to assess ""influence"" in any material or causal sense depends on how you collected these data: if they are just records of the outcomes of contests, then most likely you cannot estimate influence at all, but you might be able to identify some *quantitative relationships* among the variables.",2013-10-21 14:02:06.037 +105117,57935,503.0,CC BY-SA 3.0,,"It might help to post the first few lines of your data, in `R` you can do this with `head`.",2013-10-21 14:05:23.577 +105118,57901,20473.0,CC BY-SA 3.0,,"@Stephan Laurent Yes, of course, but I wanted to calculate the _integral_ for which the OP said he was stuck. And the bulk of my calculations are not about the constant term, but about manipulating the terms that contain the integrating variable.",2013-10-21 14:05:42.230 +105119,57935,10409.0,CC BY-SA 3.0,,"`points` is a combination of the player's assists, goals, penalties (negative), etc... where each attribute has a multiplier. I should reword that to be `points` and not `score`. Good catch.",2013-10-21 14:09:18.160 +105120,57935,10409.0,CC BY-SA 3.0,,"opponent facts are things like the opponent's ""average points allowed per game""",2013-10-21 14:10:28.507 +105121,57886,22880.0,CC BY-SA 3.0,,"Exactly, and I asked two questions....sorry for bad formatting.",2013-10-21 14:12:14.017 +105122,57782,22507.0,CC BY-SA 3.0,,"Before asking what would be the flaws with this approach, maybe you should write why this approach should work, in your opinion. Why do you think that the steps 2-4 improve the outcome?",2013-10-21 14:40:11.457 +105123,57782,22507.0,CC BY-SA 3.0,,"Also, am I right that at the end you drop the model from the step 1 and use only model from the step 4?",2013-10-21 14:47:27.660 +105124,57928,4320.0,CC BY-SA 3.0,,"@user1449677 What does ""pretty much an image"" mean? There are a lot of different features you can compute from images, [SIFT](http://en.wikipedia.org/wiki/Scale-invariant_feature_transform), [HOG](http://en.wikipedia.org/wiki/Histogram_of_oriented_gradients), or see the Wikipedia page [Feature (computer_vision)](http://en.wikipedia.org/wiki/Feature_(computer_vision)) as a jump off point.",2013-10-21 15:29:44.547 +105178,57962,10594.0,CC BY-SA 3.0,,"@jbowman, Could you give me more insight about ""My suspicion is that you really mean that the expected number of cells surviving is Nexp{−βx}, which doesn't agree with λ=Nexp{−βx}."" Why?",2013-10-21 20:48:29.147 +105125,57937,668.0,CC BY-SA 3.0,,This was the original (1908) *definition* of $t$: everything else known about it comes from this characterization. 
The Wikipedia article on the Student t distribution provides a reference to [Fisher's 1925 paper](http://www.sothis.ro/user/content/4ef6e90670749a86-student_distribution_1925.pdf).,2013-10-21 15:31:06.600 +105126,57941,668.0,CC BY-SA 3.0,,What happens to the polynomial and its roots when you plug in $1/u$ for $\lambda$?,2013-10-21 15:41:58.217 +105127,57940,668.0,CC BY-SA 3.0,,"+1 You will find an extended discussion of (1) and (3)--with definite empirical answers--in Daniel Kahnemann's book *Thinking, Fast and Slow* (2011).",2013-10-21 15:44:35.700 +105128,57942,668.0,CC BY-SA 3.0,,"Could you please explain how it would be possible for a percentage of *anything* in a group of size $n=6$ to be any value other than $0, 100/6, 200/6, \ldots, 500/6,$ and $100$? In particular, how do you arrive at $1.5667\%$?",2013-10-21 15:46:25.647 +105129,57850,22092.0,CC BY-SA 3.0,,"er..When you have time,could you please..?I know they are retarded questions. I tried to understand but,sigh.",2013-10-21 15:47:54.830 +105130,57941,,CC BY-SA 3.0,user30490,"Could you (""hold my hand"") and elaborate. I don't see it still even if i plug in 1/u.",2013-10-21 15:48:29.733 +105131,57942,22909.0,CC BY-SA 3.0,,"@whuber The data-sets i have are time dependent. In the second group, the case x was seen 1.5667% of the total time.",2013-10-21 15:49:58.963 +105132,57782,17670.0,CC BY-SA 3.0,,"Yes, I was planning on using the model fitted with the entire data set, but it doesn't make sense to do so because it's under-performing the model fitted with the training set.",2013-10-21 15:50:35.567 +105133,57931,17670.0,CC BY-SA 3.0,,I edited my question to provide detail on what I'm trying to achieve.,2013-10-21 16:03:08.613 +105134,57940,651.0,CC BY-SA 3.0,,"I'd need to reread the book, but (1) seems to be a rather odd use of probabilities for decision making. You don't need to reject hypotheses to make decisions, taking the decision that maximises the expected return is perfectly valid, and in this case would tell you that any lottery ticket is as good as any other (excluding consideration of the behaviour of other customers).",2013-10-21 16:05:37.190 +105135,57940,5448.0,CC BY-SA 3.0,,"I have to say, I had a hard time reading past the first ""paradox""; an author who opines on statistics and decision-making while, it would appear, having no knowledge of statistical decision-making, is not to be trusted on the applicability of statistics in general. Also, as Russell and Whitehead showed, logic is a part of mathematics, and of course so is probability theory, so they can't be inconsistent with each other - unless mathematics itself is internally inconsistent. As for paradox #2, ask any actuary or gambler about whether probability can be applied to real life.",2013-10-21 16:08:40.020 +105136,57782,5821.0,CC BY-SA 3.0,,I don't have a source on this right now... but are you aware you can optimize a logistic regression model to maximize the Area Under The (Receiver Operating Characteristic) Curve (or AUC)? No need to reinvent the wheel.,2013-10-21 16:11:04.680 +105137,57942,503.0,CC BY-SA 3.0,,"That doesn't really answer the question, it just raises more questions. If you have N = 6, @whuber is correct. If you have something else, please describe what you have. See [how to ask a statistics question](http://www.statisticalanalysisconsulting.com/how-to-ask-a-statistics-question/).",2013-10-21 16:12:40.160 +105138,57942,22909.0,CC BY-SA 3.0,,@PeterFlom Please see the edit. 
I hope now my question is more clear,2013-10-21 16:22:26.357 +105139,57919,22752.0,CC BY-SA 3.0,,"@DilipSarwate Hmmh, I still don't really understand how it would work in this case. Could you please show me?",2013-10-21 17:02:28.883 +105140,57942,436.0,CC BY-SA 3.0,,"You may need to explicit a little bit better what you did because (at least to me) it is still unclear. Something on the line of: ""In each experiment I measured 5000 events and counted how many were of type x"", or whatever you did...",2013-10-21 17:03:45.440 +105141,57942,232.0,CC BY-SA 3.0,,"I think you are using incorrect terminology: you have _rates_, not _percentages_. So in the first group, you have the event occurring at an (average) rate of 0.1 / minute, for example. Is that correct?",2013-10-21 17:04:06.940 +105142,57768,18845.0,CC BY-SA 3.0,,"For anyone who wants to know more about the use of negative correlation in Monte Carlo simulation, try googling ""antithetic variates"". More info in course notes [here](http://www.columbia.edu/~ks20/4703-Sigman/4703-07-Notes-ATV.pdf) or [here](http://www.math.kent.edu/~oana/math60093/10lecture5.pdf).",2013-10-21 17:39:19.887 +105143,57949,668.0,CC BY-SA 3.0,,"Please tell us what $Z,$ $\varepsilon,$ $\hat{\varepsilon},$ and $\sigma$ represent. In particular, what formulas do you know that express $\hat{\varepsilon}$ in terms of $\varepsilon$ and $Z$? (In so doing, I suspect you will find the answer to this question yourself.)",2013-10-21 17:39:26.730 +105144,57942,22909.0,CC BY-SA 3.0,,@nico you are right i should have written before. Please see the edit,2013-10-21 17:40:19.890 +105145,57948,668.0,CC BY-SA 3.0,,"Could you please explain what you mean by ""register the data"" and what the nature of these ""technical difficulties"" is? It sounds like you may have censored data: choosing an appropriate procedure for such data depends on the nature of the censoring.",2013-10-21 17:41:34.237 +105146,57205,22558.0,CC BY-SA 3.0,,"Hello Momo, thanks for the comment. ""not touch the data"" means not to remove any data or time-step or replace with 0 or the mean, it would compromise the information about the specific-time-lag linear dependence. I partially recoded the matlab (link above) autocorrelation and partial autocorrelation functions to deal with NaNs: any data couples including NaNs is excluded from the computation. This is done for each lag. It worked for me. Any suggestion is well accepted.",2013-10-21 17:47:07.990 +105147,57937,20473.0,CC BY-SA 3.0,,Go to an answer of mine in math.SE where the full derivation of the t-density can be found. http://math.stackexchange.com/questions/474733/derivation-of-the-density-function-of-student-t-distribution-from-this-big-integ/480327#480327,2013-10-21 18:02:31.280 +105148,57948,22910.0,CC BY-SA 3.0,,"Sorry, I tried to be concise. The used machine only can count until a limit (100000) so counts over that limit are registered as ""over 100000"" and this is the problem, instead of the decreasing curve of the normal distribution I have an over 100000 group which is not very useful.",2013-10-21 18:12:28.033 +105149,57944,10409.0,CC BY-SA 3.0,,"Thanks, I have never run a regression model before. The goal is to see what opponent_facts have the biggest influence on a player's points. And ideally, how much should I expect a player's points to increase or decrease. + +I'm looking at the ""Estimate"" column of the summary. It tells me that a certain player_id has an Estimate of -2.7648, but I see their mean points is 2.975. 
Is the Estimate column the one you were talking about where I can see the ""average points expected for a player"" or the ""average points expected against a given opponent"". That would be really cool to know.",2013-10-21 18:12:37.067 +105176,57962,5448.0,CC BY-SA 3.0,,"@Glen_b You're correct, as usual. To the OP - how does the number of cells $N$ differ from a sample of $N$ cells? I am assuming whether a cell survives or not is independent, to a reasonable degree of approximation, of the survival of other cells. My suspicion is that you really mean that the expected number of cells surviving is $N \exp\{-\beta x\}$, which doesn't agree with $\lambda = N \exp\{-\beta x\}$.",2013-10-21 20:41:21.610 +105150,57782,17670.0,CC BY-SA 3.0,,"@AdamO I fitted the model with type.measure=""auc"" and although the training set model still out performs the full set model, it's now by a very small amount. I read up on AUC, and this result is reasonable to me because in order to gauge performance I am simply optimizing on a single prediction threshold which gives the maximum difference between true/false positives, which isn't what the AUC objective is doing. In the 3 steps I outlined at the bottom of my question, is step 3 necessary, or should I just stop at step 2 with the optimized prediction threshold since its performing better?",2013-10-21 18:14:50.633 +105151,57949,22752.0,CC BY-SA 3.0,,@whuber $\hat{y}=Z \hat{\beta} + \hat{\epsilon}$. The epsilons are the error terms and $\sigma^2$ is variance. I still however don't know why this holds...,2013-10-21 18:15:05.107 +105152,57782,17670.0,CC BY-SA 3.0,,"@AdamO Also, if you could post your previous comment as an answer I will accept it as the answer.",2013-10-21 18:16:00.393 +105153,57944,1805.0,CC BY-SA 3.0,,"@Bradford: Yes it is, but that ""estimate"" is for that player, against the AVERAGE opponent. So in your example, the regression model thinks that that player has an estimated mean points of -2.7648, but they played against easier than average opponents, which is the reason their actually average points is 2.975. In other words, the regression model corrects for the fact that each player has played a different set of opponents, and some of those opponents might have been easier (or harder) than average.",2013-10-21 18:21:16.690 +105154,57940,2958.0,CC BY-SA 3.0,,""" when we say that there is a 95% chance of rain tomorrow, it is unclear to what entities that 95% applies"" Gigerenzer (e.g. in ""Risk Savvy"") discusses this but in an entirely practical and non-philosophical way. He suggests that at the very least you spell out 95% of what (for weather forcasts: usually days that are similar to tomorrow), or better: that 19 out of 20 such days had rain and give a definition of what ""rain"" means specifically. He also argues that school children can understand such statements, but hardly anyone can if the vital information about the denominator is omitted.",2013-10-21 18:35:20.553 +105156,57942,22909.0,CC BY-SA 3.0,,"After the correction from @nico , here is the solution: + +https://onlinecourses.science.psu.edu/stat414/node/268. 
Thank you so much to everyone",2013-10-21 18:46:32.797 +105157,57836,1411.0,CC BY-SA 3.0,,I would also note that the `?pvalues` help page in the the new version of `lme4` includes a lot of information on this topic.,2013-10-21 18:54:23.307 +105158,57962,5448.0,CC BY-SA 3.0,,"Do you have a particular functional form for $g(\beta,x)$ in mind?",2013-10-21 19:02:49.237 +105159,57962,10594.0,CC BY-SA 3.0,,"yes, $\lambda=g(\beta,x)=Nexp^{-\beta x}$, where N refers to the original number of cells and $exp^{-\beta x}$ refers to the fraction of cells that are died.",2013-10-21 19:07:12.577 +105160,57961,22906.0,CC BY-SA 3.0,,"Thanks for your answer. However, I'm talking about the density function which takes value f1 in the range 0 to t1, value f2 for the range t1 to t2, and f3 for more than t2. In this situation, can I use the 2nd approach? I know the first approach is more applicable, but I can't derive the CDF for this complicated function.",2013-10-21 19:13:24.500 +105161,57949,668.0,CC BY-SA 3.0,,"That's because you expressed $\hat{y}$ rather than $\hat{\varepsilon}$ in terms of $Z$. You might find that searching our site for [""idempotent""](http://stats.stackexchange.com/search?q=idempotent) gives particularly helpful pointers.",2013-10-21 19:14:11.140 +105162,57948,668.0,CC BY-SA 3.0,,"That is called *right censoring.* The issue you face is not one of testing normality but of appropriately handling the censored data. It would help for you to edit your question to reflect that more explicitly, rather than relying on readers to look through these comments. In particular, please clarify what you mean by ""comparing"" counts: do you wish to compare two distributions, two means, or something else?",2013-10-21 19:18:52.287 +105163,57962,5448.0,CC BY-SA 3.0,,"I'm not quite sure you've distinguished between $p(y=0)$ and $\lambda$, so just to check: $p(y=0) = \exp\{-\lambda\}$, $\lambda = N\exp\{-\beta x\}$, so $p(y=0) = \exp\{N\exp\{-\beta x\}\}$?",2013-10-21 19:33:21.910 +105164,57962,10594.0,CC BY-SA 3.0,,Yes.That's exactly my model,2013-10-21 19:37:11.147 +105165,57962,20473.0,CC BY-SA 3.0,,"Given what you write in your question, $exp(-\lambda)$ is the probability of having zero survival, and zero survival is mapped to $y=1$ (as you write it), not $y=0$. cc @jbowman",2013-10-21 19:46:00.740 +105167,57960,,CC BY-SA 3.0,,"I would have thought AUC is not best here because there is small loss for false negative, but large loss for false positive.",2013-10-21 19:50:41.993 +105168,57965,668.0,CC BY-SA 3.0,,"This is closely related to a generalized [birthday problem](http://stats.stackexchange.com/search?q=birthday): $N$ would be days of the year, the $D$ independent ""markings"" would be draws from a population of people, and the chance of $C$ ""clean"" balls is the chance that among those $D$ people there are $N-C$ unique birthdays. As an answer, do you seek a closed formula, an efficient algorithm, or an asymptotic formula (in $N$ or $D$)?",2013-10-21 19:59:09.907 +105169,57962,5448.0,CC BY-SA 3.0,,"Why would your sample size ($N$) affect the fraction of cells that have died? The count, perhaps, but the probability?",2013-10-21 20:00:38.830 +105170,57782,,CC BY-SA 3.0,,"What I don't quite understand here is why you haven't included anything about the predicted future price in your model, nor have you included the magnitude of profit/loss into the optimisation. 
Surely a decision to ""buy"" that leads to a 99% loss is much worse than a decision to ""buy"" that leads to a 1% loss, even though both are false positives.",2013-10-21 20:04:33.103 +105171,57960,5821.0,CC BY-SA 3.0,,"Well, the real problem is that OP has a continuous outcome (ROI) and is dichotomizing it as a loss/gain. But splitting hairs aside, with ROC regression *in general* ""stupid"" marker cut-off regions indeed count toward the AUC. You can use the partial AUC if you prespecify what counts as meaningful versus stupid marker values, and partial AUC regression has all the same performance capabilities (and issues).",2013-10-21 20:21:38.770 +105172,57962,10594.0,CC BY-SA 3.0,,"@jbowman. I am not sure If I get your questions. $N$ refers to the original number of cells, actually it is also a variable. $N exp^{-\beta x}$ is the cell survival function. I don't think $N$ refers to the sample size.",2013-10-21 20:25:42.797 +105173,57965,22914.0,CC BY-SA 3.0,,"I'm looking for a closed formula. I will look into the birthday problem, I didn't notice they are related.",2013-10-21 20:30:02.437 +105174,57962,594.0,CC BY-SA 3.0,,"""The original number of cells"" is the same as what I think jbowman intended by ""sample size"". I agree with his concern, as well.",2013-10-21 20:31:44.390 +105175,57967,594.0,CC BY-SA 3.0,,"It's small numbers (small expected counts) where it's critical to model as count data. Whether there's a calculation problem at some size of count will depend on the software, but I don't see that a carefully implemented calculation should have a problem with those counts. Either way, they're certainly large enough to approximate by normal distributions, via nonlinear least squares or Iterative Reweighted Least Squares, say, but that, too, would need to be carefully implemented.",2013-10-21 20:37:05.243 +105177,57962,10594.0,CC BY-SA 3.0,,"Only $exp^{-\beta x}$ is the probability of cells that have survived, which is irreverent to $N$. So $exp^{-\beta x}$ is the probability. $N exp^{-\beta x}$ gives the counts of cells that that have survived.",2013-10-21 20:44:53.877 +105179,57962,10594.0,CC BY-SA 3.0,,"@jbowman, Yes, your concern about in dependency of survival among cell is reasonable. However, we assume that cell survival is independent.",2013-10-21 20:54:54.260 +105180,57962,5448.0,CC BY-SA 3.0,,"You have, in one case, the probability of a cell surviving is $\exp\{-\lambda\}$, and in the other case, $\exp\{-\beta x\}$, which is inconsistent with $\lambda = \exp\{-\beta x\}$.",2013-10-21 20:58:18.310 +105181,57931,2666.0,CC BY-SA 3.0,,"Unless I'm missing something, nothing you added would imply the use of a cutpoint. Note that a predicted probability provides its own error rate.",2013-10-21 21:05:48.903 +105182,57969,668.0,CC BY-SA 3.0,,"There are many suitable distribution tests that apply here and are powerful (most likely they are too powerful, but that's another issue altogether), including the Chi-squared test and the Kolmogorov-Smirnov test. So what ""resource"" do you seek: a reference to some test that looks like yours or some test that will work well in the situations you describe?",2013-10-21 21:06:55.513 +105183,57961,22906.0,CC BY-SA 3.0,,Thank you so much Aniko for your such detail suggestion. Great Support!!,2013-10-21 21:08:48.727 +105184,57969,8869.0,CC BY-SA 3.0,,"Oops, I forgot to say that $N$ is very small. 
My understanding is that Kolmogorov-Smirnov and Chi-squared are not suitable for small $N$, since you need enough samples to give you a reasonable cumulative distribution function or histogram. I don't know a test that applies to this case. Edited post to emphasize this point.",2013-10-21 21:11:44.050 +105185,57942,232.0,CC BY-SA 3.0,,"You can add an official answer to your own questions, but in this case your answer might be wrong. It completely ignores the fact that there are 15 experiments in group 1, and that outcomes within an experiment might not be independent.",2013-10-21 21:16:03.347 +105186,57969,8869.0,CC BY-SA 3.0,,"Also, I am not wedded to this idea -- I just thought it might be more powerful than K-S or Chi-squared at low sample size. Forming the CDF of 5 samples strikes me as a bit ham-fisted. My boss wants me to perform a test of this sort and I just want the best tool for the job, and I don't care if it's my idea or something totally unrelated.",2013-10-21 21:27:39.023 +105187,57969,668.0,CC BY-SA 3.0,,"K-S has no problems with small data sets. You are unlikely to find anything more powerful than it unless you make strong distributional assumptions. I don't understand why you need to construct a ""reasonable CDF"": the relevant CDF comes from accumulating $p(x)$, not from observing samples. I also don't understand what ""further assumptions"" you could possibly apply, given that $p(x)$ appears fully to specify your reference distribution. These, and several other disconnects, make me worry that you might not have communicated your problem accurately. Perhaps you could provide an example?",2013-10-21 21:31:34.747 +105188,57970,668.0,CC BY-SA 3.0,,"Presumably this is the PDF, because if it were used as a CDF your question would be trivially easy. However, you need to tell us its domain, because its integral diverges.",2013-10-21 21:33:53.677 +105189,57962,10594.0,CC BY-SA 3.0,,"$exp^{−\lambda}$ is the probability of counting zero survival cells, given the individual cell survival probability $exp^{−\beta x}$ and the expected number of counted survived cell as $Nexp^{−\beta x}$. Are these two definitions the same? I am confused.",2013-10-21 21:38:28.943 +105190,57969,8869.0,CC BY-SA 3.0,,"In K-S the test statistic is $\sup_t |F_N(t) - F(t)|$ where $F_n$ is the empirical CDF of the sample and $F$ is the CDF of $p$, so actually the sample CDF is relevant, and having few samples is a problem since the Kolmogorov distribution is only the asymptotic distribution of the test statistic for large $N$. Of course, almost all tests are justified by asymptotic results, but it was still a concern for me. When I said ""further assumptions"" I just meant I didn't want to make any distributional assumptions. Any other questions?",2013-10-21 21:38:57.280 +105191,57723,22781.0,CC BY-SA 3.0,,"Dear Patrick, I thank for your very prompt answer and helpful references! Best,",2013-10-21 22:02:40.413 +105192,57972,8888.0,CC BY-SA 3.0,,"I think that is not enough information. When you say, a proportion of a field is covered in water, is there any additional knowledge about the form of that proportion? Is it one connected area? Are there constraints to the form? For example, imagine an area stretched zig zag over the whole area, but only covering 10%. For a certain, not small size of the hoop, the probability of water inside it will be 1.",2013-10-21 22:06:22.110 +105193,57959,594.0,CC BY-SA 3.0,,"At a fixed $\alpha$, the ratio is a constant. 
The critical values themselves come from the inverse of the cdf (the quantile function).",2013-10-21 22:21:08.357 +105194,57948,594.0,CC BY-SA 3.0,,"Even without the censoring (which is a big issue), why would *counts* be asserted to be 'normal after a log transformation'? What's the basis for such an assertion?",2013-10-21 22:28:05.017 +105195,57970,21119.0,CC BY-SA 3.0,,Added the domain. But not sure how it would diverge considering that $\exp(-\exp(-x^2))<\exp(-x^2)$,2013-10-21 22:36:54.703 +105196,57970,594.0,CC BY-SA 3.0,,"The inequality you assert is false. Take $x=1$. $\exp(-x^2)\approx 0.368 < \exp(-0.368)\approx 0.692$. As $|x|$ gets larger, so does your function. Try drawing a picture.",2013-10-21 22:41:31.290 +105197,57972,594.0,CC BY-SA 3.0,,Could you explain how your intuition works? I don't see that inequality being implied by the conditions without further assumptions.,2013-10-21 23:18:01.613 +105198,57973,594.0,CC BY-SA 3.0,,"'Fit a distribution to data' is equivalent to 'estimate the parameters from data'. Some common methods include maximum likelihood or method of moments. In R see, for example, the `fitdistr` function in MASS, which comes with R (`?MASS::fitdistr`), which has an example of fitting a t-distribution. It's certainly possible to do this with a t-distribution and plot the fitted distribution. However, see the warning in the example I mentioned.",2013-10-21 23:25:35.300 +105199,57969,668.0,CC BY-SA 3.0,,"What you are saying, then, is that the K-S test has little power when $N=5$. But so do all other distributional tests. A good way to overcome the problem of knowing only the asymptotic distribution is with Monte Carlo simulation. In effect, then, it seems that you ought to be wondering whether there is a better test *statistic* than either the K-S or the $\chi^2$ to test your particular null hypothesis.",2013-10-21 23:27:50.660 +105200,57973,668.0,CC BY-SA 3.0,,"For threads on this topic, please [search our site](http://stats.stackexchange.com/search?q=student+fit+distribution).",2013-10-21 23:29:51.280 +105201,57973,5237.0,CC BY-SA 3.0,,"In addition, questions that are *only* about how to do something in R, when the OP does not have a substantive statistical question, are off-topic for CV (see our [help page](http://stats.stackexchange.com/help)). That is, this Q would be off-topic even if it weren't a duplicate. Note that some of such questions might be on-topic on [Stack Overflow](http://stackoverflow.com/), but they need to be legitimate programming questions, & not just 'what is the package / function for this'.",2013-10-21 23:37:06.910 +105227,57990,20470.0,CC BY-SA 3.0,,"I think the confusion is this: **Bayes' theorem** is just the manipulation of the conditional probabilities as you give at the beginning of your question. The **Bayesian Estimation** makes use of the Bayes theorem to make parameter estimations. It is only in the latter, do the maximum likelihood estimation (MLE) and the parameter theta, etc. come into play.",2013-10-22 09:00:14.807 +105202,57962,5448.0,CC BY-SA 3.0,,"If the individual cell survival probability is $\exp\{-\beta x\}$, then the individual cell non-survival probability is $1 - \exp\{-\beta x\}$. The probability that no cells out of $N$ survive is just the product of the $N$ (identical) probabilities that each cell fails to survive. This probability is $(1-\exp\{-\beta x\})^N$. 
With your definition of $\lambda$ as $N\exp\{-\beta x\}$, you are getting this probability as $\exp\{-N\exp\{-\beta x\}\}$, which is obviously not the same.",2013-10-22 01:31:02.580 +105203,57968,20473.0,CC BY-SA 3.0,,"Clarifications: first I presume that by $\exp()$ you mean the Exponential distribution, and not the base of the natural logarithms. If yes, then, is $Q_j$ the mean value, or the reciprocal of the mean value (because both these widespread parametrizations of the Exponential distribution are unfortunately symbolized the same way).",2013-10-22 01:36:44.193 +105204,57979,5237.0,CC BY-SA 3.0,,"It may help you to read my answer here: [When to use Fisher and Neyman-Pearson framework?](http://stats.stackexchange.com/questions/23142//51823#51823) As for how the rejection region relates to p-values, the RR is defined as that region where $p<\alpha$.",2013-10-22 01:51:31.903 +105205,57979,20179.0,CC BY-SA 3.0,,"It is an excellent explaniation. So under Neyman's setting, does it requre any property of T(x), so that p<α is equivalent to x belongs to the rejection region?",2013-10-22 01:58:30.540 +105206,57979,5237.0,CC BY-SA 3.0,,"I'm not sure I totally follow your question, but there is no property of T(x) being in the rejection region except that p,,2010-07-19 19:14:44.080 +46,28,4.0,2,,CC BY-SA 2.5,18914b59-1d73-4c14-a8b6-25d429a1888e,"Last year, I read a blog post from [Bendan O'Connor][1] entitled [""Statistics vs. Machine Learning, fight!""][2] that discussed some of the differences between the two fields. [Andrew Gelman responded to favorably to this][3]: + +Simon Blomberg: +> From R's fortunes +> package: To paraphrase provocatively, +> 'machine learning is statistics minus +> any checking of models and +> assumptions'. +> -- Brian D. Ripley (about the difference between machine learning +> and statistics) useR! 2004, Vienna +> (May 2004) :-) Season's Greetings! + +Andrew Gelman: + +> In that case, maybe we should get rid +> of checking of models and assumptions +> more often. Then maybe we'd be able to +> solve some of the problems that the +> machine learning people can solve but +> we can't! + +There was also the [**""Statistical Modeling: The Two Cultures""** paper][4] by Leo Breiman in 2001 which argued that Statisticians rely too heavily on data modeling, and that machine learning techniques are making progress by instead relying on the *predictive accuracy* of models. + +Has the Statistics field changed over the last decade in response to these critiques? Do the *two cultures* still exist or has Statistics grown to embrace machine learning techniques such as neural networks and support vector machines? + + + [1]: http://anyall.org/ + [2]: http://anyall.org/blog/2008/12/statistics-vs-machine-learning-fight/ + [3]: http://www.stat.columbia.edu/~cook/movabletype/archives/2008/12/machine-learnin.html + [4]: http://www.stat.osu.edu/~bli/dmsl/papers/Breiman.pdf",,2010-07-19 19:14:44.080 +289,143,114.0,3,,CC BY-SA 2.5,657b3316-c103-42cc-bdd9-a52a5fedc530,,,2010-07-19 21:32:38.523 +288,143,114.0,1,,CC BY-SA 2.5,657b3316-c103-42cc-bdd9-a52a5fedc530,Algorithms to compute the running median?,,2010-07-19 21:32:38.523 +287,143,114.0,2,,CC BY-SA 2.5,657b3316-c103-42cc-bdd9-a52a5fedc530,"On smaller window sizes, `n log n` sorting might work. 
Are there any better algorithms to achieve this?",,2010-07-19 21:32:38.523 +323,143,,6,user88,CC BY-SA 2.5,a6573b2a-0409-4d07-98ba-52887586892e,,edited tags,2010-07-19 22:05:23.010 +670,143,,6,,CC BY-SA 2.5,eabdccd9-fcd9-47a7-a1aa-76e66eef254d,,edited tags,2010-07-20 15:47:15.207 +672,143,,6,,CC BY-SA 2.5,c541c63b-6843-45e8-aeb3-ff70f45b4d85,,edited tags,2010-07-20 15:56:17.447 +708,143,,6,,CC BY-SA 2.5,91e1ac65-2a05-465e-8a14-176318e40a18,,edited tags,2010-07-20 18:12:30.547 +717,143,0.0,10,,,27606156-98ee-43d6-992a-478139afd75a,"{""Voters"":[{""Id"":88,""DisplayName"":""mbq""},{""Id"":190,""DisplayName"":""Peter Smit""},{""Id"":13,""DisplayName"":""Sharpie""},{""Id"":103,""DisplayName"":""rcs""},{""Id"":28,""DisplayName"":""Srikant Vadali""}]}",2,2010-07-20 18:54:42.177 +750,356,166.0,1,,CC BY-SA 2.5,47492cb7-f60a-4302-819f-86df9d13f334,What is the difference between the Shapiro-Wilk test of normality and the Kolmogorov-Smirnov test of normality?,,2010-07-21 00:24:35.500 +751,356,166.0,3,,CC BY-SA 2.5,47492cb7-f60a-4302-819f-86df9d13f334,,,2010-07-21 00:24:35.500 +752,356,166.0,2,,CC BY-SA 2.5,47492cb7-f60a-4302-819f-86df9d13f334,What is the difference between the Shapiro-Wilk test of normality and the Kolmogorov-Smirnov test of normality? When will results from these two methods differ?,,2010-07-21 00:24:35.500 +864,412,186.0,2,,CC BY-SA 2.5,cafcf180-f874-4b98-a9a4-8167bc6c8e35,"What book would you recommend for scientists who are not statisticians? + +Clear delivery is most appreciated. As well as the explanation of the appropriate techniques and methods for typical tasks: time series analysis, presentation and aggregation of large data sets.",,2010-07-21 15:01:21.127 +865,412,186.0,1,,CC BY-SA 2.5,cafcf180-f874-4b98-a9a4-8167bc6c8e35,What book would you recommend for non-statistician?,,2010-07-21 15:01:21.127 +866,412,186.0,16,,,cafcf180-f874-4b98-a9a4-8167bc6c8e35,,,2010-07-21 15:01:21.127 +863,412,186.0,3,,CC BY-SA 2.5,cafcf180-f874-4b98-a9a4-8167bc6c8e35,,,2010-07-21 15:01:21.127 +872,414,4.0,16,,,f43ac356-f089-4201-b246-91cced19a08b,,,2010-07-21 15:13:21.493 +871,414,4.0,1,,CC BY-SA 2.5,f43ac356-f089-4201-b246-91cced19a08b,"What is your favorite ""data analysis"" cartoon?",,2010-07-21 15:13:21.493 +869,414,4.0,2,,CC BY-SA 2.5,f43ac356-f089-4201-b246-91cced19a08b,"This is one of my favorites: + +![alt text][1] + +One entry per answer. This is in the vein of [this StackOverflow question][2]. + +P.S. Do not hotlink the cartoon without the site's permission please. + + + [1]: http://imgs.xkcd.com/comics/correlation.png + [2]: http://stackoverflow.com/questions/84556/whats-your-favorite-programmer-cartoon",,2010-07-21 15:13:21.493 +870,414,4.0,3,,CC BY-SA 2.5,f43ac356-f089-4201-b246-91cced19a08b,,,2010-07-21 15:13:21.493 +1014,412,190.0,6,,CC BY-SA 2.5,673b9773-839a-4584-a7a9-2c64642d1fd8,,edited tags,2010-07-22 12:56:00.153 +1106,541,,2,user28,CC BY-SA 2.5,9ba6544f-eddd-49b9-834a-4a38974300ff,"ANOVA is equivalent to linear regression with the use of suitable dummy variables. The conclusions remain the same irrespective of whether you use ANOVA or linear regression. + +In light of their equivalence, is there any reason why ANOVA is used instead of linear regression? + +Note: I am particularly interested in hearing about **technical** reasons for the use of ANOVA instead of linear regression. 
",,2010-07-23 15:17:56.770 +1104,541,,3,user28,CC BY-SA 2.5,9ba6544f-eddd-49b9-834a-4a38974300ff,,,2010-07-23 15:17:56.770 +1105,541,,1,user28,CC BY-SA 2.5,9ba6544f-eddd-49b9-834a-4a38974300ff,Why is ANOVA taught / used as if it is a different research methodology compared to linear regression? ,,2010-07-23 15:17:56.770 +7706,3188,450.0,2,,CC BY-SA 2.5,1c727d83-861b-4e39-be04-68dda5440df8,"It's bad form to sort an array to compute a median. Medians (and other quantiles) are typically computed using the [quickselect][1] algorithm, with $O(n)$ complexity. + + +You may also want to look at my answer to a recent related question [here][2]. + + + [1]: http://www.ics.uci.edu/~eppstein/161/960125.html + [2]: http://stats.stackexchange.com/questions/3372/is-it-possible-to-accumulate-a-set-of-statistics-that-describes-a-large-number-of/3376#3376",,2010-10-09 19:02:09.717 +8970,3646,211.0,1,,CC BY-SA 2.5,362780e6-f8e1-4074-a58c-1bc9aa366330,Kendall Tau or Spearman's rho ?,,2010-10-24 13:15:49.687 +8969,3646,211.0,2,,CC BY-SA 2.5,362780e6-f8e1-4074-a58c-1bc9aa366330,"In which cases should one prefer the one over the other? + +I found someone who claims an advantage for Kendall, [for pedagogical reasons][1], are there other reasons? + + + [1]: http://www.rsscse.org.uk/ts/bts/noether/text.html",,2010-10-24 13:15:49.687 +8971,3646,211.0,3,,CC BY-SA 2.5,362780e6-f8e1-4074-a58c-1bc9aa366330,,,2010-10-24 13:15:49.687 +1111,543,182.0,2,,CC BY-SA 2.5,604765be-29fa-42bb-8e40-6909a0f62e8f,"As an economist, the analysis of variance (ANOVA) is taught and usually understood in relation to linear regression (e.g. in Arthur Goldberger's *A Course in Econometrics*). Economists/Econometricians typically view ANOVA as uninteresting and prefer to move straight to regression models. From the perspective of linear (or even generalised linear) models, ANOVA assigns coefficients into batches, with each batch corresponding to a ""source of variation"" in ANOVA terminology. + +Generally you can replicate the inferences you would obtain from ANOVA using regression but not always OLS regression. Multilevel models are needed for analysing hierarchical data structures such as ""split-plot designs,"" where between-group effects are compared to group-level errors, and within-group effects are compared to data-level errors. Gelman's paper [1] goes into great detail about this problem and effectively argues that ANOVA is an important statistical tool that should still be taught for it's own sake. + +In particular Gelman argues that ANOVA is a way of understanding and structuring multilevel models. Therefore ANOVA is not an alternative to regression but as a tool for summarizing complex high-dimensional inferences and for exploratory data analysis. + +Gelman is a well-respected statistician and some credence should be given to his view. However, almost all of the empirical work that I do would be equally well served by linear regression and so I firmly fall into the camp of viewing it as a little bit pointless. Some disciplines with complex study designs (e.g. psychology) may find ANOVA useful. + +[1] Gelman, A. (2005). Analysis of variance: why it is more important than ever (with discussion). *Annals of Statistics* 33, 1–53.",,2010-07-23 15:35:55.653 +1112,541,,5,user28,CC BY-SA 2.5,9e914d5f-91a3-4993-afe7-e09b9455d94a,"ANOVA is equivalent to linear regression with the use of suitable dummy variables. The conclusions remain the same irrespective of whether you use ANOVA or linear regression. 
+ +In light of their equivalence, is there any reason why ANOVA is used instead of linear regression? + +Note: I am particularly interested in hearing about **technical** reasons for the use of ANOVA instead of linear regression. + +**Edit** + +Here is one example using one-way ANOVA. Suppose, you want to know if the average height of male and females is the same. To test for your hypothesis you would collect data from a random sample of male and females (say 30 each) and perform the ANOVA analysis (i.e., sum of squares for gender and error) to decide whether an effect exists. + +You could also use linear regression to test for this as follows: + +Define: + +Gender = 1 if respondent is a male and 0 otherwise. + +Height = Intercept + beta * Gender + error + +where + +error ~ N(0,sigma^2) + +Then a test of whether beta = 0 is a an equivalent test for your hypothesis.",added 650 characters in body,2010-07-23 15:44:44.420 +1184,412,,6,,CC BY-SA 2.5,07dd87c5-5a69-4a97-b688-f7c2f19ef30a,,Added the 'subjective' tag to the question,2010-07-24 09:53:34.437 +1185,412,,9,,CC BY-SA 2.5,cdbfc8ae-9930-4e8d-85fb-2e9dd9e5ad49,,Rollback to [673b9773-839a-4584-a7a9-2c64642d1fd8],2010-07-24 09:58:03.123 +1253,412,,6,,CC BY-SA 2.5,51620419-30bc-4850-8129-9357dfc28496,,edited tags,2010-07-26 09:48:59.033 +1255,412,186.0,9,,CC BY-SA 2.5,6b52e104-aad0-4e04-bad5-8a5c25a64489,,Rollback to [cafcf180-f874-4b98-a9a4-8167bc6c8e35],2010-07-26 10:35:37.920 +2349,143,,6,,CC BY-SA 2.5,1d9dd915-3289-489f-8816-9699a9ffd407,,edited tags,2010-08-03 12:14:50.543 +2694,1248,399.0,1,,CC BY-SA 2.5,e161200b-7be7-4d74-86d2-a71767e9dcfa,Statistics Jokes,,2010-08-06 01:53:47.023 +2696,1248,399.0,3,,CC BY-SA 2.5,e161200b-7be7-4d74-86d2-a71767e9dcfa,,,2010-08-06 01:53:47.023 +2695,1248,399.0,2,,CC BY-SA 2.5,e161200b-7be7-4d74-86d2-a71767e9dcfa,"Well we've got favourite statistics quotes. What about statistics jokes? + +So, what's your favourite statistics joke?",,2010-08-06 01:53:47.023 +2699,1248,132.0,16,,,00000000-0000-0000-0000-000000000000,,,2010-08-06 02:30:56.857 +3008,28,4.0,16,,,c45d4a16-e06d-4fd0-9c2c-540cd4aafb53,,,2010-08-09 13:05:50.603 +3007,28,4.0,6,,CC BY-SA 2.5,c45d4a16-e06d-4fd0-9c2c-540cd4aafb53,,edited tags,2010-08-09 13:05:50.603 +3201,412,,5,,CC BY-SA 2.5,d733de28-55d8-4b51-8726-b570a14f5734,"What book would you recommend for scientists who are not statisticians? + +Clear delivery is most appreciated. As well as the explanation of the appropriate techniques and methods for typical tasks: time series analysis, presentation and aggregation of large data sets. +",added 2 characters in body; edited title,2010-08-11 08:26:44.373 +3202,412,,4,,CC BY-SA 2.5,d733de28-55d8-4b51-8726-b570a14f5734,What book would you recommend for non-statisticians?,added 2 characters in body; edited title,2010-08-11 08:26:44.373 +3207,414,,5,,CC BY-SA 2.5,2e347005-52d7-4f9b-abd1-cb638923071e,"This is one of my favorites: + +![alt text][1] + +One entry per answer. This is in the vein of the Stack Overflow question *[What’s your favorite “programmer” cartoon?][2]*. + +P.S. Do not hotlink the cartoon without the site's permission please. 
+ + [1]: http://imgs.xkcd.com/comics/correlation.png + [2]: http://stackoverflow.com/questions/84556/whats-your-favorite-programmer-cartoon",Named the link.,2010-08-11 08:49:02.137 +3437,143,0.0,11,,,24eae004-c4b8-4ea9-b154-03167c95c3d1,"{""Voters"":[{""Id"":251,""DisplayName"":""ars""},{""Id"":159,""DisplayName"":""Rob Hyndman""}]}",,2010-08-13 00:30:00.863 +4009,1760,723.0,2,,CC BY-SA 2.5,2f78dcca-ecb1-4452-ba76-9a3f08a4d4a8,"A question which bothered me for some time, which I don't know how to address: + +Every day, my weatherman gives a percentage chance of rain (let's assume its calculated to 9000 digits and he has never repeated a number). Every subsequent day, it either rains or does not rain. + +I have years of data - pct chance vs rain or not. *Given this weatherman's history*, if he says tonight that tomorrow's chance of rain is X, then what's my best guess as to what the chance of rain really is?",,2010-08-19 05:56:06.483 +4010,1760,723.0,3,,CC BY-SA 2.5,2f78dcca-ecb1-4452-ba76-9a3f08a4d4a8,,,2010-08-19 05:56:06.483 +4008,1760,723.0,1,,CC BY-SA 2.5,2f78dcca-ecb1-4452-ba76-9a3f08a4d4a8,Is my weatherman accurate?,,2010-08-19 05:56:06.483 +23159,8529,,25,,,b92a50c2-35a0-47ed-aecc-68cbee4e0a5b,,http://twitter.com/#!/StackStats/status/56463592052109312,2011-04-08 21:09:04.103 +23167,8529,,4,user88,CC BY-SA 3.0,11972e04-1927-4d3e-8f39-b951d3b85b96,What are some interesting and well-written applied statistics papers?,edited title,2011-04-08 23:28:40.090 +23237,8529,674.0,6,,CC BY-SA 3.0,582ad9c9-14e6-459d-9848-43f193916195,,add tag + link to CVJC paper,2011-04-09 18:54:46.130 +68726,22797,674.0,4,,CC BY-SA 3.0,808bec7e-d707-4701-afc0-0ab8c121377c,How does regression with and without intercept followed by test of stationarity affect cointegration test?,edited title,2012-04-02 10:34:42.647 +176732,54724,16174.0,6,,CC BY-SA 3.0,53b4720d-700c-43c1-84e2-57f73d508e02,,"Sentence case style in title, tag, LaTeX",2013-09-04 20:39:44.190 +176731,54724,,24,,CC BY-SA 3.0,53b4720d-700c-43c1-84e2-57f73d508e02,,"Proposed by 22468 approved by 7290, 14156 edit id of 5245",2013-09-04 20:39:44.190 +4068,1787,668.0,2,,CC BY-SA 2.5,77e02a1c-de83-4092-9220-e10415c1128e,"In effect you are thinking of a model in which the *true* chance of rain, *p*, is a function of the *predicted* chance *q*: *p* = *p(q*). Each time a prediction is made, you observe one realization of a Bernoulli variate having probability *p(q)* of success. This is a classic logistic regression setup if you are willing to model the true chance as a linear combination of basis functions *f1*, *f2*, ..., *fk*; that is, the model says + +>Logit(*p*) = *b0* + *b1 f1(q)* + *b2 f2(q)* + ... + *bk fk(q)* + *e* + +with iid errors *e*. If you're agnostic about the form of the relationship (although if the weatherman is any good *p(q) - q* should be reasonably small), consider using a set of splines for the basis. The output, as usual, consists of estimates of the coefficients and an estimate of the variance of *e*. Given any future prediction *q*, just plug the value into the model with the estimated coefficients to obtain an answer to your question (and use the variance of *e* to construct a prediction interval around that answer if you like). + +This framework is flexible enough to include other factors, such as the possibility of changes in the quality of predictions over time. 
It also lets you test hypotheses, such as whether *p* = *q* (which is what the weatherman implicitly claims).",,2010-08-19 13:21:56.153 +4922,2156,114.0,2,,CC BY-SA 2.5,de2d608e-1eae-4ffb-b7aa-21aab24c7e98,"I'm working on a small (200M) corpus of text, which I want to explore with some cluster analysis. What books or articles on that subject would you recommend? + +",,2010-09-01 23:57:06.760 +4923,2156,114.0,1,,CC BY-SA 2.5,de2d608e-1eae-4ffb-b7aa-21aab24c7e98,Recommended books or articles as introduction to Cluster Analysis?,,2010-09-01 23:57:06.760 +4924,2156,114.0,3,,CC BY-SA 2.5,de2d608e-1eae-4ffb-b7aa-21aab24c7e98,,,2010-09-01 23:57:06.760 +4955,2156,,16,user88,,00000000-0000-0000-0000-000000000000,,,2010-09-02 09:35:37.073 +4959,2169,674.0,2,,CC BY-SA 2.5,fd9acf1f-7cf0-4596-82c6-e8a9ce70feb0,"It may be worth looking at M.W. Berry's books: + +1. *Survey of Text Mining I: Clustering, Classification, and Retrieval* (2003) +2. *Survey of Text Mining II: Clustering, Classification, and Retrieval* (2008) + +They consist of series of applied and review papers. The latest seems to be available as PDF at the following address: http://bit.ly/deNeiy. + +Here are few links related to CA as applied to text mining: + +* [Document Topic Generation in Text Mining by Using Cluster Analysis with EROCK][1] +* [An Approach to Text Mining using Information Extraction][2] + +You can also look at *Latent Semantic Analysis*, but see my response there: [Working through a clustering problem][3]. + + + [1]: http://www.cscjournals.org/csc/manuscript/Journals/IJCSS/volume4/Issue2/IJCSS-271.pdf + [2]: http://www.google.fr/url?sa=t&source=web&cd=8&ved=0CFoQFjAH&url=http%3A%2F%2Fciteseerx.ist.psu.edu%2Fviewdoc%2Fdownload%3Fdoi%3D10.1.1.21.8862%26rep%3Drep1%26type%3Dpdf&ei=P3h_TJXfAdT34AbIuN3TCw&usg=AFQjCNHaVKyo45HtjNKiEPvFnPuvD2MvmQ&sig2=vPQ1BmIz8vb16q1Oysi8_g + [3]: http://stats.stackexchange.com/questions/369/working-through-a-clustering-problem/2196#2196",,2010-09-02 10:25:32.673 +4958,2169,0.0,16,,,fd9acf1f-7cf0-4596-82c6-e8a9ce70feb0,,,2010-09-02 10:25:32.673 +5781,2509,628.0,2,,CC BY-SA 2.5,27c1371d-3a9a-4aba-ad50-45fbe0e7477c,"In today's pattern recognition class my professor talked about Principal Component Analysis , Eigen Vectors & Eigen Values. + +I got the mathematics of it. If I'm asked to find eigen values etc. I'll do it correctly like a machine. But I didn't **Understand** it. I didn't get the purpose of it. I didn't get the feel of it. I strongly believe in + +>you do not really understand something unless you can explain it to your grandmother -- Albert Einstien + +Well, I can't explain these concepts to a layman or grandma. + +1. Why Principal Component Analysis , Eigen Vectors & Eigen Values? What was the *need* for these concepts? +2. How would you explain these to a layman?",,2010-09-15 20:05:55.993 +5782,2509,628.0,1,,CC BY-SA 2.5,27c1371d-3a9a-4aba-ad50-45fbe0e7477c,"Making sense of Principal Component Analysis , Eigen Vectors & Eigen Values",,2010-09-15 20:05:55.993 +5783,2509,628.0,3,,CC BY-SA 2.5,27c1371d-3a9a-4aba-ad50-45fbe0e7477c,,,2010-09-15 20:05:55.993 +5791,2509,668.0,5,,CC BY-SA 2.5,f7802b4e-ec49-4fa0-818b-48ab1b0e74d6,"In today's pattern recognition class my professor talked about Principal Component Analysis , Eigenvectors & Eigenvalues. + +I got the mathematics of it. If I'm asked to find eigenvalues etc. I'll do it correctly like a machine. But I didn't **Understand** it. I didn't get the purpose of it. I didn't get the feel of it. 
I strongly believe in + +>you do not really understand something unless you can explain it to your grandmother -- Albert Einstein + +Well, I can't explain these concepts to a layman or grandma. + +1. Why Principal Component Analysis , Eigenvectors & Eigenvalues? What was the *need* for these concepts? +2. How would you explain these to a layman?","Fixed spelling of ""eigenvectors"" and ""eigenvalues"".; deleted 1 characters in body",2010-09-15 21:36:58.120 +5792,2509,668.0,4,,CC BY-SA 2.5,f7802b4e-ec49-4fa0-818b-48ab1b0e74d6,"Making sense of Principal Component Analysis , Eigenvectors & Eigenvalues","Fixed spelling of ""eigenvectors"" and ""eigenvalues"".; deleted 1 characters in body",2010-09-15 21:36:58.120 +5797,2509,,4,user88,CC BY-SA 2.5,c6793033-d493-4d22-92c7-dd55595f21ce,"Making sense of principal component analysis, eigenvectors & eigenvalues","Fixed spelling and format, removed unrelated tags.",2010-09-15 22:14:55.840 +5798,2509,,6,user88,CC BY-SA 2.5,c6793033-d493-4d22-92c7-dd55595f21ce,,"Fixed spelling and format, removed unrelated tags.",2010-09-15 22:14:55.840 +5799,2509,,5,user88,CC BY-SA 2.5,c6793033-d493-4d22-92c7-dd55595f21ce,"In today's pattern recognition class my professor talked about PCA, eigenvectors & eigenvalues. + +I got the mathematics of it. If I'm asked to find eigenvalues etc. I'll do it correctly like a machine. But I didn't **understand** it. I didn't get the purpose of it. I didn't get the feel of it. I strongly believe in + +>you do not really understand something unless you can explain it to your grandmother -- Albert Einstein + +Well, I can't explain these concepts to a layman or grandma. + +1. Why PCA, eigenvectors & eigenvalues? What was the *need* for these concepts? +2. How would you explain these to a layman?","Fixed spelling and format, removed unrelated tags.",2010-09-15 22:14:55.840 +5811,2509,628.0,6,,CC BY-SA 2.5,855da758-58dc-4e69-956d-1c22aa1d4ecd,,edited tags,2010-09-16 04:37:42.657 +5851,1760,,6,user88,CC BY-SA 2.5,567aeb04-9f9f-47bb-946f-a25f5cdc1f0f,,edited tags,2010-09-16 06:59:48.217 +5868,2509,,6,user88,CC BY-SA 2.5,ace200aa-2690-4cf4-a2ba-8df9cbd31c4a,,edited tags,2010-09-16 07:13:27.197 +6097,2156,,6,user88,CC BY-SA 2.5,8f6ca963-1ec8-4380-b083-f985a474999b,,edited tags,2010-09-17 20:23:04.700 +6591,1760,190.0,6,,CC BY-SA 2.5,a6f6e9f6-f447-412f-8923-11b1ba896e95,,edited tags,2010-09-23 16:05:25.783 +30738,10911,22.0,1,,CC BY-SA 3.0,c0dec5b6-cdb4-4ac3-99c2-397b1aef27d2,How to calculate the confidence interval of the mean of means?,,2011-06-16 16:58:13.537 +179773,48658,,25,,,c9633158-4ccd-4218-8441-96ee96c3e23e,,http://twitter.com/#!/StackStats/status/380301995867127808,2013-09-18 12:07:14.613 +8978,3649,674.0,2,,CC BY-SA 2.5,ecf8eef0-7f18-40b3-b477-449356e0ce6d,"I found that Spearman correlation is mostly used in place of usual linear correlation when working with integer valued scores on a measurement scale, when it has a moderate number of possible scores or when we don't want to make rely on assumptions about the bivariate relationships. As compared to Pearson coefficient, the interpretation of Kendall's tau seems to me less direct than that of Spearman's rho, in the sense that it quantifies the difference between the % of concordant and discordant pairs among all possible pairwise events. In my understanding, Kendall's tau more closely resemble [Goodman-Kruskal Gamma][1]. + +I just browsed an article from Larry Winner in the J. Statistics Educ. 
(2006) which discusses the use of both measures, [NASCAR Winston Cup Race Results for 1975-2003][2]. + +I also found [@onestop][3] answer about [Pearson's or Spearman's correlation with non-normal data][4] interesting in this respect. + + + [1]: http://en.wikipedia.org/wiki/Gamma_test_(statistics) + [2]: http://www.amstat.org/publications/jse/v14n3/datasets.winner.html + [3]: http://stats.stackexchange.com/questions/3730/pearsons-or-spearmans-correlation-with-non-normal-data/3744#3744 + [4]: http://stats.stackexchange.com/questions/3730/pearsons-or-spearmans-correlation-with-non-normal-data",,2010-10-24 14:26:35.747 +8986,3646,,4,user88,CC BY-SA 2.5,e2a8e6de-b9e0-43a1-99b4-65d18f8c9c50,Kendall Tau or Spearman's rho?,edited title,2010-10-24 15:46:10.027 +9216,28,,6,user88,CC BY-SA 2.5,59d2fd0b-c0aa-4798-b72a-bdbfae0d45e4,,edited tags,2010-10-28 10:39:42.043 +9334,3649,674.0,5,,CC BY-SA 2.5,32685c81-b163-4348-aa75-1b11ad3f4128,"I found that Spearman correlation is mostly used in place of usual linear correlation when working with integer valued scores on a measurement scale, when it has a moderate number of possible scores or when we don't want to make rely on assumptions about the bivariate relationships. As compared to Pearson coefficient, the interpretation of Kendall's tau seems to me less direct than that of Spearman's rho, in the sense that it quantifies the difference between the % of concordant and discordant pairs among all possible pairwise events. In my understanding, Kendall's tau more closely resembles [Goodman-Kruskal Gamma][1]. + +I just browsed an article from Larry Winner in the J. Statistics Educ. (2006) which discusses the use of both measures, [NASCAR Winston Cup Race Results for 1975-2003][2]. + +I also found [@onestop][3] answer about [Pearson's or Spearman's correlation with non-normal data][4] interesting in this respect. + +Of note, Kendall's tau (the *a* version) has connection to Somers' D (and Harrell's C) used for predictive modelling (see e.g., [Interpretation of Somers’ D under four simple models][5] by RB Newson and reference [6] therein, and articles by Newson published in the Stata Journal 2006). An overview of rank-sum tests is provided in [Efficient Calculation of Jackknife Confidence Intervals for Rank Statistics][6], that was published in the JSS (2006). + + + [1]: http://en.wikipedia.org/wiki/Gamma_test_(statistics) + [2]: http://www.amstat.org/publications/jse/v14n3/datasets.winner.html + [3]: http://stats.stackexchange.com/questions/3730/pearsons-or-spearmans-correlation-with-non-normal-data/3744#3744 + [4]: http://stats.stackexchange.com/questions/3730/pearsons-or-spearmans-correlation-with-non-normal-data + [5]: http://www.imperial.ac.uk/nhli/r.newson/miscdocs/intsomd1.pdf + [6]: http://www.jstatsoft.org/v15/i01/paper",add a note on kendall's tau and predictive modelling,2010-10-31 10:29:50.693 +10524,4187,287.0,1,,CC BY-SA 2.5,a78d2b81-a02b-4460-b6f1-70eb56ff41b6,Statistical Sins,,2010-11-15 18:46:37.113 +10523,4187,287.0,2,,CC BY-SA 2.5,a78d2b81-a02b-4460-b6f1-70eb56ff41b6,"I'm a grad student in psychology, and as I pursue more and more independent studies in statistics, I am increasingly amazed by the inadequacy of my formal training. Both personal and second hand experience suggests that the paucity of statistical rigor in undergraduate and graduate training is rather ubiquitous within psychology. 
As such, I thought it would be useful for independent learners like myself to create a list of ""Statistical Sins"", tabulating statistical practices taught to grad students as standard practice that are in fact either superseded by superior (more powerful, or flexible, or robust, etc) modern methods or shown to be frankly invalid. Anticipating that other fields might also experience a similar state of affairs, I propose a community wiki where we can collect a list of Statistical Sins across disciplines. Please submit one ""sin"" per answer.",,2010-11-15 18:46:37.113 +10525,4187,287.0,3,,CC BY-SA 2.5,a78d2b81-a02b-4460-b6f1-70eb56ff41b6,,,2010-11-15 18:46:37.113 +10540,4187,,16,user88,,00000000-0000-0000-0000-000000000000,,,2010-11-15 20:28:53.713 +10547,4187,,4,user88,CC BY-SA 2.5,9443f777-cbfa-4c91-97ad-14a55f37830e,What are common statistical sins?,edited title,2010-11-15 20:29:19.393 +10548,4187,,6,user88,CC BY-SA 2.5,ca45a05e-b875-4679-8b09-79abc81142e9,,edited tags,2010-11-15 20:29:26.957 +11966,4705,1209.0,3,,CC BY-SA 2.5,0998ef5d-278a-43dd-bada-15885d12473c,,,2010-12-04 00:08:23.027 +11965,4705,1209.0,1,,CC BY-SA 2.5,0998ef5d-278a-43dd-bada-15885d12473c,Most famous statisticians,,2010-12-04 00:08:23.027 +11967,4705,1209.0,2,,CC BY-SA 2.5,0998ef5d-278a-43dd-bada-15885d12473c,What are the most important statisticians and what is it that made them famous? (Reply just one scientist per answer please),,2010-12-04 00:08:23.027 +11969,4705,,16,user88,,00000000-0000-0000-0000-000000000000,,,2010-12-04 00:12:46.677 +11971,4705,,6,user88,CC BY-SA 2.5,f4b20939-9bad-402d-8814-94b8a172c23f,,edited tags; added 3 characters in body,2010-12-04 00:13:35.163 +11970,4705,,5,user88,CC BY-SA 2.5,f4b20939-9bad-402d-8814-94b8a172c23f,"What are the most important statisticians and what is it that made them famous? +(Reply just one scientist per answer please)",edited tags; added 3 characters in body,2010-12-04 00:13:35.163 +11977,4705,1209.0,9,,CC BY-SA 2.5,f07672a7-e6c0-40c4-abf9-d21990165d5c,,Rollback to [0998ef5d-278a-43dd-bada-15885d12473c],2010-12-04 00:50:06.453 +11976,4705,1209.0,8,,CC BY-SA 2.5,f07672a7-e6c0-40c4-abf9-d21990165d5c,What are the most important statisticians and what is it that made them famous? (Reply just one scientist per answer please),Rollback to [0998ef5d-278a-43dd-bada-15885d12473c],2010-12-04 00:50:06.453 +11978,4705,1209.0,6,,CC BY-SA 2.5,a1721099-9036-4699-842d-268642f2cfb9,,edited tags,2010-12-04 00:58:41.857 +11984,4705,,5,user1108,CC BY-SA 2.5,c6e5a4e5-17d0-4936-9782-879a8565bce9,Who are the most important statisticians and what is it that made them famous? 
(Reply just one scientist per answer please),"changed ""what"" to ""who""",2010-12-04 02:46:25.963 +11995,4714,60.0,2,,CC BY-SA 2.5,3352fdd3-59c5-4e30-a8f7-eb83f1b625b7,[Thomas Bayes](http://en.wikipedia.org/wiki/Thomas_Bayes) for inventing Bayes' theorem,,2010-12-04 03:46:20.583 +11996,4714,,16,,,a55c79d7-bc33-46d8-ab20-7fd2745692be,,,2010-12-04 03:46:20.583 +12727,4705,,6,user88,CC BY-SA 2.5,584325bc-be4a-41e9-9610-e18fab7f9329,,edited tags,2010-12-13 18:16:21.700 +12749,4705,,6,,CC BY-SA 2.5,85d40d6e-6022-4e93-a149-983b0a77aae8,,edited tags,2010-12-13 23:12:56.030 +12757,5015,1542.0,1,,CC BY-SA 2.5,92add026-16b7-4c9c-8f33-8c20789f9710,regression: the interaction wipes out my direct effects...,,2010-12-13 23:43:17.117 +12758,5015,1542.0,3,,CC BY-SA 2.5,92add026-16b7-4c9c-8f33-8c20789f9710,,,2010-12-13 23:43:17.117 +12756,5015,1542.0,2,,CC BY-SA 2.5,92add026-16b7-4c9c-8f33-8c20789f9710,"In a regression, the interaction term wipes out both related direct effects? Do I drop the interaction or report the outcome? The interaction was not part of the original hypothesis. ",,2010-12-13 23:43:17.117 +12768,5020,1411.0,2,,CC BY-SA 2.5,25f11557-8533-4ac3-9e06-02f74a63b3d4,"I think this one is tricky; as you hint, there's 'moral hazard' here: if you hadn't looked at the interaction at all, you'd be free and clear, but now that you have there is a suspicion of data-dredging if you drop it. + +The key is *probably* a change in the meaning of your effects when you go from the main-effects-only to the interaction model. What you get for the 'main effects' depends very much on how your treatments and contrasts are coded. In R, the default is treatment contrasts with the first factor levels (the ones with the first names in alphabetical order unless you have gone out of your way to code them differently) as the baseline levels. + +Say (for simplicity) that you have two levels, 'control' and 'trt', for each factor. Without the interaction, the meaning of the 'v1.trt' parameter (assuming treatment contrasts as is the default in R) is ""average difference between 'v1.control' and 'v1.trt' group""; the meaning of the 'v2.trt' parameter is ""average difference between 'v2.control' and 'v2.trt'"". + +With the interaction, 'v1.trt' is the average difference between 'v1.control' and 'v1.trt' **in the 'v2.control' group**, and similarly 'v2.trt' is the average difference between v2 groups in the 'v1.control' group. Thus, if you have fairly small treatment effects in each of the control groups, but a large effect in the treatment groups, you could easily see what you're seeing. + +The only way I can see this happening *without* a significant interaction term, however, is if all the effects are fairly weak (so that what you really mean by ""the effect disappeared"" is that you went from p=0.06 to p=0.04, across the magic significance line). + +Another possibility is that you are 'using up too many degrees of freedom' -- that is, the parameter estimates don't actually change that much, but the residual error term is sufficiently inflated by having to estimate another 4 [ = (2-1)*(5-1)] parameters that your significant terms become non-significant. Again, I would only expect this with a small data set/relatively weak effects. + +One possible solution is to move to sum contrasts, although this is also delicate -- you have to be convinced that 'average effect' is meaningful in your case. 
The very best thing is to plot your data and to look at the coefficients and understand what's happening in terms of the estimated parameters. + +Hope that helps.",,2010-12-14 02:25:20.237 +12797,5015,674.0,5,,CC BY-SA 2.5,7da4b295-54b3-4e2a-9d0b-24a0eed66e21,"In a regression, the interaction term wipes out both related direct effects. Do I drop the interaction or report the outcome? The interaction was not part of the original hypothesis. ",edited body; edited title,2010-12-14 08:35:11.153 +12796,5015,674.0,4,,CC BY-SA 2.5,7da4b295-54b3-4e2a-9d0b-24a0eed66e21,What if interaction wipes out my direct effects in regression?,edited body; edited title,2010-12-14 08:35:11.153 +13201,1760,190.0,6,,CC BY-SA 2.5,cf0f7035-f62d-44a0-ad2d-65acc7414a4d,,edited tags,2010-12-20 07:28:59.943 +16377,4187,,38,user88,,6887d756-76f7-4279-bd02-adccbc90ac17,"[{""Id"":88,""DisplayName"":""mbq""}]",from http://stats.stackexchange.com/questions/6880/what-are-the-most-dangerous-concepts-in-the-practice-of-data-analysis,2011-02-06 11:01:16.173 +17674,6788,1790.0,2,,CC BY-SA 2.5,4a47f257-7908-4b94-be00-8058328c39fb,"I recently stumbled upon the concept of **sample complexity**, and was wondering if there are any texts, papers or tutorials that provide: + + 1. A good introduction to the concept + 2. An analysis of the sample complexity of different classification methods or kernel machines. + 3. Advice or information on how to measure it in practice. + +Any help with the topic would be greatly appreciated. + + +",,2011-02-19 22:41:23.000 +17676,6788,1790.0,3,,CC BY-SA 2.5,4a47f257-7908-4b94-be00-8058328c39fb,,,2011-02-19 22:41:23.000 +17675,6788,1790.0,1,,CC BY-SA 2.5,4a47f257-7908-4b94-be00-8058328c39fb,Measuring and analyzing sample complexity,,2011-02-19 22:41:23.000 +17697,6788,,25,,,5754225a-24bc-43dc-94fb-11f46b56aa28,,http://twitter.com/#!/StackStats/status/39238529569652736,2011-02-20 08:22:49.150 +17724,6788,1790.0,5,,CC BY-SA 2.5,a2ac4c34-4688-43ed-b4e6-224c50c69736,"I recently stumbled upon the concept of [**sample complexity**][1], and was wondering if there are any texts, papers or tutorials that provide: + + 1. A good introduction to the concept + 2. An analysis of the sample complexity of different classification methods or kernel machines. + 3. Advice or information on how to measure it in practice. + +Any help with the topic would be greatly appreciated. + + + [1]: http://www.google.com/search?sourceid=chrome&ie=UTF-8&q=%22sample%20complexity%22",added 93 characters in body,2011-02-20 17:52:31.897 +17729,6788,1790.0,5,,CC BY-SA 2.5,4ef8d6b5-e493-4e90-8c13-694a1956fdc2,"I recently stumbled upon the concept of [**sample complexity**][1], and was wondering if there are any texts, papers or tutorials that provide: + + 1. A good introduction to the concept + 2. An analysis of the sample complexity of different classification methods or kernel methods. + 3. Advice or information on how to measure it in practice. + +Any help with the topic would be greatly appreciated. 
+ + + [1]: http://www.google.com/search?q=%22sample%20complexity%22",deleted 25 characters in body; deleted 1 characters in body,2011-02-20 19:06:47.563 +21473,7965,1691.0,3,,CC BY-SA 2.5,e9e8566c-becc-467f-ad03-079813adc239,,,2011-03-24 14:51:27.800 +21472,7965,1691.0,1,,CC BY-SA 2.5,e9e8566c-becc-467f-ad03-079813adc239,Colinearity and scaling when using kmeans,,2011-03-24 14:51:27.800 +21471,7965,1691.0,2,,CC BY-SA 2.5,e9e8566c-becc-467f-ad03-079813adc239,"I'm trying to gain a better understanding of kmeans clustering and am still unclear about colinearity and scaling of data. To explore colinearity, I made a plot of all five variables that I am considering shown in the figure below, along with a correlation calculation. +![colinearity][1] + +I started off with a larger number of parameters, and excluded any that had a correlation higher than 0.6 (an assumption I made). The five I choose to include are shown in this diagram. + +Then, I scaled the date using the `R` function `scale(x)` before applying the `kmeans()` function. However, I'm not sure whether `center = TRUE` and `scale = TRUE` should also be included as I don't understand the differences that these arguments make. (The `scale()` description is given as `scale(x, center = TRUE, scale = TRUE)`). + +Is the process that I describe an appropriate way of identifying clusters? + + + + + [1]: https://i.stack.imgur.com/W5MZJ.jpg",,2011-03-24 14:51:27.800 +21482,7965,,4,user88,CC BY-SA 2.5,7c37b9c4-a16d-4065-82f3-a3ddc8437d2e,Colinearity and scaling when using k-means,edited title,2011-03-24 16:12:33.673 +21484,7965,,25,,,f582b0ff-7e07-4a71-baa2-988b09484c82,,http://twitter.com/#!/StackStats/status/50971048736329728,2011-03-24 17:23:39.727 +23138,8529,793.0,2,,CC BY-SA 3.0,d059ff9f-0d0d-425d-a518-c3b37da95746,"What are some good papers describing *applications* of statistics that would be fun and informative to read? Just to be clear, I'm not really looking for papers describing new statistical methods (e.g., a paper on least angle regression), but rather papers describing how to solve real-world problems. + +For example, one paper that would fit what I'm looking is the climate paper from the second Cross-Validated Journal Club. I'm kind of looking for more statistics-ish papers, rather than machine learning papers, but I guess it's kind of a fuzzy distinction (I'd classify the Netflix Prize papers as a bit borderline, and a paper on sentiment analysis as something I'm *not* looking for). + +I'm asking because most of the applications of statistics I've seen are either the little snippets you seen in textbooks, or things related to my own work, so I'd like to branch out a bit.",,2011-04-08 19:01:11.850 +23137,8529,793.0,1,,CC BY-SA 3.0,d059ff9f-0d0d-425d-a518-c3b37da95746,What are some interesting and well-written *applied* statistics papers to read?,,2011-04-08 19:01:11.850 +23139,8529,793.0,3,,CC BY-SA 3.0,d059ff9f-0d0d-425d-a518-c3b37da95746,,,2011-04-08 19:01:11.850 +23140,8529,668.0,16,,,ac32a436-a0b4-4ad8-bc04-d15a5937bcb1,,,2011-04-08 19:04:21.230 +23141,8529,793.0,4,,CC BY-SA 3.0,78024e68-fd28-435a-8c4e-d5171457e657,What are some interesting and well-written *applied* statistics papers?,edited title,2011-04-08 19:10:58.470 +23236,8529,674.0,5,,CC BY-SA 3.0,582ad9c9-14e6-459d-9848-43f193916195,"What are some good papers describing *applications* of statistics that would be fun and informative to read? 
Just to be clear, I'm not really looking for papers describing new statistical methods (e.g., a paper on least angle regression), but rather papers describing how to solve real-world problems. + +For example, one paper that would fit what I'm looking is the climate paper from the [second Cross-Validated Journal Club][1]. I'm kind of looking for more statistics-ish papers, rather than machine learning papers, but I guess it's kind of a fuzzy distinction (I'd classify the Netflix Prize papers as a bit borderline, and a paper on sentiment analysis as something I'm *not* looking for). + +I'm asking because most of the applications of statistics I've seen are either the little snippets you seen in textbooks, or things related to my own work, so I'd like to branch out a bit. + + + [1]: http://meta.stats.stackexchange.com/questions/685/second-cross-validated-journal-club",add tag + link to CVJC paper,2011-04-09 18:54:46.130 +23707,8681,1040.0,3,,CC BY-SA 3.0,38e5df04-e0a7-429c-a6fc-5bbd8c1e1979,,,2011-04-14 01:33:55.987 +23706,8681,1040.0,1,,CC BY-SA 3.0,38e5df04-e0a7-429c-a6fc-5bbd8c1e1979,Where can I find good publicly available data that I could use to teach z-scores to my college students?,,2011-04-14 01:33:55.987 +23705,8681,1040.0,2,,CC BY-SA 3.0,38e5df04-e0a7-429c-a6fc-5bbd8c1e1979,I am sick of using the examples in the book. Is there an easy place to find data for which z-score/percentile/normal distribution stuff would be easy to see?,,2011-04-14 01:33:55.987 +23711,8681,,25,,,89f82674-de81-4977-91aa-a6b75f07e493,,http://twitter.com/#!/StackStats/status/58366156850999296,2011-04-14 03:09:10.617 +23758,8699,155.0,2,,CC BY-SA 3.0,a969388e-10a0-4c40-b68e-4762e25de01f,"You may wish to read answers to this existing question on [freely available datasets](http://stats.stackexchange.com/questions/7/locating-freely-available-data-samples). + +In general, I imagine that you'd want a dataset with some interesting metric variables. +In psychology research methods classes that I've taught, we've often looked at datasets with intelligence or personality test scores. + +If you want a personality example, I have some [personality data and metadata on github](https://github.com/jeromyanglim/Sweave_Personality_Reports) based on the [IPIP](http://ipip.ori.org/), an public domain measure of the Big 5 factors of personality. + +* [github repository home](https://github.com/jeromyanglim/Sweave_Personality_Reports) +* [data](https://github.com/jeromyanglim/Sweave_Personality_Reports/blob/master/data/ipip.tsv) +* [metadata](https://github.com/jeromyanglim/Sweave_Personality_Reports/blob/master/meta/ipipmeta.tsv) +* [David Smith's summary](http://blog.revolutionanalytics.com/2010/12/how-to-create-pdf-reports-with-r.html)",,2011-04-14 10:55:13.367 +26020,412,1693.0,4,,CC BY-SA 3.0,cde75dd3-4f25-4c36-b8b6-b15e4b15091a,What book would you recommend for non-statistician scientists?,That 1 word seemed necessary.,2011-05-04 18:19:29.047 +26420,9524,2872.0,1,,CC BY-SA 3.0,b70c32e3-f269-4e7e-8fdb-8fb77aa0fa6d,"Any good movies with maths, probabilities etc?",,2011-05-07 11:13:51.243 +26419,9524,2872.0,2,,CC BY-SA 3.0,b70c32e3-f269-4e7e-8fdb-8fb77aa0fa6d,"Can you suggest some good movies which involve math, probabilities etc? One example is [21][1]. I would also be interested in movies that involve algorithms (e.g. text decryption). In general ""geeky"" movies with famous scientific theories but no science fiction or documentaries. Thanks in advance! 
+ + + [1]: http://en.wikipedia.org/wiki/21_%282008_film%29",,2011-05-07 11:13:51.243 +26421,9524,2872.0,3,,CC BY-SA 3.0,b70c32e3-f269-4e7e-8fdb-8fb77aa0fa6d,,,2011-05-07 11:13:51.243 +26426,9524,,25,,,78ac57da-6346-4ccb-9f31-ed7f30c5100a,,http://twitter.com/#!/StackStats/status/66837557815676928,2011-05-07 12:11:30.140 +26427,9524,674.0,16,,,101c07d2-4565-4357-abbe-0b1d27f2aced,,,2011-05-07 12:32:39.210 +26429,9524,674.0,4,,CC BY-SA 3.0,db37784d-623d-4729-9123-68d777ee8d93,Are there any good movies involving mathematics or probability?,edited tags; edited title,2011-05-07 12:34:34.433 +26428,9524,674.0,6,,CC BY-SA 3.0,db37784d-623d-4729-9123-68d777ee8d93,,edited tags; edited title,2011-05-07 12:34:34.433 +26440,9529,192.0,16,,,0a4c4ecc-5f60-49ec-b06b-60958c546b3b,,,2011-05-07 14:19:42.887 +26439,9529,192.0,2,,CC BY-SA 3.0,1b2c43cf-4004-4f05-892c-6916f9394665,"[Pi][1] + + + [1]: http://www.imdb.com/title/tt0138704/",,2011-05-07 14:19:42.887 +27801,10008,1506.0,3,,CC BY-SA 3.0,a636a9f6-56e9-4a4a-b514-bbbdc8628926,,,2011-05-20 01:19:45.107 +27799,10008,1506.0,2,,CC BY-SA 3.0,a636a9f6-56e9-4a4a-b514-bbbdc8628926,"Is it ever valid to include a two-way interaction in a model without including the main effects? What if your hypothesis is only about the interaction, do you still need to include the main effects?",,2011-05-20 01:19:45.107 +27800,10008,1506.0,1,,CC BY-SA 3.0,a636a9f6-56e9-4a4a-b514-bbbdc8628926,Including the Interaction but not the Main Effects in a Model,,2011-05-20 01:19:45.107 +27808,10008,,25,,,e98ca262-ea36-4578-bfbb-edc1a72adb28,,http://twitter.com/#!/StackStats/status/71412951185235969,2011-05-20 03:12:29.113 +27839,10008,,4,user88,CC BY-SA 3.0,6ce9f82e-fcb2-45dc-bfe0-16d6be1bb924,Including the interaction but not the main effects in a model,edited title,2011-05-20 09:20:21.180 +27970,10069,2666.0,2,,CC BY-SA 3.0,a1336a00-51f7-4503-bea7-c598c55dae2e,"In my experience, not only is it necessary to have all lower order effects in the model when they are connected to higher order effects, but it is also important to properly model (e.g., allowing to be nonlinear) main effects that are seemingly unrelated to the factors in the interactions of interest. That's because interactions between x1 and x2 can be stand-ins for main effects of x3 and x4. I.e. sometimes *seem* to be needed because they are collinear with omitted variables or omitted nonlinear (e.g., spline) terms.",,2011-05-21 12:31:20.447 +29493,10541,2690.0,2,,CC BY-SA 3.0,6044637d-d46a-4f80-8894-da665d912571,"Does any know the reference/link where i can find the matlab implementation of gap statistics for clustering as mentioned in [this][1] paper. + + + [1]: http://gremlin1.gdcb.iastate.edu/MIP/gene/MicroarrayData/gapstatistics.pdf",,2011-06-05 05:32:14.513 +29494,10541,2690.0,1,,CC BY-SA 3.0,6044637d-d46a-4f80-8894-da665d912571,Gap Statistics matlab Implementation,,2011-06-05 05:32:14.513 +29495,10541,2690.0,3,,CC BY-SA 3.0,6044637d-d46a-4f80-8894-da665d912571,,,2011-06-05 05:32:14.513 +29496,10541,,25,,,b96194d6-17ac-4792-8c40-8c864fccbd98,,http://twitter.com/#!/StackStats/status/77256861539250176,2011-06-05 06:14:06.167 +29521,10541,,4,user88,CC BY-SA 3.0,a809b720-a5d8-45d7-969f-9988df693472,Gap statistics MATLAB implementation,edited body; edited title,2011-06-05 12:11:37.537 +29522,10541,,5,user88,CC BY-SA 3.0,a809b720-a5d8-45d7-969f-9988df693472,"Does any know the reference/link where i can find the MATLAB implementation of gap statistics for clustering as mentioned in [this][1] paper? 
+ + + [1]: http://gremlin1.gdcb.iastate.edu/MIP/gene/MicroarrayData/gapstatistics.pdf",edited body; edited title,2011-06-05 12:11:37.537 +30737,10911,22.0,3,,CC BY-SA 3.0,c0dec5b6-cdb4-4ac3-99c2-397b1aef27d2,,,2011-06-16 16:58:13.537 +30739,10911,22.0,2,,CC BY-SA 3.0,c0dec5b6-cdb4-4ac3-99c2-397b1aef27d2,"Imagine that you repeat an experiment three times. In each experiment, you collect triplicate measurements. The triplicates tend to be fairly close together, compared to the differences among the three experimental means. Computing the grand mean is pretty easy. But how can one compute a confidence interval for the grand mean? + +Sample data: + +Experiment 1: 34, 41, 39 + +Experiment 2: 45, 51, 52 + +Experiment 3: 29, 31, 35 + +Assume that the replicate values within an experiment follow a Gaussian distribution, as does the mean values of each experiment. The SD of variation within an experiment is smaller than the SD among experimental means. + +The simple approach is to first compute the mean of each experiment: 38.0, 49.3, and 31.7, and then compute the mean, and its 95% confidence interval, of those three values. Using this method, the grand mean is 39.7 with the 95% confidence interval ranging from 17.4 to 61.9. + +The problem with that approach is that it totally ignores the variation among triplicates. I wonder if there isn't a good way to account for that variation. ",,2011-06-16 16:58:13.537 +31121,10911,22.0,5,,CC BY-SA 3.0,a1c18211-9322-4c90-93ae-9b2e53bdb965,"Imagine that you repeat an experiment three times. In each experiment, you collect triplicate measurements. The triplicates tend to be fairly close together, compared to the differences among the three experimental means. Computing the grand mean is pretty easy. But how can one compute a confidence interval for the grand mean? + +Sample data: + +Experiment 1: 34, 41, 39 + +Experiment 2: 45, 51, 52 + +Experiment 3: 29, 31, 35 + +Assume that the replicate values within an experiment follow a Gaussian distribution, as does the mean values of each experiment. The SD of variation within an experiment is smaller than the SD among experimental means. Assume also that there is no ordering of the three values in each experiment. The left-to-right order of the three values in each row is entirely arbitrary. + +The simple approach is to first compute the mean of each experiment: 38.0, 49.3, and 31.7, and then compute the mean, and its 95% confidence interval, of those three values. Using this method, the grand mean is 39.7 with the 95% confidence interval ranging from 17.4 to 61.9. + +The problem with that approach is that it totally ignores the variation among triplicates. I wonder if there isn't a good way to account for that variation. ",Explained that the left-to-right order of the replicates in each row is arbitrary.,2011-06-20 15:10:13.723 +33635,412,668.0,38,,,275be647-00fe-46a4-811b-2844fbf9f5dc,"[{""Id"":919,""DisplayName"":""whuber""}]",from http://stats.stackexchange.com/questions/12964/introductory-materials-in-statistical-analysis-and-data-visualisation,2011-07-13 18:53:43.480 +34460,10911,668.0,6,,CC BY-SA 3.0,e3f36c61-5f49-455b-a259-c476f4660b60,,edited tags,2011-07-19 21:18:45.377 +36699,4714,60.0,5,,CC BY-SA 3.0,5b5afa53-4981-4e4a-a881-ab648c8234be,Rev. 
[Thomas Bayes](http://en.wikipedia.org/wiki/Thomas_Bayes) for discovering Bayes' theorem,added 7 characters in body,2011-08-09 17:16:30.820 +37746,13058,1124.0,2,,CC BY-SA 3.0,0f962e58-a484-4f48-9a63-a6041886afd2,"Anybody have any experience with software (preferably free, preferably open source) that will take an image of data plotted on cartesian coordinates (a standard, everyday plot) and extract the coordinates of the points plotted on the graph? + +Essentially, this is a data-mining problem and a **reverse** data-visualization problem.",,2011-08-18 04:14:22.583 +37744,13058,1124.0,3,,CC BY-SA 3.0,0f962e58-a484-4f48-9a63-a6041886afd2,,,2011-08-18 04:14:22.583 +37745,13058,1124.0,1,,CC BY-SA 3.0,0f962e58-a484-4f48-9a63-a6041886afd2,Software needed to scrape data from graph,,2011-08-18 04:14:22.583 +37748,13060,1805.0,2,,CC BY-SA 3.0,900d659c-78e5-4ef0-bcdc-cc38d92cd8dd,"Check out the [digitize][1] package for [R][2]. Its designed to solve exactly this sort of problem. + + + [1]: http://cran.r-project.org/web/packages/digitize/index.html + [2]: http://cran.r-project.org/",,2011-08-18 05:14:07.900 +37752,13058,,25,,,cd66f7ee-f31b-4427-b17d-bd6ecb46d669,,http://twitter.com/#!/StackStats/status/104079744433270784,2011-08-18 06:38:39.437 +39549,13631,4221.0,1,,CC BY-SA 3.0,a67f7563-8add-48b4-98c6-aa3af25404e7,Forecasting binary time series,,2011-09-01 14:56:28.933 +39547,13631,4221.0,3,,CC BY-SA 3.0,a67f7563-8add-48b4-98c6-aa3af25404e7,,,2011-09-01 14:56:28.933 +39548,13631,4221.0,2,,CC BY-SA 3.0,a67f7563-8add-48b4-98c6-aa3af25404e7,"I have a binary time series with 1 when the car is not moving, and 0 when the car is moving. I want to make a forecast for a time horizon up to 36 hours ahead and for each hour. + +My first approach was to use a Naive Bayes using the following inputs: t-24 (daily seasonal), t-48 (weekly seasonal), hour of the day. However, the results are not very good. + +Which articles or software do you recommend for this problem? + +Thanks, +Ricardo Bessa",,2011-09-01 14:56:28.933 +39550,13631,668.0,6,,CC BY-SA 3.0,db02cef5-5db9-4265-836b-d90858e51908,,edited tags,2011-09-01 15:05:15.883 +39554,13631,,25,,,23f2eb64-be55-40c1-b49c-2315201d8b97,,http://twitter.com/#!/StackStats/status/109289232429883392,2011-09-01 15:39:17.967 +39697,13631,668.0,6,,CC BY-SA 3.0,ef643b3b-2265-4631-99e4-24acd48794ff,,deleted 26 characters in body; edited tags,2011-09-02 18:54:00.667 +39698,13631,668.0,5,,CC BY-SA 3.0,ef643b3b-2265-4631-99e4-24acd48794ff,"I have a binary time series with 1 when the car is not moving, and 0 when the car is moving. I want to make a forecast for a time horizon up to 36 hours ahead and for each hour. + +My first approach was to use a Naive Bayes using the following inputs: t-24 (daily seasonal), t-48 (weekly seasonal), hour of the day. However, the results are not very good. + +Which articles or software do you recommend for this problem?",deleted 26 characters in body; edited tags,2011-09-02 18:54:00.667 +43119,14729,5898.0,3,,CC BY-SA 3.0,b4530a21-db63-4663-ad43-8b417207fe2f,,,2011-10-01 17:46:37.323 +43118,14729,5898.0,1,,CC BY-SA 3.0,b4530a21-db63-4663-ad43-8b417207fe2f,testing for linear dependence among the columns of a matrix,,2011-10-01 17:46:37.323 +43120,14729,5898.0,2,,CC BY-SA 3.0,b4530a21-db63-4663-ad43-8b417207fe2f,"I have a correlation matrix of security returns whose determinant is zero. (This is a bit surprising since the sample correlation matrix and the corresponding covariance matrix should theoretically be positive semi-definite.) 
+ +My hypothesis is that at least one security is linearly dependent on other securities. Is there a function in R that sequentially tests each column a matrix for linear dependence? + +For example, one approach would be to build up a correlation matrix one security at a time and calculate the determinant at each step. When the determinant = 0 then stop as you have identified the security who is a linear combination of other securities. + +Any other techniques to identify linear dependence in such a matrix are appreciated.",,2011-10-01 17:46:37.323 +179775,55576,21833.0,1,,CC BY-SA 3.0,aaf7aaeb-ac21-4d4d-b4be-c8c1715d8b8d,Generating random numbers based on partial correlation data,,2013-09-18 12:09:15.653 +179776,55576,21833.0,3,,CC BY-SA 3.0,aaf7aaeb-ac21-4d4d-b4be-c8c1715d8b8d,,,2013-09-18 12:09:15.653 +43122,14729,5898.0,5,,CC BY-SA 3.0,67de931f-9e3f-43e6-a759-f35741eb39d5,"I have a correlation matrix of security returns whose determinant is zero. (This is a bit surprising since the sample correlation matrix and the corresponding covariance matrix should theoretically be positive definite.) + +My hypothesis is that at least one security is linearly dependent on other securities. Is there a function in R that sequentially tests each column a matrix for linear dependence? + +For example, one approach would be to build up a correlation matrix one security at a time and calculate the determinant at each step. When the determinant = 0 then stop as you have identified the security who is a linear combination of other securities. + +Any other techniques to identify linear dependence in such a matrix are appreciated.",deleted 5 characters in body,2011-10-01 17:54:45.020 +43138,14729,674.0,4,,CC BY-SA 3.0,b9f630bd-8fd3-43be-be43-353b98b2b8c2,Testing for linear dependence among the columns of a matrix,edited title,2011-10-01 19:39:48.100 +43333,14790,2081.0,2,,CC BY-SA 3.0,9d1571c5-e38f-43bd-b1d4-75ee38f08767,"You seem to ask a really provoking question: how to detect, given a singular correlation (or covariance, or sum-of-squares-and-cross-product) matrix, which column is linearly dependent on which. I tentatively suppose that **sweep operation** could help. Here is my probe in SPSS (not R) to illustrate. + +Let's generate some data: + + v1 v2 v3 v4 v5 + -1.64454 .35119 -.06384 -1.05188 .25192 + -1.78520 -.21598 1.20315 .40267 1.14790 + 1.36357 -.96107 -.46651 .92889 -1.38072 + -.31455 -.74937 1.17505 1.27623 -1.04640 + -.31795 .85860 .10061 .00145 .39644 + -.97010 .19129 2.43890 -.83642 -.13250 + -.66439 .29267 1.20405 .90068 -1.78066 + .87025 -.89018 -.99386 -1.80001 .42768 + -1.96219 -.27535 .58754 .34556 .12587 + -1.03638 -.24645 -.11083 .07013 -.84446 + +Let's create some linear dependancy between V2, V4 and V5: + + compute V4 = .4*V2+1.2*V5. + execute. + +So, we modified our column V4. + + matrix. + get X. /*take the data*/ + compute M = sscp(dat). /*SSCP matrix, X'X; it is singular*/ + print rank(M). /*with rank 5-1=4, because there's 1 group of interdependent columns*/ + loop i= 1 to 5. /*Start iterative sweep operation on M from column 1 to column 5*/ + -compute M = sweep(M,i). + -print M. /*That's printout we want to trace*/ + end loop. + end matrix. 
+ +The printouts of M in 5 iterations: + + M + .06660028 -.12645565 -.54275426 -.19692972 -.12195621 + .12645565 3.20350385 -.08946808 2.84946215 1.30671718 + .54275426 -.08946808 7.38023317 -3.51467361 -2.89907198 + .19692972 2.84946215 -3.51467361 13.88671851 10.62244471 + .12195621 1.30671718 -2.89907198 10.62244471 8.41646486 + + M + .07159201 .03947417 -.54628594 -.08444957 -.07037464 + .03947417 .31215820 -.02792819 .88948298 .40790248 + .54628594 .02792819 7.37773449 -3.43509328 -2.86257773 + .08444957 -.88948298 -3.43509328 11.35217042 9.46014202 + .07037464 -.40790248 -2.86257773 9.46014202 7.88345168 + + M + .112041875 .041542117 .074045215 -.338801789 -.282334825 + .041542117 .312263922 .003785470 .876479537 .397066281 + .074045215 .003785470 .135542964 -.465602725 -.388002270 + .338801789 -.876479537 .465602725 9.752781632 8.127318027 + .282334825 -.397066281 .388002270 8.127318027 6.772765022 + + M + .1238115070 .0110941027 .0902197842 .0347389906 .0000000000 + .0110941027 .3910328733 -.0380581058 -.0898696977 -.3333333333 + .0902197842 -.0380581058 .1577710733 .0477405054 .0000000000 + .0347389906 -.0898696977 .0477405054 .1025348498 .8333333333 + .0000000000 .3333333333 .0000000000 -.8333333333 .0000000000 + + M + .1238115070 .0110941027 .0902197842 .0347389906 .0000000000 + .0110941027 .3910328733 -.0380581058 -.0898696977 .0000000000 + .0902197842 -.0380581058 .1577710733 .0477405054 .0000000000 + .0347389906 -.0898696977 .0477405054 .1025348498 .0000000000 + .0000000000 .0000000000 .0000000000 .0000000000 .0000000000 + +Notice that eventually column 5 got full of zeros. This means (as I understand it) that V5 is linearly tied with some of *preceeding* columns. Which columns? Look at iteration where column 5 is last not full of zeroes - iteration 4. We see there that V5 is tied with V2 and V4 with coefficients -.3333 and .8333: V5 = -.3333*V2+.8333*V4, which corresponds to what we've done with the data: V4 = .4*V2+1.2*V5. + +That's how we knew which column is linearly tied with which other. I didn't check how helpful is the above approach in more general case with many groups of interdependancies in the data. In the above example it appeared helpful, though.",,2011-10-03 09:34:57.197 +43335,14790,2081.0,5,,CC BY-SA 3.0,f91a351c-14c3-4445-bf1c-1c2ced8329a6,"You seem to ask a really provoking question: how to detect, given a singular correlation (or covariance, or sum-of-squares-and-cross-product) matrix, which column is linearly dependent on which. I tentatively suppose that **sweep operation** could help. Here is my probe in SPSS (not R) to illustrate. + +Let's generate some data: + + v1 v2 v3 v4 v5 + -1.64454 .35119 -.06384 -1.05188 .25192 + -1.78520 -.21598 1.20315 .40267 1.14790 + 1.36357 -.96107 -.46651 .92889 -1.38072 + -.31455 -.74937 1.17505 1.27623 -1.04640 + -.31795 .85860 .10061 .00145 .39644 + -.97010 .19129 2.43890 -.83642 -.13250 + -.66439 .29267 1.20405 .90068 -1.78066 + .87025 -.89018 -.99386 -1.80001 .42768 + -1.96219 -.27535 .58754 .34556 .12587 + -1.03638 -.24645 -.11083 .07013 -.84446 + +Let's create some linear dependancy between V2, V4 and V5: + + compute V4 = .4*V2+1.2*V5. + execute. + +So, we modified our column V4. + + matrix. + get X. /*take the data*/ + compute M = sscp(X). /*SSCP matrix, X'X; it is singular*/ + print rank(M). /*with rank 5-1=4, because there's 1 group of interdependent columns*/ + loop i= 1 to 5. /*Start iterative sweep operation on M from column 1 to column 5*/ + -compute M = sweep(M,i). + -print M. 
/*That's printout we want to trace*/ + end loop. + end matrix. + +The printouts of M in 5 iterations: + + M + .06660028 -.12645565 -.54275426 -.19692972 -.12195621 + .12645565 3.20350385 -.08946808 2.84946215 1.30671718 + .54275426 -.08946808 7.38023317 -3.51467361 -2.89907198 + .19692972 2.84946215 -3.51467361 13.88671851 10.62244471 + .12195621 1.30671718 -2.89907198 10.62244471 8.41646486 + + M + .07159201 .03947417 -.54628594 -.08444957 -.07037464 + .03947417 .31215820 -.02792819 .88948298 .40790248 + .54628594 .02792819 7.37773449 -3.43509328 -2.86257773 + .08444957 -.88948298 -3.43509328 11.35217042 9.46014202 + .07037464 -.40790248 -2.86257773 9.46014202 7.88345168 + + M + .112041875 .041542117 .074045215 -.338801789 -.282334825 + .041542117 .312263922 .003785470 .876479537 .397066281 + .074045215 .003785470 .135542964 -.465602725 -.388002270 + .338801789 -.876479537 .465602725 9.752781632 8.127318027 + .282334825 -.397066281 .388002270 8.127318027 6.772765022 + + M + .1238115070 .0110941027 .0902197842 .0347389906 .0000000000 + .0110941027 .3910328733 -.0380581058 -.0898696977 -.3333333333 + .0902197842 -.0380581058 .1577710733 .0477405054 .0000000000 + .0347389906 -.0898696977 .0477405054 .1025348498 .8333333333 + .0000000000 .3333333333 .0000000000 -.8333333333 .0000000000 + + M + .1238115070 .0110941027 .0902197842 .0347389906 .0000000000 + .0110941027 .3910328733 -.0380581058 -.0898696977 .0000000000 + .0902197842 -.0380581058 .1577710733 .0477405054 .0000000000 + .0347389906 -.0898696977 .0477405054 .1025348498 .0000000000 + .0000000000 .0000000000 .0000000000 .0000000000 .0000000000 + +Notice that eventually column 5 got full of zeros. This means (as I understand it) that V5 is linearly tied with some of *preceeding* columns. Which columns? Look at iteration where column 5 is last not full of zeroes - iteration 4. We see there that V5 is tied with V2 and V4 with coefficients -.3333 and .8333: V5 = -.3333*V2+.8333*V4, which corresponds to what we've done with the data: V4 = .4*V2+1.2*V5. + +That's how we knew which column is linearly tied with which other. I didn't check how helpful is the above approach in more general case with many groups of interdependancies in the data. In the above example it appeared helpful, though.",deleted 2 characters in body,2011-10-03 09:44:41.570 +44941,15281,3641.0,3,,CC BY-SA 3.0,308ec5ab-4a6f-4041-bbfe-4a01e09300f0,,,2011-10-13 11:46:03.773 +44939,15281,3641.0,2,,CC BY-SA 3.0,308ec5ab-4a6f-4041-bbfe-4a01e09300f0,"Is there a specific method to detect change points(structural breaks) in a timeseries?(stocks prices) + +Thanks",,2011-10-13 11:46:03.773 +44940,15281,3641.0,1,,CC BY-SA 3.0,308ec5ab-4a6f-4041-bbfe-4a01e09300f0,How to detect change(shift) in a timeseries,,2011-10-13 11:46:03.773 +44975,15281,668.0,6,,CC BY-SA 3.0,492894dc-a353-47ec-943c-4cafd3a9248f,,edited tags,2011-10-13 17:35:25.130 +45075,15281,3641.0,4,,CC BY-SA 3.0,e84cc2a1-4c13-4b3b-9cb6-e53e712672c0,How to detect structural change in a timeseries,edited title,2011-10-14 11:36:29.290 +45752,15542,4911.0,3,,CC BY-SA 3.0,ff1de0f7-ac9e-4895-b7ee-071f6b935b5a,,,2011-10-18 21:24:39.543 +45751,15542,4911.0,1,,CC BY-SA 3.0,ff1de0f7-ac9e-4895-b7ee-071f6b935b5a,"What are the ""hot algorithms"" for machine learning?",,2011-10-18 21:24:39.543 +45750,15542,4911.0,2,,CC BY-SA 3.0,ff1de0f7-ac9e-4895-b7ee-071f6b935b5a,"this is a naive question from someone starting to learn machine learning. I'm reading these days the book ""Machine Learning: An algorithmic perspective"" from Marsland. 
I find it useful as an introductory book, but now I would like to go into advanced algorithms, those that are currently giving the best results. I'm mostly interested in bioinformatics: clustering of biological networks and finding patterns in biological sequences, particularly applied to single nucleotide polymorphism (SNP) analysis. Could you recommend me some reivews or books to read? + +Many thanks! +",,2011-10-18 21:24:39.543 +45776,15542,,25,,,580c95ca-cc39-4224-a465-f9a877f814f8,,http://twitter.com/#!/StackStats/status/126418768904720385,2011-10-18 22:05:57.933 +47868,16209,5196.0,2,,CC BY-SA 3.0,73dde68c-0572-477b-8721-a2746443264a,"How to convert the x below to into a vector like y? + + x <- [""a"", ""b"", ""b"", ""c"", ...] + + y <- [1, 2, 2, 3, ...] + +",,2011-11-06 09:47:15.640 +47866,16209,5196.0,3,,CC BY-SA 3.0,73dde68c-0572-477b-8721-a2746443264a,,,2011-11-06 09:47:15.640 +47867,16209,5196.0,1,,CC BY-SA 3.0,73dde68c-0572-477b-8721-a2746443264a,How to convert an vector of enumerable strings into a vector of numbers?,,2011-11-06 09:47:15.640 +47878,16212,1927.0,2,,CC BY-SA 3.0,56c96588-9bef-4359-af3d-80193163a6b0,"Here is a possibility, very similar than that of @Roman Lustrik, but just a little bit more automatic. + +Say that + > x <- c(""a"", ""b"", ""b"", ""c"") + +Then + + > x <- as.factor(x) + > levels(x) <- 1:length(levels(x)) + > x <- as.numeric(x) + +makes the job: + + > print(x) + [1] 1 2 2 3 +",,2011-11-06 11:12:09.377 +47880,16209,5196.0,5,,CC BY-SA 3.0,8a95de3c-2fbb-4839-ba1f-aaacddcebab9,"How to convert the x below to into a vector like y? + + x <- [""a"", ""b"", ""b"", ""c"", ...] + + y <- [1, 2, 2, 3, ...] + + +**UPDATE:** + +I end up with: + + levels(x) <- 1:length(levels(x))",added 63 characters in body,2011-11-06 11:21:43.040 +48046,16209,668.0,4,,CC BY-SA 3.0,530f9c82-47d1-4163-9446-238f4028c866,How to convert a vector of enumerable strings into a vector of numbers?,edited title,2011-11-07 22:02:53.807 +48171,16313,5234.0,1,,CC BY-SA 3.0,2a13055a-934e-4f80-99a6-bc2164c1deda,How does the Goodman-Kruskal Gamma test and the Kendall-Tau or Spearman-Rho test compare?,,2011-11-09 02:39:58.810 +48170,16313,5234.0,3,,CC BY-SA 3.0,2a13055a-934e-4f80-99a6-bc2164c1deda,,,2011-11-09 02:39:58.810 +48172,16313,5234.0,2,,CC BY-SA 3.0,2a13055a-934e-4f80-99a6-bc2164c1deda,"In my work, we are comparing predicted rankings versus true rankings for some sets of data. Up until recently, we've been using Kendall-Tau alone. A group working on a similar project suggested we try to use the [Goodman-Kruskal Gamma test](http://en.wikipedia.org/wiki/Gamma_test_(statistics)) instead, and that they preferred it. I was wondering what the differences between the different rank correlation algorithms were. + +The best I've found was [this answer](http://stats.stackexchange.com/questions/3943/kendall-tau-or-spearmans-rho/3946#3946), which claims Spearman is used in place of usual linear correlations, and that Kendall-Tau is less direct and more closely resembles Goodman-Kruskal Gamma. The data I'm working with doesn't seem to have any obvious linear correlations, and the data is heavily skewed and non-normal. + +Also, Spearman generally reports better correlation than Kendall-Tau for our data, and I was wondering what that says about the data specifically. 
I'm not a statistician, so some of the papers I'm reading on these things just seem like jargon to me, sorry.",,2011-11-09 02:39:58.810 +48196,16313,,4,user88,CC BY-SA 3.0,d4d06604-0256-40e1-b28b-5231f2773d50,How does the Goodman-Kruskal gamma test and the Kendall tau or Spearman rho test compare?,edited title,2011-11-09 07:54:20.863 +48222,16313,,25,,,dd2db29b-245f-454e-a911-f38913dd9a7a,,http://twitter.com/#!/StackStats/status/134228649456041984,2011-11-09 11:19:38.553 +74072,24506,7341.0,2,,CC BY-SA 3.0,3d76502f-1e6d-42d0-b5a7-2ce4d22e779a,"I'm using JMP to analyze some sample data to make predictions about the population. My sample is from a destructive QC test, so I obviously want to minimize my sample. I have a response (my Y) and a known factor (a very strong and consistent correlation that is measurable by non-destructive means) but the exact relationship between them varies from lot to lot (the slope and y offset vary). + +So, in JMP, I am fitting a line and then showing the ""confidence limits for an individual predicted value"" which I believe gives me an indicator of how the population is likely to behave. So I'm using that plot to make disposition decisions. I want to automate this process, perhaps using R, but I'm a total novice at R. I could do the math if I was just dealing with a mean and standard deviation, but I don't know how to do it with a fit line and a known factor. Can someone please give me either the general information on how to get the confidence limits around the line, or else tell me how to do the whole thing in R? + +Thankss much.",,2012-05-04 17:02:04.137 +48230,16337,2081.0,2,,CC BY-SA 3.0,4dab8f73-9a2f-455e-a67a-64698eb4d463,"**Spearman rho vs Kendall tau**. These two are so much computationally different that you *cannot* directly compare their magnitudes. Spearman is usually higher by 1/4 to 1/3 and this makes one incorrectly conclude that Spearman is ""better"" for a particular dataset. The difference between rho and tau is in their ideology, *proportion-of-variance* for rho and *probability* for tau. Rho is a usual Pearson r applied for ranked data, and like r, is more sensitive to points with large moments (that is, deviations from cloud centre) than to points with small moments. Therefore rho is quite sensitive to the shape of the cloud: the coefficient for rhombic cloud will be higher than the coefficient for dumbbelled cloud. Tau is an extension of Gamma and is equally sensitive to all points, so it is less sensitive to peculiarities in shape of the ranked cloud. Tau is more ""general"" than rho, for rho is warranted only when you believe the underlying relationship between the variables is monotonic. Rho is comparable with r in magnitude; tau is not. + +**Kendall tau as Gamma**. Tau is just a standardized form of Gamma. Several related measures all have numerator $P-Q$ but differ in normalizing *denominator*: + + - Gamma: $P+Q$ + - Somer's D(""x dependent""): $P+Q+T_x$ + - Somer's D(""y dependent""): $P+Q+T_y$ + - Somer's D(""symmetric""): arithmetic mean of the above two + - Kendall's Tau-b corr. (most suitable for square tables): geometric mean of those two + - Kendall's Tau-c corr. (most suitable for rectangular tables): $N^2(k-1)/2k$ + - Kendall's Tau-a corr. 
(makes nо adjustment for ties): $N(N-1)/2 = P+Q+T_x+T_y+T_{xy}$ + +where $P$ - number of pairs of observations with ""concordance"", $Q$ - with ""inversion""; $T_x$ - number of ties by variable X, $T_y$ - by variable Y, $T_{xy}$ – by both variables; $N$ - number of observations, $k$ - number of distinct values in that variable where this number is less. +",,2011-11-09 11:41:29.170 +48231,16337,2081.0,5,,CC BY-SA 3.0,06b7aa90-7328-487a-8a0c-bc7f02816ac5,"**Spearman rho vs Kendall tau**. These two are so much computationally different that you *cannot* directly compare their magnitudes. Spearman is usually higher by 1/4 to 1/3 and this makes one incorrectly conclude that Spearman is ""better"" for a particular dataset. The difference between rho and tau is in their ideology, *proportion-of-variance* for rho and *probability* for tau. Rho is a usual Pearson r applied for ranked data, and like r, is more sensitive to points with large moments (that is, deviations from cloud centre) than to points with small moments. Therefore rho is quite sensitive to the shape of the cloud (upon ranling data): the coefficient for rhombic cloud will be higher than the coefficient for dumbbelled cloud. Tau is an extension of Gamma and is equally sensitive to all points, so it is less sensitive to peculiarities in shape of the ranked cloud. Tau is more ""general"" than rho, for rho is warranted only when you believe the underlying relationship between the variables is monotonic. Rho is comparable with r in magnitude; tau is not. + +**Kendall tau as Gamma**. Tau is just a standardized form of Gamma. Several related measures all have numerator $P-Q$ but differ in normalizing *denominator*: + + - Gamma: $P+Q$ + - Somer's D(""x dependent""): $P+Q+T_x$ + - Somer's D(""y dependent""): $P+Q+T_y$ + - Somer's D(""symmetric""): arithmetic mean of the above two + - Kendall's Tau-b corr. (most suitable for square tables): geometric mean of those two + - Kendall's Tau-c corr. (most suitable for rectangular tables): $N^2(k-1)/2k$ + - Kendall's Tau-a corr. (makes nо adjustment for ties): $N(N-1)/2 = P+Q+T_x+T_y+T_{xy}$ + +where $P$ - number of pairs of observations with ""concordance"", $Q$ - with ""inversion""; $T_x$ - number of ties by variable X, $T_y$ - by variable Y, $T_{xy}$ – by both variables; $N$ - number of observations, $k$ - number of distinct values in that variable where this number is less. +",added 20 characters in body,2011-11-09 11:56:10.417 +48232,16337,2081.0,5,,CC BY-SA 3.0,8988fdf2-6004-4296-8897-c482a39e4aec,"**Spearman rho vs Kendall tau**. These two are so much computationally different that you *cannot* directly compare their magnitudes. Spearman is usually higher by 1/4 to 1/3 and this makes one incorrectly conclude that Spearman is ""better"" for a particular dataset. The difference between rho and tau is in their ideology, *proportion-of-variance* for rho and *probability* for tau. Rho is a usual Pearson r applied for ranked data, and like r, is more sensitive to points with large moments (that is, deviations from cloud centre) than to points with small moments. Therefore rho is quite sensitive to the shape of the cloud: the coefficient for rhombic cloud will be higher than the coefficient for dumbbelled cloud. Tau is an extension of Gamma and is equally sensitive to all points, so it is less sensitive to peculiarities in shape of the ranked cloud. Tau is more ""general"" than rho, for rho is warranted only when you believe the underlying relationship between the variables is monotonic. 
Rho is comparable with r in magnitude; tau is not. + +**Kendall tau as Gamma**. Tau is just a standardized form of Gamma. Several related measures all have numerator $P-Q$ but differ in normalizing *denominator*: + + - Gamma: $P+Q$ + - Somer's D(""x dependent""): $P+Q+T_x$ + - Somer's D(""y dependent""): $P+Q+T_y$ + - Somer's D(""symmetric""): arithmetic mean of the above two + - Kendall's Tau-b corr. (most suitable for square tables): geometric mean of those two + - Kendall's Tau-c corr. (most suitable for rectangular tables): $N^2(k-1)/2k$ + - Kendall's Tau-a corr. (makes nо adjustment for ties): $N(N-1)/2 = P+Q+T_x+T_y+T_{xy}$ + +where $P$ - number of pairs of observations with ""concordance"", $Q$ - with ""inversion""; $T_x$ - number of ties by variable X, $T_y$ - by variable Y, $T_{xy}$ – by both variables; $N$ - number of observations, $k$ - number of distinct values in that variable where this number is less. +",edited body,2011-11-09 12:06:16.160 +48338,16366,5249.0,3,,CC BY-SA 3.0,2d928bb0-4a91-4fe9-bd6b-3f44175f6ec7,,,2011-11-10 03:08:12.977 +48336,16366,5249.0,2,,CC BY-SA 3.0,2d928bb0-4a91-4fe9-bd6b-3f44175f6ec7,"I'm confused about how to calculate the perplexity of a holdout sample when doing Latent Dirichlet Allocation (LDA). The papers on the topic breeze over it, making me think I'm missing something obvious.. So.. + +Perplexity is seen as a good measure of performance for LDA. The idea is that you keep a holdout sample, train your LDA on the rest of the data, then calculate the perplexity of the holdout. + +The perplexity could be given by the forumla: + +$per(D_{test})=exp\{-\frac{\sum_{d=1}^{M}\log p(\mathbb{w}_d)}{\sum_{d=1}^{M}N_d}\} $ + +(taken from [Image retrieval on large-scale image databases, Horster et al][2]) +Here $M$ is the number of documents (in the test sample, presumably), $\mathbb{w}_d$ represents the words in document $d$, $N_d$ the number of words in document $d$. + +It is not clear to me how to sensibly calcluate $p(\mathbb{w}_d)$, since we don't have topic mixtures for the held out documents. Ideally, we would integrate over the Dirichlet prior for all possible topic mixtures and use the topic multinomials we learned. Calculating this integral doesn't seem an easy task however. + +Alternatively, we could attempt to learn an optimal topic mixture for each held out document (given our learned topics) and use this to calculate the perplexity. This would be doable, however it's not as trivial as papers such as Horter et al and Blei et al seem to suggest, and it's not immediately clear to me that the result will be equivalent to the ideal case above. + + + + + + [1]: https://i.stack.imgur.com/yWP97.jpg + [2]: http://doi.acm.org/10.1145/1282280.1282283",,2011-11-10 03:08:12.977 +48337,16366,5249.0,1,,CC BY-SA 3.0,2d928bb0-4a91-4fe9-bd6b-3f44175f6ec7,How to calculate perplexity of a holdout with LDA,,2011-11-10 03:08:12.977 +48346,16366,2081.0,4,,CC BY-SA 3.0,0c8204ea-50fa-4151-8add-d2897f059886,How to calculate perplexity of a holdout with Latent Dirichlet Allocation,edited title,2011-11-10 06:24:54.233 +54652,18345,5448.0,2,,CC BY-SA 3.0,101d508e-840d-48b2-92b9-c6d503b7121a,"The LR (likelihood ratio) test actually is testing the hypothesis that a specified subset of the parameters equal some pre-specified values. In the case of model selection, generally (but not always) that means some of the parameters equal zero. 
If the models are nested, the parameters in the larger model that are not in the smaller model are the ones being tested, with values specified implicitly by their exclusion from the smaller model. If the models aren't nested, you aren't testing this any more, because BOTH models have parameters that aren't in the other model, so the LR test statistic doesn't have the asymptotic $\chi^2$ distribution that it (usually) does in the nested case. + +AIC, on the other hand, is not used for formal testing. It is used for informal comparisons of models with differing numbers of parameters. The penalty term in the expression for AIC is what allows this comparison. But no assumptions are made about the functional form of the asymptotic distribution of the differences between the AIC of two non-nested models when doing the model comparison, and the difference between two AICs is not treated as a test statistic.",,2012-01-01 17:41:56.327 +74283,24506,,4,user88,CC BY-SA 3.0,216d616c-d992-426c-a52a-bac828ae0063,Confidence interval for values for a fitted line,edited title,2012-05-05 20:47:03.430 +181537,55722,15827.0,4,,CC BY-SA 3.0,2c950a00-ef5e-4b19-b2c0-aaa310f0a991,Looking for a good and complete probability and statistics book,small fixes to English,2013-09-25 11:17:31.700 +48372,16366,674.0,5,,CC BY-SA 3.0,86ea3e53-d91e-492f-bd85-6c9f9e3cf955,"I'm confused about how to calculate the perplexity of a holdout sample when doing Latent Dirichlet Allocation (LDA). The papers on the topic breeze over it, making me think I'm missing something obvious... + +Perplexity is seen as a good measure of performance for LDA. The idea is that you keep a holdout sample, train your LDA on the rest of the data, then calculate the perplexity of the holdout. + +The perplexity could be given by the formula: + +$per(D_{test})=exp\{-\frac{\sum_{d=1}^{M}\log p(\mathbb{w}_d)}{\sum_{d=1}^{M}N_d}\} $ + +(Taken from [Image retrieval on large-scale image databases, Horster et al][2].) + +Here $M$ is the number of documents (in the test sample, presumably), $\mathbb{w}_d$ represents the words in document $d$, $N_d$ the number of words in document $d$. + +It is not clear to me how to sensibly calcluate $p(\mathbb{w}_d)$, since we don't have topic mixtures for the held out documents. Ideally, we would integrate over the Dirichlet prior for all possible topic mixtures and use the topic multinomials we learned. Calculating this integral doesn't seem an easy task however. + +Alternatively, we could attempt to learn an optimal topic mixture for each held out document (given our learned topics) and use this to calculate the perplexity. This would be doable, however it's not as trivial as papers such as Horter et al and Blei et al seem to suggest, and it's not immediately clear to me that the result will be equivalent to the ideal case above. + + + + + + [1]: https://i.stack.imgur.com/yWP97.jpg + [2]: http://doi.acm.org/10.1145/1282280.1282283",edited title,2011-11-10 08:29:44.023 +48373,16366,674.0,4,,CC BY-SA 3.0,86ea3e53-d91e-492f-bd85-6c9f9e3cf955,How to calculate perplexity of a holdout with Latent Dirichlet Allocation?,edited title,2011-11-10 08:29:44.023 +48807,15542,1406.0,5,,CC BY-SA 3.0,6e8e2806-e31a-4065-9d89-28d74b6c48b9,"This is a naive question from someone starting to learn machine learning. I'm reading these days the book ""Machine Learning: An algorithmic perspective"" from Marsland. 
I find it useful as an introductory book, but now I would like to go into advanced algorithms, those that are currently giving the best results. I'm mostly interested in bioinformatics: clustering of biological networks and finding patterns in biological sequences, particularly applied to single nucleotide polymorphism (SNP) analysis. Could you recommend me some reviews or books to read? +",deleted 16 characters in body,2011-11-14 12:28:30.437 +48901,16537,1831.0,2,,CC BY-SA 3.0,8f71eff4-ad42-4cd0-969c-1a8414bd0cd0,"[Deep Learning][1] got a lot of focus since 2006. It's basically an approach to train deep neural networks and is leading to really impressive results on very hard datasets (like document clustering or object recognition). Some people are talking about the second neural network renaissance (eg in [this Google talk][2] by Schmidhuber). + +If you want to be impressed you should look at this Science paper [Reducing the Dimensionality of Data with Neural Networks,][3] Hinton & Salakhutdinov. + +(There is so much work going on right now in that area, that there is only two upcoming books I know about that will treat it: [Large scale machine learning][4], Langford et al and [Machine Learning: a probabilistic perspective][5] by Kevin Murphy.) + +If you want to know more, check out what the main deep learning groups are doing: [Stanford][6], [Montreal][7] and most importantly [Toronto #1][8] and [Toronto #2][9]. + + + [1]: http://deeplearning.net/ + [2]: http://www.youtube.com/watch?v=rkCNbi26Hds + [3]: http://www.mit.edu/~rsalakhu/papers/science.pdf + [4]: http://www.cambridge.org/aus/catalogue/catalogue.asp?isbn=9780521192248 + [5]: http://www.cs.ubc.ca/~murphyk/MLbook/index.html + [6]: http://www.cs.stanford.edu/people/ang/ + [7]: http://www.iro.umontreal.ca/~bengioy/yoshua_en/index.html + [8]: http://www.cs.toronto.edu/~hinton/ + [9]: http://www.utstat.toronto.edu/~rsalakhu/",,2011-11-14 20:26:28.373 +50409,16998,5479.0,1,,CC BY-SA 3.0,504e401b-ec61-46a6-abcf-1c7bb1558bc1,Large Text Corpus,,2011-11-24 21:22:19.287 +50407,16998,5479.0,3,,CC BY-SA 3.0,504e401b-ec61-46a6-abcf-1c7bb1558bc1,,,2011-11-24 21:22:19.287 +50408,16998,5479.0,2,,CC BY-SA 3.0,504e401b-ec61-46a6-abcf-1c7bb1558bc1,"I am looking for large (>1000) text corpus to download. Preferably with **world news** or some kind of **reports**. I have only found one with patents. Any suggestions? +Thanks in advance.",,2011-11-24 21:22:19.287 +50413,17000,65.0,2,,CC BY-SA 3.0,0a6dbb03-dc99-4e2f-866c-7d4ec6c57dcc,Do not the Wikileaks texts suit you?,,2011-11-24 21:48:24.163 +50417,16998,,4,user88,CC BY-SA 3.0,01530515-c877-4b58-9c85-1ae101686dbf,Where to find a large text corpus?,added 2 characters in body; edited tags; edited title,2011-11-24 22:07:16.037 +50415,16998,,6,user88,CC BY-SA 3.0,01530515-c877-4b58-9c85-1ae101686dbf,,added 2 characters in body; edited tags; edited title,2011-11-24 22:07:16.037 +50416,16998,,5,user88,CC BY-SA 3.0,01530515-c877-4b58-9c85-1ae101686dbf,"I am looking for large (>1000) text corpus to download. Preferably with **world news** or some kind of **reports**. I have only found one with patents. Any suggestions? 
+ +Thanks in advance.",added 2 characters in body; edited tags; edited title,2011-11-24 22:07:16.037 +50440,16998,,25,,,5b68229b-b463-43f3-978c-ca093f894619,,http://twitter.com/#!/StackStats/status/139891353785077760,2011-11-25 02:21:12.780 +50539,16998,668.0,16,,,36fa3f61-2e7c-4e71-879f-c7ddba16c861,,,2011-11-25 15:52:54.157 +50540,17000,668.0,16,,,07569919-7a93-4df4-9c3d-7a3460f88c84,,,2011-11-25 15:52:54.157 +52708,4705,1209.0,8,,CC BY-SA 3.0,46ee557c-6802-44e1-952b-383358c4564e,What are the most important statisticians and what is it that made them famous? (Reply just one scientist per answer please),Rollback to [0998ef5d-278a-43dd-bada-15885d12473c],2011-12-13 10:15:04.773 +52709,4705,1209.0,9,,CC BY-SA 3.0,46ee557c-6802-44e1-952b-383358c4564e,,Rollback to [0998ef5d-278a-43dd-bada-15885d12473c],2011-12-13 10:15:04.773 +54620,18335,5038.0,3,,CC BY-SA 3.0,c7afb6f4-5687-4296-a138-723d0b135298,,,2012-01-01 12:15:03.760 +54618,18335,5038.0,2,,CC BY-SA 3.0,c7afb6f4-5687-4296-a138-723d0b135298,"Both the likelihood ratio test and the AIC are tools for choosing between two models and both are based on the log-likelihood. + +But, why the likelihood ratio test can't be used to choose between two non-nested models while AIC can?",,2012-01-01 12:15:03.760 +54619,18335,5038.0,1,,CC BY-SA 3.0,c7afb6f4-5687-4296-a138-723d0b135298,Non-nested model selection,,2012-01-01 12:15:03.760 +61671,20667,5911.0,2,,CC BY-SA 3.0,f352f93e-00b4-4f87-8218-2c840a03ce9e,"Is there any datasets of 2 dimensional datapoints (each datapoint is a vector of two values (x,y)) following different distributions and forms ? Or is there any codes to generate such a datapoints ? I want to use them to plot/visualise how some clustering algorithms performs on them. Here are some examples: + + - http://www.cise.ufl.edu/~jmishra/clustering/ClusterImages/KMeans4.jpg + - http://www.aishack.in/wp-content/uploads/2010/07/kmeans-example.jpg + - http://www.ti.uni-bielefeld.de/html/research/ngpca/spiral.1.png + - http://courses.ee.sun.ac.za/Pattern_Recognition_813/lectures/6_em/img19.png + - http://3.bp.blogspot.com/_k1D0z3ucw7o/SITL8Et0QAI/AAAAAAAADSw/Bp_N8c9i5SE/s320/Sigma0.25.png + - http://www.newfolderconsulting.com/prtdoc/prtDocDataGen_01.png + - ... etc",,2012-02-16 21:14:21.930 +54677,18345,5448.0,5,,CC BY-SA 3.0,bd59794b-5fc9-4117-ae72-3fbc438ad24e,"The LR (likelihood ratio) test actually is testing the hypothesis that a specified subset of the parameters equal some pre-specified values. In the case of model selection, generally (but not always) that means some of the parameters equal zero. If the models are nested, the parameters in the larger model that are not in the smaller model are the ones being tested, with values specified implicitly by their exclusion from the smaller model. If the models aren't nested, you aren't testing this any more, because BOTH models have parameters that aren't in the other model, so the LR test statistic doesn't have the asymptotic $\chi^2$ distribution that it (usually) does in the nested case. + +AIC, on the other hand, is not used for formal testing. It is used for informal comparisons of models with differing numbers of parameters. The penalty term in the expression for AIC is what allows this comparison. But no assumptions are made about the functional form of the asymptotic distribution of the differences between the AIC of two non-nested models when doing the model comparison, and the difference between two AICs is not treated as a test statistic. 
+ +I'll add that there is some disagreement over the use of AIC with non-nested models, as the theory is worked out for nested models. Hence my emphasis on ""not...formal"" and ""not...test statistic."" I use it for non-nested models, but not in a hard-and-fast way, more as an important, but not the sole, input into the model building process.",added 344 characters in body,2012-01-01 23:02:09.460 +54688,18335,,25,,,b0f27a0f-b1b7-4ed5-bab8-234df6a0a12e,,http://twitter.com/#!/StackStats/status/153663247508193280,2012-01-02 02:25:48.097 +55317,4705,668.0,6,,CC BY-SA 3.0,d229089b-e579-4c0a-bc11-80e7ef91842e,,edited tags,2012-01-07 01:53:14.437 +55832,10008,668.0,38,,,72919f37-cbd1-4a46-b360-3b0324fc2554,"[{""Id"":919,""DisplayName"":""whuber""}]",from http://stats.stackexchange.com/questions/20862/covariate-present-in-a-logistic-regression-model-as-a-effect-modifier-but-not-a,2012-01-10 14:02:57.437 +60422,20234,12900.0,2,vzn,CC BY-SA 3.0,36b42512-b757-4ba4-a493-04059dcd742f,"hi all recently there was a ML-like question over on cstheory stackexchange & I posted an answer recommending powells method, gradient descent or genetic algorithms or other ""approximation algorithms"". in a comment someone told me these methods were ""heuristics"" and _not_ ""approximation algorithms"" and frequently did not come close to the theoretical optimum (because they ""frequently get stuck in local minima""). + +my question, do others agree with that? also it seems to me there is a sense in which heuristic algorithms can be guaranteed to come close to theoretical optimums if they are set up to explore a large part of the search space (eg setting parameters/step sizes small), although I havent seen that in a paper. does anyone know if this has been shown or proven in a paper somewhere? (if not for a large class of algorithms maybe for a small class say NNs etc)",,2012-02-10 19:03:03.517 +60423,20234,12900.0,3,vzn,CC BY-SA 3.0,36b42512-b757-4ba4-a493-04059dcd742f,,,2012-02-10 19:03:03.517 +60421,20234,12900.0,1,vzn,CC BY-SA 3.0,36b42512-b757-4ba4-a493-04059dcd742f,"are machine learning techniques ""approximation algorithms""?",,2012-02-10 19:03:03.517 +60441,20240,1073.0,2,,CC BY-SA 3.0,d7ce0bc2-0c64-459b-97b1-6ac4e2d77d1a,"I think you're mixing multiple important concepts. Let me try to clarify a couple of things: + + - There are metaheuristic methods, which are methods that iteratively try to improve a candidate solution. Examples of this are tabu search, simulated annealing, genetic algorithms, etc. Observe that while there can be many cases where these methods work nicely, there isn't any deep understanding of when these methods work and when they don't. And more importantly when they don't get to the solution, we can be arbitrarily far from it. Problems solved by metaheuristic methods tend to be discrete in nature, because there are far better tools to handle continuous problems. But every now and then you see metaheuristics for continuous problems, too. + + - There are numerical optimization methods, people in this community carefully the nature of the function that is to be optimized and the restrictions of the solution (into groups like convex optimization, quadratic programming, linear programming etc) and apply algorithms that have been shown to work for that type of function, and that type of restrictions. When people in this area say ""shown to work"" they mean a proof. The situation is that these types of methods work in continuous problems. 
But when your problem falls in this category, this is definitely the tool to use. + + - There are discrete optimization methods, which tend to be things that in nature are connected to algorithms to well studied discrete problems: such as shortest paths, max flow, etc. People in this area also care that their algorithms really work (proofs). There are a subset of people in this group that study really hard problems for which no fast algorithm is expected to exist. They then study approximation algorithms, which are fast algorithms for which they are able to show that their solution is within a constant factor of the true optimum. This is called ""approximation algorithms"". These people also show their results as proofs. + +So... to answer your question, I do not think that metaheuristics are approximation algorithms. It doesn't seem to me as something connected to opinion, it is just fact. + + + +",,2012-02-10 21:45:51.503 +61390,20561,786.0,1,,CC BY-SA 3.0,abd789e5-91d5-4535-beaf-e8f6f3929d78,How to deal with Gaps/NaNs in time series data when using Matlab for autocorrelation and neural networks?,,2012-02-15 19:25:44.330 +61389,20561,786.0,2,,CC BY-SA 3.0,abd789e5-91d5-4535-beaf-e8f6f3929d78,"I have a time series of measurements (heights). In the observation period, the measurement process went down for some time points. So the resulting data is a vector with NaNs where there were gaps in the data. Using MATLAB, this is causing me a problem when computing the autocorrelation (`autocorr`) and applying neural networks (`nnstart`). + +How should these Gaps/NaNs be dealt with? Should I just remove these from the vector? Or replace their entry with an interpolated value? (if so how in MATLAB)",,2012-02-15 19:25:44.330 +61388,20561,786.0,3,,CC BY-SA 3.0,abd789e5-91d5-4535-beaf-e8f6f3929d78,,,2012-02-15 19:25:44.330 +61396,20561,,4,user88,CC BY-SA 3.0,a2b288f3-30a7-4464-9d8f-e344352b7a44,How to deal with gaps/NaNs in time series data when using Matlab for autocorrelation and neural networks?,edited title,2012-02-15 19:44:22.593 +61431,20561,786.0,5,,CC BY-SA 3.0,5125c547-a0bd-4175-8914-95746e330ebe,"I have a time series of measurements (heights-one dimensional series). In the observation period, the measurement process went down for some time points. So the resulting data is a vector with NaNs where there were gaps in the data. Using MATLAB, this is causing me a problem when computing the autocorrelation (`autocorr`) and applying neural networks (`nnstart`). + +How should these Gaps/NaNs be dealt with? Should I just remove these from the vector? Or replace their entry with an interpolated value? (if so how in MATLAB)",clarification,2012-02-15 23:19:08.517 +61672,20667,5911.0,1,,CC BY-SA 3.0,f352f93e-00b4-4f87-8218-2c840a03ce9e,"2D artificial data of different distributions and forms, existing datasets or generation code",,2012-02-16 21:14:21.930 +61673,20667,5911.0,3,,CC BY-SA 3.0,f352f93e-00b4-4f87-8218-2c840a03ce9e,,,2012-02-16 21:14:21.930 +68714,22797,7769.0,2,,CC BY-SA 3.0,5510d113-64ee-4412-a1a2-b5da2133932f,"For a simple 2 variables (say X and Y) cointegration test, how does it affect our analysis, if we perform regression on X and Y with and without the intercept, and then test the spread for stationarity. 
+ +I am doing this analysis for stocks.",,2012-04-02 07:39:49.013 +68716,22797,7769.0,3,,CC BY-SA 3.0,5510d113-64ee-4412-a1a2-b5da2133932f,,,2012-04-02 07:39:49.013 +68715,22797,7769.0,1,,CC BY-SA 3.0,5510d113-64ee-4412-a1a2-b5da2133932f,"Cointegration:- Regression with and without intercepts, and testing spreads for stationarity",,2012-04-02 07:39:49.013 +61824,28,,5,,CC BY-SA 3.0,0660907c-db69-4602-a1dd-036eae33de32,"Last year, I read a blog post from [Bendan O'Connor][1] entitled [""Statistics vs. Machine Learning, fight!""][2] that discussed some of the differences between the two fields. [Andrew Gelman responded to favorably to this][3]: + +Simon Blomberg: +> From R's fortunes +> package: To paraphrase provocatively, +> 'machine learning is statistics minus +> any checking of models and +> assumptions'. +> -- Brian D. Ripley (about the difference between machine learning +> and statistics) useR! 2004, Vienna +> (May 2004) :-) Season's Greetings! + +Andrew Gelman: + +> In that case, maybe we should get rid +> of checking of models and assumptions +> more often. Then maybe we'd be able to +> solve some of the problems that the +> machine learning people can solve but +> we can't! + +There was also the [**""Statistical Modeling: The Two Cultures""** paper][4] by Leo Breiman in 2001 which argued that Statisticians rely too heavily on data modeling, and that machine learning techniques are making progress by instead relying on the *predictive accuracy* of models. + +Has the Statistics field changed over the last decade in response to these critiques? Do the *two cultures* still exist or has Statistics grown to embrace machine learning techniques such as neural networks and support vector machines? + + + [1]: http://anyall.org/ + [2]: http://anyall.org/blog/2008/12/statistics-vs-machine-learning-fight/ + [3]: http://andrewgelman.com/2008/12/machine_learnin/ + [4]: http://www.stat.osu.edu/~bli/dmsl/papers/Breiman.pdf",gelman's blog moved and the old redirects seem broken; insert new url,2012-02-17 15:49:06.140 +62108,20667,0.0,36,,,60b0d735-4c7a-489f-9339-c31be0d24988,,from http://stackoverflow.com/questions/9319168/2d-artificial-data-of-different-distributions-and-forms-existing-datasets-or-ge,2012-02-20 07:47:45.927 +62127,20667,674.0,6,,CC BY-SA 3.0,f8736e43-ac8b-4d9b-8ab5-eb55a18aa69a,,edited tags,2012-02-20 09:06:30.837 +62361,20667,221.0,5,,CC BY-SA 3.0,620cdcf1-8998-46dc-aaea-e6344367280e,"I am looking for datasets of 2 dimensional datapoints (each datapoint is a vector of two values (x,y)) following different distributions and forms ? Or is there any codes to generate such a datapoints ? I want to use them to plot/visualise how some clustering algorithms performs on them. Here are some examples: + + - [star like cloud data][1] + - [four clusters, one easy seperable][2] + - [a spiral (no cluster)][3] + - [a ring][4] + - [two barely seperated clouds][5] + - [two parallel clusters forming a spiral][6] + - ... 
etc + + + [1]: http://www.cise.ufl.edu/~jmishra/clustering/ClusterImages/KMeans4.jpg + [2]: http://www.aishack.in/wp-content/uploads/2010/07/kmeans-example.jpg + [3]: http://www.ti.uni-bielefeld.de/html/research/ngpca/spiral.1.png + [4]: http://%20http://courses.ee.sun.ac.za/Pattern_Recognition_813/lectures/6_em/img19.png + [5]: http://3.bp.blogspot.com/_k1D0z3ucw7o/SITL8Et0QAI/AAAAAAAADSw/Bp_N8c9i5SE/s320/Sigma0.25.png + [6]: http://www.newfolderconsulting.com/prtdoc/prtDocDataGen_01.png",changed title to accomodate for the application to cluster analysis; changed bare links to SE-provided description-link-style,2012-02-21 12:47:33.070 +62362,20667,221.0,4,,CC BY-SA 3.0,620cdcf1-8998-46dc-aaea-e6344367280e,Looking for 2D artificial data to demonstrate properties of clustering algorithms,changed title to accomodate for the application to cluster analysis; changed bare links to SE-provided description-link-style,2012-02-21 12:47:33.070 +62370,20667,,16,user88,,3a92734b-4ea6-49c5-8d6e-828f5ef4960c,,,2012-02-21 13:06:45.223 +62692,20240,0.0,36,,,66326469-f9aa-4b4d-824e-a686eca4faad,,from http://machinelearning.stackexchange.com/questions/104/are-machine-learning-techniques-approximation-algorithms/105#105,2012-02-22 19:46:04.020 +62693,20234,0.0,36,,,77630c49-3505-4c0c-a665-b4eaed309e80,,from http://machinelearning.stackexchange.com/questions/104/are-machine-learning-techniques-approximation-algorithms,2012-02-22 19:46:04.020 +62861,20234,,4,user88,CC BY-SA 3.0,ed8a9760-3a2a-4fa4-a9ef-7b93b58e501c,"Are machine learning techniques ""approximation algorithms""?",edited title,2012-02-22 23:58:08.800 +62931,20234,,25,,,ca22530d-6b53-4fa8-bd52-7b31ffd6ee48,,http://twitter.com/#!/StackStats/status/172646050895638528,2012-02-23 11:36:40.773 +63076,20234,,6,,CC BY-SA 3.0,14ebc707-d316-463d-b232-01b8744d6045,,edited tags,2012-02-24 09:22:17.303 +67006,28,568.0,5,,CC BY-SA 3.0,0f0a7774-d8d2-4f92-b536-c32f261ebc21,"Last year, I read a blog post from [Bendan O'Connor][1] entitled [""Statistics vs. Machine Learning, fight!""][2] that discussed some of the differences between the two fields. [Andrew Gelman responded to favorably to this][3]: + +Simon Blomberg: +> From R's fortunes +> package: To paraphrase provocatively, +> 'machine learning is statistics minus +> any checking of models and +> assumptions'. +> -- Brian D. Ripley (about the difference between machine learning +> and statistics) useR! 2004, Vienna +> (May 2004) :-) Season's Greetings! + +Andrew Gelman: + +> In that case, maybe we should get rid +> of checking of models and assumptions +> more often. Then maybe we'd be able to +> solve some of the problems that the +> machine learning people can solve but +> we can't! + +There was also the [**""Statistical Modeling: The Two Cultures""** paper][4] by Leo Breiman in 2001 which argued that Statisticians rely too heavily on data modeling, and that machine learning techniques are making progress by instead relying on the *predictive accuracy* of models. + +Has the Statistics field changed over the last decade in response to these critiques? Do the *two cultures* still exist or has Statistics grown to embrace machine learning techniques such as neural networks and support vector machines? 
+ + + [1]: http://anyall.org/ + [2]: http://anyall.org/blog/2008/12/statistics-vs-machine-learning-fight/ + [3]: http://andrewgelman.com/2008/12/machine_learnin/ + [4]: http://projecteuclid.org/euclid.ss/1009213726",fixed broken link,2012-03-22 14:45:13.073 +68281,22674,7714.0,3,,CC BY-SA 3.0,e6199626-9024-4a98-8136-48c0a4e24c12,,,2012-03-30 03:51:41.627 +68282,22674,7714.0,1,,CC BY-SA 3.0,e6199626-9024-4a98-8136-48c0a4e24c12,Belief Propagation for MRF with complex cliques,,2012-03-30 03:51:41.627 +68283,22674,7714.0,2,,CC BY-SA 3.0,e6199626-9024-4a98-8136-48c0a4e24c12,"Is there a belief propagation algorithm for exact inference on a MRF with complex clique structures (i.e. ones involving more than 2 neighbours)? + +For MRF's with cliques that only involve pairwise interaction, you could just search out far enough and cluster to form an acyclic graph and run the usual BP. With more complex cliques, this seems impossible to me as clustering might involve cutting through a clique with multiple members on either side. Is there a workaround for this? Perhaps some clever conditioning arguments?",,2012-03-30 03:51:41.627 +68284,22674,7714.0,4,,CC BY-SA 3.0,514f4d04-808d-4cf8-8392-2494309ec6f2,Belief Propagation on MRF with complex cliques,edited title,2012-03-30 03:59:07.250 +68314,22674,,4,user88,CC BY-SA 3.0,64237466-458f-4543-8422-abbd0306dfd0,Belief propagation on MRF with complex cliques,edited title,2012-03-30 09:56:54.120 +68343,4187,,6,,CC BY-SA 3.0,48123a9a-35c5-4f90-8bf4-a17c096bed74,,tagging with big-list,2012-03-30 13:30:25.143 +184201,56859,155.0,6,,CC BY-SA 3.0,c9516594-9ccd-488f-8fc3-414b330e5282,,edited tags; edited title,2013-10-05 05:52:04.663 +69401,23019,7739.0,2,,CC BY-SA 3.0,c2a12930-342d-44b6-a3f2-64ae387ff8ac,"Essentially, I have two collinear variables which could be seen as either random or as fixed effects, a variable I'm trying to predict, and a variable that's assuredly a random effect. + +__Dependent var:__ Number of neuron spikes (`FiringRate`) in a specific region of mousebrain + +__Fixed effects:__ + +__1)__ `Time` at which data sample was taken (on a linear scale in days -- so day two would be 2, day 5 would be 5, and so on) + +__2)__ The `Age` of the mouse in days (so there's definitely collinearity between this and the `Time` variable, but there are enough mice of different ages to make this worthwhile as a separate variable) + +__Random effect:__ `Subject` -- ""Name"" (ID number) of the mouse + +Essentially, I'm wondering if it would be appropriate to run two LMEs. In the first, I'd treat `Age` and `Subject` as random variables in order to control for the effects of `Age` and see if Time is a significant predictor of the # of spikes (dependent variable). In the second, I'd enter `Time` and `Subject` as random variables to see if `Age` was a significant predictor. 
+ + library(lme4) + a = lmer(FiringRate ~ Time + (1|Age) + (1|Subject)) + b = lmer(FiringRate ~ Age + (1|Time) + (1|Subject)) + ",,2012-04-05 23:08:58.800 +69399,23019,7739.0,3,,CC BY-SA 3.0,c2a12930-342d-44b6-a3f2-64ae387ff8ac,,,2012-04-05 23:08:58.800 +69400,23019,7739.0,1,,CC BY-SA 3.0,c2a12930-342d-44b6-a3f2-64ae387ff8ac,"R: How to ""control"" for another variable in Linear Mixed Effects Regression model?",,2012-04-05 23:08:58.800 +69402,23019,7739.0,5,,CC BY-SA 3.0,1e3333e1-8c49-41cd-a463-f7500c498c11,"Essentially, I have two collinear variables which could be seen as either random or as fixed effects, a dependent variable I'm fitting the model to, and a variable that's assuredly a random effect. + +__Dependent var:__ Number of neuron spikes (`FiringRate`) in a specific region of mousebrain + +__Fixed effects:__ + +__1)__ `Time` at which data sample was taken (on a linear scale in days -- so day two would be 2, day 5 would be 5, and so on) + +__2)__ The `Age` of the mouse in days (so there's definitely collinearity between this and the `Time` variable, but there are enough mice of different ages to make this worthwhile as a separate variable) + +__Random effect:__ `Subject` -- ""Name"" (ID number) of the mouse + +Essentially, I'm wondering if it would be appropriate to run two LMEs. In the first, I'd treat `Age` and `Subject` as random variables in order to control for the effects of `Age` and see if Time is a significant predictor of the # of spikes (dependent variable). In the second, I'd enter `Time` and `Subject` as random variables to see if `Age` was a significant predictor. + + library(lme4) + a = lmer(FiringRate ~ Time + (1|Age) + (1|Subject)) + b = lmer(FiringRate ~ Age + (1|Time) + (1|Subject)) + ",added 13 characters in body,2012-04-05 23:15:21.473 +69413,23019,7739.0,5,,CC BY-SA 3.0,f45fc943-f923-46a8-98be-ac2dc340c533,"Essentially, I have two collinear variables which could be seen as either random or as fixed effects, a dependent variable I'm fitting the model to, and a variable that's assuredly a random effect. + +__Dependent var:__ Number of neuron spikes (`FiringRate`) in a specific region of mousebrain + +__Fixed effects:__ + +__1)__ `Time` at which data sample was taken (on a linear scale in days -- so day two would be 2, day 5 would be 5, and so on) + +__2)__ The `Age` of the mouse in days (so there's definitely collinearity between this and the `Time` variable, but there are enough mice of different ages to make this worthwhile as a separate variable) + +__Random effect:__ `Subject` -- ""Name"" (ID number) of the mouse + +Essentially, I'm wondering if it would be appropriate to run two LMEs. In the first, I'd treat `Age` and `Subject` as random variables in order to control for the effects of `Age` (and thus the collinearity between `Age` and `Time`) and see if Time is a significant predictor of the # of spikes (dependent variable). In the second, I'd enter `Time` and `Subject` as random variables to see if `Age` was a significant predictor. 
+ + library(lme4) + a = lmer(FiringRate ~ Time + (1|Age) + (1|Subject)) + b = lmer(FiringRate ~ Age + (1|Time) + (1|Subject)) + ",added 53 characters in body,2012-04-06 01:52:06.693 +69601,23019,,25,,,6d9c677e-c38f-4861-8712-a0a9fef76ce8,,http://twitter.com/#!/StackStats/status/188501092475207682,2012-04-07 05:38:57.223 +69629,23087,5643.0,1,,CC BY-SA 3.0,c97fe292-fd58-4503-af06-80b1895e9644,Moving-average model error terms,,2012-04-07 12:48:41.467 +69630,23087,5643.0,3,,CC BY-SA 3.0,c97fe292-fd58-4503-af06-80b1895e9644,,,2012-04-07 12:48:41.467 +69628,23087,5643.0,2,,CC BY-SA 3.0,c97fe292-fd58-4503-af06-80b1895e9644,"This is a basic question on Box-Jenkins MA models. As I understand it, an MA model is basically a linear regression of time-series values $Y$ against previous error terms $e_t,..., e_{t-n}$. That is, the observation $Y$ is first regressed against its previous values $Y_{t-1}, ..., Y_{t-n}$ and then one or more $Y - \hat{Y}$ values are used as the error terms for the MA model. + +But how are the error terms calculated in an ARIMA(0, 0, 2) model? If the MA model is used without an autoregressive part and thus no estimated value, how can I possibly have an error term?",,2012-04-07 12:48:41.467 +73060,6788,1790.0,5,,CC BY-SA 3.0,a8fcd97d-005b-4620-af03-99dc676537af,"I recently stumbled upon the concept of [**sample complexity**][1], and was wondering if there are any texts, papers or tutorials that provide: + + 1. An introduction to the concept + 2. An analysis of the sample complexity of established and popular classification methods or kernel methods. + 3. Advice or information on how to measure it in practice. + +Any help with the topic would be greatly appreciated. + + + [1]: http://www.google.com/search?q=%22sample%20complexity%22",deleted 4 characters in body,2012-04-29 17:50:04.333 +73597,6788,1790.0,5,,CC BY-SA 3.0,ff29749c-cfbd-48f2-ba62-6789ec0a24c0,"I recently stumbled upon the concept of [**sample complexity**][1], and was wondering if there are any texts, papers or tutorials that provide: + + 1. An introduction to the concept (rigorous or informal) + 2. An analysis of the sample complexity of established and popular classification methods or kernel methods. + 3. Advice or information on how to measure it in practice. + +Any help with the topic would be greatly appreciated. + + + [1]: http://www.google.com/search?q=%22sample%20complexity%22",added 23 characters in body,2012-05-02 14:19:06.373 +74073,24506,7341.0,1,,CC BY-SA 3.0,3d76502f-1e6d-42d0-b5a7-2ce4d22e779a,Confidence Interval for Values for a Fitted Line,,2012-05-04 17:02:04.137 +74074,24506,7341.0,3,,CC BY-SA 3.0,3d76502f-1e6d-42d0-b5a7-2ce4d22e779a,,,2012-05-04 17:02:04.137 +95592,30864,4890.0,2,,CC BY-SA 3.0,138b717d-80e6-415f-afe9-8938d2a1c11c,"A low rank approximation $\hat{X}$ of $X$ can be decomposed into a matrix square root as $G=U_{r}\lambda_{r}^\frac{1}{2}$ where the eigen decomposition of $X$ is $U\lambda U^T$, thereby reducing the number of features, which can be represented by $G$ based on the rank-r approximation as $\hat{X}=GG^T$. Note that the subscript $r$ represents the number of eigen-vectors and eigen-values used in the approximation. Hence, it does reduce the number of features to represent the data. In some examples low-rank approximations are considered as basis or latent variable(dictionary) based expansions of the original data, under special constraints like orthogonality, non-negativity(non-negative matrix factorization) etc. 
",,2012-08-28 00:53:18.257 +74371,24602,3662.0,2,,CC BY-SA 3.0,1d2f615d-41f5-427e-afe1-76f1ce974595,"If your using linear regression I would recommend using [the rms package](http://biostat.mc.vanderbilt.edu/wiki/Main/Rrms) in R. It is very easy to use and has lots of nice features. + +Here's an example: + + # Load package (remember to install.packages(""rms"") or this will fail the first time) + library(rms) + + # Get a dataset to experiment with + data(mtcars) + mtcars$am <- factor(mtcars$am, levels=0:1, labels=c(""Automatic"", ""Manual"")) + + # The rms package needs this work properly + dd <- datadist(mtcars) + options(datadist=""dd"") + + # Do the regression + f <- ols(mpg~wt, data=mtcars, x=T, y=T) + + # Plot regular mean confidence interval + p <- Predict(f, wt=seq(2.5, 4, by=.001), conf.type=""mean"") + plot(p, ylim=c(10, 30), col=""lightblue"") + + # Plot wide confidence interval + p <- Predict(f, wt=seq(2.5, 4, by=.001), conf.type=""individual"") + plot(p, ylim=c(10, 30), col=""lightblue"") + +Gives this output: + +![Plain line][1] + +Now usually you want to test the linearity assumption: + + # Try the model with a restricted cubic spline + f <- ols(mpg~rcs(wt, 3), data=mtcars, x=T, y=T) + anova(f) + +Gives this output: + + > anova(f) + Analysis of Variance Response: mpg + + Factor d.f. Partial SS MS F P + wt 2 922.04230 461.021149 65.54 <.0001 + Nonlinear 1 74.31705 74.317047 10.56 0.0029 + REGRESSION 2 922.04230 461.021149 65.54 <.0001 + ERROR 29 204.00489 7.034651 + +And if you plot the graphs with the same code as a bove you get this picture: + +![Regression with a spline][2] + +If you want to make your formula more complicated just add that variable: + + f <- ols(mpg~rcs(wt, 3)+am, data=mtcars, x=T, y=T) + p <- Predict(f, wt=seq(2.5, 4, by=.001), am=levels(mtcars$am), conf.type=""individual"") + plot(p) + +I don't know anything about JMP, it shouldn't be too difficult but I recommend learning R because it gives you an incredible freedom. + +Hope this helped. + + [1]: https://i.stack.imgur.com/UAvh7.png + [2]: https://i.stack.imgur.com/jkftF.png +",,2012-05-06 10:01:28.687 +75872,25072,7421.0,2,,CC BY-SA 3.0,6bf1d89b-922e-4d86-bf5e-2095145cefaa,"Before it is pointed, I am aware that a [very similar question was already asked][1]. Still, I am in doubt regarding the concept. + +More specifically, it is mentioned by the most voted answer that: + +>In terms of a **simple rule of thumb**, I'd suggest that you: + +>1. Run factor analysis if you assume or wish to test a theoretical model of latent factors causing observed variables. + +>2. Run principal components analysis If you want to simply reduce your correlated observed variables to a smaller set of important independent composite variables. + +**Question 1:** + +I am having difficulties on understanding based on the results I obtained from R where exactly I am inputing my *theoretical model of latent factors*. I am using the functions from [statsmethods][2]. On both **factanal()** and **princomp()** the inputs were the same: A table where each row represented one data point and the columns consisted of different attributes I was interested on reducing. Thus, this add to my confusion on where is this pre assumed model play its role. I noticed that for factor analysis function I used parallel analysis also suggested by the site using the nScree() function to determine the number of factors and I specified if I wanted a varimax (orthogonal) or promax (oblique) rotation. Is that what is it mean by the model? 
Being ale to choose the amount of factors and the type of rotation? + +**Question 2:** + +I bought a book to study about this from Richard L. Gorsuch. On this book there is something that the author caught attention on the difference between PCA (Principal Component Analysis) and EFA (Exploratory Factor Analysis): It is mentioned that PCA is for **population** while EFA is for **sample**. Is that true? I didn't see that being mentioned on any discussion I read so far. Is it irrelevant? + +**Question 3:** + +I noticed that all those methods seems to impose the normal distribution constraint. I also read that for larger sets this constraint can be ignored. Is that true or PCA, EFA and CFA are sensible to distribution constraint violations? + + +**Question 4:** Where from the results of PCA and EFA should I note that one is talking about latent factors (EFA) and the other is just clustering on components (factors) the variables? The outputs from R looks the same to me. Is it just the way I perceive what the factors being shown as output? I noted that both show me the table where I can see which I can observe which of my variables are expressed the most of my factors. **What is the difference on the interpretation I should have on which variable belongs to which factor** in respect to PCA and EFA? EFA is saying those with higher expression seems to be more explained by that latent factor while PCA is trying to say that factor is holding those variables from what is it observed? + +**Question 5** +Finally the last question is regarding CFA (Confirmatory Factor Analysis). + +On the same function website the following image is being shown: + +![Confirmatory Factor Analysis][3] + +I read that CFA is usually followed after EFA for hypothesis testing. In that sense, EFA tells you which are the latent factors (which are the output factors) and then you use CFA assuming those factors you observed from EFA for hypothesis testing? + +**Question 6** + +For EFA one of the available rotations on the literature is direct oblimium. I heard that it can accounts for both promax and varimax so 'it takes the best of two words'. Is that true? I am also trying to find a function that employs them on R, since the one suggested on the site does not. I would be happy to get any suggestion on this one. + + +--- + +I hope it is noted that this question is way more specific on the doubts regarding EFA and PCA and also adds to CFA so not to get closed for being repeated on the subject. If at least one of the questions is answered I am more than happy too as to clarify the confusion in my head. + +Thank you. + + + [1]: http://stats.stackexchange.com/questions/1576/what-are-the-differences-between-factor-analysis-and-principal-component-analysi + [2]: http://www.statmethods.net/advstats/factor.html + [3]: https://i.stack.imgur.com/NNrvh.gif",,2012-05-14 05:55:32.040 +75874,25072,7421.0,3,,CC BY-SA 3.0,6bf1d89b-922e-4d86-bf5e-2095145cefaa,,,2012-05-14 05:55:32.040 +75873,25072,7421.0,1,,CC BY-SA 3.0,6bf1d89b-922e-4d86-bf5e-2095145cefaa,"Differences on Exploratory Factor Analysis, Confirmatory Factor Analysis and Principal Component Analysis",,2012-05-14 05:55:32.040 +75875,25072,7421.0,5,,CC BY-SA 3.0,8a8f3f42-91aa-4ffe-a355-a9e3b9b871b0,"Before it is pointed, I am aware that a [very similar question was already asked][1]. Still, I am in doubt regarding the concept. + +More specifically, it is mentioned by the most voted answer that: + +>In terms of a **simple rule of thumb**, I'd suggest that you: + +>1. 
Run factor analysis if you assume or wish to test a theoretical model of latent factors causing observed variables. + +>2. Run principal components analysis If you want to simply reduce your correlated observed variables to a smaller set of important independent composite variables. + +**Question 1:** + +I am having difficulties on understanding based on the results I obtained from R where exactly I am inputing my *theoretical model of latent factors*. I am using the functions from [statsmethods][2]. On both **factanal()** and **princomp()** the inputs were the same: A table where each row represented one data point and the columns consisted of different attributes I was interested on reducing. Thus, this add to my confusion on where is this pre assumed model play its role. I noticed that for factor analysis function I used parallel analysis also suggested by the site using the nScree() function to determine the number of factors and I specified if I wanted a varimax (orthogonal) or promax (oblique) rotation. Is that what is it mean by the model? Being able to choose the amount of factors and the type of rotation? + +**Question 2:** + +I bought a book to study about this from Richard L. Gorsuch. On this book there is something that the author caught attention on the difference between PCA (Principal Component Analysis) and EFA (Exploratory Factor Analysis): It is mentioned that PCA is for **population** while EFA is for **sample**. Is that true? I didn't see that being mentioned on any discussion I read so far. Is it irrelevant? + +**Question 3:** + +I noticed that all those methods seems to impose the normal distribution constraint. I also read that for larger sets this constraint can be ignored. Is that true or PCA, EFA and CFA are sensible to distribution constraint violations? + + +**Question 4:** Where from the results of PCA and EFA should I note that one is talking about latent factors (EFA) and the other is just clustering on components (factors) the variables? The outputs from R looks the same to me. Is it just the way I perceive what the factors being shown as output? I noted that both show me the table where I can see which I can observe which of my variables are expressed the most of my factors. **What is the difference on the interpretation I should have on which variable belongs to which factor** in respect to PCA and EFA? EFA is saying those with higher expression seems to be more explained by that latent factor while PCA is trying to say that factor is holding those variables from what is it observed? + +**Question 5** +Finally the last question is regarding CFA (Confirmatory Factor Analysis). + +On the same function website the following image is being shown: + +![Confirmatory Factor Analysis][3] + +I read that CFA is usually followed after EFA for hypothesis testing. In that sense, EFA tells you which are the latent factors (which are the output factors) and then you use CFA assuming those factors you observed from EFA for hypothesis testing? + +**Question 6** + +For EFA one of the available rotations on the literature is direct oblimium. I heard that it can accounts for both promax and varimax so 'it takes the best of two words'. Is that true? I am also trying to find a function that employs them on R, since the one suggested on the site does not. I would be happy to get any suggestion on this one. 
+ + +--- + +I hope it is noted that this question is way more specific on the doubts regarding EFA and PCA and also adds to CFA so not to get closed for being repeated on the subject. If at least one of the questions is answered I am more than happy too as to clarify the confusion in my head. + +Thank you. + + + [1]: http://stats.stackexchange.com/questions/1576/what-are-the-differences-between-factor-analysis-and-principal-component-analysi + [2]: http://www.statmethods.net/advstats/factor.html + [3]: https://i.stack.imgur.com/NNrvh.gif",added 1 characters in body,2012-05-14 06:09:44.487 +75876,25072,7421.0,5,,CC BY-SA 3.0,a6360364-d39c-4c01-ba65-1432330c9363,"Before it is pointed, I am aware that a [very similar question was already asked][1]. Still, I am in doubt regarding the concept. + +More specifically, it is mentioned by the most voted answer that: + +>In terms of a **simple rule of thumb**, I'd suggest that you: + +>1. Run factor analysis if you assume or wish to test a theoretical model of latent factors causing observed variables. + +>2. Run principal components analysis If you want to simply reduce your correlated observed variables to a smaller set of important independent composite variables. + +**Question 1:** + +I am having difficulties on understanding based on the results I obtained from R where exactly I am inputing my *theoretical model of latent factors*. I am using the functions from [statsmethods][2]. On both **factanal()** and **princomp()** the inputs were the same: A table where each row represented one data point and the columns consisted of different attributes I was interested on reducing. Thus, this add to my confusion on where is this pre assumed model play its role. I noticed that for factor analysis function I used parallel analysis also suggested by the site using the nScree() function to determine the number of factors and I specified if I wanted a varimax (orthogonal) or promax (oblique) rotation. Is that what is it mean by the model? Being able to choose the amount of factors and the type of rotation? + +The results being provided as visual graphs for both PCA and EFA also doesn't seem to highlight this difference which adds to my confusion. Where does this distinction can be observed on them? + +![PCA][3] +PCA + +![EFA][4] +EFA + +**Question 2:** + +I bought a book to study about this from Richard L. Gorsuch. On this book there is something that the author caught attention on the difference between PCA (Principal Component Analysis) and EFA (Exploratory Factor Analysis): It is mentioned that PCA is for **population** while EFA is for **sample**. Is that true? I didn't see that being mentioned on any discussion I read so far. Is it irrelevant? + +**Question 3:** + +I noticed that all those methods seems to impose the normal distribution constraint. I also read that for larger sets this constraint can be ignored. Is that true or PCA, EFA and CFA are sensible to distribution constraint violations? + + +**Question 4:** Where from the results of PCA and EFA should I note that one is talking about latent factors (EFA) and the other is just clustering on components (factors) the variables? The outputs from R looks the same to me. Is it just the way I perceive what the factors being shown as output? I noted that both show me the table where I can see which I can observe which of my variables are expressed the most of my factors. **What is the difference on the interpretation I should have on which variable belongs to which factor** in respect to PCA and EFA? 
EFA is saying those with higher expression seems to be more explained by that latent factor while PCA is trying to say that factor is holding those variables from what is it observed? + +**Question 5** +Finally the last question is regarding CFA (Confirmatory Factor Analysis). + +On the same function website the following image is being shown: + +![Confirmatory Factor Analysis][5] + +I read that CFA is usually followed after EFA for hypothesis testing. In that sense, EFA tells you which are the latent factors (which are the output factors) and then you use CFA assuming those factors you observed from EFA for hypothesis testing? + +**Question 6** + +For EFA one of the available rotations on the literature is direct oblimium. I heard that it can accounts for both promax and varimax so 'it takes the best of two words'. Is that true? I am also trying to find a function that employs them on R, since the one suggested on the site does not. I would be happy to get any suggestion on this one. + + +--- + +I hope it is noted that this question is way more specific on the doubts regarding EFA and PCA and also adds to CFA so not to get closed for being repeated on the subject. If at least one of the questions is answered I am more than happy too as to clarify the confusion in my head. + +Thank you. + + + [1]: http://stats.stackexchange.com/questions/1576/what-are-the-differences-between-factor-analysis-and-principal-component-analysi + [2]: http://www.statmethods.net/advstats/factor.html + [3]: https://i.stack.imgur.com/SGK56.jpg + [4]: https://i.stack.imgur.com/Eg4MN.jpg + [5]: https://i.stack.imgur.com/NNrvh.gif",added few images to highlight the doubts,2012-05-14 06:17:10.343 +75877,25072,,4,user88,CC BY-SA 3.0,dc2b5e8c-78d1-4bac-ab88-b383a27a4ab8,"Differences on exploratory factor analysis, confirmatory factor analysis and principal component analysis",edited title,2012-05-14 07:04:25.323 +75924,25087,8363.0,2,,CC BY-SA 3.0,c7fff30b-43b7-429b-9cd5-7187003b5008,I will just address question 2. I have some doubts about how well the author knows his subject if he really said it the way you have presented it. PCA is applied to the sample just like EFA and CFA. It simply takes a list of n possibly related factors looks at how the points (samples) scatter in n-dimension space and then gets the first principal component as the linear combination that explains more of the variability in the data than any other linear combination. Then the second looks at orthogonal directions to the first to find theone out of those that explains the most of the remaining variability and so on with the 3rd and 4th. So sometimes one can take just 1-3 components to describe most of the variation in the data. That is why factor analysis and principal componet analysis are described according to 1 and 2 in your statement.,,2012-05-14 14:04:26.687 +76761,25072,7421.0,5,,CC BY-SA 3.0,23930539-bc39-4796-ab75-2b19cc5cc9c7,"Before it is pointed, I am aware that a [very similar question was already asked][1]. Still, I am in doubt regarding the concept. + +More specifically, it is mentioned by the most voted answer that: + +>In terms of a **simple rule of thumb**, I'd suggest that you: + +>1. Run factor analysis if you assume or wish to test a theoretical model of latent factors causing observed variables. + +>2. Run principal components analysis If you want to simply reduce your correlated observed variables to a smaller set of important independent composite variables. 
+ +**Question 1:** + +I am having difficulties on understanding based on the results I obtained from R where exactly I am inputing my *theoretical model of latent factors*. I am using the functions from [statsmethods][2]. On both **factanal()** and **princomp()** the inputs were the same: A table where each row represented one data point and the columns consisted of different attributes I was interested on reducing. Thus, this add to my confusion on where is this pre assumed model play its role. I noticed that for factor analysis function I used parallel analysis also suggested by the site using the nScree() function to determine the number of factors and I specified if I wanted a varimax (orthogonal) or promax (oblique) rotation. Is that what is it mean by the model? Being able to choose the amount of factors and the type of rotation? + +The results being provided as visual graphs for both PCA and EFA also doesn't seem to highlight this difference which adds to my confusion. Where does this distinction can be observed on them? + +![PCA][3] +PCA + +![EFA][4] +EFA + +**Question 2:** -- Answered + +I bought a book to study about this from Richard L. Gorsuch. On this book there is something that the author caught attention on the difference between PCA (Principal Component Analysis) and EFA (Exploratory Factor Analysis): It is mentioned that PCA is for **population** while EFA is for **sample**. Is that true? I didn't see that being mentioned on any discussion I read so far. Is it irrelevant? + +**Question 3:** + +I noticed that all those methods seems to impose the normal distribution constraint. I also read that for larger sets this constraint can be ignored. Is that true or PCA, EFA and CFA are sensible to distribution constraint violations? + + +**Question 4:** Where from the results of PCA and EFA should I note that one is talking about latent factors (EFA) and the other is just clustering on components (factors) the variables? The outputs from R looks the same to me. Is it just the way I perceive what the factors being shown as output? I noted that both show me the table where I can see which I can observe which of my variables are expressed the most of my factors. **What is the difference on the interpretation I should have on which variable belongs to which factor** in respect to PCA and EFA? EFA is saying those with higher expression seems to be more explained by that latent factor while PCA is trying to say that factor is holding those variables from what is it observed? + +**Question 5** +Finally the last question is regarding CFA (Confirmatory Factor Analysis). + +On the same function website the following image is being shown: + +![Confirmatory Factor Analysis][5] + +I read that CFA is usually followed after EFA for hypothesis testing. In that sense, EFA tells you which are the latent factors (which are the output factors) and then you use CFA assuming those factors you observed from EFA for hypothesis testing? + +**Question 6** + +For EFA one of the available rotations on the literature is direct oblimium. I heard that it can accounts for both promax and varimax so 'it takes the best of two words'. Is that true? I am also trying to find a function that employs them on R, since the one suggested on the site does not. I would be happy to get any suggestion on this one. + + +--- + +I hope it is noted that this question is way more specific on the doubts regarding EFA and PCA and also adds to CFA so not to get closed for being repeated on the subject. 
If at least one of the questions is answered I am more than happy too as to clarify the confusion in my head. + +Thank you. + + + [1]: http://stats.stackexchange.com/questions/1576/what-are-the-differences-between-factor-analysis-and-principal-component-analysi + [2]: http://www.statmethods.net/advstats/factor.html + [3]: https://i.stack.imgur.com/SGK56.jpg + [4]: https://i.stack.imgur.com/Eg4MN.jpg + [5]: https://i.stack.imgur.com/NNrvh.gif",added 12 characters in body,2012-05-18 19:22:13.883 +76764,10069,2666.0,5,,CC BY-SA 3.0,9acb4028-2693-4cd1-af47-24768fb023a9,"In my experience, not only is it necessary to have all lower order effects in the model when they are connected to higher order effects, but it is also important to properly model (e.g., allowing to be nonlinear) main effects that are seemingly unrelated to the factors in the interactions of interest. That's because interactions between x1 and x2 can be stand-ins for main effects of x3 and x4. Interactions sometimes *seem* to be needed because they are collinear with omitted variables or omitted nonlinear (e.g., spline) terms.",added 8 characters in body,2012-05-18 19:29:16.353 +79109,26070,5911.0,2,,CC BY-SA 3.0,0530a116-5968-42eb-9c51-f5b323f99168,I have seen somewhere that classical distances (like euclidean distance) becomes weekly discriminant when we have multidimensional and sparse data. Why ? Do you have an example of two sparse data vectors where the distance do not performs well ? In this case which similarity should we use ?,,2012-06-01 13:55:13.253 +79110,26070,5911.0,1,,CC BY-SA 3.0,0530a116-5968-42eb-9c51-f5b323f99168,Euclidean distance is usually not good for sparse data?,,2012-06-01 13:55:13.253 +79111,26070,5911.0,3,,CC BY-SA 3.0,0530a116-5968-42eb-9c51-f5b323f99168,,,2012-06-01 13:55:13.253 +79128,26070,668.0,6,,CC BY-SA 3.0,1666741c-0682-467a-a68f-f4557767bbcb,,added 7 characters in body; edited tags,2012-06-01 15:30:45.033 +79127,26070,668.0,5,,CC BY-SA 3.0,1666741c-0682-467a-a68f-f4557767bbcb,I have seen somewhere that classical distances (like Euclidean distance) become weakly discriminant when we have multidimensional and sparse data. Why? Do you have an example of two sparse data vectors where the Euclidean distance does not perform well? In this case which similarity should we use?,added 7 characters in body; edited tags,2012-06-01 15:30:45.033 +79262,26070,,25,,,20af2220-ff35-46b9-96ca-49f1b2005d47,,http://twitter.com/#!/StackStats/status/208879359052886016,2012-06-02 11:14:54.757 +80964,26657,190.0,2,,CC BY-SA 3.0,15956cab-12e6-4234-8f3e-0d8be45ac328,"Here is a simple toy example illustrating the effect of dimensionality in a discrimination problem. + +**Framework** Assume that $\xi$ is a gaussian vector with mean $\nu$ and diagonal covariance $\sigma Id$ ($\sigma$ is known) and that you want to test the simple hypothesis + +$$H_0: \;\nu=0,\; Vs \; H_{\theta}: \; \nu=\theta $$ +(for a given $\theta\in \mathbb{R}^n$) $\theta$ is not necessarily known in advance. + +**Test statistic with energy**. The intuition you certainlly have is that it is a good idea to evaluate the norm/energy $\mathcal{E}_n=\frac{1}{n}\sum_{i=1}^n\xi_i^2$ of you observation $\xi$ to build a test statistic. Actually you can construct a standardized centered (under $H_0$) version $T_n$ of the energy $T_n=\frac{\sum_i\xi_i^2-\sigma^2}{\sqrt{2n\sigma^4}}$. 
That makes a critical region at level $\alpha$ of the form $\{T_n\geq v_{1-\alpha}\}$ for a well chosen $v_{1-\alpha}$ + + + +**Power of the test and dimension.** In this case it is an easy probabiliy exercice to show the following formula for the power of your test: + +> $$P_{\theta}(T\leq v_{1-\alpha})=P\left (Z\leq \frac{v_{1-\alpha}}{\sqrt{1+2\|\theta\|_2^2/(n\sigma^2)}}-\frac{\|\theta\|^2_2}{\sqrt{2n\sigma^4+2\sigma^2\|\theta\|_2^2/(n\sigma^2)}}\right )$$ +> with $Z$ a sum of $n$ iid random variables with $\mathbb{E}[Z]=0$ and $Var(Z)=1$. + +This means that the power of your test is increased by the energy of your signal $\|\theta\|^2_2$ and decreased by $n$. + +**Toward a test with a thresholded statistic.** If you do not have much energy in your signal but if you know a linear traformation that can help you to have this energy concentrated in a small part of your signal, then you can build a test statistic that will only evaluate the energy for the small part of your signal. If you known in advance where it is concentrated (for example you known there cannot be high frequencies in your signal) then you can obtain a power in the preceding test with $n$ replaced by a small number and $\|\theta\|^2_2$ allmost the same... If you do not know it in advance you have to estimate it this leads to well known thresholding tests. + +Note that this argument is exactly at the root many papers such as + + - A Antoniadis, F Abramovich, T Sapatinas, and B Vidakovic. Wavelet methods for testing +in functional analysis of variance models. International Journal on Wavelets and its +applications, 93 :1007–1021, 2004. + - M. V. Burnashef and Begmatov. On a problem of signal detection leading to stable distribution. Theory of probability and its applications, 35(3) :556–560, 1990. + - Y. Baraud. Non asymptotic minimax rate of testing in signal detection. Bernoulli, 8 :577–606, 2002. + - J Fan. Test of significance based on wavelet thresholding and neyman’s truncation. JASA, +91 :674–688, 1996. + - J. Fan and S-K Lin. Test of significance when data are curves. JASA, 93 :1007–1021, 1998. + - V. Spokoiny. Adaptative hypothesis testing using wavelets. Annals of Statistics, 24(6) :2477–2498, december 1996. +",,2012-06-12 09:23:27.760 +81125,26657,190.0,5,,CC BY-SA 3.0,1ba58f6d-8d11-4ba8-8fc7-f3f092a0cd6a,"Here is a simple toy example illustrating the effect of dimensionality in a discrimination problem. + +**Heuristic.** The key issue here is that the Euclidian norm gives the same importance to any direction. This constitutes a lack of a priori, and as you certainly know in high dimension there is no free lunch (i.e. if you don't known what you are searching for, then why noise is not what you are searching for). I would say that for any problem there is a limit of information that is necessary to find something else than noise. This limit is related somehow to the ""size"" of the area you are trying to explore with regard to the ""noise"" level (i.e. level of uninformative content). In high dimension if you have the a priori that your signal is sparse then you can remove non sparse vector with metric that fill the space with sparse vector or by using thresholding. + +**Framework** Assume that $\xi$ is a gaussian vector with mean $\nu$ and diagonal covariance $\sigma Id$ ($\sigma$ is known) and that you want to test the simple hypothesis + +$$H_0: \;\nu=0,\; Vs \; H_{\theta}: \; \nu=\theta $$ +(for a given $\theta\in \mathbb{R}^n$) $\theta$ is not necessarily known in advance. 
+ +**Test statistic with energy**. The intuition you certainlly have is that it is a good idea to evaluate the norm/energy $\mathcal{E}_n=\frac{1}{n}\sum_{i=1}^n\xi_i^2$ of you observation $\xi$ to build a test statistic. Actually you can construct a standardized centered (under $H_0$) version $T_n$ of the energy $T_n=\frac{\sum_i\xi_i^2-\sigma^2}{\sqrt{2n\sigma^4}}$. That makes a critical region at level $\alpha$ of the form $\{T_n\geq v_{1-\alpha}\}$ for a well chosen $v_{1-\alpha}$ + + + +**Power of the test and dimension.** In this case it is an easy probabiliy exercice to show the following formula for the power of your test: + +> $$P_{\theta}(T\leq v_{1-\alpha})=P\left (Z\leq \frac{v_{1-\alpha}}{\sqrt{1+2\|\theta\|_2^2/(n\sigma^2)}}-\frac{\|\theta\|^2_2}{\sqrt{2n\sigma^4+2\sigma^2\|\theta\|_2^2/(n\sigma^2)}}\right )$$ +> with $Z$ a sum of $n$ iid random variables with $\mathbb{E}[Z]=0$ and $Var(Z)=1$. + +This means that the power of your test is increased by the energy of your signal $\|\theta\|^2_2$ and decreased by $n$. + +**Toward a test with a thresholded statistic.** If you do not have much energy in your signal but if you know a linear traformation that can help you to have this energy concentrated in a small part of your signal, then you can build a test statistic that will only evaluate the energy for the small part of your signal. If you known in advance where it is concentrated (for example you known there cannot be high frequencies in your signal) then you can obtain a power in the preceding test with $n$ replaced by a small number and $\|\theta\|^2_2$ allmost the same... If you do not know it in advance you have to estimate it this leads to well known thresholding tests. + +Note that this argument is exactly at the root many papers such as + + - A Antoniadis, F Abramovich, T Sapatinas, and B Vidakovic. Wavelet methods for testing +in functional analysis of variance models. International Journal on Wavelets and its +applications, 93 :1007–1021, 2004. + - M. V. Burnashef and Begmatov. On a problem of signal detection leading to stable distribution. Theory of probability and its applications, 35(3) :556–560, 1990. + - Y. Baraud. Non asymptotic minimax rate of testing in signal detection. Bernoulli, 8 :577–606, 2002. + - J Fan. Test of significance based on wavelet thresholding and neyman’s truncation. JASA, +91 :674–688, 1996. + - J. Fan and S-K Lin. Test of significance when data are curves. JASA, 93 :1007–1021, 1998. + - V. Spokoiny. Adaptative hypothesis testing using wavelets. Annals of Statistics, 24(6) :2477–2498, december 1996. +",added 775 characters in body,2012-06-13 06:03:39.853 +81702,412,668.0,38,,,6aaaae68-e8ba-499c-a44c-f070d563bca3,"[{""Id"":919,""DisplayName"":""whuber""}]",from http://stats.stackexchange.com/questions/30528/concise-book-tutorial-for-statistics-data-analysis,2012-06-15 15:18:35.453 +82550,27120,568.0,2,,CC BY-SA 3.0,2f33f795-f5a9-4ceb-aa56-4458ebfbf8de,"As a non-native english speaker I was wondering which of the **square** or **squared** expression I should use. For instance in mean **square** error or mean **squared** error. + +According to the Internet, it seems both forms are used indistinctly. 
Is one expression more square than the other ?",,2012-06-20 16:41:10.737 +82551,27120,568.0,1,,CC BY-SA 3.0,2f33f795-f5a9-4ceb-aa56-4458ebfbf8de,Mean square error or mean squared error,,2012-06-20 16:41:10.737 +82552,27120,568.0,3,,CC BY-SA 3.0,2f33f795-f5a9-4ceb-aa56-4458ebfbf8de,,,2012-06-20 16:41:10.737 +82556,27120,668.0,6,,CC BY-SA 3.0,57763cab-db74-4cc2-ab81-658839b63241,,edited tags,2012-06-20 16:54:18.370 +95611,30862,,25,,,9ec383bd-1259-45c3-be63-5a04a7b8cfb8,,http://twitter.com/#!/StackStats/status/240282770423832579,2012-08-28 03:00:51.527 +126223,40121,14728.0,2,,CC BY-SA 3.0,08a3103d-cd12-49f2-9aa8-2b4d408b4ec8,"I'm trying to compare some experiment data using JMP. In the `Compare Means` function, there are different tests such as `Each Pair, Student’s t` and `All Pairs, Tukey HSD`, which gives circles of different radius. What's the difference between the tests and how are these radius calculated? I found this [help file][1] but it did not answer my question. + + + [1]: http://www.jmp.com/support/help/Additional_Examples_of_the_Oneway_Platform.shtml",,2013-02-10 03:03:22.580 +82590,27132,668.0,2,,CC BY-SA 3.0,2205616d-4ab0-4890-ab4e-e1c1d8d5e4be,"**The conceptual uses of ""square"" and ""squared"" are subtly different,** although interchangeable: + +* ""Squared"" refers to the past *action* of taking or computing the second power. E.g., $x^2$ is usually read as ""x-squared,"" not ""x-square."" (The latter is sometimes encountered but I suspect it results from speakers who are accustomed to clipping their phrases or who just haven't heard the terminal dental in ""x-squared."") + +* ""Square"" refers to the *result* of taking the second power. E.g., $x^2$ can be referred to as the ""square of x."" (The illocution ""squared of x"" is never used.) + +These suggest that a person using a phrase like ""mean squared error"" is thinking in terms of a *computation*: take the errors, square them, average those. The phrase ""mean square error"" has a more conceptual feel to it: average the square errors. The user of this phrase may be thinking in terms of square errors rather than the errors themselves. I believe this shows up especially in theoretical literature where the second form, ""square,"" appears more often (I believe: I haven't systematically checked). + +Obviously both are equivalent in function and safely interchangeable in practice. It is interesting, though, that some careful Google queries give substantially different hit counts. Presently, + + ""mean squared"" -square -root -Einstein -Relativity + +returns about 367,000 results (notice the necessity of ruling out the phrase ""$e=m c^2$"" popularly quoted in certain contexts, which demands the use of ""squared"" instead of ""square"" when written out), while + + ""mean square"" -squared -root -Einstein -Relativity + +(maintaining analogous exclusions for comparability) returns an order of magnitude more, at 3.47 million results. This (weakly) suggests people favor ""mean square"" over ""mean squared,"" but don't take this too much to heart: ""mean squared"" is used in official SAS documentation, for instance.",,2012-06-20 20:29:04.380 +82614,27132,668.0,5,,CC BY-SA 3.0,6b2395e3-2026-4d16-9020-57b702a39355,"**The conceptual uses of ""square"" and ""squared"" are subtly different,** although (almost) interchangeable: + +* ""Squared"" refers to the past *action* of taking or computing the second power. 
E.g., $x^2$ is usually read as ""x-squared,"" not ""x-square."" (The latter is sometimes encountered but I suspect it results from speakers who are accustomed to clipping their phrases or who just haven't heard the terminal dental in ""x-squared."") + +* ""Square"" refers to the *result* of taking the second power. E.g., $x^2$ can be referred to as the ""square of x."" (The illocution ""squared of x"" is never used.) + +These suggest that a person using a phrase like ""mean squared error"" is thinking in terms of a *computation*: take the errors, square them, average those. The phrase ""mean square error"" has a more conceptual feel to it: average the square errors. The user of this phrase may be thinking in terms of square errors rather than the errors themselves. I believe this shows up especially in theoretical literature where the second form, ""square,"" appears more often (I believe: I haven't systematically checked). + +Obviously both are equivalent in function and safely interchangeable in practice. It is interesting, though, that some careful Google queries give substantially different hit counts. Presently, + + ""mean squared"" -square -root -Einstein -Relativity + +returns about 367,000 results (notice the necessity of ruling out the phrase ""$e=m c^2$"" popularly quoted in certain contexts, which demands the use of ""squared"" instead of ""square"" when written out), while + + ""mean square"" -squared -root -Einstein -Relativity + +(maintaining analogous exclusions for comparability) returns an order of magnitude more, at 3.47 million results. This (weakly) suggests people favor ""mean square"" over ""mean squared,"" but don't take this too much to heart: ""mean squared"" is used in official SAS documentation, for instance.",added 9 characters in body,2012-06-21 01:36:39.797 +82789,27194,1085.0,1,,CC BY-SA 3.0,8ff84a0c-ae79-4626-9587-e5d450bd6482,what does that mean that 2 time series are colinear (or collinear)?,,2012-06-22 02:20:37.490 +82788,27194,1085.0,2,,CC BY-SA 3.0,8ff84a0c-ae79-4626-9587-e5d450bd6482,"I am familiar with the concept of cointegration. + +But I hear sometimes people talking about colinearity (or collinearity) for time series. +A set of points is collinear if they are on the same line. But what does that mean for time series? + +Is it exactly the same as cointegration of order 1? 
+Or is there something stronger/different in the concept of collinearity?",,2012-06-22 02:20:37.490 +82787,27194,1085.0,3,,CC BY-SA 3.0,8ff84a0c-ae79-4626-9587-e5d450bd6482,,,2012-06-22 02:20:37.490 +82812,27194,674.0,6,,CC BY-SA 3.0,c55e067f-4673-4b7c-92e0-1afbcaeeda95,,edited tags; edited title,2012-06-22 07:49:51.370 +82811,27194,674.0,4,,CC BY-SA 3.0,c55e067f-4673-4b7c-92e0-1afbcaeeda95,What does that mean that two time series are colinear?,edited tags; edited title,2012-06-22 07:49:51.370 +82951,27194,,25,,,5107e24f-1ca2-4d33-a388-cec6427ac4d2,,http://twitter.com/#!/StackStats/status/216252810881282050,2012-06-22 19:34:22.503 +87118,16366,,6,,CC BY-SA 3.0,9b4a1ac0-ffdf-4f09-9922-6f94c0ffd96c,,edited tags,2012-07-15 20:58:58.220 +90550,4714,60.0,5,,CC BY-SA 3.0,85734fde-3036-4b17-b66f-acbf39d8805a,Reverend [Thomas Bayes](http://en.wikipedia.org/wiki/Thomas_Bayes) for discovering Bayes' theorem,added 4 characters in body,2012-08-02 03:09:39.773 +91184,1248,,25,,,7ad766be-e930-45b3-a1f0-fa155b7770f8,,http://twitter.com/#!/StackStats/status/232388024548339712,2012-08-06 08:09:57.143 +93684,541,5237.0,5,,CC BY-SA 3.0,a9188445-f779-419a-a7b0-8470be848d3a,"ANOVA is equivalent to linear regression with the use of suitable dummy variables. The conclusions remain the same irrespective of whether you use ANOVA or linear regression. + +In light of their equivalence, is there any reason why ANOVA is used instead of linear regression? + +Note: I am particularly interested in hearing about **technical** reasons for the use of ANOVA instead of linear regression. + +**Edit** + +Here is one example using one-way ANOVA. Suppose, you want to know if the average height of male and females is the same. To test for your hypothesis you would collect data from a random sample of male and females (say 30 each) and perform the ANOVA analysis (i.e., sum of squares for gender and error) to decide whether an effect exists. + +You could also use linear regression to test for this as follows: + +Define: $\text{Gender} = 1$ if respondent is a male and $0$ otherwise. +$$ +\text{Height} = \text{Intercept} + \beta * \text{Gender} + \text{error} +$$ +where: $\text{error}\sim\mathcal N(0,\sigma^2)$ + +Then a test of whether $\beta = 0$ is a an equivalent test for your hypothesis.",added mathjax,2012-08-18 16:21:38.253 +94058,30434,9605.0,2,,CC BY-SA 3.0,6d4d3822-fb7e-41d1-983b-98cba880abe7,"This is indeed something often glossed over. + +Some people are doing something a bit cheeky: holding out a proportion of the words in each document, and giving using predictive probabilities of these held-out words given the document-topic mixtures as well as the topic-word mixtures. This is obviously not ideal as it doesn't evaluate performance on any held-out documents. + +To do it properly with held-out documents you do as suggested need to ""integrate over the Dirichlet prior for all possible topic mixtures"". http://people.cs.umass.edu/~wallach/talks/evaluation.pdf reviews a few methods for tackling this slightly unpleasant integral. 
I'm just about to try and implement this myself in fact, so good luck!",,2012-08-20 14:56:05.707 +95589,30862,1805.0,1,,CC BY-SA 3.0,ce4200a8-b09a-418d-aa4f-4da3f8770a83,Why bother with low rank approximations?,,2012-08-28 00:12:57.667 +95590,30862,1805.0,3,,CC BY-SA 3.0,ce4200a8-b09a-418d-aa4f-4da3f8770a83,,,2012-08-28 00:12:57.667 +95588,30862,1805.0,2,,CC BY-SA 3.0,ce4200a8-b09a-418d-aa4f-4da3f8770a83,"If you have a matrix with n rows and m columns, you can use SVD or other methods to calculate a [low-rank approximation][1] of the given matrix. + +However, the low rank approximation will still have n rows and m columns. How can low-rank-approximations be useful for machine learning and natural language processing, given that you are left with the same number of features? + + + [1]: http://en.wikipedia.org/wiki/Low-rank_approximation",,2012-08-28 00:12:57.667 +104635,33598,,6,user10525,CC BY-SA 3.0,c8b4acc1-5c74-4c86-bb89-b20e3917ffa3,,Attach image and related tag,2012-10-16 09:32:48.907 +131932,41914,13918.0,3,,CC BY-SA 3.0,dd90e082-7edf-4155-b4dd-caff8a793247,,,2013-03-12 11:17:01.283 +95694,30864,4890.0,5,,CC BY-SA 3.0,91755515-9824-400a-92e9-f0a2c70617d7,"A low rank approximation $\hat{X}$ of $X$ can be decomposed into a matrix square root as $G=U_{r}\lambda_{r}^\frac{1}{2}$ where the eigen decomposition of $X$ is $U\lambda U^T$, thereby reducing the number of features, which can be represented by $G$ based on the rank-r approximation as $\hat{X}=GG^T$. Note that the subscript $r$ represents the number of eigen-vectors and eigen-values used in the approximation. Hence, it does reduce the number of features to represent the data. In some examples low-rank approximations are considered as basis or latent variable (dictionary) based expansions of the original data, under special constraints like orthogonality, non-negativity (non-negative matrix factorization) etc.",added 1 characters in body,2012-08-28 14:22:41.110 +95922,30957,9446.0,3,,CC BY-SA 3.0,c4ff9d11-fdf7-4b61-9012-7ba7a6c1758c,,,2012-08-29 14:56:24.737 +95920,30957,9446.0,2,,CC BY-SA 3.0,c4ff9d11-fdf7-4b61-9012-7ba7a6c1758c,"I have a fairly long time-series of annual abundances ($N_t$) of a wildlife species (73 years of abundances). To forecast the population’s trajectory, I have used ARIMA modeling. Examination of the ACF and PACF of the first-order differenced time-series suggested a 10-year cycle exists. So I used a span 10 seasonal difference to account for this periodic pattern. Therefore, the response variable was: +$$ +Y_t=(\sqrt{N_t}-\sqrt{N_{t-1}})-(\sqrt{N_{t-10}}-\sqrt{N_{t-11}}) +$$ +Typically, I would have used a logarithmic transformation but it resulted in heteroscedastic residuals. Examination of the ACF and PACF of $Y_t$ indicated a multiplicative seasonal structure so I fit the model: +$$ +ARIMA(0,1,1)(0,1,1)_{10} +$$ +using the Forecast Package in `R`....`library(forecast)`. + +Example code for fitting the model: + + m1=Arima(y,order=c(0,1,1),seasonal=list(order=c(0,1,1),period=10),include.mean=FALSE) + +The residuals of this model were normally distributed, not autocorrelated, and homoscedastic. + +I have been using the fitted model from above for some additional simulation work using the `simulate.Arima` function. However, I would like to initialize the simulation with a different time-series. The `arima.sim` function allows this but the `arima.sim` function doesn't seem to handle seasonal ARIMA models. 
With the `simulate.Arima` function one can use the `future=TRUE` option to simulate values that are ""future to and conditional on the data"" in the model `m1`. Can the data in the model object `m1` simply be replaced to create a simulation that is conditional on different data? + +For example: + + # Create a new model object for simulation. + m.sim=m1 + # Replace the data in the model object with the new data. + m.sim$x=new + # Simulation conditional on the new data. + sim.forecasts=replicate(1000,simulate.Arima(m.sim,future=TRUE,bootstrap=TRUE))",,2012-08-29 14:56:24.737 +95921,30957,9446.0,1,,CC BY-SA 3.0,c4ff9d11-fdf7-4b61-9012-7ba7a6c1758c,Initialize ARIMA simulations with different time-series,,2012-08-29 14:56:24.737 +95934,30960,132.0,2,,CC BY-SA 3.0,3ff51ceb-268a-40aa-ac58-9fe8d021bafd,"You can ""fit"" the model to different data and then simulate: + + m2 <- Arima(z,model=m1) + simulate.Arima(m2,future=TRUE,bootstrap=TRUE) + +`m2` will have the same parameters as `m1` (they are not re-estimated), but the residuals, etc., are computed on the new data. + +However, I am concerned with your model. Seasonal models are for when the seasonality is fixed and known. With animal population data, you almost certainly have aperiodic population cycling. This is a well-known phenomenon and can easily be handled with non-seasonal ARIMA models. Look at the literature on the Canadian lynx data for discussion. + +By all means, use the square root, but then I would use a non-seasonal ARIMA model. Provided the AR order is greater than 1, it is possible to have cycles. See + +You can do all this in one step: + + m1 <- auto.arima(y, lambda=0.5) + +Then proceed with your simulations as above. +",,2012-08-29 17:25:54.740 +95964,30957,9446.0,6,,CC BY-SA 3.0,0a2d2468-c0dc-4c33-8d45-5001fd329475,,edited tags,2012-08-29 21:08:35.373 +96965,2509,668.0,38,,,8e86171a-d1d1-4947-a6e0-d478293a0868,"[{""Id"":919,""DisplayName"":""whuber""}]",from http://stats.stackexchange.com/questions/35647/how-are-eigenvectors-and-principal-components-related,2012-09-04 12:24:34.043 +98179,31575,6404.0,1,,CC BY-SA 3.0,3e31d65c-12e2-4dd1-b090-0669c1336496,Estimating Markov Transition Probabilities from sequence data,,2012-09-11 15:29:12.027 +98180,31575,6404.0,3,,CC BY-SA 3.0,3e31d65c-12e2-4dd1-b090-0669c1336496,,,2012-09-11 15:29:12.027 +98178,31575,6404.0,2,,CC BY-SA 3.0,3e31d65c-12e2-4dd1-b090-0669c1336496,"I have a full set of sequences (432 observations to be precise) of 4 states $A-D$: eg + +$$Q=\left(\begin{array}{c c c c c} +A& C& D&D & B\\ +B& A& A&C & A\\ +\vdots&\vdots&\vdots&\vdots&\vdots\\ +B& C& A&D & A\\ + \end{array}\right)$$ + +Is there a way of calculating the transition matrix $$P_{ij}(Y_{t}=j|Y_{t-1}=i)$$ in Matlab or R or similar? I think the HMM package might help. Any thoughts?",,2012-09-11 15:29:12.027 +98185,31575,6404.0,5,,CC BY-SA 3.0,fbdf5795-cef6-4cf0-a73e-b9a654dc1ae8,"I have a full set of sequences (432 observations to be precise) of 4 states $A-D$: eg + +$$Y=\left(\begin{array}{c c c c c} +A& C& D&D & B\\ +B& A& A&C & A\\ +\vdots&\vdots&\vdots&\vdots&\vdots\\ +B& C& A&D & A\\ + \end{array}\right)$$ + +Is there a way of calculating the transition matrix $$P_{ij}(Y_{t}=j|Y_{t-1}=i)$$ in Matlab or R or similar? I think the HMM package might help. Any thoughts? 
+ +eg: http://stats.stackexchange.com/questions/14360/estimating-markov-chain-probabilities",added 92 characters in body,2012-09-11 15:37:08.833 +98207,31587,7007.0,2,,CC BY-SA 3.0,fa47d38a-ff7d-4060-96cc-07de9ab271b7,"Please, check the comments above. Here is a quick implementation in R. + + x <- c(1,2,1,1,3,4,4,1,2,4,1,4,3,4,4,4,3,1,3,2,3,3,3,4,2,2,3) + p <- matrix(nrow = 4, ncol = 4, 0) + for (t in 1:(length(x) - 1)) p[x[t], x[t + 1]] <- p[x[t], x[t + 1]] + 1 + for (i in 1:4) p[i, ] <- p[i, ] / sum(p[i, ]) + +Results: + + > p + [,1] [,2] [,3] [,4] + [1,] 0.1666667 0.3333333 0.3333333 0.1666667 + [2,] 0.2000000 0.2000000 0.4000000 0.2000000 + [3,] 0.1428571 0.1428571 0.2857143 0.4285714 + [4,] 0.2500000 0.1250000 0.2500000 0.3750000 + +P.S. Does anyone know why R doesn't have a `++` like operator?",,2012-09-11 17:05:10.090 +98218,31575,6404.0,5,,CC BY-SA 3.0,bf8763a4-fe47-4841-9bb9-c107d3fc60f4,"I have a full set of sequences (432 observations to be precise) of 4 states $A-D$: eg + +$$Y=\left(\begin{array}{c c c c c c c} +A& C& D&D & B & A &C\\ +B& A& A&C & A&- &-\\ +\vdots&\vdots&\vdots&\vdots&\vdots&\vdots&\vdots\\ +B& C& A&D & A & B & A\\ + \end{array}\right)$$ + +**EDIT**: The observation sequences are of unequal lengths! Does this change anything? + +Is there a way of calculating the transition matrix $$P_{ij}(Y_{t}=j|Y_{t-1}=i)$$ in Matlab or R or similar? I think the HMM package might help. Any thoughts? + +eg: http://stats.stackexchange.com/questions/14360/estimating-markov-chain-probabilities",added 129 characters in body,2012-09-11 18:08:32.263 +98232,31575,,4,user88,CC BY-SA 3.0,9a82b108-2b89-444f-b582-1cbd1948627f,Estimating Markov transition probabilities from sequence data,edited title,2012-09-11 19:44:59.607 +126180,40104,14684.0,1,,CC BY-SA 3.0,fd79b115-4bed-4397-9e4c-70d83be1859a,The sum of two independent Poisson random variables,,2013-02-09 19:31:13.290 +98233,31587,7007.0,5,,CC BY-SA 3.0,2a0d8cfe-809f-4c73-bc7e-f760847ef3cd,"Please, check the comments above. Here is a quick implementation in R. + + x <- c(1,2,1,1,3,4,4,1,2,4,1,4,3,4,4,4,3,1,3,2,3,3,3,4,2,2,3) + p <- matrix(nrow = 4, ncol = 4, 0) + for (t in 1:(length(x) - 1)) p[x[t], x[t + 1]] <- p[x[t], x[t + 1]] + 1 + for (i in 1:4) p[i, ] <- p[i, ] / sum(p[i, ]) + +Results: + + > p + [,1] [,2] [,3] [,4] + [1,] 0.1666667 0.3333333 0.3333333 0.1666667 + [2,] 0.2000000 0.2000000 0.4000000 0.2000000 + [3,] 0.1428571 0.1428571 0.2857143 0.4285714 + [4,] 0.2500000 0.1250000 0.2500000 0.3750000 + +A (probably dumb) implementation in MATLAB (which I have never used, so I don't know if this is going to work. I've just googled ""declare vector matrix MATLAB"" to get the syntax): + + x = [ 1, 2, 1, 1, 3, 4, 4, 1, 2, 4, 1, 4, 3, 4, 4, 4, 3, 1, 3, 2, 3, 3, 3, 4, 2, 2, 3 ] + + n = size(x) + + p = [ 0, 0, 0, 0; 0, 0, 0, 0; 0, 0, 0, 0; 0, 0, 0, 0 ] + + for t = 1:n + p(x(t), x(t + 1)) = p(x(t), x(t + 1)) + 1 + end + + for i = 1:4 + p(i, :) = p(i, :) / sum(p(i, :)) + end + +P.S. Does anyone know why R doesn't have a `++` like operator?",added 368 characters in body,2012-09-11 19:48:22.687 +98888,31587,6404.0,5,,CC BY-SA 3.0,f562f3dd-e1c5-4272-ad63-c75d645e668c,"Please, check the comments above. Here is a quick implementation in R. 
+ + x <- c(1,2,1,1,3,4,4,1,2,4,1,4,3,4,4,4,3,1,3,2,3,3,3,4,2,2,3) + p <- matrix(nrow = 4, ncol = 4, 0) + for (t in 1:(length(x) - 1)) p[x[t], x[t + 1]] <- p[x[t], x[t + 1]] + 1 + for (i in 1:4) p[i, ] <- p[i, ] / sum(p[i, ]) + +Results: + + > p + [,1] [,2] [,3] [,4] + [1,] 0.1666667 0.3333333 0.3333333 0.1666667 + [2,] 0.2000000 0.2000000 0.4000000 0.2000000 + [3,] 0.1428571 0.1428571 0.2857143 0.4285714 + [4,] 0.2500000 0.1250000 0.2500000 0.3750000 + +A (probably dumb) implementation in MATLAB (which I have never used, so I don't know if this is going to work. I've just googled ""declare vector matrix MATLAB"" to get the syntax): + + x = [ 1, 2, 1, 1, 3, 4, 4, 1, 2, 4, 1, 4, 3, 4, 4, 4, 3, 1, 3, 2, 3, 3, 3, 4, 2, 2, 3 ] + + n = length(x)-1 + + p = zeros(4,4) + + for t = 1:n + p(x(t), x(t + 1)) = p(x(t), x(t + 1)) + 1 + end + + + for i = 1:4 + p(i, :) = p(i, :) / sum(p(i, :)) + end + +P.S. Does anyone know why R doesn't have a `++` like operator?",quick clean-up. Matlab's size command returns the 2D length parameters of an array. so length returns the longest dimension,2012-09-15 21:47:52.567 +98889,31587,,24,,CC BY-SA 3.0,f562f3dd-e1c5-4272-ad63-c75d645e668c,,Proposed by 8686 approved by 686 edit id of 1740,2012-09-15 21:47:52.567 +99657,32038,11013.0,1,,CC BY-SA 3.0,7f1ab9d0-3fe6-463a-8e13-a9306f703a97,Minimum number of levels for a random effects factor?,,2012-09-20 01:56:50.007 +99659,32038,11013.0,2,,CC BY-SA 3.0,7f1ab9d0-3fe6-463a-8e13-a9306f703a97,"I'm using a mixed model in `R` (`lme4`) to analyze some repeated measures data. I have a response variable (fiber content of feces) and 3 fixed effects (body mass, etc.). My study only has 6 participants, with 16 repeated measures for each one (though two only have 12 repeats). The subjects are lizards that were given different combinations of food in different 'treatments'. + +My question is: can I use subject ID as a random effect? + +I know this is the usual course of action in longitudinal mixed effects models, to take account of the randomly sampled nature of the subjects and the fact that observations within subjects will be more closely correlated than those between subjects. But, treating subject ID as a random effect involves estimating a mean and variance for this variable. Since I have only 6 subjects (6 levels of this factor), is this enough to get an accurate characterization of the mean and variance? Does the fact that I have quite a few repeated measurements for each subject help in this regard (I don't see how it matters)? Finally, If I can't use subject ID as a random effect, will including it as a fixed effect allow me to control for the fact that I have repeated measures?",,2012-09-20 01:56:50.007 +99658,32038,11013.0,3,,CC BY-SA 3.0,7f1ab9d0-3fe6-463a-8e13-a9306f703a97,,,2012-09-20 01:56:50.007 +99721,32053,346.0,2,,CC BY-SA 3.0,30eaff0c-aafb-434e-a926-1bcd3ebff6e0,"Short answer: Yes, you can use ID as random effect with 6 levels. + +Slightly longer answer: The [mixed modeling FAQ for r-sig-mixed][1] says (among other things) the following under the headline ""*Should I treat factor xxx as fixed or random?*"": + +> One point of particular relevance to 'modern' mixed model estimation +> (rather than 'classical' method-of-moments estimation) is that, for +> practical purposes, there must be a reasonable number of +> random-effects levels (e.g. blocks) — more than 5 or 6 at a minimum. + +So you are at the lower bound, but on the right side of it. 
+ + [1]: http://glmm.wikidot.com/faq",,2012-09-20 12:35:29.537 +99843,32038,11013.0,5,,CC BY-SA 3.0,8c3cd3e6-f264-44f7-8db5-c54c0c84646a,"I'm using a mixed model in `R` (`lme4`) to analyze some repeated measures data. I have a response variable (fiber content of feces) and 3 fixed effects (body mass, etc.). My study only has 6 participants, with 16 repeated measures for each one (though two only have 12 repeats). The subjects are lizards that were given different combinations of food in different 'treatments'. + +My question is: can I use subject ID as a random effect? + +I know this is the usual course of action in longitudinal mixed effects models, to take account of the randomly sampled nature of the subjects and the fact that observations within subjects will be more closely correlated than those between subjects. But, treating subject ID as a random effect involves estimating a mean and variance for this variable. Since I have only 6 subjects (6 levels of this factor), is this enough to get an accurate characterization of the mean and variance? Does the fact that I have quite a few repeated measurements for each subject help in this regard (I don't see how it matters)? Finally, If I can't use subject ID as a random effect, will including it as a fixed effect allow me to control for the fact that I have repeated measures? + +Edit 1: I'd just like to clarify that when I say ""can I"" use subject ID as a random effect, I mean ""is it a good idea to"". I know I can fit the model with a factor with just 2 levels, but surely this would be in-defensible? I'm asking at what point does it become sensible to think about treating subjects as random effects? It seems like the literature advises that 5-6 levels is a lower bound. It seems to me that the estimates of the mean and variance of the random effect would not be very precise until there were 15+ factor levels.",added 541 characters in body,2012-09-20 21:50:02.310 +100597,32317,8208.0,2,,CC BY-SA 3.0,71de4ded-8258-483a-8e75-61f0856afb15,"I'm looking for a good Java library implementing several clustering algorithms. + +I'll have to cluster some programs execution traces and I still don't know which algorithms I am going to need, so I'd like to use a library providing lot of them and that makes it easy to swap algorithms. + +So far I had a look at Weka but I don't know whether there is a more complete library available I'm missing.",,2012-09-25 20:11:11.667 +100598,32317,8208.0,1,,CC BY-SA 3.0,71de4ded-8258-483a-8e75-61f0856afb15,Good clustering Java library,,2012-09-25 20:11:11.667 +100599,32317,8208.0,3,,CC BY-SA 3.0,71de4ded-8258-483a-8e75-61f0856afb15,,,2012-09-25 20:11:11.667 +126208,40104,,25,,,e86d4231-d4fd-4c35-9595-e506e3459a0d,,http://twitter.com/#!/StackStats/status/300410953932091394,2013-02-10 01:08:47.493 +100830,32388,2105.0,2,,CC BY-SA 3.0,42b1fb9f-22db-4de7-8c71-cfa1abdc70f5,"Related to question [here][1]. + +I've been trying to teach myself about Network Analysis, and developing DAG charts in R. Let's say that I have the following data. + + dat=data.frame(sold=c(0,0,0,1,0,1), won=c(1,0,0,1,0,1), bid=c(5,3,2,5,3,4)) + dat + +Given what I'm trying to analyze, I know that the DAG plot should be as follows: + + bid => won => sold + +However, when I utilize the bnlearn package to generate the plot, it comes out as follows. It just can't be correct, and should be in the opposite direction. 
+ + library(""bnlearn"") + library(""Rgraphviz"") + + bn.hc <- hc(dat, score = ""bic"") + graphviz.plot(bn.hc) + +![enter image description here][2] + +Can anyone help with diagnosing the problem in R? Is there something I'm missing in the code? or of my understanding of BN's? is this an issue w/ what I pass as the algorithm to use in 'score'? + + + [1]: http://stats.stackexchange.com/questions/37930/prediction-with-bayesian-networks-in-r + [2]: https://i.stack.imgur.com/Zg0rM.png",,2012-09-26 20:56:22.507 +100831,32388,2105.0,1,,CC BY-SA 3.0,42b1fb9f-22db-4de7-8c71-cfa1abdc70f5,Odd results from Bayesian Network in R,,2012-09-26 20:56:22.507 +100832,32388,2105.0,3,,CC BY-SA 3.0,42b1fb9f-22db-4de7-8c71-cfa1abdc70f5,,,2012-09-26 20:56:22.507 +100835,32388,2105.0,5,,CC BY-SA 3.0,0a641658-e5e9-46c6-8f45-c8e406626a5c,"Related to question [here][1]. + +I've been trying to teach myself about Network Analysis, and developing DAG charts in R. Let's say that I have the following data. + + dat=data.frame(sold=c(0,0,0,1,0,1), won=c(1,0,0,1,0,1), bid=c(5,3,2,5,3,4)) + dat + +Given what I'm trying to analyze, I know that the DAG plot should be as follows: + + bid => won => sold + +However, when I utilize the bnlearn package to generate the plot, it comes out as follows. It just can't be correct, and should be in the opposite direction. + + library(""bnlearn"") + library(""Rgraphviz"") + + bn.hc <- hc(dat, score = ""bic"") + graphviz.plot(bn.hc) + +![enter image description here][2] + +Now, I know that's just the data that I provided it to learn on, but I've messed around with the variable values, and it never turns our the way it should. Basically, a bid should determine whether you win, and whether you win should determine whether you can sell it. Just doesn't make sense. + +Isn't there some way to specify what variable is the response variable? + +Can anyone help with diagnosing the problem in R? Is there something I'm missing in the code? or of my understanding of BN's? is this an issue w/ what I pass as the algorithm to use in 'score'? + + + [1]: http://stats.stackexchange.com/questions/37930/prediction-with-bayesian-networks-in-r + [2]: https://i.stack.imgur.com/Zg0rM.png",added 296 characters in body,2012-09-26 21:08:45.927 +100837,32388,2105.0,5,,CC BY-SA 3.0,2d8c527c-d569-4ec4-a9e9-17b38f669605,"Related to question [here][1]. + +I've been trying to teach myself about Network Analysis, and developing DAG charts in R. Let's say that I have the following data. + + dat=data.frame(sold=c(0,0,0,1,0,1), won=c(1,0,0,1,0,1), bid=c(5,3,2,5,3,4)) + dat + +Given what I'm trying to analyze, I know that the DAG plot should be as follows: + + bid => won => sold + +However, when I utilize the bnlearn package to generate the plot, it comes out as follows. It just can't be correct, and should be in the opposite direction. + + library(""bnlearn"") + library(""Rgraphviz"") + + bn.hc <- hc(dat, score = ""bic"") + graphviz.plot(bn.hc) + +![enter image description here][2] + +Now, I know that's just the data that I provided it to learn on, but I've messed around with the variable values, and it never turns our the way it should. Basically, a bid should determine whether you win, and whether you win should determine whether you can sell it. Just doesn't make sense. + +Isn't there some way to specify what variable is the response variable? In my case, the response variable should be sold, and there be no arcs from sold to another node. + +Can anyone help with diagnosing the problem in R? 
Is there something I'm missing in the code? or of my understanding of BN's? is this an issue w/ what I pass as the algorithm to use in 'score'? + + + [1]: http://stats.stackexchange.com/questions/37930/prediction-with-bayesian-networks-in-r + [2]: https://i.stack.imgur.com/Zg0rM.png",added 98 characters in body,2012-09-26 21:14:48.620 +100843,32388,2105.0,5,,CC BY-SA 3.0,2e452461-f36b-4c20-81a9-c47932851467,"Related to question [here][1]. + +I've been trying to teach myself about Network Analysis, and developing DAG charts in R. Let's say that I have the following data. + + dat=data.frame(sold=c(0,0,0,1,0,1), won=c(1,0,0,1,0,1), bid=c(5,3,2,5,3,4)) + dat + +Given what I'm trying to analyze, I know that the DAG plot should be as follows: + + bid => won => sold + +However, when I utilize the bnlearn package to generate the plot, it comes out as follows. It just can't be correct, and should be in the opposite direction. + + library(""bnlearn"") + library(""Rgraphviz"") + + bn.hc <- hc(dat, score = ""bic"") + graphviz.plot(bn.hc) + +![enter image description here][2] + +Now, I know that's just the data that I provided it to learn on, but I've messed around with the variable values, and it never turns our the way it should. Basically, a bid should determine whether you win, and whether you win should determine whether you can sell it. Just doesn't make sense. + +Isn't there some way to specify what variable is the response variable? In my case, the response variable should be sold, and there should be no arcs from sold to another node. + +Can anyone help with diagnosing the problem in R? Is there something I'm missing in the code? or of my understanding of BN's? is this an issue w/ what I pass as the algorithm to use in 'score'? + + + [1]: http://stats.stackexchange.com/questions/37930/prediction-with-bayesian-networks-in-r + [2]: https://i.stack.imgur.com/Zg0rM.png",added 7 characters in body,2012-09-26 21:23:34.353 +100858,32388,,4,user88,CC BY-SA 3.0,eb569b4d-70c4-4c88-9247-b279f2edf766,Odd results from Bayesian network in R,edited title,2012-09-26 23:02:24.670 +104633,33598,11643.0,3,,CC BY-SA 3.0,428994e0-0b5b-442d-9427-ed2c06426e4a,,,2012-10-16 09:30:00.727 +104632,33598,11643.0,1,,CC BY-SA 3.0,428994e0-0b5b-442d-9427-ed2c06426e4a,How to identify structural change using a Chow test on Eviews,,2012-10-16 09:30:00.727 +104631,33598,11643.0,2,,CC BY-SA 3.0,428994e0-0b5b-442d-9427-ed2c06426e4a,"I have this little problem and I would appreciate some help. + +As part of my master thesis, I have to identify a trend in a univariate (GDP) time series for different countries. I have to separate the trend and the stochastic element in it for each country. + +I have managed to do so by doing: + +variable c @trend // for each country. + +And then running a AR(1) on the residuals // for each country. + + +However, now I need to identify structural breaks in one of these countries. I've been reading and searching all over the internet and books and I've found that the test most people use to identify these structural changes is the Chow Test. + +I know how to run the test, but I have't been able to figure out how to interpret the results, and decide whether there is a structural break or not. + +Here there is an example of the results: + + + https://i.stack.imgur.com/1kCqG.jpg + + +What puzzles me the most is the fact that, regardless the point I choose to break the series, I always get + +Prob. F(2,47) 0.0016 //or any very significant value, with the same degrees of freedom. 
+ + +Can someone please help me understand how I should interpret these results in order to identify where the breaks lie? + +Thank you! + +Hernan. + + +",,2012-10-16 09:30:00.727 +104634,33598,,5,user10525,CC BY-SA 3.0,c8b4acc1-5c74-4c86-bb89-b20e3917ffa3,"I have this little problem and I would appreciate some help. + +As part of my master thesis, I have to identify a trend in a univariate (GDP) time series for different countries. I have to separate the trend and the stochastic element in it for each country. + +I have managed to do so by doing: + +variable c @trend // for each country. + +And then running a AR(1) on the residuals // for each country. + + +However, now I need to identify structural breaks in one of these countries. I've been reading and searching all over the internet and books and I've found that the test most people use to identify these structural changes is the Chow Test. + +I know how to run the test, but I have't been able to figure out how to interpret the results, and decide whether there is a structural break or not. + +Here there is an example of the results: + + +![enter image description here][1] + + +What puzzles me the most is the fact that, regardless the point I choose to break the series, I always get + +Prob. F(2,47) 0.0016 //or any very significant value, with the same degrees of freedom. + + +Can someone please help me understand how I should interpret these results in order to identify where the breaks lie? + + + [1]: https://i.stack.imgur.com/RL9Lz.jpg",Attach image and related tag,2012-10-16 09:32:48.907 +106474,34166,668.0,3,,CC BY-SA 3.0,02e02bfa-e679-43b0-add7-da3c37f51507,,,2012-10-25 20:10:18.553 +106477,34166,668.0,2,,CC BY-SA 3.0,02e02bfa-e679-43b0-add7-da3c37f51507,"##The situation## +Some researchers would like to put you to sleep. Depending on the secret toss of a fair coin, they will briefly awaken you either once (Heads) or twice (Tails). After each waking, they will put you back to sleep with a drug that makes you forget that awakening. When you are awakened, to what degree should *you* believe that the outcome of the coin toss was Heads? + +*(OK, maybe you don’t want to be the subject of this experiment! Suppose instead that Sleeping Beauty (SB) agrees to it (with the full approval of the Magic Kingdom’s Institutional Review Board, of course). She’s about to go to sleep for one hundred years, so what are one or two more days, anyway?)* + +![Maxfield Parrish illustration][1] + +*[Detail of a [Maxfield Parrish](http://en.wikipedia.org/wiki/Maxfield_Parrish) illustration.]* + +##Are you a Halfer or a Thirder?## + +**The Halfer position.** Simple! The coin is fair--and SB knows it--so she should believe there's a one-half chance of heads. + +**The Thirder position.** Were this experiment to be repeated many times, then the coin will be heads only one third of the time SB is awakened. Her probability for heads will be one third. + +###Thirders have a problem### + +Most, but not all, people who have written about this are thirders. But: + +* On Sunday evening, just before SB falls asleep, she must believe the chance of heads is one-half: that’s what it means to be a fair coin. + +* Whenever SB awakens, *she has learned absolutely nothing she did not know Sunday night.* What rational argument can she give, then, for stating that her belief in heads is now one-third and not one-half? + +##Some attempted explanations## + +* SB would necessarily lose money if she were to bet on heads with any odds other than 1/3. 
(Vineberg, *inter alios*) + +* One-half really is correct: just use the Everettian “many-worlds” interpretation of Quantum Mechanics! (Lewis). + +* SB updates her belief based on self-perception of her “temporal location” in the world. (Elga, *i.a.*) + +* SB is confused: “[It] seems more plausible to say that her epistemic state upon waking up should not include a definite degree of belief in heads. … The real issue is how one deals with known, unavoidable, cognitive malfunction.” [Arntzenius] + +------ + +##The question## + +**Accounting for what has already been written on this subject** (see the references as well as a [previous post](http://stats.stackexchange.com/a/23812)), **how can this paradox be resolved in a statistically rigorous way?** Is this even possible? + +----- + +##References## + +**Arntzenius, Frank** (2002). [*Reflections on Sleeping Beauty*](**Arntzenius, Frank** (2002). *Reflections on Sleeping Beauty*. Analysis 62.1 pp 53-62.). Analysis 62.1 pp 53-62. + +**Bradley, DJ** (2010). [*Confirmation in a Branching World: The Everett Interpretation and Sleeping Beauty*](http://philpapers.org/archive/BRACIB.1.pdf). Brit. J. Phil. Sci. 0 (2010), 1–21. + +Elga, Adam (2000). Self-locating belief and the Sleeping Beauty Problem. Analysis 60 pp 143-7. + +**Franceschi, Paul** (2005). [*Sleeping Beauty and the Problem of World Reduction*](http://philsci-archive.pitt.edu/2175/1/sb-en.pdf). Preprint. + +**Groisman, Berry** (2007). [*The end of Sleeping Beauty’s nightmare*](http://philsci-archive.pitt.edu/3624/1/SB_b.groisman_last.pdf). Preprint. + +**Lewis, D** (2001). *Sleeping Beauty: reply to Elga*. Analysis 61.3 pp 171-6. + +**Papineau, David** and **Victor Dura-Vila** (2008). *A Thirder and an Everettian: a reply to Lewis’s ‘Quantum Sleeping Beauty’*. + +**Pust, Joel** (2008). *Horgan on Sleeping Beauty*. Synthese 160 pp 97-101. + +**Vineberg, Susan** (undated, perhaps 2003). *Beauty’s Cautionary Tale*. + + + [1]: https://i.stack.imgur.com/zLmrR.png",,2012-10-25 20:10:18.553 +106475,34166,668.0,1,,CC BY-SA 3.0,02e02bfa-e679-43b0-add7-da3c37f51507,The Sleeping Beauty Paradox,,2012-10-25 20:10:18.553 +106482,34166,,25,,,eaf12d0a-d02a-48dd-9879-1e96591b3fdd,,http://twitter.com/#!/StackStats/status/261574473898139648,2012-10-25 21:06:29.000 +106563,34166,668.0,5,,CC BY-SA 3.0,2dbe05fd-1fd1-4c51-82a5-a640314ac466,"###The situation### +Some researchers would like to put you to sleep. Depending on the secret toss of a fair coin, they will briefly awaken you either once (Heads) or twice (Tails). After each waking, they will put you back to sleep with a drug that makes you forget that awakening. When you are awakened, to what degree should *you* believe that the outcome of the coin toss was Heads? + +*(OK, maybe you don’t want to be the subject of this experiment! Suppose instead that Sleeping Beauty (SB) agrees to it (with the full approval of the Magic Kingdom’s Institutional Review Board, of course). She’s about to go to sleep for one hundred years, so what are one or two more days, anyway?)* + +![Maxfield Parrish illustration][1] + +*[Detail of a [Maxfield Parrish](http://en.wikipedia.org/wiki/Maxfield_Parrish) illustration.]* + +###Are you a Halfer or a Thirder?### + +**The Halfer position.** Simple! The coin is fair--and SB knows it--so she should believe there's a one-half chance of heads. + +**The Thirder position.** Were this experiment to be repeated many times, then the coin will be heads only one third of the time SB is awakened. Her probability for heads will be one third. 
+ +###Thirders have a problem### + +Most, but not all, people who have written about this are thirders. But: + +* On Sunday evening, just before SB falls asleep, she must believe the chance of heads is one-half: that’s what it means to be a fair coin. + +* Whenever SB awakens, *she has learned absolutely nothing she did not know Sunday night.* What rational argument can she give, then, for stating that her belief in heads is now one-third and not one-half? + +###Some attempted explanations### + +* SB would necessarily lose money if she were to bet on heads with any odds other than 1/3. (Vineberg, *inter alios*) + +* One-half really is correct: just use the Everettian “many-worlds” interpretation of Quantum Mechanics! (Lewis). + +* SB updates her belief based on self-perception of her “temporal location” in the world. (Elga, *i.a.*) + +* SB is confused: “[It] seems more plausible to say that her epistemic state upon waking up should not include a definite degree of belief in heads. … The real issue is how one deals with known, unavoidable, cognitive malfunction.” [Arntzenius] + +------ + +###The question### + +Accounting for what has already been written on this subject (see the references as well as a [previous post](http://stats.stackexchange.com/a/23812)), how can this paradox be resolved in a statistically rigorous way? Is this even possible? + +----- + +###References### + +Arntzenius, Frank (2002). [*Reflections on Sleeping Beauty*](http://www.joelvelasco.net/teaching/3865/arntzenius%20-%20reflections%20on%20sleeping%20beauty.pdf) Analysis 62.1 pp 53-62. + +Bradley, DJ (2010). [*Confirmation in a Branching World: The Everett Interpretation and Sleeping Beauty*](http://philpapers.org/archive/BRACIB.1.pdf). Brit. J. Phil. Sci. 0 (2010), 1–21. + +Elga, Adam (2000). Self-locating belief and the Sleeping Beauty Problem. Analysis 60 pp 143-7. + +Franceschi, Paul (2005). [*Sleeping Beauty and the Problem of World Reduction*](http://philsci-archive.pitt.edu/2175/1/sb-en.pdf). Preprint. + +Groisman, Berry (2007). [*The end of Sleeping Beauty’s nightmare*](http://philsci-archive.pitt.edu/3624/1/SB_b.groisman_last.pdf). Preprint. + +Lewis, D (2001). *Sleeping Beauty: reply to Elga*. Analysis 61.3 pp 171-6. + +Papineau, David and Victor Dura-Vila (2008). *A Thirder and an Everettian: a reply to Lewis’s ‘Quantum Sleeping Beauty’*. + +Pust, Joel (2008). *Horgan on Sleeping Beauty*. Synthese 160 pp 97-101. + +Vineberg, Susan (undated, perhaps 2003). *Beauty’s Cautionary Tale*. + + + [1]: https://i.stack.imgur.com/zLmrR.png",Formatting,2012-10-26 11:47:02.827 +109645,35097,9886.0,3,,CC BY-SA 3.0,2bd8d41b-4f7f-47e8-a585-249f9544130b,,,2012-11-11 15:56:03.667 +109644,35097,9886.0,1,,CC BY-SA 3.0,2bd8d41b-4f7f-47e8-a585-249f9544130b,What's wrong with last xkcd (Frequentists vs. Bayesians)?,,2012-11-11 15:56:03.667 +109643,35097,9886.0,2,,CC BY-SA 3.0,2bd8d41b-4f7f-47e8-a585-249f9544130b,"![xkcd comic number 1132][2] + +[This xkcd comic (Frequentists vs. Bayesians)][1] makes fun of a frequentist statistician who derives an obviously wrong result. + +However it seems to me that his reasoning is actually correct in the sense that it follows the standard frequentist methodology. + +So my question is ""does he correctly apply the frequentist methodology?"" + + - If no: what would be a correct frequentist inference in this scenario? How to integrate ""prior knowledge"" about the sun stability in the frequentist methodology? + - If yes: wtf? 
;-) + + [1]: http://xkcd.com/1132 + [2]: https://i.stack.imgur.com/tStr4.png",,2012-11-11 15:56:03.667 +109649,35097,,25,,,c297e61f-84b2-430d-8662-6c6a70b5354f,,http://twitter.com/#!/StackStats/status/267657811683594240,2012-11-11 15:59:07.593 +109743,35097,,4,user88,CC BY-SA 3.0,ac6424b2-7acc-49e5-9954-e3a1578becd9,What's wrong with XKCD's Frequentists vs. Bayesians comic?,edited title,2012-11-12 00:38:54.837 +109862,35160,12273.0,2,,CC BY-SA 3.0,ec3ce893-d398-4833-91c0-9405cb0bb83a,"The main issue is that the first experiment (Sun gone nova) is not repeatable, which makes it highly unsuitable for frequentist methodology that interprets probability as estimate of how frequent an event is giving that we can repeat the experiment many times. In contrast, bayesian probability is interpreted as our degree of belief giving all available prior knowledge, making it suitable for common sense reasoning about one-time events. The dice throw experiment is repeatable, but I find it very unlikely that any frequentist would intentionally ignore the influence of the first experiment and be so confident in significance of the obtained results. + +Although it seems that author mocks frequentist reliance on repeatable experiments and their distrust of priors, giving the unsuitability of the experimental setup to the frequentist methodology I would say that real theme of this comic is not frequentist methodology but blind following of unsuitable methodology in general. Whether it's funny or not is up to you (for me it is) but I think it more misleads than clarifies the differences between the two approaches. +",,2012-11-12 16:27:57.197 +110145,35249,11884.0,3,,CC BY-SA 3.0,fdca3fc8-044c-401f-b119-72b321271ed7,,,2012-11-13 22:29:58.533 +110143,35249,11884.0,2,,CC BY-SA 3.0,fdca3fc8-044c-401f-b119-72b321271ed7,"First of all, sorry for the strange title, I had no idea how to describe my problem better. + +My issue is the following, I think it is pretty much limited to geosciences. + +I have several properties for every sample, which are divided by depth. + +For instance: + +(ID var1_0-20cm, var1_20-50cm, var1_50-100cm, var2_0-20cm, var2_20-50, ....) + +(1, 2.3, 2.1, 2.6, 10.5, 10.9, 15.0,...) +(2, 2.0, 1.1, 1.1, 5.5, 5.9, 5.0,...) +(3, 1.0, 0.0, 0.0, 3.5, 1.9, 1.0,...) + +Basically these are geological layers going from surface down to 100 cm depth. +I am trying to decrease the number of variables, either with PCA or factor analysis. +The issue is, that I would like to handle properties together, no matter what the depth is. + +(For instance I do not want to get rid of a layer in between the surface and the bottom layer.) + +Is there any way to handle them together, or group them for PCA or whatever. I tried to find some relevant information, but I think the problem is limited to a small portion of the science(maybe I am wrong), so I could not find anything usefull. + +Or any good idea or solution would be amazing and highly appreciated :) + +I hope I described the issue well :) I am not native english so sorry for the mistakes :) + +Thanks + +-v",,2012-11-13 22:29:58.533 +110144,35249,11884.0,1,,CC BY-SA 3.0,fdca3fc8-044c-401f-b119-72b321271ed7,Handling variables subset based on depth for PCA,,2012-11-13 22:29:58.533 +110151,35249,11884.0,4,,CC BY-SA 3.0,1281397b-0e37-44c9-8182-8cb81ab2c001,Reducing no of variables subsetted based on depth for PCA,edited title,2012-11-13 23:27:08.820 +113396,13060,1805.0,5,,CC BY-SA 3.0,313325ca-afe3-4d2f-9287-5a8ccfc6dc6b,"Check out the [digitize][1] package for [R][2]. 
Its designed to solve exactly this sort of problem. + +/edit: No longer on CRAN, but you can still [get it from R-Forge][3]. + + + [1]: http://cran.r-project.org/web/packages/digitize/index.html + [2]: http://cran.r-project.org/ + [3]: https://r-forge.r-project.org/R/?group_id=594",added 128 characters in body,2012-11-30 21:49:31.210 +116360,37182,11446.0,1,,CC BY-SA 3.0,361c0f6d-7473-4840-bda3-25d69bd7da92,How to specify in r spatial covariance structure similar to SAS sp(pow) in a marginal model?,,2012-12-14 15:06:25.837 +116361,37182,11446.0,3,,CC BY-SA 3.0,361c0f6d-7473-4840-bda3-25d69bd7da92,,,2012-12-14 15:06:25.837 +126224,40121,14728.0,1,,CC BY-SA 3.0,08a3103d-cd12-49f2-9aa8-2b4d408b4ec8,"In JMP, when compare means, how is comparison circle radius calculated?",,2013-02-10 03:03:22.580 +126225,40121,14728.0,3,,CC BY-SA 3.0,08a3103d-cd12-49f2-9aa8-2b4d408b4ec8,,,2013-02-10 03:03:22.580 +136699,43458,16452.0,1,,CC BY-SA 3.0,7b50b83b-d2fd-46f6-abcb-83f47dbc465c,How to check if removing a sample makes a difference in mean and stdev values?,,2013-04-02 14:45:37.780 +116359,37182,11446.0,2,,CC BY-SA 3.0,361c0f6d-7473-4840-bda3-25d69bd7da92,"I'm currently translating existing code from SAS to R. I have this SAS code : + + Proc mixed data=ALBI; + class NUM_PAT; + model CD4t=T /s ; + repeated / sub=NUM_PAT type=sp(pow)(T); + +The SAS spatial power covariance structure is useful for unequally spaced longitudinal measurements where the correlations decline as a function of time (as shown by the picture below). + +![Spatial Power Covariance Structure][1] + + + [1]: https://i.stack.imgur.com/s7RnV.png + +I think I have to use gls( ) from {nlme} since I don't have any random effects. My guess is that I need to use corSpatial, but I don't know how. + +Thanks for any help. + +",,2012-12-14 15:06:25.837 +116378,37182,,25,,,1ace1c95-3a94-42f6-a597-07471ea9f2e1,,http://twitter.com/#!/StackStats/status/279617248447057921,2012-12-14 16:02:01.877 +118326,37748,13370.0,1,,CC BY-SA 3.0,f4028a8c-e2e0-40a3-b550-6a026eead2e0,What is the computational complexity of the EM algorithm?,,2012-12-27 07:48:43.813 +118325,37748,13370.0,3,,CC BY-SA 3.0,f4028a8c-e2e0-40a3-b550-6a026eead2e0,,,2012-12-27 07:48:43.813 +118324,37748,13370.0,2,,CC BY-SA 3.0,f4028a8c-e2e0-40a3-b550-6a026eead2e0,"In general, and more specifically for Bernoulli mixture model (aka Latent Class Analysis).",,2012-12-27 07:48:43.813 +118612,37819,12314.0,2,,CC BY-SA 3.0,f162557b-d4e1-4221-a537-6fd720ed017a,"Is it okay to feed $I(0)$ variables into the Johansen procedure? I've read three sources that seem to state that this is not what you're supposed to do. However, whenever I've done this, I notice that $\Pi$ is full rank and so it leads me to a VAR and therefore I don't see any problem with this. ",,2012-12-29 16:08:10.207 +118613,37819,12314.0,3,,CC BY-SA 3.0,f162557b-d4e1-4221-a537-6fd720ed017a,,,2012-12-29 16:08:10.207 +118614,37819,12314.0,1,,CC BY-SA 3.0,f162557b-d4e1-4221-a537-6fd720ed017a,Putting stationary variables through Johansen procedure,,2012-12-29 16:08:10.207 +118646,33598,,25,,,3fe604be-c89b-4dc9-9cfb-54e04cccd200,,http://twitter.com/#!/StackStats/status/285189143577911296,2012-12-30 01:02:45.163 +119017,37182,11446.0,5,,CC BY-SA 3.0,59968bd8-832b-4092-9e9b-3d79e50fc875,"I'm currently translating existing code from SAS to R. I'm working on longitudinal data (CD4 count over time). 
I have the following SAS code : + + Proc mixed data=df; + class NUM_PAT; + model CD4t=T /s ; + repeated / sub=NUM_PAT type=sp(pow)(T); + +The SAS spatial power covariance structure is useful for unequally spaced longitudinal measurements where the correlations decline as a function of time (as shown by the picture below). +![Spatial Power Covariance Structure][1] + + + [1]: https://i.stack.imgur.com/s7RnV.png + +I think I have to use gls( ) from {nlme} since I don't have any random effects. As R 'only' provides ""spherical"", ""exponential"", ""gaussian"", ""linear"", and ""rational"" as correlation spatial structures, my guess is that I need to use corSpatial plus a weights argument. + +I tried the following code, but it doesn't work : + + gls(CD4t~T, data=df, na.action = (na.omit), method = ""ML"", + corr=corCompSymm(form=~1|NUM_PAT), weighhts=varConstPower(form=~1|T)) + +What am I doing wrong ? + +Thanks for any help. +",added 404 characters in body,2013-01-02 13:47:28.253 +119166,37182,11446.0,6,,CC BY-SA 3.0,685798ad-4d7e-420b-95cd-68bb7a7fbc39,,edited tags,2013-01-03 14:44:37.783 +119184,37981,13403.0,3,,CC BY-SA 3.0,4740b865-675b-4493-950c-7b499e06cf3e,,,2013-01-03 16:00:17.050 +119185,37981,13403.0,2,,CC BY-SA 3.0,4740b865-675b-4493-950c-7b499e06cf3e,"Happy New Year Everyone, + +I've tried looking this up, but haven't found a good answer. It is a question of definitions. + +I would like to describe the ""peakedness"" and tail ""heaviness"" of several skewed probability density functions. + +The features I want to describe, would they be called ""kurtosis""? I've only seen the word ""kurtosis"" used for symmetric distributions? + +Thank you +",,2013-01-03 16:00:17.050 +119183,37981,13403.0,1,,CC BY-SA 3.0,4740b865-675b-4493-950c-7b499e06cf3e,kurtosis and skewness - descriptive statistics,,2013-01-03 16:00:17.050 +119192,37981,5237.0,5,,CC BY-SA 3.0,f2639eda-6477-49d6-848a-4dc25b1ba6f2,"I would like to describe the ""peakedness"" and tail ""heaviness"" of several skewed probability density functions. + +The features I want to describe, would they be called ""kurtosis""? I've only seen the word ""kurtosis"" used for symmetric distributions?",removed peripheral info,2013-01-03 16:36:19.167 +119284,37981,,25,,,68ceccc1-0525-4b45-bfe2-19923ce67043,,http://twitter.com/#!/StackStats/status/287001087658057728,2013-01-04 01:02:46.207 +123965,33598,,4,user88,CC BY-SA 3.0,89d9b1fa-41b8-463a-9318-77eb8962d615,How to identify structural change using a Chow test on Eviews?,edited title,2013-01-29 08:33:00.967 +125943,40030,1790.0,3,,CC BY-SA 3.0,592c5e1a-1ae8-4ae7-8c1c-3f231cb84663,,,2013-02-07 20:58:31.927 +125941,40030,1790.0,1,,CC BY-SA 3.0,592c5e1a-1ae8-4ae7-8c1c-3f231cb84663,Understanding stratified CV,,2013-02-07 20:58:31.927 +125942,40030,1790.0,2,,CC BY-SA 3.0,592c5e1a-1ae8-4ae7-8c1c-3f231cb84663,"What is the difference between stratified CV and CV? + +Wikipedia says: + +> In stratified k-fold cross-validation, the folds are selected so that +> the mean response value is approximately equal in all the folds. In +> the case of a dichotomous classification, this means that each fold +> contains roughly the same proportions of the two types of class +> labels. + +But I am still confused. + +1. What does `mean response value` mean in this context? +2. Why is # 1 important? +3. How does one achieve #1 in practice? 
+ +",,2013-02-07 20:58:31.927 +126179,40104,14684.0,2,,CC BY-SA 3.0,fd79b115-4bed-4397-9e4c-70d83be1859a,"Using wikipedia I found a way to calculate the probability mass function resulting from the sum of two Poisson random variables. However, I think that the approach I have is wrong. + +Let $X_1, X_2$ be two independent Poisson random variables with mean $\lambda_1, \lambda_2$, and + +$S_2 = a_1 X_1+a_2 X_2,$ + +where the $a_1$ and $a_2$ are constants, then the probability-generating function of $S_2$ is given by + +$G_{S_2}(z) = \operatorname{E}(z^{S_2})= \operatorname{E}(z^{a_1 X_1+a_2 X_2}) G_{X_1}(z^{a_1})G_{X_2}(z^{a_2}).$ + +Now, using the fact that the probability-generating function for a Poisson random variable is + +$G_{X_i}(z) = \textrm{e}^{\lambda_i(z - 1)}, $ + +we can write the probability-generating function of the sum of the two independent Poisson random variables as + +$G_{S_2}(z) = \textrm{e}^{\lambda_1(z^{a_1} - 1)}\textrm{e}^{\lambda_2(z^{a_2} - 1)}$ +$= \textrm{e}^{\lambda_1(z^{a_1} - 1)+\lambda+2(z^{a_2} - 1)}.$ + + +It seems that the probability mass function of $S_2$ is recovered by taking derivatives of $G_{S_2}(z)$ +$\operatorname{Pr}(S_2 = k) = \frac{G_{S_2}^{(k)}(0)}{k!}$, where $G_{S_2}^{(k)} = \frac{d^k G_{S_2}(z)}{ d z^k}.$ + +Is this is correct? I have the feeling I cannot just take the derivative to obtain the probability mass function, because of the constants $a_1$ and $a_2$. Is this right? Is there an alternative approach? + +If this is correct can I now obtain an approximation of the cumulative distribution by truncating the infinite sum over all k? + + +Please help! + +Michel",,2013-02-09 19:31:13.290 +126178,40104,14684.0,3,,CC BY-SA 3.0,fd79b115-4bed-4397-9e4c-70d83be1859a,,,2013-02-09 19:31:13.290 +126226,40104,14684.0,5,,CC BY-SA 3.0,46622481-6c90-4396-b656-7125761b0a4c,"Using wikipedia I found a way to calculate the probability mass function resulting from the sum of two Poisson random variables. However, I think that the approach I have is wrong. + +Let $X_1, X_2$ be two independent Poisson random variables with mean $\lambda_1, \lambda_2$, and + +$S_2 = a_1 X_1+a_2 X_2,$ + +where the $a_1$ and $a_2$ are constants, then the probability-generating function of $S_2$ is given by + +$G_{S_2}(z) = \operatorname{E}(z^{S_2})= \operatorname{E}(z^{a_1 X_1+a_2 X_2}) G_{X_1}(z^{a_1})G_{X_2}(z^{a_2}).$ + +Now, using the fact that the probability-generating function for a Poisson random variable is + +$G_{X_i}(z) = \textrm{e}^{\lambda_i(z - 1)}, $ + +we can write the probability-generating function of the sum of the two independent Poisson random variables as + +$G_{S_2}(z) = \textrm{e}^{\lambda_1(z^{a_1} - 1)}\textrm{e}^{\lambda_2(z^{a_2} - 1)}$ +$= \textrm{e}^{\lambda_1(z^{a_1} - 1)+\lambda_2(z^{a_2} - 1)}.$ + + +It seems that the probability mass function of $S_2$ is recovered by taking derivatives of $G_{S_2}(z)$ +$\operatorname{Pr}(S_2 = k) = \frac{G_{S_2}^{(k)}(0)}{k!}$, where $G_{S_2}^{(k)} = \frac{d^k G_{S_2}(z)}{ d z^k}.$ + +Is this is correct? I have the feeling I cannot just take the derivative to obtain the probability mass function, because of the constants $a_1$ and $a_2$. Is this right? Is there an alternative approach? + +If this is correct can I now obtain an approximation of the cumulative distribution by truncating the infinite sum over all k? + + +Please help! 
+ +Michel",changed \lambda+2 into \lambda_2,2013-02-10 03:12:19.067 +126237,40121,14728.0,5,,CC BY-SA 3.0,9e6cf058-4c95-4a26-befa-68bdadf5c7a4,"I'm trying to compare several sets of experiment data, by comparing means. I read there are several different tests such as `Each Pair, Student’s t` and `All Pairs, Tukey HSD`, which give different circles of different radius, an example shown below + +oops I can't post image so the link is [here][1] + +How are the circles defined? How do I calculate the radius? And is there a rule what test one should use for what kind of data? + + + [1]: https://i.stack.imgur.com/Ce579.gif",added 33 characters in body; edited tags; edited title,2013-02-10 05:58:03.727 +126238,40121,14728.0,4,,CC BY-SA 3.0,9e6cf058-4c95-4a26-befa-68bdadf5c7a4,"different tests for compare means, how to calculate comparison circle radius?",added 33 characters in body; edited tags; edited title,2013-02-10 05:58:03.727 +126239,40121,14728.0,6,,CC BY-SA 3.0,9e6cf058-4c95-4a26-befa-68bdadf5c7a4,,added 33 characters in body; edited tags; edited title,2013-02-10 05:58:03.727 +126266,40121,,5,user88,CC BY-SA 3.0,eddbf5fd-6c76-4c2d-86f0-dd9509456155,"I'm trying to compare several sets of experiment data, by comparing means. I read there are several different tests such as *Each Pair, Student’s t* and *All Pairs, Tukey HSD*, which give different circles of different radius, an example shown below + +![enter image description here][1] + +How are the circles defined? How do I calculate the radius? And is there a rule what test one should use for what kind of data? + + + [1]: https://i.stack.imgur.com/QCKE1.gif",deleted 14 characters in body; edited title,2013-02-10 12:16:41.200 +126267,40121,,4,user88,CC BY-SA 3.0,eddbf5fd-6c76-4c2d-86f0-dd9509456155,A test for comparing means,deleted 14 characters in body; edited title,2013-02-10 12:16:41.200 +126296,40121,,4,user88,CC BY-SA 3.0,b4a45ecc-788f-4df1-8e95-477e2d39e355,Comparing many means in JMP,edited tags; edited title,2013-02-10 19:53:22.873 +126297,40121,,6,user88,CC BY-SA 3.0,b4a45ecc-788f-4df1-8e95-477e2d39e355,,edited tags; edited title,2013-02-10 19:53:22.873 +128647,40859,15044.0,2,,CC BY-SA 3.0,549a71af-215b-44f7-9334-b477d57d3d74,"I was naively validating my binomial logit models by testing on a test dataset. I had randomly divided the available data (~2000 rows) into training (~1500) and validation (~500) datasets. + +I now read a post in another thread ( Frank Harrell) that causes me to question my approach: + + +> Data splitting is not very reliable unless you have more than 15,000 +> observations. In other words, if you split the data again, accuracy +> indexes will vary too much from what you obtained with the first +> split. + +How serious is this worry and what are ways around it? The OP speaks of ""resampling"" but not sure how that works here for validation. ",,2013-02-22 08:40:44.930 +128648,40859,15044.0,1,,CC BY-SA 3.0,549a71af-215b-44f7-9334-b477d57d3d74,Validation: Data splitting into training vs. test datasets,,2013-02-22 08:40:44.930 +128649,40859,15044.0,3,,CC BY-SA 3.0,549a71af-215b-44f7-9334-b477d57d3d74,,,2013-02-22 08:40:44.930 +128656,40859,,25,,,d6fedac7-1e29-4233-ab25-80aee0406c2c,,http://twitter.com/#!/StackStats/status/304899083087261696,2013-02-22 10:23:00.777 +128677,40859,15044.0,5,,CC BY-SA 3.0,ab3dfcae-f22f-475e-a51e-cf4620a282f5,"I was naively validating my binomial logit models by testing on a test dataset. 
I had randomly divided the available data (~2000 rows) into training (~1500) and validation (~500) datasets. + +I now read a post in another thread ( Frank Harrell) that causes me to question my approach: + + +> Data splitting is not very reliable unless you have more than 15,000 +> observations. In other words, if you split the data again, accuracy +> indexes will vary too much from what you obtained with the first +> split. + +How serious is this worry and what are ways around it? The OP speaks of ""resampling"" but not sure how that works here for validation. + +Edit: Adding context as per @Bernhard's comment below: + +http://stats.stackexchange.com/questions/15618/comparing-logistic-regression-models",added 145 characters in body,2013-02-22 11:38:06.397 +128681,40870,1923.0,1,,CC BY-SA 3.0,77f91c26-5e31-4057-ad99-c1ee2ac19234,FRD calculation in target-decoy matching context,,2013-02-22 12:56:28.503 +128682,40870,1923.0,3,,CC BY-SA 3.0,77f91c26-5e31-4057-ad99-c1ee2ac19234,,,2013-02-22 12:56:28.503 +128680,40870,1923.0,2,,CC BY-SA 3.0,77f91c26-5e31-4057-ad99-c1ee2ac19234,"A common strategy in mass spectrometry of biological molecules is to upload observed spectra to a server so that they can be matched to a LARGE database of theoretical spectra of known molecules (a.k.a. *target* database). In order to control for false positives, a *decoy* database consisting of incorrect/irrelevant spectra is used. + +I have been reading more into this subject and have come up some questions regarding the calculation of the FDR measure from this target-decoy strategy. The basic idea of the FDR value is very intuitive: + +$FDR = \frac{FP}{FP + TP}$ + +where FP and TP stands for false and true positives respectively. This makes perfect sense to me; if I'm trying to guess some peoples' names out of a phone book, and get 8 right and 2 wrong, I would have 2 out of 10 false guesses, and thus my false discovery rate would be 20%. + +However reading [this tutorial][1] on how this is done in large scale on the servers, I got introduced to two different calculations, depending on whether or not the *target* and *decoy* databases are concatenated (page 2). + + +I don't think that this is a typo as I found other occurrences * of the mysterious factor 2 in front of FP in scientific literature. However the motivation behind this is never explained (at least I couldn't find it). + +I would appreciate some insight on where this doubling comes from. Likewise I wonder whether or not FDR calculation this way **assumes** that the error rate for each spectra match is the same for the target database and decoy database (i.e. *assuming* that getting 25 decoy hits *implies* 25 target hits are also false positives). It's not really clear for me why the error rate has to be the same for the two databases. Any comments on this subject is also appreciated. 
+ +* one such reference is Elias et al Nature Methods - 2, 667 - 675 (2005) + + [1]: http://www.proteored.org/pme6/fdr_calculation_for_pme6.pdf + [2]: doi:10.1038/nmeth785",,2013-02-22 12:56:28.503 +128683,40870,1923.0,4,,CC BY-SA 3.0,5fa7b4b3-310d-4ecb-9eaf-cee08c9eb81b,False discovery rate calculation in target-decoy matching context,corrected spelling error in the title,2013-02-22 13:08:41.170 +129877,41244,15330.0,2,,CC BY-SA 3.0,511b413e-d9c0-4366-b124-bdc5e1c143fc,"Given N flips of the same coin and k occurences of 'heads', what is the probability distribution function of heads-probability?",,2013-03-01 05:09:26.123 +129876,41244,15330.0,1,,CC BY-SA 3.0,511b413e-d9c0-4366-b124-bdc5e1c143fc,Probability of heads in a biased coin,,2013-03-01 05:09:26.123 +129875,41244,15330.0,3,,CC BY-SA 3.0,511b413e-d9c0-4366-b124-bdc5e1c143fc,,,2013-03-01 05:09:26.123 +129878,41244,15330.0,5,,CC BY-SA 3.0,4267f8bb-bde3-4837-a942-67dc98101b4b,"Given N flips of the same coin resulting in k occurences of 'heads', what is the probability density function of heads-probability?",added 4 characters in body,2013-03-01 05:22:01.993 +129975,41244,15330.0,5,,CC BY-SA 3.0,08419f6a-88e7-453e-bc72-83ae8f11d8dc,"Given N flips of the same coin resulting in k occurences of 'heads', what is the probability density function of heads-probability of the coin?",added 12 characters in body,2013-03-01 20:14:36.900 +131931,41914,13918.0,1,,CC BY-SA 3.0,dd90e082-7edf-4155-b4dd-caff8a793247,Lewandowski algorithm demand forecasting,,2013-03-12 11:17:01.283 +131930,41914,13918.0,2,,CC BY-SA 3.0,dd90e082-7edf-4155-b4dd-caff8a793247,"I came across the Lewandowski method of demand forecasting in JDA Demand. Please help me understand at a high level the methodology it uses. I found a paper by Robert Hyndman titled +""A state space framework for automatic forecasting using exponential smoothing methods"" and it uses this method as one of methods they compare the algorithm to. Currently for us this a black box, we want to get some high level understand so that we can better fine tune the parameters they have provided as part of the software. It would be great if you can share some thoughts about the Lewandowski algorithm and point to some references that I could use for further research. +",,2013-03-12 11:17:01.283 +131956,41914,13918.0,5,,CC BY-SA 3.0,679db353-69e1-451f-a62f-e4b390b56067,"I came across the Lewandowski method of demand forecasting in JDA Demand. Please help me understand at a high level the methodology it uses. I found a paper by Robert Hyndman titled +""A state space framework for automatic forecasting using exponential smoothing methods"" and it uses this method as one of methods they compare their algorithm to in the paper. Currently for us this is a black box, we want to get some high level understanding so that we can better fine tune the parameters they have provided as part of the software. It would be great if you can share some thoughts about the Lewandowski algorithm and point to some references that I could use for further research. +",fixed grammar,2013-03-12 13:45:22.893 +133836,42513,15991.0,3,,CC BY-SA 3.0,d2af7f18-8d21-4ab5-8ea7-be61e96e4a0d,,,2013-03-19 21:23:49.697 +133834,42513,15991.0,1,,CC BY-SA 3.0,d2af7f18-8d21-4ab5-8ea7-be61e96e4a0d,Plotting mean in histogram,,2013-03-19 21:23:49.697 +133835,42513,15991.0,2,,CC BY-SA 3.0,d2af7f18-8d21-4ab5-8ea7-be61e96e4a0d,"I have a Really Stupid Question: +Is it ""okay"" to add a vertical line to a histogram to visualize the mean value? 
+It seems okay to me, but I've never seen this in textbooks and the likes, so I'm wondering if there's some sort of convention not to do that? The graph is for a term paper, I just want to make sure I don't accidentally break some super important unspoken stats rule. :)",,2013-03-19 21:23:49.697 +133842,42517,594.0,2,,CC BY-SA 3.0,d6d7588e-228d-407b-8f1f-e3d345964ead,"Of course, why not? + +![enter image description here][1] + +I've seen means, means plus or minus a standard deviation, various quantiles (like median, quartiles, 10th and 90th percentiles) all displayed in various ways. + +For example, instead of drawing a line right across the plot, you might mark information along the bottom of it. + + [1]: https://i.stack.imgur.com/xNNl9.png",,2013-03-19 21:39:28.337 +133845,42517,594.0,5,,CC BY-SA 3.0,61fdef87-78be-4467-8cc2-e23666081aa9,"Of course, why not? + +![histogram with mean][1] + +I've seen means, means plus or minus a standard deviation, various quantiles (like median, quartiles, 10th and 90th percentiles) all displayed in various ways. + +For example, instead of drawing a line right across the plot, you might mark information along the bottom of it - like so: + +![histogram with marginal boxplot][2] + +You just need to take care to explain what your plot consists of! (You'd want a better title and x-axis label than I used here, for starters. Plus an explanation in a figure caption explaining what you had marked on it.) + + [1]: https://i.stack.imgur.com/xNNl9.png + [2]: https://i.stack.imgur.com/0gvWk.png",added 154 characters in body,2013-03-19 21:51:28.330 +133846,42517,594.0,5,,CC BY-SA 3.0,43ee8aa1-9b0f-42ce-91c1-b0b45443ee17,"Of course, why not? + +![histogram with mean][1] + +Here's an example (one of dozens I found with a simple google search): + +![hist with mean and median][2] + +(Image source is is the measuring usability blog, [here](http://www.measuringusability.com/average-times.php).) + +I've seen means, means plus or minus a standard deviation, various quantiles (like median, quartiles, 10th and 90th percentiles) all displayed in various ways. + +Instead of drawing a line right across the plot, you might mark information along the bottom of it - like so: + +![histogram with marginal boxplot][3] + +You just need to take care to explain what your plot consists of! (You'd want a better title and x-axis label than I used here, for starters. Plus an explanation in a figure caption explaining what you had marked on it.) + + + [1]: https://i.stack.imgur.com/xNNl9.png + [2]: https://i.stack.imgur.com/fUqwi.jpg + [3]: https://i.stack.imgur.com/0gvWk.png",added 154 characters in body,2013-03-19 21:56:37.443 +133847,42517,594.0,5,,CC BY-SA 3.0,1d50281d-7afd-49cd-a461-3ee46545d298,"Of course, why not? + +![histogram with mean][1] + +Here's an example (one of dozens I found with a simple google search): + +![hist with mean and median][2] + +(Image source is is the measuring usability blog, [here](http://www.measuringusability.com/average-times.php).) + +I've seen means, means plus or minus a standard deviation, various quantiles (like median, quartiles, 10th and 90th percentiles) all displayed in various ways. + +Instead of drawing a line right across the plot, you might mark information along the bottom of it - like so: + +![histogram with marginal boxplot][3] + +There's an example (one of many) with a boxplot across the top instead of at the bottom, [here](https://www.soils.org/images/publications/aj/99/5/1366fig1.jpeg). 
+ +Sometimes people mark in the data: + +![histogram rugplot][4] + +Histograms are better with a little extra information - [they can be misleading on their own](http://stats.stackexchange.com/questions/51718/assessing-approximate-distribution-of-data-based-on-histogram/51753#51753) + + + +You just need to take care to explain what your plot consists of! (You'd want a better title and x-axis label than I used here, for starters. Plus an explanation in a figure caption explaining what you had marked on it.) + + + [1]: https://i.stack.imgur.com/xNNl9.png + [2]: https://i.stack.imgur.com/fUqwi.jpg + [3]: https://i.stack.imgur.com/0gvWk.png + [4]: https://i.stack.imgur.com/l8IT2.png",added 377 characters in body,2013-03-19 22:08:02.433 +136701,43458,16452.0,2,,CC BY-SA 3.0,7b50b83b-d2fd-46f6-abcb-83f47dbc465c,"I'd like to ask if someone could help me with the following problem: + +we have measured the same sample 5 times and we would like to check if there are significant differences in mean and stdev values if we use: + +- All 5 datapoints +- Only the last 4 datapoints +- Only the last 3 datapoints + +We have performed ANOVA analysis but we are not sure about the results because we might not have homocedasticity. + +Which tests would you do to investigate this issue? + +Thanks in advance for your help. ",,2013-04-02 14:45:37.780 +142853,45457,5237.0,5,,CC BY-SA 3.0,2aaeebd3-83b7-4d5c-a6df-09343d242bcb,"I have a the following time series + + + Price BrokerID 632 Behaviour BrokerID 680 Behaviour ...BrokerID XYZ Behaviour + + 5.6 IP SP + 5.7 BP IP + 5.8 SP BP + 5.83 IP SP + +where `IP` is idle position, `BP` is buying position, and `SP` is selling position. I want to use Broker behaviour as the known variable and price as the hidden variable and predict it using HMM. But my question is how to find the emission matrix between a character vector (broker behaviour) and price numeric vector? ","Added tag ""r"" according to question's comments",2013-04-29 20:18:56.040 +142852,45457,,24,,CC BY-SA 3.0,0baa5beb-9aaf-4cb5-9dd3-ea57ce80ee88,,Proposed by 22468 approved by -1 edit id of 3308,2013-04-29 20:18:56.040 +133848,42517,594.0,5,,CC BY-SA 3.0,3c147044-0097-40f6-b533-d98b7f5d55da,"Of course, why not? + +![histogram with mean][1] + +Here's an example (one of dozens I found with a simple google search): + +![hist with mean and median][2] + +(Image source is is the measuring usability blog, [here](http://www.measuringusability.com/average-times.php).) + +I've seen means, means plus or minus a standard deviation, various quantiles (like median, quartiles, 10th and 90th percentiles) all displayed in various ways. + +Instead of drawing a line right across the plot, you might mark information along the bottom of it - like so: + +![histogram with marginal boxplot][3] + +There's an example (one of many to be found) with a boxplot across the top instead of at the bottom, [here](https://www.soils.org/images/publications/aj/99/5/1366fig1.jpeg). + +Sometimes people mark in the data: + +![histogram rugplot][4] + +There's an example of this kind, done in STATA, on [this page](https://www.ctspedia.org/do/view/CTSpedia/BasicHistogramExamples) (see the third one [here](https://www.ctspedia.org/wiki/pub/CTSpedia/BasicHistogramExamples/pic2.png)) + +Histograms are better with a little extra information - [they can be misleading on their own](http://stats.stackexchange.com/questions/51718/assessing-approximate-distribution-of-data-based-on-histogram/51753#51753) + +(My plots are generated in R.) 
+ +You just need to take care to explain what your plot consists of! (You'd want a better title and x-axis label than I used here, for starters. Plus an explanation in a figure caption explaining what you had marked on it.) + + + [1]: https://i.stack.imgur.com/xNNl9.png + [2]: https://i.stack.imgur.com/fUqwi.jpg + [3]: https://i.stack.imgur.com/0gvWk.png + [4]: https://i.stack.imgur.com/l8IT2.png",added 377 characters in body,2013-03-19 22:13:47.097 +133854,42513,,25,,,469cb11c-f850-48a3-8f45-e98f68a54194,,http://twitter.com/#!/StackStats/status/314145220960858114,2013-03-19 22:43:51.923 +133944,42517,594.0,5,,CC BY-SA 3.0,d4e679e2-99bb-46bf-8a3b-384bd82936fd,"Of course, why not? + +![histogram with mean][1] + +Here's an example (one of dozens I found with a simple google search): + +![hist with mean and median][2] + +(Image source is is the measuring usability blog, [here](http://www.measuringusability.com/average-times.php).) + +I've seen means, means plus or minus a standard deviation, various quantiles (like median, quartiles, 10th and 90th percentiles) all displayed in various ways. + +Instead of drawing a line right across the plot, you might mark information along the bottom of it - like so: + +![histogram with marginal boxplot][3] + +There's an example (one of many to be found) with a boxplot across the top instead of at the bottom, [here](https://www.soils.org/images/publications/aj/99/5/1366fig1.jpeg). + +Sometimes people mark in the data: + +![histogram rugplot with jitter][5] + +There's an example of this kind, done in STATA, on [this page](https://www.ctspedia.org/do/view/CTSpedia/BasicHistogramExamples) (see the third one [here](https://www.ctspedia.org/wiki/pub/CTSpedia/BasicHistogramExamples/pic2.png)) + +Histograms are better with a little extra information - [they can be misleading on their own](http://stats.stackexchange.com/questions/51718/assessing-approximate-distribution-of-data-based-on-histogram/51753#51753) + +(My plots are generated in R.) + +You just need to take care to explain what your plot consists of! (You'd want a better title and x-axis label than I used here, for starters. Plus an explanation in a figure caption explaining what you had marked on it.) + + + [1]: https://i.stack.imgur.com/xNNl9.png + [2]: https://i.stack.imgur.com/fUqwi.jpg + [3]: https://i.stack.imgur.com/0gvWk.png + [5]: https://i.stack.imgur.com/vRujH.png",put jitter on the rugplot to overcome discreteness caused by rounding x.,2013-03-20 09:44:06.130 +134100,42517,594.0,5,,CC BY-SA 3.0,835123de-d2d5-47a2-9332-cecba1c7151c,"Of course, why not? + +![histogram with mean][1] + +Here's an example (one of dozens I found with a simple google search): + +![hist with mean and median][2] + +(Image source is is the measuring usability blog, [here](http://www.measuringusability.com/average-times.php).) + +I've seen means, means plus or minus a standard deviation, various quantiles (like median, quartiles, 10th and 90th percentiles) all displayed in various ways. + +Instead of drawing a line right across the plot, you might mark information along the bottom of it - like so: + +![histogram with marginal boxplot][3] + +There's an example (one of many to be found) with a boxplot across the top instead of at the bottom, [here](https://www.soils.org/images/publications/aj/99/5/1366fig1.jpeg). 
+ +Sometimes people mark in the data: + +![histogram rugplot with jitter][5] +(I have jittered the data locations slightly because the values were rounded to integers and you couldn't see the relative density well.) + +There's an example of this kind, done in STATA, on [this page](https://www.ctspedia.org/do/view/CTSpedia/BasicHistogramExamples) (see the third one [here](https://www.ctspedia.org/wiki/pub/CTSpedia/BasicHistogramExamples/pic2.png)) + +Histograms are better with a little extra information - [they can be misleading on their own](http://stats.stackexchange.com/questions/51718/assessing-approximate-distribution-of-data-based-on-histogram/51753#51753) + +(My plots are generated in R.) + +You just need to take care to explain what your plot consists of! (You'd want a better title and x-axis label than I used here, for starters. Plus an explanation in a figure caption explaining what you had marked on it.) + + + [1]: https://i.stack.imgur.com/xNNl9.png + [2]: https://i.stack.imgur.com/fUqwi.jpg + [3]: https://i.stack.imgur.com/0gvWk.png + [5]: https://i.stack.imgur.com/vRujH.png",added 141 characters in body,2013-03-20 22:23:23.613 +134104,42513,166.0,4,,CC BY-SA 3.0,343090b4-75cc-4813-aef5-756205203d2f,Is it appropriate to plot the mean in a histogram?,updated title to more closely reflect the content of the question,2013-03-20 23:07:00.543 +134978,42885,2615.0,3,,CC BY-SA 3.0,f40b3e36-51ab-4967-922d-9a16233f0e8a,<2sls>,,2013-03-25 15:11:36.577 +134976,42885,2615.0,2,,CC BY-SA 3.0,f40b3e36-51ab-4967-922d-9a16233f0e8a,"I have one endogenous variable and two instruments for it, and I want to calculate my beta with the direct (one step) matrix formula + +beta = [X'ZZ'X]^-1 X'ZZ'Y + +But if I have two instruments for one endogenous variable X and Z are not the same length. + +Any ideas? +Thanks!",,2013-03-25 15:11:36.577 +134977,42885,2615.0,1,,CC BY-SA 3.0,f40b3e36-51ab-4967-922d-9a16233f0e8a,2SLS with two instruments for one endogenous variable in matlab,,2013-03-25 15:11:36.577 +134980,42885,2615.0,5,,CC BY-SA 3.0,1e657c01-28d8-436e-b68a-ca8d5b51e08d,"I have one endogenous variable and two instruments for it, and I want to calculate my beta with the direct (one step) matrix formula + +beta_2sls = (X' * Z * (Z' * X)^(-1) * Z' * X) ^ (-1) * X' * Z *(Z' * Z)^(-1) * Z' * Y) + +But if I have two instruments for one endogenous variable X and Z are not the same length. + +Any ideas? +Thanks!",changed 2sls formula,2013-03-25 15:22:38.943 +135820,42885,503.0,5,,CC BY-SA 3.0,71f0ef3e-86c6-44d4-94bb-08dc5fa4358d,"I have one endogenous variable and two instruments for it, and I want to calculate my beta with the direct (one step) matrix formula + +$\beta_2sls = X' Z(Z'X)^{-1}Z'X^{-1}X'Z(Z'Z)^{-1}Z'Y$ + +But if I have two instruments for one endogenous variable X and Z are not the same length. + +Any ideas? +Thanks!",made equation into LaTeX,2013-03-29 02:44:43.590 +136700,43458,16452.0,3,,CC BY-SA 3.0,7b50b83b-d2fd-46f6-abcb-83f47dbc465c,,,2013-04-02 14:45:37.780 +184211,56859,,25,,,f96991ca-a5c1-404d-810d-5cf303bc6566,,http://twitter.com/#!/StackStats/status/386417699796287488,2013-10-05 09:08:53.620 +138269,42517,594.0,5,,CC BY-SA 3.0,5da066cb-c400-49e8-a055-b91b19e53eea,"Of course, why not? + +![histogram with mean][1] + +Here's an example (one of dozens I found with a simple google search): + +![hist with mean and median][2] + +(Image source is is the measuring usability blog, [here](http://www.measuringusability.com/average-times.php).) 
+ +I've seen means, means plus or minus a standard deviation, various quantiles (like median, quartiles, 10th and 90th percentiles) all displayed in various ways. + +Instead of drawing a line right across the plot, you might mark information along the bottom of it - like so: + +![histogram with marginal boxplot][3] + +There's an example (one of many to be found) with a boxplot across the top instead of at the bottom, [here](https://www.soils.org/images/publications/aj/99/5/1366fig1.jpeg). + +Sometimes people mark in the data: + +![histogram rugplot with jitter][5] +(I have jittered the data locations slightly because the values were rounded to integers and you couldn't see the relative density well.) + +There's an example of this kind, done in STATA, on [this page](https://www.ctspedia.org/do/view/CTSpedia/BasicHistogramExamples) (see the third one [here](https://www.ctspedia.org/wiki/pub/CTSpedia/BasicHistogramExamples/pic2.png)) + +Histograms are better with a little extra information - [they can be misleading on their own](http://stats.stackexchange.com/questions/51718/assessing-approximate-distribution-of-data-based-on-histogram/51753#51753) + +You just need to take care to explain what your plot consists of! (You'd want a better title and x-axis label than I used here, for starters. Plus an explanation in a figure caption explaining what you had marked on it.) + +\-- + +My plots are generated in R. + +Edit: + +As @gung surmised, `abline(v=mean...` was used to draw the mean-line across the plot and `rug` was used to draw the data values (though I actually used `rug(jitter(...` because the data was rounded to integers). + +Here's a way to do the boxplot in between the histogram and the axis: + + hist(Davis2[,2],n=30) + boxplot(Davis2[,2], + add=TRUE,horizontal=TRUE,at=-0.75,border=""darkred"",boxwex=1.5,outline=FALSE) + +Of course, you won't have that data, so here's a similar-looking example with built in data: + + hist((islands)^(1/4),n=30) + boxplot((islands)^(1/4), + add=TRUE,horizontal=TRUE,at=-.25,border=""darkred"",boxwex=.6,outline=FALSE) + + [1]: https://i.stack.imgur.com/xNNl9.png + [2]: https://i.stack.imgur.com/fUqwi.jpg + [3]: https://i.stack.imgur.com/0gvWk.png + [5]: https://i.stack.imgur.com/vRujH.png + +",added 686 characters in body,2013-04-10 03:14:22.220 +138271,42517,594.0,5,,CC BY-SA 3.0,c144c1cc-6c94-4785-9ba3-69a753082500,"Of course, why not? + +![histogram with mean][1] + +Here's an example (one of dozens I found with a simple google search): + +![hist with mean and median][2] + +(Image source is is the measuring usability blog, [here](http://www.measuringusability.com/average-times.php).) + +I've seen means, means plus or minus a standard deviation, various quantiles (like median, quartiles, 10th and 90th percentiles) all displayed in various ways. + +Instead of drawing a line right across the plot, you might mark information along the bottom of it - like so: + +![histogram with marginal boxplot][3] + +There's an example (one of many to be found) with a boxplot across the top instead of at the bottom, [here](https://www.soils.org/images/publications/aj/99/5/1366fig1.jpeg). + +Sometimes people mark in the data: + +![histogram rugplot with jitter][5] +(I have jittered the data locations slightly because the values were rounded to integers and you couldn't see the relative density well.) 
+ +There's an example of this kind, done in STATA, on [this page](https://www.ctspedia.org/do/view/CTSpedia/BasicHistogramExamples) (see the third one [here](https://www.ctspedia.org/wiki/pub/CTSpedia/BasicHistogramExamples/pic2.png)) + +Histograms are better with a little extra information - [they can be misleading on their own](http://stats.stackexchange.com/questions/51718/assessing-approximate-distribution-of-data-based-on-histogram/51753#51753) + +You just need to take care to explain what your plot consists of! (You'd want a better title and x-axis label than I used here, for starters. Plus an explanation in a figure caption explaining what you had marked on it.) + +\-- + +My plots are generated in R. + +Edit: + +As @gung surmised, `abline(v=mean...` was used to draw the mean-line across the plot and `rug` was used to draw the data values (though I actually used `rug(jitter(...` because the data was rounded to integers). + +Here's a way to do the boxplot in between the histogram and the axis: + + hist(Davis2[,2],n=30) + boxplot(Davis2[,2], + add=TRUE,horizontal=TRUE,at=-0.75,border=""darkred"",boxwex=1.5,outline=FALSE) + +Of course, you won't have that data, so here's a similar-looking example with built in data: + + hist((islands)^(1/4),n=30) + boxplot((islands)^(1/4), + add=TRUE,horizontal=TRUE,at=-.25,border=""darkred"",boxwex=.6,outline=FALSE) + +However, it's not a general solution - I don't guarantee it will always work as well as it does here (note I already changed the `at` and `boxwex` options). If you don't write an intelligent function to take care of everything, it's necessary to pay attention to what everything does to make sure it's doing what you want. + + [1]: https://i.stack.imgur.com/xNNl9.png + [2]: https://i.stack.imgur.com/fUqwi.jpg + [3]: https://i.stack.imgur.com/0gvWk.png + [5]: https://i.stack.imgur.com/vRujH.png + +",added 326 characters in body,2013-04-10 03:24:08.257 +138272,42517,594.0,5,,CC BY-SA 3.0,a5aa4ab9-39df-41eb-b3f2-a64d2c23e8cc,"Of course, why not? + +![histogram with mean][1] + +Here's an example (one of dozens I found with a simple google search): + +![hist with mean and median][2] + +(Image source is is the measuring usability blog, [here](http://www.measuringusability.com/average-times.php).) + +I've seen means, means plus or minus a standard deviation, various quantiles (like median, quartiles, 10th and 90th percentiles) all displayed in various ways. + +Instead of drawing a line right across the plot, you might mark information along the bottom of it - like so: + +![histogram with marginal boxplot][3] + +There's an example (one of many to be found) with a boxplot across the top instead of at the bottom, [here](https://www.soils.org/images/publications/aj/99/5/1366fig1.jpeg). + +Sometimes people mark in the data: + +![histogram rugplot with jitter][5] +(I have jittered the data locations slightly because the values were rounded to integers and you couldn't see the relative density well.) + +There's an example of this kind, done in STATA, on [this page](https://www.ctspedia.org/do/view/CTSpedia/BasicHistogramExamples) (see the third one [here](https://www.ctspedia.org/wiki/pub/CTSpedia/BasicHistogramExamples/pic2.png)) + +Histograms are better with a little extra information - [they can be misleading on their own](http://stats.stackexchange.com/questions/51718/assessing-approximate-distribution-of-data-based-on-histogram/51753#51753) + +You just need to take care to explain what your plot consists of! 
(You'd want a better title and x-axis label than I used here, for starters. Plus an explanation in a figure caption explaining what you had marked on it.) + +\-- + +My plots are generated in R. + +Edit: + +As @gung surmised, `abline(v=mean...` was used to draw the mean-line across the plot and `rug` was used to draw the data values (though I actually used `rug(jitter(...` because the data was rounded to integers). + +Here's a way to do the boxplot in between the histogram and the axis: + + hist(Davis2[,2],n=30) + boxplot(Davis2[,2], + add=TRUE,horizontal=TRUE,at=-0.75,border=""darkred"",boxwex=1.5,outline=FALSE) + +I'm not going to list what everything there is for, but you can check the arguments in the help (`?boxplot`) to find out what they're for, and play with them yourself. + +Of course, you won't have that data, so here's a similar-looking example with built in data: + + hist((islands)^(1/4),n=30) + boxplot((islands)^(1/4), + add=TRUE,horizontal=TRUE,at=-.25,border=""darkred"",boxwex=.6,outline=FALSE) + +However, it's not a general solution - I don't guarantee it will always work as well as it does here (note I already changed the `at` and `boxwex` options). If you don't write an intelligent function to take care of everything, it's necessary to pay attention to what everything does to make sure it's doing what you want. + + [1]: https://i.stack.imgur.com/xNNl9.png + [2]: https://i.stack.imgur.com/fUqwi.jpg + [3]: https://i.stack.imgur.com/0gvWk.png + [5]: https://i.stack.imgur.com/vRujH.png + +",added 171 characters in body,2013-04-10 03:32:56.743 +138288,42517,594.0,5,,CC BY-SA 3.0,9686431b-415d-469d-8f49-669b2c417010,"Of course, why not? + +![histogram with mean][1] + +Here's an example (one of dozens I found with a simple google search): + +![hist with mean and median][2] + +(Image source is is the measuring usability blog, [here](http://www.measuringusability.com/average-times.php).) + +I've seen means, means plus or minus a standard deviation, various quantiles (like median, quartiles, 10th and 90th percentiles) all displayed in various ways. + +Instead of drawing a line right across the plot, you might mark information along the bottom of it - like so: + +![histogram with marginal boxplot][3] + +There's an example (one of many to be found) with a boxplot across the top instead of at the bottom, [here](https://www.soils.org/images/publications/aj/99/5/1366fig1.jpeg). + +Sometimes people mark in the data: + +![histogram rugplot with jitter][5] +(I have jittered the data locations slightly because the values were rounded to integers and you couldn't see the relative density well.) + +There's an example of this kind, done in STATA, on [this page](https://www.ctspedia.org/do/view/CTSpedia/BasicHistogramExamples) (see the third one [here](https://www.ctspedia.org/wiki/pub/CTSpedia/BasicHistogramExamples/pic2.png)) + +Histograms are better with a little extra information - [they can be misleading on their own](http://stats.stackexchange.com/questions/51718/assessing-approximate-distribution-of-data-based-on-histogram/51753#51753) + +You just need to take care to explain what your plot consists of! (You'd want a better title and x-axis label than I used here, for starters. Plus an explanation in a figure caption explaining what you had marked on it.) + +\-- + +My plots are generated in R. 
+ +Edit: + +As @gung surmised, `abline(v=mean...` was used to draw the mean-line across the plot and `rug` was used to draw the data values (though I actually used `rug(jitter(...` because the data was rounded to integers). + +Here's a way to do the boxplot in between the histogram and the axis: + + hist(Davis2[,2],n=30) + boxplot(Davis2[,2], + add=TRUE,horizontal=TRUE,at=-0.75,border=""darkred"",boxwex=1.5,outline=FALSE) + +I'm not going to list what everything there is for, but you can check the arguments in the help (`?boxplot`) to find out what they're for, and play with them yourself. + +Of course, you won't have that data, so here's a similar-looking example with built in data: + + hist((islands)^(1/4),n=30) + boxplot((islands)^(1/4), + add=TRUE,horizontal=TRUE,at=-.25,border=""darkred"",boxwex=.6,outline=FALSE) + +However, it's not a general solution - I don't guarantee it will always work as well as it does here (note I already changed the `at` and `boxwex` options\*). If you don't write an intelligent function to take care of everything, it's necessary to pay attention to what everything does to make sure it's doing what you want. + +\* -- an appropriate value for `at` is -0.5 times the value of `boxwex`; that would be a good default if you write a function to do it; `boxwex` would need to be scaled in a way that relates to the y-scale (height) of the boxplot; I'd suggest 0.04 to 0.05 times the upper y-limit might usually be okay. + + [1]: https://i.stack.imgur.com/xNNl9.png + [2]: https://i.stack.imgur.com/fUqwi.jpg + [3]: https://i.stack.imgur.com/0gvWk.png + [5]: https://i.stack.imgur.com/vRujH.png + +",added 237 characters in body,2013-04-10 04:20:42.457 +138344,42517,594.0,5,,CC BY-SA 3.0,cdce9796-71f7-45c5-99ef-c8a9b10dd391,"Of course, why not? + +![histogram with mean][1] + +Here's an example (one of dozens I found with a simple google search): + +![hist with mean and median][2] + +(Image source is is the measuring usability blog, [here](http://www.measuringusability.com/average-times.php).) + +I've seen means, means plus or minus a standard deviation, various quantiles (like median, quartiles, 10th and 90th percentiles) all displayed in various ways. + +Instead of drawing a line right across the plot, you might mark information along the bottom of it - like so: + +![histogram with marginal boxplot][3] + +There's an example (one of many to be found) with a boxplot across the top instead of at the bottom, [here](https://www.soils.org/images/publications/aj/99/5/1366fig1.jpeg). + +Sometimes people mark in the data: + +![histogram rugplot with jitter][4] +(I have jittered the data locations slightly because the values were rounded to integers and you couldn't see the relative density well.) + +There's an example of this kind, done in STATA, on [this page](https://www.ctspedia.org/do/view/CTSpedia/BasicHistogramExamples) (see the third one [here](https://www.ctspedia.org/wiki/pub/CTSpedia/BasicHistogramExamples/pic2.png)) + +Histograms are better with a little extra information - [they can be misleading on their own](http://stats.stackexchange.com/questions/51718/assessing-approximate-distribution-of-data-based-on-histogram/51753#51753) + +You just need to take care to explain what your plot consists of! (You'd want a better title and x-axis label than I used here, for starters. Plus an explanation in a figure caption explaining what you had marked on it.) + +\-- + +One last plot: + +![histogram with stripchart][5] + +\-- + +My plots are generated in R. 
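A small sketch (not from the original answer) of how the rule of thumb in the footnote above, roughly `at = -0.5 * boxwex` with `boxwex` around 4 to 5 percent of the histogram height, might be wrapped in a helper; the function name and defaults are invented for illustration:

    hist_with_boxplot <- function(x, breaks = 30, boxfrac = 0.045, ...) {
      h  <- hist(x, breaks = breaks, plot = FALSE)
      bw <- boxfrac * max(h$counts)               # boxwex as a fraction of the histogram height
      plot(h, ylim = c(-bw, max(h$counts)), ...)  # leave room below the axis for the box
      boxplot(x, add = TRUE, horizontal = TRUE, at = -0.5 * bw,
              boxwex = bw, border = "darkred", outline = FALSE)
      invisible(h)
    }
    hist_with_boxplot((islands)^(1/4), main = "Histogram with marginal boxplot")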
+ +Edit: + +As @gung surmised, `abline(v=mean...` was used to draw the mean-line across the plot and `rug` was used to draw the data values (though I actually used `rug(jitter(...` because the data was rounded to integers). + +Here's a way to do the boxplot in between the histogram and the axis: + + hist(Davis2[,2],n=30) + boxplot(Davis2[,2], + add=TRUE,horizontal=TRUE,at=-0.75,border=""darkred"",boxwex=1.5,outline=FALSE) + +I'm not going to list what everything there is for, but you can check the arguments in the help (`?boxplot`) to find out what they're for, and play with them yourself. + +However, it's not a general solution - I don't guarantee it will always work as well as it does here (note I already changed the `at` and `boxwex` options\*). If you don't write an intelligent function to take care of everything, it's necessary to pay attention to what everything does to make sure it's doing what you want. + +Here's how to create the data I used (I was trying to find whether Theil regression was really able to handle several influential outliers). It just happened to be data I was playing with when I first answered this question. + + library(""car"") + add <- data.frame(sex=c(""F"",""F""), + weight=c(150,130),height=c(NA,NA),repwt=c(55,50),repht=c(NA,NA)) + Davis2 <- rbind(Davis,add) + +\* -- an appropriate value for `at` is around -0.5 times the value of `boxwex`; that would be a good default if you write a function to do it; `boxwex` would need to be scaled in a way that relates to the y-scale (height) of the boxplot; I'd suggest 0.04 to 0.05 times the upper y-limit might often be okay. + +Code for the marginal stripchart: + + hist(Davis2[,2],n=30) + stripchart(jitter(Davis2[,2],amount=.5), + method=""jitter"",jitter=.5,pch=16,cex=.05,add=TRUE,at=-.75,col='purple3') + + [1]: https://i.stack.imgur.com/xNNl9.png + [2]: https://i.stack.imgur.com/fUqwi.jpg + [3]: https://i.stack.imgur.com/0gvWk.png + [4]: https://i.stack.imgur.com/vRujH.png + [5]: https://i.stack.imgur.com/7sHku.png +",added data and an extra plot,2013-04-10 08:42:35.867 +139547,44370,8063.0,3,,CC BY-SA 3.0,d83a57ab-0946-4b80-9986-8fccab06f0c1,,,2013-04-15 13:42:17.450 +139548,44370,8063.0,1,,CC BY-SA 3.0,d83a57ab-0946-4b80-9986-8fccab06f0c1,way to test for enrichment of differentially expressed genes in a genomic location,,2013-04-15 13:42:17.450 +139549,44370,8063.0,2,,CC BY-SA 3.0,d83a57ab-0946-4b80-9986-8fccab06f0c1,"I have an experiment where I expect a certain genomic location to influence gene expression levels of nearby genes. I have data for expression levels (Agilent 4x44 microarrays, Drosophila) in two groups - one where I expect expression to be affected and the other wild-type and I would like to run a test for overrepresentation of differentially expressed genes in a genomic location. + +My main problem is that I couldn't find a package (R/bioconductor) that would do it out of the box easily, so if you know about such a package, please let me know. In the meantime, this is what I figured out: I would run a sliding window over the whole genome and simply count number of differentially expressed genes in each window - this should tell me where I have the most differentially expressed genes in the genome. 
However, it will be dependent on gene density, so to obtain some sort of background distribution, I would run permutations of the samples (or p values), say, 1000 times, and check how often I am likely to find this number of windows with that number of differentially expressed genes compared to the observed numbers. Does this sound right? + +I should add that while I know the location that would mess up things, I cannot exclude that any other genomic region would not be affected as well. So I have to test the whole genome. + +Please advise on this approach and/or propose a better one...",,2013-04-15 13:42:17.450 +139558,44370,8063.0,4,,CC BY-SA 3.0,6d99f698-33b2-4c33-affa-f819f5c73d9c,A way to test for enrichment of differentially expressed genes in a genomic location,edited title,2013-04-15 14:26:51.783 +140326,44635,728.0,3,,CC BY-SA 3.0,bf522207-b2dc-4a54-8fb5-ae1af03fc385,,,2013-04-18 18:32:01.767 +140324,44635,728.0,2,,CC BY-SA 3.0,bf522207-b2dc-4a54-8fb5-ae1af03fc385,"We can test the symmetry of a distribution around $0$ by Wilcoxon sign rank test, based on its sample. + +But if we want to test if a distribution is symmetric around its mean, based on its sample $X_1, \dots, X_n$, is it valid to first normalize $X_i$ by the sample mean as $Y_i := X_i - \bar{X}$, and then apply Wilcoxon sign rank test to $Y_i$'s? + +If not, what are some ways? + +Thanks and regards!",,2013-04-18 18:32:01.767 +140325,44635,728.0,1,,CC BY-SA 3.0,bf522207-b2dc-4a54-8fb5-ae1af03fc385,Testing symmetry of a distribution around its mean,,2013-04-18 18:32:01.767 +140748,44772,17076.0,2,,CC BY-SA 3.0,346fc206-855b-487f-b7ca-73ba38240b2f,"Like many, I have stumbled across this site in an attempt to answer a stats question and I like what I see! What a great resource! + +I am doubting myself on which analysis to run for the following: +18 participants were evaluated at 4 time points with different conditions at each time. +They were given scores (on a discrete visual analog scale) by 2 raters. + +Is that a 2-way repeated measures ANOVA? Some variation of Friedman test? + +Ugh - my brain is fried from a rough week. Thanks for the help!",,2013-04-20 21:00:03.237 +140750,44772,17076.0,3,,CC BY-SA 3.0,346fc206-855b-487f-b7ca-73ba38240b2f,,,2013-04-20 21:00:03.237 +140749,44772,17076.0,1,,CC BY-SA 3.0,346fc206-855b-487f-b7ca-73ba38240b2f,"Appropriate Analysis for ordinal variable, repeated 4 times under different conditions, by the same 2 raters",,2013-04-20 21:00:03.237 +140767,44772,,24,,CC BY-SA 3.0,812b209f-806b-4401-a5fd-ba496547fb7f,,Proposed by 22468 approved by 930 edit id of 3243,2013-04-20 22:27:20.340 +140768,44772,16174.0,5,,CC BY-SA 3.0,812b209f-806b-4401-a5fd-ba496547fb7f,"I am doubting myself on which analysis to run for the following: +18 participants were evaluated at 4 time points with different conditions at each time. +They were given scores (on a discrete visual analog scale) by 2 raters. + +Is that a 2-way repeated measures ANOVA? Some variation of Friedman test? 
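For the sliding-window permutation idea in the gene-expression question above, a toy R sketch (not from the original post): permuting the differential-expression labels across genes keeps gene density fixed while giving a null distribution for the most enriched window. All numbers, window sizes and simulated data are made up.

    set.seed(1)
    pos <- sort(runif(2000, 0, 1e6))    # toy gene midpoints on one chromosome
    de  <- rbinom(2000, 1, 0.05)        # 1 = called differentially expressed

    window_counts <- function(pos, de, width = 1e5, step = 2e4) {
      starts <- seq(0, max(pos) - width, by = step)
      sapply(starts, function(s) sum(de[pos >= s & pos < s + width]))
    }

    obs_max  <- max(window_counts(pos, de))
    null_max <- replicate(1000, max(window_counts(pos, sample(de))))  # permute labels, not positions
    mean(null_max >= obs_max)           # genome-wide p-value for the most enriched window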
+",Removed text not related to question,2013-04-20 22:27:20.340 +140830,16537,,24,,CC BY-SA 3.0,5271a08a-9cf5-4dc9-a772-d937b700e477,,"Proposed by 9007 approved by 805, 930 edit id of 3247",2013-04-21 07:54:21.450 +142801,45457,17179.0,2,,CC BY-SA 3.0,ca8dd658-0510-4d78-a3b0-44a4d22d9762,"I have a the following time series + + + Price BrokerID 632 Behaviour BrokerID 680 Behaviour ...BrokerID XYZ Behaviour + + 5.6 IP SP + 5.7 BP IP + 5.8 SP BP + 5.83 IP SP + +where IP =Idle position , buying and ,selling position .I want to use Broker behaviour as the known variable and price as the hidden variable and predict it using HMM .But my question is how to find the emission matrix between a character vector (broker behaviour) and price numeric vector ",,2013-04-29 17:03:08.587 +142803,45457,17179.0,3,,CC BY-SA 3.0,ca8dd658-0510-4d78-a3b0-44a4d22d9762,,,2013-04-29 17:03:08.587 +140831,16537,,5,,CC BY-SA 3.0,5271a08a-9cf5-4dc9-a772-d937b700e477,"[Deep Learning][1] got a lot of focus since 2006. It's basically an approach to train deep neural networks and is leading to really impressive results on very hard datasets (like document clustering or object recognition). Some people are talking about the second neural network renaissance (eg in [this Google talk][2] by Schmidhuber). + +If you want to be impressed you should look at this Science paper [Reducing the Dimensionality of Data with Neural Networks,][3] Hinton & Salakhutdinov. + +(There is so much work going on right now in that area, that there is only two upcoming books I know about that will treat it: [Large scale machine learning][4], Langford et al and [Machine Learning: a probabilistic perspective][5] by Kevin Murphy.) + +If you want to know more, check out what the main deep learning groups are doing: [Stanford][6], [Montreal][7] and most importantly [Toronto #1][8] and [Toronto #2][9]. + + + [1]: http://deeplearning.net/ + [2]: http://www.youtube.com/watch?v=rkCNbi26Hds + [3]: http://www.utstat.toronto.edu/~rsalakhu/papers/science.pdf + [4]: http://www.cambridge.org/aus/catalogue/catalogue.asp?isbn=9780521192248 + [5]: http://www.cs.ubc.ca/~murphyk/MLbook/index.html + [6]: http://www.cs.stanford.edu/people/ang/ + [7]: http://www.iro.umontreal.ca/~bengioy/yoshua_en/index.html + [8]: http://www.cs.toronto.edu/~hinton/ + [9]: http://www.utstat.toronto.edu/~rsalakhu/",Update link,2013-04-21 07:54:21.450 +141367,44772,17076.0,5,,CC BY-SA 3.0,acf6ad29-8615-459d-9b1b-b991014c8698,"I am doubting myself on which analysis to run for the following: +18 participants were evaluated at 4 time points with different conditions at each time. +They were given scores (on a discrete visual analog scale) by 2 raters. + +The scores were calculated for a pair of participants: the pairs changed at each time point. +I do know which participant comprises each pair. + +Is that a 2-way repeated measures ANOVA? Some variation of Friedman test? +",added 146 characters in body,2013-04-23 17:52:53.203 +141488,40870,1923.0,5,,CC BY-SA 3.0,36929b06-7550-4410-9fea-2365df8c3e7f,"A common strategy in mass spectrometry of biological molecules is to upload observed spectra to a server so that they can be matched to a LARGE database of theoretical spectra of known molecules (a.k.a. *target* database). In order to control for false positives, a *decoy* database consisting of incorrect/irrelevant spectra is used. + +I have been reading more into this subject and have come up some questions regarding the calculation of the FDR measure from this target-decoy strategy. 
The basic idea of the FDR value is very intuitive: + +$FDR = \frac{FP}{FP + TP}$ + +where FP and TP stands for false and true positives respectively. This makes perfect sense to me; if I'm trying to guess some peoples' names out of a phone book, and get 8 right and 2 wrong, I would have 2 *false* out of 10 *total* guesses, and thus my false discovery rate would be 20%. + +However reading [this tutorial][1] on how this is done in large scale on the servers, I got introduced to two different calculations, depending on whether or not the *target* and *decoy* databases are concatenated (page 2). + + +I don't think that this is a typo as I found other occurrences * of the mysterious factor 2 in front of FP in scientific literature. However the motivation behind this is never explained (at least I couldn't find it). + +I would appreciate some insight on where this doubling comes from. Likewise I wonder whether or not FDR calculation this way **assumes** that the error rate for each spectra match is the same for the target database and decoy database (i.e. *assuming* that getting 25 decoy hits *implies* 25 target hits are also false positives). It's not really clear for me why the error rate has to be the same for the two databases. Any comments on this subject is also appreciated. + +* one such reference is Elias et al Nature Methods - 2, 667 - 675 (2005) + + [1]: http://www.proteored.org/pme6/fdr_calculation_for_pme6.pdf + [2]: doi:10.1038/nmeth785",added 10 characters in body,2013-04-24 08:39:29.520 +142246,45279,9095.0,2,,CC BY-SA 3.0,378f4df1-33d6-4625-a2f8-59f5b2c56a72,"I am using latent class analysis to cluster a sample of observations based on a set of binary variables. I am using R and the package poLCA. In LCA, you must specify the number of clusters you want to find. In practice, people usually run several models, each specifying a different number of classes, and then use various criteria to determine which is the ""best"" explanation of the data. + +I often find it very useful to look across the various models to try to understand how observations classified in model with class=(i) are distributed by the model with class = (i+1). At the very least you can sometimes find very robust clusters that exist regardless of the number of classes in the model. + +I would like a way to graph these relationships, to more easily communicate these complex results in papers and to colleagues who aren't statistically oriented. I imagine this is very easy to do in R using some kind of simple network graphics package, but I simply don't know how. + +Could anyone please point me in the right direction. Below is code to reproduce an example dataset. Each vector xi represents the classification of 100 observations, for number of possible classes i = 1, 2, 3, 4, 5. + + x1 <- sample(1:1, 100, replace=T) + x2 <- sample(1:2, 100, replace=T) + x3 <- sample(1:3, 100, replace=T) + x4 <- sample(1:4, 100, replace=T) + x5 <- sample(1:5, 100, replace=T) + + results <- cbind (x1, x2, x3, x4, x5) + +I imagine there is a way to produce a graph where the nodes are classifications and the edges reflect (by weights, or color maybe) the % of observations moving from classifications from one model to the next. E.g. 
+ +![enter image description here][1] + + + [1]: https://i.stack.imgur.com/muEii.png",,2013-04-26 17:31:28.260 +142248,45279,9095.0,3,,CC BY-SA 3.0,378f4df1-33d6-4625-a2f8-59f5b2c56a72,,,2013-04-26 17:31:28.260 +142247,45279,9095.0,1,,CC BY-SA 3.0,378f4df1-33d6-4625-a2f8-59f5b2c56a72,Visualizing results from multiple latent class models,,2013-04-26 17:31:28.260 +142251,45280,17326.0,3,,CC BY-SA 3.0,a11eeec7-7e28-49f6-ac1b-7ccb111b0075,,,2013-04-26 17:33:27.823 +142250,45280,17326.0,1,,CC BY-SA 3.0,a11eeec7-7e28-49f6-ac1b-7ccb111b0075,Statistical test to show association of any kind between two variables,,2013-04-26 17:33:27.823 +142249,45280,17326.0,2,,CC BY-SA 3.0,a11eeec7-7e28-49f6-ac1b-7ccb111b0075,"I have two continuous variables which I have data from a physics exspirment. + +I want to test for association between the two variables but without assuming a monotonic relationship. I also only have 6 data point each with a large error associated with it and want the test to take this into consideration. + +Does anyone know of a statistical test of this type?",,2013-04-26 17:33:27.823 +142279,45279,,25,,,38accfd7-0742-4a77-9a4d-e3d0c4906ba1,,http://twitter.com/#!/StackStats/status/327874572810596353,2013-04-26 19:59:24.240 +142767,45279,9095.0,5,,CC BY-SA 3.0,22c59796-74fa-454a-8989-7f029494d459,"I am using latent class analysis to cluster a sample of observations based on a set of binary variables. I am using R and the package poLCA. In LCA, you must specify the number of clusters you want to find. In practice, people usually run several models, each specifying a different number of classes, and then use various criteria to determine which is the ""best"" explanation of the data. + +I often find it very useful to look across the various models to try to understand how observations classified in model with class=(i) are distributed by the model with class = (i+1). At the very least you can sometimes find very robust clusters that exist regardless of the number of classes in the model. + +I would like a way to graph these relationships, to more easily communicate these complex results in papers and to colleagues who aren't statistically oriented. I imagine this is very easy to do in R using some kind of simple network graphics package, but I simply don't know how. + +Could anyone please point me in the right direction. Below is code to reproduce an example dataset. Each vector xi represents the classification of 100 observations, in a model with i possible classes. I want to graph how observations (rows) move from class to class across the columns. + + x1 <- sample(1:1, 100, replace=T) + x2 <- sample(1:2, 100, replace=T) + x3 <- sample(1:3, 100, replace=T) + x4 <- sample(1:4, 100, replace=T) + x5 <- sample(1:5, 100, replace=T) + + results <- cbind (x1, x2, x3, x4, x5) + +I imagine there is a way to produce a graph where the nodes are classifications and the edges reflect (by weights, or color maybe) the % of observations moving from classifications from one model to the next. E.g. 
+ +![enter image description here][1] + + + [1]: https://i.stack.imgur.com/muEii.png",added 71 characters in body,2013-04-29 14:34:50.910 +142802,45457,17179.0,1,,CC BY-SA 3.0,ca8dd658-0510-4d78-a3b0-44a4d22d9762,Predicting High Frequency Finance time series with HMM,,2013-04-29 17:03:08.587 +142851,45457,16174.0,6,,CC BY-SA 3.0,0baa5beb-9aaf-4cb5-9dd3-ea57ce80ee88,,"Added tag ""r"" according to question's comments",2013-04-29 20:18:56.040 +143035,45534,17447.0,3,,CC BY-SA 3.0,bc3a2f15-7af9-4188-8fca-181330e1cdac,,,2013-04-30 15:18:37.443 +143036,45534,17447.0,1,,CC BY-SA 3.0,bc3a2f15-7af9-4188-8fca-181330e1cdac,question about harrington paradox (my coursework),,2013-04-30 15:18:37.443 +143037,45534,17447.0,2,,CC BY-SA 3.0,bc3a2f15-7af9-4188-8fca-181330e1cdac,"2. Model +The firm and enforcement agency interact in more than one domain. This may arise because a single agency is responsible for enforcing more than one regulation or because it enforces the same regulation at more than one constituent plant of a multi-plant firm. +For simplicity we will assume that the number of domains is two and that they are ex ante identical. In each domain the firm is required to comply with a regulation. If it complies it inflicts no environmental damage otherwise it inflicts damage d, which is commonly observed. The cost to the ith firm of compliance in domain j [ h1, 2j will be denoted cij where ci 1 and ci 2 are independent, privately observed draws from a distribution f(c) with associated cumulative F(c). F is common knowledge. +If the agency observes non-compliance by a firm in either domain it can take that firm to court (‘‘pursue’’ the firm), in which case the firm is subject to a penalty L which is exogenous. Penalties are assumed to be restricted in the sense that +F(L) < 1. This implies that a policy of full-pursuit, whereby the agency pursues all 3 +violations, will not generate full-compliance. +The firm and enforcement agency are both risk neutral and aim to maximise +expected profit and minimise expected environmental damage respectively. + + +can someone explain to me what F(L) < 1 implies? + +if you need the context behind this model, please tell me ill explain that as well",,2013-04-30 15:18:37.443 +143039,45536,15663.0,2,,CC BY-SA 3.0,9b4a0dca-4916-404c-bb2b-1dc5ef87265c,"It means that the fine is lower than the compliance cost. + +This is what Harrington Paradox (http://en.wikipedia.org/wiki/Harrington_paradox) show: + +In the case of rational economics entities a firm will maximize its profit. This is not what is observed in reality. In theory, if the fine is lower than compliance cost a rationnal entity will not pay. In reality the fine is lower than compliance cost, but firms pay. + +This suggest image concern ( or altruism....) + +",,2013-04-30 15:36:32.853 +143062,45543,17454.0,1,,CC BY-SA 3.0,6c1a6483-bd0a-48d6-9b6b-e62295e7e01f,Is the square root of the symmetric Kullback-Leibler divergence a metric?,,2013-04-30 17:27:21.667 +143063,45543,17454.0,3,,CC BY-SA 3.0,6c1a6483-bd0a-48d6-9b6b-e62295e7e01f,,,2013-04-30 17:27:21.667 +143061,45543,17454.0,2,,CC BY-SA 3.0,6c1a6483-bd0a-48d6-9b6b-e62295e7e01f,"It is well known that the square root of the Jensen-Shannon divergence is a true metric, but how about the symmetric KL: D(P||Q)+D(Q||P)? 
I have reasons to believe that it also is a true metric but cannot find any references on that other than anecdotal comments such as that it behaves more like a metric when used.",,2013-04-30 17:27:21.667 +143065,45543,17454.0,5,,CC BY-SA 3.0,f880a338-7a31-4312-8890-7530c0a2a895,"It is well known that the square root of the Jensen-Shannon divergence is a true metric, but how about the square root of symmetric KL: D(P||Q)+D(Q||P)? I have reasons to believe that it also is a true metric but cannot find any references on that other than anecdotal comments such as that it behaves more like a metric when used.",added 15 characters in body,2013-04-30 17:36:18.407 +143112,45543,17454.0,5,,CC BY-SA 3.0,95c95e40-e4e7-4bd2-8f99-a223d24d12b8,"It is well known that the square root of the Jensen-Shannon divergence is a true metric, but how about the square root of symmetric KL: D(P||Q)+D(Q||P)? I have reasons to believe that it also is a true metric but cannot find any references on that other than anecdotal comments such as that it behaves more like a metric when used. + +Updated + +Kullback-Leibler divergence: $D(P||Q) = \sum_i p_i\log(p_i/q_i)$ + +Jensen-Shannon divergence: $J(P,Q) = \big(D(P||(P+Q)/2)+D(Q||(P+Q)/2)\big)/2$ + +Symmetric KL divergence: $S(P,Q) = D(P||Q)+D(Q||P) = \sum_i (p_i-q_i)\log(p_i/q_i)$ + +Square root of symmetric KL: $d_{KL}(P,Q) = \sqrt{S(P,Q)}$ + +Is $d_{KL}$ a metric? + +",Added math to define the question more exactly,2013-04-30 20:30:42.553 +143211,45543,17454.0,5,,CC BY-SA 3.0,a14a6e7f-686a-4e3a-92f3-bde20248f427,"It is well known that the square root of the Jensen-Shannon divergence is a true metric, but how about the square root of symmetric KL: D(P||Q)+D(Q||P)? I have reasons to believe that it also is a true metric but cannot find any references on that other than anecdotal comments such as that it behaves more like a metric when used. + +Update 1 + +Kullback-Leibler divergence: $D(P||Q) = \sum_i p_i\log(p_i/q_i)$ + +Jensen-Shannon divergence: $J(P,Q) = \big(D(P||(P+Q)/2)+D(Q||(P+Q)/2)\big)/2$ + +Symmetric KL divergence: $S(P,Q) = D(P||Q)+D(Q||P) = \sum_i (p_i-q_i)\log(p_i/q_i)$ + +Square root of symmetric KL: $d_{KL}(P,Q) = \sqrt{S(P,Q)}$ + +Is $d_{KL}$ a metric? + +Update 2 +I think the following upper and lower bounds hold: + +$\sum_i (p_i-q_i)^2 \leq \sum_i (p_i-q_i)\log(p_i/q_i) \leq \sum_i \log(p_i/q_i)^2$ + +Both of the square root of the bounds are metrics, I suppose, since they are the square of the Euclidean distances in the probability space and the log-prob space respectively. + + +",added 337 characters in body,2013-05-01 09:20:46.710 +143216,45543,17454.0,5,,CC BY-SA 3.0,a8c02a64-4ffa-402a-90ec-4a95cc887ee7,"It is well known that the square root of the Jensen-Shannon divergence is a true metric, but how about the square root of symmetric KL: D(P||Q)+D(Q||P)? I have reasons to believe that it also is a true metric but cannot find any references on that other than anecdotal comments such as that it behaves more like a metric when used. + +Update 1 + +Kullback-Leibler divergence: $D(P||Q) = \sum_i p_i\log(p_i/q_i)$ + +Jensen-Shannon divergence: $J(P,Q) = \big(D(P||(P+Q)/2)+D(Q||(P+Q)/2)\big)/2$ + +Symmetric KL divergence: $S(P,Q) = D(P||Q)+D(Q||P) = \sum_i (p_i-q_i)\log(p_i/q_i)$ + +Square root of symmetric KL: $d_{KL}(P,Q) = \sqrt{S(P,Q)}$ + +Is $d_{KL}$ a metric? 
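One quick way to probe the question above numerically (not a proof in either direction) is to search random discrete distributions for violations of the triangle inequality in $d_{KL}$; a short R sketch, with the search settings chosen arbitrarily:

    sym_kl <- function(p, q) sum((p - q) * log(p / q))      # S(P,Q) as defined above
    d_kl   <- function(p, q) sqrt(sym_kl(p, q))

    set.seed(42)
    rdist <- function(k) { x <- rgamma(k, 1); x / sum(x) }  # random point on the simplex
    slack <- replicate(50000, {
      p <- rdist(3); q <- rdist(3); r <- rdist(3)
      d_kl(p, q) + d_kl(q, r) - d_kl(p, r)
    })
    min(slack)   # a negative value here would be a counterexample to the triangle inequality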
+ +Update 2 + +I think the following upper and lower bounds hold: + +$\sum_i (p_i-q_i)^2 \leq \sum_i (p_i-q_i)\log(p_i/q_i) \leq \sum_i \log(p_i/q_i)^2$ + +Both of the square root of the bounds are metrics, I suppose, since they are the square of the Euclidean distances in the probability space and the log-prob space respectively. + + +",Added more info on the problem,2013-05-01 09:36:54.540 +144862,46070,17678.0,5,,CC BY-SA 3.0,28ffc670-5a1f-4cd3-a0c8-42876df5ba81,"I think this is a basic question, but maybe I am confusing the concepts. + +Suppose I fit an ARIMA model to a time series using, for example, the function auto.arima() in the R forecast package. The model assumes constant variance. How do I obtain that variance? Is it the variance of the residuals? + +If I use the model for forecasting, I know that it gives me the conditional mean. I'd like to know the (constant) variance as well. + +Thank you. + +Bruno + +
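An aside on the question above (not part of the original post): `sigma2` reported by an ARIMA fit is the innovation variance, i.e. the variance of the one-step-ahead errors, which in general differs from the unconditional variance of the series. A small simulated AR(1) sketch with made-up parameters:

    set.seed(123)
    x   <- arima.sim(model = list(ar = 0.7), n = 5000)
    fit <- arima(x, order = c(1, 0, 0))

    fit$sigma2                                # innovation variance, close to 1
    fit$sigma2 / (1 - coef(fit)["ar1"]^2)     # implied unconditional variance of an AR(1)
    var(x)                                    # sample variance of the series, close to the line above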
+ +Update 1:
+ +I added some code below. The variance given by `sigma2` isn't close to the one calculated from the data as if they were i.i.d samples. I'm still wondering if `sigma2` is the right option. See figure below for time series plot. + + demand.train <- c(10.06286, 9.56286, 10.51914, 12.39571, 14.72857, 15.89429, 15.89429, 17.06143, + 17.72857, 16.56286, 14.23000, 15.39571, 13.06286, 15.39571, 15.39571, 16.56286, + 16.21765, 15.93449, 14.74856, 14.46465, 15.38132) + timePoints.train <- c(""Q12006"", ""Q22006"", ""Q32006"", ""Q12007"", ""Q22007"", ""Q32007"", ""Q12008"", ""Q22008"", + ""Q32008"", ""Q12009"", ""Q22009"", ""Q32009"", ""Q12010"", ""Q22010"", ""Q32010"", ""Q12011"", + ""Q22011"", ""Q32011"", ""Q12012"", ""Q22012"", ""Q32012"") + + plot(1:length(timePoints.train), demand.train, type=""o"", xaxt=""n"", ylim=c(0, max(demand.train) + 2), + ylab=""Demand"", xlab=""Quadrimestre"") + + title(main=""Time Series Demand of Product C"", font.main=4) + axis(1, at=1:length(timePoints.train), labels=timePoints.train) + box() + + ### ARIMA Fit + library(forecast) + + # Time series + demandts.freq <- 3 + demandts.train <- ts(demand.train, frequency=demandts.freq, start=c(2006, 1)) + + # Model fitting + demandts.train.arima <- auto.arima(demandts.train, max.p=10, max.q=10, max.P=10, max.Q=10, max.order=10) + print(demandts.train.arima) + summary(demandts.train.arima) + demandts.train.arima.fit <- fitted(demandts.train.arima) + + # Forecast ARIMA (conditional means) + demandts.arima.forecast <- forecast(demandts.train.arima, h = 3, level=95) + print(demandts.arima.forecast) + + # Constant variance from ARIMA + demandts.arima.var <- demandts.train.arima$sigma2 + print(demandts.arima.var) + + # Variance from data + print(var(demandts.train)) + + +![Time Series Plot][1] + + + [1]: https://i.stack.imgur.com/E5gv0.png",added 2007 characters in body,2013-05-08 15:15:58.943 +143509,45279,9095.0,5,,CC BY-SA 3.0,97786d81-40b7-4360-8038-a077d49f35c3,"I am using latent class analysis to cluster a sample of observations based on a set of binary variables. I am using R and the package poLCA. In LCA, you must specify the number of clusters you want to find. In practice, people usually run several models, each specifying a different number of classes, and then use various criteria to determine which is the ""best"" explanation of the data. + +I often find it very useful to look across the various models to try to understand how observations classified in model with class=(i) are distributed by the model with class = (i+1). At the very least you can sometimes find very robust clusters that exist regardless of the number of classes in the model. + +I would like a way to graph these relationships, to more easily communicate these complex results in papers and to colleagues who aren't statistically oriented. I imagine this is very easy to do in R using some kind of simple network graphics package, but I simply don't know how. + +Could anyone please point me in the right direction. Below is code to reproduce an example dataset. Each vector xi represents the classification of 100 observations, in a model with i possible classes. I want to graph how observations (rows) move from class to class across the columns. 
+ + x1 <- sample(1:1, 100, replace=T) + x2 <- sample(1:2, 100, replace=T) + x3 <- sample(1:3, 100, replace=T) + x4 <- sample(1:4, 100, replace=T) + x5 <- sample(1:5, 100, replace=T) + + results <- cbind (x1, x2, x3, x4, x5) + +I imagine there is a way to produce a graph where the nodes are classifications and the edges reflect (by weights, or color maybe) the % of observations moving from classifications from one model to the next. E.g. + +![enter image description here][1] + + + [1]: https://i.stack.imgur.com/muEii.png + +UPDATE: Having some progress with the igraph package. Starting from the code above... + +poLCA results recycle the same numbers to describe class membership, so you need to do a bit of recoding. + + N<-ncol(results) + n<-0 + for(i in 2:N) { + results[,i]<- (results[,i])+((i-1)+n) + n<-((i-1)+n) + } + +Then you need to get all the cross-tabulations and their frequencies, and rbind them into one matrix defining all the edges. There is probably a much more elegant way to do this. + + results <-as.data.frame(results) + + g1 <- count(results,c(""x1"", ""x2"")) + + g2 <- count(results,c(""x2"", ""x3"")) + colnames(g2) <- c(""x1"", ""x2"", ""freq"") + + g3 <- count(results,c(""x3"", ""x4"")) + colnames(g3) <- c(""x1"", ""x2"", ""freq"") + + g4 <- count(results,c(""x4"", ""x5"")) + colnames(g4) <- c(""x1"", ""x2"", ""freq"") + + results <- rbind(g1, g2, g3, g4) + + library(igraph) + + g1 <- graph.data.frame(results, directed=TRUE) + + tkplot(g1) + +",added 1064 characters in body,2013-05-02 15:18:21.187 +143513,45279,9095.0,5,,CC BY-SA 3.0,ee984bb1-4ecb-4131-90d5-d0ad162e975d,"I am using latent class analysis to cluster a sample of observations based on a set of binary variables. I am using R and the package poLCA. In LCA, you must specify the number of clusters you want to find. In practice, people usually run several models, each specifying a different number of classes, and then use various criteria to determine which is the ""best"" explanation of the data. + +I often find it very useful to look across the various models to try to understand how observations classified in model with class=(i) are distributed by the model with class = (i+1). At the very least you can sometimes find very robust clusters that exist regardless of the number of classes in the model. + +I would like a way to graph these relationships, to more easily communicate these complex results in papers and to colleagues who aren't statistically oriented. I imagine this is very easy to do in R using some kind of simple network graphics package, but I simply don't know how. + +Could anyone please point me in the right direction. Below is code to reproduce an example dataset. Each vector xi represents the classification of 100 observations, in a model with i possible classes. I want to graph how observations (rows) move from class to class across the columns. + + x1 <- sample(1:1, 100, replace=T) + x2 <- sample(1:2, 100, replace=T) + x3 <- sample(1:3, 100, replace=T) + x4 <- sample(1:4, 100, replace=T) + x5 <- sample(1:5, 100, replace=T) + + results <- cbind (x1, x2, x3, x4, x5) + +I imagine there is a way to produce a graph where the nodes are classifications and the edges reflect (by weights, or color maybe) the % of observations moving from classifications from one model to the next. E.g. + +![enter image description here][1] + + +UPDATE: Having some progress with the igraph package. Starting from the code above... 
+ +poLCA results recycle the same numbers to describe class membership, so you need to do a bit of recoding. + + N<-ncol(results) + n<-0 + for(i in 2:N) { + results[,i]<- (results[,i])+((i-1)+n) + n<-((i-1)+n) + } + +Then you need to get all the cross-tabulations and their frequencies, and rbind them into one matrix defining all the edges. There is probably a much more elegant way to do this. + + results <-as.data.frame(results) + + g1 <- count(results,c(""x1"", ""x2"")) + + g2 <- count(results,c(""x2"", ""x3"")) + colnames(g2) <- c(""x1"", ""x2"", ""freq"") + + g3 <- count(results,c(""x3"", ""x4"")) + colnames(g3) <- c(""x1"", ""x2"", ""freq"") + + g4 <- count(results,c(""x4"", ""x5"")) + colnames(g4) <- c(""x1"", ""x2"", ""freq"") + + results <- rbind(g1, g2, g3, g4) + + library(igraph) + + g1 <- graph.data.frame(results, directed=TRUE) + + plot.igraph(g1, layout=layout.reingold.tilford) + +![enter image description here][2] + +Time to play more with the igraph options I guess. + + [1]: https://i.stack.imgur.com/muEii.png + [2]: https://i.stack.imgur.com/iCJ2Z.png",added 169 characters in body,2013-05-02 15:29:34.723 +143827,45804,17580.0,3,,CC BY-SA 3.0,f09d1a96-91a7-4fc4-9d6d-412b74fb16e0,,,2013-05-03 21:05:29.093 +143828,45804,17580.0,2,,CC BY-SA 3.0,f09d1a96-91a7-4fc4-9d6d-412b74fb16e0,"I am a first-year grad student in Computer Science, and I need some help with a problem that I think is statistically oriented. I have taken a statistics course, but it was abysmal and I haven't had time to rectify that. But anyway, my problem stems from a project I'm working on involving genetic programming, where I'm randomly generating functions. Please bear with my description, as it's been a while since I've had a formal theory course too. + + +I have two continuous (but not onto) functions **F** and **G**, both of which map **N** variables to a single output. The domain of the input variables is the integers between -100 and 100. The range of the output is the Real numbers. I want to find some statistical measure of how ""similar"" the two functions are; given the finite inputs (of which there will be 201^N possible), how much variance(?) there is between the two functions outputs. Two identical functions should return no variance, and two wildly different functions should return a high variance. + + +Since **N** will typically be greater than 6, I can't iterate through all the possible inputs and compare the outputs, so I figured I could take some sampling at regular intervals (e.g. every multiple of 10, so that it's only 10^N). But here's about where I realize I have no idea what I'm doing. How do I determine if two numbers are ""highly variant"" from each other? What sample size do I need to use to have confidence in my results? + +My current approach is to compare the functions with a two-sided Kolmogorov-Smirnov Test. Since that test doesn't seem to scale well to multi-variate problems, I've taken advantage of my limited domains to just treat the problem as having a single variable by concatenating my variables. So the first value of the variable is (-100:100:100:100:100:100), the second is (-100:100:100:100:100:099), and the last is (100:100:100:100:100:100). 
Does that even make sense?",,2013-05-03 21:05:29.093 +143826,45804,17580.0,1,,CC BY-SA 3.0,f09d1a96-91a7-4fc4-9d6d-412b74fb16e0,Finding the similarity between two functions,,2013-05-03 21:05:29.093 +144685,46070,17678.0,3,,CC BY-SA 3.0,52ab564e-92d6-4305-a134-592e75304928,,,2013-05-07 19:48:58.193 +144684,46070,17678.0,1,,CC BY-SA 3.0,52ab564e-92d6-4305-a134-592e75304928,Variance of a Time Series Fitted to an ARIMA Model,,2013-05-07 19:48:58.193 +144683,46070,17678.0,2,,CC BY-SA 3.0,52ab564e-92d6-4305-a134-592e75304928,"I think this is a basic question, but maybe I am confusing the concepts. + +Suppose I fit an ARIMA model to a time series using, for example, the function auto.arima() in the R forecast package. The model assumes constant variance. How do I obtain that variance? Is it the variance of the residuals? + +If I use the model for forecasting, I know that it gives me the conditional mean. I'd like to know the (constant) variance as well. + +Thank you. + +Bruno",,2013-05-07 19:48:58.193 +144776,46070,,4,user88,CC BY-SA 3.0,c529a8a4-82b1-44e8-a534-933108bf0c22,Variance of a time series fitted to an ARIMA model,edited title,2013-05-08 07:52:15.497 +150746,47846,17994.0,2,,CC BY-SA 3.0,5f0c0ec7-5a81-4865-bd25-b59c43fcee75,"I am looking for some information about the difference between binomial, negative binomial and Poisson regression and for which situations are these regression best fitted. + +Are there any tests I can preform in SPSS that can tell me which of these regressions is the best for my situation? + +Also, how do I run a Poisson or negative binomial in SPSS, since there are no options like that as I can see in the regression part? + +if you have any useful links I would appreciate it very much. + +Thank you, +",,2013-06-02 09:36:07.877 +150744,47846,17994.0,3,,CC BY-SA 3.0,5f0c0ec7-5a81-4865-bd25-b59c43fcee75,,,2013-06-02 09:36:07.877 +185573,57248,10278.0,2,,CC BY-SA 3.0,aa37ca6b-1998-4188-87ae-176c040d5644,"If you treat the variable as ordinal you are assuming a gene-dosage effect. +This is essentially a two degree of freedom test. +If you treat the variable as nominal you are not assuming any gene-dosage effect and instead comparing the mean of the three genotype groups this is a one degree of freedom test. +Hence the gene-dosage model (treating genotypes as ordinal) is more powerful. ",,2013-10-10 19:25:29.377 +144891,46070,17678.0,5,,CC BY-SA 3.0,7837fdca-d405-4deb-9d02-5135bb23d80b,"I think this is a basic question, but maybe I am confusing the concepts. + +Suppose I fit an ARIMA model to a time series using, for example, the function auto.arima() in the R forecast package. The model assumes constant variance. How do I obtain that variance? Is it the variance of the residuals? + +If I use the model for forecasting, I know that it gives me the conditional mean. I'd like to know the (constant) variance as well. + +Thank you. + +Bruno + +
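An aside on the binomial / negative binomial / Poisson question a little further up: it asks about SPSS, but for concreteness here is a small R sketch (my own, not from any post) of fitting Poisson and negative binomial models to the same simulated counts and comparing them; the data and parameter values are invented.

    library(MASS)                      # for glm.nb
    set.seed(1)
    x <- runif(200)
    y <- rnbinom(200, mu = exp(1 + 2 * x), size = 1.5)    # overdispersed counts

    fit_pois <- glm(y ~ x, family = poisson)
    fit_nb   <- glm.nb(y ~ x)

    AIC(fit_pois, fit_nb)              # the negative binomial should fit these data better
    sum(residuals(fit_pois, type = "pearson")^2) / df.residual(fit_pois)  # values well above 1 suggest overdispersion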
+ +Update 1:
+ +I added some code below. The variance given by `sigma2` isn't close to the one calculated from the fitted values. I'm still wondering if `sigma2` is the right option. See figure below for time series plot. + + demand.train <- c(10.06286, 9.56286, 10.51914, 12.39571, 14.72857, 15.89429, 15.89429, 17.06143, + 17.72857, 16.56286, 14.23000, 15.39571, 13.06286, 15.39571, 15.39571, 16.56286, + 16.21765, 15.93449, 14.74856, 14.46465, 15.38132) + timePoints.train <- c(""Q12006"", ""Q22006"", ""Q32006"", ""Q12007"", ""Q22007"", ""Q32007"", ""Q12008"", ""Q22008"", + ""Q32008"", ""Q12009"", ""Q22009"", ""Q32009"", ""Q12010"", ""Q22010"", ""Q32010"", ""Q12011"", + ""Q22011"", ""Q32011"", ""Q12012"", ""Q22012"", ""Q32012"") + + plot(1:length(timePoints.train), demand.train, type=""o"", xaxt=""n"", ylim=c(0, max(demand.train) + 2), + ylab=""Demand"", xlab=""Quadrimestre"") + + title(main=""Time Series Demand of Product C"", font.main=4) + axis(1, at=1:length(timePoints.train), labels=timePoints.train) + box() + + ### ARIMA Fit + library(forecast) + + # Time series + demandts.freq <- 3 + demandts.train <- ts(demand.train, frequency=demandts.freq, start=c(2006, 1)) + + # Model fitting + demandts.train.arima <- auto.arima(demandts.train, max.p=10, max.q=10, max.P=10, max.Q=10, max.order=10) + print(demandts.train.arima) + summary(demandts.train.arima) + demandts.train.arima.fit <- fitted(demandts.train.arima) + + # Forecast ARIMA (conditional means) + demandts.arima.forecast <- forecast(demandts.train.arima, h = 3, level=95) + print(demandts.arima.forecast) + + # Constant variance from ARIMA + demandts.arima.var <- demandts.train.arima$sigma2 + print(demandts.arima.var) + + # Variance from fitted values + print(var(demandts.train.arima.fit)) + + +![Time Series Plot][1] + + + [1]: https://i.stack.imgur.com/E5gv0.png",deleted 11 characters in body,2013-05-08 17:27:21.287 +145762,46384,15839.0,2,,CC BY-SA 3.0,b353bccf-17d3-47e7-a797-4cb6190acfe4,What is the difference of gaussian HMM and gaussian mixture HMM (the emission is gaussian or gaussian mixture)? I want to know if it is the same thing. What is the point when estimating the parameters using Baum Welch algorithm.,,2013-05-13 01:51:18.740 +145761,46384,15839.0,1,,CC BY-SA 3.0,b353bccf-17d3-47e7-a797-4cb6190acfe4,gaussian mixture HMM,,2013-05-13 01:51:18.740 +145763,46384,15839.0,3,,CC BY-SA 3.0,b353bccf-17d3-47e7-a797-4cb6190acfe4,,,2013-05-13 01:51:18.740 +147491,46894,18085.0,3,,CC BY-SA 3.0,83503990-ae8e-4d1e-a7ed-192f418499be,,,2013-05-19 14:17:34.503 +147492,46894,18085.0,2,,CC BY-SA 3.0,83503990-ae8e-4d1e-a7ed-192f418499be,"I have a question concerning the coefficients of VAR models used on multiple imputed data (high missigness in some variables: up to 40%). +In particular I would like to know how the coefficients are related to the explained variance. + +I have used vector autoregression on multiple imputed data (m=10) and have then combined the estimated coefficient with rubin's rule. +However, what confuses me is the fact that my imputation variance is quite small in relationship to the estimates and variance of coefficients, but the difference between the explained variance is huge (17% to 0.04%) between models. + +My idea is that since the highest imputation variance across all systems is at the constant (around a third of the variance value but 3-4 times higher then in other coefficients) and that this critically affects the explained variance. +But thats just a guess. 
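An aside on the multiple-imputation question above (not part of the original post): the pooling it refers to as Rubin's rule combines within- and between-imputation variance. A minimal sketch with made-up numbers:

    pool_rubin <- function(qhat, u) {    # qhat: m point estimates, u: m within-imputation variances
      m     <- length(qhat)
      qbar  <- mean(qhat)                        # pooled estimate
      total <- mean(u) + (1 + 1/m) * var(qhat)   # within- plus inflated between-imputation variance
      c(estimate = qbar, se = sqrt(total))
    }
    pool_rubin(qhat = c(0.52, 0.47, 0.55, 0.50, 0.49), u = rep(0.01, 5))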
+ +I would be very happy if somebody could help me here.",,2013-05-19 14:17:34.503 +147493,46894,18085.0,1,,CC BY-SA 3.0,83503990-ae8e-4d1e-a7ed-192f418499be,Imputation variance and explained variance (in vector autoregression),,2013-05-19 14:17:34.503 +147590,37981,,4,user88,CC BY-SA 3.0,1bc3c281-c8b2-41db-8463-ad3b761d435a,"""Peakedness"" of a skewed probability density function",edited title,2013-05-19 22:47:17.833 +149359,47447,18356.0,3,,CC BY-SA 3.0,4482f89f-84bb-4753-aecd-48204430063f,,,2013-05-27 18:56:55.533 +149361,47447,18356.0,1,,CC BY-SA 3.0,4482f89f-84bb-4753-aecd-48204430063f,How to fit a simple count time series INAR(1) model in R,,2013-05-27 18:56:55.533 +149360,47447,18356.0,2,,CC BY-SA 3.0,4482f89f-84bb-4753-aecd-48204430063f,"I am trying to perform a simple time series analysis with count time series data. My data is a sequence of small integer values like 0,1,2 and 3. I learned from various sources that INAR model would be appropriate with such data. + +My question is whether anyone knows R codes for fitting a simple INAR(1) model (regressing time series data on a binary dummy variable). + +Appreciate any assistance.",,2013-05-27 18:56:55.533 +149550,47497,18382.0,1,,CC BY-SA 3.0,1999dc19-2add-44c8-b460-caf0eada062f,R Code for yeo-johnson transformation,,2013-05-28 08:54:55.873 +149551,47497,18382.0,2,,CC BY-SA 3.0,1999dc19-2add-44c8-b460-caf0eada062f,"I have writen a code for boxcox transformation (see below). But now I want to do a yeo-johnson Transformation because datc$plot contains zeros.I try, but I dont find a solution. + +lambda.fm1 <- boxcox(datc$plot ~ datc$cond.evlot*datc$cond.dl*datc$version), family=""yjPower"") +lambda.max <- lambda.fm1$x[which.max(lambda.fm1$y)] +require(car) +datc$plott <- bcPower(datc$plot, lambda = lambda.max, jacobian.adjusted = FALSE) +",,2013-05-28 08:54:55.873 +149552,47497,18382.0,3,,CC BY-SA 3.0,1999dc19-2add-44c8-b460-caf0eada062f,,,2013-05-28 08:54:55.873 +149565,47497,10579.0,5,,CC BY-SA 3.0,82a992cc-5956-4e98-bf56-4dbd49e5b6cf,"I have writen a code for `boxcox` transformation (see below). But now I want to do a yeo-johnson Transformation because `datc$plot` contains zeros. I try, but I dont find a solution. + + lambda.fm1 <- boxcox(datc$plot ~ datc$cond.evlot*datc$cond.dl*datc$version), + family=""yjPower"") + lambda.max <- lambda.fm1$x[which.max(lambda.fm1$y)] + require(car) + datc$plott <- bcPower(datc$plot, lambda = lambda.max, jacobian.adjusted = FALSE) + +",Improved formatting.,2013-05-28 10:05:13.463 +149564,47497,,24,,CC BY-SA 3.0,82a992cc-5956-4e98-bf56-4dbd49e5b6cf,,"Proposed by 13680 approved by 686, 930 edit id of 3536",2013-05-28 10:05:13.463 +149612,47497,594.0,5,,CC BY-SA 3.0,ab54965b-f8af-4965-a62c-06be401b71b6,"I have writen code for a Box-Cox transformation (see below). But now I want to do a Yeo-Johnson transformation because `datc$plot` contains zeros. I try, but I don't find a solution. + + lambda.fm1 <- boxcox(datc$plot ~ datc$cond.evlot*datc$cond.dl*datc$version), + family=""yjPower"") + lambda.max <- lambda.fm1$x[which.max(lambda.fm1$y)] + require(car) + datc$plott <- bcPower(datc$plot, lambda = lambda.max, jacobian.adjusted = FALSE) + +",formatting,2013-05-28 12:03:05.920 +149614,47497,594.0,5,,CC BY-SA 3.0,f80ec0c1-8bea-474e-9fa1-a619b07c2d38,"I have writen code for a Box-Cox transformation (see below). But now I want to do a Yeo-Johnson transformation because `datc$plot` contains zeros. I tried, but I didn't find a solution. 
+ + lambda.fm1 <- boxcox(datc$plot ~ datc$cond.evlot*datc$cond.dl*datc$version), + family=""yjPower"") + lambda.max <- lambda.fm1$x[which.max(lambda.fm1$y)] + require(car) + datc$plott <- bcPower(datc$plot, lambda = lambda.max, jacobian.adjusted = FALSE) + +",added 3 characters in body,2013-05-28 12:08:47.473 +149616,47497,594.0,4,,CC BY-SA 3.0,2c7fcb84-9f44-4c70-aa72-1fbacc34ff25,R Code for Yeo-Johnson transformation,Also fixed title,2013-05-28 12:18:56.140 +150745,47846,17994.0,1,,CC BY-SA 3.0,5f0c0ec7-5a81-4865-bd25-b59c43fcee75,"Difference between binomial, negative binomial and Poisson regression",,2013-06-02 09:36:07.877 +161768,50739,10492.0,3,,CC BY-SA 3.0,2b322f2f-26bd-4ba5-8f8a-b2ac9b44b523,,,2013-07-14 10:14:44.800 +161767,50739,10492.0,1,,CC BY-SA 3.0,2b322f2f-26bd-4ba5-8f8a-b2ac9b44b523,Linear Regression and ANOVA,,2013-07-14 10:14:44.800 +150749,47846,594.0,5,,CC BY-SA 3.0,8150fce4-ce77-4c73-a3ca-1ef3175a1915,"I am looking for some information about the difference between binomial, negative binomial and Poisson regression and for which situations are these regression best fitted. + +Are there any tests I can perform in SPSS that can tell me which of these regressions is the best for my situation? + +Also, how do I run a Poisson or negative binomial in SPSS, since there are no options like that as I can see in the regression part? + +if you have any useful links I would appreciate it very much. + +Thank you, +",edited body,2013-06-02 09:43:45.347 +150753,47846,15827.0,6,,CC BY-SA 3.0,a03bcc94-a823-4c77-bd0a-3769a110b6a0,,deleted 15 characters in body; edited tags,2013-06-02 09:48:13.397 +150752,47846,15827.0,5,,CC BY-SA 3.0,a03bcc94-a823-4c77-bd0a-3769a110b6a0,"I am looking for some information about the difference between binomial, negative binomial and Poisson regression and for which situations are these regression best fitted. + +Are there any tests I can perform in SPSS that can tell me which of these regressions is the best for my situation? + +Also, how do I run a Poisson or negative binomial in SPSS, since there are no options such as I can see in the regression part? + +If you have any useful links I would appreciate it very much. + + +",deleted 15 characters in body; edited tags,2013-06-02 09:48:13.397 +151234,47981,16990.0,2,,CC BY-SA 3.0,a6d65662-ed58-4380-99eb-0d7556c7551a,"In a Wilcoxon signed-ranks statistical significance test, we came across some data that produces a $p$-value of $0.04993$. With a threshold of $p < 0.05$, is this result enough to reject the null hypothesis, or is it safer to say the test was inconclusive, since if we round the p-value to 3 decimal places it becomes $0.050$? +",,2013-06-04 09:21:32.970 +151236,47981,16990.0,3,,CC BY-SA 3.0,a6d65662-ed58-4380-99eb-0d7556c7551a,,,2013-06-04 09:21:32.970 +151235,47981,16990.0,1,,CC BY-SA 3.0,a6d65662-ed58-4380-99eb-0d7556c7551a,Is a p-value of 0.04993 enough to reject null hypothesis?,,2013-06-04 09:21:32.970 +151641,48103,11200.0,1,,CC BY-SA 3.0,8c37cf76-3157-4afe-9d35-8d50006e2b57,Fit a sine to data,,2013-06-05 18:23:47.483 +151642,48103,11200.0,3,,CC BY-SA 3.0,8c37cf76-3157-4afe-9d35-8d50006e2b57,,,2013-06-05 18:23:47.483 +151640,48103,11200.0,2,,CC BY-SA 3.0,8c37cf76-3157-4afe-9d35-8d50006e2b57,"although I read [this](http://stats.stackexchange.com/questions/60500/how-to-find-a-good-fit-for-semi-sinusoidal-model-in-r) post, I still have no idea how to apply this to my own data and hope that someone can help me out. 
+ +I have the following data: + + y <- c(11.622967, 12.006081, 11.760928, 12.246830, 12.052126, 12.346154, 12.039262, 12.362163, 12.009269, 11.260743, 10.950483, 10.522091, 9.346292, 7.014578, 6.981853, 7.197708, 7.035624, 6.785289, 7.134426, 8.338514, 8.723832, 10.276473, 10.602792, 11.031908, 11.364901, 11.687638, 11.947783, 12.228909, 11.918379, 12.343574, 12.046851, 12.316508, 12.147746, 12.136446, 11.744371, 8.317413, 8.790837, 10.139807, 7.019035, 7.541484, 7.199672, 9.090377, 7.532161, 8.156842, 9.329572, 9.991522, 10.036448, 10.797905) + t <- 18:65 + +And now I simply want to fit a sine wave + +$$ +y(t)=A\cdot sin(\omega t+\phi) +C. +$$ + +with the four unknowns $A$, $\omega$, $\phi$ and $C$ to it. + +The rest of my code looks is the following + + res <- nls(y ~ A*sin(omega*ct+phi)+C, data=data.frame(t,y), start=list(A=1,omega=1,phi=1,C=1)) + co <- coef(res) + + fit <- function(x, a, b, c, d) {a*sin(b*x+c)+d} + + # Plot result + plot(x=t, y=y) + curve(fit(x, a=co[""A""], b=co[""omega""], c=co[""phi""], d=co[""C""]), add=TRUE ,lwd=2, col=""steelblue"") + +But the result is really poor. + +I would very much appreciate any help. + +Cheers.",,2013-06-05 18:23:47.483 +151646,48103,,5,,CC BY-SA 3.0,89db7ecb-ac8c-4ca3-86ef-570b37820c46,"Although I read [this](http://stats.stackexchange.com/questions/60500/how-to-find-a-good-fit-for-semi-sinusoidal-model-in-r) post, I still have no idea how to apply this to my own data and hope that someone can help me out. + +I have the following data: + + y <- c(11.622967, 12.006081, 11.760928, 12.246830, 12.052126, 12.346154, 12.039262, 12.362163, 12.009269, 11.260743, 10.950483, 10.522091, 9.346292, 7.014578, 6.981853, 7.197708, 7.035624, 6.785289, 7.134426, 8.338514, 8.723832, 10.276473, 10.602792, 11.031908, 11.364901, 11.687638, 11.947783, 12.228909, 11.918379, 12.343574, 12.046851, 12.316508, 12.147746, 12.136446, 11.744371, 8.317413, 8.790837, 10.139807, 7.019035, 7.541484, 7.199672, 9.090377, 7.532161, 8.156842, 9.329572, 9.991522, 10.036448, 10.797905) + t <- 18:65 + +And now I simply want to fit a sine wave + +$$ +y(t)=A\cdot sin(\omega t+\phi) +C. +$$ + +with the four unknowns $A$, $\omega$, $\phi$ and $C$ to it. + +The rest of my code looks is the following + + res <- nls(y ~ A*sin(omega*t+phi)+C, data=data.frame(t,y), start=list(A=1,omega=1,phi=1,C=1)) + co <- coef(res) + + fit <- function(x, a, b, c, d) {a*sin(b*x+c)+d} + + # Plot result + plot(x=t, y=y) + curve(fit(x, a=co[""A""], b=co[""omega""], c=co[""phi""], d=co[""C""]), add=TRUE ,lwd=2, col=""steelblue"") + +But the result is really poor. + +![Sine fit][1] + +I would very much appreciate any help. + +Cheers. + + + [1]: https://i.stack.imgur.com/IS0ae.png",added 64 characters in body,2013-06-05 18:43:38.480 +151711,48125,18416.0,2,,CC BY-SA 3.0,d5f99bb9-5a14-4fbc-8033-cad74168df97,"Is $||Y-X\beta||_2^2 + \lambda\beta^T K\beta$ , the standard loss-function in kernel ridge regression, or is it different? Also, is the gaussian kernel a standard choice used for the kernel, in practice? If not, which kernels are used more often than not? Also, is $\lambda$ the only parameter to be tuned via cross-validation or is the kernel parameter like $\sigma$ in a gaussian kernel, also tuned via cross validation in practice? 
Please confirm and/or correct my understanding of Kernel ridge regression.",,2013-06-06 01:38:15.400 +151710,48125,18416.0,3,,CC BY-SA 3.0,d5f99bb9-5a14-4fbc-8033-cad74168df97,,,2013-06-06 01:38:15.400 +151712,48125,18416.0,1,,CC BY-SA 3.0,d5f99bb9-5a14-4fbc-8033-cad74168df97,kernel ridge regression loss:,,2013-06-06 01:38:15.400 +151713,48125,,25,,,5cbdf2e5-f98f-4fdc-93e1-fce0232c44ef,,http://twitter.com/#!/StackStats/status/342460977054359553,2013-06-06 02:00:34.250 +162783,50982,1790.0,3,,CC BY-SA 3.0,6ac4b172-e012-4139-a772-da53984ea8c5,,,2013-07-17 13:51:44.960 +162781,50982,1790.0,2,,CC BY-SA 3.0,6ac4b172-e012-4139-a772-da53984ea8c5,"Say that I have two learning methods $A$ and $B$ and that I estimate their generalization performance with something like repeated cross validation. From this process I get a **distribution of scores** $P_A$ and $P_B$ for each method (e.g. their ROC AUC values). + +Looking at these distributions, it could be that $\mu_A \ge \mu_B$ but that $\sigma_A \ge \sigma_B$ (i.e. the expected generalization performance of $A$ is higher than $B$, but that there is more uncertainty about this estimation). + +What **mathematical methods** can I use to compare $P_A$ and $P_B$ and eventually make an informed decision about which model to use? + +**Note:** For the sake of simplicity, I am referring to two methods $A$ and $B$ here, but I am interested in methods that can be used to compare the distribution of scores of ~1000 learning methods (e.g. from a grid search) and eventually make a final decision about which model to use. + +",,2013-07-17 13:51:44.960 +151741,48133,594.0,2,,CC BY-SA 3.0,236d20af-4071-465a-931d-c6599a033225,"If you just want a good estimate of $\omega$ and don't care much about +its standard error: + + ssp <- spectrum(y) + per <- 1/ssp$freq[ssp$spec==max(ssp$spec)] + reslm <- lm(y ~ sin(2*pi/per*t)+cos(2*pi/per*t)) + summary(reslm) + + rg <- diff(range(y)) + plot(y~t,ylim=c(min(y)-0.1*rg,max(y)+0.1*rg)) + lines(fitted(reslm)~t,col=4,lty=2) # dashed blue line is sin fit + + # including 2nd harmonic really improves the fit + reslm2 <- lm(y ~ sin(2*pi/per*t)+cos(2*pi/per*t)+sin(4*pi/per*t)+cos(4*pi/per*t)) + summary(reslm2) + lines(fitted(reslm2)~t,col=3) # solid green line is periodic with second harmonic + +![sine plot][1] + +(A better fit still would perhaps account for the outliers in that series in some way, reducing their influence.) + +\--- + +If you want some idea of the uncertainty in $\omega$, you could use profile likelihood ([pdf1][2], [pdf2][3] - references on getting approximate CIs or SEs from profile likelihood or its variants aren't hard to locate) + +(Alternatively, you could feed these estimates into nls ... and start it already converged.) + + [1]: https://i.stack.imgur.com/ZF1P2.png + [2]: http://www.math.umt.edu/patterson/ProfileLikelihoodCI.pdf + [3]: http://www.utstat.toronto.edu/reid/research/slacpaper.pdf",,2013-06-06 08:03:29.833 +151973,48125,18416.0,5,,CC BY-SA 3.0,8486a4d5-54f0-4dd4-bb19-548a6901aa42,"Is $||Y-X\beta||_2^2 + \lambda\beta^T K\beta$ , the standard loss-function in kernel ridge regression, or is it different? Also, is the gaussian kernel a standard choice used for the kernel, in practice? If not, which kernels are used more often than not? Also, is $\lambda$ the only parameter to be tuned via cross-validation or is the kernel parameter like $\sigma$ in a gaussian kernel, also tuned via cross validation in practice? 
Please confirm and/or correct my understanding of Kernel ridge regression",deleted 1 characters in body,2013-06-07 00:01:06.023 +151974,48125,18416.0,4,,CC BY-SA 3.0,8486a4d5-54f0-4dd4-bb19-548a6901aa42,Kernel Ridge Regression,deleted 1 characters in body,2013-06-07 00:01:06.023 +151987,48125,18416.0,5,,CC BY-SA 3.0,76802a5f-ddd0-411c-906f-1c3840ab7b1d,"Is $||Y-X\beta||_2^2 + \lambda\beta^T K\beta$ , the standard loss-function in kernel ridge regression, or is it different? Also, is the gaussian kernel a standard choice used for the kernel, in practice? If not, which kernels are used more often than not? Also, is $\lambda$ the only parameter to be tuned via cross-validation or is the kernel parameter like $\sigma$ in a gaussian kernel, also tuned via cross validation in practice? Please confirm and/or correct my understanding of Kernel ridge regression.",added 1 characters in body,2013-06-07 03:19:45.887 +152013,28,15827.0,5,,CC BY-SA 3.0,65e61767-ac5a-4e89-94ab-e1fe72978eb2,"Last year, I read a blog post from [Brendan O'Connor][1] entitled [""Statistics vs. Machine Learning, fight!""][2] that discussed some of the differences between the two fields. [Andrew Gelman responded favorably to this][3]: + +Simon Blomberg: +> From R's fortunes +> package: To paraphrase provocatively, +> 'machine learning is statistics minus +> any checking of models and +> assumptions'. +> -- Brian D. Ripley (about the difference between machine learning +> and statistics) useR! 2004, Vienna +> (May 2004) :-) Season's Greetings! + +Andrew Gelman: + +> In that case, maybe we should get rid +> of checking of models and assumptions +> more often. Then maybe we'd be able to +> solve some of the problems that the +> machine learning people can solve but +> we can't! + +There was also the [**""Statistical Modeling: The Two Cultures""** paper][4] by Leo Breiman in 2001 which argued that statisticians rely too heavily on data modeling, and that machine learning techniques are making progress by instead relying on the *predictive accuracy* of models. + +Has the statistics field changed over the last decade in response to these critiques? Do the *two cultures* still exist or has statistics grown to embrace machine learning techniques such as neural networks and support vector machines? + + + [1]: http://anyall.org/ + [2]: http://anyall.org/blog/2008/12/statistics-vs-machine-learning-fight/ + [3]: http://andrewgelman.com/2008/12/machine_learnin/ + [4]: http://projecteuclid.org/euclid.ss/1009213726",deleted 2 characters in body,2013-06-07 06:38:10.327 +152255,48125,18416.0,4,,CC BY-SA 3.0,433c0527-8558-45f1-b6c0-da59b6df4ae0,Loss for Kernel Ridge Regression,edited body,2013-06-08 00:58:52.503 +152254,48125,18416.0,5,,CC BY-SA 3.0,433c0527-8558-45f1-b6c0-da59b6df4ae0,"Is $||Y-X\beta||_2^2 + \lambda\beta^T K\beta$ , the standard loss-function in kernel ridge regression, or is it different? Also, is the gaussian kernel a standard choice used for the kernel, in practice? If not, which kernels are used more often than not? Also, is $\lambda$ the only parameter to be tuned via cross-validation or is the kernel parameter like $\sigma$ in a gaussian kernel, also tuned via cross validation in practice? Please confirm and/or correct my understanding of Kernel ridge regression!",edited body,2013-06-08 00:58:52.503 +153465,48597,18905.0,2,,CC BY-SA 3.0,7c7e28b4-1e41-4561-856d-75bb4d56cd6b,"I've just finished an animal experiment. I compared 1 control group and 1 experimental group, the only difference betweeen the two is type of diet. 
For statistic analysis I use independent t-test, and the result shows no significant differences between the two group. However the data shows the tendency that the experimental group has more benefit in all variables measured. So, what should I say about my data? All data are normally distributed. +My supervisor said that maybe because I used very small number of sample (each group n=8) so I could not gain any significant differences. He suggested me to do some ""probability test"" or something to extrapolate my data (which is I don't have any clue what he talked about). +So, is there any statistic analysis that I can use as what my supervisor told me to do? +Thank you.",,2013-06-13 04:46:26.887 +153466,48597,18905.0,1,,CC BY-SA 3.0,7c7e28b4-1e41-4561-856d-75bb4d56cd6b,"T-test shows no differences, but the experiment group shows tendency more benefit in all variables measured than control group",,2013-06-13 04:46:26.887 +153467,48597,18905.0,3,,CC BY-SA 3.0,7c7e28b4-1e41-4561-856d-75bb4d56cd6b,,,2013-06-13 04:46:26.887 +153469,48597,5237.0,5,,CC BY-SA 3.0,91bcb256-f8de-4f83-9d90-296de7831b27,"I've just finished an animal experiment. I compared 1 control group and 1 experimental group, the only difference between the two is type of diet. For statistical analysis I used the independent groups t-test, and the result showed no significant differences between the two groups. However, the data shows the tendency that the experimental group has more benefit in all variables measured. So, what should I say about my data? All data are normally distributed. + +My supervisor said that maybe because I used very small sample (each group n=8) that I could not find any significant differences. He suggested me to do some ""probability test"" or something to extrapolate my data (unfortunately, I don't have any clue what he was talking about). + +So, is there any statistical analysis that I can use like what my supervisor told me to do?",light editing,2013-06-13 05:13:29.993 +153470,48597,5237.0,6,,CC BY-SA 3.0,91bcb256-f8de-4f83-9d90-296de7831b27,,light editing,2013-06-13 05:13:29.993 +162782,50982,1790.0,1,,CC BY-SA 3.0,6ac4b172-e012-4139-a772-da53984ea8c5,Comparing distributions of generalization performance,,2013-07-17 13:51:44.960 +185660,57270,22593.0,3,,CC BY-SA 3.0,1a8e6a02-0e64-4b5f-857f-40ecb23b1df6,,,2013-10-11 00:34:33.700 +153679,48658,1926.0,2,,CC BY-SA 3.0,d34abced-098f-457b-87b5-afdba2aa3e67,"I am currently taking the PGM course by Daphne Koller on Coursera. In that, we generally model a Bayesian Network as a cause and effect directed graph of the variables which are part of the observed data. But on PyMC tutorials and examples I generally see that it not quite modeled in the same way as the PGM or atleast I am confused. In PyMC the parents of any observed real world variable are often the parameters of the distribution that you use to model the variable. + +Now my question really is a practical one. Suppose I have 3 variables for which data is observed (A, B, C) (lets assume they are all continuous variables just for the sake of it). From some domain knowledge, one can say that A and B cause C. So we have a BN here - A, B are the parents and C is the children. +now from the BN equation P(A, B, C) = P(C | A, B) * P(A) * P(B) + +I can say A and B are some normal distributions with some mu and sigma, but how do I model P(C | A, B) ? +The general idea I want to learn, is how do I learn this BN using PyMC so that I can query the BN. 
Or do I have to augment the BN with parameters of the model in some fashion. + +Is this problem solvable using pymc? or have I got some fundamentals wrong? + +Any help would be appreciated! +",,2013-06-13 19:42:13.623 +153680,48658,1926.0,3,,CC BY-SA 3.0,d34abced-098f-457b-87b5-afdba2aa3e67,,,2013-06-13 19:42:13.623 +153678,48658,1926.0,1,,CC BY-SA 3.0,d34abced-098f-457b-87b5-afdba2aa3e67,Bayesian network inference using pymc (Beginner's confusion),,2013-06-13 19:42:13.623 +155155,2509,668.0,6,,CC BY-SA 3.0,ffed9afa-d5ba-4792-869b-d61bb7184c09,,edited tags,2013-06-19 22:14:36.010 +156536,45280,,5,,CC BY-SA 3.0,cc162914-6a6a-4799-b014-b576d1c68ea2,"I have two continuous variables which I have data from a physics experiment. + +I want to test for association between the two variables but without assuming a monotonic relationship. I also only have 6 data point each with a large error associated with it and want the test to take this into consideration. + +Does anyone know of a statistical test of this type?",edited body,2013-06-25 20:19:10.577 +157640,41244,15827.0,5,,CC BY-SA 3.0,907560b8-a088-4a19-9e4e-1427cf677e0c,"Given $N$ flips of the same coin resulting in $k$ occurrences of 'heads', what is the probability density function of heads-probability of the coin?",added 5 characters in body,2013-06-30 08:48:04.803 +157776,45457,,4,user88,CC BY-SA 3.0,3cecd1b2-7f2a-428e-98f8-8107d16f3f7f,Predicting high frequency finance time series with HMM,edited title,2013-06-30 20:45:35.737 +158536,49879,19492.0,3,,CC BY-SA 3.0,ffad237c-16fa-4c55-9b55-f2d9ea6efbfd,,,2013-07-03 11:48:24.817 +158535,49879,19492.0,1,,CC BY-SA 3.0,ffad237c-16fa-4c55-9b55-f2d9ea6efbfd,What is an adaptive copula?,,2013-07-03 11:48:24.817 +158534,49879,19492.0,2,,CC BY-SA 3.0,ffad237c-16fa-4c55-9b55-f2d9ea6efbfd,"My basic question is: What is an adaptive copula? + +I have slides from a presentation (unfortunately, I cannot ask the author of the slides) about adaptive copulae and I am not getting, what this means resp. what this is good for? + +Here are the slides: +![sl1][1] +![sl2][2] +Then the slides continue with a change-point Test. I am wondering what this is about and why I need this in connection to copulae? + +The slides end with an adaptively estimated parameter plot: +![sl3][3] +![sl4][4] + +This seems to show, that my estimates are lagged behind. Any other interpretations, comments would be great! + [1]: https://i.stack.imgur.com/0F76A.png + [2]: https://i.stack.imgur.com/F3H0r.png + [3]: https://i.stack.imgur.com/qJXPm.png + [4]: https://i.stack.imgur.com/jYIy9.png",,2013-07-03 11:48:24.817 +158660,49906,5821.0,2,,CC BY-SA 3.0,4c57d016-2ed0-4a83-a64f-1770b24567d7,"I am interested in the modeling of binary response data in paired observations. We aim to make inference about the effectiveness of a pre-post intervention in a group, potentially adjusting for several covariates and determining whether there is effect modification by a group that received particularly different training as part of an intervention. + +Given data of the following form: + + id phase resp + 1 pre 1 + 1 post 0 + 2 pre 0 + 2 post 0 + 3 pre 1 + 3 post 0 + +And a $2 \times 2$ contingency table of paired response information: + +\begin{array}{cc|cc} +& & \mbox{Pre} & \\ +& & \mbox{Correct} & \mbox{Incorrect} \\ \hline +\mbox{Post} & \mbox{Correct} & a & b&\\ + & \mbox{Incorrect} & c& d&\\ +\end{array} + +We're interested in the test of hypothesis: $\mathcal{H}_0: \theta_c = 1$. 
+ +McNemar's Test gives: $Q = \frac{(b-c)^2}{b+c} \sim \chi^2_1$ under $\mathcal{H}_0$ (asymptotically). This is intuitive because, under the null, we would expect an equal proportion of the discordant pairs ($b$ and $c$) to be favoring a positive effect ($b$) or a negative effect ($c$). With the probability of positive case definition defined $p =\frac{b}{b+c}$ and $n=b+c$. The odds of observing a positive discordant pair is $\frac{p}{1-p}=\frac{b}{c}$. + +On the other hand, conditional logistic regression uses a different approach to test the same hypothesis, by maximizing the conditional likelihood: + +$$\mathcal{L}(X ; \beta) = \prod_{j=1}^n \frac{\exp(\beta X_{j,2})}{\exp(\beta X_{j,1}) + \exp(\beta X_{j,2})}$$ + +where $\exp(\beta) = \theta_c$. + +So, what's the relationship between these tests? How can one do a simple test of the contingency table presented earlier? Looking at calibration of p-values from clogit and McNemar's approaches under the null, you'd think they were completely unrelated! + + library(survival) + n <- 100 + do.one <- function(n) { + id <- rep(1:n, each=2) + ph <- rep(0:1, times=n) + rs <- rbinom(n*2, 1, 0.5) + c( + 'pclogit' = coef(summary(clogit(rs ~ ph + strata(id))))[5], + 'pmctest' = mcnemar.test(table(ph,rs))$p.value + ) + } + + out <- replicate(1000, do.one(n)) + plot(t(out), main='Calibration plot of pvalues for McNemar and Clogit tests', + xlab='p-value McNemar', ylab='p-value conditional logistic regression') + +![enter image description here][1] + + + [1]: https://i.stack.imgur.com/HC8YV.jpg",,2013-07-03 17:50:20.467 +158661,49906,5821.0,3,,CC BY-SA 3.0,4c57d016-2ed0-4a83-a64f-1770b24567d7,,,2013-07-03 17:50:20.467 +158659,49906,5821.0,1,,CC BY-SA 3.0,4c57d016-2ed0-4a83-a64f-1770b24567d7,Relationship between McNemar's test and conditional logistic regression,,2013-07-03 17:50:20.467 +158841,49906,,25,,,465f7e5a-ff9f-4ce0-830a-898d4ffaac6b,,http://twitter.com/#!/StackStats/status/352653364342964224,2013-07-04 05:01:28.550 +161769,50739,10492.0,2,,CC BY-SA 3.0,2b322f2f-26bd-4ba5-8f8a-b2ac9b44b523,"I found two very useful posts about the difference between linear regression analysis and ANOVA and how to visualise them: + +http://stats.stackexchange.com/questions/555/why-is-anova-taught-used-as-if-it-is-a-different-research-methodology-compared + +http://stats.stackexchange.com/questions/5278/how-to-visualize-what-anova-does + +As stated in the first post, to test whether the average height of male and females is the same you can use a regression model (height = alpha + beta * gender + error) and test whether beta = 0. If beta = 0, then there's no difference in the height between males and females. However, I am not quite sure how this is tested when you have three groups. Image the following example: + + height (y) - group (x) + 5 - A + 6 - A + 7 - A + 6 - A + 30 - B + 32 - B + 34 - B + 33 - B + 20 - C + 19 - C + 21 - C + 22 - C + +The regression model would look like: + +height = a + b * group + e + +I quickly visualized the data (see image below) + +They way I understood the regression model is that it would now test whether +any of the three slopes (AB, AC or BC) has a slope b which is significantly different from 0. If that's the case one can conclude like in an ANOVA that there is at least one group which height is significantly different from one or more groups. Afterwards, one could use a post-hoc test of course to test which of the groups really differ. Is my understanding of how the regression models tests this hypothesis correct? 
+ + +![enter image description here][1] + + + [1]: https://i.stack.imgur.com/6LD5Q.png + +",,2013-07-14 10:14:44.800 +163054,51047,17056.0,2,,CC BY-SA 3.0,9f1a3b93-8f46-4468-b8c7-3c27aa4225d9,"Over the past few weeks I have been trying to understand MCMC and the Metropolis-Hastings algorithm(s). Every time I think I understand it I realise that I am wrong. Most of the code examples I find on-line implement something that is not consistent with the description. i.e.: They say they implement Metropolis-Hastings but they actually implement random-walk metropolis. Others (almost always) silently skip the implementation of the Hastings correction ratio because they are using a symmetric proposal distribution. Actually, I haven't found a single simple example that calculates the ratio so far. That makes me even more confused. Can someone give me code examples (in any language) of the following: + + - Vanilla Non-Random Walk Metropolis-Hastings Algorithm with Hastings correction ratio calculation (even if this will end up being 1 when using a symmetric proposal distribution). + - Vanilla Random Walk Metropolis-Hastings algorithm. + - Vanilla Independent Metropolis-Hastings algorithm. + +No need to provide the Metropolis algorithms because if I am not mistaken the only difference between Metropolis and Metropolis-Hastings is that the first ones always sample from a symmetric distribution and thus they don't have the Hastings correction ratio. +No need to give detailed explanation of the algorithms. I do understand the basics but I am kinda confused with all the different names for the different variations of the Metropolis-Hastings algorithm but also with how you practically implement the Hastings correction ratio on the Vanilla non-random-walk MH. Thank you.",,2013-07-18 06:53:11.527 +163052,51047,17056.0,3,,CC BY-SA 3.0,9f1a3b93-8f46-4468-b8c7-3c27aa4225d9,,,2013-07-18 06:53:11.527 +163053,51047,17056.0,1,,CC BY-SA 3.0,9f1a3b93-8f46-4468-b8c7-3c27aa4225d9,"Confused with MCMC Metropolis-Hastings variations: Random-Walk, Non-Random-Walk, Independent, Metropolis",,2013-07-18 06:53:11.527 +163433,51047,17056.0,5,,CC BY-SA 3.0,39482c02-1b61-48bd-ba8f-f39d3d2b2007,"Over the past few weeks I have been trying to understand MCMC and the Metropolis-Hastings algorithm(s). Every time I think I understand it I realise that I am wrong. Most of the code examples I find on-line implement something that is not consistent with the description. i.e.: They say they implement Metropolis-Hastings but they actually implement random-walk metropolis. Others (almost always) silently skip the implementation of the Hastings correction ratio because they are using a symmetric proposal distribution. Actually, I haven't found a single simple example that calculates the ratio so far. That makes me even more confused. Can someone give me code examples (in any language) of the following: + + - Vanilla Non-Random Walk Metropolis-Hastings Algorithm with Hastings correction ratio calculation (even if this will end up being 1 when using a symmetric proposal distribution). + - Vanilla Random Walk Metropolis-Hastings algorithm. + - Vanilla Independent Metropolis-Hastings algorithm. + +No need to provide the Metropolis algorithms because if I am not mistaken the only difference between Metropolis and Metropolis-Hastings is that the first ones always sample from a symmetric distribution and thus they don't have the Hastings correction ratio. +No need to give detailed explanation of the algorithms. 
I do understand the basics but I am kinda confused with all the different names for the different variations of the Metropolis-Hastings algorithm but also with how you practically implement the Hastings correction ratio on the Vanilla non-random-walk MH. Please don't copy paste links that partially answer my questions because most likely I have already seen them. Those links led me to this confusion. Thank you.",added 147 characters in body,2013-07-19 04:46:14.300 +163920,3646,19377.0,6,,CC BY-SA 3.0,465b7c78-99f7-410c-a6e4-079f2265c8aa,,Added tags,2013-07-20 13:29:05.953 +163921,3646,,24,,CC BY-SA 3.0,465b7c78-99f7-410c-a6e4-079f2265c8aa,,"Proposed by 27403 approved by 6029, 686 edit id of 4251",2013-07-20 13:29:05.953 +163990,50982,1790.0,5,,CC BY-SA 3.0,606d052a-71e9-4806-ab20-7e003ee37b07,"Say that I have two learning methods $A$ and $B$ and that I estimate their generalization performance with something like repeated cross validation. From this process I get a **distribution of scores** $P_A$ and $P_B$ for each method (e.g. their ROC AUC values). + +Looking at these distributions, it could be that $\mu_A \ge \mu_B$ but that $\sigma_A \ge \sigma_B$ (i.e. the expected generalization performance of $A$ is higher than $B$, but that there is more uncertainty about this estimation). + +I think this is called the **bias-variance tradeoff**. + +What **mathematical methods** can I use to compare $P_A$ and $P_B$ and eventually make an informed decision about which model to use? + +**Note:** For the sake of simplicity, I am referring to two methods $A$ and $B$ here, but I am interested in methods that can be used to compare the distribution of scores of ~1000 learning methods (e.g. from a grid search) and eventually make a final decision about which model to use. + +",added 58 characters in body,2013-07-20 20:56:04.300 +163991,50982,1790.0,33,,,7fc5a35e-9151-4d96-b6f9-38bec6a245b2,,692,2013-07-20 21:03:13.030 +163992,50982,1790.0,5,,CC BY-SA 3.0,3a99a5c4-0922-43cd-b50d-30c6c35fb6c3,"Say that I have two learning methods $A$ and $B$ and that I estimate their generalization performance with something like repeated cross validation or nested cross validation. From this process I get a **distribution of scores** $P_A$ and $P_B$ for each method (e.g. their ROC AUC values). + +Looking at these distributions, it could be that $\mu_A \ge \mu_B$ but that $\sigma_A \ge \sigma_B$ (i.e. the expected generalization performance of $A$ is higher than $B$, but that there is more uncertainty about this estimation). + +I think this is called the **bias-variance tradeoff**. + +What **mathematical methods** can I use to compare $P_A$ and $P_B$ and eventually make an informed decision about which model to use? + +**Note:** For the sake of simplicity, I am referring to two methods $A$ and $B$ here, but I am interested in methods that can be used to compare the distribution of scores of ~1000 learning methods (e.g. from a grid search) and eventually make a final decision about which model to use. 
+ +",added 27 characters in body,2013-07-20 21:03:38.780 +164036,50982,,25,,,7fad73ce-3648-47e9-a434-3b7786d48dbf,,http://twitter.com/#!/StackStats/status/358730969500495873,2013-07-20 23:31:42.820 +164736,51496,19870.0,3,,CC BY-SA 3.0,a1c050e7-5c3f-4a10-98ce-c447dd6b6b34,,,2013-07-23 03:50:02.683 +164735,51496,19870.0,1,,CC BY-SA 3.0,a1c050e7-5c3f-4a10-98ce-c447dd6b6b34,crt decision tree node mutually exclusive,,2013-07-23 03:50:02.683 +164734,51496,19870.0,2,,CC BY-SA 3.0,a1c050e7-5c3f-4a10-98ce-c447dd6b6b34,"I have been trying to understand the results of a CRT decision tree, my question is if the terminal nodes should be mutually exclusive? I am asking this because by reading the terminal nodes some variables seems to overlap each other. + +For instance some terminal nodes ""share"" the same profession: + +Node 23: Carpenter, Plumber, Sole trader, Truck Driver + +Node 24: Plumber, Truck Driver, Teacher, Retired. + +Probably I am reading the results incorrectly because it should not happen, at least in theory. + +Best Regards + +",,2013-07-23 03:50:02.683 +167013,52126,20367.0,3,,CC BY-SA 3.0,0855c6cc-9d31-4d3e-959d-fbda2d699811,,,2013-07-30 00:44:55.017 +164747,51496,,5,,CC BY-SA 3.0,f56759ea-e22d-471c-ab9a-c108958b0967,"I have been trying to understand the results of a CRT decision tree, my question is if the terminal nodes should be mutually exclusive? I am asking this because by reading the terminal nodes some variables seems to overlap each other. + +For instance some terminal nodes ""share"" the same profession: + +Node 23: carpenter, plumber, sole trader, truck driver + +Node 24: plumber, truck driver, teacher, retired. + +Probably I am reading the results incorrectly because it should not happen, at least in theory.",deleted 21 characters in body; edited title,2013-07-23 05:40:31.833 +164748,51496,,4,,CC BY-SA 3.0,f56759ea-e22d-471c-ab9a-c108958b0967,Should CRT decision tree node be mutually exclusive?,deleted 21 characters in body; edited title,2013-07-23 05:40:31.833 +165013,51577,20097.0,3,,CC BY-SA 3.0,64fca8fc-e10f-43c6-a37a-a05ee99b5588,,,2013-07-23 19:27:17.943 +165011,51577,20097.0,2,,CC BY-SA 3.0,64fca8fc-e10f-43c6-a37a-a05ee99b5588,"I try to predict values for regression in LIBSVM. My data is in time series. I use gridregression.m file in LIBSVM to find optimal parameters c, g and p. Gridregression.m file use cross validation to find optimal parameters, but is it ok to use cross validation in time series? + +When I use parameters from gridregression.m, sometimes the MSE is not better then the default values. ( cmd= '-s 3 -t 2' is sometimes better )",,2013-07-23 19:27:17.943 +165012,51577,20097.0,1,,CC BY-SA 3.0,64fca8fc-e10f-43c6-a37a-a05ee99b5588,LIBSVM parameter search in time series,,2013-07-23 19:27:17.943 +165246,51644,3733.0,3,,CC BY-SA 3.0,2b580020-bb7f-477a-a198-0b1c34c0e22c,,,2013-07-24 09:19:26.983 +165244,51644,3733.0,2,,CC BY-SA 3.0,2b580020-bb7f-477a-a198-0b1c34c0e22c,"I have been always tought that random effects only influence the variance (error), and that fixed effects influence only the mean. 
But I have found an example where random effects influence also the mean - the coefficient estimate: + + require(nlme) + set.seed(128) + n <- 100 + k <- 5 + cat <- as.factor(rep(1:k, each = n)) + cat_i <- 1:k # intercept per kategorie + x <- rep(1:n, k) + sigma <- 0.2 + alpha <- 0.001 + y <- cat_i[cat] + alpha * x + rnorm(n*k, 0, sigma) + plot(x, y) + + # simulate missing data + y[c(1:(n/2), (n*k-n/2):(n*k))] <- NA + + m1 <- lm(y ~ x) + summary(m1) + + m2 <- lm(y ~ cat + x) + summary(m2) + + m3 <- lme(y ~ x, random = ~ 1|cat, na.action = na.omit) + summary(m3) + +You can see that estimate of coefficient for `x` from model `m1` is -0.013780, while from model `m2` it is 0.0011713 - both significantly different from zero. + +Note that when I remove the line simulating missing data, the results are the same (it is full matrix). + +Why is that? + +PS: please note I am not a professional statistician, so if you are about to respond with a lot of math then please make also some simple summary for dummies :-)",,2013-07-24 09:19:26.983 +165245,51644,3733.0,1,,CC BY-SA 3.0,2b580020-bb7f-477a-a198-0b1c34c0e22c,Adding random effect influences coefficient estimates,,2013-07-24 09:19:26.983 +165321,51644,3733.0,5,,CC BY-SA 3.0,a189ec0c-3bce-4611-8337-c0143eae6d30,"I have been always tought that random effects only influence the variance (error), and that fixed effects influence only the mean. But I have found an example where random effects influence also the mean - the coefficient estimate: + + require(nlme) + set.seed(128) + n <- 100 + k <- 5 + cat <- as.factor(rep(1:k, each = n)) + cat_i <- 1:k # intercept per kategorie + x <- rep(1:n, k) + sigma <- 0.2 + alpha <- 0.001 + y <- cat_i[cat] + alpha * x + rnorm(n*k, 0, sigma) + plot(x, y) + + # simulate missing data + y[c(1:(n/2), (n*k-n/2):(n*k))] <- NA + + m1 <- lm(y ~ x) + summary(m1) + + m2 <- lm(y ~ cat + x) + summary(m2) + + m3 <- lme(y ~ x, random = ~ 1|cat, na.action = na.omit) + summary(m3) + +You can see that estimate of coefficient for `x` from model `m1` is -0.013780, while from model `m3` it is 0.0011713 - both significantly different from zero. + +Note that when I remove the line simulating missing data, the results are the same (it is full matrix). + +Why is that? + +PS: please note I am not a professional statistician, so if you are about to respond with a lot of math then please make also some simple summary for dummies :-)",edited body,2013-07-24 13:17:41.980 +165423,51644,6204.0,5,,CC BY-SA 3.0,92408bb6-e87a-49c2-b16b-324fe5eca3fe,"I have always been taught that random effects only influence the variance (error), and that fixed effects only influence the mean. But I have found an example where random effects influence also the mean - the coefficient estimate: + + require(nlme) + set.seed(128) + n <- 100 + k <- 5 + cat <- as.factor(rep(1:k, each = n)) + cat_i <- 1:k # intercept per kategorie + x <- rep(1:n, k) + sigma <- 0.2 + alpha <- 0.001 + y <- cat_i[cat] + alpha * x + rnorm(n*k, 0, sigma) + plot(x, y) + + # simulate missing data + y[c(1:(n/2), (n*k-n/2):(n*k))] <- NA + + m1 <- lm(y ~ x) + summary(m1) + + m2 <- lm(y ~ cat + x) + summary(m2) + + m3 <- lme(y ~ x, random = ~ 1|cat, na.action = na.omit) + summary(m3) + +You can see that the estimated coefficient for `x` from model `m1` is -0.013780, while from model `m3` it is 0.0011713 - both significantly different from zero. + +Note that when I remove the line simulating missing data, the results are the same (it is full matrix). + +Why is that? 
+ +PS: please note I am not a professional statistician, so if you are about to respond with a lot of math then please make also some simple summary for dummies :-)",fixed some grammar/spelling,2013-07-24 17:33:40.770 +165424,51644,,24,,CC BY-SA 3.0,92408bb6-e87a-49c2-b16b-324fe5eca3fe,,"Proposed by 8451 approved by 7290, 919 edit id of 4365",2013-07-24 17:33:40.770 +166141,51895,9384.0,3,,CC BY-SA 3.0,2ca19789-f375-46bb-b686-ce8f2dbffc6c,,,2013-07-26 19:31:50.407 +166143,51895,9384.0,1,,CC BY-SA 3.0,2ca19789-f375-46bb-b686-ce8f2dbffc6c,SMOTE throws error for multi class imbalance problem,,2013-07-26 19:31:50.407 +167011,52126,20367.0,2,,CC BY-SA 3.0,0855c6cc-9d31-4d3e-959d-fbda2d699811,"I have 5 surveys of the same group of students over a semester. Each survey uses a 5 point Likert scale. The first and last survey contain some questions dealing with the begining and end of the class (first impressions, final impressions), but most of the questions are identical for all 4 or 5 of the surveys. + +I want to evaluate the statistical significance of changes to students' responses over time. Unfortunately statistics is not my strong suit. I know of the t-test, but that seems to only be applicable to two groups of data (please correct me if I'm wrong). How should I go about evaluating this data? Is a repeated measures one way ANOVA appropriate?",,2013-07-30 00:44:55.017 +167017,52126,20367.0,4,,CC BY-SA 3.0,95966e90-1735-4356-8b7c-289f62b6f8ed,How to evaluate Likert scale data changes over multiple surveys of the same group?,edited title,2013-07-30 01:13:16.480 +167029,52126,,4,,CC BY-SA 3.0,e803cf51-4e85-43d0-876e-40b140db677f,How to evaluate likert scale data changes over multiple surveys of the same group?,Improved Formating,2013-07-30 04:55:26.877 +185574,57242,674.0,4,,CC BY-SA 3.0,88f07a34-44d5-4dc4-9cdd-040669d35479,Estimating multivariate normal distribution by observing variance in different directions,edited title,2013-10-10 19:27:37.740 +166142,51895,9384.0,2,,CC BY-SA 3.0,2ca19789-f375-46bb-b686-ce8f2dbffc6c,"I am trying to use SMOTE to correct imbalance in my multi-class classification problem. +Although SMOTE works perfectly on the iris dataset as per the SMOTE help document, it does not work on a similar dataset. +Here is how my data looks. Note it has three classes with values 1, 2, 3. + + > data + looking risk every status + 1 0 1 0 1 + 2 0 0 0 1 + 3 0 0 0 2 + 4 0 0 0 1 + 5 0 0 0 1 + 6 3 0 0 1 + 7 0 0 0 1 + 8 0 0 0 1 + 9 0 1 0 1 + 10 0 0 0 1 + 11 0 0 0 3 + 12 0 0 0 1 + 13 0 0 0 1 + 14 0 0 0 1 + 15 0 0 0 2 + +It is in the form of dataframe, same as iris: + + > class(data) + [1] ""data.frame"" + +Here is my code using SMOTE and the error that it throws: + + > newData <- SMOTE(status ~ ., data, perc.over = 600,perc.under=100) + Error in scale.default(T, T[i, ], ranges) : subscript out of bounds + In addition: Warning messages: + 1: In FUN(newX[, i], ...) : + no non-missing arguments to max; returning -Inf + 2: In FUN(newX[, i], ...) : + no non-missing arguments to max; returning -Inf + 3: In FUN(newX[, i], ...) : + no non-missing arguments to max; returning -Inf + 4: In FUN(newX[, i], ...) : no non-missing arguments to min; returning Inf + 5: In FUN(newX[, i], ...) : no non-missing arguments to min; returning Inf + 6: In FUN(newX[, i], ...) 
: no non-missing arguments to min; returning Inf",,2013-07-26 19:31:50.407 +166648,50982,0.0,34,,,980eb327-c8a1-46c8-9ce3-f9be1de75085,,692,2013-07-28 22:42:36.070 +166923,52099,20363.0,2,,CC BY-SA 3.0,befbdb5d-a6d4-4dc3-a85f-649b7b809872,"I am trying to investigate the following problem using multinomial likelihoods and could really do with some advice regarding it's appropriateness and implementation in R + +A sequence is generated by selecting with replacement from a bag of n differently coloured balls and consists of the number of occurrences of each colour in the selection (i.e. each sequence is a vector of length n with each element a count corresponding to the number of occurrences of a particular colour in the sequence). The process is then repeated a number of times to generate a group of unique sequences (duplicate sequences are rejected). + +If a single sequence is selected at random as the test subject and a multinomial model is generated for each of the other sequences, using the colour count proportions as probabilities, can the likelihood be calculated for each multinomial model in the group using the test sequence as the data and would the greatest likelihood indicate the most alike sequence from the group? + +I have tried implementing this in R but am struggling with a couple of points. + +1) Calculating the likelihood fails if the number of colours is large since the factorial term falls out of bounds. +2) If the number of occurrences of each colour relative to the total number of colours is small then the probability is small and the product of the p^x terms tends to zero. + +I hope this makes sense and somebody is able to offer some advice. + +Thank you in advance for your help.",,2013-07-29 19:04:43.263 +166921,52099,20363.0,1,,CC BY-SA 3.0,befbdb5d-a6d4-4dc3-a85f-649b7b809872,Multinomial likelihood for large number of groups,,2013-07-29 19:04:43.263 +166922,52099,20363.0,3,,CC BY-SA 3.0,befbdb5d-a6d4-4dc3-a85f-649b7b809872,,,2013-07-29 19:04:43.263 +166949,52099,16174.0,4,,CC BY-SA 3.0,43b940e2-dddc-4463-94ea-559a8b0de478,Multinomial likelihood for large number of groups.,"tag ""likelihood"", readability",2013-07-29 20:16:35.787 +166950,52099,16174.0,6,,CC BY-SA 3.0,43b940e2-dddc-4463-94ea-559a8b0de478,,"tag ""likelihood"", readability",2013-07-29 20:16:35.787 +166951,52099,,24,,CC BY-SA 3.0,43b940e2-dddc-4463-94ea-559a8b0de478,,Proposed by 22468 approved by -1 edit id of 4466,2013-07-29 20:16:35.787 +166947,52099,15827.0,4,,CC BY-SA 3.0,cf9bc69f-59a8-43cf-9e26-20edab46ba75,Multinomial likelihood for large number of groups,"tag ""likelihood"", readability",2013-07-29 20:16:35.787 +166948,52099,16174.0,5,,CC BY-SA 3.0,43b940e2-dddc-4463-94ea-559a8b0de478,"I am trying to investigate the following problem using multinomial likelihoods and could really do with some advice regarding it's appropriateness and implementation in R. + +A sequence is generated by selecting with replacement from a bag of n differently coloured balls and consists of the number of occurrences of each colour in the selection (i.e. each sequence is a vector of length n with each element a count corresponding to the number of occurrences of a particular colour in the sequence). The process is then repeated a number of times to generate a group of unique sequences (duplicate sequences are rejected). 
+ +If a single sequence is selected at random as the test subject and a multinomial model is generated for each of the other sequences, using the colour count proportions as probabilities, can the likelihood be calculated for each multinomial model in the group using the test sequence as the data and would the greatest likelihood indicate the most alike sequence from the group? + +I have tried implementing this in R but am struggling with a couple of points. + +1. Calculating the likelihood fails if the number of colours is large since the factorial term falls out of bounds. +2. If the number of occurrences of each colour relative to the total number of colours is small then the probability is small and the product of the $p^x$ terms tends to zero. + +I hope this makes sense and somebody is able to offer some advice. + +Thank you in advance for your help.","tag ""likelihood"", readability",2013-07-29 20:16:35.787 +166952,52099,15827.0,5,,CC BY-SA 3.0,cf9bc69f-59a8-43cf-9e26-20edab46ba75,"I am trying to investigate the following problem using multinomial likelihoods and could really do with some advice regarding its appropriateness and implementation in R. + +A sequence is generated by selecting with replacement from a bag of n differently coloured balls and consists of the number of occurrences of each colour in the selection (i.e. each sequence is a vector of length n with each element a count corresponding to the number of occurrences of a particular colour in the sequence). The process is then repeated a number of times to generate a group of unique sequences (duplicate sequences are rejected). + +If a single sequence is selected at random as the test subject and a multinomial model is generated for each of the other sequences, using the colour count proportions as probabilities, can the likelihood be calculated for each multinomial model in the group using the test sequence as the data and would the greatest likelihood indicate the most alike sequence from the group? + +I have tried implementing this in R but am struggling with a couple of points. + +1. Calculating the likelihood fails if the number of colours is large since the factorial term falls out of bounds. +2. If the number of occurrences of each colour relative to the total number of colours is small then the probability is small and the product of the $p^x$ terms tends to zero. + +I hope this makes sense and somebody is able to offer some advice. +","tag ""likelihood"", readability",2013-07-29 20:16:35.787 +167012,52126,20367.0,1,,CC BY-SA 3.0,0855c6cc-9d31-4d3e-959d-fbda2d699811,How to evaluate Likert scale data changes over multiple serveys of the same group?,,2013-07-30 00:44:55.017 +186114,57399,6813.0,1,,CC BY-SA 3.0,9fb279df-c7cf-483c-a537-7a1203d9c279,Modelling probabilties within friend sets,,2013-10-13 18:37:21.500 +167026,52126,594.0,5,,CC BY-SA 3.0,e22c5de6-f20e-4e5c-a4ed-b9ec5188cc3d,"I have five surveys of the same group of students over a semester. Each survey uses a 5-point Likert scale. The first and last survey contain some questions dealing with the beginning and end of the class (first impressions, final impressions), but most of the questions are identical for all four or five of the surveys. + +I want to evaluate the statistical significance of changes to students' responses over time. Unfortunately statistics is not my strong suit. I know of the t-test, but that seems to only be applicable to two groups of data (please correct me if I'm wrong). How should I go about evaluating this data? 
Is a repeated measures one-way ANOVA appropriate? +",Improved Formating,2013-07-30 04:55:26.877 +167030,52126,,6,,CC BY-SA 3.0,e803cf51-4e85-43d0-876e-40b140db677f,,Improved Formating,2013-07-30 04:55:26.877 +167025,52126,594.0,4,,CC BY-SA 3.0,e22c5de6-f20e-4e5c-a4ed-b9ec5188cc3d,How to evaluate Likert scale data changes over multiple surveys of the same group?,Improved Formating,2013-07-30 04:55:26.877 +167028,52126,,5,,CC BY-SA 3.0,e803cf51-4e85-43d0-876e-40b140db677f,"I have `5` surveys of the same group of students over a semester. Each survey uses a `5` point likert scale. The first and last survey contain some questions dealing with the beginning and end of the class (first impressions, final impressions), but most of the questions are identical for all 4 or 5 of the surveys. + +I want to evaluate the statistical significance of changes to students' responses over time. Unfortunately statistics is not my strong suit. I know of the `t-test`, but that seems to only be applicable to two groups of data (please correct me if I'm wrong). How should I go about evaluating this data? Is a repeated measures one way ANOVA appropriate?",Improved Formating,2013-07-30 04:55:26.877 +167027,52126,,24,,CC BY-SA 3.0,e803cf51-4e85-43d0-876e-40b140db677f,,Proposed by 27576 approved by -1 edit id of 4472,2013-07-30 04:55:26.877 +168110,52449,18845.0,2,,CC BY-SA 3.0,9c086957-2d21-409a-9930-bcae298f80da,"I've found two definitions in the literature for the autocorrelation time of a weakly stationary time series: + +$$ +\tau_a = 1+2\sum_{k=1}^\infty \rho_k \quad \text{versus} \quad \tau_b = 1+2\sum_{k=1}^\infty \left|\rho_k\right| +$$ + +where $\rho_k = \frac{\text{Cov}[X_t,X_{t+h}]}{\text{Var}[X_t]}$ is the autocorrelation at lag $k$. + +One application of the autocorrelation time is to find the ""effective sample size"": if you have $n$ observations of a time series, and you know its autocorrelation time $\tau$, then you can pretend that you have + +$$ +n_\text{eff} = \frac{n}{\tau} +$$ + +independent samples instead of $n$ correlated ones for the purposes of finding the mean. Estimating $\tau$ from data is non-trivial, but there are a few ways of doing it (see [Thompson 2010](http://arxiv.org/abs/1011.0175)). + +The definition without absolute values, $\tau_a$, seems more common in the literature; but it admits the possibility of $\tau_a<1$. Using R and the ""coda"" package: + + require(coda) + ts.uncorr <- arima.sim(model=list(),n=10000) # white noise + ts.corr <- arima.sim(model=list(ar=-0.5),n=10000) # AR(1) + effectiveSize(ts.uncorr) # Sanity check + # result should be close to 10000 + effectiveSize(ts.corr) + # result is in the neighborhood of 30000... ??? + +The ""effectiveSize"" function in ""coda"" uses a definition of the autocorrelation time equivalent to $\tau_a$, above. There are some other R packages out there that compute effective sample size or autocorrelation time, and all the ones I've tried give results consistent with this: that an AR(1) process with a negative AR coefficient has *more* effective samples than the correlated time series. This seems strange. + +Obviously, this can never happen in the $\tau_b$ definition of autocorrelation time. + +What is the correct definition of autocorrelation time? Is there something wrong with my understanding of effective sample sizes? The $n_\text{eff} > n$ result shown above seems like it must be wrong... 
what's going on?",,2013-08-02 14:46:27.663 +168111,52449,18845.0,1,,CC BY-SA 3.0,9c086957-2d21-409a-9930-bcae298f80da,Definition of autocorrelation time (for effective sample size),,2013-08-02 14:46:27.663 +168109,52449,18845.0,3,,CC BY-SA 3.0,9c086957-2d21-409a-9930-bcae298f80da,,,2013-08-02 14:46:27.663 +168161,52449,,25,,,c8f8e9d8-4c26-45ad-9f90-8307cfabccb3,,http://twitter.com/#!/StackStats/status/363355683162304512,2013-08-02 17:48:40.210 +168575,52567,728.0,2,,CC BY-SA 3.0,44b5e16d-f404-48b7-a340-add207e85da9,"In linear regression, $Y= X\beta$, why is $X$ called the design matrix? Can $X$ be designed or constructed arbitrarily to some degree as in art? Thanks!",,2013-08-04 18:26:28.673 +168576,52567,728.0,1,,CC BY-SA 3.0,44b5e16d-f404-48b7-a340-add207e85da9,"Meaning of ""design"" in design matrix?",,2013-08-04 18:26:28.673 +168574,52567,728.0,3,,CC BY-SA 3.0,44b5e16d-f404-48b7-a340-add207e85da9,,,2013-08-04 18:26:28.673 +169744,52871,18447.0,2,,CC BY-SA 3.0,6a4385d8-75a6-4576-9047-18129738dddb,"I want to analyze a multilevel multidimensional model in WinBUGS. the model is as below (N students responding to J items of a test, students are nested within J schools): + + model{ + #responses + for(i in 1:N){ + for(j in 1:K){ + logit(p[i,j])<- a1[j]*t[i,1]+a2[j]*t[i,2]-b[j] + y[i,j]~dbern(p[i,j] }#for j + + t[i,1:2]~dmnorm(mu[sc[i],1:2],tau.p[1:2,1:2]) + }#for i + + #school level + for(j in 1:J){ + mu[j,1:2]~dmnorm(m[j,1:2],tau.s[1:2,1:2]) + + }#for j of school + #priors + for(j in 1:J){ + m[j,1:2]~dmnorm(m0[1:2],cov[1:2,1:2]) + } + tau.p[1:2,1:2]~dwish(cov[1:2,1:2],2) + tau.s[1:2,1:2]~dwish(cov[1:2,1:2],2) + sigma.p[1:2,1:2]<-inverse(tau.p[,]) + sigma.s[1:2,1:2]<-inverse(tau.s[,]) + s2p<-sum(sigma.p[,]) + s2s<-sum(sigma.s[,]) + rho<-(s2s)/(s2s+s2p) + a1[1]~dlnorm(0,4) + a2[1]<-0 + b[1]~dnorm(0,1) + for(s in 2:K) { + a1[s]~dlnorm(0,4) + a2[s]~dlnorm(0,4) + b[s]~dnorm(0,1) + + }#for s of items + }#for model + +I've set these functions as initial values: + + ini<-function(){ + list(tau.p=matrix(rgamma(4,100,100),2,2), + tau.s=matrix(rgamma(4,100,100),2,2), + t=rmvnorm(2362,mean=c(0,0),sigma=diag(2)), + m=rmvnorm(116,mean=c(0,0),sigma=diag(2)), + mu=rmvnorm(116,mean=c(0,0),sigma=diag(2)), + a1=rlnorm(45,0, 0.4), + a2=c(NA,rlnorm(44,0, 0.4)), + b=rnorm(45)) + } +I use rube package in R to check and run my analysis and everything looks fine. When I run the model I receive ""Trap 66 (postcondition violated)"" or ""undefined real result"". I think the problem is from the initials but I have no idea how to solve it. + +Any idea? +",,2013-08-09 03:06:43.727 +169746,52871,18447.0,3,,CC BY-SA 3.0,6a4385d8-75a6-4576-9047-18129738dddb,,,2013-08-09 03:06:43.727 +169745,52871,18447.0,1,,CC BY-SA 3.0,6a4385d8-75a6-4576-9047-18129738dddb,Trap 66 in WinBUGS in a hierarchical Bayesian modeling,,2013-08-09 03:06:43.727 +171096,53261,449.0,2,,CC BY-SA 3.0,e75aac2b-9bd3-4627-94a6-275cbe8682cb,"I was recently looking at a paper in the journal *Psychological Science* and came across this: + +*F*(1, 71) = 4.5, *p* = .037, $\mu^2$ = .06 + +*F*(1, 71) = 0.08, *p* = .78, $\mu^2$ = .001 + +I was wondering what the $\mu^2$ is in the above. Typically in APA the third thing should be either the MSE or it should be a standardized effect size (or you should have all 4). I'm guessing it's a standardized effect size of some sort but I'm not familiar with it and searching the net has turned up nothing. The actual effect, as near as I can tell from the graph, is about 12 for the first one. 
+ +Is this an effect size I haven't heard of yet or a typo in the article? + +Farrelly, D., Slater, R., Elliott, H. R., Walden, H. R. and Wetherell, M. A. (2013) Competitors Who Choose to Be Red Have Higher Testosterone Levels. *Psychological Science*, DOI:10.1177/0956797613482945",,2013-08-14 03:06:30.697 +171095,53261,449.0,1,,CC BY-SA 3.0,e75aac2b-9bd3-4627-94a6-275cbe8682cb,What is the $\mu^2$ squared effect size?,,2013-08-14 03:06:30.697 +186161,57416,22507.0,2,,CC BY-SA 3.0,1e4a3ab0-f21d-4abb-96e6-feeab411acf7,"Multinomial test, if I understand you correctly.",,2013-10-14 02:36:19.860 +169749,52871,18447.0,5,,CC BY-SA 3.0,39a8bbe3-e716-41d6-b8f6-0681415ddb55,"I want to analyze a multilevel multidimensional model in WinBUGS. the model is as below (N students responding to J items of a test, students are nested within J schools): + + model{ + #responses + for(i in 1:N){ + for(j in 1:K){ + logit(p[i,j])<- a1[j]*t[i,1]+a2[j]*t[i,2]-b[j] + y[i,j]~dbern(p[i,j] }#for j + + t[i,1:2]~dmnorm(mu[sc[i],1:2],tau.p[1:2,1:2]) + }#for i + + #school level + for(j in 1:J){ + mu[j,1:2]~dmnorm(m[j,1:2],tau.s[1:2,1:2]) + + }#for j of school + #priors + for(j in 1:J){ + m[j,1:2]~dmnorm(m0[1:2],cov[1:2,1:2]) + #m0=c(0,0) #cov=diag(2) + } + tau.p[1:2,1:2]~dwish(cov[1:2,1:2],2) + tau.s[1:2,1:2]~dwish(cov[1:2,1:2],2) + sigma.p[1:2,1:2]<-inverse(tau.p[,]) + sigma.s[1:2,1:2]<-inverse(tau.s[,]) + s2p<-sum(sigma.p[,]) + s2s<-sum(sigma.s[,]) + rho<-(s2s)/(s2s+s2p) + a1[1]~dlnorm(0,4) + a2[1]<-0 + b[1]~dnorm(0,1) + for(s in 2:K) { + a1[s]~dlnorm(0,4) + a2[s]~dlnorm(0,4) + b[s]~dnorm(0,1) + + }#for s of items + }#for model + +I've set these functions as initial values: + + ini<-function(){ + list(tau.p=matrix(rgamma(4,100,100),2,2), + tau.s=matrix(rgamma(4,100,100),2,2), + t=rmvnorm(2362,mean=c(0,0),sigma=diag(2)), + m=rmvnorm(116,mean=c(0,0),sigma=diag(2)), + mu=rmvnorm(116,mean=c(0,0),sigma=diag(2)), + a1=rlnorm(45,0, 0.4), + a2=c(NA,rlnorm(44,0, 0.4)), + b=rnorm(45)) + } +I use rube package in R to check and run my analysis and everything looks fine. When I run the model I receive ""Trap 66 (postcondition violated)"" or ""undefined real result"". I think the problem is from the initials but I have no idea how to solve it. + +Any idea?",added 42 characters in body,2013-08-09 03:48:10.500 +169753,52871,18447.0,5,,CC BY-SA 3.0,cb38a76a-5d35-4f26-b8cc-1814f7c11b3a,"I want to analyze a multilevel multidimensional model in WinBUGS. 
the model is as below (N=2362 students responding to K=45 items of a test, students are nested within J=116 schools): + + model{ + #responses + for(i in 1:N){ + for(j in 1:K){ + logit(p[i,j])<- a1[j]*t[i,1]+a2[j]*t[i,2]-b[j] + y[i,j]~dbern(p[i,j] }#for j + + t[i,1:2]~dmnorm(mu[sc[i],1:2],tau.p[1:2,1:2]) + }#for i + + #school level + for(j in 1:J){ + mu[j,1:2]~dmnorm(m[j,1:2],tau.s[1:2,1:2]) + + }#for j of school + #priors + for(j in 1:J){ + m[j,1:2]~dmnorm(m0[1:2],cov[1:2,1:2]) + #m0=c(0,0) #cov=diag(2) + } + tau.p[1:2,1:2]~dwish(cov[1:2,1:2],2) + tau.s[1:2,1:2]~dwish(cov[1:2,1:2],2) + sigma.p[1:2,1:2]<-inverse(tau.p[,]) + sigma.s[1:2,1:2]<-inverse(tau.s[,]) + s2p<-sum(sigma.p[,]) + s2s<-sum(sigma.s[,]) + rho<-(s2s)/(s2s+s2p) + a1[1]~dlnorm(0,4) + a2[1]<-0 + b[1]~dnorm(0,1) + for(s in 2:K) { + a1[s]~dlnorm(0,4) + a2[s]~dlnorm(0,4) + b[s]~dnorm(0,1) + + }#for s of items + }#for model + +I've set these functions as initial values: + + ini<-function(){ + list(tau.p=matrix(rgamma(4,100,100),2,2), + tau.s=matrix(rgamma(4,100,100),2,2), + t=rmvnorm(N,mean=c(0,0),sigma=diag(2)), + m=rmvnorm(J,mean=c(0,0),sigma=diag(2)), + mu=rmvnorm(J,mean=c(0,0),sigma=diag(2)), + a1=rlnorm(K,0, 0.4), + a2=c(NA,rlnorm(K-1,0, 0.4)), + b=rnorm(45,0,0.5)) + } +I use rube package in R to check and run my analysis and everything looks fine. When I run the model I receive ""Trap 66 (postcondition violated)"" or ""undefined real result"". I think the problem is from the initials but I have no idea how to solve it. + +Any idea? +",added 13 characters in body,2013-08-09 04:12:10.890 +169871,52910,3731.0,3,,CC BY-SA 3.0,1c09ff82-129e-430e-939d-b84b5c0f65cd,,,2013-08-09 13:48:43.087 +169870,52910,3731.0,1,,CC BY-SA 3.0,1c09ff82-129e-430e-939d-b84b5c0f65cd,What are typically encountered condition numbers in social science?,,2013-08-09 13:48:43.087 +169869,52910,3731.0,2,,CC BY-SA 3.0,1c09ff82-129e-430e-939d-b84b5c0f65cd,"As part of my thesis, I'm proving (or attempting to prove...) a few asymptotic results. Because these results depend on the condition number, I'd like to have some idea about the typical sizes of a condition numbers that crop up in social science research. That way, I can give some guidance about how large the sample size has to be before we reach the happy land of asymptopia. + +The setup is as follows. For the standard Generalized Least Squares (GLS) model + +$$Y = X\beta + e \quad \quad \quad e \sim N(0, V\sigma^2) $$ + +where $V$ is assumed to be known and positive definite, we define + +$$ X^- = (X^\top X)^{-1} X^\top \quad \quad \quad U = (I-XX^-)V$$ +and the condition number $\kappa$ + +$$ \kappa = \frac{ \lambda_{\text{max}} }{ \lambda_{\text{min}} } $$ + +where the $\lambda_\star$ values are the maximum and minimum eigenvalues of the matrix $U$. + +Does anyone have pointers to references for the sizes of condition numbers in social science research? I don't even know where to look. Any pointers for either + + 1. GLS models (as I've posed it), + 2. REML/ML models where $V$ is + estimated and then conditioned upon, or + 3. fixed effect only models + where $V$ is the identity matrix + +would be most welcome!",,2013-08-09 13:48:43.087 +169872,52910,3731.0,5,,CC BY-SA 3.0,232d5027-98a5-42a7-bea8-21bb6599d718,"As part of my thesis, I'm proving (or attempting to prove...) a few asymptotic results. Because these results depend on the condition number, I'd like to have some idea about the typical sizes of a condition numbers that crop up in social science research. 
That way, I can give some guidance about how large the sample size has to be before we reach the happy land of asymptopia. + +I'd be happy for any guidance. + +**My very specific** setup is as follows. For the standard Generalized Least Squares (GLS) model + +$$Y = X\beta + e \quad \quad \quad e \sim N(0, V\sigma^2) $$ + +where $V$ is assumed to be known and positive definite, we define + +$$ X^- = (X^\top X)^{-1} X^\top \quad \quad \quad U = (I-XX^-)V$$ +and the condition number $\kappa$ + +$$ \kappa = \frac{ \lambda_{\text{max}} }{ \lambda_{\text{min}} } $$ + +where the $\lambda_\star$ values are the maximum and minimum eigenvalues of the matrix $U$. + +Does anyone have pointers to references for the sizes of condition numbers in social science research? I don't even know where to look. Any pointers for either + + 1. OLS estimators (used incorrectly in a GLS context as posed above) + 2. GLS estimators (correctly analyzed) + 3. REML/ML estimators where $V$ is + estimated and then conditioned upon, or + 4. OLS fixed effect only models + where $V$ is the identity matrix + +would be most welcome!","improved formatting, clarified that my definition of a condition number is non-standard",2013-08-09 14:00:16.667 +171094,53261,449.0,3,,CC BY-SA 3.0,e75aac2b-9bd3-4627-94a6-275cbe8682cb,,,2013-08-14 03:06:30.697 +171099,53264,17249.0,2,,CC BY-SA 3.0,1b55933c-6460-4fff-8508-50bba11fee23,"I can only think of this referring to $\eta^2$, the proportion of variance explained in the dependent variable by the grouping variable (in this case, a binary variable). This would be indeed the same value as the $R^2$ obtained if the difference between the two groups was estimated using simple linear regression: + +$y_i=\beta_0+\beta_1group_i+\epsilon_i$",,2013-08-14 03:24:19.637 +185659,57270,22593.0,1,,CC BY-SA 3.0,1a8e6a02-0e64-4b5f-857f-40ecb23b1df6,Simulating groups different with respect to the orthogonal complement in R,,2013-10-11 00:34:33.700 +171100,53264,17249.0,5,,CC BY-SA 3.0,a212ce65-b0c8-4321-a136-3a63f4d0db73,"I can only think of this referring to $\eta^2$, the proportion of variance explained in the dependent variable by the grouping variable (in this case, a binary variable). This would be indeed the same value as the $R^2$ obtained if the difference between the two groups was estimated using simple linear regression: + +$y_i=\beta_0+\beta_1group_i+\epsilon_i$ + +I can see from the paper that the second F test is actually that of an interaction term, and since it has 1 degree of freedom, I am deducing that the second factor was also a binary variable. In this case, the $\eta^2$'s are partial $\eta^2$'s, which are the proportion of variance explained by the grouping variable (or the interaction term) controlling for the other grouping variable. In this more complex case, the partial $\eta^2$'s are the same as the partial $R^2$'s obtained from the multiple linear regression: + +$y_i=\beta_0+\beta_1group_{1i}+\beta_2group_{1i}+\beta_3 \cdot group_{1i} \cdot group_{2i} + \epsilon_i$",added 626 characters in body,2013-08-14 03:30:42.040 +171102,53264,17249.0,5,,CC BY-SA 3.0,ae3d442b-7faa-4944-9a35-776cd498365c,"I can only think of this referring to $\eta^2$, computed as: + +$\eta^2={SS_{effect} \over SS_{total}}$ + +This is the proportion of variance explained in the dependent variable by the grouping variable (in this case, a binary variable). 
This would be indeed the same value as the $R^2$ obtained if the difference between the two groups was estimated using simple linear regression: + +$y_i=\beta_0+\beta_1group_i+\epsilon_i$ + +I can see from the paper that the second F test is actually that of an interaction term, and since it has 1 degree of freedom, I am deducing that the second factor was also a binary variable. In this case, the $\eta^2$'s are partial $\eta^2$'s, which are the proportion of variance explained by the grouping variable (or the interaction term) controlling for the other grouping variable. In this more complex case, the partial $\eta^2$'s are the same as the partial $R^2$'s obtained from the multiple linear regression: + +$y_i=\beta_0+\beta_1group_{1i}+\beta_2group_{1i}+\beta_3 \cdot group_{1i} \cdot group_{2i} + \epsilon_i$",added formula to eta²,2013-08-14 03:57:01.523 +171106,53261,155.0,5,,CC BY-SA 3.0,2712bc6f-255d-4bba-8b80-7484f1ea94cd,"I was recently looking at a paper in the journal *Psychological Science* and came across this: + +*F*(1, 71) = 4.5, *p* = .037, $\mu^2$ = .06 + +*F*(1, 71) = 0.08, *p* = .78, $\mu^2$ = .001 + +I was wondering what the $\mu^2$ is in the above. Typically in APA the third thing should be either the MSE or it should be a standardized effect size (or you should have all 4). I'm guessing it's a standardized effect size of some sort but I'm not familiar with it and searching the net has turned up nothing. The actual effect, as near as I can tell from the graph, is about 12 for the first one. + +Is this an effect size I haven't heard of yet or a typo in the article? + +Farrelly, D., Slater, R., Elliott, H. R., Walden, H. R. and Wetherell, M. A. (2013) Competitors Who Choose to Be Red Have Higher Testosterone Levels. *Psychological Science*, DOI:10.1177/0956797613482945 + +Here's a screen shot of the text (p.2) + +![enter image description here][1] + + + [1]: https://i.stack.imgur.com/h5BtM.png",thought the screen shot might be useful.,2013-08-14 04:04:45.867 +171480,53384,20838.0,1,,CC BY-SA 3.0,5aa121c9-6908-487b-b82f-9a46cda3e493,Bootstrapping residuals: Am I doing it right?,,2013-08-15 21:35:20.763 +171481,53384,20838.0,3,,CC BY-SA 3.0,5aa121c9-6908-487b-b82f-9a46cda3e493,,,2013-08-15 21:35:20.763 +171479,53384,20838.0,2,,CC BY-SA 3.0,5aa121c9-6908-487b-b82f-9a46cda3e493,"**First of all:** +From what I understood, bootstrapping residuals works as follows: + + 1. Fit model to data + 2. Calculate the residuals + 3. Resample the residuals and add them to 1. + 4. Fit model to new dataset from 3. + 5. Repeat `n` times, but always add the resampled residuals to the fit + from 1. + +Is that correct so far? + +--- +**What I want to do** is something slightly different: + +I want to estimate parameter and prediction uncertainty for an algorithm that estimates some environmental variable. + +What I have is a error-free time-series (from a simulation) of that variable, `x_true`, to which I add some noise, `x_noise`, in order to generate a synthetic dataset `x`. +I then try to find optimal parameters by fitting my algorithm with the sum of squares `sum((x_estimate - x_true)^2)` (! not `x_estimate - x` !) as an objective function. In order to see how my algorithm performs and to create samples of my parameters' distributions, I want to resample `x_noise`, add it to `x_true`, fit my model again, rinse and repeat. Is that a valid approach to assess parameter uncertainty? 
Can I interpret the fits to the bootstrapped datasets as prediction uncertainty, or do I have to follow the procedure I posted above? + +Sorry if I'm not very clear with terminology, English isn't my first language and I'm pretty new to this.",,2013-08-15 21:35:20.763 +171488,53384,20838.0,5,,CC BY-SA 3.0,8eef2ba8-8e8e-4738-a9af-d1569f0bc044,"**First of all:** +From what I understood, bootstrapping residuals works as follows: + + 1. Fit model to data + 2. Calculate the residuals + 3. Resample the residuals and add them to 1. + 4. Fit model to new dataset from 3. + 5. Repeat `n` times, but always add the resampled residuals to the fit + from 1. + +Is that correct so far? + +--- +**What I want to do** is something slightly different: + +I want to estimate parameter and prediction uncertainty for an algorithm that estimates some environmental variable. + +What I have is a error-free time-series (from a simulation) of that variable, `x_true`, to which I add some noise, `x_noise`, in order to generate a synthetic dataset `x`. +I then try to find optimal parameters by fitting my algorithm with the sum of squares `sum((x_estimate - x_true)^2)` (! not `x_estimate - x` !) as an objective function. In order to see how my algorithm performs and to create samples of my parameters' distributions, I want to resample `x_noise`, add it to `x_true`, fit my model again, rinse and repeat. Is that a valid approach to assess parameter uncertainty? Can I interpret the fits to the bootstrapped datasets as prediction uncertainty, or do I have to follow the procedure I posted above? + +Sorry if I'm not very clear with terminology, English isn't my first language and I'm pretty new to this. + +/edit: I think I haven't really made clear what my model does. Think of it as essentially something like a de-noising method. It's not a predictive model, it's an algorithm that tries to extract the underlying signal of a noisy time-series of environmental data.",added 266 characters in body,2013-08-15 22:13:08.573 +171503,53384,20838.0,6,,CC BY-SA 3.0,f4b44eb1-97b7-4c31-aeea-f628927ae5a0,,added tag,2013-08-15 23:33:09.623 +171505,53391,20312.0,1,,CC BY-SA 3.0,148e1d2c-00af-4152-8635-e634fb590995,How do I calculate random baseline?,,2013-08-15 23:38:00.327 +171506,53391,20312.0,3,,CC BY-SA 3.0,148e1d2c-00af-4152-8635-e634fb590995,,,2013-08-15 23:38:00.327 +171504,53391,20312.0,2,,CC BY-SA 3.0,148e1d2c-00af-4152-8635-e634fb590995,"I am a bit confused as to how to calculate random baseline. If I understand correctly the random baseline is calculated by adding up the squared probabilities of all the classes. The random baseline classifier thus picks a class at random, instead of choosing the most frequent one. + +I have 7classes, each with # of items and a total of X. How do I find the probabilities? I have a clue, but I don't want to be mistaken as this is for an important piece of work for me. Thanks! :)",,2013-08-15 23:38:00.327 +171507,53384,20838.0,5,,CC BY-SA 3.0,44eed4c4-3e25-407a-9b37-1281ec9d6142,"**First of all:** +From what I understood, bootstrapping residuals works as follows: + + 1. Fit model to data + 2. Calculate the residuals + 3. Resample the residuals and add them to 1. + 4. Fit model to new dataset from 3. + 5. Repeat `n` times, but always add the resampled residuals to the fit + from 1. + +Is that correct so far? + +--- +**What I want to do** is something slightly different: + +I want to estimate parameter and prediction uncertainty for an algorithm that estimates some environmental variable. 
+ +What I have is a error-free time-series (from a simulation) of that variable, `x_true`, to which I add some noise, `x_noise`, in order to generate a synthetic dataset `x`. +I then try to find optimal parameters by fitting my algorithm with the sum of squares `sum((x_estimate - x_true)^2)` (! not `x_estimate - x` !) as an objective function. In order to see how my algorithm performs and to create samples of my parameters' distributions, I want to resample `x_noise`, add it to `x_true`, fit my model again, rinse and repeat. Is that a valid approach to assess parameter uncertainty? Can I interpret the fits to the bootstrapped datasets as prediction uncertainty, or do I have to follow the procedure I posted above? + +Sorry if I'm not very clear with terminology, English isn't my first language and I'm pretty new to this. + +/edit: I think I haven't really made clear what my model does. Think of it as essentially something like a de-noising method. It's not a predictive model, it's an algorithm that tries to extract the underlying signal of a noisy time-series of environmental data. + + +/edit^2: **For the MATLAB-Users** out there, I wrote down some quick & dirty linear regression example of what I mean. + +This is what I believe ""ordinary"" bootstrapping of residuals is (please correct me if I'm wrong): http://pastebin.com/C0CJp3d1 + +This is what I want to do: http://pastebin.com/6748SLib",fixed one code example,2013-08-15 23:57:54.450 +171508,53384,,25,,,5673a4b4-4fcc-45f3-9636-9e898a52a0ce,,http://twitter.com/#!/StackStats/status/368161180998975488,2013-08-16 00:04:00.307 +171510,53391,15827.0,5,,CC BY-SA 3.0,6dd71773-15a3-4b18-b028-ab77c12f9003,"I am a bit confused as to how to calculate random baseline. If I understand correctly the random baseline is calculated by adding up the squared probabilities of all the classes. The random baseline classifier thus picks a class at random, instead of choosing the most frequent one. + +I have 7 classes, each with # of items and a total of X. How do I find the probabilities? ","""important for you"" not a criterion for anyone else; the question stands well on its own",2013-08-16 00:52:39.100 +171553,53404,20820.0,1,,CC BY-SA 3.0,69df8ff1-fdef-4002-b43d-fdc69c550f5b,why do we use one-tail test [F-test] in anova?,,2013-08-16 06:36:58.590 +171552,53404,20820.0,2,,CC BY-SA 3.0,69df8ff1-fdef-4002-b43d-fdc69c550f5b,"state reason for using one tail in the analysis of variance test ? + + + +why do we use one-tail test [F-test] in anova ?",,2013-08-16 06:36:58.590 +171554,53404,20820.0,3,,CC BY-SA 3.0,69df8ff1-fdef-4002-b43d-fdc69c550f5b,,,2013-08-16 06:36:58.590 +171582,53404,20820.0,6,,CC BY-SA 3.0,ac1b0109-213a-4c0c-81d1-2e59b27cc5e8,,edited tags,2013-08-16 09:21:43.467 +171673,53439,5208.0,3,,CC BY-SA 3.0,5e2a9f0d-40f5-4148-8e2e-af5b54a8c897,,,2013-08-16 15:35:05.747 +171672,53439,5208.0,1,,CC BY-SA 3.0,5e2a9f0d-40f5-4148-8e2e-af5b54a8c897,How would you frame this as a machine learning problem?,,2013-08-16 15:35:05.747 +171671,53439,5208.0,2,,CC BY-SA 3.0,5e2a9f0d-40f5-4148-8e2e-af5b54a8c897,"I have a trading software that buys and sells loans. There's an auction site where borrowers ask for some money and lenders bid on them until the borrower is fully funded and the auction ends. There's lots of information on each loan request. My trading bot always bids at the highest possible interest rate, if it is outbid, then it just re-bids slightly lower. Once I win the loan parts, I can sell them at a markup. 
Right now, I sell at the minimum markup, so that with fees I barely make a profit. + +What I'm not sure is what markup I should sell? The lower the markup the faster my loan parts sell, but I will get less profit too. On what loans should I bid? Should I bid on a loan auction with a higher interest rate, but which is not going to end for several days, thereby leaving my money stale, or should I bid on an auction with a lower interest rate, but which is going to end very soon. Sometimes in the former case, the borrower might decide to take the loan and not wait until the end of the auction, thereby I could secure a better interest rate than just bidding on the loan auction due to end soon. + +I was thinking of framing this problem as reinforcement learning, but I'm not sure how to do it. My goal is to maximiz the profit I make from trading loans. Any ideas?",,2013-08-16 15:35:05.747 +171792,53471,11155.0,2,,CC BY-SA 3.0,5cf3d5ab-3ab2-40b1-bc6e-63dae8ab0233,I think adaptive in this context just means the reestimation on a rolling basis. So the parameter should not change until there is a change point. Then the true parameter increases and stays constant after it decreases again because of the second change point. The estimated parameter is evaluated compared to the true parameter: How fast does it get the change point? How fast does it *adapt* to the new environment?,,2013-08-17 07:53:38.020 +171810,49879,,25,,,69536709-1587-453a-ab99-d05538c6e3f0,,http://twitter.com/#!/StackStats/status/368705372057374721,2013-08-17 12:06:25.727 +171972,40104,4656.0,4,,CC BY-SA 3.0,6355eb14-36ae-4ea1-8219-56329f6b50f9,The _weighted_ sum of two independent Poisson random variables,"added the word ""weighted"" in the title to distinguish this question from the more usual one",2013-08-18 13:02:31.897 +172127,53384,20838.0,5,,CC BY-SA 3.0,15adea5f-93f7-454f-9215-8852efdc7a71,"**First of all:** +From what I understood, bootstrapping residuals works as follows: + + 1. Fit model to data + 2. Calculate the residuals + 3. Resample the residuals and add them to 1. + 4. Fit model to new dataset from 3. + 5. Repeat `n` times, but always add the resampled residuals to the fit + from 1. + +Is that correct so far? + +--- +**What I want to do** is something slightly different: + +I want to estimate parameter and prediction uncertainty for an algorithm that estimates some environmental variable. + +What I have is a error-free time-series (from a simulation) of that variable, `x_true`, to which I add some noise, `x_noise`, in order to generate a synthetic dataset `x`. +I then try to find optimal parameters by fitting my algorithm with the sum of squares `sum((x_estimate - x_true)^2)` (! not `x_estimate - x` !) as an objective function. In order to see how my algorithm performs and to create samples of my parameters' distributions, I want to resample `x_noise`, add it to `x_true`, fit my model again, rinse and repeat. Is that a valid approach to assess parameter uncertainty? Can I interpret the fits to the bootstrapped datasets as prediction uncertainty, or do I have to follow the procedure I posted above? + +Sorry if I'm not very clear with terminology, English isn't my first language and I'm pretty new to this. + +/edit: I think I haven't really made clear what my model does. Think of it as essentially something like a de-noising method. It's not a predictive model, it's an algorithm that tries to extract the underlying signal of a noisy time-series of environmental data. 
+ + +/edit^2: **For the MATLAB-Users** out there, I wrote down some quick & dirty linear regression example of what I mean. + +This is what I believe ""ordinary"" bootstrapping of residuals is (please correct me if I'm wrong): http://pastebin.com/C0CJp3d1 + +This is what I want to do: http://pastebin.com/mbapsz4c",Updated a comment in second code example,2013-08-19 09:04:57.950 +176726,54724,10957.0,2,,CC BY-SA 3.0,f021eb83-6877-4ab9-a762-66d6629e4d51,"I came across a simple question on comparing flexible models (i.e. splines) vs. inflexible models (e.g. linear regression) under different scenarios. The question is: + +In general, do we expect the performance of a flexible statistical learning method to perform better or worse than an inflexible method when: + +1. The number of predictors p is extremely large, and the number of observations n is small? +2. The variance of the error terms, i.e. σ2 = Var(e), is extremely high? + +I think for (1), when n is small, inflexible models are better (not sure). For (2), I don't know which model is (relatively) better.",,2013-09-04 20:24:41.247 +176727,54724,10957.0,1,,CC BY-SA 3.0,f021eb83-6877-4ab9-a762-66d6629e4d51,flexible and inflexible models in machine learning,,2013-09-04 20:24:41.247 +174630,54234,21204.0,2,,CC BY-SA 3.0,8c05e30e-ddee-40f1-9ee2-9d06b83164b1,"I have a question regarding to time series forecasting. In particular I've been working with a Bayesian approach, but I think the question is independent from that. + +I have several time series which are very stable in time, except on specific dates that they have sudden changes. The problem is that if I use a forecasting technique that looks at the past to predict the future, such as ARIMA, the days after the sudden changes have high impact on the forecast. + +Thus, to give a simple example, suppose I'm predicting $x_{t+1} = \sum \beta_j x_j, j,,2013-08-28 15:56:16.150 +174631,54234,21204.0,1,,CC BY-SA 3.0,8c05e30e-ddee-40f1-9ee2-9d06b83164b1,Weighting time series coefficients using model's likelihood,,2013-08-28 15:56:16.150 +174889,50739,,24,,CC BY-SA 3.0,04563081-e021-43d5-97e2-6d148f1cfe08,,Proposed by 27581 approved by 919 edit id of 4944,2013-08-29 13:44:27.413 +174890,50739,,5,,CC BY-SA 3.0,04563081-e021-43d5-97e2-6d148f1cfe08,"I found two very useful posts about the difference between linear regression analysis and ANOVA and how to visualise them: + +http://stats.stackexchange.com/questions/555/why-is-anova-taught-used-as-if-it-is-a-different-research-methodology-compared + +http://stats.stackexchange.com/questions/5278/how-to-visualize-what-anova-does + +As stated in the first post, to test whether the average height of male and females is the same you can use a regression model ($y = \alpha + \beta x + \epsilon$, where $y$ denotes height and $x$ denotes gender) and test whether $\beta = 0$. If $\beta = 0$, then there is no difference in the height between males and females. However, I am not quite sure how this is tested when you have three groups. Imagine the following example: + + height (y) - group (x) + 5 - A + 6 - A + 7 - A + 6 - A + 30 - B + 32 - B + 34 - B + 33 - B + 20 - C + 19 - C + 21 - C + 22 - C + +The regression model would look like: + +$$y = a+ b x + \epsilon$$ + +I quickly visualized the data (see image below) + +They way I understood the regression model is that it would now test whether +any of the three slopes (AB, AC or BC) has a slope $b$ which is significantly different from 0. 
If that's the case one can conclude like in an ANOVA that there is at least one group in which height is significantly different from one or more groups. Afterwards, one could use a post-hoc test of course to test which of the groups really differ. Is my understanding of how the regression models tests this hypothesis correct? + + +![enter image description here][1] + + + [1]: https://i.stack.imgur.com/6LD5Q.png + +",Formatted question,2013-08-29 13:44:27.413 +175755,54506,21322.0,1,,CC BY-SA 3.0,921ae92f-7cd6-4240-b71c-c77635a51121,How to analyse this data?,,2013-09-02 04:48:52.793 +175756,54506,21322.0,3,,CC BY-SA 3.0,921ae92f-7cd6-4240-b71c-c77635a51121,,,2013-09-02 04:48:52.793 +175754,54506,21322.0,2,,CC BY-SA 3.0,921ae92f-7cd6-4240-b71c-c77635a51121,"I am conducting an experiment investigating lineup accuracy and witness confidence. + +A long story short: we want to know what the pattern of false positives, hits and misses on a lineup task are under different lineup conditions and how confidence may vary with/independently of accuracy. Logically, witness confidence may also be affected by the different conditions, and we'd like to know this as well. + +The between subjects variables are: Gender (male, female), ethnicity (asian, caucasian), and lineup type (sequential- where people see each lineup member one at a time and make a decision about each one, and simultaneous- where people see all the lineup members and make a decision about whether they see the perpetrator or not) + +The within subjects variables are: Photo type (same vs different photo of the person), lineup ethnicity (asian vs caucasian lineups), confidence (5 levels of a likert scale from 1 ""not confidence at all"" to 5 ""extremely confident) + +The dependent variable is accuracy in terms of hits, misses and false positives (these could be coded as 0 or 1?) and correct recognition (hits-false positives) + +One of the problems is that we want to know the relationship between confidence and accuracy, which would necessitate that confidence is an independent variable, however we also want to know if the other variables might affect confidence (such as ethnicity or lineup type), so I'm having trouble figuring out the best way to analyse this data. + +Does anyone have any answers for me? Someone suggested maybe logistic regression, but they weren't really sure. I'm really not used to dealing with categorical data, so am in need of help! ",,2013-09-02 04:48:52.793 +175781,54506,,25,,,909d9002-9e7f-4383-a489-621b14f3fd09,,http://twitter.com/#!/StackStats/status/374458404137607168,2013-09-02 09:06:55.293 +176102,54574,11283.0,2,,CC BY-SA 3.0,e5f2574b-278c-4706-9f6e-313c24b7df6b,"I have calculated log-likelihood distances between 50 sequences according to the Formula (1): + +$$ +D(X_i,X_j)= 1/2(\log p(X_i|Mod_j)+\log p(X_j|Mod_i)), +$$ +where $ +p(X_i|Mod_j) +$ is the likelihood of sequence X_i being produced by model Mod_j, where $Mod_j$ is a corresponding Markov model of the given $Seq_j$, defined by its Transition Probability Matrix and Start Probabilities Vector. The measure is symmetrical as seen from the definition. To make the measure more ""legible"" and similar to the traditional measures, I compute distance=(1-D) from formula (1). Thus, D(X_i,X_i) = 0 and the distance increases if the likelihood decreases. + +Now, I have a 50x50 Distance Matrix.I have run a ""meaningfullness"" check, and it seemed ok for me - i.e. more similar sequences had smaller distance and very different ones had very large distance. 
The distances seemed to satisfy the triangle inequality. However, I have noticed that: + +1) the shorter sequences seem to be ""closer"" to all other sequences than longer ones. It seems that this distance measure is biased to favor short distances. + +2) I have tried PAM-clustering with the distance matrix by converting my distance matrix to dist object in R by using as.dist(), and my results were very bad, even for 2 clusters or 49 ( max avg.silhouette width produced by R function pam was 0.28). With some numbers of clusters the avg.silhouette widths were even negative. + +I am coming to conclusion that my way of computing medoids is invalid/conceptually wrong. What could be the problem? Can log-likelihood distance matrix be used with medoids clustering at all? +",,2013-09-03 10:48:31.337 +176104,54574,11283.0,3,,CC BY-SA 3.0,e5f2574b-278c-4706-9f6e-313c24b7df6b,,,2013-09-03 10:48:31.337 +176103,54574,11283.0,1,,CC BY-SA 3.0,e5f2574b-278c-4706-9f6e-313c24b7df6b,Log-likelihood distance measure validity for clustering,,2013-09-03 10:48:31.337 +176107,49906,15473.0,6,,CC BY-SA 3.0,5395e6c4-dbee-4d21-be55-51849baf6fd4,,add one tag,2013-09-03 10:49:18.003 +176108,49906,,24,,CC BY-SA 3.0,5395e6c4-dbee-4d21-be55-51849baf6fd4,,"Proposed by 21599 approved by 17230, 686 edit id of 5155",2013-09-03 10:49:18.003 +176126,54574,,5,,CC BY-SA 3.0,cbf2f1a2-17ef-42ca-b16e-5a4b6b07621b,"I have calculated log-likelihood distances between 50 sequences according to the Formula (1): + +$$ +D(X_i,X_j)= 1/2(\log p(X_i|Mod_j)+\log p(X_j|Mod_i)), +$$ +where $ +p(X_i|Mod_j) +$ is the likelihood of sequence $X_i$ being produced by model $Mod_j$, where $Mod_j$ is a corresponding Markov model of the given $Seq_j$, defined by its Transition Probability Matrix and Start Probabilities Vector. The measure is symmetrical as seen from the definition. To make the measure more ""legible"" and similar to the traditional measures, I compute distance$=(1-D)$ from formula (1). Thus, $D(X_i,X_i) = 0$ and the distance increases if the likelihood decreases. + +Now, I have a 50x50 Distance Matrix.I have run a ""meaningfullness"" check, and it seemed ok for me - i.e. more similar sequences had smaller distance and very different ones had very large distance. The distances seemed to satisfy the triangle inequality. However, I have noticed that: + +1) the shorter sequences seem to be ""closer"" to all other sequences than longer ones. It seems that this distance measure is biased to favor short distances. + +2) I have tried PAM-clustering with the distance matrix by converting my distance matrix to dist object in `R` by using as.dist(), and my results were very bad, even for 2 clusters or 49 ( max avg.silhouette width produced by `R` function pam was 0.28). With some numbers of clusters the avg.silhouette widths were even negative. + +I am coming to conclusion that my way of computing medoids is invalid/conceptually wrong. What could be the problem? Can log-likelihood distance matrix be used with medoids clustering at all? 
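As an illustrative sketch only (with a stand-in matrix in place of the real 50x50 log-likelihood distances described above), `cluster::pam` does accept a precomputed dissimilarity matrix directly when `diss = TRUE`, which is the usage in question:

    # Sketch: PAM on a precomputed, symmetric, zero-diagonal dissimilarity matrix.
    # 'D' is a stand-in; the 50x50 log-likelihood distance matrix would replace it.
    library(cluster)

    set.seed(1)
    X <- matrix(rnorm(50 * 5), nrow = 50)
    D <- as.matrix(dist(X))                     # symmetric, zero diagonal

    fit <- pam(as.dist(D), k = 3, diss = TRUE)  # treat the input as dissimilarities
    fit$silinfo$avg.width                       # average silhouette width
    fit$id.med                                  # indices of the medoid rows

If shorter sequences really do come out closer to everything, one common (if ad hoc) adjustment is to normalise each log-likelihood by sequence length before forming the distance matrix, so that the measure behaves like a per-symbol rather than a total log-likelihood.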
+",formatted the maths parts,2013-09-03 11:43:15.713 +176125,54574,,24,,CC BY-SA 3.0,cbf2f1a2-17ef-42ca-b16e-5a4b6b07621b,,"Proposed by 26338 approved by 17230, 22047 edit id of 5168",2013-09-03 11:43:15.713 +176151,54574,11283.0,5,,CC BY-SA 3.0,7a7b098b-f4a4-4f7b-a183-f2768cef65f1,"I have calculated log-likelihood distances between 50 sequences according to the Formula (1): + +$$ +D(X_i,X_j)= 1/2(\log p(X_i|Mod_j)+\log p(X_j|Mod_i)), +$$ +where $ +p(X_i|Mod_j) +$ is the likelihood of sequence $X_i$ being produced by model $Mod_j$, where $Mod_j$ is a corresponding Markov model of the given $Seq_j$, defined by its Transition Probability Matrix and Start Probabilities Vector. The measure is symmetrical as seen from the definition. To make the measure more ""legible"" and similar to the traditional measures, I compute distance$=(1-D)$ from formula (1). Thus, $D(X_i,X_i) = 0$ and the distance increases if the likelihood decreases. + +Now, I have a 50x50 Distance Matrix.I have run a ""meaningfullness"" check, and it seemed ok for me - i.e. more similar sequences had smaller distance and very different ones had very large distance. The distances seemed to satisfy the triangle inequality. However, I have noticed that: + +1) the shorter sequences seem to be ""closer"" to all other sequences than longer ones. It seems that this distance measure is biased to favor short distances. + +2) I have tried PAM-clustering with the distance matrix by converting my distance matrix to dist object in `R` by using as.dist(), and my results were very bad, even for 2 clusters or 49 ( max avg.silhouette width produced by `R` function pam was 0.28). With some numbers of clusters the avg.silhouette widths were even negative. + +I am coming to conclusion that my way of computing medoids is invalid/conceptually wrong. What could be the problem? Can log-likelihood distance matrix be used with medoids clustering at all? + +edit: I am including the heatmap of the distance matrix, where x- and y-axis represent sequences (1 through 50th). It looks strange to me but I cannot pinpoint what exactly doesn't feel right. + +![heatmap][1] + + + [1]: https://i.stack.imgur.com/RcSBc.png",included image,2013-09-03 12:22:26.347 +176340,54622,12744.0,1,,CC BY-SA 3.0,ff32c643-c67e-43cf-81b8-9210d8867740,"Do ""true"" multi-level models require Bayesian methods?",,2013-09-03 21:37:13.153 +176339,54622,12744.0,2,,CC BY-SA 3.0,ff32c643-c67e-43cf-81b8-9210d8867740,"I've been recently learning about mixed effects models (e.g. via Fitzmaurice, Laird, and Ware 's book *Applied Longitudinal Analysis*) as well as Bayesian hierarchical models (e.g. via Gelman and Hill's book *Data Analysis Using Regression and Multilevel/Hierarchical Models*) + +One curious thing I've noticed: The Bayesian literature tends to emphasize that their models can handle covariates at multiple level of analysis. For example, if the clustering is by person, and each person is measured in multiple ""trials,"" then the Bayesian hierarchical models can investigate the main effects of covariates both at the subject and trial level, as well as interactions across ""levels."" + +However, I have not seen these kinds of models in the textbooks introducing frequentist methods. + +I'm not sure if this is a coincidence, or an example of where Bayesian methods can do ""more complicated things."" Is it possible to use mixed effects models (e.g. the lme4 or nlme packages in the R statistical software) to investigate interactions of covariates across ""levels"" of analysis? 
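For what it is worth, cross-level interactions are expressible in standard frequentist mixed-model software as well. Below is a minimal `lme4` sketch with made-up variable names, where `x` is a trial-level covariate and `z` a subject-level covariate; everything in it is hypothetical illustration rather than anything taken from the post:

    # Sketch with simulated data: trial-level covariate x, subject-level
    # covariate z, and their cross-level interaction x:z.
    library(lme4)

    set.seed(1)
    d <- data.frame(
      subject = factor(rep(1:30, each = 10)),
      x       = rnorm(300)                      # trial-level covariate
    )
    d$z <- rnorm(30)[d$subject]                 # subject-level covariate
    d$y <- 0.5 * d$x + 0.3 * d$z + 0.2 * d$x * d$z +
           rnorm(30, sd = 0.5)[d$subject] + rnorm(300)

    m <- lmer(y ~ x * z + (1 + x | subject), data = d)
    summary(m)

The `x * z` term expands to both main effects plus the cross-level interaction, and `(1 + x | subject)` lets the trial-level slope vary by subject, which is the usual frequentist counterpart of the hierarchical formulation.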
+ + + +",,2013-09-03 21:37:13.153 +176341,54622,12744.0,3,,CC BY-SA 3.0,ff32c643-c67e-43cf-81b8-9210d8867740,,,2013-09-03 21:37:13.153 +176350,54624,503.0,2,,CC BY-SA 3.0,3dc1b12b-fe5f-4f8d-a2b6-9abb55692113,"Yes it is. I don't know the commands in `R` but in `SAS PROC MIXED` you can have variables at either level in the MODEL statement and you can include interactions. e.g., a split plot design + + proc mixed; + class A B Block; + model Y = A B A*B; + random Block A*Block; + run; + +where A is assigned to whole plots and B is assigned to subplots. + + +",,2013-09-03 22:08:41.237 +176398,54637,21382.0,1,,CC BY-SA 3.0,1b932e59-71cc-4d8e-9ebc-9e42822b7b88,how to get pooled p-values on tests done in multiple imputed datasets?,,2013-09-04 01:06:26.173 +176399,54637,21382.0,3,,CC BY-SA 3.0,1b932e59-71cc-4d8e-9ebc-9e42822b7b88,,,2013-09-04 01:06:26.173 +176397,54637,21382.0,2,,CC BY-SA 3.0,1b932e59-71cc-4d8e-9ebc-9e42822b7b88,"Using Amelia in R, I obtained multiple imputed datasets. After that, I performed a repeated measures test in SPSS. Now, I want to pool test results. I know that I can use Rubin's rules (implemented through any multiple imputation package in R) to pool means and standard erros, but how do I pool p-values? Is it possible? Is there a function in R to do so? +Thanks in advance.",,2013-09-04 01:06:26.173 +176403,54637,,24,,CC BY-SA 3.0,55d21b0f-722c-413f-8362-eae00df3a582,,"Proposed by 21599 approved by 601, 805 edit id of 5211",2013-09-04 01:41:45.727 +176404,54637,15473.0,4,,CC BY-SA 3.0,55d21b0f-722c-413f-8362-eae00df3a582,How to get pooled p-values on tests done in multiple imputed datasets?,Fixed grammar,2013-09-04 01:41:45.727 +176402,54637,15473.0,5,,CC BY-SA 3.0,55d21b0f-722c-413f-8362-eae00df3a582,"Using Amelia in R, I obtained multiple imputed datasets. After that, I performed a repeated measures test in SPSS. Now, I want to pool test results. I know that I can use Rubin's rules (implemented through any multiple imputation package in R) to pool means and standard errors, but how do I pool p-values? Is it possible? Is there a function in R to do so? +Thanks in advance.",Fixed grammar,2013-09-04 01:41:45.727 +176728,54724,10957.0,3,,CC BY-SA 3.0,f021eb83-6877-4ab9-a762-66d6629e4d51,,,2013-09-04 20:24:41.247 +176733,54724,16174.0,4,,CC BY-SA 3.0,53b4720d-700c-43c1-84e2-57f73d508e02,Flexible and inflexible models in machine learning,"Sentence case style in title, tag, LaTeX",2013-09-04 20:39:44.190 +176734,54724,16174.0,5,,CC BY-SA 3.0,53b4720d-700c-43c1-84e2-57f73d508e02,"I came across a simple question on comparing flexible models (i.e. splines) vs. inflexible models (e.g. linear regression) under different scenarios. The question is: + +In general, do we expect the performance of a flexible statistical learning method to perform better or worse than an inflexible method when: + +1. The number of predictors $p$ is extremely large, and the number of observations $n$ is small? +2. The variance of the error terms, i.e. $σ^2 = \text{Var}(e)$, is extremely high? + +I think for (1), when $n$ is small, inflexible models are better (not sure). 
For (2), I don't know which model is (relatively) better.","Sentence case style in title, tag, LaTeX",2013-09-04 20:39:44.190 +177138,54836,20304.0,3,,CC BY-SA 3.0,5966db7f-40ab-4f21-8d87-0b85bae4b157,,,2013-09-06 15:56:05.860 +177136,54836,20304.0,2,,CC BY-SA 3.0,5966db7f-40ab-4f21-8d87-0b85bae4b157,"I am trying to implement LDA using the collapsed Gibbs sampler from +http://www.uoguelph.ca/~wdarling/research/papers/TM.pdf + +the main algorithm is shown below + +![enter image description here][1] + + + [1]: https://i.stack.imgur.com/X9OwX.png + +I'm a bit confused about the notation in the inner-most loop. n_dk refers to the count of the number of words assigned to topic k in document d, however I'm not sure which document d this is referring to. Is it the document that *word* (from the next outer loop) is in? Furthermore, the paper does not show how to get the hyperparameters alpha and beta. Should these be guessed and then tuned? Furthermore, I don't understand what the *W* refers to in the inner-most loop (or the beta without the subscript). + +Could anyone enlighten me?",,2013-09-06 15:56:05.860 +177137,54836,20304.0,1,,CC BY-SA 3.0,5966db7f-40ab-4f21-8d87-0b85bae4b157,Implementing LDA - notation confusion,,2013-09-06 15:56:05.860 +177139,54836,,6,,CC BY-SA 3.0,b94f0d74-74b1-42dd-bb01-e733da209010,,edited tags,2013-09-06 16:08:33.727 +177161,54836,2081.0,4,,CC BY-SA 3.0,57659a03-60cf-45f9-876b-fa8d3f8007b8,Implementing Latent Dirichlet Allocation - notation confusion,edited title,2013-09-06 17:45:26.307 +177442,54915,21523.0,2,,CC BY-SA 3.0,3523ce2f-d9d7-47b8-b2e8-861dc04bb4a2,"I'm trying to implement Pettitt test in R following papers like this: + +http://www.ias.ac.in/jess/forthcoming/JESS-D-13-00049.pdf (pag. 5, 6) + +or this: + +http://www.igu.in/17-3/paper-2.pdf + +But, I'm misunderstandig something because testing with some data I think that output is not correct. + +Here are the code: + + pettitt<-function(x,alpha=0.99) { + # Pettitt AN. 1979 A non-parametric approach to the change point detection. + # x is a vector + # alpha, integer, level of significance + x<-na.omit(x) + o<-rank(x) + s<-c() + L<-length(x) + for (i in 1:(L-1)) { + s<-c(s, + 2*(colSums(as.matrix(o[1:i]))) + -(i*(L+1)) + ) + } + vc<-sqrt((-1)*log(alpha)*(L^3 + L^2)/6) + output<-list(abs(s),vc) + return(output) +} + +Testing with larain and tempdub dataset from TSA package: + +

    library(TSA)
    data(larain)
    data(tempdub)
    pettitt(larain)
    [[1]]
      [1]  78 118 180  76  30  30 144  90 124 148 224 334 314 298 362 444 356 334
     [19] 300 302 194 121  83  55  45  57  25  95 175 195 193 287 181 231 175 213
     [37] 301 331 421 345 392 322 282 354 372 274 194 130 188 248 175  97  85 153
     [55] 105 171 181 189 245 297 401 375 449 557 467 551 594 576 602 490 406 354
     [73] 262 266 362 248 244 214 208 200 247 147  89  13   9  15  97   5   9  83
     [91]   3  95 123  63  31  12  44   6  48  34  72 108 208 164 170 282 214 148
    [109] 202 140 104   6 102  86

    [[2]]
    [1] 50.69224

    > max(pettitt(larain)[[1]])
    [1] 602


    pettitt(tempdub)
    [[1]]
      [1]  83 161 226 235 164  60  80 169 220 219 188  74  57 177 266 281 228 147
     [19]  19  82 125 140 102  41 100 197 235 254 233 141   1  97 144 153 112  26
     [37]  73 206 255 258 235 137  28  49  98 101  46  29 149 252 281 274 247 160
     [55]  43  70 115 126  79  22 157 248 317 328 287 224  96  27  86  79  27  82
     [73] 225 348 407 406 351 256 125  10  58  77  32  61 200 314 381 386 353 216
     [91] 124  40  35  70  35  36 173 302 365 386 321 242 131  10  51  38  19 146
    [109] 241 319 342 359 330 223  89  45 113 144 111   2 123 228 280 275 250 177
    [127]  34  50  89 102  59  22 131 248 334 359 302 198  73  46  83 100  73

    [[2]]
    [1] 70.96777

    > max(pettitt(tempdub)[[1]])
    [1] 407
+ +I don't know if I lost something in pettitt test or there are error in my code.",,2013-09-08 13:37:22.690 +177443,54915,21523.0,1,,CC BY-SA 3.0,3523ce2f-d9d7-47b8-b2e8-861dc04bb4a2,Implementing Pettitt test in R,,2013-09-08 13:37:22.690 +177444,54915,21523.0,3,,CC BY-SA 3.0,3523ce2f-d9d7-47b8-b2e8-861dc04bb4a2,,,2013-09-08 13:37:22.690 +177450,54915,5237.0,5,,CC BY-SA 3.0,871b4b5f-75a3-473b-aa10-e831ccf77240,"I'm trying to implement Pettitt test in R following papers like this [pdf](http://www.ias.ac.in/jess/forthcoming/JESS-D-13-00049.pdf) (pp. 5 & 6), or this [pdf](http://www.igu.in/17-3/paper-2.pdf). But, I'm misunderstanding something, because having tested it with some data, I think that output is not correct. + +Here is the code: + + pettitt <- function(x, alpha=0.99) { + # Pettitt AN. 1979 A non-parametric approach to the change point detection. + # x is a vector + # alpha, integer, level of significance + x <- na.omit(x) + o <- rank(x) + s <- c() + L <- length(x) + for (i in 1:(L-1)) { + s <- c(s, 2*(colSums(as.matrix(o[1:i]))) - (i*(L+1)) ) + } + vc <- sqrt((-1) * log(alpha) * (L^3 + L^2)/6) + output <- list(abs(s), vc) + return(output) + } + +Testing with `larain` and `tempdub` dataset from `TSA package`: + + library(TSA) + data(larain) + data(tempdub) + pettitt(larain) + [[1]] + [1] 78 118 180 76 30 30 144 90 124 148 224 334 314 298 362 444 356 334 + [19] 300 302 194 121 83 55 45 57 25 95 175 195 193 287 181 231 175 213 + [37] 301 331 421 345 392 322 282 354 372 274 194 130 188 248 175 97 85 153 + [55] 105 171 181 189 245 297 401 375 449 557 467 551 594 576 602 490 406 354 + [73] 262 266 362 248 244 214 208 200 247 147 89 13 9 15 97 5 9 83 + [91] 3 95 123 63 31 12 44 6 48 34 72 108 208 164 170 282 214 148 + [109] 202 140 104 6 102 86 + + [[2]] + [1] 50.69224 + + > max(pettitt(larain)[[1]]) + [1] 602 + + pettitt(tempdub) + [[1]] + [1] 83 161 226 235 164 60 80 169 220 219 188 74 57 177 266 281 228 147 + [19] 19 82 125 140 102 41 100 197 235 254 233 141 1 97 144 153 112 26 + [37] 73 206 255 258 235 137 28 49 98 101 46 29 149 252 281 274 247 160 + [55] 43 70 115 126 79 22 157 248 317 328 287 224 96 27 86 79 27 82 + [73] 225 348 407 406 351 256 125 10 58 77 32 61 200 314 381 386 353 216 + [91] 124 40 35 70 35 36 173 302 365 386 321 242 131 10 51 38 19 146 + [109] 241 319 342 359 330 223 89 45 113 144 111 2 123 228 280 275 250 177 + [127] 34 50 89 102 59 22 131 248 334 359 302 198 73 46 83 100 73 + + [[2]] + [1] 70.96777 + + > max(pettitt(tempdub)[[1]]) + [1] 407 + + +I don't know if I lost something in pettitt test or there are error in my code.",light editing & formatting,2013-09-08 14:04:35.343 +177453,54915,,25,,,151022fa-addc-4e75-8ebe-706f9a22c114,,http://twitter.com/#!/StackStats/status/376723347809517569,2013-09-08 15:06:59.810 +177852,16337,2081.0,5,,CC BY-SA 3.0,ecb1723a-e780-466a-bcf9-8affd469662d,"**Spearman rho vs Kendall tau**. These two are so much computationally different that you *cannot* directly compare their magnitudes. Spearman is usually higher by 1/4 to 1/3 and this makes one incorrectly conclude that Spearman is ""better"" for a particular dataset. The difference between rho and tau is in their ideology, *proportion-of-variance* for rho and *probability* for tau. Rho is a usual Pearson r applied for ranked data, and like r, is more sensitive to points with large moments (that is, deviations from cloud centre) than to points with small moments. 
Therefore rho is quite sensitive to the shape of the cloud after ranking: the coefficient for an oblong rhombic cloud will be higher than the coefficient for an oblong dumbbelled cloud (because sharp edges of the first are large moments). Tau is an extension of Gamma and is equally sensitive to all points, so it is less sensitive to peculiarities in shape of the ranked cloud. Tau is more ""general"" than rho, for rho is warranted only when you believe the underlying relationship between the variables is monotonic. Rho is comparable with r in magnitude; tau is not. + +**Kendall tau as Gamma**. Tau is just a standardized form of Gamma. Several related measures all have numerator $P-Q$ but differ in normalizing *denominator*: + + - Gamma: $P+Q$ + - Somer's D(""x dependent""): $P+Q+T_x$ + - Somer's D(""y dependent""): $P+Q+T_y$ + - Somer's D(""symmetric""): arithmetic mean of the above two + - Kendall's Tau-b corr. (most suitable for square tables): geometric mean of those two + - Kendall's Tau-c corr. (most suitable for rectangular tables): $N^2(k-1)/2k$ + - Kendall's Tau-a corr. (makes nо adjustment for ties): $N(N-1)/2 = P+Q+T_x+T_y+T_{xy}$ + +where $P$ - number of pairs of observations with ""concordance"", $Q$ - with ""inversion""; $T_x$ - number of ties by variable X, $T_y$ - by variable Y, $T_{xy}$ – by both variables; $N$ - number of observations, $k$ - number of distinct values in that variable where this number is less. +",added 87 characters in body,2013-09-10 10:14:19.363 +177859,16337,2081.0,5,,CC BY-SA 3.0,834fd617-762f-49fa-b3a5-cafe5a705a0c,"**Spearman rho vs Kendall tau**. These two are so much computationally different that you *cannot* directly compare their magnitudes. Spearman is usually higher by 1/4 to 1/3 and this makes one incorrectly conclude that Spearman is ""better"" for a particular dataset. The difference between rho and tau is in their ideology, *proportion-of-variance* for rho and *probability* for tau. Rho is a usual Pearson r applied for ranked data, and like r, is more sensitive to points with large moments (that is, deviations from cloud centre) than to points with small moments. Therefore rho is quite sensitive to the shape of the cloud after ranking: the coefficient for an oblong rhombic cloud will be higher than the coefficient for an oblong dumbbelled cloud (because sharp edges of the first are large moments). Tau is an extension of Gamma and is equally sensitive to all points, so it is less sensitive to peculiarities in shape of the ranked cloud. Tau is more ""general"" than rho, for rho is warranted only when you believe the underlying relationship between the variables is monotonic. Rho is comparable with r in magnitude; tau is not. + +**Kendall tau as Gamma**. Tau is just a standardized form of Gamma. Several related measures all have numerator $P-Q$ but differ in normalizing *denominator*: + + - Gamma: $P+Q$ + - Somer's D(""x dependent""): $P+Q+T_x$ + - Somer's D(""y dependent""): $P+Q+T_y$ + - Somer's D(""symmetric""): arithmetic mean of the above two + - Kendall's Tau-b corr. (most suitable for square tables): geometric mean of those two + - Kendall's Tau-c corr. (most suitable for rectangular tables): $N^2(k-1)/(2k)$ + - Kendall's Tau-a corr. 
(makes nо adjustment for ties): $N(N-1)/2 = P+Q+T_x+T_y+T_{xy}$ + +where $P$ - number of pairs of observations with ""concordance"", $Q$ - with ""inversion""; $T_x$ - number of ties by variable X, $T_y$ - by variable Y, $T_{xy}$ – by both variables; $N$ - number of observations, $k$ - number of distinct values in that variable where this number is less. +",added 2 characters in body,2013-09-10 10:46:32.390 +177900,55043,18198.0,2,,CC BY-SA 3.0,b5f3d14a-c2ed-4993-b73e-c41a04b6ae94,"My problem is similar to this one but I am looking for a different solution: (so if it should be merged just let me know). + +http://stats.stackexchange.com/questions/21742/measuring-whats-lost-in-pca-dimensionality-reduction + +I my application we have a correlation matrix of dimension 30 upon which we conduct a PCA analysis and retain the first three eigenvectors on the basis that they typically contain 90+% of the variation. + +However this has always struck me as a little arbitrary, I would like to test whether these smaller eigenvectors do actually contain a ""signal"" rather than white noise. + +I suppose one very simple method would be to split the data up and see if these smaller eignevectors maintain a similar shape, but I would like to find a more scientifically robust way to test this hypothesis. +",,2013-09-10 13:00:25.933 +177901,55043,18198.0,1,,CC BY-SA 3.0,b5f3d14a-c2ed-4993-b73e-c41a04b6ae94,"Testing whether small Eigenvalues produce a ""signal""",,2013-09-10 13:00:25.933 +177902,55043,18198.0,3,,CC BY-SA 3.0,b5f3d14a-c2ed-4993-b73e-c41a04b6ae94,,,2013-09-10 13:00:25.933 +177953,55043,18198.0,6,,CC BY-SA 3.0,b36dd6d7-23d1-4200-9a85-5102a479f8bb,,edited tags,2013-09-10 15:29:32.983 +178298,55150,21630.0,1,,CC BY-SA 3.0,42df28a9-3aab-430b-9843-3528cc18147c,Mathematical definition of causality,,2013-09-12 01:13:21.980 +185696,57278,20144.0,1,,CC BY-SA 3.0,5d3d56c9-9330-4d2b-88f3-518b100b01d3,"Exponential family parameter estimation and fitting, references",,2013-10-11 06:43:41.090 +185938,57348,594.0,5,,CC BY-SA 3.0,6ff91b81-0395-40da-ad4b-4532e4975f87,"If + +$$P = [0,0.9,0,0.1]$$ + +$$Q = [0,1,0,0]$$ + +Then $$KL(P||Q) = 0 + \ln(0.9/1)\cdot0.9 + 0 + 0 = -0.094$$ + +This shouldn't be possible from the Gibbs inequality. What am I misunderstanding? ",added 13 characters in body,2013-10-12 11:42:48.307 +178297,55150,21630.0,2,,CC BY-SA 3.0,42df28a9-3aab-430b-9843-3528cc18147c,"Let Y and X be random variables. E(Y|X) is the conditional mean of Y given X. We say Y is not causally related to X if E(Y|X) does not depend on X, i.e., it is equal to E(Y). Now, let's go along with this definiton of causality for a second. By the law of iterated expectations, E(XE(Y|X)) = E(E(XY|X)) = E(XY). This means that if E(Y|X) does not depend on X, if it is equal to E(Y), then E(X)E(Y) = E(XY). In other words: + +If X and Y are not causally related, then X and Y are uncorrelated! - This makes no sense and I know this must be wrong. What is my mistake? + +Kind regards, + +Christian",,2013-09-12 01:13:21.980 +178299,55150,21630.0,3,,CC BY-SA 3.0,42df28a9-3aab-430b-9843-3528cc18147c,,,2013-09-12 01:13:21.980 +178303,55150,21630.0,5,,CC BY-SA 3.0,5a0417cf-2e26-40b7-af28-f00ff2387346,"Let Y and X be random variables. E(Y|X) is the conditional mean of Y given X. We say Y is not causally related to X if E(Y|X) does not depend on X, i.e., it is equal to E(Y). Now, let's go along with this definiton of causality for a second. By the law of iterated expectations, E(XE(Y|X)) = E(E(XY|X)) = E(XY). 
This means that if E(Y|X) does not depend on X, if it is equal to E(Y), then E(X)E(Y) = E(XY). + +In econometrics we generally assume E(Y|X) = b0 + b1*X. So E(Y|X) = E(Y) is equivalent to b1 = 0. The logic applies in this specific scenario too. + +In other words: + +If X and Y are not causally related, then X and Y are uncorrelated! - This makes no sense and I know this must be wrong. What is my mistake? + +Kind regards, + +Christian",added 154 characters in body,2013-09-12 01:22:02.703 +178311,55150,5237.0,5,,CC BY-SA 3.0,7c2dc3e9-2c38-4f8f-8cdb-c3397fee19cb,"Let $Y$ and $X$ be random variables. $E(Y|X)$ is the conditional mean of $Y$ given $X$. We say $Y$ is not causally related to $X$ if $E(Y|X)$ does not depend on $X$, i.e., it is equal to $E(Y)$. Now, let's go along with this definition of causality for a second. By the law of iterated expectations, $E(XE(Y|X)) = E(E(XY|X)) = E(XY)$. This means that if $E(Y|X)$ does not depend on $X$, if it is equal to $E(Y)$, then $E(X)E(Y) = E(XY)$. + +In econometrics we generally assume $E(Y|X) = b_0 + b_1X$. So $E(Y|X) = E(Y)$ is equivalent to $b_1 = 0$. The logic applies in this specific scenario too. + +In other words: + +If $X$ and $Y$ are not causally related, then $X$ and $Y$ are uncorrelated! - This makes no sense and I know this must be wrong. What is my mistake? +",formatted; removed signature,2013-09-12 01:41:23.783 +178312,55150,21630.0,5,,CC BY-SA 3.0,14307d4b-f114-4f0b-b384-869cd9b1fa77,"Let $Y$ and $X$ be random variables. $E(Y|X)$ is the conditional mean of $Y$ given $X$. We say $Y$ is not causally related to $X$ if $E(Y|X)$ does not depend on $X$, which implies it is equal to $E(Y)$. Now, let's go along with this definition of causality for a second. By the law of iterated expectations, $E(XE(Y|X)) = E(E(XY|X)) = E(XY)$. This means that if $E(Y|X)$ does not depend on $X$, if it is equal to $E(Y)$, then $E(X)E(Y) = E(XY)$. + +In other words: + +If $X$ and $Y$ are not causally related, then $X$ and $Y$ are uncorrelated! - This makes no sense and I know this must be wrong. Have I defined causality incorrectly? What have I done wrong? + +In econometrics we generally assume $E(Y|X) = b_0 + b_1X$. So $E(Y|X) = E(Y)$ is equivalent to $b_1 = 0$. The logic applies in this specific scenario too. +",added 51 characters in body,2013-09-12 02:08:52.787 +178317,55150,,25,,,241a0024-1c80-4cea-8a54-7fd75db59461,,http://twitter.com/#!/StackStats/status/377991721180987392,2013-09-12 03:07:03.497 +178427,55182,17573.0,2,,CC BY-SA 3.0,d9871688-2e22-4bd2-820f-8fb0ce846fdf,"You have defined causality incorrectly, yes. Probably, you have heard the saying ""correlation isn't causation."" You have essentially defined causality as correlation. The problem is worse than that, though. Causality is not a statistical or probabilistic concept at all, at least as those topics are normally taught. There is no statistical or probabilistic definition of causality: nothing involving conditional expectations or conditional distributions or suchlike. It is hard to pick up this fact from courses in statistics or econometrics, though. + +Unfortunately, we tend to do a better job saying what causality isn't than what causality is. Causality always and everywhere comes from theory, from a priori reasoning, from assumptions. You mentioned econometrics. If you have been taught instrumental variables competently, then you know that causal effects can only be measured if you have an ""exclusion restriction."" And you know that exclusion restrictions always come from theory. 
+ +You said you wanted math, though. The guy you want to read is [Judea Pearl][1]. It's not easy math, and the math sometimes wanders off into philosophy, but that's because causality is a hard subject. Here is [a page][2] with more links on the subject. + +If the math there is too hard, let me know, and I will see if I can find an easier presentation. + + + [1]: http://projecteuclid.org/DPubS/Repository/1.0/Disseminate?view=body&id=pdfview_1&handle=euclid.ssu/1255440554 + [2]: http://vserver1.cscs.lsa.umich.edu/~crshalizi/notebooks/causality.html",,2013-09-12 13:18:37.003 +178431,55182,17573.0,5,,CC BY-SA 3.0,d318bda4-2a10-4535-a249-bd6eebb6c455,"You have defined causality incorrectly, yes. Probably, you have heard the saying ""correlation isn't causation."" You have essentially defined causality as correlation. The problem is worse than that, though. Causality is not a statistical or probabilistic concept at all, at least as those topics are normally taught. There is no statistical or probabilistic definition of causality: nothing involving conditional expectations or conditional distributions or suchlike. It is hard to pick up this fact from courses in statistics or econometrics, though. + +Unfortunately, we tend to do a better job saying what causality isn't than what causality is. Causality always and everywhere comes from theory, from a priori reasoning, from assumptions. You mentioned econometrics. If you have been taught instrumental variables competently, then you know that causal effects can only be measured if you have an ""exclusion restriction."" And you know that exclusion restrictions always come from theory. + +You said you wanted math, though. The guy you want to read is [Judea Pearl][1]. It's not easy math, and the math sometimes wanders off into philosophy, but that's because causality is a hard subject. Here is [a page][2] with more links on the subject. Here is [a free online book][3] I just came across. Finally, here is [a previous question][4] where I gave an answer you might find useful. + + + [1]: http://projecteuclid.org/DPubS/Repository/1.0/Disseminate?view=body&id=pdfview_1&handle=euclid.ssu/1255440554 + [2]: http://vserver1.cscs.lsa.umich.edu/~crshalizi/notebooks/causality.html + [3]: http://www.hsph.harvard.edu/miguel-hernan/causal-inference-book/ + [4]: http://stats.stackexchange.com/questions/59588/what-is-endogeneity-and-what-does-it-mean-substantively-as-an-extension-what-is",added additional relevant links,2013-09-12 13:38:24.510 +178452,55150,,24,,CC BY-SA 3.0,2a5d9eac-f1f9-4cce-aadd-981efbf470c7,,"Proposed by 25212 approved by 686, 601 edit id of 5387",2013-09-12 14:01:36.937 +178451,55150,17573.0,6,,CC BY-SA 3.0,2a5d9eac-f1f9-4cce-aadd-981efbf470c7,,Added tag causal inference,2013-09-12 14:01:36.937 +178525,55209,20222.0,3,,CC BY-SA 3.0,6030c099-6008-47b8-ba82-1b3c70a4349f,,,2013-09-12 19:12:06.933 +178526,55209,20222.0,2,,CC BY-SA 3.0,6030c099-6008-47b8-ba82-1b3c70a4349f,"As a non-statistician, I need help in interpreting a customer specified two-part reliability requirement that I think involves KS. + +Requirement Part 1 + +R[4 years] must be greater than or equal to 0.95 and + +R[8 years] must be greater than or equal to 0.85 + +I have plotted the reliability (survival) function of a 2-parameter Weibull distribution that meets the above requirement in Plot A below. The shape parameter is 1.664 and the characteristic life is 23.844 for this distribution. 
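As a quick illustrative check on the quoted parameters (a sketch assuming only the standard Weibull reliability function $R(t) = \exp(-(t/\eta)^\beta)$), the Part 1 requirement can be verified directly in R:

    # Weibull reliability R(t) = exp(-(t/scale)^shape) at 4 and 8 years
    # for the stated shape and characteristic life.
    shape <- 1.664
    scale <- 23.844

    pweibull(c(4, 8), shape = shape, scale = scale, lower.tail = FALSE)
    # roughly 0.95 and 0.85, so this distribution just meets the Part 1 requirement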
+ + +![Plot A]![enter image description here][1] + + +Requirement Part 2 + +The confidence level shall be 90% when demonstrating the Part 1 requirement via product life testing. + +It’s the Part 2 that I’m a bit shaky on. On page 8-54 of MIL-HDBK-338B (http://www.sre.org/pubs/Mil-Hdbk-338B.pdf) there is a table showing KS critical “d” values as a function samples size, N and significance level, alpha (also note the plot on page 8-57). From this table I took a d value of 0.264 based on a signficance value of 0.10 and a sample size of 20. Plot B below shows my result. My interpretation of Plot B is that after running a life test on 20 samples that if the resulting reliability plot does not fall below the lower boundary shown in Plot B then we have met the requirements. + +![Plot B]![enter image description here][2] + +I have two questions: + +1. Did I translate the Part 2 requirement properly when I used an alpha of 0.10 to obtain the KS critical value of 0.264 ? In other words, does a 90% confidence equal a 0.10 significance within the KS context ? If not, can someone provide guidance ? + +2. How would *you* interpret Plot B ? + +Many thanks. + + + [1]: https://i.stack.imgur.com/bY5JN.png + [2]: https://i.stack.imgur.com/ilZqf.png",,2013-09-12 19:12:06.933 +178527,55209,20222.0,1,,CC BY-SA 3.0,6030c099-6008-47b8-ba82-1b3c70a4349f,Interpretation of Kolmogorov-Smirnov Critical Value Generated Distributions,,2013-09-12 19:12:06.933 +178714,55260,13459.0,2,,CC BY-SA 3.0,61e5ebd1-a4c6-465d-9694-7b23e87c2e5f,"I have a dataframe with 2 million rows and approximately 200 columns/features. Approximately 30-40% of the entries are blank. I am trying to find important features for a binary response variable. The predictors may be categorical or continuous. + +I started with applying logistic regression, but having so much missing entries I feel that this is not a good approach as glm discard all records which have any item blank. So I am now looking to apply tree based algorithms (`rpart` or `gbm`) which are capable to handle missing data in a better way. + +Since my data is too big for `rpart` or `gbm`, I decided to randomly fetch 10,000 records from original data, apply `rpart` on that, and keep building a pool of important variables. However, even this 10,000 records seem to be too much for the `rpart` algorithm. + +What can I do in this situation? Is there any switch that I can use to make it fast? Or it is impossible to apply `rpart` on my data. + +I am using the following rpart command: + + varimp = rpart(fmla, dat=tmpData, method = ""class"")$variable.importance + +",,2013-09-13 15:45:00.397 +178716,55260,13459.0,3,,CC BY-SA 3.0,61e5ebd1-a4c6-465d-9694-7b23e87c2e5f,,,2013-09-13 15:45:00.397 +178715,55260,13459.0,1,,CC BY-SA 3.0,61e5ebd1-a4c6-465d-9694-7b23e87c2e5f,which regression tree to use for large data?,,2013-09-13 15:45:00.397 +178735,55260,5237.0,5,,CC BY-SA 3.0,6df3ac67-9fb1-42aa-9d67-eaba2cda024d,"I have a dataframe with 2 million rows and approximately 200 columns / features. Approximately 30-40% of the entries are blank. I am trying to find important features for a binary response variable. The predictors may be categorical or continuous. + +I started with applying logistic regression, but having so much missing entries I feel that this is not a good approach as glm discard all records which have any item blank. So I am now looking to apply tree based algorithms (`rpart` or `gbm`) which are capable to handle missing data in a better way. 
+ +Since my data is too big for `rpart` or `gbm`, I decided to randomly fetch 10,000 records from original data, apply `rpart` on that, and keep building a pool of important variables. However, even this 10,000 records seem to be too much for the `rpart` algorithm. + +What can I do in this situation? Is there any switch that I can use to make it fast? Or it is impossible to apply `rpart` on my data. + +I am using the following rpart command: + + varimp = rpart(fmla, dat=tmpData, method = ""class"")$variable.importance + +",switched tag; light editing,2013-09-13 17:07:10.740 +178737,55260,5237.0,6,,CC BY-SA 3.0,6df3ac67-9fb1-42aa-9d67-eaba2cda024d,,switched tag; light editing,2013-09-13 17:07:10.740 +178736,55260,5237.0,4,,CC BY-SA 3.0,6df3ac67-9fb1-42aa-9d67-eaba2cda024d,Which regression tree to use for large data?,switched tag; light editing,2013-09-13 17:07:10.740 +179054,55361,227.0,2,,CC BY-SA 3.0,ea423552-8c7b-4505-a48b-1794fd331c90,"I'm working through Think Bayes (free here: http://www.greenteapress.com/thinkbayes/) and I'm on exercise 3.1. Here's a summary of the problem: + +""A railroad numbers its locomotives in order 1..N. One day you see a locomotive with the number 60. Estimate how many locomotives the railroad has."" + +This solution is found with the likelihood function and exponential prior like so: + + class Train(Suite): + def __init__(self, hypos, alpha=1.0): + # Create an exponential prior + Pmf.__init__(self) + for hypo in hypos: + self.Set(hypo, hypo**(-alpha)) + self.Normalize() + def Likelihood(self, data, hypo): + if hypo < data: + return 0 + else: + return (1.0/hypo) + +Conceptually this is saying, if we see a train number larger than one of our hypotheses (1...1000) then every hypothesis that's smaller has a zero chance of being correct. The rest of the hypotheses have a 1/number_of_trains chance of showing us a train with this number. + +In the exercise I'm working on the author then adds on a little extra. This assumes there's only one company. In real life however you'd have a mixture of big and small companies and bigger companies (both equally likely). However, this would mean that you're more likely to see a train from a bigger company since they'd have more trains. + +Now the question is how to reflect this in the likelihood function? + +This isn't Stack Overflow so I'm not really asking for coding help, but instead perhaps just help about how I might think about this problem in terms of a likelihood function.",,2013-09-15 23:02:42.580 +179055,55361,227.0,1,,CC BY-SA 3.0,ea423552-8c7b-4505-a48b-1794fd331c90,Locomotive problem with various size companies,,2013-09-15 23:02:42.580 +179056,55361,227.0,3,,CC BY-SA 3.0,ea423552-8c7b-4505-a48b-1794fd331c90,,,2013-09-15 23:02:42.580 +179324,55436,21778.0,3,,CC BY-SA 3.0,2d9d4004-d3f1-4abd-aded-1fab527cffbd,,,2013-09-16 22:19:43.457 +179323,55436,21778.0,1,,CC BY-SA 3.0,2d9d4004-d3f1-4abd-aded-1fab527cffbd,Creating a high predictive value classifier,,2013-09-16 22:19:43.457 +179325,55436,21778.0,2,,CC BY-SA 3.0,2d9d4004-d3f1-4abd-aded-1fab527cffbd,"I have a two-class classification problem with n-dimensional data. I would like to train a classifier (preferably but not necessarily linear) with 100% positive predictive value. In other words, I want the model to completely avoid one of the classes. For this application a low-ish sensitivity is OK as long as PPV is ~100%. +Do you have any suggestions of good techniques to use? 
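One generic starting point, sketched below with simulated data rather than anything from the thread, is to fit any probabilistic classifier and then raise the decision threshold on held-out data until the positive predictive value reaches the required level, accepting whatever sensitivity remains:

    # Sketch: threshold tuning for (near-)100% PPV on a validation split.
    set.seed(1)
    n <- 2000
    d <- data.frame(x1 = rnorm(n), x2 = rnorm(n))
    d$y <- rbinom(n, 1, plogis(2 * d$x1 - d$x2))
    train <- 1:1000; valid <- 1001:2000

    fit <- glm(y ~ x1 + x2, family = binomial, data = d[train, ])
    p   <- predict(fit, newdata = d[valid, ], type = "response")

    ppv_at <- function(thr) {
      flagged <- p >= thr
      if (!any(flagged)) return(NA)            # no positive calls at this threshold
      mean(d$y[valid][flagged] == 1)           # precision / PPV
    }
    thr <- seq(0.5, 0.99, by = 0.01)
    cbind(threshold = thr, PPV = sapply(thr, ppv_at))   # pick the smallest thr with PPV near 1

Whether a hard 100% PPV is attainable depends on how much the classes overlap; with overlapping classes the only way to guarantee it is to abstain on ambiguous cases.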
+Thank you!",,2013-09-16 22:19:43.457 +179774,55576,21833.0,2,,CC BY-SA 3.0,aaf7aaeb-ac21-4d4d-b4be-c8c1715d8b8d,"I need to generate random numbers based on already existing partial correlation data (not correlation or covariance data). Specifically, a 168*12 matrix based on a 12*12 partial correlation matrix. The idea is to simulate a data matrix that can be used for testing a few components of a project. + +Any help in this regard would be appreciated. I have looked around but have not found any threads that talk about doing this with partial correlation data. + +If someone has ideas about implementation in MATLAB, that would be a bonus! + +Thanks a lot in advance!",,2013-09-18 12:09:15.653 +179885,55609,21842.0,2,,CC BY-SA 3.0,81b91f73-5152-46a9-b8b8-32a8b136aa2c,"I am a newbie here. well I am trying to find a method or a formula to forecast meals per day, which have ̀5 meals to upload on flights, sales, wastage and passengers are what I have to consider, the old template is not completed yet , and its not so good to forecast, and I can't think of other formulas or methods to forecast , I have the sales in the past few months.. anyone can suggest me which methods can solve this problem? well I am using MS.Excel to calculate or if there is a program to suggest me it would be great :) ..thanks a lot for helping and sorry for my bad english :)",,2013-09-18 18:04:46.113 +179887,55609,21842.0,3,,CC BY-SA 3.0,81b91f73-5152-46a9-b8b8-32a8b136aa2c,,,2013-09-18 18:04:46.113 +179886,55609,21842.0,1,,CC BY-SA 3.0,81b91f73-5152-46a9-b8b8-32a8b136aa2c,how to forecast daily sale using Excel,,2013-09-18 18:04:46.113 +179924,55617,21846.0,1,,CC BY-SA 3.0,f71e727d-faa6-41dc-a2b3-0cb8efc827f0,regression with rank order as dependent variable,,2013-09-18 19:30:10.233 +179923,55617,21846.0,2,,CC BY-SA 3.0,f71e727d-faa6-41dc-a2b3-0cb8efc827f0,"I have data on 44 firms that have all been ranked by an expert. The ""best"" firm has rank 1, the second best has rank 2, ..., the last one has rank 44. +I have a bunch of explanatory variables and would like to explain the rank of the firm on the basis of these variables. My inclination is to use a regression model, but am concerned about the fact that the dependent variable is limited, it can only be a positive discrete number. + +I have thought about ordinal regression, but that seems impossible since I would have as many categories as I have observations. + +What regression models would be possible? (preferably to be run in R) + +thanks, +Peter +",,2013-09-18 19:30:10.233 +179922,55617,21846.0,3,,CC BY-SA 3.0,f71e727d-faa6-41dc-a2b3-0cb8efc827f0,,,2013-09-18 19:30:10.233 +179931,55617,,4,,CC BY-SA 3.0,c72f5abb-33e2-4530-8e0d-584193f7cbce,Regression with rank order as dependent variable,edited title,2013-09-18 19:53:54.687 +180251,55722,21885.0,3,,CC BY-SA 3.0,541541c7-b766-4959-9575-bad302cfcdbb,,,2013-09-19 22:14:08.257 +180252,55722,21885.0,1,,CC BY-SA 3.0,541541c7-b766-4959-9575-bad302cfcdbb,Looking for a good and complete probabilty and statistics book,,2013-09-19 22:14:08.257 +180253,55722,21885.0,2,,CC BY-SA 3.0,541541c7-b766-4959-9575-bad302cfcdbb,"I never had the opportunity to visit a stats course from a math faculty. I am looking for a probability theory and statistics book that is complete and self sufficient. By complete I mean that it contains all the proofs and not just states results. By self sufficient I mean that I am not required to read another book to be able to understand the book. 
Of course it can require college level (math student) calculus and linear algebra. + +I have looked at multiple books and it didn't like any of them. + +http://www.amazon.com/Probability-Statistics-Edition-Morris-DeGroot/dp/0321500466 is not complete enough. It just states a lot of stuff without the derivation. Besides that I like it. + +http://www.amazon.de/All-Statistics-Statistical-Inference-Springer/dp/0387402721 +Didn't like it at all. Almost no explanations. + +Weighing the Odds from David Willams is more formal degroot and seems to be is complete and self sufficient. However, I find the style strange. He also invents new terms that only he seems to use. All the stuff that is explained in DeGroot too is explained better there. + +If you know a great book in German that's also fine as I am german + + + +",,2013-09-19 22:14:08.257 +180263,55722,,5,,CC BY-SA 3.0,49d9fd74-8fe8-41cd-b829-6bf7a24ff93f,"I never had the opportunity to visit a stats course from a math faculty. I am looking for a probability theory and statistics book that is complete and self sufficient. By complete I mean that it contains all the proofs and not just states results. By self sufficient I mean that I am not required to read another book to be able to understand the book. Of course it can require college level (math student) calculus and linear algebra. + +I have looked at multiple books and it didn't like any of them. + + * DeGroot & Schervish (2011) *[Probability and Statistics (4th Edition)](http://www.amazon.com/Probability-Statistics-Edition-Morris-DeGroot/dp/0321500466)* Pearson + + This is not complete enough. It just states a lot of stuff without the derivation. Besides that I like it. + + * Wasserman (2004) *[All of Statistics: A Concise Course in Statistical Inference](http://www.amazon.de/All-Statistics-Statistical-Inference-Springer/dp/0387402721)* Springer. + + Didn't like it at all. Almost no explanations. + +""Weighing the Odds"" from David Willams is more formal than DeGroot and seems to be complete and self sufficient. However, I find the style strange. He also invents new terms that only he seems to use. All the stuff that is explained in DeGroot too is explained better there. + +If you know a great book in German that's also fine as I am German. +",tidy up the links and some of the text,2013-09-19 22:45:38.583 +180325,55576,21833.0,6,,CC BY-SA 3.0,fd875972-05b1-4263-a5cd-d3d9489b3c08,,Elaborated on the question,2013-09-20 06:23:02.580 +180324,55576,21833.0,5,,CC BY-SA 3.0,fd875972-05b1-4263-a5cd-d3d9489b3c08,"I need to generate random numbers based on already existing partial correlation data (not correlation or covariance data). Specifically, a 168*12 matrix based on a 12*12 partial correlation matrix. The idea is to simulate a data matrix that can be used for testing a few components of a project. + +Any help in this regard would be appreciated. I have looked around but have not found any threads that talk about doing this with partial correlation data. + +If someone has ideas about implementation in MATLAB, that would be a bonus! + +Thanks a lot in advance! + + +Additions: +Apologies for any ambiguity. + +-What I mean by partial correlation matrix is a matrix containing the partial correlations, calculated for any two pairs by partialling out effect of all other pairs. + +-The goal is: given a matrix of partial correlation values, is there a way I can generate a data set (168*12) that would have these partial correlation values? 
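For the simulation task just described, one standard route is to convert the partial-correlation matrix into an ordinary correlation matrix through the inverse (precision) matrix and then draw multivariate normal data with that correlation structure. The sketch below assumes the 12x12 partial-correlation matrix is available as a numpy array with ones on the diagonal and that the implied matrix is positive definite; using a normal distribution for the simulated values is a further assumption, and the identity matrix used as a placeholder is not real input.

    import numpy as np

    def pcor_to_cor(pcor):
        """Convert a partial-correlation matrix to a correlation matrix via the
        standard relation with the inverse (precision) of the correlation matrix."""
        B = -np.asarray(pcor, dtype=float)
        np.fill_diagonal(B, 1.0)
        S = np.linalg.inv(B)                         # correlation matrix up to a diagonal rescaling
        d = np.sqrt(np.diag(S))
        return S / np.outer(d, d)                    # rescale to unit diagonal

    rng = np.random.default_rng(0)
    pcor = np.eye(12)                                # placeholder for the real 12x12 partial correlations
    R = pcor_to_cor(pcor)

    L = np.linalg.cholesky(R)                        # requires R to be positive definite
    data = rng.standard_normal((168, 12)) @ L.T      # 168 x 12 simulated data matrix

    print(np.corrcoef(data, rowvar=False).round(2))  # should roughly recover R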
+ +-If there is a method to convert partial correlation to correlation values, that would be appreciated as well. + +Thanks again!",Elaborated on the question,2013-09-20 06:23:02.580 +181533,56091,2802.0,2,,CC BY-SA 3.0,2fe33530-03ff-46a5-8fff-35bb520ac699,"If you are searching for proofs, I have been working for some time on a free stats textbook that collects lots of proofs of elementary and less elementary facts that are difficult to find in probability and statistics books (because they are scattered here and there). You can have a look at it at http://www.statlect.com/ +",,2013-09-25 11:04:55.647 +185717,57284,,3,user14650,CC BY-SA 3.0,d0e821d4-743c-40ef-83e9-66dad2846efa,,,2013-10-11 09:06:48.623 +181536,55722,15827.0,5,,CC BY-SA 3.0,2c950a00-ef5e-4b19-b2c0-aaa310f0a991,"I never had the opportunity to visit a stats course from a math faculty. I am looking for a probability theory and statistics book that is complete and self-sufficient. By complete I mean that it contains all the proofs and not just states results. By self-sufficient I mean that I am not required to read another book to be able to understand the book. Of course it can require college level (math student) calculus and linear algebra. + +I have looked at multiple books and I didn't like any of them. + + * DeGroot & Schervish (2011) *[Probability and Statistics (4th Edition)](http://www.amazon.com/Probability-Statistics-Edition-Morris-DeGroot/dp/0321500466)* Pearson + + This is not complete enough. It just states a lot of stuff without the derivation. Besides that I like it. + + * Wasserman (2004) *[All of Statistics: A Concise Course in Statistical Inference](http://www.amazon.de/All-Statistics-Statistical-Inference-Springer/dp/0387402721)* Springer. + + Didn't like it at all. Almost no explanations. + +""Weighing the Odds"" from David Williams is more formal than DeGroot and seems to be complete and self-sufficient. However, I find the style strange. He also invents new terms that only he seems to use. All the stuff that is explained in DeGroot too is explained better there. + +If you know a great book in German that's also fine as I am German. +",small fixes to English,2013-09-25 11:17:31.700 +182197,56273,22126.0,2,,CC BY-SA 3.0,3d58bdc2-ec6e-4e7f-99ad-17ba229c5149,"I have a question, that is very important for me! :( +It is written in book ""basic statistics for business and economics"" for organizing data into a frequency distribution: +step 1: Decide on the number of classes. The goal is to use just enough groupings or classes to reveal the shape of the distribution. Some judgment is needed here. A useful recipe to determine the number of classes (k) is the ""2 to the k rule"". This guide suggests you select the smallest number (k) for the number of classes such that 2k (in words, 2 raised to the power of k) is greater than the number of observations (n). [n<=2k​] + +I want to know, how can I prove this formula?",,2013-09-27 11:50:54.613 +182198,56273,22126.0,1,,CC BY-SA 3.0,3d58bdc2-ec6e-4e7f-99ad-17ba229c5149,Frequency Distribution,,2013-09-27 11:50:54.613 +182196,56273,22126.0,3,,CC BY-SA 3.0,3d58bdc2-ec6e-4e7f-99ad-17ba229c5149,,,2013-09-27 11:50:54.613 +182206,56273,16474.0,6,,CC BY-SA 3.0,319e43fe-dd7e-42a2-9fdb-0e62559f4987,,deleted 27 characters in body; edited tags,2013-09-27 12:15:34.770 +182205,56273,16474.0,5,,CC BY-SA 3.0,319e43fe-dd7e-42a2-9fdb-0e62559f4987,"I have a question, that is very important for me! 
:( +It is written in book ""basic statistics for business and economics"" for organizing data into a frequency distribution: +step 1: Decide on the number of classes. The goal is to use just enough groupings or classes to reveal the shape of the distribution. Some judgment is needed here. A useful recipe to determine the number of classes (k) is the ""2 to the $k$ rule"". This guide suggests you select the smallest number ($k$) for the number of classes such that $2^k$ is greater than the number of observations ($n$). [$n<=2^k$​] + +I want to know, how can I prove this formula?",deleted 27 characters in body; edited tags,2013-09-27 12:15:34.770 +182207,56273,10060.0,5,,CC BY-SA 3.0,3e747219-1ad9-44c0-9c6f-e2bafd746d21,"I have a question, that is very important for me! :( +It is written in book *Basic Statistics for Business and Economics* for organizing data into a frequency distribution: +step 1: Decide on the number of classes. The goal is to use just enough groupings or classes to reveal the shape of the distribution. Some judgment is needed here. A useful recipe to determine the number of classes (k) is the ""2 to the $k$ rule"". This guide suggests you select the smallest number ($k$) for the number of classes such that $2^k$ is greater than the number of observations ($n$). [$n<=2^k$​] + +I want to know, how can I prove this formula?",Capitalize book title.,2013-09-27 12:18:04.597 +182220,56273,16043.0,5,,CC BY-SA 3.0,1a14aade-2587-428f-8a0a-c50714a3cd03,"I have a question related to the book *Basic Statistics for Business and Economics* for organizing data into a frequency distribution: +>Step 1: Decide on the number of classes. The goal is to use just enough groupings or classes to reveal the shape of the distribution. Some judgment is needed here. A useful recipe to determine the number of classes ($k$) is the ""2 to the $k$ rule"". This guide suggests you select the smallest number ($k$) for the number of classes such that $2^k$ is greater than the number of observations ($n$): [$n \le 2^k$​] + +I want to know, how can I prove this formula?",latex and formatting,2013-09-27 12:51:41.120 +182221,56273,,24,,CC BY-SA 3.0,1a14aade-2587-428f-8a0a-c50714a3cd03,,"Proposed by 22311 approved by 686, -1 edit id of 5508",2013-09-27 12:51:41.120 +182222,56273,449.0,5,,CC BY-SA 3.0,83a1d797-6620-4831-844b-04ba9bc564f0,"I have a question that is very important to me related to the book *Basic Statistics for Business and Economics* for organizing data into a frequency distribution: +>Step 1: Decide on the number of classes. The goal is to use just enough groupings or classes to reveal the shape of the distribution. Some judgment is needed here. A useful recipe to determine the number of classes ($k$) is the ""2 to the $k$ rule"". This guide suggests you select the smallest number ($k$) for the number of classes such that $2^k$ is greater than the number of observations ($n$): [$n \le 2^k$​] + +I want to know, how can I prove this formula?",latex and formatting,2013-09-27 12:51:41.120 +182549,56372,21108.0,2,,CC BY-SA 3.0,928b4018-77a8-4e62-b27e-5b398cf42ba5,"I just made an implementation of P(A/B)/P(¬A/B) for a ""people who bought this also bought..."" algorithm. + + +I'm doing it by + + P(A/B) = count_users(bought_A_and_B)/count_users(bougth_A) + P(¬A/B) = count_users(bougth_B_but_not_A)/count_users(did_not_buy_A) + +Then dividing the top one by the bottom one I get a score which makes absolute sense, but what kind of correlation am I calculating? What is this method called? 
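One way to check the score defined above is to compute it from raw purchase sets, using exactly the counts the post describes (users who bought A and B over users who bought A, divided by users who bought B but not A over users who did not buy A). The toy `purchases` dictionary below is invented for illustration, and the sketch assumes both groups (buyers and non-buyers of A) are non-empty.

    # Toy purchase data: user -> set of items bought (invented example).
    purchases = {
        "u1": {"A", "B"},
        "u2": {"A"},
        "u3": {"B"},
        "u4": {"A", "B", "C"},
        "u5": {"C"},
    }

    def score(item_a, item_b, baskets):
        users = list(baskets.values())
        bought_a = [u for u in users if item_a in u]
        not_a = [u for u in users if item_a not in u]

        top = sum(item_b in u for u in bought_a) / len(bought_a)     # count(A and B) / count(A)
        bottom = sum(item_b in u for u in not_a) / len(not_a)        # count(B, not A) / count(not A)
        return top / bottom if bottom > 0 else float("inf")

    print(score("A", "B", purchases))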
Where can I read more about it?",,2013-09-28 23:42:17.863 +182550,56372,21108.0,1,,CC BY-SA 3.0,928b4018-77a8-4e62-b27e-5b398cf42ba5,Item correlation for recommender system,,2013-09-28 23:42:17.863 +182548,56372,21108.0,3,,CC BY-SA 3.0,928b4018-77a8-4e62-b27e-5b398cf42ba5,,,2013-09-28 23:42:17.863 +182781,56445,22189.0,3,,CC BY-SA 3.0,cff5421d-f6ce-497f-b7bd-92ac3fe5b2bf,,,2013-09-30 08:12:44.097 +182782,56445,22189.0,2,,CC BY-SA 3.0,cff5421d-f6ce-497f-b7bd-92ac3fe5b2bf,"I have following problem: +Within an independent groups 1-factor design I have two independent groups, with a sample size of 20 each. The data of the treatment group is not normally distributed, whereas the data for the control group is (checked with Shapiro-Wilk Normality Test). Now I want to check if the differences of the means of both groups are significant. +What is the appropriate test for this? I think it should be the Wilcoxon Rank Sum and Signed Rank Test, but I am not sure... + +Could please anybody help me? + +Thank you very much in advance.",,2013-09-30 08:12:44.097 +182783,56445,22189.0,1,,CC BY-SA 3.0,cff5421d-f6ce-497f-b7bd-92ac3fe5b2bf,"Testing for significance between means, having one normal distributed sample and one non normal distributed",,2013-09-30 08:12:44.097 +182822,56445,,5,,CC BY-SA 3.0,c5668d6b-ad14-42cc-a570-ec75a045e1d0,"I have following problem: + +Within an independent groups 1-factor design I have two independent groups, with a sample size of 20 each. The data of the treatment group is not normally distributed, whereas the data for the control group is (checked with Shapiro-Wilk Normality Test). Now I want to check if the differences of the means of both groups are significant. +What is the appropriate test for this? I think it should be the Wilcoxon Rank Sum and Signed Rank Test, but I am not sure... + +Could please anybody help me? +",formatting and remove sign off,2013-09-30 11:14:45.493 +182832,56445,,25,,,92218eee-abef-4e0e-a4d7-0b3e3fe1d76e,,http://twitter.com/#!/StackStats/status/384650938159747072,2013-09-30 12:08:24.857 +182881,543,,5,,CC BY-SA 3.0,8def996c-7bd9-42fc-8aff-c43f96c20ae0,"As an economist, the analysis of variance (ANOVA) is taught and usually understood in relation to linear regression (e.g. in Arthur Goldberger's *A Course in Econometrics*). Economists/Econometricians typically view ANOVA as uninteresting and prefer to move straight to regression models. From the perspective of linear (or even generalised linear) models, ANOVA assigns coefficients into batches, with each batch corresponding to a ""source of variation"" in ANOVA terminology. + +Generally you can replicate the inferences you would obtain from ANOVA using regression but not always OLS regression. Multilevel models are needed for analysing hierarchical data structures such as ""split-plot designs,"" where between-group effects are compared to group-level errors, and within-group effects are compared to data-level errors. [Gelman's paper][1] [1] goes into great detail about this problem and effectively argues that ANOVA is an important statistical tool that should still be taught for it's own sake. + +In particular Gelman argues that ANOVA is a way of understanding and structuring multilevel models. Therefore ANOVA is not an alternative to regression but as a tool for summarizing complex high-dimensional inferences and for exploratory data analysis. + +Gelman is a well-respected statistician and some credence should be given to his view. 
However, almost all of the empirical work that I do would be equally well served by linear regression and so I firmly fall into the camp of viewing it as a little bit pointless. Some disciplines with complex study designs (e.g. psychology) may find ANOVA useful. + +[1] Gelman, A. (2005). Analysis of variance: why it is more important than ever (with discussion). *Annals of Statistics* 33, 1–53. [doi:10.1214/009053604000001048][1] + +[1]:http://dx.doi.org/10.1214%2F009053604000001048",direct link didn't work; reverting,2013-09-30 15:35:44.163 +182882,543,,24,,CC BY-SA 3.0,8def996c-7bd9-42fc-8aff-c43f96c20ae0,,"Proposed by 30872 approved by 5836, 6029 edit id of 5524",2013-09-30 15:35:44.163 +183210,56580,20190.0,2,,CC BY-SA 3.0,328921c4-74fc-406e-a600-f30d2a86f8ee,Why log transformation may improve results of svm prediction(regression)? Does svm based on assumption of normal distribution or something else? ,,2013-10-01 18:59:27.453 +183208,56580,20190.0,1,,CC BY-SA 3.0,328921c4-74fc-406e-a600-f30d2a86f8ee,SVM and log transformation,,2013-10-01 18:59:27.453 +183209,56580,20190.0,3,,CC BY-SA 3.0,328921c4-74fc-406e-a600-f30d2a86f8ee,,,2013-10-01 18:59:27.453 +183357,56580,20190.0,5,,CC BY-SA 3.0,20ce087a-37e6-42fb-b53c-6f878030e573,"Why log transformation may improve results of svm prediction(regression)? Does svm based on assumption of normal distribution or something else? + +update1. I use Radial basis function kernel.",update1,2013-10-02 07:18:28.333 +183372,56580,20190.0,5,,CC BY-SA 3.0,5417088c-ac32-4c5d-8504-32b80be14cce,"Why log(natural logarithm) transformation may improve results of svm prediction(**regression**, eps-svm)? Does svm based on assumption of normal distribution or something else? + +update1. I use Radial basis function kernel.",added 32 characters in body,2013-10-02 08:21:38.333 +183373,56580,20190.0,4,,CC BY-SA 3.0,49eabf32-78ec-4140-94c4-8e463af06d28,Support Vector Machine(SVM) and log transformation,edited title,2013-10-02 08:37:02.353 +183375,56580,20470.0,5,,CC BY-SA 3.0,b76257e8-d3ce-464f-970a-b2d80ba51234,"Why may log(natural logarithm) transformation improve results of SVM prediction(**regression**, eps-svm)? Is SVM based on the assumption of normal distribution or something else? + +update1. I use Radial basis function kernel.",grammar / typo,2013-10-02 08:49:37.283 +183374,56580,,24,,CC BY-SA 3.0,b76257e8-d3ce-464f-970a-b2d80ba51234,,Proposed by 28740 approved by 930 edit id of 5527,2013-10-02 08:49:37.283 +183580,56684,,2,Ben,CC BY-SA 3.0,e6d624b4-359d-4fbe-805e-caab3eebd6ac,"I am working with two highly skewed Bernoulli distributions where 96-99+% of the samples are in the ""false"" category, and the rest are in the ""true"" category (sort of speak). I am looking for a two-sided test of difference of proportions between the two samples. I can often achieve 500+ ""trues"" and tens or hundreds of thousands of ""falses"" in a reasonable time but I'm not sure if approximation to the normal distribution can withstand this extreme skewness. + +I initially thought I might need something non-parametric, but here, I actually know the distribution. + +I have been using a student's t-test, while paying attention to sample size estimation, but past experience has led me to be skeptical of its results. 
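For the two heavily imbalanced true/false samples discussed just above, one commonly used option (not necessarily the only reasonable one) is an exact test on the 2x2 table of counts, which avoids leaning on a normal approximation for rare events. The counts below are made up purely to show the call:

    from scipy.stats import fisher_exact

    # Made-up counts: (trues, falses) observed in each of the two samples.
    table = [[520, 48_000],
             [610, 51_000]]

    odds_ratio, p_value = fisher_exact(table, alternative="two-sided")
    print(f"odds ratio = {odds_ratio:.3f}, two-sided p = {p_value:.4f}")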
Thanks for your help.",,2013-10-02 21:14:55.710 +183579,56684,,3,Ben,CC BY-SA 3.0,e6d624b4-359d-4fbe-805e-caab3eebd6ac,,,2013-10-02 21:14:55.710 +183578,56684,,1,Ben,CC BY-SA 3.0,e6d624b4-359d-4fbe-805e-caab3eebd6ac,Significance test for highly skewed Bernoulli distribution,,2013-10-02 21:14:55.710 +183882,56768,11490.0,3,,CC BY-SA 3.0,b500505a-003b-49b8-ad92-3e6859863a67,,,2013-10-03 21:13:50.837 +183883,56768,11490.0,2,,CC BY-SA 3.0,b500505a-003b-49b8-ad92-3e6859863a67,"Suppose we have yearly data representing the market share of three companies, +say A, B and C. In other words, we have observations: + +$$ + A_t, \; B_t \;\; \text{and} \;\; C_t \;\; \text{where} \; \; A_t+B_t+C_t = 1 +$$ +for $t = 1, \dots,T$. + +Suppose that in year $t$ the market share of company A has changed by $\Delta A_t = A_t - A_{t-1}$. Is there any way of estimating how that change can be sub-divided into market share lost to or acquired from companies B and C? My actual problem includes 5 companies, but I guess that the solution shouldn't change too much. +Thanks!",,2013-10-03 21:13:50.837 +183881,56768,11490.0,1,,CC BY-SA 3.0,b500505a-003b-49b8-ad92-3e6859863a67,Estimating hidden transfers of market share,,2013-10-03 21:13:50.837 +183925,56780,15280.0,1,,CC BY-SA 3.0,9751f311-a012-4d08-9982-678aaa929071,Problem with proof of Conditional expectation as best predictor,,2013-10-04 00:24:13.043 +183923,56780,15280.0,3,,CC BY-SA 3.0,9751f311-a012-4d08-9982-678aaa929071,,,2013-10-04 00:24:13.043 +184930,57053,5237.0,5,,CC BY-SA 3.0,801ef2d3-22c6-4d65-94b9-f6989f650e9e,"How can I describe descriptive statistics for a dummy variable (gender of worker in a shop)? Let's say this is the info that I have: + + mean : 0.47 + median : 0 + max : 1 + min : 0 + std. dev : 0.4998 + skewness : 0.101 + kurtosis : 1.01 + jarque bera : 85.67 + probability : 0 + +I know that some of the information is useless since it's a dummy variable. So how do I interpret it in words?",clarified issue in title; added tags; edited for English; formatted,2013-10-08 13:09:13.483 +183924,56780,15280.0,2,,CC BY-SA 3.0,9751f311-a012-4d08-9982-678aaa929071,"I have an issue with the proof of + +>> $E(Y|X) \in \arg \min_{g(x)} E\Big[\big(Y - g(x)\big)^2\Big]$ + +which very likely reveal a deeper misunderstanding of expectations and conditional expectations. + +The proof I know goes as follows ( another version of this proof can be found [here][1]) + +\begin{align*} +&\arg \min_{g(x)} E\Big[\big(Y - g(x)\big)^2\Big]\\ + = &\arg \min_{g(x)} E \Big[ \big(Y - E(Y|X) + E(Y|X) - g(X)\big)^2\Big]\\ +=&\arg \min_{g(x)} E \Big[ \big(Y - E(Y|X)\big)^2 - 2 \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big) + \big(E(Y|X) - g(X)\big)^2\Big]\\ +=&\arg \min_{g(x)} E \Big[ -2 \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big) + \big(E(Y|X) - g(X)\big)^2\Big]\\ +\end{align*} + +The proof then typically continues with an argument showing that $-2 E\Big[ \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big)\Big] = 0$, and hence + +\begin{align*} +\arg \min_{g(x)} E\Big[\big(Y - g(x)\big)^2\Big] = \arg \min_{g(x)} E \Big[\big(E(Y|X) - g(X)\big)^2\Big] +\end{align*} + +which can be seen to be minimized when $g(X) = E(Y|X)$. + +My puzzles about the proof are the following: + + 1. Consider + +>> $E \Big[ -2 \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big) + \big(E(Y|X) - g(X)\big)^2\Big]$. 
+ + It seems to me that, independently of any argument showing that the first term is always equal to zero, one can see that setting $g(X) = E(Y|X)$ minimizes the expression as it implies $\big(E(Y|X) - g(X)\big) =0$ and hence + +>> $E \Big[ -2 \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big) + \big(E(Y|X) - g(X)\big)^2\Big] = E( 0 + 0)$ = 0. + +But if this is true, then the one might repeat the proof replacing $E(Y|X)$ by any other function of $x$, say $h(x)$, and get to the conclusion that it is $h(x)$ that minimizes the expression. So there must be something I misunderstand (right?). + +2. I have some doubts about the meaning of $E[(Y−g(x))^2]$ in the statement of the problem. How should the notation be interpreted? Does it mean + +>>$E_X[(Y−g(x))^2]$, $E_Y[(Y−g(x))^2]$ or $E_{XY}[(Y−g(x))^2]$? + + [1]: http://www.econ.uiuc.edu/~wsosa/econ507/CEF.pdf",,2013-10-04 00:24:13.043 +183932,56783,20473.0,2,,CC BY-SA 3.0,01d25739-5b2a-425a-8433-3c6a82e5bec7,"*(This is an adaptation from Granger & Newbold(1986) ""Forecasting Economic Time Series"").* + +By construction, your _error cost function_ is $\left[Y-g(X)\right]^2$. This incorporates a critical assumption (that the error cost function is symmetric around zero) -a different error cost function would not necessarily have the conditional expected value as the $\arg \min$ of its expected value. +You cannot minimize your error cost function because it contains unknown quantities. So you decide to minimize its expected value instead. Then your objective function becomes + +$$E\left[Y-g(X)\right]^2 = \int_{-\infty}^{\infty}\left[y-g(X)\right]^2f_{Y|X}(y|x)dy $$ + +which I believe answers also your second question. It is intuitive that the expected value will be of $Y$ conditional on $X$, since we are trying to estimate/forecast $Y$ based on $X$. Decompose the square to obtain + +$$E\left[Y-g(X)\right]^2 = \int_{-\infty}^{\infty}y^2f_{Y|X}(y|x)dy -2g(X)\int_{-\infty}^{\infty}yf_{Y|X}(y|x)dy \\+ \Big[g(X)\Big]^2\int_{-\infty}^{\infty}f_{Y|X}(y|x)dy$$ + +The first term does not contain $g(X)$ so it does not affect minimization, and it can be ignored. The integral in the second term equals the conditional expected value of $Y$ given $X$, and the integral in the last term equals unity. So + +$$\arg \min_{g(x)} E\left[Y-g(X)\right]^2 = \arg \min_{g(x)} \Big\{ -2g(X)E(Y\mid X) + \Big[g(X)\Big]^2 \Big\}$$ + +The first derivative w.r.t $g(X)$ is $-2E(Y\mid X) + 2g(X)$ leading to the first order condition for minimization $g(X) = E(Y\mid X)$ while the second derivative is equal to $2>0$ which is sufficient for a minimum.",,2013-10-04 01:05:36.887 +183937,56784,594.0,2,,CC BY-SA 3.0,6f674147-8b56-4cd3-ac40-a5939190d812,"Leaving aside the obvious issue of the low power of the chi-square in this sort of circumstance, imagine doing a chi-square test for some density with unspecified parameters. + +For concreteness, let's say an exponential distribution with unknown mean and a sample size of say 100. + +In order to get a reasonable number of expected observations per bin some account would need to be taken of the data (e.g. if we chose to put 6 bins below the mean and 4 above it, that would still be using data-based bin boundaries). + +But this would presumably affect the distribution of the test statistic under the null. 
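The minimization derived in the answer above can also be checked numerically: when $E(Y\mid X)$ is known in closed form, its expected squared error should never exceed that of any competitor $h(X)$. The model $Y = X^2 + \varepsilon$ and the competitor $h(X) = X$ in this sketch are arbitrary choices made only for the simulation.

    import numpy as np

    rng = np.random.default_rng(0)
    n = 200_000

    x = rng.normal(size=n)
    y = x**2 + rng.normal(size=n)      # E(Y | X = x) = x**2 by construction

    g = x**2                           # the conditional expectation
    h = x                              # an arbitrary competitor h(X)

    print("MSE with E(Y|X):", np.mean((y - g) ** 2))   # should be close to Var(eps) = 1
    print("MSE with h(X)=X:", np.mean((y - h) ** 2))   # should be larger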
+ +I have seen plenty of discussion about the fact that - *if* the parameters are estimated by maximum likelihood from the *binned* data you lose 1 d.f per estimated parameter (an issue dating right back to Fisher vs Karl Pearson) - but I don't recall reading anything about finding the bin boundaries themselves based on the data. (If you estimate them from the unbinned data, then with $k$ bins the distribution of the test statistic lies somewhere between a $\chi^2_{k}$ and a $\chi^2_{k-p}$.) + +Does this data-based choice of bins substantively impact significance level or power? Are there some approaches that matter more than others? If there is much of an effect, is it something that goes away in large samples? + +If it does have a substantive impact, this would seem to make the use of a chi-squared test when parameters are unknown almost useless in many cases (in spite of still being advocated in quite a few texts), unless you had a good a-priori estimate of the parameter. + +Discussion of the issues or pointers to references (preferably with a mention of their conclusions) would be useful. +",,2013-10-04 01:48:35.417 +183938,56784,594.0,1,,CC BY-SA 3.0,6f674147-8b56-4cd3-ac40-a5939190d812,Impact of data-based bin boundaries on a chi-square goodness of fit test?,,2013-10-04 01:48:35.417 +183939,56784,594.0,3,,CC BY-SA 3.0,6f674147-8b56-4cd3-ac40-a5939190d812,,,2013-10-04 01:48:35.417 +183940,56784,594.0,5,,CC BY-SA 3.0,acdec6a2-daea-4796-82fb-5650a3030d52,"Leaving aside the obvious issue of the low power of the chi-square in this sort of circumstance, imagine doing a chi-square goodness of test for some density with unspecified parameters, by binning the data. + +For concreteness, let's say an exponential distribution with unknown mean and a sample size of say 100. + +In order to get a reasonable number of expected observations per bin some account would need to be taken of the data (e.g. if we chose to put 6 bins below the mean and 4 above it, that would still be using data-based bin boundaries). + +But this use of bins based on seeing the data would presumably affect the distribution of the test statistic under the null. + +I have seen plenty of discussion about the fact that - *if* the parameters are estimated by maximum likelihood from the *binned* data - you lose 1 d.f per estimated parameter (an issue dating right back to Fisher vs Karl Pearson) - but I don't recall reading anything about finding the bin boundaries themselves based on the data. (If you estimate them from the unbinned data, then with $k$ bins the distribution of the test statistic lies somewhere between a $\chi^2_{k}$ and a $\chi^2_{k-p}$.) + +Does this data-based choice of bins substantively impact significance level or power? Are there some approaches that matter more than others? If there is much of an effect, is it something that goes away in large samples? + +If it does have a substantive impact, this would seem to make the use of a chi-squared test when parameters are unknown almost useless in many cases (in spite of still being advocated in quite a few texts), unless you had a good a-priori estimate of the parameter. + +Discussion of the issues or pointers to references (preferably with a mention of their conclusions) would be useful. 
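The question above also lends itself to a small simulation: repeatedly draw from the null (here an exponential), estimate the mean, place equal-probability bin boundaries using the fitted distribution, and record the Pearson statistic; its empirical distribution can then be compared with the two reference chi-square laws mentioned in the question. The sample size, number of bins, and replication count below are arbitrary choices, and this sketch only shows the mechanics rather than settling the question.

    import numpy as np
    from scipy import stats

    rng = np.random.default_rng(0)
    n, k, reps = 100, 10, 5_000
    pearson = np.empty(reps)

    for r in range(reps):
        x = rng.exponential(scale=1.0, size=n)
        scale_hat = x.mean()                                   # ML estimate of the exponential mean
        # Data-based, equal-probability bins under the *fitted* exponential,
        # obtained through the probability-integral transform.
        u = stats.expon.cdf(x, scale=scale_hat)
        observed, _ = np.histogram(u, bins=np.linspace(0.0, 1.0, k + 1))
        expected = np.full(k, n / k)
        pearson[r] = ((observed - expected) ** 2 / expected).sum()

    # Rejection rates against the two candidate reference distributions (p = 1 parameter estimated).
    for df in (k - 1, k - 2):
        crit = stats.chi2.ppf(0.95, df)
        print(f"df={df}: rejection rate at nominal 5% level = {(pearson > crit).mean():.3f}")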
+",added 2 characters in body,2013-10-04 01:58:21.100 +184932,57053,5237.0,6,,CC BY-SA 3.0,801ef2d3-22c6-4d65-94b9-f6989f650e9e,,clarified issue in title; added tags; edited for English; formatted,2013-10-08 13:09:13.483 +184952,57065,22477.0,2,,CC BY-SA 3.0,a3a40789-426a-4c30-967b-12e4a66c0c51,"We've run a split test of a new product feature and want to measure if the uplift on revenue is significant. Our observations are definitely not normally distributed (most of our users don't spend, and within those that do, it is heavily skewed towards lots of small spenders and a few very big spenders). + +We've decided on using bootstrapping to compare the means, to get round the issue of the data not being normally distributed (side-question: is this a legitimate use of bootstrapping?) + +My question is, do I need to trim outliers from the data set (eg the few very big spenders) before I run the bootstrapping, or does that not matter? + +Any advice would be much appreciated, + +Thanks, + +Fred",,2013-10-08 14:28:42.660 +183946,56784,594.0,5,,CC BY-SA 3.0,33db5069-1b24-44e3-b579-3efdc0ce01dc,"Leaving aside the obvious issue of the low power of the chi-square in this sort of circumstance, imagine doing a chi-square goodness of test for some density with unspecified parameters, by binning the data. + +For concreteness, let's say an exponential distribution with unknown mean and a sample size of say 100. + +In order to get a reasonable number of expected observations per bin some account would need to be taken of the data (e.g. if we chose to put 6 bins below the mean and 4 above it, that would still be using data-based bin boundaries). + +But this use of bins based on seeing the data would presumably affect the distribution of the test statistic under the null. + +I have seen plenty of discussion about the fact that - *if* the parameters are estimated by maximum likelihood from the *binned* data - you lose 1 d.f per estimated parameter (an issue dating right back to Fisher vs Karl Pearson) - but I don't recall reading anything about finding the bin boundaries themselves based on the data. (If you estimate them from the unbinned data, then with $k$ bins the distribution of the test statistic lies somewhere between a $\chi^2_{k}$ and a $\chi^2_{k-p}$.) + +Does this data-based choice of bins substantively impact significance level or power? Are there some approaches that matter more than others? If there is much of an effect, is it something that goes away in large samples? + +If it does have a substantive impact, this would seem to make the use of a chi-squared test when parameters are unknown almost useless in many cases (in spite of still being advocated in quite a few texts), unless you had a good a-priori estimate of the parameter. + +Discussion of the issues or pointers to references (preferably with a mention of their conclusions) would be useful. + +Edit, pretty much an aside to the main question: + +It occurs to me that there are potential solutions for the specific case of the exponential\* (and the uniform come to think of it), but I am still interested in the more general issue of the impact choosing bin boundaries. + +\* For example, for the exponential, one might use the smallest observation (say it is equal to $m$) to get a very rough idea of where to place the bins (since the smallest observation is exponential with mean $\mu/n$), and then test the remaining $n-1$ differences ($x_i - m$) for exponentiality. 
Of course that might yield a very poor estimate of $\mu$, and hence poor bin choices, though I suppose one might use the argument recursively in order to take the lowest two or three observations from which to choose reasonable bins and then test the differences above the largest for exponentiality) +",added 881 characters in body,2013-10-04 02:51:16.323 +183954,56780,,25,,,542001ad-303c-429f-b511-8ce7900c7370,,http://twitter.com/#!/StackStats/status/385964605274865664,2013-10-04 03:08:27.487 +183961,56780,15280.0,5,,CC BY-SA 3.0,6af01165-7cff-4a55-af1b-21089a5f836d,"I have an issue with the proof of + +>> $E(Y|X) \in \arg \min_{g(X)} E\Big[\big(Y - g(X)\big)^2\Big]$ + +which very likely reveal a deeper misunderstanding of expectations and conditional expectations. + +The proof I know goes as follows ( another version of this proof can be found [here][1]) + +\begin{align*} +&\arg \min_{g(X)} E\Big[\big(Y - g(x)\big)^2\Big]\\ + = &\arg \min_{g(X)} E \Big[ \big(Y - E(Y|X) + E(Y|X) - g(X)\big)^2\Big]\\ +=&\arg \min_{g(x)} E \Big[ \big(Y - E(Y|X)\big)^2 - 2 \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big) + \big(E(Y|X) - g(X)\big)^2\Big]\\ +=&\arg \min_{g(x)} E \Big[ -2 \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big) + \big(E(Y|X) - g(X)\big)^2\Big]\\ +\end{align*} + +The proof then typically continues with an argument showing that $-2 E\Big[ \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big)\Big] = 0$, and hence + +\begin{align*} +\arg \min_{g(x)} E\Big[\big(Y - g(x)\big)^2\Big] = \arg \min_{g(x)} E \Big[\big(E(Y|X) - g(X)\big)^2\Big] +\end{align*} + +which can be seen to be minimized when $g(X) = E(Y|X)$. + +My puzzles about the proof are the following: + + 1. Consider + +>> $E \Big[ -2 \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big) + \big(E(Y|X) - g(X)\big)^2\Big]$. + + It seems to me that, independently of any argument showing that the first term is always equal to zero, one can see that setting $g(X) = E(Y|X)$ minimizes the expression as it implies $\big(E(Y|X) - g(X)\big) =0$ and hence + +>> $E \Big[ -2 \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big) + \big(E(Y|X) - g(X)\big)^2\Big] = E( 0 + 0)$ = 0. + +But if this is true, then the one might repeat the proof replacing $E(Y|X)$ by any other function of $X$, say $h(X)$, and get to the conclusion that it is $h(X)$ that minimizes the expression. So there must be something I misunderstand (right?). + +2. I have some doubts about the meaning of $E[(Y−g(X))^2]$ in the statement of the problem. How should the notation be interpreted? Does it mean + +>>$E_X[(Y−g(X))^2]$, $E_Y[(Y−g(X))^2]$ or $E_{XY}[(Y−g(X))^2]$? + + [1]: http://www.econ.uiuc.edu/~wsosa/econ507/CEF.pdf",capitalize X's,2013-10-04 03:53:34.993 +183994,56784,594.0,6,,CC BY-SA 3.0,e45e6c72-4470-4329-9aad-033a3ab39ac3,,edited tags,2013-10-04 07:28:52.303 +184026,56784,594.0,5,,CC BY-SA 3.0,371731ce-884c-4dae-95ba-ed7fc482d1fb,"Leaving aside the obvious issue of the low power of the chi-square in this sort of circumstance, imagine doing a chi-square goodness of test for some density with unspecified parameters, by binning the data. + +For concreteness, let's say an exponential distribution with unknown mean and a sample size of say 100. + +In order to get a reasonable number of expected observations per bin some account would need to be taken of the data (e.g. if we chose to put 6 bins below the mean and 4 above it, that would still be using data-based bin boundaries). + +But this use of bins based on seeing the data would presumably affect the distribution of the test statistic under the null. 
+ +I have seen plenty of discussion about the fact that - *if* the parameters are estimated by maximum likelihood from the *binned* data - you lose 1 d.f per estimated parameter (an issue dating right back to Fisher vs Karl Pearson) - but I don't recall reading anything about finding the bin boundaries themselves based on the data. (If you estimate them from the unbinned data, then with $k$ bins the distribution of the test statistic lies somewhere between a $\chi^2_{k}$ and a $\chi^2_{k-p}$.) + +Does this data-based choice of bins substantively impact significance level or power? Are there some approaches that matter more than others? If there is much of an effect, is it something that goes away in large samples? + +If it does have a substantive impact, this would seem to make the use of a chi-squared test when parameters are unknown almost useless in many cases (in spite of still being advocated in quite a few texts), unless you had a good a-priori estimate of the parameter. + +Discussion of the issues or pointers to references (preferably with a mention of their conclusions) would be useful. + +--- + +Edit, pretty much an aside to the main question: + +It occurs to me that there are potential solutions for the specific case of the exponential\* (and the uniform come to think of it), but I am still interested in the more general issue of the impact choosing bin boundaries. + +\* For example, for the exponential, one might use the smallest observation (say it is equal to $m$) to get a very rough idea of where to place the bins (since the smallest observation is exponential with mean $\mu/n$), and then test the remaining $n-1$ differences ($x_i - m$) for exponentiality. Of course that might yield a very poor estimate of $\mu$, and hence poor bin choices, though I suppose one might use the argument recursively in order to take the lowest two or three observations from which to choose reasonable bins and then test the differences above the largest for exponentiality) +",edited tags,2013-10-04 10:18:14.337 +184027,56783,20473.0,5,,CC BY-SA 3.0,c6c8a449-b83c-4c81-9fa6-9bb6fc418ee7,"*(This is an adaptation from Granger & Newbold(1986) ""Forecasting Economic Time Series"").* + +By construction, your _error cost function_ is $\left[Y-g(X)\right]^2$. This incorporates a critical assumption (that the error cost function is symmetric around zero) -a different error cost function would not necessarily have the conditional expected value as the $\arg \min$ of its expected value. +You cannot minimize your error cost function because it contains unknown quantities. So you decide to minimize its expected value instead. Then your objective function becomes + +$$E\left[Y-g(X)\right]^2 = \int_{-\infty}^{\infty}\left[y-g(X)\right]^2f_{Y|X}(y|x)dy $$ + +which I believe answers also your second question. It is intuitive that the expected value will be of $Y$ conditional on $X$, since we are trying to estimate/forecast $Y$ based on $X$. Decompose the square to obtain + +$$E\left[Y-g(X)\right]^2 = \int_{-\infty}^{\infty}y^2f_{Y|X}(y|x)dy -2g(X)\int_{-\infty}^{\infty}yf_{Y|X}(y|x)dy \\+ \Big[g(X)\Big]^2\int_{-\infty}^{\infty}f_{Y|X}(y|x)dy$$ + +The first term does not contain $g(X)$ so it does not affect minimization, and it can be ignored. The integral in the second term equals the conditional expected value of $Y$ given $X$, and the integral in the last term equals unity. 
So + +$$\arg \min_{g(x)} E\left[Y-g(X)\right]^2 = \arg \min_{g(x)} \Big\{ -2g(X)E(Y\mid X) + \Big[g(X)\Big]^2 \Big\}$$ + +The first derivative w.r.t $g(X)$ is $-2E(Y\mid X) + 2g(X)$ leading to the first order condition for minimization $g(X) = E(Y\mid X)$ while the second derivative is equal to $2>0$ which is sufficient for a minimum. + +**ADDENDUM:The logic of the ""add and subtract"" proof approach.** + +The OP is puzzled by the approach stated in the question, because it seems tautological. It isn't, because while using the tactic of adding and subtracting makes a _specific part_ of the objective function zero for an arbitrary choice of the term that is added and subtracted, it does NOT equalize the _value function_ , namely the value of the objective function evaluated at the candidate minimizer. + +For the choice $g(X) = E(Y \mid X)$ we have the value function $ V\left(E(Y\mid X)\right) = E\Big[ (Y-E(Y \mid X))^2\mid X\Big]$ +For the arbitrary choice $g(X) = h(X)$we have the value funtion $ V\left(h(X)\right) = E\Big[ (Y-h(X))^2\mid X\Big]$. + +I claim that + +$$V\left(E(Y\mid X)\right) \le V\left(h(X)\right)$$ +$$\Rightarrow E(Y^2\mid X) -2E\Big [(YE(Y \mid X))\mid X\Big] + E\Big [(E(Y \mid X))^2\mid X\Big] \\\le E(Y^2\mid X) -2E\Big [(Yh(X))\mid X\Big] + E\Big [(h(X))^2\mid X\Big]$$ + +The first term of the LHS and the RHS cancel out. Also note that the outer expectation is conditional on $X$. By the properties of conditional expectations we end up with + +$$...\Rightarrow -2E(Y \mid X)\cdot E\Big (Y\mid X\Big) + \Big [E(Y \mid X)\Big]^2 \le -2E(Y\mid X)h(X) + \Big [h(X)\Big]^2$$ + +$$\Rightarrow 0 \le \Big [E(Y \mid X)\Big]^2-2E(Y\mid X)h(X) + \Big [h(X)\Big]^2$$ + +$$\Rightarrow 0 \le \Big [E(Y \mid X) - h(x)\Big]^2$$ +which holds with strict inequality if $h(x) \neq E(Y \mid X)$. So $E(Y \mid X)$ is the global and unique minimizer. + +But this also says that the ""add-and-subtract"" approach is not the most illuminating way of proof here. + +",Added an explanation for the proof indicated by the OP,2013-10-04 10:26:39.823 +184954,57065,22477.0,1,,CC BY-SA 3.0,a3a40789-426a-4c30-967b-12e4a66c0c51,Bootstrapping - do I need to remove outliers first?,,2013-10-08 14:28:42.660 +184953,57065,22477.0,3,,CC BY-SA 3.0,a3a40789-426a-4c30-967b-12e4a66c0c51,,,2013-10-08 14:28:42.660 +184968,57065,,25,,,342b7d4b-5d7c-4cda-b681-40c4baad3246,,http://twitter.com/#!/StackStats/status/387596378970808321,2013-10-08 15:12:32.667 +185716,57284,,1,user14650,CC BY-SA 3.0,d0e821d4-743c-40ef-83e9-66dad2846efa,How to set confidence level for wilcoxsign_test (package coin)?,,2013-10-11 09:06:48.623 +185940,57349,22630.0,4,,CC BY-SA 3.0,72b6caf7-36bf-4b95-acb6-e41fc8c61312,"What does ""Mean of each pixel over all images"" mean?",Added more details,2013-10-12 11:51:10.477 +184043,56784,594.0,5,,CC BY-SA 3.0,dbca9084-7715-41a6-8d29-6e4b896051ab,"Leaving aside the obvious issue of the low power of the chi-square in this sort of circumstance, imagine doing a chi-square goodness of test for some density with unspecified parameters, by binning the data. + +For concreteness, let's say an exponential distribution with unknown mean and a sample size of say 100. + +In order to get a reasonable number of expected observations per bin some account would need to be taken of the data (e.g. if we chose to put 6 bins below the mean and 4 above it, that would still be using data-based bin boundaries). 
+ +But this use of bins based on seeing the data would presumably affect the distribution of the test statistic under the null. + +I have seen plenty of discussion about the fact that - *if* the parameters are estimated by maximum likelihood from the *binned* data - you lose 1 d.f per estimated parameter (an issue dating right back to Fisher vs Karl Pearson) - but I don't recall reading anything about finding the bin boundaries themselves based on the data. (If you estimate them from the unbinned data, then with $k$ bins the distribution of the test statistic lies somewhere between a $\chi^2_{k}$ and a $\chi^2_{k-p}$.) + +Does this data-based choice of bins substantively impact significance level or power? Are there some approaches that matter more than others? If there is much of an effect, is it something that goes away in large samples? + +If it does have a substantive impact, this would seem to make the use of a chi-squared test when parameters are unknown almost useless in many cases (in spite of still being advocated in quite a few texts), unless you had a good a-priori estimate of the parameter. + +Discussion of the issues or pointers to references (preferably with a mention of their conclusions) would be useful. + +--- + +Edit, pretty much an aside to the main question: + +It occurs to me that there are potential solutions for the specific case of the exponential\* (and the uniform come to think of it), but I am still interested in the more general issue of the impact choosing bin boundaries. + +\* For example, for the exponential, one might use the smallest observation (say it is equal to $m$) to get a very rough idea of where to place the bins (since the smallest observation is exponential with mean $\mu/n$), and then test the remaining $n-1$ differences ($x_i - m$) for exponentiality. Of course that might yield a very poor estimate of $\mu$, and hence poor bin choices, though I suppose one might use the argument recursively in order to take the lowest two or three observations from which to choose reasonable bins and then test the differences of the remaining observations above the largest of those smallest order statistics for exponentiality) +",added 65 characters in body,2013-10-04 13:09:48.050 +184047,56780,15280.0,5,,CC BY-SA 3.0,b578444c-ed0f-4f43-8e70-a55e8eff0531,"I have an issue with the proof of + +>> $E(Y|X) \in \arg \min_{g(X)} E\Big[\big(Y - g(X)\big)^2\Big]$ + +which very likely reveal a deeper misunderstanding of expectations and conditional expectations. + +The proof I know goes as follows ( another version of this proof can be found [here][1]) + +\begin{align*} +&\arg \min_{g(X)} E\Big[\big(Y - g(x)\big)^2\Big]\\ + = &\arg \min_{g(X)} E \Big[ \big(Y - E(Y|X) + E(Y|X) - g(X)\big)^2\Big]\\ +=&\arg \min_{g(x)} E \Big[ \big(Y - E(Y|X)\big)^2 - 2 \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big) + \big(E(Y|X) - g(X)\big)^2\Big]\\ +=&\arg \min_{g(x)} E \Big[ -2 \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big) + \big(E(Y|X) - g(X)\big)^2\Big]\\ +\end{align*} + +The proof then typically continues with an argument showing that $-2 E\Big[ \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big)\Big] = 0$, and hence + +\begin{align*} +\arg \min_{g(x)} E\Big[\big(Y - g(x)\big)^2\Big] = \arg \min_{g(x)} E \Big[\big(E(Y|X) - g(X)\big)^2\Big] +\end{align*} + +which can be seen to be minimized when $g(X) = E(Y|X)$. + +My puzzles about the proof are the following: + + 1. Consider + +>> $E \Big[ -2 \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big) + \big(E(Y|X) - g(X)\big)^2\Big]$. 
+ + It seems to me that, independently of any argument showing that the first term is always equal to zero, one can see that setting $g(X) = E(Y|X)$ minimizes the expression as it implies $\big(E(Y|X) - g(X)\big) =0$ and hence + +>> $E \Big[ -2 \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big) + \big(E(Y|X) - g(X)\big)^2\Big] = E( 0 + 0)$ = 0. + +But if this is true, then one might repeat the proof replacing $E(Y|X)$ by any other function of $X$, say $h(X)$, and get to the conclusion that it is $h(X)$ that minimizes the expression. So there must be something I misunderstand (right?). + +2. I have some doubts about the meaning of $E[(Y−g(X))^2]$ in the statement of the problem. How should the notation be interpreted? Does it mean + +>>$E_X[(Y−g(X))^2]$, $E_Y[(Y−g(X))^2]$ or $E_{XY}[(Y−g(X))^2]$? + + [1]: http://www.econ.uiuc.edu/~wsosa/econ507/CEF.pdf",deleted 4 characters in body,2013-10-04 14:03:01.287 +184155,56780,15280.0,5,,CC BY-SA 3.0,30315ba8-73f9-4496-97dc-99aa5aace307,"I have an issue with the proof of + +>> $E(Y|X) \in \arg \min_{g(X)} E\Big[\big(Y - g(X)\big)^2\Big]$ + +which very likely reveal a deeper misunderstanding of expectations and conditional expectations. + +The proof I know goes as follows ( another version of this proof can be found [here][1]) + +\begin{align*} +&\arg \min_{g(X)} E\Big[\big(Y - g(x)\big)^2\Big]\\ + = &\arg \min_{g(X)} E \Big[ \big(Y - E(Y|X) + E(Y|X) - g(X)\big)^2\Big]\\ +=&\arg \min_{g(x)} E \Big[ \big(Y - E(Y|X)\big)^2 + 2 \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big) + \big(E(Y|X) - g(X)\big)^2\Big]\\ +=&\arg \min_{g(x)} E \Big[ 2 \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big) + \big(E(Y|X) - g(X)\big)^2\Big]\\ +\end{align*} + +The proof then typically continues with an argument showing that $2 E\Big[ \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big)\Big] = 0$, and hence + +\begin{align*} +\arg \min_{g(x)} E\Big[\big(Y - g(x)\big)^2\Big] = \arg \min_{g(x)} E \Big[\big(E(Y|X) - g(X)\big)^2\Big] +\end{align*} + +which can be seen to be minimized when $g(X) = E(Y|X)$. + +My puzzles about the proof are the following: + + 1. Consider + +>> $E \Big[ 2 \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big) + \big(E(Y|X) - g(X)\big)^2\Big]$. + + It seems to me that, independently of any argument showing that the first term is always equal to zero, one can see that setting $g(X) = E(Y|X)$ minimizes the expression as it implies $\big(E(Y|X) - g(X)\big) =0$ and hence + +>> $E \Big[ 2 \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big) + \big(E(Y|X) - g(X)\big)^2\Big] = E( 0 + 0)$ = 0. + +But if this is true, then one might repeat the proof replacing $E(Y|X)$ by any other function of $X$, say $h(X)$, and get to the conclusion that it is $h(X)$ that minimizes the expression. So there must be something I misunderstand (right?). + +2. I have some doubts about the meaning of $E[(Y−g(X))^2]$ in the statement of the problem. How should the notation be interpreted? Does it mean + +>>$E_X[(Y−g(X))^2]$, $E_Y[(Y−g(X))^2]$ or $E_{XY}[(Y−g(X))^2]$? 
+ + [1]: http://www.econ.uiuc.edu/~wsosa/econ507/CEF.pdf",sign error,2013-10-04 23:15:58.010 +184194,56859,6805.0,1,,CC BY-SA 3.0,bee0fd00-b9fd-4418-a516-c201cc38df00,"What are descriptive and inferential statistics, and how do they differ?",,2013-10-05 04:59:21.093 +184193,56859,6805.0,3,,CC BY-SA 3.0,bee0fd00-b9fd-4418-a516-c201cc38df00,,,2013-10-05 04:59:21.093 +184195,56859,6805.0,2,,CC BY-SA 3.0,bee0fd00-b9fd-4418-a516-c201cc38df00,"My understanding was that descriptive statistics quantitatively described features of a data sample, while inferential statistics made inferences about the populations from which samples were drawn. + +However, the [wikipedia page for statistical inference][1] states: + +> For the most part, statistical inference makes propositions about +> populations, using data drawn from the population of interest via some +> form of random sampling. + +The ""for the most part"" has thrown me. Are there examples of inferential statistics that don't make propositions about populations? + + + [1]: http://en.wikipedia.org/wiki/Statistical_inference",,2013-10-05 04:59:21.093 +184197,56859,6805.0,5,,CC BY-SA 3.0,69619d75-9e53-46c4-bf88-749a35faec51,"My understanding was that descriptive statistics quantitatively described features of a data sample, while inferential statistics made inferences about the populations from which samples were drawn. + +However, the [wikipedia page for statistical inference][1] states: + +> For the most part, statistical inference makes propositions about +> populations, using data drawn from the population of interest via some +> form of random sampling. + +The ""for the most part"" has made me think I perhaps don't properly understand these concepts. Are there examples of inferential statistics that don't make propositions about populations? + + + [1]: http://en.wikipedia.org/wiki/Statistical_inference",added 55 characters in body,2013-10-05 05:30:28.103 +184199,56860,155.0,2,,CC BY-SA 3.0,391fdd35-ce53-4ab1-8ba9-afbd94e441a7,"Coming from a behavioural sciences background, I associate this terminology particularly with introductory statistics textbooks. In this context the distinction is that : + +* **Descriptive statistics** are functions of the sample data that are intrinsically interesting in describing some feature of the data. Classic descriptive statistics include mean, min, max, standard deviation, median, skew, kurtosis. +* **Inferential statistics** are a function of the sample data that assists you to draw an inference regarding an hypothesis about a population parameter. Classic inferential statistics include z, t, $\chi^2$, F-ratio, etc. + +The important point is that any statistic, inferential or descriptive, is a function of the sample data. A parameter is a function of the population, where the term population is the same as saying the underlying data generating process. + +From this perspective the status of a given function of the data as a descriptive or inferential statistic depends on the purpose for which you are using it. + +That said, some statistics are clearly more useful in describing relevant features of the data, and some are well suited to aiding inference. + +* **Inferential statistics:** Standard test statistics like t and z, for a given data generating process, where the null hypothesis is false, the expected value is strongly influenced by sample size. Most researchers would not see such statistics as estimating a population parameter of intrinsic interest. 
+* **Descriptive statistics**: In contrast descriptive statistics do estimate population parameters that are typically of intrinsic interest. For example the sample mean and standard deviation provide estimates of the equivalent population parameters. Even descriptive statistics like the minimum and maximum provide information about equivalent or similar population parameters, although of course in this case, much more care is required. Furthermore, many descriptive statistics might be biased or otherwise less than ideal estimators. However, they still have some utility in estimating a population parameter of interest. + +So from this perspective, the important things to understand are: + +* **statistic**: function of the sample data +* **parameter**: function of the population (data generating process) +* **estimator**: function of the sample data used to provide an estimate of a parameter +* **inference**: process of reaching a conclusion about a parameter + +Thus, you could either define the distinction between descriptive and inferential based on the intention of the researcher using the statistic, or you could define a statistic based on how it is typically used. +",,2013-10-05 05:51:35.693 +184200,56859,155.0,4,,CC BY-SA 3.0,c9516594-9ccd-488f-8fc3-414b330e5282,What is the difference between descriptive and inferential statistics?,edited tags; edited title,2013-10-05 05:52:04.663 +184253,56875,947.0,2,,CC BY-SA 3.0,2c419e31-10fa-4d70-bf04-ab6eb4cea370,"I have become aware that a group at a large corporation is developing an econometrics model to forecast sales of their product. They are using this model solely to estimate sales in specified stress test economic scenarios where they are given what the economic environment will be like, including real GDP contraction, rising unemployment rate, etc... Because of the nature of those scenarios, they think the most proper way to construct this model is to focus solely on the 2008-2009 period capturing the main period of the recent financial crisis. They have monthly data, so that gives them 24 monthly data points. Given that GDP's frequency is really quarterly, on this one variable it gives them only 8 true datapoints. But, they extrapolate it into 24 month observation. + +For the record, if they chose to, they have good internal data going back to 2001 and up to the current period. But, as mentioned they decided to focus instead solely on the 2008-2009 period. + +I will also answer this question as I have built many such econometrics models. And, I invite others to debate and rebutt my answer... and to post your own better answer. ",,2013-10-05 14:58:52.307 +184252,56875,947.0,1,,CC BY-SA 3.0,2c419e31-10fa-4d70-bf04-ab6eb4cea370,Can you develop an econometrics model for stress test purpose only focusing on 2008-2009 data?,,2013-10-05 14:58:52.307 +184251,56875,947.0,3,,CC BY-SA 3.0,2c419e31-10fa-4d70-bf04-ab6eb4cea370,,,2013-10-05 14:58:52.307 +184258,56875,947.0,5,,CC BY-SA 3.0,dc8c2207-9257-4949-a24c-4ea93f23b76a,"I have become aware that a group at a large corporation is developing an econometrics model to forecast sales of their product. They are using this model solely to estimate sales in specified stress test economic scenarios where they are given what the economic environment will be like, including real GDP contraction, rising unemployment rate, etc... out to 2016. 
Because of the nature of those scenarios, they think the most proper way to construct this model is to focus solely on the 2008-2009 period capturing the main period of the recent financial crisis. They have monthly data, so that gives them 24 monthly data points. Given that GDP's frequency is really quarterly, on this one variable it gives them only 8 true datapoints. But, they extrapolate it into 24 month observations. + +For the record, if they chose to, they have good internal data going back to 2001 and up to the current period. But, as mentioned they decided to focus instead solely on the 2008-2009 period. + +I will also answer this question as I have built many such econometrics models. And, I invite others to debate and rebutt my answer... and to post your own better answer. ",added 14 characters in body,2013-10-05 15:12:25.113 +184283,56684,0.0,36,,,edb576dd-3c9c-4173-b1af-d509360e3ca2,,from http://math.stackexchange.com/questions/512624/significance-test-for-highly-skewed-bernoulli-distribution,2013-10-05 16:31:40.173 +184376,56911,1506.0,1,,CC BY-SA 3.0,57f47e46-f22c-432c-9d09-e3068f7eee00,Detect Pattern in Residual Plot,,2013-10-06 06:53:48.957 +184377,56911,1506.0,2,,CC BY-SA 3.0,57f47e46-f22c-432c-9d09-e3068f7eee00,"I wish to automatically (not by visual inspection) detect where large deviations occur in a residual plot from a regression. For example, suppose I have the residual plot below: + + +![enter image description here][1] + + +I want to automatically detect the observations from about 30:35 deviate from a normal residual pattern. Some clues are that the magnitude is quite large and the residuals do not appear independent in this region. How can I go about this? + + + [1]: https://i.stack.imgur.com/IWgZV.png",,2013-10-06 06:53:48.957 +184378,56911,1506.0,3,,CC BY-SA 3.0,57f47e46-f22c-432c-9d09-e3068f7eee00,,,2013-10-06 06:53:48.957 +184435,56911,,4,user88,CC BY-SA 3.0,89e85066-fb9c-459e-a134-57d51b5f56b8,Detecting patterns in residual plot,edited title,2013-10-06 13:08:49.547 +184444,56928,16046.0,1,,CC BY-SA 3.0,81814c07-2601-49e8-bb84-cfcf1a73f3d4,Reference for Hierarchical Bayesian Modelling,,2013-10-06 15:21:17.703 +184442,56928,16046.0,3,,CC BY-SA 3.0,81814c07-2601-49e8-bb84-cfcf1a73f3d4,,,2013-10-06 15:21:17.703 +184443,56928,16046.0,2,,CC BY-SA 3.0,81814c07-2601-49e8-bb84-cfcf1a73f3d4,"I am currently reading ""Bayesian Data Analysis"" by Gelman et. al. and my main goal was to learn about Hierarchical modeling on chapter 5. I read until chapter 4 and the book is written terribly for a taste of a math student as it is pretty sketchy and engineering oriented. + +I decided to not to continue anymore with this book and I would be very grateful if somebody could introduce a reference with a more rigorous approach to the topic. + +",,2013-10-06 15:21:17.703 +184462,56768,11490.0,33,,,95a9c40e-d2f7-4519-85cc-2621ef94cbbb,,806,2013-10-06 17:15:36.623 +184469,56768,,25,,,cedb0df9-a14d-448f-a400-5fce1cb28076,,http://twitter.com/#!/StackStats/status/386916228692209664,2013-10-06 18:09:52.167 +184474,56928,,4,user88,CC BY-SA 3.0,7be19069-5d02-4fed-ae65-39df9fdd61b5,Reference for hierarchical Bayesian modelling,added 1 characters in body; edited title,2013-10-06 18:22:25.083 +184475,56928,,5,user88,CC BY-SA 3.0,7be19069-5d02-4fed-ae65-39df9fdd61b5,"I am currently reading ""Bayesian Data Analysis"" by Gelman et. al. and my main goal was to learn about Hierarchical modelling on chapter 5. 
I read until chapter 4 and the book is written terribly for a taste of a math student as it is pretty sketchy and engineering oriented. + +I decided to not to continue anymore with this book and I would be very grateful if somebody could introduce a reference with a more rigorous approach to the topic. + +",added 1 characters in body; edited title,2013-10-06 18:22:25.083 +184516,56928,15827.0,5,,CC BY-SA 3.0,c3f47328-186d-41a2-be2f-63b0e6391849,"I am currently reading ""Bayesian Data Analysis"" by Gelman et al. and my main goal was to learn about Hierarchical modelling on chapter 5. I read until chapter 4 and the book is written terribly for a taste of a math student as it is pretty sketchy and engineering oriented. + +I decided to not to continue anymore with this book and I would be very grateful if somebody could introduce a reference with a more rigorous approach to the topic. + +",deleted 1 characters in body,2013-10-06 22:45:52.387 +184545,56955,22423.0,1,,CC BY-SA 3.0,7db1e6d2-ce4a-4628-a8d2-b6e92221d5da,How to combine data from 5 surveys from the same population spanning 10 years,,2013-10-07 02:44:46.310 +184544,56955,22423.0,3,,CC BY-SA 3.0,7db1e6d2-ce4a-4628-a8d2-b6e92221d5da,,,2013-10-07 02:44:46.310 +184546,56955,22423.0,2,,CC BY-SA 3.0,7db1e6d2-ce4a-4628-a8d2-b6e92221d5da,"I have results from 5 surveys each 2 years apart and let us assume that no subjects are selected in more than one survey. + +The sampling method used in these surveys are biased and I have sampling weights calculated(with respect to the population) for each data point in each study. + +The question is, how would I be able to combine the 5 datasets and have the weights recalculated so as to obtain one giant dataset for analysis on this population? + +Also, what should I do if subjects appear in more than one survey? ",,2013-10-07 02:44:46.310 +185759,57297,18198.0,1,,CC BY-SA 3.0,b510d6c8-caec-4002-917b-283ff85acac8,Testing whether two Eigen decompositions are equal,,2013-10-11 12:13:44.087 +185762,57287,,25,,,bc00f02f-2f08-4677-9bc9-86d3fe74cc67,,http://twitter.com/#!/StackStats/status/388639436827025408,2013-10-11 12:17:17.037 +184573,56955,22423.0,5,,CC BY-SA 3.0,1b526a42-1226-40d3-8fb4-97f89adb779d,"I have results from 5 surveys each 2 years apart and let us assume that no subjects are selected in more than one survey. + +The sampling method used in these surveys are biased and I have sampling weights calculated(with respect to the population) for each data point in each study. + +The question is, how would I be able to combine the 5 datasets and have the weights recalculated so as to obtain one giant dataset for analysis on this population? + +Also, what should I do if subjects appear in more than one survey? + +# Updates/Further Elaboration: # + +thank you @user30523, here are some more infomation that might be useful: + +Suppose I wish to find out the estimated distribution of height across the population using these 5 datasets. + +In some data, younger people are oversampled because of the location where the survey are conducted. Let's assume the weights are calculated with respect to their age. + +Eg. assuming 2% of the population are 15 years old, and the location of the survey is at a mall where 15-year-olds made up 5% of all shoppers, then sampling weight for an subject aged 15 in that survey would be calculated as 0.02 / 0.05 = 0.4. For simplicity, each person in the mall has equal chance of being surveyed and all participants complied when asked. 
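The worked weight in the survey question above (0.02 / 0.05 = 0.4) amounts to dividing a group's population share by its share among respondents. A minimal sketch of that calculation, using hypothetical age-group shares rather than anything from the original post:

```python
# Sampling weight = population share of a group / share of that group among respondents.
# The age groups and shares below are hypothetical, chosen to reproduce the 0.4 example.
population_share = {"age_15": 0.02, "age_30": 0.10}
respondent_share = {"age_15": 0.05, "age_30": 0.08}

weights = {g: population_share[g] / respondent_share[g] for g in population_share}
print(weights)  # {'age_15': 0.4, 'age_30': 1.25}
```

The same ratio would be computed per survey; how to recombine the five sets of weights is the open question the post goes on to ask.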
+ +Given that 5 surveys are conducted in 5 different malls and each has their set of weights calculated in the same way, how would I then be able to combine all 5 datasets and recalculate the sampling weights? + +P.S: I'm new to the topic on sampling weights so do correct me if I have made errors in the way I have calculated the weights.",added 1103 characters in body,2013-10-07 07:09:40.857 +184600,56970,22372.0,2,,CC BY-SA 3.0,45ba5452-56ce-4511-b70d-da164b64e467,"I'm building a logit model using R and I'm getting a result of 88.9% of accuracy (verified using the ROC [in rattle, evaluation tab] using 30% of my 34k dataset). + +What kind of tests would be interesting to do to certify myself that it's a good model?",,2013-10-07 09:44:35.967 +184602,56970,22372.0,3,,CC BY-SA 3.0,45ba5452-56ce-4511-b70d-da164b64e467,,,2013-10-07 09:44:35.967 +184601,56970,22372.0,1,,CC BY-SA 3.0,45ba5452-56ce-4511-b70d-da164b64e467,test a logit model in R,,2013-10-07 09:44:35.967 +184606,56970,1406.0,4,,CC BY-SA 3.0,af8bf362-0d50-4621-ab43-e226a75a7361,How to test a logit model in R?,edited title,2013-10-07 09:48:27.890 +184773,57012,20773.0,3,,CC BY-SA 3.0,e30f56bd-53f7-490c-817b-5e0a7924a4b7,,,2013-10-07 20:36:21.153 +184771,57012,20773.0,2,,CC BY-SA 3.0,e30f56bd-53f7-490c-817b-5e0a7924a4b7,"I have observations taken with different sensitivity thresholds and minimum detection levels, i.e. Lab A is less sensitive and has a minimum detection level of .2 and Lab B is more sensitive and has a minimum detection level of .02. Each row corresponds to a unique measurement taken by either lab: + + Obs | Lab A | Lab B + --------------------- + 1 | .6 | NA + 2 | 0 | NA + 3 | NA | .53 + 4 | .2 | NA + 5 | NA | .07 + +I think I would like something like: + + Obs | LabA | LabB | NewLab + ---------------------------- + 1 | .6 | NA | .64 + 2 | 0 | NA | .13 + 3 | NA | .53 | .53 + 4 | .2 | NA | .21 + 5 | NA | .07 | .07 + +What techniques are available to standardize the values such that there is not a large loss of information? + + 1. Obviously, I could take the values from Lab B and replace anything less than .2 with 0 and then round them, but I want to avoid throwing away information if possible. + 2. One person suggested to add random noise to the values of Lab A, but I'm not sure of the benefit of this vs. simply imputing the missing values from Lab B.",,2013-10-07 20:36:21.153 +184772,57012,20773.0,1,,CC BY-SA 3.0,e30f56bd-53f7-490c-817b-5e0a7924a4b7,How to compare different sensitivity thresholds and detection limits?,,2013-10-07 20:36:21.153 +184780,57015,22454.0,3,,CC BY-SA 3.0,55b254fa-2a3d-4def-b47b-cedd6e7c6628,,,2013-10-07 20:50:53.017 +184782,57015,22454.0,1,,CC BY-SA 3.0,55b254fa-2a3d-4def-b47b-cedd6e7c6628,Testing statistical significance in two conditions,,2013-10-07 20:50:53.017 +184781,57015,22454.0,2,,CC BY-SA 3.0,55b254fa-2a3d-4def-b47b-cedd6e7c6628,"I am measuring two variables $x$ and $y$ in two different conditions. In the first condition, my hypothesis is that $\bar{x} > \bar{y}$ and in the second condition that $\bar{x} < \bar{y}$. Now that I have $N$ samples from both variables, how can I test whether my hypotheses are true? I am not sure if I can safely assume that $x$ and $y$ are independent from each other. Neither do I know from what kind of distributions they are sampled from. The sample size I have is small. I have read several introductions to statistics for the past few days, but never saw a worked out example for this kind of situations. 
All help appreciated.",,2013-10-07 20:50:53.017 +184821,57026,22458.0,3,,CC BY-SA 3.0,c3011751-2e16-4c3e-a7a6-c6e4c2944a16,,,2013-10-08 01:16:28.460 +184820,57026,22458.0,1,,CC BY-SA 3.0,c3011751-2e16-4c3e-a7a6-c6e4c2944a16,Relationship between the kernel and the value of C in SVM's,,2013-10-08 01:16:28.460 +184819,57026,22458.0,2,,CC BY-SA 3.0,c3011751-2e16-4c3e-a7a6-c6e4c2944a16,"How exactly does the value of C relate across different kernels that we can use for SVM's? As in, how does it vary when changing the polynomial degree of a kernel or while using a gaussian kernel?",,2013-10-08 01:16:28.460 +184825,57026,5237.0,5,,CC BY-SA 3.0,a89e8068-5550-4d3c-98b6-5d2560f574d5,"How exactly does the value of C relate across different kernels that we can use for SVM's? As in, how does it vary when changing the polynomial degree of a kernel or while using a Gaussian kernel?",light editing,2013-10-08 02:09:52.270 +184832,57026,,25,,,b3edbe93-c571-483f-ae7d-61688b411ec7,,http://twitter.com/#!/StackStats/status/387415183624519680,2013-10-08 03:12:32.320 +184922,57053,22475.0,2,,CC BY-SA 3.0,eae6a2b9-8dfe-4f27-8dd8-d5497599071c,"How did I describe descriptive statistics for dummy variable (gender of worker in a shop)? let say this is the info that I have: + +mean : 0.47 +median : 0 +max : 1 +min : 0 +std. dev : 0.4998 +skewness : 0.101 +kurtosis : 1.01 +jarque bera : 85.67 +probability : 0 + +i know that some of the informations are useless since it's a dummy variable. so how do i interpret it in words?",,2013-10-08 12:31:47.380 +184920,57053,22475.0,3,,CC BY-SA 3.0,eae6a2b9-8dfe-4f27-8dd8-d5497599071c,,,2013-10-08 12:31:47.380 +184921,57053,22475.0,1,,CC BY-SA 3.0,eae6a2b9-8dfe-4f27-8dd8-d5497599071c,Descriptive statistics,,2013-10-08 12:31:47.380 +184929,57055,22.0,2,,CC BY-SA 3.0,9ac3a97b-0747-4081-b1e2-f55528d3bc26,"I think only one descriptive statistic is needed: ""47% are male"" (assuming 0 encodes female and 1 encodes male). No other statistics are really helpful to describe those data. If you thought these were a randomish sample of a larger population, you could compute the confidence interval for that proportion. 
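As a minimal sketch of the confidence interval mentioned at the end of the answer above (not part of the original post), assuming a simple normal (Wald) approximation and a hypothetical 0/1 gender variable:

```python
import numpy as np
from scipy.stats import norm

# Hypothetical dummy-coded variable: 1 = male, 0 = female.
rng = np.random.default_rng(0)
gender = rng.binomial(1, 0.47, size=500)

p_hat = gender.mean()                  # the "47% are male" style statistic
n = gender.size
se = np.sqrt(p_hat * (1 - p_hat) / n)  # standard error of a sample proportion
z = norm.ppf(0.975)                    # two-sided 95% normal quantile

print(f"proportion: {p_hat:.3f}, 95% CI: ({p_hat - z*se:.3f}, {p_hat + z*se:.3f})")
```

For small samples or proportions near 0 or 1, a Wilson or exact interval would be preferable; the Wald form is used here only because it is the shortest to write down.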
+",,2013-10-08 13:08:34.837 +184931,57053,5237.0,4,,CC BY-SA 3.0,801ef2d3-22c6-4d65-94b9-f6989f650e9e,Interpretation of descriptive statistics for dummy variable,clarified issue in title; added tags; edited for English; formatted,2013-10-08 13:09:13.483 +185951,57271,20473.0,5,,CC BY-SA 3.0,65103292-f647-47a7-8a42-77fba3010ccc,"Write your system explicitly for time $t$ as (""$L$"" for ""loss"", as a positive quantity, and ""$G$"" for ""gain"") +$$ A_t - A_{t-1} = - L^A_{t} + G_{t}^{B\rightarrow A}+G_{t}^{C\rightarrow A}$$ + +$$ B_t - B_{t-1} = - L^B_{t} + G_{t-1}^{A\rightarrow B}+G_{t}^{C\rightarrow B}$$ + +$$ C_t - C_{t-1} = - L^C_{t} + G_{t}^{A\rightarrow C}+G_{t}^{B\rightarrow C}$$ + +The following three relations hold exactly: +$$ L^A_{t} = G_{t}^{A\rightarrow B} + G_{t}^{A\rightarrow C} $$ +$$ L^B_{t} = G_{t}^{B\rightarrow A} + G_{t}^{B\rightarrow C} $$ +$$ L^C_{t} = G_{t}^{C\rightarrow A} + G_{t}^{C\rightarrow B} $$ + +If you substitute in the first three you obtain + +$$ A_t - A_{t-1} = - G_{t}^{A\rightarrow B} - G_{t}^{A\rightarrow C} + G_{t}^{B\rightarrow A}+G_{t}^{C\rightarrow A}$$ + +$$ B_t - B_{t-1} = - G_{t}^{B\rightarrow A} - G_{t}^{B\rightarrow C} + G_{t}^{A\rightarrow B}+G_{t}^{C\rightarrow B}$$ + +$$ C_t - C_{t-1} = - G_{t}^{C\rightarrow A} - G_{t}^{C\rightarrow B} + G_{t}^{A\rightarrow C}+G_{t}^{B\rightarrow C}$$ + +You have $6$ unknown quantities to estimate _per time period_. There is just not enough information to do that. So you need assumptions that will impose structure (=restrictions) on the situation, and will permit you to estimate _something_. What? Let's say you assume that there is a relatively stable ""churn"" from one company to another, as a linear function of their market share in the previous period. This assumption brings in a set of unknown coefficients to be estimated (which will then give you an estimate of ""hidden transfers of market share""). Write $G_{t}^{A\rightarrow B} = a_bA_{t-1}$ (market share lost from $A$ to $B$ as a linear function of $A$'s market share in period $t-1$). +Your equations will become + +$$ A_t - A_{t-1} = - a_bA_{t-1} - a_cA_{t-1} + b_aB_{t-1}+c_aC_{t-1} $$ + +$$ B_t - B_{t-1} = - b_aB_{t-1} - b_cB_{t-1} + a_bA_{t-1}+c_bC_{t-1}$$ + +$$ C_t - C_{t-1} = - c_aC_{t-1} - c_bC_{t-1} + a_cA_{t-1}+ b_cB_{t-1}$$ + +We have turned a set of mathematical identities into a _model_. It is doubtful that this model will hold exactly for each $t$, so you should add a stochastic error term. 
Rearranging we obtain a first-order Vector Autoregression (VAR): + +$$ \left[ \begin{matrix} +A_t \\ +B_t \\ +C_t \\ +\end{matrix} \right] = \left [\begin{matrix} +1-a_b-a_c & b_a & c_a \\ +a_b & 1-b_a-b_c & c_b \\ +a_c & b_c & 1-c_a-c_b \\ +\end{matrix} \right] \left[ \begin{matrix} +A_{t-1} \\ +B_{t-1} \\ +C_{t-1} \\ +\end{matrix} \right]+ \left[ \begin{matrix} +u^A_{t} \\ +u^B_{t} \\ +u^C_{t} \\ +\end{matrix} \right] $$ + +or, to homogenize notation, + +$$ \left[ \begin{matrix} +A_t \\ +B_t \\ +C_t \\ +\end{matrix} \right] = \left [\begin{matrix} +\gamma_{11} & \gamma_{12} & \gamma_{13} \\ +\gamma_{21} & \gamma_{22} & \gamma_{23} \\ +\gamma_{31} & \gamma_{32} & \gamma_{33} \\ +\end{matrix} \right] \left[ \begin{matrix} +A_{t-1} \\ +B_{t-1} \\ +C_{t-1} \\ +\end{matrix} \right]+ \left[ \begin{matrix} +u^A_{t} \\ +u^B_{t} \\ +u^C_{t} \\ +\end{matrix} \right] $$ + +subject to the equality restrictions +$$ \begin{matrix} +\gamma_{11} + \gamma_{21} + \gamma_{31} =1 \\ +\gamma_{12} + \gamma_{22} + \gamma_{32} =1 \\ +\gamma_{13} + \gamma_{23} + \gamma_{33} =1 \\ +\end{matrix} $$ + +So you have essentially $6$ unknown coefficients and a sample of $T-1$ observations (for each company). +Note that these restrictions _imply_ the ""add up to unity"" restriction $A_t+B_t+C_t =1$ for each $t$, so this last one does not impose any additional structure on the unknown coefficients -but it does imply a relation between the error terms, namely that $u^A_{t} + u^B_{t} +u^C_{t} =0$. Any additional assumptions on the three error terms should either come from knowledge of the specific real world phenomenon under study, and/or through a statistical specification search. + +The for example an estimation for the hidden transfers of market share will be, for example + +$$\hat G_{t}^{A\rightarrow B} = \hat \gamma_{21}A_{t-1}$$ + +etc. + +Of course you may find that such a model does not fit your data sample well - for example you expect that all estimated coefficients should be positive and smaller than or equal to unity, but the estimation procedure may not give you that. But this is what we do: we come up with specification hypotheses and we test them against the data - ""success"" is never guaranteed. Then you should try to come up with a different model. + +",Cange time-indexing and added an example,2013-10-12 14:00:57.927 +184993,57012,20773.0,5,,CC BY-SA 3.0,e8cab215-3ee1-433f-be26-5f92422567ae,"I have observations taken with different sensitivity thresholds and minimum detection levels, i.e. Lab A is less sensitive and has a minimum detection level of .2 and Lab B is more sensitive and has a minimum detection level of .02. Each row corresponds to a unique measurement taken by either lab: + + Obs | Lab A | Lab B + --------------------- + 1 | .6 | NA + 2 | 0 | NA + 3 | NA | .53 + 4 | .2 | NA + 5 | NA | .07 + +I think I would like something like: + + Obs | LabA | LabB | NewLab + ---------------------------- + 1 | .6 | NA | .64 + 2 | 0 | NA | .13 + 3 | NA | .53 | .53 + 4 | .2 | NA | .21 + 5 | NA | .07 | .07 + +What techniques are available to standardize the values such that there is not a large loss of information? + + 1. Obviously, I could take the values from Lab B and replace anything less than .2 with 0 and then round them, but I want to avoid throwing away information if possible. + 2. One person suggested to add random noise to the values of Lab A, but I'm not sure of the benefit of this vs. simply imputing the missing values from Lab B. 
+ +*Edit 1:* +There are no observations for which both Lab A and Lab B values are present, one will always be missing.",added 119 characters in body,2013-10-08 16:25:34.013 +184996,56372,21108.0,5,,CC BY-SA 3.0,b18542c4-d61f-47e2-b5be-760f17240604,"I just made an implementation of P(A|B)/P(¬A|B) for a ""people who bought this also bought..."" algorithm. + + +I'm doing it by + + P(A|B) = count_users(bought_A_and_B)/count_users(bought_A) + P(¬A|B) = count_users(bought_B_but_not_A)/count_users(did_not_buy_A) + +Then dividing the top one by the bottom one I get a score which makes absolute sense, but what kind of correlation am I calculating? What is this method called? Where can I read more about it? + +[EDIT] When the number of users who bought item B but not item A is zero I just assume the correlation is 0. The same goes on when the number of users who bought A is zero.",edited body,2013-10-08 16:35:30.503 +184997,56372,21108.0,5,,CC BY-SA 3.0,7cc7e87d-a414-49ef-8ecf-f4c24763154c,"I just made an implementation of P(A|B)/P(¬A|B) for a ""people who bought this also bought..."" algorithm. + + +I'm doing it by + + P(A|B) = count_users(bought_A_and_B)/count_users(bought_A) + P(¬A|B) = count_users(bought_B_but_not_A)/count_users(did_not_buy_A) + +Then dividing the top one by the bottom one I get a score which makes absolute sense, but what kind of correlation am I calculating? What is this method called? Where can I read more about it? + +[EDIT] This is not for using in a production environment, it is just some algorithm which appeared out of the blue in an online course I'm taking, I was just wondering where it could come from. Also, when the number of users who bought item B but not item A is zero I just skip the pair until I get more data. The same goes on when the number of users who bought A is zero.",edited body,2013-10-08 16:41:37.913 +185022,57086,22034.0,2,,CC BY-SA 3.0,d1f0ea48-e92f-4b74-b688-7899e3b89c34,"I've read through the following posts that answered the question I was going to ask: + +http://stats.stackexchange.com/questions/59741/use-random-forest-model-to-make-predictions-from-sensor-data + +http://stats.stackexchange.com/questions/64201/decision-tree-for-output-prediction + +Here's what I've done so far: I compared Logistic Regression to Random Forests and RF outperformed Logistic. Now the medical researchers I work with want to turn my RF results into a medical diagnostic tool. For example: + +If you are an Asian Male between 25 and 35, have Vitamin D below xx and Blood Pressure above xx, you have a 76% chance of developing disease xxx. + +However, RF doesn't lend itself to simple mathematical equations (see above links). So here's my question: what ideas do you all have for using RF to develop a diagnostic tool (without having to export hundreds of trees). + +Here's a few of my ideas: + +1. Use RF for variable selection, then use Logistic (using all possible interactions) to make the diagnostic equation. +2. Somehow aggregate the RF forest into one ""mega-tree,"" that somehow averages the node splits across trees. +3. Similar to #2 and #1, use RF to select variables (say m variables total), then build hundreds of classification trees, all of which uses every m variable, then pick the best single tree. + +Any other ideas? 
Also, doing #1 is easy, but any ideas on how to implement #2 and #3?",,2013-10-08 18:41:35.193 +185024,57086,22034.0,3,,CC BY-SA 3.0,d1f0ea48-e92f-4b74-b688-7899e3b89c34,,,2013-10-08 18:41:35.193 +185023,57086,22034.0,1,,CC BY-SA 3.0,d1f0ea48-e92f-4b74-b688-7899e3b89c34,Ideas for outputting a prediction equation for Random Forests,,2013-10-08 18:41:35.193 +185032,57015,,24,,CC BY-SA 3.0,e5712be2-23eb-4c93-a80d-dc6d4e8e7e04,,Proposed by anonymous approved by 686 edit id of 5573,2013-10-08 20:07:05.960 +185031,57015,0.0,5,,CC BY-SA 3.0,e5712be2-23eb-4c93-a80d-dc6d4e8e7e04,"I am measuring two variables $x$ and $y$ in two different conditions. In the first condition, my hypothesis is that $\bar{x} > \bar{y}$ and in the second condition that $\bar{x} < \bar{y}$. Now that I have $N$ samples from both variables, how can I test whether my hypotheses are true? I am not sure if I can safely assume that $x$ and $y$ are independent from each other. Neither do I know from what kind of distributions they are sampled from. The sample size I have is small. I have read several introductions to statistics for the past few days, but never saw a worked out example for this kind of situations. All help appreciated. + +Edit: Like Michael Mayer wrote, there is a binary grouping variable ""condition"". Sorry for a bit unclear question.",more information to question,2013-10-08 20:07:05.960 +185072,57065,594.0,5,,CC BY-SA 3.0,05dead74-dd75-430d-b197-517eccc833bf,"We've run a split test of a new product feature and want to measure if the uplift on revenue is significant. Our observations are definitely not normally distributed (most of our users don't spend, and within those that do, it is heavily skewed towards lots of small spenders and a few very big spenders). + +We've decided on using bootstrapping to compare the means, to get round the issue of the data not being normally distributed (side-question: is this a legitimate use of bootstrapping?) + +My question is, do I need to trim outliers from the data set (e.g. the few very big spenders) before I run the bootstrapping, or does that not matter? + +",deleted 54 characters in body,2013-10-08 23:07:21.240 +185106,57110,22494.0,1,,CC BY-SA 3.0,91783daf-4317-40a8-b965-aa6c2b05ebf2,how to remove seasonality from daily electricity demand,,2013-10-09 03:35:34.333 +185105,57110,22494.0,3,,CC BY-SA 3.0,91783daf-4317-40a8-b965-aa6c2b05ebf2,,,2013-10-09 03:35:34.333 +185107,57110,22494.0,2,,CC BY-SA 3.0,91783daf-4317-40a8-b965-aa6c2b05ebf2,"how to remove seasonality from daily electricity demand (arima) + +I tried to build a model to forecast daily electricity demand in R, and plot my data as per blow. +![Daily electricity demand][1] + +I tried to remove seasonality with the following, +demand.xts.diff<-diff(demand.xts,lag=1,difference=1) +demand.xts.diff<-diff(demand.xts,lag=7,difference=1) + +I also tried to use lag=365,366 (I am not sure what lag to use, due tot he leap year issue), but none of them successfully removed seasonality, and the ACF and PACF are as per blow + +![ACF][2] +![PACF][3] + +Anybody with experience of modelling daily electricity demand, please give some advice, any advice is appreciated. 
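For readers following the R `diff(demand.xts, lag = 7)` calls in the electricity-demand question above, a rough pandas equivalent is sketched below. The series is synthetic, and the lag-365 annual difference is only approximate (leap years are ignored), so this is an illustration of seasonal differencing rather than a recommended model:

```python
import numpy as np
import pandas as pd

# Synthetic daily demand with a weekly and an annual pattern (purely illustrative).
idx = pd.date_range("2010-01-01", periods=3 * 365, freq="D")
rng = np.random.default_rng(1)
demand = pd.Series(
    100
    + 10 * np.sin(2 * np.pi * idx.dayofweek / 7)       # weekly cycle
    + 20 * np.sin(2 * np.pi * idx.dayofyear / 365.25)  # annual cycle
    + rng.normal(0, 3, len(idx)),
    index=idx,
)

weekly_diff = demand.diff(7)          # removes the weekly seasonality
annual_diff = weekly_diff.diff(365)   # approximate annual difference on top of it
print(annual_diff.dropna().head())
```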
+ + + [1]: https://i.stack.imgur.com/pz5OS.png + [2]: https://i.stack.imgur.com/OV7T4.png + [3]: https://i.stack.imgur.com/HcXKi.png",,2013-10-09 03:35:34.333 +185108,57110,22494.0,5,,CC BY-SA 3.0,dd8cd3d8-2a49-425a-bfcd-c9aef89c5ade,"how to remove seasonality from daily electricity demand (arima) + +I tried to build a model to forecast daily electricity demand in R, and plot my data as per blow. My understanding is there is weekly (high demand on Tue, Wed, and low demand on Sat, Sun) and annul seasonality (high demand on Winter and lower on Summer). +![Daily electricity demand][1] + +I tried to remove seasonality with the following, +demand.xts.diff<-diff(demand.xts,lag=1,difference=1) +demand.xts.diff<-diff(demand.xts,lag=7,difference=1) + +I also tried to use lag=365,366 (I am not sure what lag to use, due tot he leap year issue), but none of them successfully removed seasonality, and the ACF and PACF are as per blow + +![ACF][2] +![PACF][3] + +Anybody with experience of modelling daily electricity demand, please give some advice, any advice is appreciated. + + + [1]: https://i.stack.imgur.com/pz5OS.png + [2]: https://i.stack.imgur.com/OV7T4.png + [3]: https://i.stack.imgur.com/HcXKi.png",added 158 characters in body,2013-10-09 03:51:49.790 +185109,57110,17249.0,5,,CC BY-SA 3.0,4a926a73-1c9f-402e-9939-7c998bc7b336,"I want to remove seasonality from daily electricity demand (a time series). My understanding is there is weekly (high demand on Tue, Wed, and low demand on Sat, Sun) and annual seasonality (high demand on Winter and lower on Summer). I tried to build a model to forecast daily electricity demand in R, and plot my data as shown below: +![Daily electricity demand][1] + +I tried to remove seasonality with the following: + + demand.xts.diff<-diff(demand.xts,lag=1,difference=1) + demand.xts.diff<-diff(demand.xts,lag=7,difference=1) + +I also tried to use `lag=365` and `lag=366` (I am not sure what lag to use, due to the leap year issue), but none of them successfully removed seasonality. The ACF and PACF are shown below: + +![ACF][2] +![PACF][3] + +Any advice is appreciated. + + + [1]: https://i.stack.imgur.com/pz5OS.png + [2]: https://i.stack.imgur.com/OV7T4.png + [3]: https://i.stack.imgur.com/HcXKi.png","adjusted code, improved text",2013-10-09 04:06:14.310 +185110,57110,17249.0,4,,CC BY-SA 3.0,4a926a73-1c9f-402e-9939-7c998bc7b336,How to remove seasonality from daily electricity demand,"adjusted code, improved text",2013-10-09 04:06:14.310 +185111,57110,,24,,CC BY-SA 3.0,4a926a73-1c9f-402e-9939-7c998bc7b336,,"Proposed by 24808 approved by 805, 919 edit id of 5577",2013-10-09 04:06:14.310 +185124,57110,,25,,,10fd3468-9968-462a-8da3-b79c2136f4e3,,http://twitter.com/#!/StackStats/status/387823266498637824,2013-10-09 06:14:06.873 +185156,57126,22503.0,3,,CC BY-SA 3.0,8aff1f06-eb93-4019-bd27-180b130704ae,,,2013-10-09 09:52:16.150 +185157,57126,22503.0,2,,CC BY-SA 3.0,8aff1f06-eb93-4019-bd27-180b130704ae,"Wondering if anyone has an opinion on whether bootstrapping the difference in means is the right method given I have a situation with extreme data points. I've decided to use this as I don't think a t test is appropriate + +I have about 30k observations per group (3 groups) + +my situation is about spend, and I have extreme outliers +the outliers aren't quite like an ""income"" distribution. i.e. Most users (95%+) will spend zero, a subset of users will spend 5 - 10 dollars. 
some will spend about 20 or 50 dollars and then a select few will spend 500+, with a couple of users spending 5000 or 10000+ + +I am trying to test which group brought in the most revenue per user + + +can anyone offer any advice on which statistical test is best suited?",,2013-10-09 09:52:16.150 +185155,57126,22503.0,1,,CC BY-SA 3.0,8aff1f06-eb93-4019-bd27-180b130704ae,is bootstrapping the right method for extreme distributions,,2013-10-09 09:52:16.150 +185162,57128,22505.0,1,,CC BY-SA 3.0,7a7db1d9-a337-4ae2-be08-a265e834bb00,Poolong regression results in SPSS,,2013-10-09 10:06:51.963 +185163,57128,22505.0,3,,CC BY-SA 3.0,7a7db1d9-a337-4ae2-be08-a265e834bb00,,,2013-10-09 10:06:51.963 +185161,57128,22505.0,2,,CC BY-SA 3.0,7a7db1d9-a337-4ae2-be08-a265e834bb00,"please help me to solve the following issue: + +1. I run my linear regression model many times (let's say 1000 times) with two variables: y - continuous dependent variable, x - continuous independent variable (mean of several consequent measurements). +2. The independent variable in each model was randomly drawn using its mean and standard deviation +3. I have the regression coefficient and standard error for this independent variable in each of the models. + +Somehow I have to combine these results into one regression result. As far as I know the regression coefficients of 1000 models can be just averaged. However, this is not really clear to me how can I estimate the total variance of 1000 models. + +Thank you! + +",,2013-10-09 10:06:51.963 +185167,57128,674.0,5,,CC BY-SA 3.0,304a609f-22cf-4718-9137-9406f5713b96,"I have to solve the following issue: + +1. I run my linear regression model many times (let's say 1000 times) with two variables: y - continuous dependent variable, x - continuous independent variable (mean of several consequent measurements). +2. The independent variable in each model was randomly drawn using its mean and standard deviation +3. I have the regression coefficient and standard error for this independent variable in each of the models. + +Somehow I have to combine these results into one regression result. As far as I know the regression coefficients of 1000 models can be just averaged. However, this is not really clear to me how can I estimate the total variance of 1000 models. + +",deleted 22 characters in body; edited tags; edited title,2013-10-09 10:08:41.917 +185169,57128,674.0,6,,CC BY-SA 3.0,304a609f-22cf-4718-9137-9406f5713b96,,deleted 22 characters in body; edited tags; edited title,2013-10-09 10:08:41.917 +185168,57128,674.0,4,,CC BY-SA 3.0,304a609f-22cf-4718-9137-9406f5713b96,Pooling regression results in SPSS,deleted 22 characters in body; edited tags; edited title,2013-10-09 10:08:41.917 +185193,57137,21896.0,1,,CC BY-SA 3.0,a12ba676-3572-4938-b1a6-f4308be97151,Negative Binomial Regression: is parameter theta (R) the reciprocal of parameter kappa (SAS)?,,2013-10-09 12:00:22.943 +185192,57137,21896.0,3,,CC BY-SA 3.0,a12ba676-3572-4938-b1a6-f4308be97151,,,2013-10-09 12:00:22.943 +185363,57177,594.0,5,,CC BY-SA 3.0,c97a4854-dc00-4e21-8001-c6a5480e273d,"As for qualitative differences, the lognormal and gamma are, as you say, quite similar. + +Indeed, in practice they're often used to model the same phenomena (some people will use a gamma where others use a lognormal). They are both, for example, constant-coefficient-of-variation models (the CV for the lognormal is $\sqrt{e^{\sigma^2} -1}$, for the gamma it's $1/\sqrt \alpha$). + +[How can it be constant if it depends on a parameter, you ask? 
It applies when you model the scale (location for the log scale); for the lognormal, $\mu$ acts as a scale parameter, while for the gamma, the scale is the parameter that isn't the shape parameter (or its reciprocal if you use the shape-rate parameterization). I'll call the scale parameter for the gamma distribution $\beta$. Gamma GLMs model the mean ($\mu=\alpha\beta$) while holding $\alpha$ constant; in that case $\mu$ is also a scale parameter. A model with varying $\mu$ and constant $\alpha$ or $\sigma$ respectively will have constant CV.] + +You might find it instructive to look at the density of their *logs*, which often shows a very clear difference. + +The log of a lognormal random variable is ... normal. It's symmetric. + +The log of a gamma random variable is left-skew. Depending on the value of the shape parameter, it may be quite skew or nearly symmetric. + +Here's an example, with both lognormal and gamma having mean 1 and variance 1/4. The top plot shows the densities (gamma in green, lognormal in blue), and the lower one shows the densities of the logs: + +![gamma and lognormal, densitiy and density of log][1] + +(Plotting the log of the density of the logs is also useful. That is, taking a log-scale on the y-axis above) + +This difference implies that the gamma has more of a tail on the left, and less of a tail on the right; the far right tail of the lognormal is heavier and its left tail lighter. And indeed, if you look at the skewness, of the lognormal and gamma, for a given coefficient of variation, the lognormal is more right skew ($(e^{\sigma^2}+2) \text{CV}$) than the gamma ($2\text{CV}$). + + + [1]: https://i.stack.imgur.com/I9ARM.png",added 664 characters in body,2013-10-10 00:27:42.183 +185194,57137,21896.0,2,,CC BY-SA 3.0,a12ba676-3572-4938-b1a6-f4308be97151,"after some frantic googling I do believe the answer is yes, but more so I am frustrated that the relation between the two parameter seems to be nowhere described explicitely so I do it here. (I hope this isn't against the rules of stackexchange.) + +[This very nice article](http://digitalcommons.unl.edu/cgi/viewcontent.cgi?article=1141&context=usdeptcommercepub&sei-redir=1&referer=http%3A%2F%2Fscholar.google.nl%2Fscholar_url%3Fhl%3Dnl%26q%3Dhttp%3A%2F%2Fdigitalcommons.unl.edu%2Fcgi%2Fviewcontent.cgi%253Farticle%253D1141%2526context%253Dusdeptcommercepub%26sa%3DX%26scisig%3DAAGBfm3vz9gDbxRveIafikl02v0aeUyu0w%26oi%3Dscholarr%26ei%3DDj9VUqWlL6LG0QXe1oHwAg%26ved%3D0CDAQgAMoADAA#search=%22http%3A%2F%2Fdigitalcommons.unl.edu%2Fcgi%2Fviewcontent.cgi%3Farticle%3D1141%26context%3Dusdeptcommercepub%22) states: we will denote the random variable Y having a negative binomial distribution as Y ~ NB($\mu, \kappa$) with a parameterization such that E(Y) = $\mu$, var(Y) = $\mu + \kappa \mu^2$. + +I take this latter equation as the definition of $\kappa$. + +[Apparently](http://books.google.nl/books?id=Ohks0xwvyT4C&pg=PA196&lpg=PA196&dq=kappa+parameter+negative+binomial+proc+glimmix&source=bl&ots=PYKpaGQ8VN&sig=5sNEB-7H7ZocErTKhi35ORKd2lA&hl=nl&sa=X&ei=lEBVUqCnNcTJ0QXppYGoAg&ved=0CDYQ6AEwAA#v=onepage&q=kappa%20parameter%20negative%20binomial%20proc%20glimmix&f=false) this kappa is implemented in SAS. + +Now turning to R, the function `glm.nb` in the `MASS` package contains a parameter $\mu$ which is obviously the same $\mu$ as above and a parameter $\theta$. The question is how $\theta$ and $\kappa$ are related. The documentation for `glm.nb` only refers to it as an ""additional parameter"". 
The answer to [this](http://stats.stackexchange.com/questions/10419/what-is-theta-in-a-negative-binomial-regression-fitted-with-r) stackexchange question directly implies that $\theta = 1/\kappa$, but [this](http://stats.stackexchange.com/questions/30360/what-is-the-distribution-of-theta-in-a-negative-binomial-model-glm-nb-with-r?rq=1) question seems to suggest that $\theta = \kappa$. + +The help page for negative binomial in R is nice and introduces a parameter called `size` that equals $1/\kappa$. Fitting `glm.nb` on random data generated by `rnbinom` for various choices of $\mu$ and `size` seems to support the thesis that $\theta = 1/\kappa$ (i.e. that $\theta$ = `size`) but also that for large values of size the estimation is poor. + +Summarizing: I do believe that $\theta = 1/\kappa$ but it would be nice if there were an easily googlable place on the internet stating this explicitly. Maybe one of the answers to this questions can serve as such a place? ",,2013-10-09 12:00:22.943 +185201,57137,21896.0,5,,CC BY-SA 3.0,dcd9c3b8-6f71-448a-90ad-3d615ee3fef3,"After some frantic googling I do believe the answer is yes, but more so I am frustrated that the relation between the two parameter seems to be nowhere described explicitely so I do it here. (I hope this isn't against the rules of stackexchange.) + +[This very nice article](http://digitalcommons.unl.edu/cgi/viewcontent.cgi?article=1141&context=usdeptcommercepub&sei-redir=1&referer=http%3A%2F%2Fscholar.google.nl%2Fscholar_url%3Fhl%3Dnl%26q%3Dhttp%3A%2F%2Fdigitalcommons.unl.edu%2Fcgi%2Fviewcontent.cgi%253Farticle%253D1141%2526context%253Dusdeptcommercepub%26sa%3DX%26scisig%3DAAGBfm3vz9gDbxRveIafikl02v0aeUyu0w%26oi%3Dscholarr%26ei%3DDj9VUqWlL6LG0QXe1oHwAg%26ved%3D0CDAQgAMoADAA#search=%22http%3A%2F%2Fdigitalcommons.unl.edu%2Fcgi%2Fviewcontent.cgi%3Farticle%3D1141%26context%3Dusdeptcommercepub%22) states: we will denote the random variable Y having a negative binomial distribution as Y ~ NB($\mu, \kappa$) with a parameterization such that E(Y) = $\mu$, var(Y) = $\mu + \kappa \mu^2$. + +I take this latter equation as the definition of $\kappa$. + +[Apparently](http://books.google.nl/books?id=Ohks0xwvyT4C&pg=PA196&lpg=PA196&dq=kappa+parameter+negative+binomial+proc+glimmix&source=bl&ots=PYKpaGQ8VN&sig=5sNEB-7H7ZocErTKhi35ORKd2lA&hl=nl&sa=X&ei=lEBVUqCnNcTJ0QXppYGoAg&ved=0CDYQ6AEwAA#v=onepage&q=kappa%20parameter%20negative%20binomial%20proc%20glimmix&f=false) this kappa is implemented in SAS. + +Now turning to R, the function `glm.nb` in the `MASS` package contains a parameter $\mu$ which is obviously the same $\mu$ as above and a parameter $\theta$. The question is how $\theta$ and $\kappa$ are related. The documentation for `glm.nb` only refers to it as an ""additional parameter"". The answers to [this](http://stats.stackexchange.com/questions/10419/what-is-theta-in-a-negative-binomial-regression-fitted-with-r) and [this](http://stats.stackexchange.com/questions/10457/interpreting-negative-binomial-regression-output-in-r?rq=1) stackexchange questions directly imply that $\theta = 1/\kappa$, but [this](http://stats.stackexchange.com/questions/30360/what-is-the-distribution-of-theta-in-a-negative-binomial-model-glm-nb-with-r?rq=1) question seems to suggest that $\theta = \kappa$. + +The help page for negative binomial in R is nice and introduces a parameter called `size` that equals $1/\kappa$. 
Fitting `glm.nb` on random data generated by `rnbinom` for various choices of $\mu$ and `size` seems to support the thesis that $\theta = 1/\kappa$ (i.e. that $\theta$ = `size`) but also that for large values of size the estimation is poor. + +Summarizing: I do believe that $\theta = 1/\kappa$ but it would be nice if there were an easily googlable place on the internet stating this explicitly. Maybe one of the answers to this questions can serve as such a place? ",added 118 characters in body,2013-10-09 12:06:18.343 +185204,57137,21896.0,5,,CC BY-SA 3.0,155ab217-d115-4841-b575-8c1a25007ac4,"After some frantic googling I do believe the answer is yes, but more so I am frustrated that the relation between the two parameters seems to be nowhere described explicitely so I do it here. (I hope this isn't against the rules of stackexchange.) + +[This very nice article](http://digitalcommons.unl.edu/cgi/viewcontent.cgi?article=1141&context=usdeptcommercepub&sei-redir=1&referer=http%3A%2F%2Fscholar.google.nl%2Fscholar_url%3Fhl%3Dnl%26q%3Dhttp%3A%2F%2Fdigitalcommons.unl.edu%2Fcgi%2Fviewcontent.cgi%253Farticle%253D1141%2526context%253Dusdeptcommercepub%26sa%3DX%26scisig%3DAAGBfm3vz9gDbxRveIafikl02v0aeUyu0w%26oi%3Dscholarr%26ei%3DDj9VUqWlL6LG0QXe1oHwAg%26ved%3D0CDAQgAMoADAA#search=%22http%3A%2F%2Fdigitalcommons.unl.edu%2Fcgi%2Fviewcontent.cgi%3Farticle%3D1141%26context%3Dusdeptcommercepub%22) states: we will denote the random variable Y having a negative binomial distribution as Y ~ NB($\mu, \kappa$) with a parameterization such that E(Y) = $\mu$, var(Y) = $\mu + \kappa \mu^2$. + +I take this latter equation as the definition of $\kappa$. + +[Apparently](http://books.google.nl/books?id=Ohks0xwvyT4C&pg=PA196&lpg=PA196&dq=kappa+parameter+negative+binomial+proc+glimmix&source=bl&ots=PYKpaGQ8VN&sig=5sNEB-7H7ZocErTKhi35ORKd2lA&hl=nl&sa=X&ei=lEBVUqCnNcTJ0QXppYGoAg&ved=0CDYQ6AEwAA#v=onepage&q=kappa%20parameter%20negative%20binomial%20proc%20glimmix&f=false) this kappa is implemented in SAS. + +Now turning to R, the function `glm.nb` in the `MASS` package contains a parameter $\mu$ which is obviously the same $\mu$ as above and a parameter $\theta$. The question is how $\theta$ and $\kappa$ are related. The documentation for `glm.nb` only refers to it as an ""additional parameter"". The answers to [this](http://stats.stackexchange.com/questions/10419/what-is-theta-in-a-negative-binomial-regression-fitted-with-r) and [this](http://stats.stackexchange.com/questions/10457/interpreting-negative-binomial-regression-output-in-r?rq=1) stackexchange questions directly imply that $\theta = 1/\kappa$, but [this](http://stats.stackexchange.com/questions/30360/what-is-the-distribution-of-theta-in-a-negative-binomial-model-glm-nb-with-r?rq=1) question seems to suggest that $\theta = \kappa$. + +The [help page for negative binomial in R](http://stat.ethz.ch/R-manual/R-patched/library/stats/html/NegBinomial.html) is nice and introduces a parameter called `size` that equals $1/\kappa$. Fitting `glm.nb` on random data generated by `rnbinom` for various choices of $\mu$ and `size` seems to support the thesis that $\theta = 1/\kappa$ (i.e. that $\theta$ = `size`) but also that for large values of size the estimation is poor. + +Summarizing: I do believe that $\theta = 1/\kappa$ but it would be nice if there were an easily googlable place on the internet stating this explicitly. Maybe one of the answers to this questions can serve as such a place? 
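One way to check the $\theta = 1/\kappa$ claim discussed above numerically is the short simulation below. It is not from the original post and assumes NumPy's $(n, p)$ parameterisation of the negative binomial, with $n$ playing the role of `size`/$\theta$ and $p = \theta/(\theta+\mu)$:

```python
import numpy as np

# With mean mu and dispersion theta, the NB2 variance should be mu + mu**2 / theta,
# i.e. mu + kappa * mu**2 with kappa = 1/theta.
rng = np.random.default_rng(42)
mu, theta = 4.0, 2.5                       # hypothetical values
p = theta / (theta + mu)                   # NumPy's (n, p) parameterisation, n = theta
draws = rng.negative_binomial(theta, p, size=1_000_000)

kappa = 1.0 / theta
print("empirical mean:", draws.mean(), "expected:", mu)
print("empirical var :", draws.var(), "expected:", mu + kappa * mu**2)
```

If the empirical variance tracks $\mu + \mu^2/\theta$, that is exactly the NB2 form $\mu + \kappa\mu^2$ with $\kappa = 1/\theta$, consistent with the $\theta = 1/\kappa$ reading of the question.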
",added 79 characters in body,2013-10-09 12:15:59.433 +185219,57012,20773.0,5,,CC BY-SA 3.0,13522325-3b5c-4627-aaea-0f0edaca604a,"I have observations taken with different sensitivity thresholds and minimum detection levels, i.e. Lab A is less sensitive and has a minimum detection level of .2 and Lab B is more sensitive and has a minimum detection level of .02. + +*Edit 2: I have taken $N$ samples and have had them processed by two different labs (for stupid political reasons). Both labs send me the results and I discover that Lab A has a minimum detection level of .2 and Lab B has a minimum detection level of .02. See example:* + +Each row corresponds to a unique measurement taken by either lab: + + Obs | Lab A | Lab B + --------------------- + 1 | .6 | NA + 2 | 0 | NA + 3 | NA | .53 + 4 | .2 | NA + 5 | NA | .07 + +*Edit 2: I would like to be able to use and combine results from both labs, as if they were on the same scale. The problem is that the labs used to process the samples have very different thresholds for detection and have different sensitivity levels.* + +I think I would like something like: + + Obs | LabA | LabB | NewLab + ---------------------------- + 1 | .6 | NA | .64 + 2 | 0 | NA | .13 + 3 | NA | .53 | .53 + 4 | .2 | NA | .21 + 5 | NA | .07 | .07 + +What techniques are available to standardize the values such that there is not a large loss of information? + + 1. Obviously, I could take the values from Lab B and replace anything less than .2 with 0 and then round them, but I want to avoid throwing away information if possible. + 2. One person suggested to add random noise to the values of Lab A, but I'm not sure of the benefit of this vs. simply imputing the missing values from Lab B. + +*Edit 1:* +There are no observations for which both Lab A and Lab B values are present, one will always be missing. + +*Edit 2:* +What can I do to get results from both labs on a similar scale?",added 548 characters in body,2013-10-09 13:36:19.723 +185273,57156,19822.0,1,,CC BY-SA 3.0,bdeaca58-ef69-4246-95f9-685b96ec5013,How to perform unsupervised Random Forest classification using Breimans code?,,2013-10-09 17:22:59.850 +185272,57156,19822.0,3,,CC BY-SA 3.0,bdeaca58-ef69-4246-95f9-685b96ec5013,,,2013-10-09 17:22:59.850 +185274,57156,19822.0,2,,CC BY-SA 3.0,bdeaca58-ef69-4246-95f9-685b96ec5013,"I am working with the Breimans random Forest code (http://stat-www.berkeley.edu/users/breiman/RandomForests/cc_manual.htm#c2) for classification of satellite data (supervised learning). I am using a training and test dataset having sample size of 2000 and variable size 10. The data is classified in to two classes, A and B. In supervised learning mode, the algorithm is performing well with very low classification error (<2%). Now i want to try the unsupervised classification with no class lables in the test data set and see how the algorithm is able to predict the classes? Is there a way to implement unsupervised classification using the Breimans code? Will the error from this method will be higher than supervised classification? 
+The data and run parameter setting in the algorithm are given below + +DESCRIBE DATA +1 mdim=10,ntrain=2000,nclass=2,maxcat=1, +1 ntest=2000,labelts=1,labeltr=1, + +SET RUN PARAMETERS +2 mtry0=3,ndsize=1,jbt=500,look=100,lookcls=1, +2 jclasswt=0,mdim2nd=0,mselect=0,",,2013-10-09 17:22:59.850 +185293,57160,15764.0,3,,CC BY-SA 3.0,55f4fbfc-b592-4a25-8c46-3a903de800ac,,,2013-10-09 19:09:36.590 +185381,57187,9792.0,3,,CC BY-SA 3.0,6084d71e-6338-4e2c-96e9-f23881bbbbe2,,,2013-10-10 02:44:04.330 +185380,57187,9792.0,1,,CC BY-SA 3.0,6084d71e-6338-4e2c-96e9-f23881bbbbe2,Centering when using splines in R,,2013-10-10 02:44:04.330 +185446,57211,503.0,2,,CC BY-SA 3.0,ecb27f3c-e9d4-4aed-a527-8017b743b9b6,"Using eigenvalues > 1 is only *one* indication of how many factors to retain. Other reasons include the scree test, getting a reasonable proportion of variance explained and (most importantly) substantive sense. + +That said, the rule came about because the average eigenvalue will be 1, so > 1 is ""higher than average"". + +On your second question: Are you asking how to know how many factors (latent variables) to retain? Or are you asking about which observed variables to retain? + +If the former, see above and see any book on factor analysis. If the latter, each factor is a linear combination of *all* the observed variables (although some contribute very little). ",,2013-10-10 10:52:34.780 +185294,57160,15764.0,2,,CC BY-SA 3.0,55f4fbfc-b592-4a25-8c46-3a903de800ac,"I'd like to have a [*seasonal* ARIMA model](https://www.otexts.org/fpp/8/9) implemented with Statsmodels ARIMA. Specifically, I'd like to log before the weekly seasonality and then be able to make forecasts. + +Perhaps an example with ARIMA's [from_formula](http://statsmodels.sourceforge.net/stable/generated/statsmodels.tsa.arima_model.ARIMA.from_formula.html#statsmodels.tsa.arima_model.ARIMA.from_formula) method could accomplish this. I'd also love to be able to do this with patsy. + +Here's my sample code for logging before the weekly seasonality, and then transforming back to compare to the original time series (I've also skipped checking the validity of the model through testing stationarity and the residuals): + + import pandas as pd + import numpy as np + from statsmodels.tsa.arima_model import ARIMA + + # ts is a time series + logged_ts = np.log(ts) + # Differencing by the week forces us to drop the first 7 values. + diffed_logged_ts = (logged_ts - logged_ts.shift(7))[7:] + + p = 0 + d = 1 + q = 1 + + arima = ARIMA(diffed_logged_ts.values, [p, d, q], exog=None, dates=diffed_logged_ts.index, freq='D', missing='none') + diffed_logged_results = arima.fit(trend='c', disp=False) + predicted_diffed_logged = diffed_logged_results.predict(exog=None, dynamic=False) + predicted_diffed_logged_ts = pd.Series(predicted_diffed_logged, index=diffed_logged_ts.index[d:]) + predicted_diffed_logged_ts = np.exp(logged_ts.shift(7) + diffed_logged_ts.shift(d) + predicted_diffed_logged_ts) + + concatenated = pd.concat([ts, predicted_diffed_logged_ts], axis=1, keys=['original', 'predicted']) + print concatenated[-7:] + +What do you think of this approach? I hope there's a less error-prone way coming in a future version of Statsmodels. Could someone tag this question with ""statsmodels""? Thanks! 
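Regarding the wish, in the question above, for a less error-prone seasonal workflow: newer statsmodels releases provide `SARIMAX`, which handles seasonal differencing internally. The sketch below is illustrative only; it uses synthetic data, hypothetical orders, and a naive `exp` back-transform of the log forecast (ignoring retransformation bias):

```python
import numpy as np
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Synthetic daily series with a weekly pattern, just to make the sketch runnable.
idx = pd.date_range("2013-01-01", periods=200, freq="D")
rng = np.random.default_rng(0)
ts = pd.Series(
    50 + 10 * np.sin(2 * np.pi * idx.dayofweek / 7) + rng.normal(0, 2, len(idx)),
    index=idx,
)

# Log-transform, then let seasonal_order handle the weekly (s=7) differencing
# instead of shifting and differencing by hand as in the code above.
model = SARIMAX(np.log(ts), order=(0, 1, 1), seasonal_order=(0, 1, 1, 7))
res = model.fit(disp=False)

forecast = np.exp(res.forecast(steps=14))  # naive back-transform to the original scale
print(forecast.head())
```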
+",,2013-10-09 19:09:36.590 +185292,57160,15764.0,1,,CC BY-SA 3.0,55f4fbfc-b592-4a25-8c46-3a903de800ac,ARIMA with seasonality in Statsmodels,,2013-10-09 19:09:36.590 +185295,57161,20739.0,1,,CC BY-SA 3.0,484423cf-7729-4409-9591-7f374c2684cd,Performing binary logistic regression with equal number of cases and non-cases,,2013-10-09 19:25:15.053 +185296,57161,20739.0,3,,CC BY-SA 3.0,484423cf-7729-4409-9591-7f374c2684cd,,,2013-10-09 19:25:15.053 +185297,57161,20739.0,2,,CC BY-SA 3.0,484423cf-7729-4409-9591-7f374c2684cd,"The best way to ask my question is to present an example scenario: + +Let's say that the outcome of interest is lung cancer (1 = lung cancer; 0 = no lung cancer) and we have 200k records (where 20k patients have lung cancer (cases) and 180k patients do NOT have lung cancer (non-cases)). Since only 10% of patients (20/200k) in our sample data have lung cancer, we use a random sample of 20k from the patients that do NOT have lung cancer. By doing so, we would have a sample of 20k patients with lung cancer and 20k patients without lung cancer in our sample (the sample is reduced from 200k to 40k records). + +Are there any benefits to performing binary logistic regression with equal number of cases and non-cases when the actual distribution of the outcome is not equal? Or does this bias our model estimates/predictive power? + +Thanks in advance! ",,2013-10-09 19:25:15.053 +185301,57161,20739.0,5,,CC BY-SA 3.0,cc2c17bc-ea01-45cf-8f70-7e5683c45310,"The best way to ask my question is to present an example scenario: + +Let's say that the outcome of interest is lung cancer (1 = lung cancer; 0 = no lung cancer) and a researcher has 200k records (where 20k patients have lung cancer (cases) and 180k patients do NOT have lung cancer (non-cases)). Since only 10% of patients (20/200k) in the sample data have lung cancer, a researcher uses a random sample of 20k from the patients that do NOT have lung cancer. By doing so, the researcher would have a sample of 20k patients with lung cancer and 20k patients without lung cancer in their sample (the sample is reduced from 200k to 40k records). + +Are there any benefits to performing binary logistic regression with equal number of cases and non-cases when the actual distribution of the outcome is not equal? Or does this bias model estimates/predictive power? + +Thanks in advance! ",added 21 characters in body,2013-10-09 19:33:35.397 +185303,57164,19264.0,2,,CC BY-SA 3.0,6c124c80-8c41-40cd-b530-5197075e7dbb,"I have an experimentally observed distribution that looks very similar to a gamma or lognormal distribution. I've read that the [lognormal distribution](http://en.wikipedia.org/wiki/Log-normal_distribution) is the maximum entropy probability distribution for a random variate $X$ for which the mean and variance of $ln(X)$ are fixed. Does the gamma distribution have any similar properties? + +What methods can I use to determine the true PDF of my data?",,2013-10-09 19:51:40.443 +185304,57164,19264.0,1,,CC BY-SA 3.0,6c124c80-8c41-40cd-b530-5197075e7dbb,Gamma vs. lognormal distributions,,2013-10-09 19:51:40.443 +185305,57164,19264.0,3,,CC BY-SA 3.0,6c124c80-8c41-40cd-b530-5197075e7dbb,,,2013-10-09 19:51:40.443 +185310,57164,19264.0,5,,CC BY-SA 3.0,50309f10-b9dd-4cfb-8582-bf7aa5de8bcc,I have an experimentally observed distribution that looks very similar to a gamma or lognormal distribution. 
I've read that the [lognormal distribution](http://en.wikipedia.org/wiki/Log-normal_distribution) is the maximum entropy probability distribution for a random variate $X$ for which the mean and variance of $ln(X)$ are fixed. Does the gamma distribution have any similar properties?,deleted 64 characters in body,2013-10-09 20:34:07.890 +185312,57167,21952.0,1,,CC BY-SA 3.0,2760c52f-dd31-4e2b-853f-2728f6b5853c,simultaneous equations,,2013-10-09 20:44:49.813 +185313,57167,21952.0,3,,CC BY-SA 3.0,2760c52f-dd31-4e2b-853f-2728f6b5853c,,,2013-10-09 20:44:49.813 +185311,57167,21952.0,2,,CC BY-SA 3.0,2760c52f-dd31-4e2b-853f-2728f6b5853c,"I have the following relationships + +Y ~ X1 + X2 + X3 + X4 + +and + +X1 ~ Z1 + Z2 + Z3 + Z4 + +X2 ~ Z1 + Z2 + Z3 + Z4 + +X3 ~ Z1 + Z2 + Z3 + Z4 + +X4 ~ Z1 + Z2 + Z3 + Z4 + +where Y and Z1, Z2, Z3, Z4 are endogenous (Say while the Z's play a role in determining Y, the values of Z's are fixed depending upont he values of Y - Kind of like advertising expense has an impact on sales revenue but at the same time managers determine the advertisement expense on the expected sales revenue). So all the variable are changing simultaneously. Can anyone help me on how I can estimate this relationship? Thank you",,2013-10-09 20:44:49.813 +185322,57015,,24,,CC BY-SA 3.0,90fcbbe2-f915-43ea-af06-3856c0dec8f4,,"Proposed by anonymous approved by 601, 805 edit id of 5584",2013-10-09 21:19:16.310 +185321,57015,0.0,5,,CC BY-SA 3.0,90fcbbe2-f915-43ea-af06-3856c0dec8f4,"I am measuring two unpaired variables $x$ and $y$ in two different conditions ($x$ and $y$ are magnitudes of some special magnetic signals). In the first condition, my hypothesis is that $\bar{x} > \bar{y}$ and in the second condition that $\bar{x} < \bar{y}$. Now that I have $N$ samples from both variables, how can I test whether my hypotheses are true? I am not sure if I can safely assume that $x$ and $y$ are independent from each other. Neither do I know from what kind of distributions they are sampled from. The sample size I have is small. I have read several introductions to statistics for the past few days, but never saw a worked out example for this kind of situations. All help appreciated. + +Edit: Like Michael Mayer wrote, there is a binary grouping variable ""condition"". Sorry for a bit unclear question.",added more information,2013-10-09 21:19:16.310 +185335,57175,20320.0,2,,CC BY-SA 3.0,8e648ebf-f69f-4779-9b4e-4dd7d569b2c7,"I have just delved into the basics of maximum likelihood estimation and expectation maximization. The latter is really difficult to follow and I am having a tough time in figuring how I can apply EM method for parameter estimation by minimizing the error given in [paper][1]. The estimation problem is formulated as + +$x_n= s_n(\theta_0) + w_n Eq(1)$ + +where $s_n$ is the signal of interest; $w_n$ is a Gaussian white noise. The parameter estimate is obtained after minimization of $J$ (eq3) + +$$\hat{\theta}= \arg \min_{\theta} J(\theta) Eq(2)$$ and $$J = \sum ||u_{i-1} - u_i||^2$$ Eq(3) + +where $u_n$ is a function, $u_n = x_n - s_n(\theta)$. Minimizing the nonlinear prediction error of $u_n$ will give us the parameter vector $\theta$ since $u_n$ will converge to $\theta_0$. The search is initialized by taking random values of initial guesses of the parameters and doing a random search, stopping when $J$ is minimized for a given number of iterations, the parameters are incremented after each iteration by a very small number. 
I have read considerably on number of papers on EM but they are more complicated since they use Kalman filters and Kalman smoothers as the estimate. I don't think that I will be needing Kalman filter for estimation since minimization of $J$ serves as an estimator. My objective is to formulate the recursive search for optimal $J$ as EM method. + + 1. Can somebody give ideas if I need to find the derivative of the objective function and how I can plugin this step in EM method. + 2. I really do not understand how to begin the formulation of Eq(3) in ML format. + + + [1]: http://www.eurasip.org/Proceedings/Eusipco/Eusipco2000/SESSIONS/THUAM/OR1/CR1473.PDF",,2013-10-09 22:19:42.207 +185337,57175,20320.0,3,,CC BY-SA 3.0,8e648ebf-f69f-4779-9b4e-4dd7d569b2c7,,,2013-10-09 22:19:42.207 +185336,57175,20320.0,1,,CC BY-SA 3.0,8e648ebf-f69f-4779-9b4e-4dd7d569b2c7,On how to formulate and apply maximum likelihood,,2013-10-09 22:19:42.207 +185340,57175,20320.0,5,,CC BY-SA 3.0,1c87e67b-2a11-4c9e-9ee3-04443f77788e,"I have just delved into the basics of maximum likelihood estimation and expectation maximization. The latter is really difficult to follow and I am having a tough time in figuring how I can apply EM method for parameter estimation by minimizing the error given in [paper][1]. The estimation problem is formulated as + +$x_n= s_n(\theta_0) + w_n Eq(1)$ + +where $s_n$ is the signal of interest; $w_n$ is a Gaussian white noise. The parameter estimate is obtained after minimization of $J$ (eq3) + +$$\hat{\theta}= \arg \min_{\theta} J(\theta) Eq(2)$$ and $$J = \sum ||u_{i-1} - u_i||^2$$ Eq(3) + +where $u_n$ is a function, $u_n = x_n - s_n(\theta)$. Minimizing the nonlinear prediction error of $u_n$ will give us the parameter vector $\theta$ since $u_n$ will converge to $\theta_0$. The search is initialized by taking random values of initial guesses of the parameters and doing a random search, stopping when $J$ is minimized for a given number of iterations, the parameters are incremented after each iteration by a very small number. I have read considerably on number of papers on EM but they are more complicated since they use Kalman filters and Kalman smoothers as the estimate. I don't think that I will be needing Kalman filter for estimation since minimization of $J$ serves as an estimator. My objective is to formulate the recursive search for optimal $J$ as EM method. The maximum-likelihood solution for the parameters , +under the additive white Gaussian noise assumption, corresponds +to minimization of the norm as follows $\theta_{ML} = arg min \sum ||u_{i-1} - u_i||^2 Eq(4)$ + + 1. Can somebody give ideas of of solving Eq(4) as ML. + 2. I really do not understand how to begin the formulation of Eq(3) in ML format. + + + [1]: http://www.eurasip.org/Proceedings/Eusipco/Eusipco2000/SESSIONS/THUAM/OR1/CR1473.PDF",added 136 characters in body,2013-10-09 22:31:06.690 +185341,57175,20320.0,5,,CC BY-SA 3.0,69c42ac2-3f27-4592-8ed8-47377a960c8a,"I have just delved into the basics of maximum likelihood estimation and expectation maximization. The latter is really difficult to follow and I am having a tough time in figuring how I can apply EM method for parameter estimation by minimizing the error given in [paper][1]. The estimation problem is formulated as + +$x_n= s_n(\theta_0) + w_n Eq(1)$ + +where $s_n$ is the signal of interest; $w_n$ is a Gaussian white noise. 
The parameter estimate is obtained after minimization of $J$ (eq3) + +$$\hat{\theta}= \arg \min_{\theta} J(\theta) Eq(2)$$ and $$J = \sum ||u_{i-1} - u_i||^2$$ Eq(3) + +where $u_n$ is a function, $u_n = x_n - s_n(\theta)$. Minimizing the nonlinear prediction error of $u_n$ will give us the parameter vector $\theta$ since $u_n$ will converge to $\theta_0$. The search is initialized by taking random values of initial guesses of the parameters and doing a random search, stopping when $J$ is minimized for a given number of iterations, the parameters are incremented after each iteration by a very small number. I have read considerably on number of papers on EM but they are more complicated since they use Kalman filters and Kalman smoothers as the estimate. I don't think that I will be needing Kalman filter for estimation since minimization of $J$ serves as an estimator. My objective is to formulate the recursive search for optimal $J$ as EM method. The maximum-likelihood solution for the parameters , +under the additive white Gaussian noise assumption, corresponds +to minimization of the norm as follows $\theta_{ML} = arg min \sum ||u_{i-1} - u_i||^2 Eq(4)$ + + 1. Can somebody give ideas of of solving Eq(4) as ML i.e how do I minimize J using ML? + 2. I really do not understand how to begin the formulation of Eq(3) in ML format. + + + [1]: http://www.eurasip.org/Proceedings/Eusipco/Eusipco2000/SESSIONS/THUAM/OR1/CR1473.PDF",added 33 characters in body,2013-10-09 22:41:43.693 +185342,57177,594.0,2,,CC BY-SA 3.0,47be9cb5-bfa4-465c-983b-32385d25aa69,"As for qualitative differences, the lognormal and gamma are, as you say, quite similar. + +Indeed, in practice they're often used to model the same phenomena (some people will use a gamma where others use a lognormal). They are both, for example, constant-coefficient-of-variation models (the CV for the lognormal is $\sqrt{e^{\sigma^2} -1}$, for the gamma it's $1/\sqrt \alpha$). + +You might find it instructive to look at the density of their *logs*, which often shows a very clear difference. + +The log of a lognormal random variable is ... normal. It's symmetric. + +The log of a gamma random variable is left-skew. Depending on the value of the shape parameter, it may be quite skew or nearly symmetric. + +This difference implies that the gamma has more of a tail on the left, and less of a tail on the right; the far right tail of the lognormal is heavier and its left tail lighter. And indeed, if you look at the skewness, of the lognormal and gamma, for a given coefficient of variation, the lognormal is more right skew ($(e^{\sigma^2}+2) \text{CV}$) than the gamma ($2\text{CV}$). +",,2013-10-09 22:43:42.927 +185407,57192,155.0,6,,CC BY-SA 3.0,ce295a26-359f-4f9a-94e1-272c2f31aedc,,added 1 characters in body; edited tags; edited title,2013-10-10 06:05:50.533 +185406,57192,155.0,4,,CC BY-SA 3.0,ce295a26-359f-4f9a-94e1-272c2f31aedc,How to check for normal distribution using Excel for performing a t-test?,added 1 characters in body; edited tags; edited title,2013-10-10 06:05:50.533 +185954,57355,20927.0,3,,CC BY-SA 3.0,65f12dac-758c-453a-a31d-c32136036024,,,2013-10-12 14:02:41.167 +185953,57355,20927.0,1,,CC BY-SA 3.0,65f12dac-758c-453a-a31d-c32136036024,SPSS-independent-sample-t-test,,2013-10-12 14:02:41.167 +185348,57175,20320.0,5,,CC BY-SA 3.0,aef0b883-d326-48bf-a27d-13915f5a8fdc,"I have just delved into the basics of maximum likelihood estimation and expectation maximization. 
The latter is really difficult to follow and I am having a tough time in figuring how I can apply EM method for parameter estimation by minimizing the error given in [paper][1]. The estimation problem is formulated as + +$x_n= s_n(\theta_0) + w_n Eq(1)$ + + +where $s_n$ is the signal of interest; $w_n$ is a Gaussian white noise. The parameter estimate is obtained after minimization of $J$ (eq3) + +$$\hat{\theta}= \arg \min_{\theta} J(\theta) Eq(2)$$ and $$J = \sum ||u_{i-1} - u_i||^2$$ Eq(3) + +where $u_n$ is a function, $u_n = x_n - s_n(\theta)$. Minimizing the nonlinear prediction error of $u_n$ will give us the parameter vector $\theta$ since $u_n$ will converge to $\theta_0$. So, $u_n$ generally becomes a data series. + +In the original way, the search is initialized by taking random values of initial guesses of the parameters and doing a random search, stopping when $J$ is minimized for a given number of iterations, the parameters are incremented after each iteration by a very small number. I have read considerably on number of papers on EM but they are more complicated since they use Kalman filters and Kalman smoothers as the estimate. I don't think that I will be needing Kalman filter for estimation since minimization of $J$ serves as an estimator. My objective is to formulate the recursive search for optimal $J$ as EM method. The maximum-likelihood solution for the parameters , +under the additive white Gaussian noise assumption, corresponds +to minimization of the norm as follows $\theta_{ML} = arg min u_n Eq(4)$ + + 1. Can somebody give ideas of of solving Eq(4) as ML i.e how do I minimize J using ML? + 2. I really do not understand how to replace Eq (3) by ML based minimization since I cannot understand the formulation of the likelihood and other technical information. + + + [1]: http://www.eurasip.org/Proceedings/Eusipco/Eusipco2000/SESSIONS/THUAM/OR1/CR1473.PDF",added 137 characters in body,2013-10-09 22:54:40.457 +185350,57156,15827.0,5,,CC BY-SA 3.0,725a81ee-f648-4672-b464-dc6b081442fc,"I am working with Breiman's random forest code (http://stat-www.berkeley.edu/users/breiman/RandomForests/cc_manual.htm#c2) for classification of satellite data (supervised learning). I am using a training and test dataset having sample size of 2000 and variable size 10. The data is classified into two classes, A and B. In supervised learning mode, the algorithm is performing well with very low classification error (<2%). Now I want to try the unsupervised classification with no class labels in the test data set and see how the algorithm is able to predict the classes. Is there a way to implement unsupervised classification using Breiman's code? Will the error from this method will be higher than supervised classification? +The data and run parameter setting in the algorithm are given below + +DESCRIBE DATA +1 mdim=10,ntrain=2000,nclass=2,maxcat=1, +1 ntest=2000,labelts=1,labeltr=1, + +SET RUN PARAMETERS +2 mtry0=3,ndsize=1,jbt=500,look=100,lookcls=1, +2 jclasswt=0,mdim2nd=0,mselect=0,",basic English fixes; Breiman's is possessive,2013-10-09 22:56:14.480 +185349,57156,15827.0,4,,CC BY-SA 3.0,725a81ee-f648-4672-b464-dc6b081442fc,How to perform unsupervised Random Forest classification using Breiman's code?,basic English fixes; Breiman's is possessive,2013-10-09 22:56:14.480 +185355,57177,594.0,5,,CC BY-SA 3.0,a0a601e5-4a67-4a0c-9a80-adad523795f9,"As for qualitative differences, the lognormal and gamma are, as you say, quite similar. 
+ +Indeed, in practice they're often used to model the same phenomena (some people will use a gamma where others use a lognormal). They are both, for example, constant-coefficient-of-variation models (the CV for the lognormal is $\sqrt{e^{\sigma^2} -1}$, for the gamma it's $1/\sqrt \alpha$). + +You might find it instructive to look at the density of their *logs*, which often shows a very clear difference. + +The log of a lognormal random variable is ... normal. It's symmetric. + +The log of a gamma random variable is left-skew. Depending on the value of the shape parameter, it may be quite skew or nearly symmetric. + +Here's an example, with both lognormal and gamma having mean 1 and variance 1/4. The top plot shows the densities (gamma in green, lognormal in blue), and the lower one shows the densities of the logs: + +![gamma and lognormal, densitiy and density of log][1] + +(Plotting the log of the density of the logs is also useful.) + +This difference implies that the gamma has more of a tail on the left, and less of a tail on the right; the far right tail of the lognormal is heavier and its left tail lighter. And indeed, if you look at the skewness, of the lognormal and gamma, for a given coefficient of variation, the lognormal is more right skew ($(e^{\sigma^2}+2) \text{CV}$) than the gamma ($2\text{CV}$). + + + [1]: https://i.stack.imgur.com/I9ARM.png",added 373 characters in body,2013-10-09 23:14:21.953 +185358,57183,22541.0,2,,CC BY-SA 3.0,361515a0-5bc8-4025-a674-dbaa6c10c840,"Students have completed a test containing 20 questions at both T1 and T2, with an intervention in the interval. Scores for each question are either 0 (incorrect) or 1 (correct). I am interested in knowing whether the improvement in students' scores was significantly greater for some questions than for others. I am thinking that this may involve an extension of the McNemar test, but open to all suggestions. Thanks!",,2013-10-09 23:52:28.193 +185359,57183,22541.0,1,,CC BY-SA 3.0,361515a0-5bc8-4025-a674-dbaa6c10c840,Which test should I use to assess for the statistical signficance of changes in multiple binomial dependent variables from T1 to T2?,,2013-10-09 23:52:28.193 +185360,57183,22541.0,3,,CC BY-SA 3.0,361515a0-5bc8-4025-a674-dbaa6c10c840,,,2013-10-09 23:52:28.193 +185361,57184,1741.0,2,,CC BY-SA 3.0,6cfb513d-5907-41f1-bd6d-6e6d62b3f513,"Here there are some thoughts: + + 1. All black-box models might be inspected in some way. You can compute the variable importance for each feature for example or you can also plot the predicted response and the actual one for each feature ([link][1]); + 2. You might think about some pruning of the ensemble. Not all the trees in the forest are necessary and you might use just a few. Paper: [Search for the Smallest Random Forest, Zhang]. Otherwise just Google ""ensemble pruning"", and have a look at ""Ensemble Methods: Foundations and Algorithms +"" [Chapter 6][2]; + 3. You can build a single model by feature selection as you said. Otherwise you can also try to use Domingos' method in [Knowledge acquisition from examples via multiple models] that consists in building a new dataset with black-box predictions and build a decision tree on top of it. + 4. As mentioned in [this][3] Stack Exchange's answer, a tree model might seem interpretable but it is prone to high changes just because of small perturbations of the training data. Thus, it is better to use a black-box model. The final aim of an end user is to understand why a new record is classified as a particular class. 
You might think about some feature importances just for that particular record. + +I would go for 1. or 2. + + [1]: http://stats.stackexchange.com/questions/21152/obtaining-knowledge-from-a-random-forest/21457#21457 + [2]: http://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/publication.htm + [3]: http://stats.stackexchange.com/questions/32125/how-to-make-random-forests-more-interpretable/32132#32132",,2013-10-10 00:04:43.820 +185405,57192,155.0,5,,CC BY-SA 3.0,ce295a26-359f-4f9a-94e1-272c2f31aedc,"I want to know **how to check a data set for normality in Excel, just to verify that the requirements for using a t-test are being met**. + +For the right tail, is it appropriate to just calculate a mean and standard deviation, add 1, 2 & 3 standard deviations from the mean to create a range then compare that to the normal 68/95/99.7 for the standard normal distribution after using the norm.dist function in excel to test each standard deviation value. + +Or is there a better way to test for normality?",added 1 characters in body; edited tags; edited title,2013-10-10 06:05:50.533 +185408,57196,668.0,2,,CC BY-SA 3.0,f0bb99df-2174-45dc-a31b-421819814e44,"You have the right idea. This can be done systematically, comprehensively, and with relatively simple calculations. A graph of the results is called a *normal probability plot* (or sometimes a Q-Q plot). From it you can see *much* more detail than appears in other graphical representations, especially [histograms](http://stats.stackexchange.com/questions/51718/assessing-approximate-distribution-of-data-based-on-a-histogram/51753#51753), and with a little practice you can even learn to determine ways to re-express your data to make them closer to Normal in situations where that is warranted. + +Here is an example: + +![Spreadsheet with probability plot][1] + +Data are in column `A` (and named `Data`). The rest is all calculation, although you can control the ""hinge rank"" value used to fit a reference line to the plot. + +This plot is a scatterplot comparing the data to values that would be attained by numbers drawn independently from a standard Normal distribution. When the points line up along the diagonal, they are close to Normal; horizontal departures (along the data axis) indicate departures from normality. In this example the points are remarkably close to the reference line; the largest departure occurs at the highest value, which is about $1.5$ units to the left of the line. Thus we see at a glance that these data are very close to Normally distributed but perhaps have a slightly ""light"" right tail. This is perfectly fine for applying a t-test. + +The comparison values on the vertical axis are computed in two steps. First each data value is ranked from $1$ through $n$, the amount of data (shown in the `Count` field in cell `F22`). These are proportionally converted to values in the range $0$ to $1$. A good formula to use is $\left(\text{rank}+1/6\right)/\left(n+1/3\right).$ Then these are converted to standard Normal values via the `NormSInv` function. These values appear in the `Normal score` column. The plot at the right is an XY scatterplot of `Normal Score` against the data. (In some references you will see the transpose of this plot, which perhaps is more natural, but Excel prefers to place the leftmost column on the horizontal axis and the rightmost column on the vertical axis, so I have let it do what it prefers.) 
+ +![Spreadsheet: normal score calculation][2] + +(As you can see, I simulated these data with independent random draws from a Normal distribution with mean $5$ and standard deviation $2$. It is therefore no surprise that the probability plot looks so nice.) There really are only two formulas to type in, which you propagate downward to match the data: they appear in cells `B2:C2` and rely on the `Count` value computed in cell `F2`. That's really all there is to it, apart from the plotting. + +The rest of this sheet is not necessary but it's helpful for judging the plot: it provides a robust estimate of a reference line. This is done by picking two points equally far in from the left and right of the plot and connecting them with a line. In the example these points are the third lowest and third highest, as determined by the $3$ in the `Hinge Rank` cell, `F3`. As a bonus, its slope and intercept are robust estimates of the standard deviation and mean of the data, respectively. + +To plot the reference line, two extreme points are computed and added to the plot: their calculation occurs in columns `I:J`, labeled `X` and `Y`. + +![Spreadsheet: reference line calculation][3] + + + [1]: https://i.stack.imgur.com/giJwL.png + [2]: https://i.stack.imgur.com/kS738.png + [3]: https://i.stack.imgur.com/ZdEYB.png",,2013-10-10 06:11:44.377 +185364,57175,20320.0,5,,CC BY-SA 3.0,0307b09e-8a99-40af-ac04-ebd005fe6055,"I have just delved into the basics of maximum likelihood estimation and expectation maximization. The latter is really difficult to follow and I am having a tough time in figuring how I can apply EM method for parameter estimation by minimizing the error given in [paper][1]. The estimation problem is formulated as + +$x_n= s_n(\theta_0) + w_n Eq(1)$ + + +where $s_n$ is the signal of interest; $w_n$ is a Gaussian white noise. The parameter estimate is obtained after minimization of $J$ (eq3) + +$$\hat{\theta}= \arg \min_{\theta} J(\theta) Eq(2)$$ and $$J = \sum ||u_{i-1} - u_i||^2$$ Eq(3) + +where $u_n$ is a function, $u_n = x_n - s_n(\theta)$. Minimizing $u_n$ will give us the parameter vector $\theta$ since $u_n$ will converge to $\theta_0$. So, $u_n$ generally becomes a data series. In the paper they have used nearest neighbor search for minimizing J and hence the representation of Eq(3) as a nearest neighbor search. + +In the original way, the search is initialized by taking random values of initial guesses of the parameters and doing a random search, stopping when $J$ is minimized for a given number of iterations, the parameters are incremented after each iteration by a very small number. + +My question is how do I replace EQ(3) and formulate the recursive search for optimal $J$ as EM method. The maximum-likelihood solution for the parameters , +under the additive white Gaussian noise assumption, corresponds +to minimization of the norm as follows $\theta_{ML} = arg min u_n Eq(4)$ + + 1. Can somebody give ideas of of solving Eq(4) as ML i.e how do I minimize J using ML? + 2. I really do not understand how to replace Eq (3) by ML based minimization since I cannot understand the formulation of the likelihood and other technical information. +Thank you. 
+ + [1]: http://www.eurasip.org/Proceedings/Eusipco/Eusipco2000/SESSIONS/THUAM/OR1/CR1473.PDF",deleted 125 characters in body,2013-10-10 00:42:12.537 +185368,57185,,1,user10619,CC BY-SA 3.0,9005c5f8-25d5-4cc3-a5c9-0447eba79eb3,What is the difference between the concept and treatment of measurement error under the psycumetry and under statistical analysis?,,2013-10-10 00:46:42.743 +185367,57185,,2,user10619,CC BY-SA 3.0,9005c5f8-25d5-4cc3-a5c9-0447eba79eb3,There is some confusion with respect to the measurement error. What is the definition in statistics and definition in psychometry? Does bias imply the same thing under two disciplines?,,2013-10-10 00:46:42.743 +185366,57185,,3,user10619,CC BY-SA 3.0,9005c5f8-25d5-4cc3-a5c9-0447eba79eb3,,,2013-10-10 00:46:42.743 +185370,57185,594.0,4,,CC BY-SA 3.0,061f74cd-3dbd-49e8-b1f2-dbd600ca568f,What is the difference between the concept and treatment of measurement error under the psychometry and under statistical analysis?,added 4 characters in body; edited title,2013-10-10 00:48:12.213 +185369,57185,594.0,5,,CC BY-SA 3.0,061f74cd-3dbd-49e8-b1f2-dbd600ca568f,"There is some confusion with respect to the measurement error. What is the definition in statistics and definition in psychometry? + +Does bias imply the same thing under two disciplines?",added 4 characters in body; edited title,2013-10-10 00:48:12.213 +185374,57186,22542.0,3,,CC BY-SA 3.0,7fcd301a-12a1-4a3c-95b5-99dc883358a0,,,2013-10-10 01:51:45.710 +185373,57186,22542.0,1,,CC BY-SA 3.0,7fcd301a-12a1-4a3c-95b5-99dc883358a0,Sample sizes for differences between three groups,,2013-10-10 01:51:45.710 +185372,57186,22542.0,2,,CC BY-SA 3.0,7fcd301a-12a1-4a3c-95b5-99dc883358a0,"Quick question - I am doing a study of two schools, and at each school I am sampling three groups. + +I will be asking each group various questions about how they feel about the school, etc. + +I want to be able to detect: + +a) a difference of 20% in responses between the schools (e.g., proportion of students for whom current school was first school of preference), and + +b) *within* each school, 20% difference in responses between the three different groups (similar questions, but for students who enrolled in different eras). + +In both instances, I would like power of 80% for 0.05 significance. + +So, if I do what I think is the appropriate calculation, I effectively come up with + +n = (0.5(0.84 + 1.96)^2)/0.2^2 += 98 + +Can I just assume then that I need three groups of 99 within each school? I guess I'm confused because nobody ever seems to talk about calculating sample sizes when comparing more than two groups. + +Furthermore, is there anything wrong with sampling the three groups of 98 at each school and assuming that the total sample of 294 at each school will be sufficient to detect the 20% difference between the two schools? + +Thank you for the assistance, and I apologise if my english is not clear :)",,2013-10-10 01:51:45.710 +185376,57186,10060.0,5,,CC BY-SA 3.0,3b9a5776-68dd-46b0-9f21-fca716f0eaf8,"Quick question - I am doing a study of two schools, and at each school I am sampling three groups. + +I will be asking each group various questions about how they feel about the school, etc. 
+ +I want to be able to detect: + +a) a difference of 20% in responses between the schools (e.g., proportion of students for whom current school was first school of preference), and + +b) *within* each school, 20% difference in responses between the three different groups (similar questions, but for students who enrolled in different eras). + +In both instances, I would like power of 80% for 0.05 significance. + +So, if I do what I think is the appropriate calculation, I effectively come up with + +$n = \frac{0.5\times(0.84+1.96)^2}{0.2^2} = 98$ + +Can I just assume then that I need three groups of 99 within each school? I guess I'm confused because nobody ever seems to talk about calculating sample sizes when comparing more than two groups. + +Furthermore, is there anything wrong with sampling the three groups of 98 at each school and assuming that the total sample of 294 at each school will be sufficient to detect the 20% difference between the two schools? + +Thank you for the assistance, and I apologise if my english is not clear :)",Change formula to LaTeX expression,2013-10-10 02:27:45.560 +185377,57186,5237.0,5,,CC BY-SA 3.0,dca2c6aa-f48c-4a37-822b-17bb865cc526,"I am doing a study of two schools, and at each school I am sampling three groups. I will be asking each group various questions about how they feel about the school, etc. I want to be able to detect: + +1. a difference of 20% in responses between the schools (e.g., proportion of students for whom current school was first school of preference), and +2. *within* each school, 20% difference in responses between the three different groups (similar questions, but for students who enrolled in different eras). + +In both instances, I would like power of 80% for 0.05 significance. + +So, if I do what I think is the appropriate calculation, I effectively come up with: +\begin{align} +n &= \frac{0.5(0.84 + 1.96)^2}{0.2^2} \\ + &= 98 +\end{align} +Can I just assume then that I need three groups of 99 within each school? I guess I'm confused because nobody ever seems to talk about calculating sample sizes when comparing more than two groups. + +Furthermore, is there anything wrong with sampling the three groups of 98 at each school and assuming that the total sample of 294 at each school will be sufficient to detect the 20% difference between the two schools? +",changed tag; formatted; removed peripheral comments; light editing,2013-10-10 02:27:55.877 +185378,57186,5237.0,6,,CC BY-SA 3.0,dca2c6aa-f48c-4a37-822b-17bb865cc526,,changed tag; formatted; removed peripheral comments; light editing,2013-10-10 02:27:55.877 +185379,57187,9792.0,2,,CC BY-SA 3.0,6084d71e-6338-4e2c-96e9-f23881bbbbe2,"I am having trouble understanding why centering seems to only work with simple linear models and not with splines for example. I am using centering to report the estimated group differences at different $x$, but also statistical values (ignoring multiple comparisons for the moment). 
+ + set.seed(1) + + # simulate data + N <- 10 + x <- rep(seq(0.2,1,0.2),N) + group <- factor(rep(c('I','II'),each=length(x)/N)) + y <- -x^2 + 2*x*as.numeric(group) + rnorm(length(x),mean=0,sd=0.1) + d <- data.frame(group,x,y) + + # fit a linear model with x-group interaction + l <- lm(y~x*group,data=d) + d$lmfit <- fitted(l) + coef(l)['groupII'] # group difference at x==0 + # groupII + # -0.1097071 + + library(ggplot2) + ggplot(d,aes(x,y,colour=group)) + geom_point() + geom_line(aes(x,lmfit,colour=group)) + +The plot confirms the reported small group difference `groupII` of 0.05 at $x=0$ if we were to extrapolate back to 0. + +Now let us centre the data at $x=1$ and estimate the group difference there. + + # center data at x==1 and refit + l <- lm(y~I(x-1)*group,data=d) + coef(l)['groupII'] # group difference at x==1 + # groupII + # 2.08525 + +In agreement with the plot the difference is about 2. + +Now let us fit a spline model. + + # fit data with splines + library(splines) + l <- lm(y~ns(x,2)*group,data=d) + d$lmsplinefit <- fitted(l) + coef(l)['groupII'] # group difference at x==0.2 + # groupII + # 0.2987893 + # compare to: d$lmsplinefit[6] - d$lmsplinefit[1] + + ggplot(d,aes(x,y,colour=group)) + geom_point() + geom_line(aes(x,lmsplinefit,colour=group)) + + +Interestingly, the spline fit reports the group difference at the first $x$, i.e. $x=0.2$. + +If we try to centre at $x=1$ we get the same result, i.e. the difference at $x=0.2$. + + l <- lm(y~ns(I(x-1),2)*group,data=d) + coef(l)['groupII'] + # same result as un-centered data, i.e. 0.2987893 + +Why is that? And is there a way to show the group difference at a different $x$? Btw, centering $x$ manually before the model fit does not make a difference.",,2013-10-10 02:44:04.330 +185382,57188,594.0,2,,CC BY-SA 3.0,42db66eb-ab96-4de8-b768-18f2560cc2bb," + +Here's the probability function and distribution of the proportion of True (plus the normal approximation at which the chi-square will be exact) in a sample of size 10,000 and a proportion of True of only 1% (right below the low end of your suggested total sample size, and with expected number of True only one fifth of your suggested minimum): + + +![enter image description here][1] + +Don't forget that you'll have about five times the expected successes shown here; your approximation will be much better than this. + +A straight two sample proportions test or a chi-square test should do just fine. Indeed, one tenth of your proportion of True's would be just fine. One *hundredth*, you'd just go to exact methods. + + [1]: https://i.stack.imgur.com/Se0Qx.png",,2013-10-10 02:45:38.470 +185388,57189,22545.0,3,,CC BY-SA 3.0,41d9cae3-1d83-4d0a-be24-7e11018d0589,,,2013-10-10 04:00:27.487 +185387,57189,22545.0,1,,CC BY-SA 3.0,41d9cae3-1d83-4d0a-be24-7e11018d0589,Theoretical expected value and variance,,2013-10-10 04:00:27.487 +185386,57189,22545.0,2,,CC BY-SA 3.0,41d9cae3-1d83-4d0a-be24-7e11018d0589,"Let $X$ be a random variable having expected value $\mu$ and variance $\sigma^2$. Find the Expected Value and Variance of $Y = \frac{X−\mu}{\sigma}$. + +I would like to show some progress I've made so far, but honestly I've been thinking about this problem for the past few days but just have no idea where to start. Any hint or insight on a starting point would be much appreciated. + +Thanks!",,2013-10-10 04:00:27.487 +185389,57190,449.0,2,,CC BY-SA 3.0,6f725d2f-b050-45ee-adc9-e08170e488fd,"You can use multi-level logistic regression. You've only got one dependent variable, correctness. 
You have multiple independent variables nested within student. In R you can use `lmer` to construct the model. It would look something like. + + m <- lmer( answer ~ treatment * Q + (treatment * Q | student), family = 'binomial', data = mydata) + +That would allow for there to be random effects of question and treatment within student as well as overall correctness variability within student but also be able to assess fixed effects if treatment and question. What seem to really want to know is all of the treatment by question interactions. + +In order to analyze all of the questions with any kind of reliability you really should have a lot of students taking the test (hundreds). The general effect of treatment could be assessed with fewer.",,2013-10-10 04:04:00.123 +185390,57189,594.0,6,,CC BY-SA 3.0,0ac0ceb1-76d7-42d8-a91a-c29fd3e6744a,,edited tags,2013-10-10 04:04:07.093 +185391,57191,594.0,2,,CC BY-SA 3.0,ab2371ee-34c4-4a1a-8472-e7a71cb3eddc,"Have you seen the following basic properties of expectation and variance? + +(I'd be very surprised if some version of these hadn't been discussed) + + +$\text{E}(aX+b) = a\text{E}(X)+b$ + +$\text{Var}(aX+b) = a^2\text{Var}(X)$ + + +http://en.wikipedia.org/wiki/Expected_value#Linearity + +http://en.wikipedia.org/wiki/Variance#Basic_properties + +If you apply these properties, or better, the versions you'll already have been given, the problem is trivial. + +If you still can't see it, try finding $\text{E}(X-\mu)$ first and work from there.",,2013-10-10 04:06:18.590 +185394,57192,22031.0,3,,CC BY-SA 3.0,09a9dc49-6c22-4325-b904-3662a9f0f6b1,,,2013-10-10 04:41:49.360 +185393,57192,22031.0,1,,CC BY-SA 3.0,09a9dc49-6c22-4325-b904-3662a9f0f6b1,Check for Normal Distribution in Excel,,2013-10-10 04:41:49.360 +185392,57192,22031.0,2,,CC BY-SA 3.0,09a9dc49-6c22-4325-b904-3662a9f0f6b1,"I'm looking for a way to check a data set for normality in Excel, just to verify that the requirements for using a t-test are being met. + +For the right tail, is it appropriate to just calculate a mean and standard deviation, add 1, 2 & 3 standard deviations from the mean to create a range then compare that to the normal 68/95/99.7 for the standard normal distribution after using the norm.dist function in excel to test each standard deviation value. + +Or is there a better way to test for normality?",,2013-10-10 04:41:49.360 +185395,57186,22542.0,5,,CC BY-SA 3.0,2f9baea0-e9e5-474c-b160-d0a349e681e4,"I am doing a study of two schools, and at each school I am sampling three groups. I am trying to determine why the person chose to go to that school and not another. I will be asking each group various questions about the school and options they may have had for other schools, etc. I want to be able to detect: + +1. a difference of 20% in responses between the schools (e.g., proportion of students for whom current school was first school of preference, or proportion for whom there was no other choice), and +2. *within* each school, 20% difference in responses between the three different groups (similar questions, but for students who enrolled in different eras). + +In both instances, I would like power of 80% for 0.05 significance. + +So, if I do what I think is the appropriate calculation, I effectively come up with: +\begin{align} +n &= \frac{0.5(0.84 + 1.96)^2}{0.2^2} \\ + &= 98 +\end{align} +Can I just assume then that I need three groups of 99 within each school? 
I guess I'm confused because nobody ever seems to talk about calculating sample sizes when comparing more than two groups. + +Furthermore, is there anything wrong with sampling the three groups of 98 at each school and assuming that the total sample of 294 at each school will be sufficient to detect the 20% difference between the two schools? +",added 163 characters in body,2013-10-10 05:24:59.993 +185410,57197,22507.0,2,,CC BY-SA 3.0,82ac84ed-08f3-440d-b0ce-ad1fc46d5877,"If you select the equal number of cases and non-cases, it will bias the model. For example, suppose that the features have zero correlation with the outcome, and the dataset is very large. The model will predict the same probability of lung cancer for all patients. If you select equal number of positive and negative example, the predicted probability will be 0.5, while in reality it is 0.1 .",,2013-10-10 06:21:56.940 +186091,57378,668.0,10,,,1feab5a9-64fe-4d66-a743-7a136cd6487b,"{""OriginalQuestionIds"":[31],""Voters"":[{""Id"":919,""DisplayName"":""whuber""}]}",101,2013-10-13 16:31:36.877 +186094,57394,16046.0,1,,CC BY-SA 3.0,e7e70389-f554-4886-b5ae-65780f3c30a0,Causality in Time Series,,2013-10-13 16:53:32.577 +185397,57193,1717.0,2,,CC BY-SA 3.0,4491d577-0526-465d-b41b-cf0996d4bb31,"When you use EM to obtain maximum likelihood estimates, you need a variable that describes your observations $x_{n}$, latent variables $z_{n}$ that are in some way related to your observations (e.g. in coin tossing experiments, $\{H, T\}$ are the latent variables and in gaussian mixtures, the mixing coefficients $\pi_{i}$ take the role of latent variables) and the parameters $\theta$ that you are trying to estimate. + +At the risk of not answering your question at all, I think you want a maximum likelihood estimate of $\theta$ using EM based on known observations $x_{n}$ that are given by the following equation: + +$$x_{t} = s_{t}(\theta_{0}) + n_{t}$$ + +If that is correct, a general idea is the following. Since $n_{t}$ is white noise $N(0, \sigma)$, $x_{t}$ can be described by a Gaussian $p(x_{t}|s_{t},\theta) = N(s_{t}(\theta), \sigma)$. In the EM formulation, $x_{t}$'s are known variables, $s_{t}$'s are latent variables and $\theta$ is the parameter. It is customary to group the variables $x_{n}$ in a variable $X$ and likewise, latent variables $s_{n}$ are grouped in a variable $S$. + +As you should know, the EM algorithm consists of 2 steps: expectation and maximization. In the expectation step, we use an expression $Q$ as a proxy for the likelihood $L(\theta|X) = p(X|\theta)$, that is, the probability of +getting the known data $X$ given a parameter $\theta$. This is the same likelihood used to obtain maximum likelihood estimates. However, in EM we use this $Q$ instead: + +$$Q(\theta|\theta^{\text{old}}) = E_{S|X, \theta^{\text{old}}} \log p(X,S|\theta)$$ + +This odd-looking expression is actually a lower bound of the likelihood $L(\theta|X)$. [Bishop's book][1] contains a good derivation of $Q$. + +In order to start the EM magic, you have to choose a random $\theta^{\text{old}}$ and calculate this expectation. Notice that you need $p(X,S|\theta)$ and $p(S|X,\theta^{\text{old}})$. $p(X,S|\theta)$ is equal to $p(X|S,\theta)p(S|\theta)$ and using Bayes' theorem, $p(S|X,\theta^{\text{old}})$ is +proportional to $p(X|S,\theta^{\text{old}})p(S|\theta^{\text{old}})$. + +At this point, I hope it is clear that $p(X|S,\theta)=\prod_{t} p(x_{t}|s_{t},\theta)$, so that part is not hard to calculate. 
However, $p(S|\theta)$, that is, $\prod_{t}p(s_{t}|\theta)$ is required. I don't know what distribution could be appropriate since this depends on the specifics of your problem so I will assume you know. + +By now, you can calculate $Q(\theta|\theta^{\text{old}})$. + +The maximization step is simply: + +$$\theta = \text{arg max}_{\theta} Q(\theta|\theta^{\text{old}})$$ + +This is the new $\theta$ to be used in the expectation step again until convergence. + +That is a general idea of how EM could work in this case. However, maybe you don't know a distribution for $s_{t}$ or it is difficult to calculate the expectation or the maximization step. + +For the big picture, take a look at [this][2] nice explanation. + + + [1]: http://www.amazon.com/Pattern-Recognition-Learning-Information-Statistics/dp/0387310738 + [2]: https://docs.google.com/viewer?url=http://www.nature.com/nbt/journal/v26/n8/pdf/nbt1406.pdf",,2013-10-10 05:41:08.343 +185398,57194,155.0,2,,CC BY-SA 3.0,42f13a0c-bc2c-492d-9e93-7f5340bf88bb,"You could [plot a histogram using the data analysis toolpack in Excel](http://office.microsoft.com/en-au/excel-help/create-a-histogram-HP001098364.aspx). Graphical approaches are more likely to communicate the degree of non-normality, which is typically more relevant for assumption testing (see [this discussion of normality](http://stats.stackexchange.com/questions/2492/is-normality-testing-essentially-useless). + +The data analysis toolpack in Excel will also give you skewness and kurtosis if you ask for descriptive statistics and choose the ""summary statistics"" option.",,2013-10-10 05:50:58.150 +185401,57195,22547.0,2,,CC BY-SA 3.0,1ed5a49f-bbf7-4655-8441-eb3e15fdfd14,"I have data for a network of weather stations across the United States. This gives me a data frame that contains date, latitude, longitude, and some measured value. Assume that data are collected once per day and driven by regional-scale weather (no, we are not going to get into that discussion). + +I'd like to show graphically how simultaneously-measured values are correlated across time and space. My goal is to show the regional homogeneity (or lack thereof) of the value that is being investigated. + +To start with, I took a subset of the data for the North East. I tried grouping the data by calendar month and plotting the correlation between all points (below). This shows how the 30+ data points from January, February, etc. are correlated between different stations in this area. + +![correlation between daily data during each calendar month][1] + + [1]: https://i.stack.imgur.com/X4YZI.jpg + +The plots seem to show that the values measured by the stations in the south of this domain have higher correlation than with others, but the relative density of connections in the networks means that any nuance is completely washed out. I also wonder if those southern stations aren't just the last ones to plot... + +Unfortunately, there is simply too much data to make sense of on one plot, and that can't be fixed by reducing the size of the lines. I suspect what I have to do is simplify the graph of connections between the stations (nodes) in my network, probably choosing between `k` nearest neighbours or some sphere of influence approach to chose the stations (I don't know how to do this). I am also not sure what is the most appropriate metric to show correlation, but for the intended (non-technical) audience, the correlation coefficient might just be the simplest to explain. 
I may need to present some other information like the gradient or standard error as well. + +I'm learning my way into this field and R at the same time, and would appreciate suggestions on: + + 1. What's the correct name for what I'm trying to do? Are there some helpful terms that would let me find more literature? My searches are drawing blanks for what must be a common application. + 2. Are there more appropriate methods to show the correlation between multiple data sets separated in space? + 3. ... in particular, methods that are easy to show results from visually? + 4. Are any of these implemented in R? + 5. Do any of these approaches lend themselves to automation? + +",,2013-10-10 05:52:03.253 +185400,57195,22547.0,1,,CC BY-SA 3.0,1ed5a49f-bbf7-4655-8441-eb3e15fdfd14,Showing spatial and temporal correlation on maps,,2013-10-10 05:52:03.253 +185399,57195,22547.0,3,,CC BY-SA 3.0,1ed5a49f-bbf7-4655-8441-eb3e15fdfd14,,,2013-10-10 05:52:03.253 +185402,57193,1717.0,5,,CC BY-SA 3.0,a6a6fcae-bcfd-4b7d-98fb-c48abf5f5e4b,"When you use EM to obtain maximum likelihood estimates, you need a variable that describes your observations $x_{n}$, latent variables $z_{n}$ that are in some way related to your observations (e.g. in coin tossing experiments, $\{H, T\}$ are the latent variables and in gaussian mixtures, the mixing coefficients $\pi_{i}$ take the role of latent variables) and the parameters $\theta$ that you are trying to estimate. + +At the risk of not answering your question at all, I think you want a maximum likelihood estimate of $\theta$ using EM based on known observations $x_{n}$ that are given by the following equation: + +$$x_{t} = s_{t}(\theta_{0}) + n_{t}$$ + +If that is correct, a general idea is the following. Since $n_{t}$ is white noise $N(0, \sigma)$, $x_{t}$ can be described by a Gaussian $p(x_{t}|s_{t},\theta) = N(s_{t}(\theta), \sigma)$. In the EM formulation, $x_{t}$'s are known variables, $s_{t}$'s are latent variables and $\theta$ is the parameter. It is customary to group the variables $x_{n}$ in a variable $X$ and likewise, latent variables $s_{n}$ are grouped in a variable $S$. + +As you should know, the EM algorithm consists of 2 steps: expectation and maximization. In the expectation step, we use an expression $Q$ as a proxy for the likelihood $L(\theta|X) = p(X|\theta)$, that is, the probability of +getting the known data $X$ given a parameter $\theta$. This is the same likelihood used to obtain maximum likelihood estimates. However, in EM we use this $Q$ instead: + +$$Q(\theta|\theta^{\text{old}}) = E_{S|X, \theta^{\text{old}}} \log p(X,S|\theta)$$ + +This odd-looking expression is actually a lower bound of the likelihood $L(\theta|X)$. [Bishop's book][1] contains a good derivation of $Q$. + +In order to start the EM magic, you have to choose a random $\theta^{\text{old}}$ and calculate this expectation. Notice that you need $p(X,S|\theta)$ and $p(S|X,\theta^{\text{old}})$. $p(X,S|\theta)$ is equal to $p(X|S,\theta)p(S|\theta)$ and using Bayes' theorem, $p(S|X,\theta^{\text{old}})$ is +proportional to $p(X|S,\theta^{\text{old}})p(S|\theta^{\text{old}})$. + +At this point, I hope it is clear that $p(X|S,\theta)=\prod_{t} p(x_{t}|s_{t},\theta)$, so that part is not hard to calculate. However, $p(S|\theta)$, that is, $\prod_{t}p(s_{t}|\theta)$ is required. I don't know what distribution could be appropriate since this depends on the specifics of your problem so I will assume you know. + +By now, you can calculate $Q(\theta|\theta^{\text{old}})$. 
+ +The maximization step is simply: + +$$\theta = \text{arg max}_{\theta} Q(\theta|\theta^{\text{old}})$$ + +This is the new $\theta$ to be used in the expectation step again until convergence. + +That is a general idea of how EM could work in this case. However, maybe you don't know a distribution for $s_{t}$ or it is difficult to calculate the expectation or the maximization step. + +For the big picture, take a look at [this][2] nice explanation. + +**UPDATE** + +I think you changed the question quite a bit. Are you asking how to calculate maximum likelihood estimates? Basically, you apply a derivative to the likelihood on the parameter you want to estimate: + +$$\frac{\partial}{\partial \theta}L(\theta|X) = 0$$ + +solve it and that's pretty much it. See more examples [here][3]. + + + [1]: http://www.amazon.com/Pattern-Recognition-Learning-Information-Statistics/dp/0387310738 + [2]: https://docs.google.com/viewer?url=http://www.nature.com/nbt/journal/v26/n8/pdf/nbt1406.pdf + [3]: http://en.wikipedia.org/wiki/Maximum_likelihood#Continuous_distribution.2C_continuous_parameter_space",added update,2013-10-10 05:57:41.453 +185403,57194,155.0,5,,CC BY-SA 3.0,30640fce-c1c3-4a74-afd6-87aa48cc4d27,"You could [plot a histogram using the data analysis toolpack in Excel](http://office.microsoft.com/en-au/excel-help/create-a-histogram-HP001098364.aspx). Graphical approaches are more likely to communicate the degree of non-normality, which is typically more relevant for assumption testing (see [this discussion of normality](http://stats.stackexchange.com/questions/2492/is-normality-testing-essentially-useless). + +The data analysis toolpack in Excel will also give you [skewness and kurtosis](http://graphpad.com/guides/prism/6/statistics/index.htm?stat_skewness_and_kurtosis.htm) if you ask for descriptive statistics and choose the ""summary statistics"" option. You might for example consider values of skewness above plus or minus one be a form of substantive non-normality. + +That said, the assumption with t-tests is that the residuals are normally distributed and not the variable. Furthermore, they also quite robust such that even with fairly large amounts of non-normality, p-values are still fairly valid.",added 444 characters in body,2013-10-10 05:59:12.390 +185404,57195,22547.0,5,,CC BY-SA 3.0,0179ea7e-9ed5-44c5-89e4-5ecb9259d0a7,"I have data for a network of weather stations across the United States. This gives me a data frame that contains date, latitude, longitude, and some measured value. Assume that data are collected once per day and driven by regional-scale weather (no, we are not going to get into that discussion). + +I'd like to show graphically how simultaneously-measured values are correlated across time and space. My goal is to show the regional homogeneity (or lack thereof) of the value that is being investigated. + +To start with, I took a subset of the data for the North East. I tried grouping the data by calendar month and then calculating the ordinary least squares regression between different pairs of data. I then plot the correlation between all pairs as a line connecting the stations (below). The line color shows the value of R2 from the OLS fit. The figure then shows how the 30+ data points from January, February, etc. are correlated between different stations in the area of interest. 
+ +![correlation between daily data during each calendar month][1] + + [1]: https://i.stack.imgur.com/X4YZI.jpg + +The plots show higher correlation between the stations in the south than between other pairs. Unfortunately, there is simply too much data to make sense of on one plot, and that can't be fixed by reducing the size of the lines. I suspect what I have to do is simplify the graph of connections between the stations (nodes) in my network, probably choosing between `k` nearest neighbours or some sphere of influence approach to chose the stations (I don't know how to do this). I am also not sure what is the most appropriate metric to show correlation, but for the intended (non-technical) audience, the correlation coefficient from OLS might just be the simplest to explain. I may need to present some other information like the gradient or standard error as well. + +I'm learning my way into this field and R at the same time, and would appreciate suggestions on: + + 1. What's the more formal name for what I'm trying to do? Are there some helpful terms that would let me find more literature? My searches are drawing blanks for what must be a common application. + 2. Are there more appropriate methods to show the correlation between multiple data sets separated in space? + 3. ... in particular, methods that are easy to show results from visually? + 4. Are any of these implemented in R? + 5. Do any of these approaches lend themselves to automation? + +",clarified method used and question,2013-10-10 06:01:43.267 +185411,57194,155.0,5,,CC BY-SA 3.0,63d64309-796a-4997-a946-5a05af52165f,"You could [plot a histogram using the data analysis toolpack in Excel](http://office.microsoft.com/en-au/excel-help/create-a-histogram-HP001098364.aspx). Graphical approaches are more likely to communicate the degree of non-normality, which is typically more relevant for assumption testing (see [this discussion of normality](http://stats.stackexchange.com/questions/2492/is-normality-testing-essentially-useless)). + +The data analysis toolpack in Excel will also give you [skewness and kurtosis](http://graphpad.com/guides/prism/6/statistics/index.htm?stat_skewness_and_kurtosis.htm) if you ask for descriptive statistics and choose the ""summary statistics"" option. You might for example consider values of skewness above plus or minus one be a form of substantive non-normality. + +That said, the assumption with t-tests is that the residuals are normally distributed and not the variable. Furthermore, they also quite robust such that even with fairly large amounts of non-normality, p-values are still fairly valid.",added 1 characters in body,2013-10-10 06:24:06.637 +185414,57198,22548.0,3,,CC BY-SA 3.0,84c6c1eb-7d1b-4327-b1b5-79df42387088,,,2013-10-10 06:34:08.830 +185413,57198,22548.0,1,,CC BY-SA 3.0,84c6c1eb-7d1b-4327-b1b5-79df42387088,what is random error in OLS regression? and the differnce with Gaussian noise?,,2013-10-10 06:34:08.830 +185412,57198,22548.0,2,,CC BY-SA 3.0,84c6c1eb-7d1b-4327-b1b5-79df42387088,"I'm new in statistics, and I would be thankful if you help me. + +in OLS regression: Y = β0 + β1 X1 + β2 X2 + β3 X3 + β4 X4 + β5 X5 + β6 X6 + ξ + +what is ξ ? is it Gaussian noise or random error? what is a difference why we add it to multiple regression model? In most of papers authors refer it to random error but without clarification. +I need a simple and good reason why authors add it to their model.. 
+ +",,2013-10-10 06:34:08.830 +185415,57199,22551.0,2,,CC BY-SA 3.0,df40d321-4e91-4174-945d-18d0fe55fbbe,"Poisson/Negative binomial can also be used with a binary outcome with offset equal to one. Of course it necessitates that the data be from a prospective design (cohort, rct, etc). Poisson or NB regression gives the more appropriate effect measure (IRR) versus odds ratio from logistic regression. + +NB regression is ""safer"" to run than Poisson regression because even if the overdispersion parameter (alpha in Stata) is not statistically significant, the results will be exactly the same as its Poisson regression form.",,2013-10-10 07:32:59.240 +185417,57200,15563.0,2,,CC BY-SA 3.0,4d6a5617-4e45-4a9e-a88c-e0fa3cc887c6,"I am using the [Kaggle Scikit][1] data to learn R. + +I am using the R e1071 SVM function to predict classes. + +When I use: + + svm(train, trainLabels, scale = TRUE, type = NULL, kernel = ""polynomial"") + +I obtain this level of accuracy on a sample of the Train data: + + > table(pred, trainLabels) + trainLabels + pred 0 1 + 0 478 8 + 1 12 502 + +which I interpret as being 98% accurate (8+12) / (478+8+12+502). + +Though when I use the same prediction model on the Test data, Kaggle returns a **0.82** score, [based on classification accuracy][2]. + +Can you explain why I can get such a different accuracy level? + + + [1]: https://www.kaggle.com/c/data-science-london-scikit-learn/ + [2]: https://www.kaggle.com/c/data-science-london-scikit-learn/details/evaluation",,2013-10-10 08:09:22.140 +185419,57200,15563.0,3,,CC BY-SA 3.0,4d6a5617-4e45-4a9e-a88c-e0fa3cc887c6,,,2013-10-10 08:09:22.140 +185418,57200,15563.0,1,,CC BY-SA 3.0,4d6a5617-4e45-4a9e-a88c-e0fa3cc887c6,SVM prediction accuracy drops when using Test data,,2013-10-10 08:09:22.140 +185420,57201,21762.0,2,,CC BY-SA 3.0,dc589773-5245-4e18-8535-7d4c012cc359,"A simple approach would be the following: + +1) Take all observations sampled at random under condition A and obtain the *relevant one sided* p-value from Wilcoxon's rank sum test. + +2) Do the same for the observations sampled under condition B. + +3) If the smaller of the two p-values is below the level $\alpha/2$ and the other p-value is below $\alpha$, then your claim holds at the $\alpha$ level. (This would be the Bonferroni-Holm correction for multiple testing.) + +Since the sample sizes are extremely low, you will get a ""significant"" result only if the signal is very strong.",,2013-10-10 08:18:09.707 +185422,57198,1406.0,4,,CC BY-SA 3.0,e38bca1c-fa2e-48e6-ba30-01b811878fd2,What is random error in OLS regression? And how is it related to Gaussian noise?,deleted 2 characters in body; edited title,2013-10-10 08:26:56.503 +185421,57198,1406.0,5,,CC BY-SA 3.0,e38bca1c-fa2e-48e6-ba30-01b811878fd2,"In OLS regression: + +$$Y=\beta_0+\beta_1 X_1+ \beta_2 X_2+\beta_3 X_3 + \beta_4 X_4+\beta_5 X_5+\beta_6 X_6 + \varepsilon,$$ + + +what is $\varepsilon$? Is it Gaussian noise or random error? What is a difference? Why we add it to multiple regression model? In most of papers authors refer it to random error but without clarification. + +I need a simple and good reason why authors add it to their model.. + +",deleted 2 characters in body; edited title,2013-10-10 08:26:56.503 +185423,57202,14799.0,2,,CC BY-SA 3.0,e7fe806b-7c5a-4ea5-8899-9e49d3ea9247,"This may a problem of interpretation, a misunderstanding what a so-called ""direct effect"" coefficient really is. 
+ +In regression models with continuous predictor variables and no interaction terms -- that is, with no terms that are constructed as the product of other terms -- each variable's coefficient is the slope of the regression surface in the direction of that variable. It is constant, regardless of the values of the variables, and is obviously a measure of the effect of that variable. + +In models with interactions -- that is, with terms that are constructed as the products of other terms -- that interpretation can be made without further qualification only for variables that are **not** involved in any interactions. The coefficient of a variable that **is** involved in interactions is the slope of the regression surface in the direction of that variable **when the values of all the variables that interact with the variable in question are zero**, and the significance test of the coefficient refers to the slope of the regression surface **only in that region of the predictor space**. Since there is no requirement that there actually be data in that region of the space, the apparent direct effect coefficient may bear little resemblance to the slope of the regression surface in the region of the predictor space where data were actually observed. There is no true ""direct effect"" in such cases; the best substitute is probably the ""average effect"": the slope of the regression surface in the direction of the variable in question, taken at each data point and averaged over all data points. For more on this, see http://stats.stackexchange.com/questions/65898/answer/65917",,2013-10-10 08:31:00.453 +185433,57205,22558.0,2,,CC BY-SA 3.0,08f26015-a044-4186-b60a-83d32509736b,"I would not touch the data at all. Use this for autocorrelation with NaNs: + +http://www.mathworks.com/matlabcentral/fileexchange/43840-autocorrelation-and-partial-autocorrelation-with-nans/content/nanautocorr.m",,2013-10-10 09:58:27.403 +185575,57248,10278.0,5,,CC BY-SA 3.0,09c07e23-6e2f-4df3-a2be-238b8fa63408,"If you treat the variable as ordinal you are assuming a gene-dosage effect. +This is essentially a two degree of freedom test. You are doing a regression. +If you treat the variable as nominal you are not assuming any gene-dosage effect and instead comparing the mean of the three genotype groups this is a one degree of freedom test. +Hence the gene-dosage model (treating genotypes as ordinal) is more powerful. +",added 30 characters in body,2013-10-10 19:34:30.353 +185425,57177,594.0,5,,CC BY-SA 3.0,b4c4d007-b8e9-455d-ac98-24914b0d487d,"As for qualitative differences, the lognormal and gamma are, as you say, quite similar. + +Indeed, in practice they're often used to model the same phenomena (some people will use a gamma where others use a lognormal). They are both, for example, constant-coefficient-of-variation models (the CV for the lognormal is $\sqrt{e^{\sigma^2} -1}$, for the gamma it's $1/\sqrt \alpha$). + +[How can it be constant if it depends on a parameter, you ask? It applies when you model the scale (location for the log scale); for the lognormal, $\mu$ acts as a scale parameter, while for the gamma, the scale is the parameter that isn't the shape parameter (or its reciprocal if you use the shape-rate parameterization). I'll call the scale parameter for the gamma distribution $\beta$. Gamma GLMs model the mean ($\mu=\alpha\beta$) while holding $\alpha$ constant; in that case $\mu$ is also a scale parameter. A model with varying $\mu$ and constant $\alpha$ or $\sigma$ respectively will have constant CV.] 
+ +You might find it instructive to look at the density of their *logs*, which often shows a very clear difference. + +The log of a lognormal random variable is ... normal. It's symmetric. + +The log of a gamma random variable is left-skew. Depending on the value of the shape parameter, it may be quite skew or nearly symmetric. + +Here's an example, with both lognormal and gamma having mean 1 and variance 1/4. The top plot shows the densities (gamma in green, lognormal in blue), and the lower one shows the densities of the logs: + +![gamma and lognormal, densitiy and density of log][1] + +(Plotting the log of the density of the logs is also useful. That is, taking a log-scale on the y-axis above) + +This difference implies that the gamma has more of a tail on the left, and less of a tail on the right; the far right tail of the lognormal is heavier and its left tail lighter. And indeed, if you look at the skewness, of the lognormal and gamma, for a given coefficient of variation, the lognormal is more right skew ($\text{CV}^3+3\text{CV}$) than the gamma ($2\text{CV}$). + + + [1]: https://i.stack.imgur.com/I9ARM.png",gave a 'simpler' form for lognormal skewness,2013-10-10 08:40:22.340 +185426,57203,633.0,2,,CC BY-SA 3.0,d8b9fa3f-b3f3-4de4-b435-dd770c51c19c,"Yes, the gamma distribution is the maximum entropy distribution for which the mean $E(X)$ and mean-log $E(\log X)$ are fixed. As with all exponential family distributions, it is the unique maximum entropy distribution for a fixed expected sufficient statistic.",,2013-10-10 09:03:03.037 +185427,57202,12683.0,5,,CC BY-SA 3.0,af5bd193-6764-4fd4-a93f-0be534858189,"This may be a problem of interpretation, a misunderstanding of what a so-called ""direct effect"" coefficient really is. + +In regression models with continuous predictor variables and no interaction terms -- that is, with no terms that are constructed as the product of other terms -- each variable's coefficient is the slope of the regression surface in the direction of that variable. It is constant, regardless of the values of the variables, and is obviously a measure of the effect of that variable. + +In models with interactions -- that is, with terms that are constructed as the products of other terms -- that interpretation can be made without further qualification only for variables that are **not** involved in any interactions. The coefficient of a variable that **is** involved in interactions is the slope of the regression surface in the direction of that variable **when the values of all the variables that interact with the variable in question are zero**, and the significance test of the coefficient refers to the slope of the regression surface **only in that region of the predictor space**. Since there is no requirement that there actually be data in that region of the space, the apparent direct effect coefficient may bear little resemblance to the slope of the regression surface in the region of the predictor space where data were actually observed. There is no true ""direct effect"" in such cases; the best substitute is probably the ""average effect"": the slope of the regression surface in the direction of the variable in question, taken at each data point and averaged over all data points. 
For more on this, see http://stats.stackexchange.com/questions/65898/answer/65917",fixed typos,2013-10-10 09:11:52.483 +185428,57175,12683.0,5,,CC BY-SA 3.0,a1c72a30-45e4-489a-b54d-d8637b93b401,"I have just delved into the basics of maximum likelihood estimation and expectation maximization. The latter is really difficult to follow and I am having a tough time in figuring how I can apply the EM method for parameter estimation by minimizing the error given in [paper][1]. The estimation problem is formulated as + +$x_n= s_n(\theta_0) + w_n Eq(1)$ + + +where $s_n$ is the signal of interest; $w_n$ is a Gaussian white noise. The parameter estimate is obtained after minimization of $J$ (eq3) + +$$\hat{\theta}= \arg \min_{\theta} J(\theta) Eq(2)$$ and $$J = \sum ||u_{i-1} - u_i||^2$$ Eq(3) + +where $u_n$ is a function, $u_n = x_n - s_n(\theta)$. Minimizing $u_n$ will give us the parameter vector $\theta$ since $u_n$ will converge to $\theta_0$. So, $u_n$ generally becomes a data series. In the paper they have used nearest neighbor search for minimizing J and hence the representation of Eq(3) as a nearest neighbor search. + +In the original way, the search is initialized by taking random values of initial guesses of the parameters and doing a random search, stopping when $J$ is minimized for a given number of iterations, the parameters are incremented after each iteration by a very small number. + +My question is how do I replace EQ(3) and formulate the recursive search for optimal $J$ as EM method. The maximum-likelihood solution for the parameters, +under the additive white Gaussian noise assumption, corresponds +to minimization of the norm as follows $\theta_{ML} = \arg \min u_n Eq(4)$ + + 1. Can somebody give ideas on solving Eq(4) as ML; i.e how do I minimize J using ML? + 2. I really do not understand how to replace Eq (3) by ML-based minimization since I cannot understand the formulation of the likelihood and other technical information. + + [1]: http://www.eurasip.org/Proceedings/Eusipco/Eusipco2000/SESSIONS/THUAM/OR1/CR1473.PDF","fixed typos, improved formatting",2013-10-10 09:14:54.500 +185429,57203,633.0,5,,CC BY-SA 3.0,8c96897a-1d96-42b6-b2f6-1b6b9c6d56ff,"Yes, the gamma distribution is the maximum entropy distribution for which the mean $E(X)$ and mean-log $E(\log X)$ are fixed. As with all exponential family distributions, it is the unique maximum entropy distribution for a fixed expected sufficient statistic. + +To answer your question about physical processes that generate these distributions: The lognormal distribution arises when the logarithm of X is normally distributed, for example, if X is the product of very many small factors. If X is gamma distributed, it is the sum of many exponentially-distributed variates. For example, the waiting time for many events of a Poisson process.",added 387 characters in body,2013-10-10 09:15:41.347 +185430,57195,,25,,,aff4e00a-efc3-4e59-b7ce-cdf642c08abc,,http://twitter.com/#!/StackStats/status/388231387591237632,2013-10-10 09:15:50.517 +185431,57164,12683.0,5,,CC BY-SA 3.0,67c56f53-78be-4009-aaf6-921619b97641,I have an experimentally observed distribution that looks very similar to a gamma or lognormal distribution. I've read that the [lognormal distribution](http://en.wikipedia.org/wiki/Log-normal_distribution) is the maximum entropy probability distribution for a random variate $X$ for which the mean and variance of $\ln(X)$ are fixed. 
Does the gamma distribution have any similar properties?,improved formatting,2013-10-10 09:20:31.077 +185432,57204,4831.0,2,,CC BY-SA 3.0,c696db83-d611-4f5e-ad15-6fb2e477ab34,"You are correct that there's currently no good way to do seasonal ARIMA in statsmodels. Currently, I only have a half-baked solution for doing non-consecutive lags, but it's not public anywhere. It's a bit heavy, computations-wise. Unfortunately, I doubt I'll be able to work on this anytime soon (unless someone would be willing to fund the enhancement...). Contributions in this area would be very welcome. + +https://github.com/statsmodels/statsmodels/issues/247 +https://github.com/statsmodels/statsmodels/issues/232",,2013-10-10 09:22:00.363 +185576,57223,22564.0,5,,CC BY-SA 3.0,66e09a7f-8995-4d05-9731-49469d5f6c4a,"I have a problem like the following: + +1) There are six measurements for each individual with large within-subject variance + +2) There are two groups (Treatment and Control) + +3) Each group consists of 5 individuals + +4) I want to perform a significance test comparing the two groups to know if the group means are different from one another. + + +The data looks like this: +![http://s10.postimg.org/p9krg6f3t/examp.png][1] + +And I have run some simulations using this code which does t tests to compare the group means. The group means were calculated by taking the means of the individual means. **This ignores within-subject variability**: + + + n.simulations<-10000 + pvals=matrix(nrow=n.simulations,ncol=1) + for(k in 1:n.simulations){ + subject=NULL + for(i in 1:10){ + subject<-rbind(subject,as.matrix(rep(i,6))) + } + #set.seed(42) + + #Sample Subject Means + subject.means<-rnorm(10,100,2) + + #Sample Individual Measurements + values=NULL + for(sm in subject.means){ + values<-rbind(values,as.matrix(rnorm(6,sm,20))) + } + + out<-cbind(subject,values) + + #Split into GroupA and GroupB + GroupA<-out[1:30,] + GroupB<-out[31:60,] + + #Add effect size to GroupA + GroupA[,2]<-GroupA[,2]+0 + + colnames(GroupA)<-c(""Subject"", ""Value"") + colnames(GroupB)<-c(""Subject"", ""Value"") + + #Calculate Individual Means and SDS + GroupA.summary=matrix(nrow=length(unique(GroupA[,1])), ncol=2) + for(i in 1:length(unique(GroupA[,1]))){ + GroupA.summary[i,1]<-mean(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2]) + GroupA.summary[i,2]<-sd(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2]) + } + colnames(GroupA.summary)<-c(""Mean"",""SD"") + + + GroupB.summary=matrix(nrow=length(unique(GroupB[,1])), ncol=2) + for(i in 1:length(unique(GroupB[,1]))){ + GroupB.summary[i,1]<-mean(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2]) + GroupB.summary[i,2]<-sd(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2]) + } + colnames(GroupB.summary)<-c(""Mean"",""SD"") + + Summary<-rbind(cbind(1,GroupA.summary),cbind(2,GroupB.summary)) + colnames(Summary)[1]<-""Group"" + + pvals[k]<-t.test(GroupA.summary[,1],GroupB.summary[,1], var.equal=T)$p.value + } + + +And here is code for plots: + + + #Plots + par(mfrow=c(2,2)) + boxplot(GroupA[,2]~GroupA[,1], col=""Red"", main=""Group A"", + ylim=c(.9*min(out[,2]),1.1*max(out[,2])), + xlab=""Subject"", ylab=""Value"") + stripchart(GroupA[,2]~GroupA[,1], vert=T, pch=16, add=T) + #abline(h=mean(GroupA[,2]), lty=2, lwd=3) + + for(i in 1:length(unique(GroupA[,1]))){ + m<-mean(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2]) + ci<-t.test(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2])$conf.int[1:2] + + points(i-.2,m, pch=15,cex=1.5, col=""Grey"") + segments(i-.2, + ci[1],i-.2, + ci[2], lwd=4, 
col=""Grey"" + ) + } + legend(""topleft"", legend=c(""Individual Means +/- 95% CI""), bty=""n"", pch=15, lwd=3, col=""Grey"") + + + boxplot(GroupB[,2]~GroupB[,1], col=""Light Blue"", main=""Group B"", + ylim=c(.9*min(out[,2]),1.1*max(out[,2])), + xlab=""Subject"", ylab=""Value"") + stripchart(GroupB[,2]~GroupB[,1], vert=T, pch=16, add=T) + #abline(h=mean(GroupB[,2]), lty=2, lwd=3) + + for(i in 1:length(unique(GroupB[,1]))){ + m<-mean(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2]) + ci<-t.test(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2])$conf.int[1:2] + + points(i-.2,m, pch=15,cex=1.5, col=""Grey"") + segments(i-.2, + ci[1],i-.2, + ci[2], lwd=4, col=""Grey"" + ) + } + legend(""topleft"", legend=c(""Individual Means +/- 95% CI""), bty=""n"", pch=15, lwd=3, col=""Grey"") + + + boxplot(Summary[,2]~Summary[,1], col=c(""Red"",""Light Blue""), xlab=""Group"", ylab=""Average Value"", + ylim=c(.9*min(Summary[,2]),1.1*max(Summary[,2])), + main=""Individual Averages"") + stripchart(Summary[,2]~Summary[,1], vert=T, pch=16, add=T) + + points(.9, mean(GroupA.summary[,1]), pch=15,cex=1.5, col=""Grey"") + segments(.9, + t.test(GroupA.summary[,1])$conf.int[1],.9, + t.test(GroupA.summary[,1])$conf.int[2], lwd=4, col=""Grey"" + ) + + points(1.9, mean(GroupB.summary[,1]), pch=15,cex=1.5, col=""Grey"") + segments(1.9, + t.test(GroupB.summary[,1])$conf.int[1],1.9, + t.test(GroupB.summary[,1])$conf.int[2], lwd=4, col=""Grey"" + ) + legend(""topleft"", legend=c(""Group Means +/- 95% CI""), bty=""n"", pch=15, lwd=3, col=""Grey"") + + + hist(pvals, breaks=seq(0,1,by=.05), col=""Grey"", + main=c(paste(""# sims="", n.simulations), + paste(""% Sig p-values="",100*length(which(pvals<0.05))/length(pvals))) + ) + +Now, it seems to me that because each individual mean is an estimate itself, that we should be less certain about the group means than shown by the 95% confidence intervals indicated by the bottom-left panel in the figure above. Thus the p-values calculated are underestimating the true variability and should lead to increased false-positives if we wish to extrapolate to future data. + +So what is the correct way to analyze this data? + + +**Bonus:** + +The example above is a simplification. For the actual data: + +1) The within-subject variance is positively correlated with the mean. + +2) Values can only be multiples of two. + +3) The individual results are not roughly normally distributed. They suffer from zero floor effect, and have long tails at the positive end. + +4) Number of Subjects in each group are not necessarily equal. + +Previous literature has used the t-test ignoring within-subject variability and other nuances as was done for the simulations above. Are these results reliable? If I can extract some means and standard errors from the figures how would I calculate the ""correct"" p-values. + +**EDIT:** + +Ok, here is what *actual* data looks like. 
There is also three groups rather than two: + +![enter image description here][2] + +dput() of data: + + structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, + 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, + 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, + 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, + 12, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15, + 15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 18, + 18, 18, 18, 18, 18, 2, 0, 16, 2, 16, 2, 8, 10, 8, 6, 4, 4, 8, + 22, 12, 24, 16, 8, 24, 22, 6, 10, 10, 14, 8, 18, 8, 14, 8, 20, + 6, 16, 6, 6, 16, 4, 2, 14, 12, 10, 4, 10, 10, 8, 4, 10, 16, 16, + 2, 8, 4, 0, 0, 2, 16, 10, 16, 12, 14, 12, 8, 10, 12, 8, 14, 8, + 12, 20, 8, 14, 2, 4, 8, 16, 10, 14, 8, 14, 12, 8, 14, 4, 8, 8, + 10, 4, 8, 20, 8, 12, 12, 22, 14, 12, 26, 32, 22, 10, 16, 26, + 20, 12, 16, 20, 18, 8, 10, 26), .Dim = c(108L, 3L), .Dimnames = list( + NULL, c(""Group"", ""Subject"", ""Value""))) + + +**EDIT 2:** + +In response to Henrik's answer: +So if I instead perform anova followed by TukeyHSD procedure on the individual averages as shown below, I could interpret this as underestimating my p-value by about 3-4x? + +My goal with this part of the question is to understand how I, as a reader of a journal article, can better interpret previous results given their choice of analysis method. For example they have those ""stars of authority"" showing me 0.01>p>.001. So if i accept 0.05 as a reasonable cutoff I should accept their interpretation? The only additional information is mean and SEM. + + #Get Invidual Means + summary=NULL + for(i in unique(dat[,2])){ + sub<-which(dat[,2]==i) + summary<-rbind(summary,cbind( + dat[sub,1][1], + dat[sub,2][1], + mean(dat[sub,3]), + sd(dat[sub,3]) + ) + ) + } + colnames(summary)<-c(""Group"",""Subject"",""Mean"",""SD"") + + TukeyHSD(aov(summary[,3]~as.factor(summary[,1])+ (1|summary[,2]))) + + # Tukey multiple comparisons of means + # 95% family-wise confidence level + # + # Fit: aov(formula = summary[, 3] ~ as.factor(summary[, 1]) + (1 | summary[, 2])) + # + # $`as.factor(summary[, 1])` + # diff lwr upr p adj + # 2-1 -0.672619 -4.943205 3.597967 0.9124024 + # 3-1 7.507937 1.813822 13.202051 0.0098935 + # 3-2 8.180556 2.594226 13.766885 0.0046312 + + [1]: https://i.stack.imgur.com/55V9J.png + [2]: https://i.stack.imgur.com/k1xWd.png",added 1451 characters in body,2013-10-10 19:45:44.693 +185579,57249,19264.0,3,,CC BY-SA 3.0,c104c76e-1ff7-4661-8be2-af1d7ad779d7,,,2013-10-10 19:49:21.903 +185434,57206,22555.0,2,,CC BY-SA 3.0,5f158851-751e-43d9-8312-58c314a0d914,"This question borders on statistics theory too - testing for normality with limited data may be questionable (although we all have done this from time to time). + +As an alternative, you can look at kurtosis and skewness coefficients. If you can get a copy of **Hahn and Shapiro: Statistical Models in Engineering** you can look up the factors Beta 1 and Beta 2 (pages 42 to 49) and the Chart 6-1 of Page 197. The theory behind this can be found on Wikipedia (see Pearson Distribution). + +Basically you need to calculate so called Beta 1 and Beta 2. 
If Beta 1 is 0 and Beta 2 is 3 then an assumption can be made that these data are approaching normality. Given this is a rough test but with limited data any test could be considered a rough one. + +Beta 1 is related to the moments 2 and 3, or variance and [skewness](http://en.wikipedia.org/wiki/Skewness), respectively. In Excel, these are VAR and SKEW. Where ... is your data array, the formula is: + + Beta1 = SKEW(...)^2/VAR(...)^3 + +Beta 2 is related to the moments 2 and 4, or the variance and [kurtosis](http://en.wikipedia.org/wiki/Kurtosis), respectively. In Excel, these are VAR and KURT. Where ... is your data array, the formula is: + + Beta2 = KURT(...)/VAR(...)^2 + +Then you can check these against the values of 0 and 3, respectively. Granted this is a very rough test (with some issues) but you may want to consider it as a preliminary check before going to a more rigorous method. It also has the advantage of potentially identifying another distribution. For example, Uniform, Normal, Student's t, Beta, Gamma, Exponential, and Log-Normal can be indicated: + + Uniform: 0, 1.8 + Exponential: 4, 9 + Normal: 0, 3 + Students-t: 0, 0 to 3 (a range) + +Gamma can be identified from a line, and Beta, Beta J and Beta U from regions. These are illustrated in Hahn and Shapiro Fig 6-1. + +Consider this a rough screening method, but it could indicate where further assessment is worthwhile. + +There are also adjustment mechanisms to the calculation of Beta1 and Beta2 where data is limited - but that is beyond this post. + + + +",,2013-10-10 09:59:06.940 +185435,57206,22555.0,5,,CC BY-SA 3.0,2c87f975-3fc8-4f90-85e4-326f51c97099,"This question borders on statistics theory too - testing for normality with limited data may be questionable (although we all have done this from time to time). + +As an alternative, you can look at kurtosis and skewness coefficients. If you can get a copy of **Hahn and Shapiro: Statistical Models in Engineering** you can look up the factors Beta 1 and Beta 2 (pages 42 to 49) and the Chart 6-1 of Page 197. The theory behind this can be found on Wikipedia (see Pearson Distribution). + +Basically you need to calculate so called Beta 1 and Beta 2. If Beta 1 is 0 and Beta 2 is 3 then an assumption can be made that these data are approaching normality. Given this is a rough test but with limited data any test could be considered a rough one. + +Beta 1 is related to the moments 2 and 3, or variance and [skewness](http://en.wikipedia.org/wiki/Skewness), respectively. In Excel, these are VAR and SKEW. Where ... is your data array, the formula is: + + Beta1 = SKEW(...)^2/VAR(...)^3 + +Beta 2 is related to the moments 2 and 4, or the variance and [kurtosis](http://en.wikipedia.org/wiki/Kurtosis), respectively. In Excel, these are VAR and KURT. Where ... is your data array, the formula is: + + Beta2 = KURT(...)/VAR(...)^2 + +Then you can check these against the values of 0 and 3, respectively. This has the advantage of potentially identifying another distribution. For example, Uniform, Normal, Student's t, Beta, Gamma, Exponential, and Log-Normal can be indicated: + + Uniform: 0, 1.8 + Exponential: 4, 9 + Normal: 0, 3 + Students-t: 0, 0 to 3 (a range) + +Gamma can be identified from a line, and Beta, Beta J and Beta U from regions. These are illustrated in Hahn and Shapiro Fig 6-1. + +Granted this is a very rough test (with some issues) but you may want to consider it as a preliminary check before going to a more rigorous method. 
+ +There are also adjustment mechanisms to the calculation of Beta1 and Beta2 where data is limited - but that is beyond this post. + + + +",deleted 106 characters in body,2013-10-10 10:09:29.183 +185436,57206,22555.0,5,,CC BY-SA 3.0,d971bd91-a11b-4a1d-8ce1-fdafbc03e237,"This question borders on statistics theory too - testing for normality with limited data may be questionable (although we all have done this from time to time). + +As an alternative, you can look at kurtosis and skewness coefficients. If you can get a copy of **Hahn and Shapiro: Statistical Models in Engineering** you can look up the factors Beta 1 and Beta 2 (pages 42 to 49) and the Chart 6-1 of Page 197. The theory behind this can be found on Wikipedia (see Pearson Distribution). + +Basically you need to calculate so called Beta 1 and Beta 2. A Beta 1 of 0 and Beta 2 of 3 suggests that these data are approaching normality. Given this is a rough test but with limited data any test could be considered a rough one. + +Beta 1 is related to the moments 2 and 3, or variance and [skewness](http://en.wikipedia.org/wiki/Skewness), respectively. In Excel, these are VAR and SKEW. Where ... is your data array, the formula is: + + Beta1 = SKEW(...)^2/VAR(...)^3 + +Beta 2 is related to the moments 2 and 4, or the variance and [kurtosis](http://en.wikipedia.org/wiki/Kurtosis), respectively. In Excel, these are VAR and KURT. Where ... is your data array, the formula is: + + Beta2 = KURT(...)/VAR(...)^2 + +Then you can check these against the values of 0 and 3, respectively. This has the advantage of potentially identifying another distribution. For example, Uniform, Normal, Student's t, Beta, Gamma, Exponential, and Log-Normal can be indicated: + + Uniform: 0, 1.8 + Exponential: 4, 9 + Normal: 0, 3 + Students-t: 0, 3 to 10 (a range) + Impossible: 0 to 4 and 1 to 5 (a region). + +Gamma can be identified from a line, and Beta, Beta J and Beta U from regions. These are illustrated in Hahn and Shapiro Fig 6-1. + +Granted this is a very rough test (with some issues) but you may want to consider it as a preliminary check before going to a more rigorous method. + +There are also adjustment mechanisms to the calculation of Beta1 and Beta2 where data is limited - but that is beyond this post. + + + +",deleted 106 characters in body,2013-10-10 10:15:59.297 +185437,57207,503.0,2,,CC BY-SA 3.0,461d5e8a-a086-4616-92cd-995d838ccd95,"For measurement error there really isn't a difference in the definitions. Psychometry defines ""true score"" as ""measured score"" + ""error"" and this is the same thing as the statistical definition. The confusion may come from different terminology; that developed because psychometry deals with tests while statistics can deal with almost anything. + +""Bias"" is a bit more complex. @NickCox gave the definition in statistics. In psychometry, it is used (at least some of the time) in a slightly different way, again due to the specialized nature of the subject. A test is biased for/against a group if its predictions work differently in another setting. So, e.g. if we are using SAT scores to predict college GPA, bias would be that one group gets lower/higher GPA with the same SAT score. + +In statistics, a scale could be biased against everyone - e.g. if my scale estimates everyone's weight as 5 pounds less than the actual value, that's bias. In the psychometrics definition, that can't be bias. + +BUT psychometricians often use ""bias"" in the statistical sense as well. 
+",,2013-10-10 10:19:23.717 +185440,57208,13889.0,3,,CC BY-SA 3.0,4f16d70a-15a3-4557-952c-52db1864aa30,,,2013-10-10 10:34:56.943 +185438,57208,13889.0,2,,CC BY-SA 3.0,4f16d70a-15a3-4557-952c-52db1864aa30,Suppose I have N methods and M benchmarks. I have an AUC statistic (and some other similar statistics) for each combination of method with benchmark. What test should I use to test if one method is better than the rest? I have seen some authors do pairwise comparisons using a one-sided Wilcoxon signed-rank test but I would prefer to test all methods at once. In any case I'm not sure the assumptions for the one-sided Wilcoxon signed-rank test hold. If the average AUC for each benchmark varies widely can you say the samples are from the same population? Also I'm not sure the distribution of the AUCs is symmetric around the median. Any advice would be welcome.,,2013-10-10 10:34:56.943 +185439,57208,13889.0,1,,CC BY-SA 3.0,4f16d70a-15a3-4557-952c-52db1864aa30,Test to rank methods by AUCs on various benchmarks,,2013-10-10 10:34:56.943 +185441,57209,21896.0,2,,CC BY-SA 3.0,6fefc0ac-befd-4a0f-b8a1-835c6c8db58b,"Following the suggestion of @Momo I will answer the question myself. What I had forgotten yesterday when I posted this question, is that I can just see what `glm.nb` does by typing ""glm.nb"" into the console. From the code it returns it can be inferred that indeed the variance equals $\mu + \mu^2/\theta$ so that $\theta = 1/\kappa$. + +Also I'd like to use the opportunity to advertise this [article](http://www.jstatsoft.org/v27/i08/paper) I found since then, also addressing these matters. +",,2013-10-10 10:37:03.140 +185445,57210,22560.0,3,,CC BY-SA 3.0,36622877-0359-46a6-b09f-15f1cc8bbb8d,,,2013-10-10 10:46:06.893 +185444,57210,22560.0,1,,CC BY-SA 3.0,36622877-0359-46a6-b09f-15f1cc8bbb8d,Why eigenvalues is greater than 1 in factor analysis,,2013-10-10 10:46:06.893 +185443,57210,22560.0,2,,CC BY-SA 3.0,36622877-0359-46a6-b09f-15f1cc8bbb8d,Why we take eigenvalue greater than 1 in factor analysis to retain factors ? And how can we decide which variables are to be chosen as factors ,,2013-10-10 10:46:06.893 +185578,57249,19264.0,1,,CC BY-SA 3.0,c104c76e-1ff7-4661-8be2-af1d7ad779d7,General Sum of Gamma Distributions,,2013-10-10 19:49:21.903 +185447,57190,449.0,5,,CC BY-SA 3.0,23e7321b-8294-44d4-a6ce-dda2fe4d33b0,"You can use multi-level logistic regression. You've only got one dependent variable, correctness. You have multiple independent variables nested within student. In R you can use `lmer` to construct the model. It would look something like. + + m <- lmer( answer ~ treatment * Q + (treatment * Q | student), family = 'binomial', data = mydata) + +That would allow for there to be random effects of question and treatment within student as well as overall correctness variability within student but you would also be able to assess fixed effects of treatment and question. What you seem to really want to know is all of the treatment by question interactions and that model provides them. + +In order to analyze all of the questions with any kind of reliability you really should have a lot of students taking the test (hundreds). The general effect of treatment could be assessed with fewer. Also, if you know the categories, the kinds of questions you think differ, then you could replace the individual question variable with that. 
It would be much more sensible and make this look much less like a fishing expedition.",added 272 characters in body,2013-10-10 10:53:13.073 +185450,57212,22190.0,2,,CC BY-SA 3.0,2b89f1ea-c055-4128-acd8-e7a1d66333eb,"I am new to statistics and asked to develop a statistical model, which I had started, they ask me to carry out concordance and discordance now, however I don't know anything about these terms except except that the concordance is the probability that a pair of individuals will both have a certain characteristic, given that one of the pair has the characteristic and the opposite for discordance. +Still I don't know why I have to find them and what would be the appropriate value of both for a decent model. +Thanks +",,2013-10-10 10:53:56.520 +185449,57212,22190.0,1,,CC BY-SA 3.0,2b89f1ea-c055-4128-acd8-e7a1d66333eb,Concordance and Discordance role in modelling,,2013-10-10 10:53:56.520 +185448,57212,22190.0,3,,CC BY-SA 3.0,2b89f1ea-c055-4128-acd8-e7a1d66333eb,,,2013-10-10 10:53:56.520 +185451,57213,503.0,2,,CC BY-SA 3.0,009467a3-3084-46f7-a392-f277eae83613,"In [this paper][1] I cover concordance and discordance. The paper is about `PROC LOGISTIC` in `SAS` but the section on concordance is more general. Briefly: Look at all possible pairs of observations. A pair is concordant if the observation with the higher observed value also has the higher predicted value. + + + [1]: http://www.nesug.org/proceedings/nesug08/sa/sa09.pdf",,2013-10-10 11:17:56.730 +185452,57206,22555.0,5,,CC BY-SA 3.0,cb7e33a9-60c9-4480-9e61-478f8e010ade,"This question borders on statistics theory too - testing for normality with limited data may be questionable (although we all have done this from time to time). + +As an alternative, you can look at kurtosis and skewness coefficients. If you can get a copy of **Hahn and Shapiro: Statistical Models in Engineering** you can look up the factors Beta 1 and Beta 2 (pages 42 to 49) and the Chart 6-1 of Page 197. The theory behind this can be found on Wikipedia (see Pearson Distribution). + +Basically you need to calculate so called Beta 1 and Beta 2. A Beta 1 of 0 and Beta 2 of 3 suggests that these data are approaching normality. Given this is a rough test but with limited data any test could be considered a rough one. + +Beta 1 is related to the moments 2 and 3, or variance and [skewness](http://en.wikipedia.org/wiki/Skewness), respectively. In Excel, these are VAR and SKEW. Where ... is your data array, the formula is: + + Beta1 = SKEW(...)^2/VAR(...)^3 + +Beta 2 is related to the moments 2 and 4, or the variance and [kurtosis](http://en.wikipedia.org/wiki/Kurtosis), respectively. In Excel, these are VAR and KURT. Where ... is your data array, the formula is: + + Beta2 = KURT(...)/VAR(...)^2 + +Then you can check these against the values of 0 and 3, respectively. This has the advantage of potentially identifying another distribution. For example, Uniform, Normal, Student's t, Beta, Gamma, Exponential, and Log-Normal can be indicated: + + Uniform: [0,1.8] [point] + Exponential: [4,9] [point] + Normal: [0,3] [point] + Students-t: [0,0] to (3,10] [line] + Lognormal: (0,3.6] to (3,10] [line] + Gamma and Beta: (0,3) to [4,9], [1,0] to [4,5] ex [0,1.8] [area] + Impossible: (0,1),(4,1] to (4,5) [area] + + Values of Beta1, Beta2 where brackets mean: + + [ and ] : includes + ( and ) : approaches but does not include. + ex : excludes + +These are illustrated in Hahn and Shapiro Fig 6-1. 
+ +Granted this is a very rough test (with some issues) but you may want to consider it as a preliminary check before going to a more rigorous method. + +There are also adjustment mechanisms to the calculation of Beta1 and Beta2 where data is limited - but that is beyond this post. + + + +",added 431 characters in body,2013-10-10 11:19:34.070 +185453,57206,22555.0,5,,CC BY-SA 3.0,5fd45d87-4f2a-4858-bd1e-a7a0044d8898,"This question borders on statistics theory too - testing for normality with limited data may be questionable (although we all have done this from time to time). + +As an alternative, you can look at kurtosis and skewness coefficients. If you can get a copy of **Hahn and Shapiro: Statistical Models in Engineering** you can look up the factors Beta 1 and Beta 2 (pages 42 to 49) and the Chart 6-1 of Page 197. The theory behind this can be found on Wikipedia (see Pearson Distribution). + +Basically you need to calculate so called Beta 1 and Beta 2. A Beta 1 of 0 and Beta 2 of 3 suggests that these data are approaching normality. Given this is a rough test but with limited data any test could be considered a rough one. + +Beta 1 is related to the moments 2 and 3, or variance and [skewness](http://en.wikipedia.org/wiki/Skewness), respectively. In Excel, these are VAR and SKEW. Where ... is your data array, the formula is: + + Beta1 = SKEW(...)^2/VAR(...)^3 + +Beta 2 is related to the moments 2 and 4, or the variance and [kurtosis](http://en.wikipedia.org/wiki/Kurtosis), respectively. In Excel, these are VAR and KURT. Where ... is your data array, the formula is: + + Beta2 = KURT(...)/VAR(...)^2 + +Then you can check these against the values of 0 and 3, respectively. This has the advantage of potentially identifying another distribution. For example, Uniform, Normal, Student's t, Beta, Gamma, Exponential, and Log-Normal can be indicated: + + Uniform: [0,1.8] [point] + Exponential: [4,9] [point] + Normal: [0,3] [point] + Students-t: [0,0] to (3,10] [line] + Lognormal: (0,3.6] to (3,10] [line] + Gamma and Beta: (0,3) to [4,9], [1,0] to [4,5] ex [0,1.8] [area] + Impossible: (0,1) to (4.5], (0,1) to (4,1] [area] + + Values of Beta1, Beta2 where brackets mean: + + [ and ] : includes + ( and ) : approaches but does not include. + ex : excludes + +These are illustrated in Hahn and Shapiro Fig 6-1. + +Granted this is a very rough test (with some issues) but you may want to consider it as a preliminary check before going to a more rigorous method. + +There are also adjustment mechanisms to the calculation of Beta1 and Beta2 where data is limited - but that is beyond this post. + + + +",added 431 characters in body,2013-10-10 11:26:56.413 +185455,57210,,5,user88,CC BY-SA 3.0,1989dd57-e043-40fd-822b-7b63ec202f11,"Why we take eigenvalue greater than 1 in factor analysis to retain factors? + And how can we decide which variables are to be chosen as factors?",added 3 characters in body; edited title,2013-10-10 11:28:00.717 +185454,57210,,4,user88,CC BY-SA 3.0,1989dd57-e043-40fd-822b-7b63ec202f11,Why eigenvalues are greater than 1 in factor analysis?,added 3 characters in body; edited title,2013-10-10 11:28:00.717 +185456,57214,1428.0,2,,CC BY-SA 3.0,a908955b-f311-40da-9f9e-ba2430ab3eb3,"If the M benchmarks are supposed to yield score identically distributed score estimates (e.g. 
cross-validation folds) then maybe you can estimate confidence intervals for the mean AUC score for each method by bootstrapping on the M benchmarks of that method and then compare methods by considering non-overlapping confidence intervals. As bootstrapped confidence interval is a non-parametric method, you do not make any assumption on the symmetry of AUCs around the median.",,2013-10-10 11:28:17.020 +185457,57212,12683.0,5,,CC BY-SA 3.0,58cf2067-e072-4253-add0-28c3fe86e5e7,"I am new to statistics and asked to develop a statistical model, which I had started, they ask me to carry out concordance and discordance now, however I don't know anything about these terms except except that the concordance is the probability that a pair of individuals will both have a certain characteristic, given that one of the pair has the characteristic and the opposite for discordance. +Still I don't know why I have to find them and what would be the appropriate value of both for a decent model. +",removed thanks,2013-10-10 11:33:34.190 +186095,57394,16046.0,3,,CC BY-SA 3.0,e7e70389-f554-4886-b5ae-65780f3c30a0,,,2013-10-13 16:53:32.577 +186164,57417,22425.0,3,,CC BY-SA 3.0,e6884711-0a34-4459-adbd-ae5d96027db4,,,2013-10-14 03:02:05.353 +185458,57206,22555.0,5,,CC BY-SA 3.0,2c3c267d-7c08-490c-8066-12b5b43dfeaf,"This question borders on statistics theory too - testing for normality with limited data may be questionable (although we all have done this from time to time). + +As an alternative, you can look at kurtosis and skewness coefficients. If you can get a copy of **Hahn and Shapiro: Statistical Models in Engineering** you can look up the factors Beta 1 and Beta 2 (pages 42 to 49) and the Chart 6-1 of Page 197. The theory behind this can be found on Wikipedia (see Pearson Distribution). + +Basically you need to calculate so called Beta 1 and Beta 2. A Beta 1 of 0 and Beta 2 of 3 suggests that these data are approaching normality. Given this is a rough test but with limited data any test could be considered a rough one. + +Beta 1 is related to the moments 2 and 3, or variance and [skewness](http://en.wikipedia.org/wiki/Skewness), respectively. In Excel, these are VAR and SKEW. Where ... is your data array, the formula is: + + Beta1 = SKEW(...)^2/VAR(...)^3 + +Beta 2 is related to the moments 2 and 4, or the variance and [kurtosis](http://en.wikipedia.org/wiki/Kurtosis), respectively. In Excel, these are VAR and KURT. Where ... is your data array, the formula is: + + Beta2 = KURT(...)/VAR(...)^2 + +Then you can check these against the values of 0 and 3, respectively. This has the advantage of potentially identifying another distribution. For example, Uniform, Normal, Student's t, Beta, Gamma, Exponential, and Log-Normal can be indicated: + + Uniform: [0,1.8] [point] + Exponential: [4,9] [point] + Normal: [0,3] [point] + Students-t: [0,0] to (3,10] [line] + Lognormal: (0,3.6] to (3,10] [line] + Gamma and Beta: (0,3) to (4,9), [1,0] to [4,5] ex [0,1.8] [area] + Impossible: (0,1) to (4.5), (0,1) to (4,1] [area] + + Values of Beta1, Beta2 where brackets mean: + + [ and ] : includes + ( and ) : approaches but does not include. + ex : excludes + +These are illustrated in Hahn and Shapiro Fig 6-1. + +Granted this is a very rough test (with some issues) but you may want to consider it as a preliminary check before going to a more rigorous method. + +There are also adjustment mechanisms to the calculation of Beta1 and Beta2 where data is limited - but that is beyond this post. 
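+
+A minimal R sketch of this screening idea, added purely for illustration: it assumes the Pearson definitions computed from central moments directly (beta1 = m3^2/m2^3, beta2 = m4/m2^2), rather than transcribing the Excel formulas above, and uses simulated data with arbitrary sample size.
+
+    # beta1 = m3^2 / m2^3 and beta2 = m4 / m2^2, with m_k the k-th central moment
+    pearson_betas <- function(x) {
+      m2 <- mean((x - mean(x))^2)
+      m3 <- mean((x - mean(x))^3)
+      m4 <- mean((x - mean(x))^4)
+      c(beta1 = m3^2 / m2^3, beta2 = m4 / m2^2)
+    }
+
+    set.seed(1)
+    pearson_betas(rnorm(1e5))   # roughly (0, 3)   -> consistent with normal
+    pearson_betas(rexp(1e5))    # roughly (4, 9)   -> consistent with exponential
+    pearson_betas(runif(1e5))   # roughly (0, 1.8) -> consistent with uniform
+
+The reference points correspond to the table above; values landing elsewhere point toward one of the other lines or regions.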
+ + + +",edited body,2013-10-10 11:40:05.413 +185459,57215,449.0,2,,CC BY-SA 3.0,3ab4e791-2952-41cd-805e-6f98e58a0af7,"The wikipedia definition is a fine definition that you can use for your paper if you need one but I think you're missing something. + +The $\epsilon$ is random error, which is synonymous with noise. In practice, the random error can be Gaussian distributed, in which case it is Gaussian noise, but it could take on other distributions. If the distribution of $\epsilon$ happens to be Gaussian then you've met one of the theoretical assumptions of the model and things like interval estimation are better justified. If it's not Gaussian then, like Glen_b said, you still have that it's best linear unbiased. + +Theoretically, the random error (noise) is supposed to be Gaussian distributed but the outcome could be anything. So, in order to answer your question you'd need to state whether you want to know the distribution of your particular noise or what the distribution of the noise should be. For the former you'd need data.",,2013-10-10 11:41:48.027 +185460,57216,22561.0,2,,CC BY-SA 3.0,15f64131-5579-49b9-916c-e25e8d005db8,"The total variance for combined regression results can be estimated using the same approach as in multiple imputations. In the attached file, the formulas for combining the regression results and total variance are presented. + +![Combining (pooloing) results of several regression models into one][1] + + + [1]: https://i.stack.imgur.com/rFT00.png",,2013-10-10 12:00:04.637 +185463,57217,15563.0,3,,CC BY-SA 3.0,2aec6883-726c-4e89-8759-5d33b35fba3b,,,2013-10-10 12:05:58.447 +185461,57217,15563.0,2,,CC BY-SA 3.0,2aec6883-726c-4e89-8759-5d33b35fba3b,"I am using the R e1071 library for the SVM (Support Vector Machine) algorithm. +And I used tune to find out the best Cost and gamma parameters. + +Though the plot doesn't seem to provide the actual best prediction. + +Here is some details: + + gammalist <- c(0.005,0.01,0.015,0.02,0.025,0.03,0.035,0.04,0.045,0.05) + obj <- tune(svm, Class~., data = trainData, ranges = list(gamma = gammalist, cost = 2^(2:4)), tunecontrol = tune.control(sampling = ""fix"") ) + plot(obj) + + +Here is the plot obtained: +![enter image description here][1] + +The plot leads me to believe 0.02 is roundabout the best gamma. +But I actually tested manually several others and find better results for 0.042. + +On a 200 sample, I get 23 errors with gamma=0.042, and 26 errors with gamma=0.02. + +How do you explain this? + + + [1]: https://i.stack.imgur.com/Al0QG.png",,2013-10-10 12:05:58.447 +185462,57217,15563.0,1,,CC BY-SA 3.0,2aec6883-726c-4e89-8759-5d33b35fba3b,R e1071 tune plot does not give me the best gamma?,,2013-10-10 12:05:58.447 +185468,57185,15827.0,4,,CC BY-SA 3.0,51c21a53-4894-4c20-96e6-16220fc8de24,What is the difference between the concept and treatment of measurement error in psychometry and in statistics?,"shorter title (without distorting intended meaning, I hope)",2013-10-10 12:36:32.503 +185469,57218,21398.0,2,,CC BY-SA 3.0,c3ee4016-6ca0-4ca6-aea6-d299cf321fd4,I have a question. I have a dataset with some missing values that were not MCAR. I imputed them with fully conditional specification method iterations. I then executed my analysis on the basis of the imputed dataset. The results of the original model (with missing values: listwise deletion) did not change much in the eventual pooled model. My idea would be to go back the missing values dataset. 
What do you think?,,2013-10-10 12:44:27.567 +185471,57218,21398.0,3,,CC BY-SA 3.0,c3ee4016-6ca0-4ca6-aea6-d299cf321fd4,,,2013-10-10 12:44:27.567 +185470,57218,21398.0,1,,CC BY-SA 3.0,c3ee4016-6ca0-4ca6-aea6-d299cf321fd4,Multiple imputation - original model,,2013-10-10 12:44:27.567 +185473,57219,10547.0,2,,CC BY-SA 3.0,42d6c45c-e692-4b95-b8e6-ec3e27228396,"If all the $Zs$ are also determined by the $Y$ the system of equations cannot be identified. What you need to do is to reduce the equations such that the coefficients can be identified. + +I recommend reading: + +> Wooldridge, Introductory Econometrics, 3d ed. Chapter 16: Simultaneous +> equations + +Here your kind of problems gets explained and there are some pretty nice examples.",,2013-10-10 12:59:56.353 +185474,57219,10547.0,5,,CC BY-SA 3.0,493c4cdd-9a31-438b-8502-d852e938d0a6,"If all the $Z$s are also determined by the $Y$ the system of equations, which you have proposed, cannot be identified. What you need to do is to reduce the equations such that the coefficients can be identified. + +I recommend reading: + +> Wooldridge, Introductory Econometrics, 3d ed. Chapter 16: Simultaneous +> equations + +Here, your kind of problems gets explained, and there are some pretty nice examples. + +I also recommend reading: + + Rummery,Vella,Verbeek (1998) - Estimating the Returns to Education for Australian Youth via Rank-Order Instrumental Variables + +and + + Vella,Verbeek (1997) - Using Rank Order As An Instrumental Variable - An Applicaton To The Return To Schooling + +Vella and Verbeek (also Rummery) estimate smth. like: + +$y_i = x_i\beta + z_i\delta + e_i, \ \ \ \ i = 1,...,N$ + +Here $x_i$ is a $K$ vector of exogenous variables whereas $z_i$ is assumed to be endogenous. Hence the reduced form equation of $z_i$ is given by: + +$z_i = x_i\alpha + v_i$ + +The advantage of this approach is, that you dont need any exclusion restrictions for the $x_i$, which are necessary to make 2SLS/3SLS work. + +I've used this approach to solve a three equation system, i.e., i got three equations and in each of them there are two endogenous regressors which are also the dependend variable in some other equation. + +I also applied a plug-in style of approach to deal with potential heteroscedasticity. + +There are some issues which are not presented within this papers but I would be happy to talk to you about that.",added 1145 characters in body,2013-10-10 13:10:22.827 +186163,57417,22425.0,2,,CC BY-SA 3.0,e6884711-0a34-4459-adbd-ae5d96027db4,"Let X1, X2,...,Xn be discrete random variables. I'm looking for a way to prove the random variables are independent but not identically distributed. + +Can anyone suggest some ideas ?",,2013-10-14 03:02:05.353 +185477,57220,22562.0,2,,CC BY-SA 3.0,db0186ad-6dd6-4ac7-96e4-defd1999ad41,"I have the following regression + + children = \beta_0 + \beta_1 (log) earnings + \beta_2 grandparents + \epsilon + +and \beta_1>0 with p=0.01 and \beta_2>0 with p=0.01, and N is large (N>10.000) and grandparents takes values 0,1,2,3,4. + +Then I add the interaction term ((log) earnings*grandparents) to equation 1, such that: + + children = \beta_0 + \beta_1 (log) earnings + \beta_2 grandparents+ \beta_3 ((log) earnings*grandparents) + \epsilon + +and \beta_1>0 with p=0.01, beta_2 is no longer stat.sign and also \beta_3 is no stat. sign. + +I do not understand how to interpret the results and if the interaction term wipes out the direct effect of grandparents since (log)earnings is always different from 0. + +Thanks! 
+ + + +",,2013-10-10 13:11:58.220 +185476,57220,22562.0,3,,CC BY-SA 3.0,db0186ad-6dd6-4ac7-96e4-defd1999ad41,,,2013-10-10 13:11:58.220 +185475,57220,22562.0,1,,CC BY-SA 3.0,db0186ad-6dd6-4ac7-96e4-defd1999ad41,Interaction wipes out my direct effects in regression (non zero variable),,2013-10-10 13:11:58.220 +185478,57220,16474.0,5,,CC BY-SA 3.0,921e1d79-1e39-4114-a8a3-ff07f1af83c3,"I have the following regression + + $children = \beta_0 + \beta_1 \log(earnings) + \beta_2 grandparents + \epsilon$ + +and $\beta_1>0$ with $p$=0.01 and $\beta_2>0$ with $p$=0.01, and N is large (N>10.000) and grandparents takes values 0,1,2,3,4. + +Then I add the interaction term ($\log(earnings)*grandparents$) to equation 1, such that: + + $children = \beta_0 + \beta_1 \log( earnings) + \beta_2 grandparents+ \beta_3 \log( earnings)*grandparents + \epsilon$ + +and $\beta_1>0$ with $p$=0.01, $\beta_2$ is no longer statistically significant and also $\beta_3$ is not statistically significant. + +I do not understand how to interpret the results and if the interaction term wipes out the direct effect of grandparents since $\log(earnings)$ is always different from 0. + + + +",improved formatting,2013-10-10 13:23:13.150 +185481,57221,22563.0,3,,CC BY-SA 3.0,bfb7c505-77a9-4e14-abc9-7a6898af3636,,,2013-10-10 13:28:20.653 +185480,57221,22563.0,1,,CC BY-SA 3.0,bfb7c505-77a9-4e14-abc9-7a6898af3636,What's the approximate distribution? Replace the true mean with sample mean,,2013-10-10 13:28:20.653 +185479,57221,22563.0,2,,CC BY-SA 3.0,bfb7c505-77a9-4e14-abc9-7a6898af3636,"If say for a random variable X, I have observation of x1,x2,x3,....,xn. Let m be the sample mean, and s be the sample deviation. Does the new random variable (x-m)/s follow some distribution? It's not t-distribution I guess, since for it to be t distribution, m needs to be replaced by true mean. + +Can statistics expert shed some light on this?",,2013-10-10 13:28:20.653 +185482,57222,16474.0,2,,CC BY-SA 3.0,a41ca399-164d-4220-a633-503daab41441,"$\beta_2$ in equation 2 is the effect of $grandparents$ when $\log(earnings) = 0$, i.e. $earnings = 1$. This is apperently outside the range of your data, so it is an extrapolation. The easiest way around that is to center $earnings$ before taking the logarithm or creating the interaction term at some meaningfull value withing the range of the data, for example, the median. That way the main effect of $grandparents$ will be the effect of grandparents when one has a median income instead of a fictional income of 1.",,2013-10-10 13:29:02.837 +185484,57223,22564.0,1,,CC BY-SA 3.0,508b3919-0da2-43d5-87c6-349b1a53e63e,How to compare two groups with multiple measurements for each individual with R?,,2013-10-10 13:47:37.470 +185485,57223,22564.0,3,,CC BY-SA 3.0,508b3919-0da2-43d5-87c6-349b1a53e63e,,,2013-10-10 13:47:37.470 +185483,57223,22564.0,2,,CC BY-SA 3.0,508b3919-0da2-43d5-87c6-349b1a53e63e," +Hello Stack exchange, + +I have a problem like the following: + +1) There are six measurements for each individual with large within-subject variance + +2) There are two groups (Treatment and Control) + +3) Each Group consists of 5 individuals + +4) I want to perform a significance test comparing the two groups to know if the group means are different from one another. + + +The data looks like this: +![http://s10.postimg.org/p9krg6f3t/examp.png][1] + +And I have run some simulations using this code which does t tests to compare the group means. The group means were calculated by taking the means of the individual means. 
**This ignores within-subject variability**: + + + n.simulations<-10000 + pvals=matrix(nrow=n.simulations,ncol=1) + for(k in 1:n.simulations){ + subject=NULL + for(i in 1:10){ + subject<-rbind(subject,as.matrix(rep(i,6))) + } + #set.seed(42) + + #Sample Subject Means + subject.means<-rnorm(10,100,2) + + #Sample Individual Measurements + values=NULL + for(sm in subject.means){ + values<-rbind(values,as.matrix(rnorm(6,sm,20))) + } + + out<-cbind(subject,values) + + #Split into GroupA and GroupB + GroupA<-out[1:30,] + GroupB<-out[31:60,] + + #Add effect size to GroupA + GroupA[,2]<-GroupA[,2]+0 + + colnames(GroupA)<-c(""Subject"", ""Value"") + colnames(GroupB)<-c(""Subject"", ""Value"") + + #Calculate Individual Means and SDS + GroupA.summary=matrix(nrow=length(unique(GroupA[,1])), ncol=2) + for(i in 1:length(unique(GroupA[,1]))){ + GroupA.summary[i,1]<-mean(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2]) + GroupA.summary[i,2]<-sd(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2]) + } + colnames(GroupA.summary)<-c(""Mean"",""SD"") + + + GroupB.summary=matrix(nrow=length(unique(GroupB[,1])), ncol=2) + for(i in 1:length(unique(GroupB[,1]))){ + GroupB.summary[i,1]<-mean(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2]) + GroupB.summary[i,2]<-sd(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2]) + } + colnames(GroupB.summary)<-c(""Mean"",""SD"") + + Summary<-rbind(cbind(1,GroupA.summary),cbind(2,GroupB.summary)) + colnames(Summary)[1]<-""Group"" + + pvals[k]<-t.test(GroupA.summary[,1],GroupB.summary[,1], var.equal=T)$p.value + } + + +And here is code for plots: + + + #Plots + par(mfrow=c(2,2)) + boxplot(GroupA[,2]~GroupA[,1], col=""Red"", main=""Group A"", + ylim=c(.9*min(out[,2]),1.1*max(out[,2])), + xlab=""Subject"", ylab=""Value"") + stripchart(GroupA[,2]~GroupA[,1], vert=T, pch=16, add=T) + #abline(h=mean(GroupA[,2]), lty=2, lwd=3) + + for(i in 1:length(unique(GroupA[,1]))){ + m<-mean(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2]) + ci<-t.test(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2])$conf.int[1:2] + + points(i-.2,m, pch=15,cex=1.5, col=""Grey"") + segments(i-.2, + ci[1],i-.2, + ci[2], lwd=4, col=""Grey"" + ) + } + legend(""topleft"", legend=c(""Individual Means +/- 95% CI""), bty=""n"", pch=15, lwd=3, col=""Grey"") + + + boxplot(GroupB[,2]~GroupB[,1], col=""Light Blue"", main=""Group B"", + ylim=c(.9*min(out[,2]),1.1*max(out[,2])), + xlab=""Subject"", ylab=""Value"") + stripchart(GroupB[,2]~GroupB[,1], vert=T, pch=16, add=T) + #abline(h=mean(GroupB[,2]), lty=2, lwd=3) + + for(i in 1:length(unique(GroupB[,1]))){ + m<-mean(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2]) + ci<-t.test(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2])$conf.int[1:2] + + points(i-.2,m, pch=15,cex=1.5, col=""Grey"") + segments(i-.2, + ci[1],i-.2, + ci[2], lwd=4, col=""Grey"" + ) + } + legend(""topleft"", legend=c(""Individual Means +/- 95% CI""), bty=""n"", pch=15, lwd=3, col=""Grey"") + + + boxplot(Summary[,2]~Summary[,1], col=c(""Red"",""Light Blue""), xlab=""Group"", ylab=""Average Value"", + ylim=c(.9*min(Summary[,2]),1.1*max(Summary[,2])), + main=""Individual Averages"") + stripchart(Summary[,2]~Summary[,1], vert=T, pch=16, add=T) + + points(.9, mean(GroupA.summary[,1]), pch=15,cex=1.5, col=""Grey"") + segments(.9, + t.test(GroupA.summary[,1])$conf.int[1],.9, + t.test(GroupA.summary[,1])$conf.int[2], lwd=4, col=""Grey"" + ) + + points(1.9, mean(GroupB.summary[,1]), pch=15,cex=1.5, col=""Grey"") + segments(1.9, + t.test(GroupB.summary[,1])$conf.int[1],1.9, + 
t.test(GroupB.summary[,1])$conf.int[2], lwd=4, col=""Grey"" + ) + legend(""topleft"", legend=c(""Group Means +/- 95% CI""), bty=""n"", pch=15, lwd=3, col=""Grey"") + + + hist(pvals, breaks=seq(0,1,by=.05), col=""Grey"", + main=c(paste(""# sims="", n.simulations), + paste(""% Sig p-values="",100*length(which(pvals<0.05))/length(pvals))) + ) + +Now, it seems to me that because each individual mean is an estimate itself, that we should be less certain about the group means than shown by the 95% confidence intervals indicated by the bottom-left panel in the figure above. Thus the p-values calculated are underestimating the true variability and should lead to increased false-positives if we wish to extrapolate to future data. + +So what is the correct way to analyze this data? + + +**Bonus:** + +The example above is a simplification. For the actual data: + +1) The within-subject variance is positively correlated with the mean. + +2) Values can only be multiples of two. + +3) The individual results are not roughly normally distributed. They suffer from zero floor effect, and have long tails at the positive end. + +4) Number of Subjects in each group are not necessarily equal. + +Previous literature has used the t-test ignoring within-subject variability and other nuances as was done for the simulations above. Are these results reliable? If I can extract some means and standard errors from the figures how would I calculate the ""correct"" p-values. + + [1]: https://i.stack.imgur.com/55V9J.png",,2013-10-10 13:47:37.470 +185486,57223,15827.0,5,,CC BY-SA 3.0,0b488283-6a15-44b5-b08c-874f71896025,"I have a problem like the following: + +1) There are six measurements for each individual with large within-subject variance + +2) There are two groups (Treatment and Control) + +3) Each group consists of 5 individuals + +4) I want to perform a significance test comparing the two groups to know if the group means are different from one another. + + +The data looks like this: +![http://s10.postimg.org/p9krg6f3t/examp.png][1] + +And I have run some simulations using this code which does t tests to compare the group means. The group means were calculated by taking the means of the individual means. 
**This ignores within-subject variability**: + + + n.simulations<-10000 + pvals=matrix(nrow=n.simulations,ncol=1) + for(k in 1:n.simulations){ + subject=NULL + for(i in 1:10){ + subject<-rbind(subject,as.matrix(rep(i,6))) + } + #set.seed(42) + + #Sample Subject Means + subject.means<-rnorm(10,100,2) + + #Sample Individual Measurements + values=NULL + for(sm in subject.means){ + values<-rbind(values,as.matrix(rnorm(6,sm,20))) + } + + out<-cbind(subject,values) + + #Split into GroupA and GroupB + GroupA<-out[1:30,] + GroupB<-out[31:60,] + + #Add effect size to GroupA + GroupA[,2]<-GroupA[,2]+0 + + colnames(GroupA)<-c(""Subject"", ""Value"") + colnames(GroupB)<-c(""Subject"", ""Value"") + + #Calculate Individual Means and SDS + GroupA.summary=matrix(nrow=length(unique(GroupA[,1])), ncol=2) + for(i in 1:length(unique(GroupA[,1]))){ + GroupA.summary[i,1]<-mean(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2]) + GroupA.summary[i,2]<-sd(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2]) + } + colnames(GroupA.summary)<-c(""Mean"",""SD"") + + + GroupB.summary=matrix(nrow=length(unique(GroupB[,1])), ncol=2) + for(i in 1:length(unique(GroupB[,1]))){ + GroupB.summary[i,1]<-mean(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2]) + GroupB.summary[i,2]<-sd(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2]) + } + colnames(GroupB.summary)<-c(""Mean"",""SD"") + + Summary<-rbind(cbind(1,GroupA.summary),cbind(2,GroupB.summary)) + colnames(Summary)[1]<-""Group"" + + pvals[k]<-t.test(GroupA.summary[,1],GroupB.summary[,1], var.equal=T)$p.value + } + + +And here is code for plots: + + + #Plots + par(mfrow=c(2,2)) + boxplot(GroupA[,2]~GroupA[,1], col=""Red"", main=""Group A"", + ylim=c(.9*min(out[,2]),1.1*max(out[,2])), + xlab=""Subject"", ylab=""Value"") + stripchart(GroupA[,2]~GroupA[,1], vert=T, pch=16, add=T) + #abline(h=mean(GroupA[,2]), lty=2, lwd=3) + + for(i in 1:length(unique(GroupA[,1]))){ + m<-mean(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2]) + ci<-t.test(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2])$conf.int[1:2] + + points(i-.2,m, pch=15,cex=1.5, col=""Grey"") + segments(i-.2, + ci[1],i-.2, + ci[2], lwd=4, col=""Grey"" + ) + } + legend(""topleft"", legend=c(""Individual Means +/- 95% CI""), bty=""n"", pch=15, lwd=3, col=""Grey"") + + + boxplot(GroupB[,2]~GroupB[,1], col=""Light Blue"", main=""Group B"", + ylim=c(.9*min(out[,2]),1.1*max(out[,2])), + xlab=""Subject"", ylab=""Value"") + stripchart(GroupB[,2]~GroupB[,1], vert=T, pch=16, add=T) + #abline(h=mean(GroupB[,2]), lty=2, lwd=3) + + for(i in 1:length(unique(GroupB[,1]))){ + m<-mean(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2]) + ci<-t.test(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2])$conf.int[1:2] + + points(i-.2,m, pch=15,cex=1.5, col=""Grey"") + segments(i-.2, + ci[1],i-.2, + ci[2], lwd=4, col=""Grey"" + ) + } + legend(""topleft"", legend=c(""Individual Means +/- 95% CI""), bty=""n"", pch=15, lwd=3, col=""Grey"") + + + boxplot(Summary[,2]~Summary[,1], col=c(""Red"",""Light Blue""), xlab=""Group"", ylab=""Average Value"", + ylim=c(.9*min(Summary[,2]),1.1*max(Summary[,2])), + main=""Individual Averages"") + stripchart(Summary[,2]~Summary[,1], vert=T, pch=16, add=T) + + points(.9, mean(GroupA.summary[,1]), pch=15,cex=1.5, col=""Grey"") + segments(.9, + t.test(GroupA.summary[,1])$conf.int[1],.9, + t.test(GroupA.summary[,1])$conf.int[2], lwd=4, col=""Grey"" + ) + + points(1.9, mean(GroupB.summary[,1]), pch=15,cex=1.5, col=""Grey"") + segments(1.9, + t.test(GroupB.summary[,1])$conf.int[1],1.9, + 
t.test(GroupB.summary[,1])$conf.int[2], lwd=4, col=""Grey"" + ) + legend(""topleft"", legend=c(""Group Means +/- 95% CI""), bty=""n"", pch=15, lwd=3, col=""Grey"") + + + hist(pvals, breaks=seq(0,1,by=.05), col=""Grey"", + main=c(paste(""# sims="", n.simulations), + paste(""% Sig p-values="",100*length(which(pvals<0.05))/length(pvals))) + ) + +Now, it seems to me that because each individual mean is an estimate itself, that we should be less certain about the group means than shown by the 95% confidence intervals indicated by the bottom-left panel in the figure above. Thus the p-values calculated are underestimating the true variability and should lead to increased false-positives if we wish to extrapolate to future data. + +So what is the correct way to analyze this data? + + +**Bonus:** + +The example above is a simplification. For the actual data: + +1) The within-subject variance is positively correlated with the mean. + +2) Values can only be multiples of two. + +3) The individual results are not roughly normally distributed. They suffer from zero floor effect, and have long tails at the positive end. + +4) Number of Subjects in each group are not necessarily equal. + +Previous literature has used the t-test ignoring within-subject variability and other nuances as was done for the simulations above. Are these results reliable? If I can extract some means and standard errors from the figures how would I calculate the ""correct"" p-values. + + [1]: https://i.stack.imgur.com/55V9J.png",deleted 30 characters in body,2013-10-10 13:50:36.427 +185488,57224,3731.0,2,,CC BY-SA 3.0,ee2f3f02-fd21-4ba3-9722-8e2ed09b0088,"To close the loop for those who don't want to follow the comment thread above: + +No. People do not normally compile these things because they are specific to both the particular model chosen and the particular data onto which the model is fit. To talk of a ""typical"" number is not well posed. + +If someone would like to post a more comprehensive answer, I will be happy to ""unaccept"" this answer and accept theirs instead. ",,2013-10-10 14:09:23.710 +185489,57225,503.0,2,,CC BY-SA 3.0,ae7e0ec3-f2ed-4d09-93b3-6323f5180150,"I think the choice depends on the audience that will read whatever you write. + +If they are mostly statistically unsophisticated, I'd say you could use the original data set and put a footnote about how multiple imputation did not change things much. If they are more sophisticated, I'd go with the MI analysis. Even if things don't change ""much"" they change *some* and the MI is a better approach. + +Also, be careful that you looked at *all* the output for what changed (or didn't). Not just parameter estimates but their standard errors (or whatever your analysis involves - you didn't say what analysis you did, so it's hard to say what might be affected). ",,2013-10-10 14:16:53.693 +185490,57221,594.0,5,,CC BY-SA 3.0,b1323a1c-406c-49bf-bd9e-b9f7a2ca1249,"If say for a random variable $X$, I have observation of $x_1,x_2,x_3,\ldots,x_n$. Let $m$ be the sample mean, and $s$ be the sample standard deviation. Does the new random variable $(X-m)/s$ follow some distribution? It's not $t$-distribution I guess, since for it to be $t$ distributed, $m$ needs to be replaced by true mean. 
+ +Can statistics expert shed some light on this?","formatting, spelling/grammar",2013-10-10 14:21:48.783 +185494,57226,22566.0,2,,CC BY-SA 3.0,d89df24f-4793-49a7-871c-e5fffca843a2,The above presented formulas are available in the SPSS help: Help > Algorithms > Multiple Imputation: Pooling Algorithms > Rubin's Rules (multiple imputation algorithms) > Combining Results after Multiple Imputation,,2013-10-10 14:44:17.203 +185496,57227,9792.0,2,,CC BY-SA 3.0,de5d42f5-a1e6-4faa-8a6b-0dc0e577f67c,"I think the question why splines cannot be centered arose out of a misunderstanding of how splines function. It seems that splines don't model an intercept and thus centering is impossible. It would, however, be great if someone had another solution to estimating the group differences at different time points when modelling more complex dynamics. +",,2013-10-10 15:02:05.507 +185497,57223,,25,,,f7504841-04ec-4981-80cf-2f089fe58be2,,http://twitter.com/#!/StackStats/status/388321984008159232,2013-10-10 15:15:50.387 +185498,57228,22568.0,2,,CC BY-SA 3.0,c16d3942-e5bb-4e4f-ba17-660ba10e344c,"I am an expert GIS user moving towards R more and more. I have been using R for some basic regressions and such, but I would like to begin to use and manipulate GIS data in R. + +How can I create a basemap graphic similar to the one in this post: +http://stats.stackexchange.com/questions/72421/showing-spatial-and-temporal-correlation-on-maps + +Again, I am a beginner in R and haven't found any other related thread here. + +thanks, +mike",,2013-10-10 15:23:08.293 +185500,57228,22568.0,3,,CC BY-SA 3.0,c16d3942-e5bb-4e4f-ba17-660ba10e344c,,,2013-10-10 15:23:08.293 +185499,57228,22568.0,1,,CC BY-SA 3.0,c16d3942-e5bb-4e4f-ba17-660ba10e344c,How to create a GIS basemap in R?,,2013-10-10 15:23:08.293 +186162,57417,22425.0,1,,CC BY-SA 3.0,e6884711-0a34-4459-adbd-ae5d96027db4,Independent but not identically distributed,,2013-10-14 03:02:05.353 +185503,57229,22567.0,2,,CC BY-SA 3.0,1f5099f9-3c2e-4c5b-bbdf-487714206543,"I have a regression problem where the independent variables are all factors (categorical). I've been looking at the literature on missing data, and so far it all seems concerned with missing training data. I was wondering if there is a standard way of dealing with missing data in the *prediction* set. That is, you have all the data you need to train, but then you need to be able to make a prediction with only partial data. This must have been a studied problem. + +My initial thought is to use an average of the dummy encoded variables, according to how common they are. As a quick example, say we have a three level factor dummy encoded as + +level 1: [1 0] + +level 2: [0 1] + +level 3: [0 0] + +say level i occurs fraction f_i of the time in the training data (so sum(i, f_i)==1) + +say the regression has the two coefficients \beta_1 and \beta_2. + +Then a missing value in this factor might be estimated as: + +\beta_1*f_1 + \beta_2*f_2 + 0*f_3 + +But given that the ""default"" level encoding are shared across factors, I'm not sure I'm handling level 3 correctly in this case. + +Any suggestions? 
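To make the weighting idea above concrete, here is a minimal sketch on made-up data; the data frame, level names, and the absence of any real effect are all hypothetical, and R's default treatment (reference-level) coding is assumed:

    # Hypothetical training data with a three-level factor
    set.seed(1)
    train <- data.frame(f = factor(sample(c("l1", "l2", "l3"), 200, replace = TRUE)),
                        y = rnorm(200))
    fit <- lm(y ~ f, data = train)

    # Marginal level probabilities estimated from the training data
    p_lev <- prop.table(table(train$f))

    # Prediction for a new case whose factor value is missing:
    # average the level-specific predictions, weighted by those probabilities
    preds <- predict(fit, newdata = data.frame(f = factor(names(p_lev), levels = levels(train$f))))
    sum(preds * p_lev)

Averaging the fitted values this way keeps the intercept (the reference-level mean) inside the weighted sum rather than treating the reference level as zero.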
+ +Thanks, +Craig + + + +",,2013-10-10 15:23:53.377 +185502,57229,22567.0,1,,CC BY-SA 3.0,1f5099f9-3c2e-4c5b-bbdf-487714206543,Dealing with missing data in the prediction set only,,2013-10-10 15:23:53.377 +185501,57229,22567.0,3,,CC BY-SA 3.0,1f5099f9-3c2e-4c5b-bbdf-487714206543,,,2013-10-10 15:23:53.377 +185505,57230,22569.0,1,,CC BY-SA 3.0,66b38f64-7127-4af7-b557-c287bd9dc419,Represent data across multiple categories and sub categories,,2013-10-10 15:25:25.017 +185506,57230,22569.0,3,,CC BY-SA 3.0,66b38f64-7127-4af7-b557-c287bd9dc419,,,2013-10-10 15:25:25.017 +185504,57230,22569.0,2,,CC BY-SA 3.0,66b38f64-7127-4af7-b557-c287bd9dc419,"The data contains category and sub category distributions. + +Category A - 100 + -- Level One - 40 + -- Level Two - 30 + -- Level Three - 30 + +Category B - 50 + -- Level One - 10 + -- Level Two - 15 + -- Level Three - 25 + +The sub categories are always same (Level one, level two, level three) +The sum of all sub categories is equal to the total of the main category. +Each graph would contain up to 5 main categories. + +What is the best way to represent this data?",,2013-10-10 15:25:25.017 +185508,57230,22569.0,5,,CC BY-SA 3.0,6af05a89-0446-489d-8beb-b2828486ac81,"The data contains category and sub category distributions. +The category are topics in a quiz such as: Music, Sports, Business. + +Each category has three levels to choose from: Basic, Standard and Advanced. + +I want to represent this on a graph: +Ex: A user might take a quiz on Music across different levels. Say the number of questions attempted is 100. The user would have answered them across levels. 40 for basic, 40 for standard and 20 for advanced. + + +Each graph would contain up to 5 main categories. + +What is the best way to represent this data?",added 56 characters in body,2013-10-10 15:39:29.993 +185511,57231,22381.0,2,,CC BY-SA 3.0,1ce69f54-467b-492d-bd90-3dc682223e0e,"I am using squared return as a proxy to calculate volatility, however i'm not sure whether to use absolute return or percentage return. Under absolute return all volatility estimates are below 1, however under percentage return there is a mix of volatility greater than 1 and less than 1. Percentage return below 1 would end up as a volatility figure less than the percentage return itself, on the other hand percentage return above 1 would end up as a volatility figure greater than percentage return. + +My question is: Doesn't this pose a problem when calculating volatility in that there is an over estimation when the return is above 1? + +I am going to use the data to fit an ARMA-GARCH model, would there be any difference if I used percentage or absolute values?",,2013-10-10 15:42:26.310 +185510,57231,22381.0,1,,CC BY-SA 3.0,1ce69f54-467b-492d-bd90-3dc682223e0e,Absolute Return vs Percentage Return to Calculate Volatility,,2013-10-10 15:42:26.310 +185509,57231,22381.0,3,,CC BY-SA 3.0,1ce69f54-467b-492d-bd90-3dc682223e0e,,,2013-10-10 15:42:26.310 +185512,57231,,6,,CC BY-SA 3.0,ffb63273-b3e1-4a21-8ea1-5bb1b80122cd,,Added additional tags,2013-10-10 15:45:28.413 +185513,57231,,24,,CC BY-SA 3.0,ffb63273-b3e1-4a21-8ea1-5bb1b80122cd,,"Proposed by 3826 approved by 805, 7290 edit id of 5587",2013-10-10 15:45:28.413 +185514,57232,2873.0,2,,CC BY-SA 3.0,7c95c624-c1a5-4453-8a80-74872c0d4392,"The `ns` function (and other spline functions) does its own ""centering"" of the data. 
Consider this example: + + > library(splines) + > + > s1 <- ns( 1:10, 3 ) + > s2 <- ns( (1:10)-5, 3 ) + > + > all.equal(s1,s2) + [1] ""Attributes: < Component 1: Mean relative difference: 0.9090909 >"" + [2] ""Attributes: < Component 7: Mean relative difference: 0.9090909 >"" + > all.equal(as.vector(s1),as.vector(s2)) + [1] TRUE + +So the centering of the data leads to the same splines as the uncentered data (other than the knot information in the attributes). So centering your variable before computing a spline has no effect. If you want to compare the values at a point other than 0 then just use the `predict` function to get the actual predictions at the point of interest and compare (subtract).",,2013-10-10 15:46:38.177 +185515,57230,594.0,5,,CC BY-SA 3.0,fea96626-e807-47d8-96cc-384b557c5618,"The data contains category and sub-category distributions. + +The categories are topics in a quiz such as: Music, Sports, Business. + +Each category has three levels to choose from: Basic, Standard and Advanced. + +I want to represent this on a graph. + +For example: A user might take a quiz on Music across different levels. Say the number of questions attempted is 100. The user would have answered them across levels. 40 for basic, 40 for standard and 20 for advanced. + +Each graph would contain up to 5 main categories. + +What is the best way to represent this data?","spelling, formatting etc",2013-10-10 15:47:46.300 +185517,57229,5237.0,6,,CC BY-SA 3.0,96854c05-d309-4d74-9f19-a12d68610e10,,added tag; formatted; removed signature; light editing,2013-10-10 15:50:05.603 +185516,57229,5237.0,5,,CC BY-SA 3.0,96854c05-d309-4d74-9f19-a12d68610e10,"I have a regression problem where the independent variables are all factors (categorical). I've been looking at the literature on missing data, and so far it all seems concerned with missing training data. I was wondering if there is a standard way of dealing with missing data in the *prediction* set. That is, you have all the data you need to train, but then you need to be able to make a prediction with only partial data. This must have been a studied problem. + +My initial thought is to use an average of the dummy encoded variables, according to how common they are. As a quick example, say we have a three level factor dummy encoded as + + level 1: [1 0] + level 2: [0 1] + level 3: [0 0] + +Say level $i$ occurs fraction $f_i$ of the time in the training data (so $\sum(i, f_i)=1$), and the regression has the two coefficients $\beta_1$ and $\beta_2$. + +Then a missing value in this factor might be estimated as: +$$ +\beta_1*f_1 + \beta_2*f_2 + 0*f_3 +$$ +But given that the ""default"" level encoding are shared across factors, I'm not sure I'm handling level 3 correctly in this case. + +",added tag; formatted; removed signature; light editing,2013-10-10 15:50:05.603 +185519,57231,22381.0,5,,CC BY-SA 3.0,a1b41d34-9ef5-474f-b2ac-7e6355074355,"I am using squared return as a proxy to calculate volatility, however i'm not sure whether to use absolute return or percentage return. Under absolute return all return estimates are below 1, however under percentage return there is a mix of return greater than 1 and less than 1. Percentage return below 1 would end up as a volatility figure less than the percentage return itself, on the other hand percentage return above 1 would end up as a volatility figure greater than percentage return. + +My question is: Doesn't this pose a problem when calculating volatility in that there is an over estimation when the return is above 1? 
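A tiny numeric illustration of the scaling point raised above (the return values are hypothetical):

    r_decimal <- 0.004   # a 0.4% return written as a decimal
    r_percent <- 0.4     # the same return written in percent units
    r_decimal^2          # 1.6e-05 -- far smaller than the return itself
    r_percent^2          # 0.16    -- smaller than 0.4 only because 0.4 < 1
    1.5^2                # 2.25    -- a value above 1 squares to something larger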
+ +I am going to use the data to fit an ARMA-GARCH model, would there be any difference if I used percentage or absolute values?",deleted 8 characters in body,2013-10-10 15:52:02.787 +185521,57233,5237.0,2,,CC BY-SA 3.0,2dbe7e85-e74e-4e8c-a427-b39c876d392c,"*I'll let someone else address the estimation of the missing data. (At first glance, it looks reasonable to me.)* + +You are right that you are not handling level 3 correctly. The coding scheme that you use in your question set up is known as *reference level coding*. To use this approach correctly, you need to have an intercept (i.e., $\beta_0$), which estimates the mean of level 3. I suspect you do have such, even though you didn't list it. In this case, you would just add the intercept to your final equation. That is: +$$ +\beta_0 + \beta_1\!*\!f_1 + \beta_2\!*\!f_2 + 0\!*\!f_3 +$$ +Note that you don't have to do it this way. You can suppress the intercept and use a dummy for each level of your factor. Some people prefer this because they think it is conceptually clearer. However, it is exactly the same as above, just with the labels shifted. ",,2013-10-10 15:56:30.450 +185522,57229,22567.0,5,,CC BY-SA 3.0,e6ce2a49-7902-4cc9-9ab6-f66e260654a6,"I have a regression problem where the independent variables are all factors (categorical). I've been looking at the literature on missing data, and so far it all seems concerned with missing training data. I was wondering if there is a standard way of dealing with missing data in the *prediction* set. That is, you have all the data you need to train, but then you need to be able to make a prediction with only partial data. This must have been a studied problem. + +My initial thought is to use an average of the dummy encoded variables, according to how common they are. As a quick example, say we have a three level factor dummy encoded as + + level 1: [1 0] + level 2: [0 1] + level 3: [0 0] + +Say level $i$ occurs fraction $f_i$ of the time in the training data (so $\sum_i{f_i}=1$), and the regression has the two coefficients $\beta_1$ and $\beta_2$. + +Then a missing value in this factor might be estimated as: +$$ +\beta_1*f_1 + \beta_2*f_2 + 0*f_3 +$$ +But given that the ""default"" level encoding are shared across factors, I'm not sure I'm handling level 3 correctly in this case. + +",fix summation equation,2013-10-10 15:59:24.710 +185523,57234,2873.0,2,,CC BY-SA 3.0,96a3f4ab-c578-4c38-aab4-b646e9b93679,"You should read through the Spatial and possibly SpatialTemporal [Taskviews on CRAN][1]. Those will give you an idea of what packages are available and gives brief descriptions of what they do and how they compare. + + + [1]: http://cran.r-project.org/web/views/",,2013-10-10 16:05:27.410 +185524,57235,10060.0,2,,CC BY-SA 3.0,cf72a2a8-053a-48a4-9151-721277e79553,"R by itself does not handle GIS type of work but with different add-ons it can be a quite potent GIS device. You'd need to understand the idea of ""package"" (user-contributed scripts) and how to use `install.packages(""whateverPackage"")` command to install them. + +I don't use R in GIS enough to show you the whole topography (pun totally intended), but the most commonly used packages I have seen are `map`, `ggmap`, `ggplot2`, `RgoogleMaps`, and `plotGoogleMap`. 
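As a minimal sketch of getting a first basemap on screen, assuming only the `maps` package (one of the options mentioned above) and an arbitrary example coordinate:

    # install.packages("maps")   # once, if not already installed
    library(maps)
    map("state")                                 # draw a US state basemap
    points(-93.6, 42.0, pch = 16, col = "red")   # overlay a point at (longitude, latitude)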
+ +Also, check out some sites and tutorials about this topic: [1](http://www.nyu.edu/projects/politicsdatalab/workshops/GISwR.pdf), [2](http://cran.r-project.org/web/views/Spatial.html), [3](http://journal.r-project.org/archive/2013-1/kahle-wickham.pdf), and [4](http://www.icesi.edu.co/CRAN/web/packages/plotGoogleMaps/vignettes/plotGoogleMaps-intro.pdf). These got me started and within a day I could make some silly maps. + +Also, this [pdf](http://statacumen.com/teach/SC1/SC1_16_Maps.pdf) probably contain some code pertinent to the map you wish to create. Its $\LaTeX$ format is a bit off, but you can still get some general functionality and key commands. + +Good luck!",,2013-10-10 16:08:02.663 +185527,57236,22572.0,3,,CC BY-SA 3.0,2ba32bca-4415-4c06-b270-6b1a63c06d83,,,2013-10-10 16:10:12.377 +185526,57236,22572.0,1,,CC BY-SA 3.0,2ba32bca-4415-4c06-b270-6b1a63c06d83,"Kaplan Meier - Can I use to assess recovery of function, not just loss?",,2013-10-10 16:10:12.377 +185525,57236,22572.0,2,,CC BY-SA 3.0,2ba32bca-4415-4c06-b270-6b1a63c06d83,"In my experience and readings, Kaplan-Meier has always been used to calculate differential survival between a certain number of groups. However, Im looking to assess how time to recovery from a certain event as measured by activity levels. At time zero, everyone is essentially ""dead"" (non-mobile), and with time they regain mobility. +seems like a ""negative"" kaplan-meier, is that possible? Or should I be looking at a different modeling strategy?",,2013-10-10 16:10:12.377 +185528,57228,5237.0,5,,CC BY-SA 3.0,0d624180-8f86-4f18-87d4-e02b3abc0064,"I am an expert GIS user moving towards R more and more. I have been using R for some basic regressions and such, but I would like to begin to use and manipulate GIS data in R. + +How can I create a basemap graphic similar to the one in this post: +http://stats.stackexchange.com/questions/72421/showing-spatial-and-temporal-correlation-on-maps + +Again, I am a beginner in R and haven't found any other related thread here. +",removed signature,2013-10-10 16:10:14.737 +185529,57235,10060.0,5,,CC BY-SA 3.0,b8bc9203-7a71-467b-88d6-4b29cd99cae4,"R by itself does not handle GIS type of work but with different add-ons it can be a quite potent GIS device. You'd need to understand the idea of ""package"" (user-contributed scripts) and how to use `install.packages(""whateverPackage"")` command to install them. + +I don't use R in GIS enough to show you the whole topography (pun totally intended), but the most commonly used packages I have seen are `map`, `ggmap`, `ggplot2`, `RgoogleMaps`, and `plotGoogleMap`. + +Also, check out some sites and tutorials about this topic: [1](http://www.nyu.edu/projects/politicsdatalab/workshops/GISwR.pdf), [2](http://cran.r-project.org/web/views/Spatial.html), [3](http://journal.r-project.org/archive/2013-1/kahle-wickham.pdf), and [4](http://www.icesi.edu.co/CRAN/web/packages/plotGoogleMaps/vignettes/plotGoogleMaps-intro.pdf). These got me started and within a day I could make some silly maps. + +Lastly, this [pdf](http://statacumen.com/teach/SC1/SC1_16_Maps.pdf) probably contains some codes pertinent to the map you wish to create. Its $\LaTeX$ format is a bit off, but you can still get some general functionality and key commands. 
+ +Good luck!",added 2 characters in body,2013-10-10 16:15:27.743 +185530,57236,594.0,5,,CC BY-SA 3.0,b1f8b7b3-792e-4bd2-92c1-93c18b2d8940,"In my experience and readings, Kaplan-Meier has always been used to calculate differential survival between a certain number of groups. However, I'm looking to assess how time to recovery from a certain event as measured by activity levels. At time zero, everyone is essentially ""dead"" (non-mobile), and with time they regain mobility. +seems like a ""negative"" Kaplan-Meier, is that possible? Or should I be looking at a different modeling strategy?",added 1 characters in body,2013-10-10 16:20:04.957 +185531,57233,5237.0,5,,CC BY-SA 3.0,43adffab-4dcd-42fa-aaa0-8210755cbf03,"*(I'll let someone else address the estimation of the missing data. You may want to model the probability that the observation is each level of the unknown factor directly using knowledge of other covariate values, and possibly outside information, e.g., priors etc. However, at first glance your approach looks reasonable to me.)* + +One note is that I can't tell from your description if you are weighting by *raw frequencies*. If so, you want to divide these by $N$ to get the marginal *probabilities* instead. + +You are right that you are not handling level 3 correctly. The coding scheme that you use in your question set up is known as *reference level coding*. To use this approach correctly, you need to have an intercept (i.e., $\beta_0$), which estimates the mean of level 3. I suspect you do have such, even though you didn't list it. In this case, you would just add the intercept to your final equation. That is: +$$ +\beta_0\!*\!f_3 + \beta_1\!*\!f_1 + \beta_2\!*\!f_2 +$$ +Note that you are multiplying the intercept (which encodes the reference level) by the marginal probability that the observation is actually the reference level. ",added 295 characters in body,2013-10-10 16:20:30.040 +185534,57237,22570.0,2,,CC BY-SA 3.0,23b9ecac-78b5-4afa-a218-b6bc9d6a27ec,"I'm trying to generate sets of causally connected random variables and started off doing this with a monte carlo approach. + +The baseline is a 2-dimensional measured histogram from which I draw random values. + +In my concrete examples these variables are acceleration $\bf{a}$ and velocity $\bf{v}$ - so obviously +$v_{i+1} = v_{i} + a_i * dt$ +has to hold. + +My current naive approach is: + +I start with a some $v_0$. +Then I generate a random $a_0$ according to the measured probability of $\bf{a}$ for the value of $v_0$. Using this $a_0$ I can calculate $v_1$ and the whole procedure starts over again. + +So when I check the generated accelerations $\bf{a}$ in bins of $\bf{v}$ everything's fine. +But I obviously this does not at all respect the marginal distribution of $\bf{v}$. + +I'm kind of familiar with basic monte carlo methods, though lacking some theoretical background as you might guess. +I'd be fine if the two variables where *just* connected by some correlation matrix, but the causal connection between the two gives me headaches. + +I didn't manage to find an example for this kind of problem somewhere - I might be googl'ing the wrong terms. +I'd be satisfied if somebody could point me to some literature/example or promising method to get a hold on this. + +(Or tell me that's is not really possible given my inputs - that's what I'm guessing occasionally...) 
+",,2013-10-10 16:26:50.467 +185533,57237,22570.0,3,,CC BY-SA 3.0,23b9ecac-78b5-4afa-a218-b6bc9d6a27ec,,,2013-10-10 16:26:50.467 +185532,57237,22570.0,1,,CC BY-SA 3.0,23b9ecac-78b5-4afa-a218-b6bc9d6a27ec,Generating causally dependent random variables,,2013-10-10 16:26:50.467 +185535,52871,,5,,CC BY-SA 3.0,26608ba7-0af3-4994-a6e7-c99c2918feeb,"I want to analyze a multilevel multidimensional model in WinBUGS. the model is as below (N=2362 students responding to K=45 items of a test, students are nested within J=116 schools): + + model{ + #responses + for(i in 1:N){ + for(j in 1:K){ + logit(p[i,j])<- a1[j]*t[i,1]+a2[j]*t[i,2]-b[j] + y[i,j]~dbern(p[i,j] ) + } + t[i,1:2]~dmnorm(mu[sc[i],1:2],tau.p[1:2,1:2]) + } + #school level + for(j in 1:J){ + mu[j,1:2]~dmnorm(m[j,1:2],tau.s[1:2,1:2]) + } + + #priors + for(j in 1:J){ + m[j,1:2]~dmnorm(m0[1:2],cov[1:2,1:2]) + } + + tau.p[1:2,1:2]~dwish(cov[1:2,1:2],2) + tau.s[1:2,1:2]~dwish(cov[1:2,1:2],2) + sigma.p[1:2,1:2]<-inverse(tau.p[,]) + sigma.s[1:2,1:2]<-inverse(tau.s[,]) + s2p<-sum(sigma.p[,]) + s2s<-sum(sigma.s[,]) + rho<-(s2s)/(s2s+s2p) + + a1[1]~dlnorm(0,4) + a2[1]<-0 + b[1]~dnorm(0,1) + for(s in 2:K) { + a1[s]~dlnorm(0,4) + a2[s]~dlnorm(0,4) + b[s]~dnorm(0,1) + } + } + +I've set these functions as initial values: + + ini<-function(){ + list(tau.p=matrix(rgamma(4,100,100),2,2), + tau.s=matrix(rgamma(4,100,100),2,2), + t=rmvnorm(N,mean=c(0,0),sigma=diag(2)), + m=rmvnorm(J,mean=c(0,0),sigma=diag(2)), + mu=rmvnorm(J,mean=c(0,0),sigma=diag(2)), + a1=rlnorm(K,0, 0.4), + a2=c(NA,rlnorm(K-1,0, 0.4)), + b=rnorm(45,0,0.5)) + } +I use rube package in R to check and run my analysis and everything looks fine. When I run the model I receive ""Trap 66 (postcondition violated)"" or ""undefined real result"". I think the problem is from the initials but I have no idea how to solve it. + +Any idea? +",tidied up model to make it more legable,2013-10-10 16:37:14.287 +185536,52871,,24,,CC BY-SA 3.0,26608ba7-0af3-4994-a6e7-c99c2918feeb,,"Proposed by 13267 approved by 805, 919 edit id of 5588",2013-10-10 16:37:14.287 +185539,57238,8414.0,3,,CC BY-SA 3.0,ad6f0a81-99c3-4bab-a025-850786c926b7,,,2013-10-10 16:41:50.710 +185538,57238,8414.0,1,,CC BY-SA 3.0,ad6f0a81-99c3-4bab-a025-850786c926b7,Simulating data to fit a mediation model,,2013-10-10 16:41:50.710 +185537,57238,8414.0,2,,CC BY-SA 3.0,ad6f0a81-99c3-4bab-a025-850786c926b7,"I am interested in finding a procedure to simulate data that are consistent with a specified mediation model. According to the general linear structural equation model framework for testing mediation models first outlined by [Barron and Kenny (1986)][1] and described elsewhere such as [Judd, Yzerbyt, & Muller (2013)][2], mediation models for outcome `Y`, mediator `med`, and predictor `X` and are governed by the following three regression equations: + +1. Y = b11 + b12 * X + e1 +2. med = b21 + b22 * X + e2 +3. Y = b31 + b32 * X + b32 * med + e3 + +The indirect effect or mediation effect of `X` on `Y` through `med` can either be defined as b22 * b32 or, equivalently, as b12 - b32. Under the old framework of testing for mediation, mediation was established by testing b12 in equation 1, b22 in equation 2, and b32 in equation 3. 
+ +So far, I have attempted to simulate values of med and Y that are consistent with values of the various regression coefficients using `rnorm` in `R`, such as the code below: + + x <- rep(c(-.5, .5), 50) + med <- 4 + .7 * x + rnorm(100, sd = 1) + + # Check the relationship between x and med + mod <- lm(med ~ x) + summary(mod) + + y <- 2.5 + 0 * x + .4 * med + rnorm(100, sd = 1) + + # Check the relationships between x, med, and y + mod <- lm(y ~ x + med) + summary(mod) + + # Check the relationship between x and y -- not present + mod <- lm(y ~ x) + summary(mod) + +However, it seems that sequentially generating `med` and `y` using equations 2 and 3 is not enough, since I am left with no relationship between `x` and `y` using this approach. + +Can anyone help me find a procedure in R to generate variables `x`, `med`, and `y` that satisfy constraints that I set using equations 1, 2, and 3? + +Thanks for your help! + + [1]: https://umdrive.memphis.edu/grelyea/public/PUBH%207152-Stat%20Methods%20II/Chapter%2010/Mediation/Baron_&_Kenny_1986.pdf + [2]: http://www.psor.ucl.ac.be/personal/yzerbyt/Judd%20et%20al.%20HRMSP%202013.pdf",,2013-10-10 16:41:50.710 +185540,56372,21108.0,5,,CC BY-SA 3.0,701d356f-e4a4-437b-9bf6-60ad2bac8639,"I just made an implementation of P(A|B)/P(¬A|B) for a ""people who bought this also bought..."" algorithm. + + +I'm doing it by + + P(A|B) = count_users(bought_A_and_B)/count_users(bought_A) + P(¬A|B) = count_users(bought_B_but_not_A)/count_users(did_not_buy_A) + +Then dividing the top one by the bottom one I get a score which makes absolute sense, but what kind of correlation am I calculating? What is this method called? Where can I read more about it? + +**[EDIT]** This is not for using in a production environment, it is just some algorithm which appeared out of the blue in an online course I'm taking, I was just wondering where it could come from. Also, when the number of users who bought item B but not item A is zero I just skip the pair until I get more data. The same goes on when the number of users who bought A is zero.",added 4 characters in body,2013-10-10 16:58:07.030 +185543,57239,22571.0,1,,CC BY-SA 3.0,1cdf7b79-4d01-4eba-9f5d-f41e717dcc59,Quantifying the relationship between two disparate time series,,2013-10-10 16:58:21.140 +185541,57239,22571.0,2,,CC BY-SA 3.0,1cdf7b79-4d01-4eba-9f5d-f41e717dcc59,"I have two time series that have a roughly similar trend, though both variables are noisy. This graph shows means and standard errors throughout a season of measurements. + +![enter image description here][1] + +I'd like to be able to make a quantitative statement about the relationship between these two data sets. + +While the two data sets were collected from the same experimental plots, the individual samples from which the means and standard errors were calculated are not meaningfully paired with one another, and you can see that the carbohydrate data set was measured more frequently. + +By taking a subset of the carbohydrate measurements that are closest to the microbial biomass measurement dates, I can make a scatterplot showing the means and standard errors that I think gives a fair visual representation of the relationship (TRS.ml is the carbohydrates): + +![enter image description here][2] + + + [1]: https://i.stack.imgur.com/7ThlT.png + [2]: https://i.stack.imgur.com/kLvc0.png + +This is where I am stuck. 
I'm not sure how to estimate regression coefficients or calculate an r2 value for a regression of this sort where I have estimates of uncertainty for both variables. Here are some approaches I have been considering: + +1. Deming regression. I'm not sure that this would be the right approach. It seems to be more for data sets in which the same technique was used for both variables. If it is, my question is how would I calculate the variance ratio based on the information I have? + +2. Regression of all underlying data points. This doesn't really work because the data are not meaningfully paired, so of the 80 or so microbial biomass measurements that underlie the data shown in the graphs here, I can't directly match them to individual measurements of carbohydrates. Matching them arbitrarily seems bad. + +3. Regression of carbohydrate means by date against microbial biomass means by date. Basically regress the points in my scatterplot above but throw out the information about the uncertainty. This gives a high r2 driven by the coinciding peaks on July 1st, but to me, seems to overestimate the strength of the relationship. + +4. Regression of all microbial biomass values against carbohydrate means by date or vice versa. This allows more of the underlying uncertainty to be incorporated while not forcing the pairing of unrelated data points in an arbitrary way. Again though, it does not incorporate the uncertainty in both variables. + +My question is which of these approaches, or any other unlisted approaches, would you recommend for quantifying the relationship between these two time series?",,2013-10-10 16:58:21.140 +185542,57239,22571.0,3,,CC BY-SA 3.0,1cdf7b79-4d01-4eba-9f5d-f41e717dcc59,,,2013-10-10 16:58:21.140 +185544,57240,22527.0,2,,CC BY-SA 3.0,b5234575-4030-42a7-bde9-cb3ae105b349,"I believe you multiply by 2 because you need to control for your database being twice as large. There are other ways to do this calculation such as + +decoy spectra identified / target spectra identified + +The use of the term FDR for this calculation is totally confusing and is why people have started calling it target/decoy rate in the last year or so. It's also doubly confusing as people often fail to specify if they are using a spectra target/decoy or a peptide target/decoy...and there is no way to consistently calculate a protein target/decoy as different programs will weigh peptide --> protein evidence differently...It's a mess...Having said that I will always make this calculation just to double check I or the software has not done something stupid. For that it is very useful. ",,2013-10-10 16:59:34.360 +185546,57238,17249.0,5,,CC BY-SA 3.0,7c1412a2-e6c8-486c-b9c2-c4e3cbbaea21,"I am interested in finding a procedure to simulate data that are consistent with a specified mediation model. According to the general linear structural equation model framework for testing mediation models first outlined by [Barron and Kenny (1986)][1] and described elsewhere such as [Judd, Yzerbyt, & Muller (2013)][2], mediation models for outcome $Y$, mediator $med$, and predictor $X$ and are governed by the following three regression equations: + +1. $Y = b_{11} + b_{12}*X + e_1$ +2. $med = b_{21} + b_{22}*X + e_2$ +3. $Y = b_{31} + b_{32}*X + b_{32} * med + e_3$ + +The indirect effect or mediation effect of $X$ on $Y$ through $med$ can either be defined as $b_{22}*b_{32}$ or, equivalently, as $b_{12}-b_{32}$. 
Under the old framework of testing for mediation, mediation was established by testing $b_{12}$ in equation 1, $b_{22}$ in equation 2, and $b_{32}$ in equation 3. + +So far, I have attempted to simulate values of $med$ and $Y$ that are consistent with values of the various regression coefficients using `rnorm` in `R`, such as the code below: + + x <- rep(c(-.5, .5), 50) + med <- 4 + .7 * x + rnorm(100, sd = 1) + + # Check the relationship between x and med + mod <- lm(med ~ x) + summary(mod) + + y <- 2.5 + 0 * x + .4 * med + rnorm(100, sd = 1) + + # Check the relationships between x, med, and y + mod <- lm(y ~ x + med) + summary(mod) + + # Check the relationship between x and y -- not present + mod <- lm(y ~ x) + summary(mod) + +However, it seems that sequentially generating $med$ and $Y$ using equations 2 and 3 is not enough, since I am left with no relationship between $X$ and $Y$ using this approach. + +Can anyone help me find a procedure in R to generate variables $X$, $med$, and $Y$ that satisfy constraints that I set using equations 1, 2, and 3? + +Thanks for your help! + + [1]: https://umdrive.memphis.edu/grelyea/public/PUBH%207152-Stat%20Methods%20II/Chapter%2010/Mediation/Baron_&_Kenny_1986.pdf + [2]: http://www.psor.ucl.ac.be/personal/yzerbyt/Judd%20et%20al.%20HRMSP%202013.pdf",Added LaTeX,2013-10-10 17:03:04.440 +185545,57238,,24,,CC BY-SA 3.0,7c1412a2-e6c8-486c-b9c2-c4e3cbbaea21,,Proposed by 24808 approved by 11091 edit id of 5589,2013-10-10 17:03:04.440 +185547,57241,5237.0,2,,CC BY-SA 3.0,82a6ee7f-1d2d-4638-8ead-f5e27515c7f3,"This is quite straightforward. The reason you have no relationship between $x$ and $y$ using your approach is because of the code: + + y <- 2.5 + 0 * x + .4 * med + rnorm(100, sd = 1) + +If you want some relationship between $x$ and $y$ even when $med$ is included (that is, you want *partial* mediation), you would simply use a non-zero value for $b_{32}$ instead. For example, you could substitute the following code for the above: + + y <- 2.5 + 3 * x + .4 * med + rnorm(100, sd = 1) + +Thus, $b_{32}$ has been changed from $0$ to $3$. (Of course some other, specific value would probably be more relevant, depending on your situation, I just picked $3$ off the top of my head.) ",,2013-10-10 17:09:45.623 +185549,57238,5237.0,6,,CC BY-SA 3.0,9f3f096d-cb52-4893-947a-9f3279dc943e,,added tags; removed thanks,2013-10-10 17:12:22.710 +185548,57238,5237.0,5,,CC BY-SA 3.0,9f3f096d-cb52-4893-947a-9f3279dc943e,"I am interested in finding a procedure to simulate data that are consistent with a specified mediation model. According to the general linear structural equation model framework for testing mediation models first outlined by [Barron and Kenny (1986)][1] and described elsewhere such as [Judd, Yzerbyt, & Muller (2013)][2], mediation models for outcome $Y$, mediator $med$, and predictor $X$ and are governed by the following three regression equations: + +1. $Y = b_{11} + b_{12}*X + e_1$ +2. $med = b_{21} + b_{22}*X + e_2$ +3. $Y = b_{31} + b_{32}*X + b_{32} * med + e_3$ + +The indirect effect or mediation effect of $X$ on $Y$ through $med$ can either be defined as $b_{22}*b_{32}$ or, equivalently, as $b_{12}-b_{32}$. Under the old framework of testing for mediation, mediation was established by testing $b_{12}$ in equation 1, $b_{22}$ in equation 2, and $b_{32}$ in equation 3. 
+ +So far, I have attempted to simulate values of $med$ and $Y$ that are consistent with values of the various regression coefficients using `rnorm` in `R`, such as the code below: + + x <- rep(c(-.5, .5), 50) + med <- 4 + .7 * x + rnorm(100, sd = 1) + + # Check the relationship between x and med + mod <- lm(med ~ x) + summary(mod) + + y <- 2.5 + 0 * x + .4 * med + rnorm(100, sd = 1) + + # Check the relationships between x, med, and y + mod <- lm(y ~ x + med) + summary(mod) + + # Check the relationship between x and y -- not present + mod <- lm(y ~ x) + summary(mod) + +However, it seems that sequentially generating $med$ and $Y$ using equations 2 and 3 is not enough, since I am left with no relationship between $X$ and $Y$ using this approach. + +Can anyone help me find a procedure in R to generate variables $X$, $med$, and $Y$ that satisfy constraints that I set using equations 1, 2, and 3? + + + [1]: https://umdrive.memphis.edu/grelyea/public/PUBH%207152-Stat%20Methods%20II/Chapter%2010/Mediation/Baron_&_Kenny_1986.pdf + [2]: http://www.psor.ucl.ac.be/personal/yzerbyt/Judd%20et%20al.%20HRMSP%202013.pdf",added tags; removed thanks,2013-10-10 17:12:22.710 +185558,57231,22381.0,5,,CC BY-SA 3.0,7b7e38ea-8f9b-46a0-87b0-4ec78100d619,"I am using squared return as a proxy to calculate volatility, however i'm not sure whether to use raw return or percentage return. Under raw return all return estimates are below 1, however under percentage return there is a mix of return greater than 1 and less than 1. Percentage return below 1 would end up as a volatility figure less than the percentage return itself, on the other hand percentage return above 1 would end up as a volatility figure greater than percentage return. + +My question is: Doesn't this pose a problem when calculating volatility in that there is an over estimation when the return is above 1? + +I am going to use the data to fit an ARMA-GARCH model, would there be any difference if I used percentage or absolute values?",deleted 10 characters in body; edited title,2013-10-10 18:20:07.200 +185557,57231,22381.0,4,,CC BY-SA 3.0,7b7e38ea-8f9b-46a0-87b0-4ec78100d619,Raw Return vs Percentage Return to Calculate Volatility,deleted 10 characters in body; edited title,2013-10-10 18:20:07.200 +185550,57233,5237.0,5,,CC BY-SA 3.0,08ae1146-8c96-4524-8bc8-d11ee34eed1c,"*(I'll let someone else address the estimation of the missing data. You may want to directly model the probability that the observation is each level of the unknown factor using knowledge of other covariate values, and possibly outside information, e.g., priors etc. There are strategies such as [propensity scores][1] that you might be able to use for this type of thing. However, at first glance your approach looks reasonable to me.)* + +One note is that I can't tell from your description if you are weighting by *raw frequencies*. If so, you want to divide these by $N$ to get the marginal *probabilities* instead. + +You are right that you are not handling level 3 correctly. The coding scheme that you use in your question set up is known as *reference level coding*. To use this approach correctly, you need to have an intercept (i.e., $\beta_0$), which estimates the mean of level 3. I suspect you do have such, even though you didn't list it. In this case, you would just add the intercept to your final equation. 
That is: +$$ +\beta_0\!*\!f_3 + \beta_1\!*\!f_1 + \beta_2\!*\!f_2 +$$ +Note that you are multiplying the intercept (which encodes the reference level) by the marginal probability that the observation is actually the reference level. + + + [1]: http://en.wikipedia.org/wiki/Propensity_score_matching",added 174 characters in body,2013-10-10 17:15:34.363 +185551,57238,8414.0,5,,CC BY-SA 3.0,e22a1519-3c6a-4839-b639-37c1dd00c20e,"I am interested in finding a procedure to simulate data that are consistent with a specified mediation model. According to the general linear structural equation model framework for testing mediation models first outlined by [Barron and Kenny (1986)][1] and described elsewhere such as [Judd, Yzerbyt, & Muller (2013)][2], mediation models for outcome $Y$, mediator $med$, and predictor $X$ and are governed by the following three regression equations: + +1. $Y = b_{11} + b_{12}*X + e_1$ +2. $med = b_{21} + b_{22}*X + e_2$ +3. $Y = b_{31} + b_{32}*X + b_{32} * med + e_3$ + +The indirect effect or mediation effect of $X$ on $Y$ through $med$ can either be defined as $b_{22}*b_{32}$ or, equivalently, as $b_{12}-b_{32}$. Under the old framework of testing for mediation, mediation was established by testing $b_{12}$ in equation 1, $b_{22}$ in equation 2, and $b_{32}$ in equation 3. + +So far, I have attempted to simulate values of $med$ and $Y$ that are consistent with values of the various regression coefficients using `rnorm` in `R`, such as the code below: + + x <- rep(c(-.5, .5), 50) + med <- 4 + .7 * x + rnorm(100, sd = 1) + + # Check the relationship between x and med + mod <- lm(med ~ x) + summary(mod) + + y <- 2.5 + 0 * x + .4 * med + rnorm(100, sd = 1) + + # Check the relationships between x, med, and y + mod <- lm(y ~ x + med) + summary(mod) + + # Check the relationship between x and y -- not present + mod <- lm(y ~ x) + summary(mod) + +However, it seems that sequentially generating $med$ and $Y$ using equations 2 and 3 is not enough, since I am left with no relationship between $X$ and $Y$ in regression equation 1 (which models a simple bivariate relationship between $X$ and $Y$) using this approach. + +Can anyone help me find a procedure in R to generate variables $X$, $med$, and $Y$ that satisfy constraints that I set using equations 1, 2, and 3? + + + [1]: https://umdrive.memphis.edu/grelyea/public/PUBH%207152-Stat%20Methods%20II/Chapter%2010/Mediation/Baron_&_Kenny_1986.pdf + [2]: http://www.psor.ucl.ac.be/personal/yzerbyt/Judd%20et%20al.%20HRMSP%202013.pdf",Clarified text,2013-10-10 17:19:44.757 +185552,57238,8414.0,5,,CC BY-SA 3.0,dfafbd85-c819-4032-b11e-d2f1bce451f0,"I am interested in finding a procedure to simulate data that are consistent with a specified mediation model. According to the general linear structural equation model framework for testing mediation models first outlined by [Barron and Kenny (1986)][1] and described elsewhere such as [Judd, Yzerbyt, & Muller (2013)][2], mediation models for outcome $Y$, mediator $med$, and predictor $X$ and are governed by the following three regression equations: + +1. $Y = b_{11} + b_{12}*X + e_1$ +2. $med = b_{21} + b_{22}*X + e_2$ +3. $Y = b_{31} + b_{32}*X + b_{32} * med + e_3$ + +The indirect effect or mediation effect of $X$ on $Y$ through $med$ can either be defined as $b_{22}*b_{32}$ or, equivalently, as $b_{12}-b_{32}$. Under the old framework of testing for mediation, mediation was established by testing $b_{12}$ in equation 1, $b_{22}$ in equation 2, and $b_{32}$ in equation 3. 
+ +So far, I have attempted to simulate values of $med$ and $Y$ that are consistent with values of the various regression coefficients using `rnorm` in `R`, such as the code below: + + x <- rep(c(-.5, .5), 50) + med <- 4 + .7 * x + rnorm(100, sd = 1) + + # Check the relationship between x and med + mod <- lm(med ~ x) + summary(mod) + + y <- 2.5 + 0 * x + .4 * med + rnorm(100, sd = 1) + + # Check the relationships between x, med, and y + mod <- lm(y ~ x + med) + summary(mod) + + # Check the relationship between x and y -- not present + mod <- lm(y ~ x) + summary(mod) + +However, it seems that sequentially generating $med$ and $Y$ using equations 2 and 3 is not enough, since I am left with no relationship between $X$ and $Y$ in regression equation 1 (which models a simple bivariate relationship between $X$ and $Y$) using this approach. This is important because one definition of the indirect (i.e., mediation) effect is $b_{12}-b_{32}$, as I describe above. + +Can anyone help me find a procedure in R to generate variables $X$, $med$, and $Y$ that satisfy constraints that I set using equations 1, 2, and 3? + + + [1]: https://umdrive.memphis.edu/grelyea/public/PUBH%207152-Stat%20Methods%20II/Chapter%2010/Mediation/Baron_&_Kenny_1986.pdf + [2]: http://www.psor.ucl.ac.be/personal/yzerbyt/Judd%20et%20al.%20HRMSP%202013.pdf",Added more clarifying text,2013-10-10 17:30:50.737 +185553,57241,5237.0,5,,CC BY-SA 3.0,35b3dd40-8f3d-4df0-9d1b-0f1c94c59c32,"This is quite straightforward. The reason you have no relationship between $x$ and $y$ using your approach is because of the code: + + y <- 2.5 + 0 * x + .4 * med + rnorm(100, sd = 1) + +If you want some relationship between $x$ and $y$ even when $med$ is included (that is, you want *partial* mediation), you would simply use a non-zero value for $b_{32}$ instead. For example, you could substitute the following code for the above: + + y <- 2.5 + 3 * x + .4 * med + rnorm(100, sd = 1) + +Thus, $b_{32}$ has been changed from $0$ to $3$. (Of course some other, specific value would probably be more relevant, depending on your situation, I just picked $3$ off the top of my head.) + +-------------- +*Edit:* +With respect to the marginal $x\rightarrow y$ relationship being non-significant, that is just a function of [statistical power][1]. Since the causal force of $x$ is passed entirely through $med$ in your original setup, you have lower power than you might otherwise. Nonetheless, the effect is still *real* in some sense. When I ran your original code (after having set the seed using `90` as a value that I again just picked off the top of my head), I did get a significant effect: + + set.seed(90) + x <- rep(c(-.5, .5), 50) + med <- 4 + .7 * x + rnorm(100, sd = 1) + + # Check the relationship between x and med + mod <- lm(med ~ x) + summary(mod) + + y <- 2.5 + 0 * x + .4 * med + rnorm(100, sd = 1) + + # Check the relationships between x, med, and y + mod <- lm(y ~ x + med) + summary(mod) + + # Check the relationship between x and y -- not present + mod <- lm(y ~ x) + summary(mod) + + ... + Coefficients: + Estimate Std. Error t value Pr(>|t|) + (Intercept) 3.8491 0.1151 33.431 <2e-16 *** + x 0.5315 0.2303 2.308 0.0231 * + --- + Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 + + ... + +To get more power, you can increase the $N$ you are using, or use smaller error values (i.e., use `sd=` values less than the default `1` in the `rnorm()` calls). 
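A minimal sketch of checking that point by simulation, reusing the generating equations from the code above (sample sizes and simulation counts are illustrative):

    set.seed(1)
    power_sim <- function(n, n_sims = 2000) {
      mean(replicate(n_sims, {
        x   <- rep(c(-.5, .5), n / 2)
        med <- 4 + .7 * x + rnorm(n, sd = 1)
        y   <- 2.5 + 0 * x + .4 * med + rnorm(n, sd = 1)
        summary(lm(y ~ x))$coefficients["x", "Pr(>|t|)"] < 0.05
      }))
    }
    power_sim(100)   # roughly the power of the original setup
    power_sim(400)   # a larger N gives noticeably higher power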
+ + + [1]: http://en.wikipedia.org/wiki/Statistical_power",added 1833 characters in body,2013-10-10 17:30:55.270 +185554,57223,22564.0,5,,CC BY-SA 3.0,6cf4f662-320a-43a5-8faa-fd40d5e17655,"I have a problem like the following: + +1) There are six measurements for each individual with large within-subject variance + +2) There are two groups (Treatment and Control) + +3) Each group consists of 5 individuals + +4) I want to perform a significance test comparing the two groups to know if the group means are different from one another. + + +The data looks like this: +![http://s10.postimg.org/p9krg6f3t/examp.png][1] + +And I have run some simulations using this code which does t tests to compare the group means. The group means were calculated by taking the means of the individual means. **This ignores within-subject variability**: + + + n.simulations<-10000 + pvals=matrix(nrow=n.simulations,ncol=1) + for(k in 1:n.simulations){ + subject=NULL + for(i in 1:10){ + subject<-rbind(subject,as.matrix(rep(i,6))) + } + #set.seed(42) + + #Sample Subject Means + subject.means<-rnorm(10,100,2) + + #Sample Individual Measurements + values=NULL + for(sm in subject.means){ + values<-rbind(values,as.matrix(rnorm(6,sm,20))) + } + + out<-cbind(subject,values) + + #Split into GroupA and GroupB + GroupA<-out[1:30,] + GroupB<-out[31:60,] + + #Add effect size to GroupA + GroupA[,2]<-GroupA[,2]+0 + + colnames(GroupA)<-c(""Subject"", ""Value"") + colnames(GroupB)<-c(""Subject"", ""Value"") + + #Calculate Individual Means and SDS + GroupA.summary=matrix(nrow=length(unique(GroupA[,1])), ncol=2) + for(i in 1:length(unique(GroupA[,1]))){ + GroupA.summary[i,1]<-mean(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2]) + GroupA.summary[i,2]<-sd(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2]) + } + colnames(GroupA.summary)<-c(""Mean"",""SD"") + + + GroupB.summary=matrix(nrow=length(unique(GroupB[,1])), ncol=2) + for(i in 1:length(unique(GroupB[,1]))){ + GroupB.summary[i,1]<-mean(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2]) + GroupB.summary[i,2]<-sd(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2]) + } + colnames(GroupB.summary)<-c(""Mean"",""SD"") + + Summary<-rbind(cbind(1,GroupA.summary),cbind(2,GroupB.summary)) + colnames(Summary)[1]<-""Group"" + + pvals[k]<-t.test(GroupA.summary[,1],GroupB.summary[,1], var.equal=T)$p.value + } + + +And here is code for plots: + + + #Plots + par(mfrow=c(2,2)) + boxplot(GroupA[,2]~GroupA[,1], col=""Red"", main=""Group A"", + ylim=c(.9*min(out[,2]),1.1*max(out[,2])), + xlab=""Subject"", ylab=""Value"") + stripchart(GroupA[,2]~GroupA[,1], vert=T, pch=16, add=T) + #abline(h=mean(GroupA[,2]), lty=2, lwd=3) + + for(i in 1:length(unique(GroupA[,1]))){ + m<-mean(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2]) + ci<-t.test(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2])$conf.int[1:2] + + points(i-.2,m, pch=15,cex=1.5, col=""Grey"") + segments(i-.2, + ci[1],i-.2, + ci[2], lwd=4, col=""Grey"" + ) + } + legend(""topleft"", legend=c(""Individual Means +/- 95% CI""), bty=""n"", pch=15, lwd=3, col=""Grey"") + + + boxplot(GroupB[,2]~GroupB[,1], col=""Light Blue"", main=""Group B"", + ylim=c(.9*min(out[,2]),1.1*max(out[,2])), + xlab=""Subject"", ylab=""Value"") + stripchart(GroupB[,2]~GroupB[,1], vert=T, pch=16, add=T) + #abline(h=mean(GroupB[,2]), lty=2, lwd=3) + + for(i in 1:length(unique(GroupB[,1]))){ + m<-mean(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2]) + ci<-t.test(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2])$conf.int[1:2] + + points(i-.2,m, pch=15,cex=1.5, 
col=""Grey"") + segments(i-.2, + ci[1],i-.2, + ci[2], lwd=4, col=""Grey"" + ) + } + legend(""topleft"", legend=c(""Individual Means +/- 95% CI""), bty=""n"", pch=15, lwd=3, col=""Grey"") + + + boxplot(Summary[,2]~Summary[,1], col=c(""Red"",""Light Blue""), xlab=""Group"", ylab=""Average Value"", + ylim=c(.9*min(Summary[,2]),1.1*max(Summary[,2])), + main=""Individual Averages"") + stripchart(Summary[,2]~Summary[,1], vert=T, pch=16, add=T) + + points(.9, mean(GroupA.summary[,1]), pch=15,cex=1.5, col=""Grey"") + segments(.9, + t.test(GroupA.summary[,1])$conf.int[1],.9, + t.test(GroupA.summary[,1])$conf.int[2], lwd=4, col=""Grey"" + ) + + points(1.9, mean(GroupB.summary[,1]), pch=15,cex=1.5, col=""Grey"") + segments(1.9, + t.test(GroupB.summary[,1])$conf.int[1],1.9, + t.test(GroupB.summary[,1])$conf.int[2], lwd=4, col=""Grey"" + ) + legend(""topleft"", legend=c(""Group Means +/- 95% CI""), bty=""n"", pch=15, lwd=3, col=""Grey"") + + + hist(pvals, breaks=seq(0,1,by=.05), col=""Grey"", + main=c(paste(""# sims="", n.simulations), + paste(""% Sig p-values="",100*length(which(pvals<0.05))/length(pvals))) + ) + +Now, it seems to me that because each individual mean is an estimate itself, that we should be less certain about the group means than shown by the 95% confidence intervals indicated by the bottom-left panel in the figure above. Thus the p-values calculated are underestimating the true variability and should lead to increased false-positives if we wish to extrapolate to future data. + +So what is the correct way to analyze this data? + + +**Bonus:** + +The example above is a simplification. For the actual data: + +1) The within-subject variance is positively correlated with the mean. + +2) Values can only be multiples of two. + +3) The individual results are not roughly normally distributed. They suffer from zero floor effect, and have long tails at the positive end. + +4) Number of Subjects in each group are not necessarily equal. + +Previous literature has used the t-test ignoring within-subject variability and other nuances as was done for the simulations above. Are these results reliable? If I can extract some means and standard errors from the figures how would I calculate the ""correct"" p-values. + +**EDIT:** + +Ok, here is what *actual* data looks like. 
There is also three groups rather than two: + +![enter image description here][2] + +dput() of data: + + structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, + 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, + 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, + 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, + 12, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15, + 15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 18, + 18, 18, 18, 18, 18, 2, 0, 16, 2, 16, 2, 8, 10, 8, 6, 4, 4, 8, + 22, 12, 24, 16, 8, 24, 22, 6, 10, 10, 14, 8, 18, 8, 14, 8, 20, + 6, 16, 6, 6, 16, 4, 2, 14, 12, 10, 4, 10, 10, 8, 4, 10, 16, 16, + 2, 8, 4, 0, 0, 2, 16, 10, 16, 12, 14, 12, 8, 10, 12, 8, 14, 8, + 12, 20, 8, 14, 2, 4, 8, 16, 10, 14, 8, 14, 12, 8, 14, 4, 8, 8, + 10, 4, 8, 20, 8, 12, 12, 22, 14, 12, 26, 32, 22, 10, 16, 26, + 20, 12, 16, 20, 18, 8, 10, 26), .Dim = c(108L, 3L), .Dimnames = list( + NULL, c(""Group"", ""Subject"", ""Value""))) + + [1]: https://i.stack.imgur.com/55V9J.png + [2]: https://i.stack.imgur.com/k1xWd.png",added data example,2013-10-10 17:35:10.020 +185555,57230,668.0,5,,CC BY-SA 3.0,4f244ce1-6536-4446-bdb5-139a8f9978e6,"The data contain category and sub-category distributions. + +The categories are topics in a quiz such as: Music, Sports, Business. + +Each category has three levels to choose from: Basic, Standard and Advanced. + +For example: A user might take a quiz on Music across different levels. Say the number of questions attempted is 100. The user would have answered them across levels. 40 for basic, 40 for standard and 20 for advanced. The data consist of counts of the questions attempted within each category for each user. + +What is the best way to represent these data on a graph? Each graph would contain up to 5 main categories. + +",added 61 characters in body,2013-10-10 17:49:32.410 +185556,57238,,25,,,3cb050b6-fcd0-4170-8049-f286538ff024,,http://twitter.com/#!/StackStats/status/388367281618960384,2013-10-10 18:15:50.177 +185561,57242,22573.0,3,,CC BY-SA 3.0,16b13cf6-06c7-4ea8-9755-e85213458509,,,2013-10-10 18:29:33.973 +185560,57242,22573.0,1,,CC BY-SA 3.0,16b13cf6-06c7-4ea8-9755-e85213458509,estimate multivariate normal distribution by observing variance in different directions,,2013-10-10 18:29:33.973 +186165,57195,5237.0,6,,CC BY-SA 3.0,eb4af092-a8ed-4b6c-8bdf-6f50e3f86525,,added tag,2013-10-14 03:30:03.973 +186492,57513,10594.0,3,,CC BY-SA 3.0,96a35508-4fe5-4957-98d5-5291599c2ef8,,,2013-10-15 09:49:58.953 +185559,57242,22573.0,2,,CC BY-SA 3.0,16b13cf6-06c7-4ea8-9755-e85213458509,"Assume I am looking for a normal distribution $\mathcal{N}(\mu,\Sigma)$. For simplicity let's say we only have 2 random variables $x$ and $y$ and a known $\mu=0$. + +Is it possible to estimate $\Sigma$ by observing the variance along multiple directions? + +For example, I measure the variance $\sigma_1$ along the vector $\mathbb{v}_1 = (x_1,y_1)^T$. In another step I obtain a different variance $\sigma_2$ from a different direction $\mathbb{v}_2 = (x_2,y_2)^T$. Ideally one would continue to observe these single variations in different directions and combine them in one multivariate normal distribution. 
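A quick numerical check of the idea (the covariance matrix and direction here are made up; the `MASS` package is assumed to be available):

    library(MASS)
    set.seed(1)
    Sigma <- matrix(c(2, 0.8, 0.8, 1), 2, 2)        # hypothetical true covariance
    X <- mvrnorm(1e5, mu = c(0, 0), Sigma = Sigma)
    u <- c(1, 1) / sqrt(2)                          # one measurement direction
    var(X %*% u)                                    # observed variance along u
    t(u) %*% Sigma %*% u                            # matches u' Sigma u

Each directional variance pins down one linear combination of the entries of $\Sigma$, so with enough distinct directions the matrix can in principle be recovered.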
+ +Does this make sense?",,2013-10-10 18:29:33.973 +185562,57243,20473.0,2,,CC BY-SA 3.0,b9af84f5-8211-47e8-99cb-1454cca90874,"It appears you are confusing results that hold for a collection of random variables with the case of _one_ random variable. + +When you have a series of observations, $x_1,...,x_n$, then ***if*** they are _contemporaneous_, ($x_{1t},...,x_{nt}$) they are considered as realizations of _n distinct random variables_ (that may be identically and independently distributed, or not). You _cannot_, in this case, assume that all are realizations of the _same_ random variable, because a random variable is a real-valued function: this means that at a specific point in time, it can have only one realization (take one value), otherwise it wouldn't be a function but a correspondence: this is why when we have a _cross-sectional_ sample of size $n$, we say that ""it is comprised of the realization of $n$ random variables"", and not ""$n$ realizations of the same random variable"". Note carefully that ""same"" does not just mean ""identically distributed"", but ontologically equal. + +Assume now that you have a time-series, and the index $1,...,n$ represents different points in time. Can you say that they are all realizations of the _same_ random variable? Well in principle you can, but here too, we tend to view a time series as a stochastic process of _distinct_ random variables (one for each point in time), that, again, may be identically distributed. + +So in general, when looking at a sample, be it cross-sectional or time series, it is advisable to think of it as a collection of realizations of _many_ random variables. + +Now, when we subtract the mean from a random variable, and divide by the standard deviation, we create the ""standardized"" version of the variable, that has mean zero and variance (and standard deviation) unity. This is irrespective of the distribution that this variable follows, because, by standard universal properties of these distribution moments +$$Z = \frac {X-\mu}{\sigma} \Rightarrow E(Z) = \frac {1}{\sigma}E(X) - \frac {\mu}{\sigma} = 0$$ + +and + +$$ \text {Var}(Z) = \text {Var}\left(\frac {X-\mu}{\sigma}\right) = \frac {1}{\sigma^2}\text {Var}(X) = \frac {\sigma^2}{\sigma^2} =1$$ + +The standardized version $Z$ of _one_ random variable $X$ follows a distribution that belongs to the same family as the distribution of $X$, with different said parameters - the distribution _family_ does not change. So if you don't know the distribution by other means, the distribution of the standardized version will remain unknown. + +Now consider the random variable $S_n = \frac 1n\sum_{i=1}^nX_i$. It is for this and like quantities that the various [Central Limit Theorems][1] talk about and tell that they approach a normal distribution asymptotically. + + + [1]: http://en.wikipedia.org/wiki/Central_limit_theorem",,2013-10-10 18:52:09.813 +185565,57244,20179.0,1,,CC BY-SA 3.0,4f9639fe-3c18-4bec-a679-d6bbc97f3efa,genotype coding in regression,,2013-10-10 18:54:07.593 +185564,57244,20179.0,2,,CC BY-SA 3.0,4f9639fe-3c18-4bec-a679-d6bbc97f3efa,"I would like to conduct some analysis on some biological traits with regression model. +The response variable is continuous. One important independent variable is the SNP informaion(wildtype, heterzygous or homozygous ). There are different ways to code it. +It can be treated as a nominal variable, or a ordinal variable(like 1,2,3). Any one familar with the difference and any classical references about it? 
+Thank you for any suggestion. + +Best",,2013-10-10 18:54:07.593 +185563,57244,20179.0,3,,CC BY-SA 3.0,4f9639fe-3c18-4bec-a679-d6bbc97f3efa,,,2013-10-10 18:54:07.593 +185566,57245,346.0,2,,CC BY-SA 3.0,1ecb6780-da48-4a85-abd6-837295d7c3c0,"I take the freedom to answer the question in the title, how would I analyze this data. + +Given that we have replicates within the samples, mixed models immediately come to mind, which should estimate the variability within each individual and control for it. + +Hence I fit the model using `lmer` from `lme4`. However, as we are interested in p-values, I use `mixed` from `afex` which obtains those via `pbkrtest` (i.e., Kenward-Rogers approximation for degrees-of-freedom). (afex also already sets the contrast to `contr.sum` which I would use in such a case anyway) + +To control for the zero floor effect (i.e., positive skew), I fit two alternative versions transforming the dependent variable either with `sqrt` for mild skew and `log` for stronger skew. + + require(afex) + + # read the dput() in as dat <- ... + dat <- as.data.frame(dat) + dat$Group <- factor(dat$Group) + dat$Subject <- factor(dat$Subject) + + (model <- mixed(Value ~ Group + (1|Subject), dat)) + ## Effect stat ndf ddf F.scaling p.value + ## 1 (Intercept) 237.730 1 15 1 0.0000 + ## 2 Group 7.749 2 15 1 0.0049 + + (model.s <- mixed(sqrt(Value) ~ Group + (1|Subject), dat)) + ## Effect stat ndf ddf F.scaling p.value + ## 1 (Intercept) 418.293 1 15 1 0.0000 + ## 2 Group 4.121 2 15 1 0.0375 + + (model.l <- mixed(log1p(Value) ~ Group + (1|Subject), dat)) + ## Effect stat ndf ddf F.scaling p.value + ## 1 (Intercept) 458.650 1 15 1 0.0000 + ## 2 Group 2.721 2 15 1 0.0981 + +The effect is significant for the untransformed and `sqrt` dv. But are these model sensible? Let's plot the residuals. + + png(""qq.png"", 800, 300, units = ""px"", pointsize = 12) + par(mfrow = c(1, 3)) + par(cex = 1.1) + par(mar = c(2, 2, 2, 1)+0.1) + qqnorm(resid(model[[2]]), main = ""original"") + qqline(resid(model[[2]])) + qqnorm(resid(model.s[[2]]), main = ""sqrt"") + qqline(resid(model.s[[2]])) + qqnorm(resid(model.l[[2]]), main = ""log"") + qqline(resid(model.l[[2]])) + dev.off() + + +![enter image description here][1] + +It seems that the model with `sqrt` trasnformation provides a reasonable fit (there still seems to be one outlier, but I will ignore it). So, let's further inspect this model using `multcomp` to get the comparisons among groups: + + require(multcomp) + + # using bonferroni-holm correction of multiple comparison + summary(glht(model.s[[2]], linfct = mcp(Group = ""Tukey"")), test = adjusted(""holm"")) + ## Simultaneous Tests for General Linear Hypotheses + ## + ## Multiple Comparisons of Means: Tukey Contrasts + ## + ## + ## Fit: lmer(formula = sqrt(Value) ~ Group + (1 | Subject), data = data) + ## + ## Linear Hypotheses: + ## Estimate Std. Error z value Pr(>|z|) + ## 2 - 1 == 0 -0.0754 0.3314 -0.23 0.820 + ## 3 - 1 == 0 1.1189 0.4419 2.53 0.023 * + ## 3 - 2 == 0 1.1943 0.4335 2.75 0.018 * + ## --- + ## Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 + ## (Adjusted p values reported -- holm method) + + # using default multiple comparison correction (which I don't understand) + summary(glht(model.s[[2]], linfct = mcp(Group = ""Tukey""))) + ## Simultaneous Tests for General Linear Hypotheses + ## + ## Multiple Comparisons of Means: Tukey Contrasts + ## + ## + ## Fit: lmer(formula = sqrt(Value) ~ Group + (1 | Subject), data = data) + ## + ## Linear Hypotheses: + ## Estimate Std. 
Error z value Pr(>|z|) + ## 2 - 1 == 0 -0.0754 0.3314 -0.23 0.972 + ## 3 - 1 == 0 1.1189 0.4419 2.53 0.030 * + ## 3 - 2 == 0 1.1943 0.4335 2.75 0.016 * + ## --- + ## Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 + ## (Adjusted p values reported -- single-step method) + +Punchline: group 3 differs from the other two groups which do not differ among each other. + + [1]: https://i.stack.imgur.com/WXUEh.png",,2013-10-10 18:54:30.263 +185567,57246,22059.0,2,,CC BY-SA 3.0,c3d7f22c-ff10-4978-954f-139f2a142446,"You could, but it doesn't make to much sense to do it this way. Since there are ${n+1}\choose 2$ independent dimensions in a covariance matrix, you'd need to measure the variance in ${n+1}\choose 2$ different directions, and then do some linear algebra to reconstruct the covariance matrix. It's more straightforward just to measure the covariance matrix all at once. + +For example, in $2$ dimensions, you could measure the variance along $(0,1)$, $(0,1)$ and $(1,1)$, then since $v_{(1,1)}=\Sigma_{11}+\Sigma_{22}+2\Sigma_{12}$, you would end up with +$$\Sigma=\left(\array{v_{(1,0)} & \frac{v_{(1,1)}-v_{(1,0)}-v_{(0,1)}}{2}\\ \frac{v_{(1,1)}-v_{(1,0)}-v_{(0,1)}}{2} & v_{(0,1)}}\right)$$ + +I suppose if somehow you were constrained in such a way that you couldn't sample from the entire distribution, but you were only able to sample from projections onto a single dimension at a time, then this approach could be useful.",,2013-10-10 19:06:15.007 +185568,57247,20286.0,2,,CC BY-SA 3.0,5c2e6502-62c3-4dd8-afa1-6c98060e99bd,"Unless you have reason to believe that LabA and LabB would systematically provide different results if they ever measured the same set of samples, your data from the 2 Labs ""are on the same scale"" as far as you can tell from these data. The problem is that the less-sensitive LabA will be unable to report a value for some samples that would have had values reported if LabB had instead done the analysis. + +Perhaps the best way to proceed would be to define a class of results called ""<0.2"", and include in that class all 0 readings from LabA and all readings <0.2 from LabB. How you proceed from there depends on ""What is the question of interest?"" as @Glen_b put it in a comment. + +All will be much more useful and reliable if it is possible to cross-compare a set of samples analyzed by both Labs, because there may be systematic differences between the 2 Labs' results that you don't suspect. + +",,2013-10-10 19:08:35.603 +185569,57246,22059.0,5,,CC BY-SA 3.0,38d8e909-0327-457e-b285-688639bff90d,"You could, but it doesn't make too much sense to do it this way. Since there are ${n+1}\choose 2$ independent dimensions in a covariance matrix, you'd need to measure the variance in ${n+1}\choose 2$ different directions, and then do some linear algebra to reconstruct the covariance matrix. It's more straightforward just to measure the covariance matrix all at once. 
+ +For example, in $2$ dimensions, you could measure the variance along $(0,1)$, $(0,1)$ and $(1,1)$, then since $v_{(1,1)}=\Sigma_{11}+\Sigma_{22}+2\Sigma_{12}$, you would end up with +$$\Sigma=\left(\array{v_{(1,0)} & \frac{v_{(1,1)}-v_{(1,0)}-v_{(0,1)}}{2}\\ \frac{v_{(1,1)}-v_{(1,0)}-v_{(0,1)}}{2} & v_{(0,1)}}\right)$$ + +I suppose if somehow you were constrained in such a way that you couldn't sample from the entire distribution, but you were only able to sample from projections onto a single dimension at a time, then this approach could be useful.",added 1 characters in body,2013-10-10 19:12:14.623 +185570,57228,503.0,10,,,1b2959fb-abe2-4973-a643-23f97c4f9c6a,"{""Voters"":[{""Id"":7290,""DisplayName"":""gung""},{""Id"":1036,""DisplayName"":""Andy W""},{""Id"":21054,""DisplayName"":""COOLSerdash""},{""Id"":17230,""DisplayName"":""Scortchi""},{""Id"":686,""DisplayName"":""Peter Flom""}]}",102,2013-10-10 19:14:24.500 +185572,57244,674.0,5,,CC BY-SA 3.0,9f62081b-1051-4d2f-b0fc-233fd38ea38b,"I would like to conduct some analysis on some biological traits with regression model. +The response variable is continuous. One important independent variable is the SNP information (wildtype, heterozygous, or homozygous). There are different ways to code it. +It can be treated as a nominal or a ordinal variable (like 1, 2, 3). Any one familar with the difference and any classical references about it? +Thank you for any suggestion. +",deleted 10 characters in body,2013-10-10 19:21:11.417 +185571,57244,674.0,4,,CC BY-SA 3.0,9f62081b-1051-4d2f-b0fc-233fd38ea38b,Genotype coding in regression,deleted 10 characters in body,2013-10-10 19:21:11.417 +186166,57417,594.0,5,,CC BY-SA 3.0,edcc3254-00af-48e6-a623-d55bae71f2dd,"Let $X_1, X_2,\ldots ,X_n$ be discrete random variables. + +I'm looking for a way to prove the random variables are independent but not identically distributed. + +Can anyone suggest some ideas ?",formatting,2013-10-14 03:40:56.357 +185577,57249,19264.0,2,,CC BY-SA 3.0,c104c76e-1ff7-4661-8be2-af1d7ad779d7,"I have [read](http://en.wikipedia.org/wiki/Gamma_distribution#Summation) that the sum of gamma distributions with the same scale parameter is another gamma distribution. I've also seen the paper by [Moschopoulos](http://www.ism.ac.jp/editsec/aism/pdf/037_3_0541.pdf) describing a method for the summation of a general set of gamma distributions. I have tried implementing Moschopoulos's method but have yet to have success. + +What does the summation of a general set of gamma distributions look like? To make this question concrete, what does it look like for: + +$Gamma(3,1) + Gamma(4,2) + Gamma(5,1)$ + +If the parameters above are not particularly revealing, please suggest others.",,2013-10-10 19:49:21.903 +185580,57248,10278.0,5,,CC BY-SA 3.0,f5f0cb95-f9e0-4570-9e2d-5b5a8032d445,"If you treat the variable as ordinal you are assuming a gene-dosage effect. +This is essentially a two degree of freedom test. You are doing a regression. +If you treat the variable as nominal you are not assuming any gene-dosage effect and instead comparing the mean of the three genotype groups this is a one degree of freedom test. You are doing ANOVA with 3 categories. +Hence the gene-dosage model (treating genotypes as ordinal) is more powerful because you are using information about the genotype group (whether you 0, 1 or 2 copies of the wild type allele) whereas in the categorical approach your model knows nothing about the genotype groups (they could just be called A, B and C). 
Treating the genotype as ordinal is the preferred approach. Also I should mention that if you believe that for example the wild-type allele is be dominant then you can merge the heterozygous individuals into the wild-type homozygous group and treat them as one group.",better explanation of the difference between the models,2013-10-10 19:49:42.177 +185583,57250,22577.0,2,,CC BY-SA 3.0,d83f29b1-d736-4cc9-ab55-597c445edc2b,"The scenario is like this: + +I have a cohort with 2000 people, half of them taking DRUG, the other half not taking it. I wanna check the interactions between DRUG and the other variables in the model: + +Method 1: + +Firstly I got a original model:y1=a1*AGE+b1*BMI+c1*DRUG,[DRUG is binary: yes-1, no-0]; i got a likelihood 1; + +If I want to test the interaction of AGE, BMI and DRUG, I need another model:y2=a2*AGE+b2*BMI+c2*DRUG+d*(DRUG*AGE)+e*(DRUG*BMI); i got a likelihood 2; + +Then I compare the likelihood of these two models using chi-square test (df=2), and see whether the difference (likelihood 2 minus likelihood) is significant. + +Method 2: + +Stratify people into two groups according to DRUG status: + +Group 1: for people taking DRUG (n=1000), model 1: y1=a1*AGE+b1*BMI, i got a likelihood 1 (L1); + +Group 2: for people not taking DRUG (n=1000), model 2: y2=a2*AGE+b2*BMI, likelihood 2 (L2); + +Then I use all the people (n-2000), model 3:y3=a3*AGE+b3*BMI+d*(DRUG*AGE)+e*(DRUG*BMI), likelihood 3 (L3); + +So in order to test the interactions, chi-square=L3/(L1*L2). But the question is: What is the degree of freedom (df)?? + +Can anyone help?? Cannot get the answer... Really thanks so much!!! + +Cheers, +GL + +",,2013-10-10 19:57:14.333 +185582,57250,22577.0,1,,CC BY-SA 3.0,d83f29b1-d736-4cc9-ab55-597c445edc2b,how to determine degree of freedom (for test of interaction),,2013-10-10 19:57:14.333 +185581,57250,22577.0,3,,CC BY-SA 3.0,d83f29b1-d736-4cc9-ab55-597c445edc2b,,,2013-10-10 19:57:14.333 +185586,57251,22578.0,3,,CC BY-SA 3.0,0938e44f-219f-4489-95f9-e7698c48dc26,,,2013-10-10 19:59:40.663 +185585,57251,22578.0,1,,CC BY-SA 3.0,0938e44f-219f-4489-95f9-e7698c48dc26,Convergence theorem for Gibbs sampling,,2013-10-10 19:59:40.663 +185584,57251,22578.0,2,,CC BY-SA 3.0,0938e44f-219f-4489-95f9-e7698c48dc26,"The convergence theorem for Gibbs sampling states: + +Given a random Vektor $X$ with $X_1,X_2,...X_K$ and the knowlegde about the conditional distribution of $X_k$ we can find the actual distribution using Gibbs Sampling infinitly often. + +While doing research on this, for a deeper understanding, I ran across [this][1] answer. Which explains quite well how to pick a single sample using the Method, but I am not able to extend/modify it to fit the convergence theorem, as the result of the given example is one sample (spell) and not a final/actual probability distribution. + +**Therefore, how do I have to modify that example to fit the convergence theorem?** + + [1]: http://stats.stackexchange.com/a/10216/31349",,2013-10-10 19:59:40.663 +185589,57252,22580.0,1,,CC BY-SA 3.0,6d8450a9-3b67-4af7-b8e6-6884726113e7,is good to standardize when you have an interaction?,,2013-10-10 19:59:51.410 +185587,57252,22580.0,2,,CC BY-SA 3.0,6d8450a9-3b67-4af7-b8e6-6884726113e7,"I put this question because while reading the benefits of standardizing explanatory variables or not, I read GOOD BUT CONTRASTING opinions about standardizing when there are interaction in the model. Some talk about how problems of collinearity are removed when standadizing (which is basically the case of my GLMM). 
However, other claim that standard errors and p-values of interactions of standardized models are not reliable... +sooo, any ideas on what is the right thing to do? thanks",,2013-10-10 19:59:51.410 +185588,57252,22580.0,3,,CC BY-SA 3.0,6d8450a9-3b67-4af7-b8e6-6884726113e7,,,2013-10-10 19:59:51.410 +185591,57250,674.0,5,,CC BY-SA 3.0,3d4cd37d-754d-48f9-b8a0-c1c029361a6a,"The scenario is like this: + +I have a cohort with 2000 people, half of them taking DRUG, the other half not taking it. I would like to check interactions between DRUG and the other variables in the model: + +* **Method 1:** + + Firstly I got a original model: `y1=a1*AGE+b1*BMI+c1*DRUG`,[`DRUG` is binary: yes-1, no-0]; I got a likelihood 1; + + If I want to test the interaction of `AGE`, `BMI` and `DRUG`, I need another model: `y2=a2*AGE+b2*BMI+c2*DRUG+d*(DRUG*AGE)+e*(DRUG*BMI)`; I got a likelihood 2; + + Then I compare the likelihood of these two models using chi-square test (df=2), and see whether the difference (likelihood 2 minus likelihood) is significant. + +* **Method 2:** + + Stratify people into two groups according to DRUG status: + + Group 1: for people taking DRUG (n=1000), model 1: `y1=a1*AGE+b1*BMI`, I got a likelihood 1 (L1); + + Group 2: for people not taking DRUG (n=1000), model 2: `y2=a2*AGE+b2*BMI`, likelihood 2 (L2); + + Then I use all the people (n-2000), model 3:y3=a3*AGE+b3*BMI+d*(DRUG*AGE)+e*(DRUG*BMI), likelihood 3 (L3); + +So in order to test the interactions, chi-square=L3/(L1*L2). But the question is: What is the degree of freedom (df)? + +Can anyone help? I cannot get the answer.",added 2 characters in body,2013-10-10 20:02:02.067 +185590,57250,674.0,4,,CC BY-SA 3.0,3d4cd37d-754d-48f9-b8a0-c1c029361a6a,How to determine degree of freedom for a certain test of interaction?,added 2 characters in body,2013-10-10 20:02:02.067 +185592,57249,674.0,4,,CC BY-SA 3.0,8aef8156-defd-400b-80d5-5c5eea124362,General sum of Gamma distributions,edited title,2013-10-10 20:02:49.223 +185593,57231,674.0,4,,CC BY-SA 3.0,f877ebf7-12bc-42d6-a66a-b7146390e0b0,Raw return vs. percentage return to calculate volatility,edited title,2013-10-10 20:04:38.187 +185595,57252,674.0,4,,CC BY-SA 3.0,ebb28b21-b3ff-4081-ae1f-13f178eba5bd,What are the pros and cons of standardizing variable in presence of an interaction?,added 1 characters in body; edited title,2013-10-10 20:06:49.197 +186212,57396,,25,,,5344b91a-6a6e-46e0-876c-5fd69a97c12a,,http://twitter.com/#!/StackStats/status/389682357537878016,2013-10-14 09:21:28.723 +185594,57252,674.0,5,,CC BY-SA 3.0,ebb28b21-b3ff-4081-ae1f-13f178eba5bd,"I put this question because while reading the benefits of standardizing explanatory variables or not, I read *good but contrasting* opinions about standardizing when there are interaction in the model. + +Some talk about how problems of collinearity are removed when standardizing (which is basically the case of my GLMM). However, others claim that standard errors and p-values of interactions of standardized models are not reliable... + +So, any ideas on what is the right thing to do?",added 1 characters in body; edited title,2013-10-10 20:06:49.197 +185598,57253,22582.0,3,,CC BY-SA 3.0,3467d558-fd84-4f3b-a2b7-3ce515b743cb,,,2013-10-10 20:15:15.787 +185597,57253,22582.0,1,,CC BY-SA 3.0,3467d558-fd84-4f3b-a2b7-3ce515b743cb,Box Cox Transformation with swift,,2013-10-10 20:15:15.787 +185596,57253,22582.0,2,,CC BY-SA 3.0,3467d558-fd84-4f3b-a2b7-3ce515b743cb,"I am trying to do a box-cox transformation with swift. 
I have a dependent variable, annual foreign sales of companies (in US$ thousands) which contains zeros, for a set of panel data. I have been advised to add a small amount, for example, 0.00001 to the annual foreign sales figures so that I can take the log, but I think box-cox transformation will produce a more appropriate constant than 0.00001. I have done a box-cox transformation on R with the codes below, but it has given me a very large lambda2 of 31162.8. + +library(geoR) +boxcoxfit(bornp$ForeignSales, lambda2 = TRUE) +#R output - Fitted parameters: +# lambda lambda2 beta sigmasq +# -1.023463e+00 3.116280e+04 9.770577e-01 7.140328e-11 + +My hunch is that the above value of lambda2 is very large, so I am not sure if I need to run the boxcoxfit with my independent variables like below: +boxcoxfit(bornp$ForeignSales, bornp$family bornp$roa bornp$solvencyratio, +lambda2=TRUE) + +I am still trying to identify the best set of independent variables, so I am not sure if using the boxcoxfit with independent variables at this stage will work or is best. + +I would be very grateful for any advice on the above. +Thanks, +Jen +",,2013-10-10 20:15:15.787 +185600,57253,668.0,5,,CC BY-SA 3.0,e29c06a1-f65f-4885-8bf9-f909083fc236,"I am trying to do a box-cox transformation with swift. I have a dependent variable, annual foreign sales of companies (in US\$ thousands) which contains zeros, for a set of panel data. I have been advised to add a small amount, for example, 0.00001 to the annual foreign sales figures so that I can take the log, but I think box-cox transformation will produce a more appropriate constant than 0.00001. I have done a box-cox transformation on R with the codes below, but it has given me a very large lambda2 of 31162.8. + + library(geoR) + boxcoxfit(bornp$ForeignSales, lambda2 = TRUE) + #R output - Fitted parameters: + # lambda lambda2 beta sigmasq + # -1.023463e+00 3.116280e+04 9.770577e-01 7.140328e-11 + +My hunch is that the above value of lambda2 is very large, so I am not sure if I need to run the boxcoxfit with my independent variables like below: + + boxcoxfit(bornp$ForeignSales, bornp$family bornp$roa bornp$solvencyratio,lambda2=TRUE) + +I am still trying to identify the best set of independent variables, so I am not sure if using the boxcoxfit with independent variables at this stage will work or is best. + +I would be very grateful for any advice on the above. +",added 25 characters in body; edited tags,2013-10-10 20:17:08.960 +185599,57253,668.0,6,,CC BY-SA 3.0,e29c06a1-f65f-4885-8bf9-f909083fc236,,added 25 characters in body; edited tags,2013-10-10 20:17:08.960 +185601,57206,22555.0,5,,CC BY-SA 3.0,b345b2e3-73e1-4ac8-9399-ec6042bfe5c7,"This question borders on statistics theory too - testing for normality with limited data may be questionable (although we all have done this from time to time). + +As an alternative, you can look at kurtosis and skewness coefficients. If you can get a copy of **Hahn and Shapiro: Statistical Models in Engineering** you can look up the factors Beta 1 and Beta 2 (pages 42 to 49) and the Chart 6-1 of Page 197. The theory behind this can be found on Wikipedia (see Pearson Distribution). + +Basically you need to calculate so called Beta 1 and Beta 2. A Beta 1 of 0 and Beta 2 of 3 suggests that these data are approaching normality. Given this is a rough test but with limited data any test could be considered a rough one. + +Beta 1 is related to the moments 2 and 3, or variance and [skewness](http://en.wikipedia.org/wiki/Skewness), respectively. 
In Excel, these are VAR and SKEW. Where ... is your data array, the formula is: + + Beta1 = SKEW(...)^2/VAR(...)^3 + +Beta 2 is related to the moments 2 and 4, or the variance and [kurtosis](http://en.wikipedia.org/wiki/Kurtosis), respectively. In Excel, these are VAR and KURT. Where ... is your data array, the formula is: + + Beta2 = KURT(...)/VAR(...)^2 + +Then you can check these against the values of 0 and 3, respectively. This has the advantage of potentially identifying another distribution. For example, Uniform, Normal, Student's t, Beta, Gamma, Exponential, and Log-Normal can be indicated: + + Uniform: [0,1.8] [point] + Exponential: [4,9] [point] + Normal: [0,3] [point] + Students-t: (0,3) to [0,10] [line] + Lognormal: (0,3) to [3.6,10] [line] + Gamma: (0,3) to (4,9) [line] + Beta: (0,3) to (4,9), (0,1.8) to (4,9) [area] + Beta J: (0,1.8) to (4,9), (0,1.8) to [4,6*] [area] + Beta U: (0,1.8) to (4,6), [0,1] to [4.5) [area] + Impossible: (0,1) to (4.5), (0,1) to (4,1] [area] + Undefined: (0,3) to (3.6,10), (0,10) to (3,6,10) [area] + + Values of Beta1, Beta2 where brackets mean: + + [ ] : includes (closed) + ( ) : approaches but does not include (open) + * : approximate + +These are illustrated in Hahn and Shapiro Fig 6-1. + +Granted this is a very rough test (with some issues) but you may want to consider it as a preliminary check before going to a more rigorous method. + +There are also adjustment mechanisms to the calculation of Beta1 and Beta2 where data is limited - but that is beyond this post. + + + +",updated table of distributions,2013-10-10 20:25:20.590 +185602,57206,22555.0,5,,CC BY-SA 3.0,7f406071-d5f9-43d9-9e1e-e44c06d3b1f5,"This question borders on statistics theory too - testing for normality with limited data may be questionable (although we all have done this from time to time). + +As an alternative, you can look at kurtosis and skewness coefficients. If you can get a copy of **Hahn and Shapiro: Statistical Models in Engineering** you can look up the factors Beta 1 and Beta 2 (pages 42 to 49) and the Chart 6-1 of Page 197. The theory behind this can be found on Wikipedia (see Pearson Distribution). + +Basically you need to calculate so called Beta1 and Beta2. A Beta1 = 0 and Beta2 = 3 suggests that these data are approaching normality. Given this is a rough test but with limited data any test could be considered a rough one. + +Beta1 is related to the moments 2 and 3, or variance and [skewness](http://en.wikipedia.org/wiki/Skewness), respectively. In Excel, these are VAR and SKEW. Where ... is your data array, the formula is: + + Beta1 = SKEW(...)^2/VAR(...)^3 + +Beta2 is related to the moments 2 and 4, or the variance and [kurtosis](http://en.wikipedia.org/wiki/Kurtosis), respectively. In Excel, these are VAR and KURT. Where ... is your data array, the formula is: + + Beta2 = KURT(...)/VAR(...)^2 + +Then you can check these against the values of 0 and 3, respectively. This has the advantage of potentially identifying other distributions (including Pearson Distributions I, I(U), I(J), II, II(U), III, IV, V, VI, VII). 
For example, Uniform, Normal, Student's t, Beta, Gamma, Exponential, and Log-Normal can be indicated: + + Where: 0 <= Beta1 <= 4 + 1 <= Beta2 <= 10 + + Uniform: [0,1.8] [point] + Exponential: [4,9] [point] + Normal: [0,3] [point] + Students-t: (0,3) to [0,10] [line] + Lognormal: (0,3) to [3.6,10] [line] + Gamma: (0,3) to (4,9) [line] + Beta: (0,3) to (4,9), (0,1.8) to (4,9) [area] + Beta J: (0,1.8) to (4,9), (0,1.8) to [4,6*] [area] + Beta U: (0,1.8) to (4,6), [0,1] to [4.5) [area] + Impossible: (0,1) to (4.5), (0,1) to (4,1] [area] + Undefined: (0,3) to (3.6,10), (0,10) to (3.6,10) [area] + + Values of Beta1, Beta2 where brackets mean: + + [ ] : includes (closed) + ( ) : approaches but does not include (open) + * : approximate + +These are illustrated in Hahn and Shapiro Fig 6-1. + +Granted this is a very rough test (with some issues) but you may want to consider it as a preliminary check before going to a more rigorous method. + +There are also adjustment mechanisms to the calculation of Beta1 and Beta2 where data is limited - but that is beyond this post. + + + +",updated table of distributions,2013-10-10 20:32:52.977 +185658,57270,22593.0,2,,CC BY-SA 3.0,1a8e6a02-0e64-4b5f-857f-40ecb23b1df6,"In a paper of Journal of Chemometrics (Naes & Mevik 2001), the authors propose to make simulations by creating two groups which are different with respect to the smallest eigen vector direction. + +> Blockquote +Here the groups are different with respect to the orthogonal complement to the five ‘NIR loadings’. +This is achieved in the following way. The constant 0 ⋅ 18 is multiplied by a sixth loading vector +(orthogonal to the other five) and added to group 2. Both groups had initially the same means as group +1 +> Blockquote + +How can I compute such a simulation in R? The goal is to obtain group differences which are tied to the ""small eigen-vectors"" space. + +Many thanks, + +Julien",,2013-10-11 00:34:33.700 +185603,57206,22555.0,5,,CC BY-SA 3.0,f6065214-789a-4d69-9be4-a700572f6c72,"This question borders on statistics theory too - testing for normality with limited data may be questionable (although we all have done this from time to time). + +As an alternative, you can look at kurtosis and skewness coefficients. From **Hahn and Shapiro: Statistical Models in Engineering** some background is provided on the properties Beta1 and Beta2 (pages 42 to 49) and the Fig 6-1 of Page 197. Additional theory behind this can be found on Wikipedia (see Pearson Distribution). + +Basically you need to calculate the so-called properties Beta1 and Beta2. A Beta1 = 0 and Beta2 = 3 suggests that the data set approaches normality. This is a rough test but with limited data it could be argued that any test could be considered a rough one. + +Beta1 is related to the moments 2 and 3, or variance and [skewness](http://en.wikipedia.org/wiki/Skewness), respectively. In Excel, these are VAR and SKEW. Where ... is your data array, the formula is: + + Beta1 = SKEW(...)^2/VAR(...)^3 + +Beta2 is related to the moments 2 and 4, or the variance and [kurtosis](http://en.wikipedia.org/wiki/Kurtosis), respectively. In Excel, these are VAR and KURT. Where ... is your data array, the formula is: + + Beta2 = KURT(...)/VAR(...)^2 + +Then you can check these against the values of 0 and 3, respectively. This has the advantage of potentially identifying other distributions (including Pearson Distributions I, I(U), I(J), II, II(U), III, IV, V, VI, VII). 
For example, many of the commonly used distributions such as Uniform, Normal, Student's t, Beta, Gamma, Exponential, and Log-Normal can be indicated from these properties: + + Where: 0 <= Beta1 <= 4 + 1 <= Beta2 <= 10 + + Uniform: [0,1.8] [point] + Exponential: [4,9] [point] + Normal: [0,3] [point] + Students-t: (0,3) to [0,10] [line] + Lognormal: (0,3) to [3.6,10] [line] + Gamma: (0,3) to (4,9) [line] + Beta: (0,3) to (4,9), (0,1.8) to (4,9) [area] + Beta J: (0,1.8) to (4,9), (0,1.8) to [4,6*] [area] + Beta U: (0,1.8) to (4,6), [0,1] to [4.5) [area] + Impossible: (0,1) to (4.5), (0,1) to (4,1] [area] + Undefined: (0,3) to (3.6,10), (0,10) to (3.6,10) [area] + + Values of Beta1, Beta2 where brackets mean: + + [ ] : includes (closed) + ( ) : approaches but does not include (open) + * : approximate + +These are illustrated in Hahn and Shapiro Fig 6-1. + +Granted this is a very rough test (with some issues) but you may want to consider it as a preliminary check before going to a more rigorous method. + +There are also adjustment mechanisms to the calculation of Beta1 and Beta2 where data is limited - but that is beyond this post. + + + +",added 101 characters in body,2013-10-10 20:47:04.627 +185605,57254,22583.0,1,,CC BY-SA 3.0,d3cc2c9c-ba44-4df4-ae89-9e27509931c0,variance of compound variable?,,2013-10-10 20:48:01.520 +185604,57254,22583.0,2,,CC BY-SA 3.0,d3cc2c9c-ba44-4df4-ae89-9e27509931c0,"here is my situation. I am weighting a packet of material that has 10 individual units in it. In the end of the day I would like to know the average weight and variance of the individual units but the problem is that I cannot weight each unit individually since I would have to destroy the packet to get to the individual units. So in lieu of this, I am trying to make an inference of the individual units from what I know about the packets. I weighed 10 packets (hence I have 100 individual units). I was able to figure out the average weight of the units but am having trouble with the variance. Here is what I have done so far: + +$$ +\begin{split} +\bar{y}&=\frac{1}{10}\sum^{10}_{i=1}y_i\\ + &=\frac{1}{10}\sum^{10}_{i=1} (x_{i,1}+x_{i,2}+...+x_{i,10})~since~y_i=x_{i,1}+x_{i,2}+...+x_{i,10}\\ + &=\frac{1}{10}\sum^{100}_{j=1}x_j\\ + &=\frac{1}{10}(100~\bar{x})=10~\bar{x} +\end{split} +$$ + +thus we have the average of $x$, $\bar{x}=\frac{\bar{y}}{10}.$ But now my challenge is how to do I find variance of $x$ given the variance of $y$? Any suggestions? Thanks!",,2013-10-10 20:48:01.520 +185606,57254,22583.0,3,,CC BY-SA 3.0,d3cc2c9c-ba44-4df4-ae89-9e27509931c0,,,2013-10-10 20:48:01.520 +185607,57255,19545.0,3,,CC BY-SA 3.0,6b75e78c-ade3-479f-8f43-88efc5713fd3,,,2013-10-10 20:53:13.273 +185608,57255,19545.0,2,,CC BY-SA 3.0,6b75e78c-ade3-479f-8f43-88efc5713fd3,"It seems to me that normalized ERR scores (ERR scores of your ranking algorithm divided by ERR score calculated for the ground truth ranking) are more useful than the unscaled ERR scores, but I have not seen normalized scores being reported in the literature. 
Is there a good reason that the ERR scores are reported in raw rather than normalized format?",,2013-10-10 20:53:13.273 +185609,57255,19545.0,1,,CC BY-SA 3.0,6b75e78c-ade3-479f-8f43-88efc5713fd3,Why are ERR scores not normalized?,,2013-10-10 20:53:13.273 +185611,57254,22583.0,4,,CC BY-SA 3.0,01077b8e-0e3b-4ca0-bb57-bb4cbc6dd67a,variance of summation/compound variable?,edited title,2013-10-10 20:56:04.337 +185612,57256,668.0,2,,CC BY-SA 3.0,96d03b29-904a-447d-8814-545270fc6bdc,"First, **combine any sums having the same scale factor**: a $\Gamma(n, \beta)$ plus a $\Gamma(m,\beta)$ variate form a $\Gamma(n+m,\beta)$ variate. + +Next, observe that the characteristic function (cf) of $\Gamma(n, \beta)$ is $(1-i \beta t)^{-n}$, whence the cf of a sum of these distributions is the product + +$$\prod_{j} \frac{1}{(1-i \beta_j t)^{n_j}}.$$ + +When the $n_j$ are all *integral,* **this product expands as a partial fraction** into a *linear combination* of $(1-i \beta_j t)^{-\nu}$ where the $\nu$ are integers between $1$ and $n_j$. In the example with $\beta_1 = 1, n_1=8$ (from the sum of $\Gamma(3,1)$ and $\Gamma(5,1)$) and $\beta_2 = 2, n_2=4$ we find + +$$\frac{1}{(1-i t)^{8}}\frac{1}{(1- 2i t)^{4}} = \\ +\frac{1}{(x+i)^8}-\frac{8 i}{(x+i)^7}-\frac{40}{(x+i)^6}+\frac{160 i}{(x+i)^5}+\frac{560}{(x+i)^4}-\frac{1792 i}{(x+i)^3}\\-\frac{5376}{(x+i)^2}+\frac{15360 i}{x+i}+\frac{256}{(2 x+i)^4}+\frac{2048 i}{(2 x+i)^3}-\frac{9216}{(2 x+i)^2}-\frac{30720 i}{2 x+i}.$$ + +The inverse of taking the cf is the inverse Fourier Transform, which is *linear*: that means we may apply it term by term. Each term is recognizable as a multiple of the cf of a Gamma distribution and so is readily **inverted to yield the PDF**. In the example we obtain + +$$\frac{e^{-t} t^7}{5040}+\frac{1}{90} e^{-t} t^6+\frac{1}{3} e^{-t} t^5+\frac{20}{3} e^{-t} t^4+\frac{8}{3} e^{-\frac{t}{2}} t^3+\frac{280}{3} e^{-t} t^3\\ +-128 e^{-\frac{t}{2}} t^2+896 e^{-t} t^2+2304 e^{-\frac{t}{2}} t+5376 e^{-t} t-15360 e^{-\frac{t}{2}}+15360 e^{-t}$$ + +for the PDF of the sum. + +--- + +As a test, here is a histogram of $10^4$ results obtained by adding independent draws from the $\Gamma(8,1)$ and $\Gamma(4,2)$ distributions. On it is superimposed the graph of $10^4$ times the preceding function. The fit is very good. + +![Figure][1] + +--- + +Moschopoulos carries this idea one step further by expanding the cf of the sum into an *infinite* series of Gamma characteristic functions whenever one or more of the $n_i$ is non-integral, and then terminates the infinite series at a point where it is reasonably well approximated. + + + [1]: https://i.stack.imgur.com/sOPCo.png",,2013-10-10 20:58:56.470 +185613,57248,10278.0,5,,CC BY-SA 3.0,da12c19c-42b1-4077-abaa-451e9490a4cb,"If you treat the variable as ordinal you are assuming a gene-dosage effect. +This is essentially a one degree of freedom test. You are testing whether the slope of the regression line is significantly different from $0$. If you treat the variable as nominal you are not assuming any gene-dosage effect and instead you are doing a one way ANOVA with 3 groups so that's a two degrees of freedom test. +The gene-dosage model (treating genotypes as ordinal) is more powerful because you are using information about the genotype group (whether you 0, 1 or 2 copies of the wild type allele) whereas in the categorical approach your model knows nothing about the genotype groups (they could just be called A, B and C). Treating the genotype as ordinal is the preferred approach. 
Also I should mention that if you believe that for example the wild-type allele is dominant then you can merge the heterozygous individuals into the wild-type homozygous group and treat them as one group.",incorporated @andrea's correction about gene-dosage regression test being 1 dof and the 3 group ANOVA being a 2 dof test,2013-10-10 20:59:23.583 +185641,57265,2490.0,2,,CC BY-SA 3.0,bfc8d250-9121-4eba-adef-d8272f13a86e,"I'm designing a pretty simple experiment that goes like this. Participants will be shown a series of stimuli and after viewing each one they will answer a few questions where they will make judgments about the stimulus - all Likert items. There are two kinds of stimuli. Probably obvious, but the hypothesis is that there will be a difference between answers for A vs B stimuli. There will be 30 or so stimuli, with an equal number of A and B stimuli. All participants will see all the stimuli (within-subjects). + +I'm wondering if there would be a benefit to counterbalancing the order in which they receive the items, vs just showing everyone the same randomized sequence of stimuli (which is easier to setup). + +If there's a better method I need to consider, I'd be interested in hearing about it. I also looked into blocking designs, but this is so simple that I don't think those apply here. I'm planning to analyze with t-tests or Mann-Whitney-Wilcoxon.",,2013-10-10 23:12:54.873 +185614,57248,10278.0,5,,CC BY-SA 3.0,eb6fa4ca-f86c-4293-8ad1-ed0bc91c0b69,"If you treat the variable as ordinal you are assuming a gene-dosage effect. +This is essentially a one degree of freedom test since you are testing whether the slope of the regression line is significantly different from $0$. If you treat the variable as nominal you are not assuming any gene-dosage effect and instead you are doing a one way ANOVA with 3 groups so that's a two degrees of freedom test. +The gene-dosage model (treating genotypes as ordinal) is more powerful because you are using information about the genotype groups (whether the group has 0, 1 or 2 copies of the wild type allele) whereas in the categorical approach your model knows nothing about the genotype groups (they could just be called A, B and C). Treating the genotype as ordinal is the preferred approach. Also I should mention that if you believe that for example the wild-type allele is dominant then you can merge the heterozygous individuals into the wild-type homozygous group and treat them as one group.",incorporated @andrea's correction about gene-dosage regression test being 1 dof and the 3 group ANOVA being a 2 dof test,2013-10-10 21:06:45.523 +185615,57256,668.0,5,,CC BY-SA 3.0,5966dd78-4a79-49c7-92d9-3e55b7aed5fe,"First, **combine any sums having the same scale factor**: a $\Gamma(n, \beta)$ plus a $\Gamma(m,\beta)$ variate form a $\Gamma(n+m,\beta)$ variate. + +Next, observe that the characteristic function (cf) of $\Gamma(n, \beta)$ is $(1-i \beta t)^{-n}$, whence the cf of a sum of these distributions is the product + +$$\prod_{j} \frac{1}{(1-i \beta_j t)^{n_j}}.$$ + +When the $n_j$ are all *integral,* **this product expands as a partial fraction** into a *linear combination* of $(1-i \beta_j t)^{-\nu}$ where the $\nu$ are integers between $1$ and $n_j$. 
In the example with $\beta_1 = 1, n_1=8$ (from the sum of $\Gamma(3,1)$ and $\Gamma(5,1)$) and $\beta_2 = 2, n_2=4$ we find + +$$\frac{1}{(1-i t)^{8}}\frac{1}{(1- 2i t)^{4}} = \\ +\frac{1}{(x+i)^8}-\frac{8 i}{(x+i)^7}-\frac{40}{(x+i)^6}+\frac{160 i}{(x+i)^5}+\frac{560}{(x+i)^4}-\frac{1792 i}{(x+i)^3}\\-\frac{5376}{(x+i)^2}+\frac{15360 i}{x+i}+\frac{256}{(2 x+i)^4}+\frac{2048 i}{(2 x+i)^3}-\frac{9216}{(2 x+i)^2}-\frac{30720 i}{2 x+i}.$$ + +The inverse of taking the cf is the inverse Fourier Transform, which is *linear*: that means we may apply it term by term. Each term is recognizable as a multiple of the cf of a Gamma distribution and so is readily **inverted to yield the PDF**. In the example we obtain + +$$\frac{e^{-t} t^7}{5040}+\frac{1}{90} e^{-t} t^6+\frac{1}{3} e^{-t} t^5+\frac{20}{3} e^{-t} t^4+\frac{8}{3} e^{-\frac{t}{2}} t^3+\frac{280}{3} e^{-t} t^3\\ +-128 e^{-\frac{t}{2}} t^2+896 e^{-t} t^2+2304 e^{-\frac{t}{2}} t+5376 e^{-t} t-15360 e^{-\frac{t}{2}}+15360 e^{-t}$$ + +for the PDF of the sum. + +This is a finite *mixture* of Gamma distributions having scale factors equal to those within the sum and shape factors less than or equal to those within the sum. Except in special cases (where some cancellation might occur), the number of terms is given by the total shape parameter $n_1 + n_2 + \cdots$ (assuming all the $n_j$ are different). + +--- + +As a test, here is a histogram of $10^4$ results obtained by adding independent draws from the $\Gamma(8,1)$ and $\Gamma(4,2)$ distributions. On it is superimposed the graph of $10^4$ times the preceding function. The fit is very good. + +![Figure][1] + +--- + +Moschopoulos carries this idea one step further by expanding the cf of the sum into an *infinite* series of Gamma characteristic functions whenever one or more of the $n_i$ is non-integral, and then terminates the infinite series at a point where it is reasonably well approximated. + + + [1]: https://i.stack.imgur.com/sOPCo.png",added 349 characters in body,2013-10-10 21:07:59.390 +185616,57257,10570.0,2,,CC BY-SA 3.0,b3027061-966f-4f2b-8f35-b3f45dcb01c9,"Your question is not really possible to answer unless you have additional information about the situation you are applying this to. + +###Indistinguishable situations + +For the purposes of this, we'll assume that $X$, $Y$, and $Z$ are 0-mean multivariate normal distributions in $\mathbb{R}^d$, and we're interested in one or more spectrum $\sigma_i$ (a vector of size $d$ with decreasing values, yada yada). I refer to the components of the spectrum as _eigenvalues_, without specifying that they're the eigenvalues of the covariance matrix. + +1. The true distribution is $X$ which has spectrum $\sigma_X$ with all non-zero values. There is no error, and we draw a large number of samples, estimating everything very accurately. Clearly all of the ""small"" eigenvalues still have ""information"" and aren't noise. + +2. The true distribution is $Y$ which has a spectrum $\sigma_Y$ with only 3 non-zero eigenvalues. There's noise, though, so we measure $Y+Z$, where $\sigma_Z$ _does_ have all non-zero eigenvalues. Let's suppose $Y$ and $Z$ are such that $\sigma_{Y+Z} = \sigma_X$. Here, it's obvious that all but the top 3 eigenvalues are ""merely noise"". + +My point is just that which parts of the spectrum can be attributed to ""noise"" is not an property of the sample. + +###External criteria + +There potentially are external criteria that can help you distinguish the above situations, but they're sort of problem specific. 
For instance, in the [Netflix Challenge](http://www.netflixprize.com/), a very successful technique for predicting movie ratings was based on SVD (which is also the basis of PCA). When using using SVD-based algorithms for a prediction task, one is confronted with the same challenge you have: _""How many non-zero components do I consider? How far do I reduce the dimensionality?""_ The answer is basically [cross validation](http://en.wikipedia.org/wiki/Cross-validation_(statistics)). The more components you consider, the lower your training error is, but the more risk of overfitting. The validation error is a proxy for generalization error. So, you generally get a chart like: + +![Training/Validation Error as a function of Model Capacity][1] + +If you're not doing a predictive problem, I don't really have useful advice, but I do imagine there might be _something_ you want to measure that can help you define what it _means_ for something to be ""signal"" vs ""noise"" in your application. + + + [1]: https://i.stack.imgur.com/XZJfg.png",,2013-10-10 21:11:46.330 +185617,57128,,25,,,7fc03bde-06eb-4e68-90db-591afd33ff07,,http://twitter.com/#!/StackStats/status/388412947003084800,2013-10-10 21:17:17.657 +185618,57258,6162.0,2,,CC BY-SA 3.0,54f33eda-f5fb-4151-a564-602f7465fd7c,"For information, the random-effect model given by @Henrik: + + > f <- function(x) sqrt(x) + > library(lme4) + > ( fit1 <- lmer(f(Value) ~ Group + (1|Subject), data=dat) ) + Linear mixed model fit by REML ['lmerMod'] + Formula: f(Value) ~ Group + (1 | Subject) + Data: dat + REML criterion at convergence: 296.3579 + Random effects: + Groups Name Std.Dev. + Subject (Intercept) 0.5336 + Residual 0.8673 + Number of obs: 108, groups: Subject, 18 + Fixed Effects: + (Intercept) Group2 Group3 + 3.03718 -0.07541 1.11886 + +is equivalent to a generalized least-squares model with an exchangeable correlation structure for subjects: + + > library(nlme) + > fit2 <- gls(f(Value) ~ Group, data=dat, na.action=na.omit, correlation=corCompSymm(form= ~ 1 | Subject)) + +The fitted variance matrix is then: + + > getVarCov(fit2) + Marginal variance covariance matrix + [,1] [,2] [,3] [,4] [,5] [,6] + [1,] 1.03690 0.28471 0.28471 0.28471 0.28471 0.28471 + [2,] 0.28471 1.03690 0.28471 0.28471 0.28471 0.28471 + [3,] 0.28471 0.28471 1.03690 0.28471 0.28471 0.28471 + [4,] 0.28471 0.28471 0.28471 1.03690 0.28471 0.28471 + [5,] 0.28471 0.28471 0.28471 0.28471 1.03690 0.28471 + [6,] 0.28471 0.28471 0.28471 0.28471 0.28471 1.03690 + Standard Deviations: 1.0183 1.0183 1.0183 1.0183 1.0183 1.0183 + +As you can see, the diagonal entry corresponds to the total variance in the first model: + + > VarCorr(fit1) + Groups Name Std.Dev. + Subject (Intercept) 0.53358 + Residual 0.86731 + > 0.53358^2+0.86731^2 + [1] 1.036934 + +and the covariance corresponds to the within-subject variance: + + > 0.53358^2 + [1] 0.2847076 + +Actually the gls model is more general because it allows a negative covariance. The advantage of `nlme` is that you can more generally use other repeated correlation structures and also you can specify different variances per group with the `weights` argument. + +I think that residuals are different because they are constructed with the random-effects in the first model. In order to get multiple comparisons you can use the `lsmeans` and the `multcomp` packages, but the $p$-values of the hypotheses tests are anticonservative with defaults (too high) degrees of freedom. 
Unfortunately, the `pbkrtest` package does not apply to `gls`/`lme` models.",,2013-10-10 21:31:42.880 +185619,57251,22578.0,5,,CC BY-SA 3.0,78062366-3004-4e84-aa29-8d83d7b4a011,"The convergence theorem for Gibbs sampling states: + +Given a random Vektor $X$ with $X_1,X_2,...X_K$ and the knowlegde about the conditional distribution of $X_k$ we can find the actual distribution using Gibbs Sampling infinitly often. + +The exact theorem as stated by book (Neural Networks and Learning Machines): +> The random variable $X_k(n)$ +> converges in distribution to the true probabiluty distributions of +> $X_k$ for k=1,2,...,K as n approaches infinity +> +> $\lim_{n -> \infty}P(x^{(n)}_k \leq x | x_k(0)) = P_{x_k}(x) $ for $k +> = 1,2,...,K$ +> +> Where $P_{X_k}(x)$ is the marginal cummulative distribution function +> of $X_k$ + +While doing research on this, for a deeper understanding, I ran across [this][1] answer. Which explains quite well how to pick a single sample using the Method, but I am not able to extend/modify it to fit the convergence theorem, as the result of the given example is one sample (spell) and not a final/actual probability distribution. + +**Therefore, how do I have to modify that example to fit the convergence theorem?** + + [1]: http://stats.stackexchange.com/a/10216/31349",added in quotation for convergence theorem,2013-10-10 21:36:18.333 +185661,57270,,24,,CC BY-SA 3.0,2bae73d1-069e-46ee-bad8-4a8a1acc2752,,Proposed by 22468 approved by -1 edit id of 5592,2013-10-11 01:02:37.520 +185620,57259,6162.0,2,,CC BY-SA 3.0,627aad7e-2789-4bbd-b45a-d6076c84bb16,"Now, try to you write down the model: $y_{ijk} = ...$ where $y_{ijk}$ is the $k$-th value for individual $j$ of group $i$. Then look at what happens for the means $\bar y_{ij\bullet}$: you get a classical Gaussian linear model, with variance homogeneity because there are $6$ repeated measures for each subject: + + > xtabs(~Group+Subject, data=dat) + Subject + Group 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 + 1 6 6 6 6 6 6 6 0 0 0 0 0 0 0 0 0 0 0 + 2 0 0 0 0 0 0 0 6 6 6 6 6 6 6 6 0 0 0 + 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 6 6 + +Thus, since you are interested in mean comparisons only, **you don't need to resort to a random-effect or generalised lead-squares model** - just use a classical (fixed effects) model using the means $\bar y_{ij\bullet}$ as the observations: + + tdat <- transform(dat, tvalue=f(Value)) + dd <- aggregate(tvalue~Group+Subject, data=tdat, FUN=mean) + fit3 <- lm(tvalue~Group, data=dd) + +The ANOVA provides the same answer as @Henrik's approach (and that shows that Kenward-Rogers approximation is correct): + + > anova(fit3) + Analysis of Variance Table + + Response: tvalue + Df Sum Sq Mean Sq F value Pr(>F) + Group 2 3.3799 1.68994 4.121 0.03747 * + +Then you can use `TukeyHSD()` or the `lsmeans` package for multiple comparisons: + + > TukeyHSD(aov(fit3), ""Group"") + Tukey multiple comparisons of means + 95% family-wise confidence level + + Fit: aov(formula = fit3) + + $Group + diff lwr upr p adj + 2-1 -0.07541248 -0.93627828 0.7854533 0.9719148 + 3-1 1.11885667 -0.02896441 2.2666777 0.0565628 + 3-2 1.19426915 0.06817536 2.3203629 0.0370434 + + > library(lsmeans) + > lsmeans(fit3, pairwise~Group) + + $`Group pairwise differences` + estimate SE df t.ratio p.value + 1 - 2 0.07541248 0.3314247 15 0.22754 0.97191 + 1 - 3 -1.11885667 0.4418996 15 -2.53193 0.05656 + 2 - 3 -1.19426915 0.4335348 15 -2.75472 0.03704 + p values are adjusted using the tukey method for 3 means ",,2013-10-10 21:54:03.493 
+185621,57260,4779.0,2,,CC BY-SA 3.0,2eee46ca-be30-4256-9202-c772be7bca60,"Looking at this as an outlier problem seems wrong to me. If ""< 10% of users spend at all"", you need to model that aspect. Tobit or Heckman regression would be two possibilities.",,2013-10-10 21:55:57.283 +185622,57065,5237.0,6,,CC BY-SA 3.0,e878b58f-8160-457e-af1b-b71dd952b5d6,,added tag,2013-10-10 22:00:36.617 +185623,57261,16703.0,2,,CC BY-SA 3.0,84755109-3175-48b2-b7dd-3b7e0edbe2a0,"I have $N$ pmfs, and for each each $L$ samples. Each sample has a variable amount of $x$ values, but the $x$ values that they have can be matched. So for example $sample_1 \rightarrow\ x_1 = 0, x_2 = 0, x_3 = 0.2, x_4 = 0.4, x_5 = 0.4\\sample_2 \rightarrow\ x_1 = 0.3,x_2=0, x_3 = 0.4, x_4 = 0.3,x_5=0$. + +I'm using a python program (https://pypi.python.org/pypi/dirichlet/0.7) to calculate the mle from the samples which is a port of Thomas P. Minka's Matlab Fastfit code ([Estimating a Dirichlet Distribution][1]). The problem is that, for fitting, it sums over $logp+psi(\sum^k a_k) - logp$. Since some of the $x$ values are 0, some logp values are -inf. Therefore, summing over this makes everything -inf. + +How can I deal with 0 values when calculating the mle for a Dirichlet distribution? + + + [1]: http://research.microsoft.com/en-us/um/people/minka/papers/dirichlet/",,2013-10-10 22:12:17.237 +185625,57261,16703.0,3,,CC BY-SA 3.0,84755109-3175-48b2-b7dd-3b7e0edbe2a0,,,2013-10-10 22:12:17.237 +185624,57261,16703.0,1,,CC BY-SA 3.0,84755109-3175-48b2-b7dd-3b7e0edbe2a0,How to handle different amount of x_k values per sample when calculating mle for Dirichlet distribution?,,2013-10-10 22:12:17.237 +185626,57253,594.0,5,,CC BY-SA 3.0,ddb36712-60e3-4766-8ae6-92492a025859,"I am trying to do a box-cox transformation with swift. I have a dependent variable, annual foreign sales of companies (in US\$ thousands) which contains zeros, for a set of panel data. I have been advised to add a small amount, for example, 0.00001 to the annual foreign sales figures so that I can take the log, but I think box-cox transformation will produce a more appropriate constant than 0.00001. I have done a box-cox transformation on R with the codes below, but it has given me a very large lambda2 of 31162.8. + + library(geoR) + boxcoxfit(bornp$ForeignSales, lambda2 = TRUE) + #R output - Fitted parameters: + # lambda lambda2 beta sigmasq + # -1.023463e+00 3.116280e+04 9.770577e-01 7.140328e-11 + +My hunch is that the above value of lambda2 is very large, so I am not sure if I need to run the boxcoxfit with my independent variables like below: + + boxcoxfit(bornp$ForeignSales, bornp$family bornp$roa bornp$solvencyratio,lambda2=TRUE) + +I am still trying to identify the best set of independent variables, so I am not sure if using the boxcoxfit with independent variables at this stage will work or is best. + +Here's the description of the two lambda parameters from the help: + +`lambda      ` numerical value(s) for the transformation parameter $\lambda$. Used as the initial value +`            ` in the function for parameter estimation. If not provided default values are as- +`            ` sumed. If multiple values are passed the one with highest likelihood is used as +`            ` initial value. +`lambda2     ` logical or numerical value(s) of the additional transformation (see DETAILS +`            ` below). Defaults to `NULL`. If `TRUE` this parameter is also estimated and the initial +`            ` value is set to the absolute value of the minimum data. 
A numerical value is +`            ` provided it is used as the initial value. Multiple values are allowed as for +`            ` lambda. + +I would be very grateful for any advice on the above.",added 820 characters in body,2013-10-10 22:32:03.283 +185627,57258,6162.0,5,,CC BY-SA 3.0,082e7bf4-b12d-4c93-b47e-2e3483510057,"For information, the random-effect model given by @Henrik: + + > f <- function(x) sqrt(x) + > library(lme4) + > ( fit1 <- lmer(f(Value) ~ Group + (1|Subject), data=dat) ) + Linear mixed model fit by REML ['lmerMod'] + Formula: f(Value) ~ Group + (1 | Subject) + Data: dat + REML criterion at convergence: 296.3579 + Random effects: + Groups Name Std.Dev. + Subject (Intercept) 0.5336 + Residual 0.8673 + Number of obs: 108, groups: Subject, 18 + Fixed Effects: + (Intercept) Group2 Group3 + 3.03718 -0.07541 1.11886 + +is equivalent to a generalized least-squares model with an exchangeable correlation structure for subjects: + + > library(nlme) + > fit2 <- gls(f(Value) ~ Group, data=dat, na.action=na.omit, correlation=corCompSymm(form= ~ 1 | Subject)) + +The fitted variance matrix is then: + + > getVarCov(fit2) + Marginal variance covariance matrix + [,1] [,2] [,3] [,4] [,5] [,6] + [1,] 1.03690 0.28471 0.28471 0.28471 0.28471 0.28471 + [2,] 0.28471 1.03690 0.28471 0.28471 0.28471 0.28471 + [3,] 0.28471 0.28471 1.03690 0.28471 0.28471 0.28471 + [4,] 0.28471 0.28471 0.28471 1.03690 0.28471 0.28471 + [5,] 0.28471 0.28471 0.28471 0.28471 1.03690 0.28471 + [6,] 0.28471 0.28471 0.28471 0.28471 0.28471 1.03690 + Standard Deviations: 1.0183 1.0183 1.0183 1.0183 1.0183 1.0183 + +As you can see, the diagonal entry corresponds to the total variance in the first model: + + > VarCorr(fit1) + Groups Name Std.Dev. + Subject (Intercept) 0.53358 + Residual 0.86731 + > 0.53358^2+0.86731^2 + [1] 1.036934 + +and the covariance corresponds to the between-subject variance: + + > 0.53358^2 + [1] 0.2847076 + +Actually the gls model is more general because it allows a negative covariance. The advantage of `nlme` is that you can more generally use other repeated correlation structures and also you can specify different variances per group with the `weights` argument. + +I think that residuals are different because they are constructed with the random-effects in the first model. In order to get multiple comparisons you can use the `lsmeans` and the `multcomp` packages, but the $p$-values of the hypotheses tests are anticonservative with defaults (too high) degrees of freedom. Unfortunately, the `pbkrtest` package does not apply to `gls`/`lme` models.",added 1 characters in body,2013-10-10 22:40:30.343 +185630,57262,22587.0,3,,CC BY-SA 3.0,ecde6e73-992d-409a-82d3-feae12f3da22,,,2013-10-10 22:54:50.603 +185629,57262,22587.0,1,,CC BY-SA 3.0,ecde6e73-992d-409a-82d3-feae12f3da22,Poisson Distribution vs multiplying probabilities,,2013-10-10 22:54:50.603 +185644,57264,633.0,5,,CC BY-SA 3.0,aaa94539-10a7-4d48-a8d5-2aa318ce24e9,"The Poisson process that you're using assumes that 0.05 is the expected number of computers failing in one day in an unknown number of total computers (your answer also assumes that this rate is fixed after a computer fails, which implies that computers can fail multiple times, or are replaced immediately, or there are so many of them that this is negligible). + +The independent probability that the student is using assumes that there are exactly four computers each of which has a 5% chance of failing. 
+ +The wording makes it sound to me like 5% is the chance of any individual computer failing (so the second interpretation). In that case, we want to know the total number of computers and apply a binomial distribution. Since the question doesn't give the total number of computers, it can't be answered. + +Another possibility is that 5% is the probability that exactly one computer fails, and yet another possibility is that 5% is the probability that at least one computer fails. In either case you might be able to somehow deduce the Poisson process intensity that gives this value. From there you could calculate similarly to how you did.",added 407 characters in body,2013-10-10 23:17:04.727 +185628,57262,22587.0,2,,CC BY-SA 3.0,ecde6e73-992d-409a-82d3-feae12f3da22,"I am a TA for a stats course for engineers, and I had a really good question from a student today, which I don't know the answer to. + +We were going through the following word problem: + +""Some computers run continuously for the Toronto Stock Exchange. The probability of a computer to fail in a day is estimated at 5%. Assuming differing computers fail independently, what is the probability that 4 computers fail in a day?"" + +Since the sampling takes place over an interval, the way I would approach this is using the Poisson distribution, with the average number of computers failing on a day $\equiv\lambda = 0.05$. If four computers fail, then $k = 4$. Thus we can use the poisson distribution: +\begin{align*} + P(k; \lambda) &= \frac{\lambda^{k} e^{-\lambda}}{k!} \\ + P(k=4; \lambda = 0.05) &= \frac{0.05^{4} e^{-0.05}}{4!} \\ + & = 2.477\times 10^{-7} +\end{align*} + +However, a student asked why it would not be appropriate to just multiply the probability of each computer failing. Since the probability of each computer failing each day $\equiv p = 0.05$, and since each computer failure is independent, he argued that, + +\begin{align*} + P(k=4) &= p^4 \\ + &= 0.05^4 = 6.25\times 10^{-6} +\end{align*} + +Which one of these approaches is wrong given the question? And why? What underlying assumption of the wrong approach is violated by the question? + +Thank you for your help.",,2013-10-10 22:54:50.603 +185633,57263,22585.0,3,,CC BY-SA 3.0,798908e6-410c-41a4-b394-e5be05ca8bb8,,,2013-10-10 23:01:51.663 +185632,57263,22585.0,1,,CC BY-SA 3.0,798908e6-410c-41a4-b394-e5be05ca8bb8,How to separate out the regression effect vs treatment effect without a control group?,,2013-10-10 23:01:51.663 +185631,57263,22585.0,2,,CC BY-SA 3.0,798908e6-410c-41a4-b394-e5be05ca8bb8,"I'm looking at a dataset that has pre-post test measurements on users' stress, depression and anxiety levels collected from a website's online health assessment. On average, the healthier participants at baseline got worse over time, and the sicker participants at baseline got much better, and the middle group gets a little better. There's definitely a regression effect going on here, but also a treatment effect too. + +As this data was collected based on website usage, there isn't really a control group (all of the ""post"" measurements come from people that have used the online program). There are probably ways that I could synthesize a control group using the people who I can guess didn't make much use out of the treatment (based on number of logins or length of time between logins), but is there a way to separate out the treatment effect from the regression effect when you can't use difference-in-difference techniques using a control group or anything like that? + +Thanks! 
+",,2013-10-10 23:01:51.663 +185634,57264,633.0,2,,CC BY-SA 3.0,da5e2452-da9f-4430-a226-af989872e4f7,"The Poisson process that you're using assumes that 0.05 is the expected number of computers failing in one day in an unknown number of total computers (your answer also assumes that computers failing multiple times is allowed or else there is such a large number of computers that this is negligible). + +The independent probability that the student is using assumes that there are exactly four computers each of which has a 5% chance of failing. + +The wording makes it sound to me like 5% is the chance of any individual computer failing (so the second interpretation). In that case, we want to know the total number of computers and apply a binomial distribution. Since the question doesn't give the total number of computers, it can't be answered.",,2013-10-10 23:04:27.343 +185637,57261,,24,,CC BY-SA 3.0,980a7909-9631-4a55-8311-f526126d80e4,,Proposed by 22468 approved by 88 edit id of 5591,2013-10-10 23:06:08.977 +185636,57261,16174.0,4,,CC BY-SA 3.0,980a7909-9631-4a55-8311-f526126d80e4,Dealing with 0 values when calculating the mle for a Dirichlet distribution,"shortened title for readability, embedded link for readability",2013-10-10 23:06:08.977 +185635,57261,16174.0,5,,CC BY-SA 3.0,980a7909-9631-4a55-8311-f526126d80e4,"I have $N$ pmfs, and for each each $L$ samples. Each sample has a variable amount of $x$ values, but the $x$ values that they have can be matched. So for example: + +$$sample_1 \rightarrow\ x_1 = 0, x_2 = 0, x_3 = 0.2, x_4 = 0.4, x_5 = 0.4$$ +$$sample_2 \rightarrow\ x_1 = 0.3,x_2=0, x_3 = 0.4, x_4 = 0.3,x_5=0$$ + +I'm using a [python program][1] to calculate the mle from the samples which is a port of Thomas P. Minka's Matlab Fastfit code ([Estimating a Dirichlet Distribution][2]). + +The problem is that, for fitting, it sums over $logp+psi(\sum^k a_k) - logp$. Since some of the $x$ values are 0, some logp values are -inf. Therefore, summing over this makes everything -inf. + +How can I deal with 0 values when calculating the mle for a Dirichlet distribution? + +[1]:https://pypi.python.org/pypi/dirichlet/0.7 + [2]: http://research.microsoft.com/en-us/um/people/minka/papers/dirichlet/","shortened title for readability, embedded link for readability",2013-10-10 23:06:08.977 +185638,57262,,4,user88,CC BY-SA 3.0,f10f3f1f-23ad-4901-94df-c773358d0284,Poisson distribution vs multiplying probabilities,edited title,2013-10-10 23:06:47.840 +185640,57264,633.0,5,,CC BY-SA 3.0,ce2856b1-fb8d-4834-914d-b1d67253e027,"The Poisson process that you're using assumes that 0.05 is the expected number of computers failing in one day in an unknown number of total computers (your answer also assumes that this rate is fixed after a computer fails, which implies that computers can fail multiple times, or are replaced immediately, or there are so many of them that this is negligible). + +The independent probability that the student is using assumes that there are exactly four computers each of which has a 5% chance of failing. + +The wording makes it sound to me like 5% is the chance of any individual computer failing (so the second interpretation). In that case, we want to know the total number of computers and apply a binomial distribution. Since the question doesn't give the total number of computers, it can't be answered. + +Another possibility is that 5% is the probability that exactly one computer fails, and yet another possibility is that 5% is the probability that at least one computer fails. 
In either case you could deduce the Poisson process intensity that gives this value (using the inverse cdf). From there you could calculate similarly to how you did.",added 407 characters in body,2013-10-10 23:11:27.527 +185643,57265,2490.0,3,,CC BY-SA 3.0,bfc8d250-9121-4eba-adef-d8272f13a86e,,,2013-10-10 23:12:54.873 +185642,57265,2490.0,1,,CC BY-SA 3.0,bfc8d250-9121-4eba-adef-d8272f13a86e,Simple experimental design - should I counterbalance?,,2013-10-10 23:12:54.873 +185646,57262,22587.0,4,,CC BY-SA 3.0,a4679d86-a32d-4433-900a-2dacc6f73eab,Poisson distribution vs multiplying probabilities,added 819 characters in body; edited title,2013-10-10 23:20:23.123 +185662,57270,5237.0,5,,CC BY-SA 3.0,fa00ad39-4246-43ec-8713-d84a70bdf051,"In a paper of Journal of Chemometrics (Naes & Mevik 2001), the authors propose to make simulations by creating two groups which are different with respect to the smallest eigenvector direction. + +>Here the groups are different with respect to the orthogonal complement to the five ‘NIR loadings’. This is achieved in the following way. The constant 0 ⋅ 18 is multiplied by a sixth loading vector (orthogonal to the other five) and added to group 2. Both groups had initially the same means as group 1 + + +How can I compute such a simulation in R? The goal is to obtain group differences which are tied to the ""small eigenvectors"" space.",removed signature,2013-10-11 01:02:37.520 +185645,57262,22587.0,5,,CC BY-SA 3.0,a4679d86-a32d-4433-900a-2dacc6f73eab,"I am a TA for a stats course for engineers, and I had a really good question from a student today, which I don't know the answer to. + +We were going through the following word problem: + +""4 computers run continuously for the Toronto Stock Exchange. The probability of a computer failure in a day is estimated at 5%. Assuming differing computers fail independently, what is the probability that all 4 computers fail in a day?"" + +Since the sampling takes place over an interval, the way I would approach this is using the Poisson distribution, with the average number of computers failing on a day $\equiv\lambda = 0.05$. If four computers fail, then $k = 4$. Thus, +\begin{align*} + P(k; \lambda) &= \frac{\lambda^{k} e^{-\lambda}}{k!} \\ + P(k=4; \lambda = 0.05) &= \frac{0.05^{4} e^{-0.05}}{4!} \\ + & = 2.477\times 10^{-7} +\end{align*} + +However, a student asked why it would not be appropriate to just multiply the probability of each computer failing. Since the probability of each computer failing each day $\equiv p = 0.05$, and since each computer failure is independent, he argued that, + +\begin{align*} + P(k=4) &= p^4 \\ + &= 0.05^4 = 6.25\times 10^{-6} +\end{align*} + +Which one of these approaches is wrong given the question? And why? What underlying assumption of the wrong approach is violated by the question? + +Thank you for your help. + +UPDATE: I left out some information in the problem the first time this was posted, and I apologize.",added 819 characters in body; edited title,2013-10-10 23:20:23.123 +185647,57266,18040.0,2,,CC BY-SA 3.0,68af776e-8425-4104-843a-215ceac92894,"I've run into this problem before. Often times it's due to a problem with passing negative values to a distribution that doesn't allow them. So perhaps your log normal or wishart distributions are getting negative values somehow. + +Another issue is that you are giving priors on your variance parameters that is quite big. This can cause unexpected values to be passed to your log-normald and wishart variables. 
I'd start by severely constraining the variance priors and slowly expand the parameter space. In my experience fitting BUGS models is both and art and a science. +",,2013-10-10 23:22:27.973 +185648,57264,633.0,5,,CC BY-SA 3.0,c96a8328-592b-42bd-972a-b90f27b97f5b,"The Poisson process that you're using assumes that 0.05 is the expected number of computers failing in one day in an unknown number of total computers (your answer also assumes that this rate is fixed after a computer fails, which implies that computers can fail multiple times, or are replaced immediately, or there are so many of them that this is negligible). + +The independent probability that the student is using assumes that there are exactly four computers each of which has a 5% chance of failing. + +The wording makes it sound to me like 5% is the chance of any individual computer failing (so the second interpretation). In that case, we want to know the total number of computers and apply a binomial distribution. Since the question doesn't give the total number of computers, it can't be answered. + +Another possibility is that 5% is the probability that exactly one computer fails, and yet another possibility is that 5% is the probability that at least one computer fails. In either case you can deduce the Poisson process intensity that gives this value. For the first of these, I get 4.4997552907483822; for the second, I get an intensity of 0.051293294149203306. From there you could calculate similarly to how you did. + +--- + +Per your update: You can eliminate the Poisson process since you don't have a fixed rate. You still have to decide whether 5% is the probability of a given computer failing, in which case the student is right. If it's the probability of at least one computer failing, or the probability of exactly one computer failing, you'll have to reason back from that number to the probability of any individual computer failing before reasoning forwards.",added 90 characters in body,2013-10-10 23:27:17.640 +185651,57267,22591.0,2,,CC BY-SA 3.0,b64f0cdf-b047-40d6-b92f-a9fa8f89a6d7,"Here is my situation: +- A huge amount of data +- 600 features +- Only one class is provided +Now, my question is how can I reduce the number of features to important ones? In another word, all of these features (with data) are intending to predict only one class. but some of features have large impact on the prediction (means their variation come to higher probability). ",,2013-10-10 23:28:22.503 +185650,57267,22591.0,1,,CC BY-SA 3.0,b64f0cdf-b047-40d6-b92f-a9fa8f89a6d7,Extract important features,,2013-10-10 23:28:22.503 +185649,57267,22591.0,3,,CC BY-SA 3.0,b64f0cdf-b047-40d6-b92f-a9fa8f89a6d7,,,2013-10-10 23:28:22.503 +185654,57268,18040.0,3,,CC BY-SA 3.0,480333e6-1dc5-482e-95c6-4781d1488162,,,2013-10-10 23:42:09.300 +185653,57268,18040.0,1,,CC BY-SA 3.0,480333e6-1dc5-482e-95c6-4781d1488162,Difference between two different mixed effects models,,2013-10-10 23:42:09.300 +185652,57268,18040.0,2,,CC BY-SA 3.0,480333e6-1dc5-482e-95c6-4781d1488162,"I have a question about how to tell two different mixed effects models apart. In the simple case both involve fitting a model with a random group effect and a covariate. I fit the model with `lme4` in `R`. Here is a visualization of the two different scenarios. 
+![enter image description here][1] + + library(ggplot2) + library(lme4) + gen_dat2 <- function(group.m,group.v,int, sl,n){ + x <- vector() + y <- vector() + g <- vector() + for(i in 1:length(group.m)){ + x.t <- rnorm(n,group.m[i],group.v[i]) + y.t <- rnorm(n,group.m[i],group.v[i])+int + sl*x.t + x <- c(x,x.t) + y <- c(y,y.t) + g <- c(g,rep(i,n)) + } + return(cbind(x,y,g)) + } + + group.m <- runif(5,1,20) + group.v <- runif(5,1,1.5) + + dat2 <- data.frame(gen_dat2(group.m,group.v,1,4,14)) + ggplot(dat2,aes(x=x,y=y,colour=as.factor(g),group=g))+geom_point()+stat_smooth(method=""lm"",se=F) + m2 <- lmer(y~x + (x|g),data=dat2) + + + +Then I can generate and fit the other scenario with similar code: + +![enter image description here][2] + + gen_dat <- function(group.m,group.v,int, sl,n){ + x <- vector() + y <- vector() + g <- vector() + for(i in 1:length(group.m)){ + x.t <- rnorm(n,0,1) + y.t <- rnorm(n,group.m[i],group.v[i])+int + sl*x.t + x <- c(x,x.t) + y <- c(y,y.t) + g <- c(g,rep(i,n)) + } + return(cbind(x,y,g)) + } + + group.m <- runif(5,1,20) + group.v <- runif(5,1,1.5) + + dat1 <- data.frame(gen_dat(group.m,group.v,1,4,14)) + ggplot(dat1,aes(x=x,y=y,colour=as.factor(g),group=g))+geom_point()+stat_smooth(method=""lm"",se=F) + m1 <- lmer(y~x + (x|g),data=dat1) + + +My central question is how do I tell these two models apart? Am I incorrectly fitting the first one, and I need an extra term in there to model the relationships between groups and the x variable as well as y? Both detect substantial between group variation in the intercept and not much in the slope as I would predict. But I need a way to tell these two apart. Any thoughts would be helpful. + + [1]: https://i.stack.imgur.com/GgxYw.png + [2]: https://i.stack.imgur.com/pgGm7.png",,2013-10-10 23:42:09.300 +185655,57268,18040.0,4,,CC BY-SA 3.0,cf3af755-be00-4568-a10d-a1185bd1c214,Difference between two mixed effects models,edited title,2013-10-10 23:47:36.853 +185656,57269,1741.0,2,,CC BY-SA 3.0,09f3a2ce-cf9a-4b24-8296-029e4a18ea93,"Given that your model exhibits good accuracy you can just use it to predict the class labels of records in the unlabeled dataset. However, you cannot evaluate the performances on unlabeled data. + +Be careful that you should assess the quality of your model on the labeled data by cross-validation. It is not enough to check the training error rate. + +If your model is not accurate enough you might think about [semi-supervised learning][1]. The unlabeled data is used in order to improve the quality of your model via inductive learning. The accuracy should always be computed by cross-validation on your labeled data. + +Have a look at [ Crimisini et al. Decision Forests: A Unified Framework +for Classification, Regression, Density Estimation, Manifold Learning and +Semi-Supervised Learning ] Chapter 7 about semi-supervised learning and 7.4 about induction with semi-supervised learning. + + [1]: http://en.wikipedia.org/wiki/Semi-supervised_learning",,2013-10-11 00:17:06.150 +185657,57262,,25,,,5c74984b-cdcf-4a12-9280-19d0b33d5d4a,,http://twitter.com/#!/StackStats/status/388458238112698368,2013-10-11 00:17:15.923 +185663,57270,16174.0,5,,CC BY-SA 3.0,2bae73d1-069e-46ee-bad8-4a8a1acc2752,"In a paper of Journal of Chemometrics (Naes & Mevik 2001), the authors propose to make simulations by creating two groups which are different with respect to the smallest eigen-vector direction. + +>Here the groups are different with respect to the orthogonal complement to the five ‘NIR loadings’. 
+This is achieved in the following way. The constant 0 ⋅ 18 is multiplied by a sixth loading vector (orthogonal to the other five) and added to group 2. Both groups had initially the same means as group 1 + + +How can I compute such a simulation in R? The goal is to obtain group differences which are tied to the ""small eigen-vectors"" space.",removed signature,2013-10-11 01:02:37.520 +185664,57271,20473.0,2,,CC BY-SA 3.0,63f55c7e-7004-4c5d-a5e1-c1d55a6d016e,"Write your system explicitly for time $t$ as (""$L$"" for ""loss"", as a positive quantity, and ""$G$"" for ""gain"") +$$ A_t - A_{t-1} = - L^A_{t-1} + G_{t-1}^{B\rightarrow A}+G_{t-1}^{C\rightarrow A}$$ + +$$ B_t - B_{t-1} = - L^B_{t-1} + G_{t-1}^{A\rightarrow B}+G_{t-1}^{C\rightarrow B}$$ + +$$ C_t - C_{t-1} = - L^C_{t-1} + G_{t-1}^{A\rightarrow C}+G_{t-1}^{B\rightarrow C}$$ + +The following three relations hold exactly: +$$ L^A_{t-1} = G_{t-1}^{A\rightarrow B} + G_{t-1}^{A\rightarrow C} $$ +$$ L^B_{t-1} = G_{t-1}^{B\rightarrow A} + G_{t-1}^{B\rightarrow C} $$ +$$ L^C_{t-1} = G_{t-1}^{C\rightarrow A} + G_{t-1}^{C\rightarrow B} $$ + +If you substitute in the first three you obtain + +$$ A_t - A_{t-1} = - G_{t-1}^{A\rightarrow B} - G_{t-1}^{A\rightarrow C} + G_{t-1}^{B\rightarrow A}+G_{t-1}^{C\rightarrow A}$$ + +$$ B_t - B_{t-1} = - G_{t-1}^{B\rightarrow A} - G_{t-1}^{B\rightarrow C} + G_{t-1}^{A\rightarrow B}+G_{t-1}^{C\rightarrow B}$$ + +$$ C_t - C_{t-1} = - G_{t-1}^{C\rightarrow A} - G_{t-1}^{C\rightarrow B} + G_{t-1}^{A\rightarrow C}+G_{t-1}^{B\rightarrow C}$$ + +You have $6$ unknown quantities to estimate _per time period_. There is just not enough information to do that. So you need assumptions that will impose structure (=restrictions) on the situation, and will permit you to estimate _something_. What? Let's say you assume that there is a relatively stable ""churn"" from one company to another, as a linear function of their market share in the previous period. This assumption brings in a set of unknown coefficients to be estimated (which will then give you an estimate of ""hidden transfers of market share""). Write $G_{t-1}^{A\rightarrow B} = a_bA_{t-1}$ (market share lost from $A$ to $B$ as a linear function of $A$'s market share in period $t-1$). +Your equations will become + +$$ A_t - A_{t-1} = - a_bA_{t-1} - a_cA_{t-1} + b_aB_{t-1}+c_aC_{t-1} $$ + +$$ B_t - B_{t-1} = - b_aB_{t-1} - b_cB_{t-1} + a_bA_{t-1}+c_bC_{t-1}$$ + +$$ C_t - C_{t-1} = - c_aC_{t-1} - c_bC_{t-1} + a_cA_{t-1}+ b_cB_{t-1}$$ + +We have turn a set of mathematical identities into a _model_. It is doubtful that this model will hold exactly for each $t$, so you should add a stochastic error term. 
Rearranging we obtain a first-order Vector Autoregression (VAR): + +$$ \left[ \begin{matrix} +A_t \\ +B_t \\ +C_t \\ +\end{matrix} \right] = \left [\begin{matrix} +1-a_b-a_c & b_a & c_a \\ +a_b & 1-b_a-b_c & c_b \\ +a_c & b_c & 1-c_a-c_b \\ +\end{matrix} \right] \left[ \begin{matrix} +A_{t-1} \\ +B_{t-1} \\ +C_{t-1} \\ +\end{matrix} \right]+ \left[ \begin{matrix} +u^A_{t} \\ +u^B_{t} \\ +u^C_{t} \\ +\end{matrix} \right] $$ + +or, to homogenize notation, + +$$ \left[ \begin{matrix} +A_t \\ +B_t \\ +C_t \\ +\end{matrix} \right] = \left [\begin{matrix} +\gamma_{11} & \gamma_{12} & \gamma_{13} \\ +\gamma_{21} & \gamma_{22} & \gamma_{23} \\ +\gamma_{31} & \gamma_{32} & \gamma_{33} \\ +\end{matrix} \right] \left[ \begin{matrix} +A_{t-1} \\ +B_{t-1} \\ +C_{t-1} \\ +\end{matrix} \right]+ \left[ \begin{matrix} +u^A_{t} \\ +u^B_{t} \\ +u^C_{t} \\ +\end{matrix} \right] $$ + +subject to the equality restrictions +$$ \begin{matrix} +\gamma_{11} + \gamma_{21} + \gamma_{31} =1 \\ +\gamma_{12} + \gamma_{22} + \gamma_{32} =1 \\ +\gamma_{13} + \gamma_{23} + \gamma_{33} =1 \\ +\end{matrix} $$ + +So you have essentially $6$ unknown coefficients and a sample of $T-1$ observations (for each company). Now, you must also assume what, if any, is the stochastic relation between the three stochastic error terms (are they correlated?) but such additional assumption should either come from knowledge of the specific real world phenomenon under study, or through a statistical specification search. + +Of course you may find that such a model does not fit your data sample well - for example you expect that all estimated coefficients should be positive and smaller than or equal to unity, but the estimation procedure may not give you that. But this is what we do: we come up with specification hypotheses and we test them against the data - ""success"" is never guaranteed. Then you should try to come up with a different model. + +",,2013-10-11 01:09:15.643 +185665,57270,22593.0,5,,CC BY-SA 3.0,9cfc802b-031f-41cb-9b7f-05c7798ccdbd,"In a paper of Journal of Chemometrics (Naes & Mevik 2001 : Understanding the collinearity problem in regression and discriminant analysis), the authors propose to make simulations by creating two groups which are different with respect to the smallest eigenvector direction. + +>Here the groups are different with respect to the orthogonal complement to the five ‘NIR loadings’. This is achieved in the following way. The constant 0 ⋅ 18 is multiplied by a sixth loading vector (orthogonal to the other five) and added to group 2. Both groups had initially the same means as group 1 + + +How can I compute such a simulation in R? The goal is to obtain group differences which are tied to the ""small eigenvectors"" space.",adding title of the reference,2013-10-11 01:20:54.640 +185667,57272,22595.0,16,,,92829f23-e685-4c79-98ae-0b5e5578e95b,,,2013-10-11 01:37:35.967 +185666,57272,22595.0,2,,CC BY-SA 3.0,ce3b02d5-a6dc-457e-8862-4052938f945c,"The following are text books I used for my MSEE coursework and research and I found them to be pretty good. + +1. Probability, Statistics and Random Processes for Engineers by Henry Stark and John W. Woods +(Detailed explanation of concepts, good for Communications and Signal Processing people) +2. 
Probability, Random Variables and Random Processes by Hwei Hsu +(Concise explanation of concepts, has a good amount of solved examples)",,2013-10-11 01:37:35.967 +185668,57272,1895.0,5,,CC BY-SA 3.0,922aea72-5169-4bec-b4e0-89cfa0886ed6,"The following are text books I used for my MSEE coursework and research and I found them to be pretty good. + +1. [Probability, Statistics and Random Processes for Engineers](http://www.amazon.com/Probability-Statistics-Processes-Engineers-Edition/dp/0132311232) by Henry Stark and John W. Woods +(Detailed explanation of concepts, good for Communications and Signal Processing people). +2. [Schaum's Outline of Probability, Random Variables and Random Processes](http://www.amazon.com/Schaums-Outline-Probability-Variables-Processes/dp/0071632891) by Hwei Hsu +(Concise explanation of concepts, has a good amount of solved examples).",added 199 characters in body,2013-10-11 01:47:30.337 +185671,57273,22596.0,3,,CC BY-SA 3.0,e57f3873-cbcb-4953-bcae-b341f15c33d9,,,2013-10-11 02:05:57.810 +185670,57273,22596.0,1,,CC BY-SA 3.0,e57f3873-cbcb-4953-bcae-b341f15c33d9,Difficulty with MCMC implementation,,2013-10-11 02:05:57.810 +185669,57273,22596.0,2,,CC BY-SA 3.0,e57f3873-cbcb-4953-bcae-b341f15c33d9,"I could really use some guided help! I'm having difficulty understanding an MCMC implementation in terms of modeling a data set. I'm working on generating parameters from stellar light curves, and was asked to look into implementing an MCMC algorithm. A large chuck on the code is written in Python, so I've been trying to use [emcee hammer](http://dan.iel.fm/emcee/) to generate parameter fits. But going through the code, it's just ""clicking"" how the method works. + +I have a set of data (time vs flux) of two stars orbiting each other such that from our point of view, they eclipse. There are dips in the light curve to signify this. All I'm attempting to do is get the parameters of the system dependent on the characteristics of these dips. + +In the emcee implementation, there are a few functions that I understand: the posterior function which, I believe, simply generates a data set given the set of parameters. Then there's a prior function which, I assume, is the function given a previous set of parameters. Somehow the algorithm chooses whether or not the jump to the posterior parameter set is to be done? I'm guessing that's what the use of the likelihood function is? To describe whether or not to take the jump? + +I apologize, I'm quite confused on how this is to be implemented in terms a defined set of data.",,2013-10-11 02:05:57.810 +185674,57274,22594.0,2,,CC BY-SA 3.0,8ba01954-a44f-45ab-aa74-443f0bb2d962,"I'm fairly new to statistics and I'm still trying to figure out the best way to analyse the data I have. The experiment has 2 groups of participants who perform 2 repetitions of a task that consists of 5 stages. All participants completed both repetitions for all stages, but one group had 8 participants while the other group only had 6. I have a about 100 dependent variables that I wish to examine, so my data looks a bit like this: + + ID Group Repetition Stage DV1 DV2 ... + 1 A 1 1 212.9 179.9 ... + 1 A 2 1 144.8 134.7 ... + 2 B 1 1 146.3 156.8 ... + 2 B 2 1 128.6 178.2 ... + +Group is a between-subjects factor while Repetition and Stage are within-subjects factors. I would like to determine whether Group and Repetition have a significant effect on each dependent variable within each stage (I am not interested in the effect of stage itself). 
I'm doing the analysis in R so I have the following code: + + options(contrasts=c(""contr.sum"",""contr.poly"")) + mydata=read.csv(""data.csv"",header=TRUE) + mydata$Group = factor(mydata$Group) + mydata$Repetition = factor(mydata$Repetition) + mydata$Stage = factor(mydata$Stage) + # for each stage + mydata = mydata[mydata$Stage==1,] + for (i in 5:(ncol(mydata))) + { + fit=aov(formula=as.formula(paste(names(mydata)[i],""~ Group * Repetition + Error(ID/Repetition)"")), data=mydata) + } + +My questions are: + + 1. Is mixed measures ANOVA a valid test for this data? What's the correct way to test whether my data fits the assumptions of ANOVA in R? If this is not a reliable test, what's a possible alternative? + 2. Have I defined the mixed measures ANOVA in R correctly? The various tutorials I've read define it in different ways so I'm a bit confused. + +Thanks in advance for any help. + + ",,2013-10-11 02:25:06.173 +185673,57274,22594.0,3,,CC BY-SA 3.0,8ba01954-a44f-45ab-aa74-443f0bb2d962,,,2013-10-11 02:25:06.173 +185672,57274,22594.0,1,,CC BY-SA 3.0,8ba01954-a44f-45ab-aa74-443f0bb2d962,Is mixed measures ANOVA the correct test for my data?,,2013-10-11 02:25:06.173 +185677,57275,22598.0,3,,CC BY-SA 3.0,7721b4b4-3ec4-4951-b637-8fc93b4d2158,,,2013-10-11 02:49:49.953 +185676,57275,22598.0,1,,CC BY-SA 3.0,7721b4b4-3ec4-4951-b637-8fc93b4d2158,Training One Class SVM using LibSVM,,2013-10-11 02:49:49.953 +185675,57275,22598.0,2,,CC BY-SA 3.0,7721b4b4-3ec4-4951-b637-8fc93b4d2158,"I hope to use one-class SVM of LIBSVM to train a training samples so as to get a model. Then, I use the model to predict the new test data and the training data is same type or not. In the training process, I have some questions as follows: + + - The training samples is all positive examples or not? + - Which kernel function can get better result,linear kernel or RBF kernel? + - What is the effect of nu's values to the model? +",,2013-10-11 02:49:49.953 +185680,57276,22310.0,3,,CC BY-SA 3.0,4341a7fb-14e9-46f8-a348-0e2ae8e6bbb7,,,2013-10-11 02:51:27.387 +185679,57276,22310.0,1,,CC BY-SA 3.0,4341a7fb-14e9-46f8-a348-0e2ae8e6bbb7,"Path analysis, sample sizes, and alternative analysis",,2013-10-11 02:51:27.387 +185692,56784,,25,,,71a76e5b-abb4-47d5-9999-0e692934a860,,http://twitter.com/#!/StackStats/status/388548838333878272,2013-10-11 06:17:16.663 +185678,57276,22310.0,2,,CC BY-SA 3.0,4341a7fb-14e9-46f8-a348-0e2ae8e6bbb7,"I am examining how English ivy affects the occurrence of a salamander species under cover objects (e.g., logs). Soil moisture is assumed to be the major factor that affect their occurrence. + +My hypothesized pathway: The presence/absence of salamanders under cover objects is either a direct consequence of changes in ivy-induced abioitc environment (i.e., drier soil) or an indirect result of changes in prey community that resulted from altered abiotic factors. But,k there are multiple factors, other than English ivy, that affect soil moisture. + +![enter image description here][1] + +My questions are: + +1) I think that a path analysis is most suitable for testing my causal mechanisms. But, given a small sample size (n = 71), is a path analysis appropriate? + +2) Another potential problem for a path analysis is that the effects of English ivy on soil moisture seem to depend on the other factors (e.g., the number of overstory trees), as shown below. Are there any way to account for such patterns in a path analysis? 
+ +![The relationship between soil moisture and English ivy cover on cover objects (""the number of overstory trees"" for the left graph) for different levels of the surrounding overstory trees (""English ivy cover on cover objects"" for the left graph][2] + +3) Are there any other analyses suitable for testing my hypothesized relationships? I am considering multiple (linear and logistic) regressions, but again my sample size is small **AND** regressions do not reflect my hypothesized causal relationships accurately. + +I am using R, so if any recommended codes would be greatly helpful (I am a relatively new R user). + +Thank you in advance for reading and considering my questions. + +Sincerely, + +Kiyoshi + + [1]: https://i.stack.imgur.com/k65Ag.jpg + [2]: https://i.stack.imgur.com/ArgZm.jpg",,2013-10-11 02:51:27.387 +185682,57249,,25,,,938b1411-cb83-45a2-917f-6ac7b8cd81d4,,http://twitter.com/#!/StackStats/status/388503539091517440,2013-10-11 03:17:16.493 +185683,57276,5237.0,5,,CC BY-SA 3.0,87bc8adb-41a9-416f-82d5-d5f4971a32a7,"I am examining how English ivy affects the occurrence of a salamander species under cover objects (e.g., logs). Soil moisture is assumed to be the major factor that affect their occurrence. + +My hypothesized pathway: The presence/absence of salamanders under cover objects is either a direct consequence of changes in ivy-induced abioitc environment (i.e., drier soil) or an indirect result of changes in prey community that resulted from altered abiotic factors. But,k there are multiple factors, other than English ivy, that affect soil moisture. + +![enter image description here][1] + +My questions are: + +1. I think that a path analysis is most suitable for testing my causal mechanisms. But, given a small sample size (n = 71), is a path analysis appropriate? + +2. Another potential problem for a path analysis is that the effects of English ivy on soil moisture seem to depend on the other factors (e.g., the number of overstory trees), as shown below. Are there any way to account for such patterns in a path analysis? + + ![The relationship between soil moisture and English ivy cover on cover objects (""the number of overstory trees"" for the left graph) for different levels of the surrounding overstory trees (""English ivy cover on cover objects"" for the left graph][2] + +3. Are there any other analyses suitable for testing my hypothesized relationships? I am considering multiple (linear and logistic) regressions, but again my sample size is small **AND** regressions do not reflect my hypothesized causal relationships accurately. + +I am using R, so any recommended code would be greatly helpful (I am a relatively new R user, though). + + + [1]: https://i.stack.imgur.com/k65Ag.jpg + [2]: https://i.stack.imgur.com/ArgZm.jpg","added tags; formatted; removed thanks, signature",2013-10-11 03:19:10.097 +185684,57276,5237.0,6,,CC BY-SA 3.0,87bc8adb-41a9-416f-82d5-d5f4971a32a7,,"added tags; formatted; removed thanks, signature",2013-10-11 03:19:10.097 +185685,57274,5237.0,5,,CC BY-SA 3.0,805fe935-d4bd-4413-a39d-100d1ff331c3,"I'm fairly new to statistics and I'm still trying to figure out the best way to analyse the data I have. The experiment has 2 groups of participants who perform 2 repetitions of a task that consists of 5 stages. All participants completed both repetitions for all stages, but one group had 8 participants while the other group only had 6. 
I have a about 100 dependent variables that I wish to examine, so my data looks a bit like this: + + ID Group Repetition Stage DV1 DV2 ... + 1 A 1 1 212.9 179.9 ... + 1 A 2 1 144.8 134.7 ... + 2 B 1 1 146.3 156.8 ... + 2 B 2 1 128.6 178.2 ... + +Group is a between-subjects factor while Repetition and Stage are within-subjects factors. I would like to determine whether Group and Repetition have a significant effect on each dependent variable within each stage (I am not interested in the effect of stage itself). I'm doing the analysis in R so I have the following code: + + options(contrasts=c(""contr.sum"",""contr.poly"")) + mydata = read.csv(""data.csv"",header=TRUE) + mydata$Group = factor(mydata$Group) + mydata$Repetition = factor(mydata$Repetition) + mydata$Stage = factor(mydata$Stage) + # for each stage + mydata = mydata[mydata$Stage==1,] + for (i in 5:(ncol(mydata))) + { + fit = aov(formula=as.formula(paste(names(mydata)[i], + ""~ Group * Repetition + Error(ID/Repetition)"")), + data=mydata) + } + +My questions are: + + 1. Is mixed measures ANOVA a valid test for this data? What's the correct way to test whether my data fits the assumptions of ANOVA in R? If this is not a reliable test, what's a possible alternative? + 2. Have I defined the mixed measures ANOVA in R correctly? The various tutorials I've read define it in different ways so I'm a bit confused. +",tweaked code for readability; removed thanks,2013-10-11 03:22:11.990 +185688,57275,5237.0,4,,CC BY-SA 3.0,6cc8d218-d0ec-49dc-b25e-39b5a9f11723,Training one class SVM using LibSVM,added tag; light editing & formatting,2013-10-11 03:27:59.190 +185687,57275,5237.0,5,,CC BY-SA 3.0,6cc8d218-d0ec-49dc-b25e-39b5a9f11723,"I hope to use one-class SVM of LIBSVM to train a training samples so as to get a model. Then, I will use the model to predict whether the new test data and the training data is same type or not. In the training process, I have some questions as follows: + + - Should the training samples all be positive examples or not? + - Which kernel function can get better result, **linear** kernel or **RBF** kernel? + - What is the effect of nu's values to the model? +",added tag; light editing & formatting,2013-10-11 03:27:59.190 +185686,57275,5237.0,6,,CC BY-SA 3.0,6cc8d218-d0ec-49dc-b25e-39b5a9f11723,,added tag; light editing & formatting,2013-10-11 03:27:59.190 +185689,56784,594.0,33,,,12f6317e-ae42-4285-8674-fed749d49e70,,817,2013-10-11 03:40:07.910 +185690,57277,20603.0,2,,CC BY-SA 3.0,67019973-7c52-4e39-b61d-50a3236f440e,">Should the training samples all be positive examples or not? + +Yes, in one class SVM (and any other outlier detection algorithm) you need just **one** class. If it is **positive** or **negative** depends on your naming convention, but it it more probable, that you will seek for **positive** examples which are underrepresented. + +>Which kernel function can get better result, linear kernel or RBF kernel? + +""There is no free lunch"". There is no general answer, the reason behind having many kernels (not just linear and rbf) is that they work well in different applications. It is **data dependant** decision, so you will have to test at least those two. + +>What is the effect of nu's values to the model? + +It corresponds to the bounds on fraction of points becoming support vectors, so it limits the model's complexity (smaller the number of SVs, simplier the model and less prone to overfitting, yet prone to underfitting). 
As in the http://www.cms.livjm.ac.uk/library/archive/Grid%20Computing/NoveltyDetection/sch00support.pdf paper, it directly corresponds to: + +* ""an upper bound on the fraction of outliers"" +* ""a lower bound on the fraction of SVs"".",,2013-10-11 05:28:45.147 +185695,57278,20144.0,2,,CC BY-SA 3.0,5d3d56c9-9330-4d2b-88f3-518b100b01d3,"first of all, I want to express my apologies if the question is too broad or wrong, but I am in need of references and I have no idea of whom can I ask. + +If you are interested, the question comes from a model I built, you can see some details [here](http://physics.stackexchange.com/questions/78524/boltzmann-distribution-with-interaction-between-particles) and [here](http://physics.stackexchange.com/questions/80019/grand-canonical-ensemble-with-interaction-simulation-doubts). In this model I have: +$$f(\mathbb{x}|T,\mu)=\frac{h(\mathbb{x})e^{-\frac{E(\mathbb{x})}{kT}+\mu N(x)}}{\mathcal{Z}(T,\mu)}$$ + +There, my parameters are $\mu$ and $T$, and $\mathbb{x}=(x_1,\dots,x_M)$ where $x_i\in\{0,1\}$ and I have the restriction $\forall i\in\{1,\dots,M-D+1\}$ +$$\sum_{j=0}^{D-1} x_{i+j} \leq 1$$ +This is,$h(\mathbb{x})=0$ if that condition is not held. + + +I have the ""small"" inconvenient of not knowing $\mathcal{Z}(T,\mu)$, so I used a MCMC (Metropolis-Hastings) method to approximate this function. However I face two problems. The first of them is regarding the simulation and the model, and I am on solving it (it depends too much on the initial condition). The second is that these parameters are not fully known and I have no idea of how can I estimate them. I have been reading about bayesian inference and I know a bit of estimator theory but I am no expert (furthermore I don't know if not knowing the partition function can affect). If any of you were able to give me some clue in the form of a book that I can read, I would be eternally grateful. + +Thank you very much for your help.",,2013-10-11 06:43:41.090 +185694,57278,20144.0,3,,CC BY-SA 3.0,5d3d56c9-9330-4d2b-88f3-518b100b01d3,,,2013-10-11 06:43:41.090 +185698,57279,22262.0,2,,CC BY-SA 3.0,1540356b-83ae-43ec-86de-50fd9b8a052c,"What variable selection approach should I consider if I have thousands of predictors with clusters that are extremely correlated? + +For example I might have a predictor set $X:= \{A_1,A_2,A_3,A_4,...,A_{39},B_1,B_2,...,B_{44},C_1,C_2,...\}$ with cardinality $|X| > 2000$. Consider the case where all $\rho(A_i,A_j)$ are very high, and similarly for $B$, $C$, .... + +Correlated predictors aren't correlated ""naturally""; it's a result of the feature engineering process. This is because all $A_i$ are hand engineered from the same underlying data with small variations in hand-engineering methodology, e.g. I use a thinner pass band on $A_2$ than I did for $A_1$ in my denoising approach but everything else is the same. + +My goal is to improve out of sample accuracy in my classification model. + +One approach would just be to try everything: non-negative garotte, ridge, lasso, elastic nets, random subspace learning, PCA/manifold learning, least angle regression and pick the one that's best in my out of sample dataset. 
But specific methods that are good at dealing with the above would be appreciated.",,2013-10-11 06:57:56.613 +185699,57279,22262.0,1,,CC BY-SA 3.0,1540356b-83ae-43ec-86de-50fd9b8a052c,Variable selection with groups of predictors that are highly correlated,,2013-10-11 06:57:56.613 +185697,57279,22262.0,3,,CC BY-SA 3.0,1540356b-83ae-43ec-86de-50fd9b8a052c,,,2013-10-11 06:57:56.613 +185702,57280,22600.0,3,,CC BY-SA 3.0,8c5e8fb0-5841-4ae4-8f67-b4f56b8d8e31,,,2013-10-11 07:24:30.003 +185701,57280,22600.0,1,,CC BY-SA 3.0,8c5e8fb0-5841-4ae4-8f67-b4f56b8d8e31,Binomial Conditional Probability of a an event,,2013-10-11 07:24:30.003 +185700,57280,22600.0,2,,CC BY-SA 3.0,8c5e8fb0-5841-4ae4-8f67-b4f56b8d8e31,"## Determining Binomial Condition Probability of a Random Sample ## + +I have a question about binomial probability involving a conditional event. This problem keeps tripping me up, because while I know how to calculate the binomial probability that a random variable is a failure, i don't know how to calculate the conditional probability of that variable. + + +---------- + + +My question is as follows: + +**70%** of the total shipments come from **factory A**, of which **10% are defective**. + +**30%** of the total shipments come from **factory B**, of which **5% are defective**. + +A random shipment comes in, and a sample of 20 pints is taken, and 1 of the pints is defective. + +*What is the probability that this shipment came from the Factory A?* + +",,2013-10-11 07:24:30.003 +185703,57281,9074.0,2,,CC BY-SA 3.0,f7d71e61-191d-4712-b579-76c0e92f1c19,"This requires a straight-forward application of Bayes' rule: $P(A|Defective)=P(Defective|A)*P(A)/P(Defective) \leftrightarrow \\ +P(A|Defective)=0.1*0.7/0.085= 0.07/0.085 \approx 0.824$",,2013-10-11 07:39:11.260 +185704,57281,9074.0,5,,CC BY-SA 3.0,43545709-e3ef-462f-8fe6-478d150dbfec,**Edited**: didn't pay attention to the question. Will edit answer later today. ,Didn't read the question thoroughly. ,2013-10-11 08:03:27.483 +185706,57281,9074.0,5,,CC BY-SA 3.0,8c153f28-999a-42c0-aa97-f50965737ff3,"**Edit 1**: didn't pay attention to the question. Will edit answer later today. + +**Edit 2**: I've attempted to provide an answer below, however I might be mistaken. Feel free to correct me if I am in error. + +$P(1 defective|A) \approx 0.270 \wedge P(1 defective|B) \approx 0.377 \\ +P(A) = 0.7 \wedge P(B) = 0.3 \\ +P(1D) = 0.7*0.270+0.3*0.377 = 0.189+0.113 = 0.302 \\ +P(A|1D) = P(1D|A)*P(A)/P(1D)=0.270*0.7/0.302 \approx 0.626$",added 354 characters in body,2013-10-11 08:22:50.210 +185709,57282,22601.0,2,,CC BY-SA 3.0,51317f1a-2981-446a-8f73-a8521e7811fa,"are there any generic tests to validate if a given sample follows a unimodal distribution, like a gaussian, cauchy's, Student's t or chi^2? + +Thanks, + +Christian",,2013-10-11 08:38:55.517 +185708,57282,22601.0,3,,CC BY-SA 3.0,51317f1a-2981-446a-8f73-a8521e7811fa,,,2013-10-11 08:38:55.517 +185707,57282,22601.0,1,,CC BY-SA 3.0,51317f1a-2981-446a-8f73-a8521e7811fa,Generic test for unimodality given sample,,2013-10-11 08:38:55.517 +185710,57283,9074.0,2,,CC BY-SA 3.0,382e20a9-2035-43d6-ab1b-46b9b4c6a4d8,"You're asking two questions: + +1) Is there a generic test for unimodality? +2) Is there tests to test whether a sample is derived from a given distribution, say, a normal distribution? + +Ad 1): Yes, the Hartigan-Hartigan dip test, [Ann. Statist. 13(1):70-84][1]. 
+ +Ad 2): There exists a number of special tests, but the [Kolmogorov-Smirnov][2] test is a general-purpose nonparametric test, although with low statistical power. + +Best, + + [1]: http://projecteuclid.org/DPubS?service=UI&version=1.0&verb=Display&handle=euclid.aos/1176346577 + [2]: http://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test",,2013-10-11 08:43:06.977 +185711,57271,20473.0,5,,CC BY-SA 3.0,9d6fe1be-d19d-49e1-bf4f-d4569cdd9b74,"Write your system explicitly for time $t$ as (""$L$"" for ""loss"", as a positive quantity, and ""$G$"" for ""gain"") +$$ A_t - A_{t-1} = - L^A_{t-1} + G_{t-1}^{B\rightarrow A}+G_{t-1}^{C\rightarrow A}$$ + +$$ B_t - B_{t-1} = - L^B_{t-1} + G_{t-1}^{A\rightarrow B}+G_{t-1}^{C\rightarrow B}$$ + +$$ C_t - C_{t-1} = - L^C_{t-1} + G_{t-1}^{A\rightarrow C}+G_{t-1}^{B\rightarrow C}$$ + +The following three relations hold exactly: +$$ L^A_{t-1} = G_{t-1}^{A\rightarrow B} + G_{t-1}^{A\rightarrow C} $$ +$$ L^B_{t-1} = G_{t-1}^{B\rightarrow A} + G_{t-1}^{B\rightarrow C} $$ +$$ L^C_{t-1} = G_{t-1}^{C\rightarrow A} + G_{t-1}^{C\rightarrow B} $$ + +If you substitute in the first three you obtain + +$$ A_t - A_{t-1} = - G_{t-1}^{A\rightarrow B} - G_{t-1}^{A\rightarrow C} + G_{t-1}^{B\rightarrow A}+G_{t-1}^{C\rightarrow A}$$ + +$$ B_t - B_{t-1} = - G_{t-1}^{B\rightarrow A} - G_{t-1}^{B\rightarrow C} + G_{t-1}^{A\rightarrow B}+G_{t-1}^{C\rightarrow B}$$ + +$$ C_t - C_{t-1} = - G_{t-1}^{C\rightarrow A} - G_{t-1}^{C\rightarrow B} + G_{t-1}^{A\rightarrow C}+G_{t-1}^{B\rightarrow C}$$ + +You have $6$ unknown quantities to estimate _per time period_. There is just not enough information to do that. So you need assumptions that will impose structure (=restrictions) on the situation, and will permit you to estimate _something_. What? Let's say you assume that there is a relatively stable ""churn"" from one company to another, as a linear function of their market share in the previous period. This assumption brings in a set of unknown coefficients to be estimated (which will then give you an estimate of ""hidden transfers of market share""). Write $G_{t-1}^{A\rightarrow B} = a_bA_{t-1}$ (market share lost from $A$ to $B$ as a linear function of $A$'s market share in period $t-1$). +Your equations will become + +$$ A_t - A_{t-1} = - a_bA_{t-1} - a_cA_{t-1} + b_aB_{t-1}+c_aC_{t-1} $$ + +$$ B_t - B_{t-1} = - b_aB_{t-1} - b_cB_{t-1} + a_bA_{t-1}+c_bC_{t-1}$$ + +$$ C_t - C_{t-1} = - c_aC_{t-1} - c_bC_{t-1} + a_cA_{t-1}+ b_cB_{t-1}$$ + +We have turned a set of mathematical identities into a _model_. It is doubtful that this model will hold exactly for each $t$, so you should add a stochastic error term. 
Rearranging we obtain a first-order Vector Autoregression (VAR): + +$$ \left[ \begin{matrix} +A_t \\ +B_t \\ +C_t \\ +\end{matrix} \right] = \left [\begin{matrix} +1-a_b-a_c & b_a & c_a \\ +a_b & 1-b_a-b_c & c_b \\ +a_c & b_c & 1-c_a-c_b \\ +\end{matrix} \right] \left[ \begin{matrix} +A_{t-1} \\ +B_{t-1} \\ +C_{t-1} \\ +\end{matrix} \right]+ \left[ \begin{matrix} +u^A_{t} \\ +u^B_{t} \\ +u^C_{t} \\ +\end{matrix} \right] $$ + +or, to homogenize notation, + +$$ \left[ \begin{matrix} +A_t \\ +B_t \\ +C_t \\ +\end{matrix} \right] = \left [\begin{matrix} +\gamma_{11} & \gamma_{12} & \gamma_{13} \\ +\gamma_{21} & \gamma_{22} & \gamma_{23} \\ +\gamma_{31} & \gamma_{32} & \gamma_{33} \\ +\end{matrix} \right] \left[ \begin{matrix} +A_{t-1} \\ +B_{t-1} \\ +C_{t-1} \\ +\end{matrix} \right]+ \left[ \begin{matrix} +u^A_{t} \\ +u^B_{t} \\ +u^C_{t} \\ +\end{matrix} \right] $$ + +subject to the equality restrictions +$$ \begin{matrix} +\gamma_{11} + \gamma_{21} + \gamma_{31} =1 \\ +\gamma_{12} + \gamma_{22} + \gamma_{32} =1 \\ +\gamma_{13} + \gamma_{23} + \gamma_{33} =1 \\ +\end{matrix} $$ + +So you have essentially $6$ unknown coefficients and a sample of $T-1$ observations (for each company). Now, you must also assume what, if any, is the stochastic relation between the three stochastic error terms (are they correlated?) but such additional assumption should either come from knowledge of the specific real world phenomenon under study, or through a statistical specification search. + +Of course you may find that such a model does not fit your data sample well - for example you expect that all estimated coefficients should be positive and smaller than or equal to unity, but the estimation procedure may not give you that. But this is what we do: we come up with specification hypotheses and we test them against the data - ""success"" is never guaranteed. Then you should try to come up with a different model. + +",added 2 characters in body,2013-10-11 08:50:02.827 +185713,57283,9074.0,5,,CC BY-SA 3.0,5387d364-ebbf-4a6d-9c8f-26702892d8b5,"You're asking two questions: + + 1. Is there a generic test for unimodality? + 2. Is there tests to test whether a sample is derived from a given distribution, say, a normal distribution? + +Ad 1): Yes, the Hartigan-Hartigan dip test, [Ann. Statist. 13(1):70-84][1]. + +Ad 2): There exists a number of special tests, but the [Kolmogorov-Smirnov][2] test is a general-purpose nonparametric test, although with low statistical power. + +Best, + + [1]: http://projecteuclid.org/DPubS?service=UI&version=1.0&verb=Display&handle=euclid.aos/1176346577 + [2]: http://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test",added 1 characters in body,2013-10-11 08:56:00.610 +185715,57284,,2,user14650,CC BY-SA 3.0,d0e821d4-743c-40ef-83e9-66dad2846efa,"In R, the function `wilcox.test` takes the argument `conf.level = 0.095` (for example). 
Giving the same argument to the function `wilcoxsign_test` from the *coin* package returns a warning: + + additional arguments conf.level will be ignored + +**What default confidence level does `wilcoxsign_test` use, and how can I change it?** + +*Or :* Why do I not need a confidence level for this function?",,2013-10-11 09:06:48.623 +185718,56955,,25,,,79a5b88c-fdd1-44d9-8b7c-9d2191b483ba,,http://twitter.com/#!/StackStats/status/388594137152643073,2013-10-11 09:17:16.740 +185719,57285,221.0,2,,CC BY-SA 3.0,6008c348-1a85-4007-9c6f-6ea070411500,"The topic is called [Association Rule Learning][1], which is one of the most basic (and rather old-fashioned) ways to build a recommender system. The most widely known algorithms are called A Priori and FP Growth. Every good book about Data Mining should contain a chapter about it. + +However, the formula seems to be wrong. + +$P(A|B)$ means Probability of A given B, so + + P(A|B)=count_users(bought(A,B)) / count_users(bought_B) + +is correct. + +Furthermore, the mentioned algorithms do not take into account something like $P(\neg A|B)$, because the fact that a user has not bought A could have multiple meanings + + - user does not like A + - user does not know that A exists or is sold here + - user does not bought A although he likes it for one of the thousand seemingly arbitrary motivatiors of human behavior. + +because not buying something is an implicit preference. If the user would have stated explicitly that he does not like A (may be in a survey), it is called an explicit preference. In case of implicit negative preferences, the negative preferences are often excluded from the model. + +If explicit preferences are given, the overall formula $\frac{P(A|B)}{P(\neg A|B)}$ would make sense and represent the [Odds Ratio][2]. + + + [1]: http://en.wikipedia.org/wiki/Association_rule_learning + [2]: http://en.wikipedia.org/wiki/Odds_ratio + [3]: http://en.wikipedia.org/wiki/Statistical_classification",,2013-10-11 09:40:49.687 +185721,57282,,24,,CC BY-SA 3.0,203372b3-cb62-42e9-8c9f-7547cf8ed197,,Proposed by 27581 approved by 930 edit id of 5593,2013-10-11 09:45:32.230 +185720,57282,,5,,CC BY-SA 3.0,203372b3-cb62-42e9-8c9f-7547cf8ed197,"Are there any generic tests to validate if a given sample follows a unimodal distribution, like a Gaussian, Cauchy, Student's t or a chi-square?",Slight editing,2013-10-11 09:45:32.230 +185722,57220,,24,,CC BY-SA 3.0,18183114-e8f1-4ee4-acd9-42723719772c,,Proposed by 31381 approved by 686 edit id of 5594,2013-10-11 09:47:17.813 +185723,57220,22603.0,5,,CC BY-SA 3.0,18183114-e8f1-4ee4-acd9-42723719772c,"I have the following regression + + $children = \beta_0 + \beta_1 \log(earnings) + \beta_2 grandparents + \epsilon$ + +and $\beta_1>0$ with $p$=0.01 and $\beta_2>0$ with $p$=0.01, and N is large (N>10.000) and grandparents takes values 0,1,2,3,4. + +Then I add the interaction term ($\log(earnings)*grandparents$) to equation 1, such that: + + $children = \beta_0 + \beta_1 \log( earnings) + \beta_2 grandparents+ \beta_3 \log( earnings)*grandparents + \epsilon$ + +and $\beta_1>0$ with $p$=0.01, $\beta_2$ is no longer statistically significant and also $\beta_3$ is not statistically significant. + +I do not understand how to interpret the results and if the interaction term wipes out the direct effect of grandparents since $\log(earnings)$ is always different from 0. + + +>>>>There is a way to test the stat. sign. of the effect of Grandparents in the interacted model? 
(Thanks Maarten for your previous answer) +",improve formatting,2013-10-11 09:47:17.813 +185724,57286,503.0,2,,CC BY-SA 3.0,d7bb3ae9-1c10-4442-a7f3-15c63d956bd2,"You don't need (and in fact can't) set a confidence limit in `wilcoxsign_test` because the about of the function includes a p value. e.g.the first example in the help file for the function: + + + RoundingTimes <- data.frame( + times = c(5.40, 5.50, 5.55, + 5.85, 5.70, 5.75, + 5.20, 5.60, 5.50, + 5.55, 5.50, 5.40, + 5.90, 5.85, 5.70, + 5.45, 5.55, 5.60, + 5.40, 5.40, 5.35, + 5.45, 5.50, 5.35, + 5.25, 5.15, 5.00, + 5.85, 5.80, 5.70, + 5.25, 5.20, 5.10, + 5.65, 5.55, 5.45, + 5.60, 5.35, 5.45, + 5.05, 5.00, 4.95, + 5.50, 5.50, 5.40, + 5.45, 5.55, 5.50, + 5.55, 5.55, 5.35, + 5.45, 5.50, 5.55, + 5.50, 5.45, 5.25, + 5.65, 5.60, 5.40, + 5.70, 5.65, 5.55, + 6.30, 6.30, 6.25), + methods = factor(rep(c(""Round Out"", ""Narrow Angle"", ""Wide Angle""), 22)), + block = factor(rep(1:22, rep(3, 22)))) + + ### classical global test + friedman_test(times ~ methods | block, data = RoundingTimes) + +gives as output + + Asymptotic Friedman Test + + data: times by + methods (Narrow Angle, Round Out, Wide Angle) + stratified by block + chi-squared = 11.1429, df = 2, p-value = 0.003805 + +so, since p = 0.0038, you know it is significant at p = 0.05 (and, indeed, much below that). ",,2013-10-11 10:26:57.610 +185726,57252,,24,,CC BY-SA 3.0,7a276888-4bca-491e-b87f-e6d872670a74,,Proposed by anonymous approved by 686 edit id of 5595,2013-10-11 10:28:02.303 +185725,57252,0.0,5,,CC BY-SA 3.0,7a276888-4bca-491e-b87f-e6d872670a74,"I put this question because while reading the benefits of standardizing explanatory variables or not, I read *good but contrasting* opinions about standardizing when there are interaction in the model. + +Some talk about how problems of collinearity are removed when standardizing (e.g. http://stats.stackexchange.com/questions/60476/collinearity-diagnostics-problematic-only-when-the-interaction-term-is-included#61022), which is basically the case of my GLMM. However, others claim that standard errors and p-values of interactions of standardized models are not reliable... (e.g.http://stats.stackexchange.com/questions/19216/variables-are-often-adjusted-e-g-standardised-before-making-a-model-when-is or http://quantpsy.org/interact/interactions.htm) + +So, any ideas on what is the right thing to do?",I edited a bit to include the links of those contrasting opinions,2013-10-11 10:28:02.303 +185729,57287,21624.0,3,,CC BY-SA 3.0,34043a0d-98d4-42c3-8bec-690b0e8b61c3,,,2013-10-11 10:37:53.160 +185728,57287,21624.0,1,,CC BY-SA 3.0,34043a0d-98d4-42c3-8bec-690b0e8b61c3,How to decide how many times of bootstrap to run regards the sample size?,,2013-10-11 10:37:53.160 +185727,57287,21624.0,2,,CC BY-SA 3.0,34043a0d-98d4-42c3-8bec-690b0e8b61c3,"I am using bootstrap for my simulation. + +The number of the population is flexible for each case, while the sample size is decide by a certain percentage. For example, I have a 10,000 population, and I decide to use 10% for each iteration of bootstrap, so the sample size is 1,000. + +In practice, I found it is hard to decide how many time to run the bootstrap is enough. With less simulation, the results appears insuffice, while with a large number of simulation is simply waste of time. 
+ +May I know if there is a method can help me to decide the number of iterations to run?",,2013-10-11 10:37:53.160 +186215,57428,20470.0,4,,CC BY-SA 3.0,db7a9836-5766-421a-9970-ebc244a2e876,How to discretise continuous attributes while implementing ID3 Algorithm,grammar / rephrasal,2013-10-14 09:23:33.433 +185730,57287,21624.0,5,,CC BY-SA 3.0,57a71b9b-0da4-4f98-87cf-b54f9c19c1bc,"I am using bootstrap for my simulation. + +The number of the population is flexible for each case, and the sample size is decided by a certain percentage. For example, I have a 10,000 population, and I decide to use 10% for each iteration of bootstrap, so the sample size is 1,000. + +In practice, I found it is hard to decide how many times to run the bootstrap is enough. With less simulation, the results appears insuffice, while with a large number of simulation is redundant. + +May I know if there is a method can help me to decide the number of iterations to run?",deleted 11 characters in body,2013-10-11 10:55:07.973 +185733,57288,19436.0,1,,CC BY-SA 3.0,164a16f0-fe53-4555-bdff-11aba7d1604b,Reversing Chebyshev inequality argument,,2013-10-11 10:55:26.637 +185732,57288,19436.0,2,,CC BY-SA 3.0,164a16f0-fe53-4555-bdff-11aba7d1604b,"One way one could state Chebyshev's inequality is + +> The probability that a realization deviates from the mean more +> than $k$ standard deviations is at most $\frac{1}{k^2}$. + +My question is: Can one rigorously reverse this logic and make a statement about the probability that the actual mean is close to the observation. One immediate technical problem is that one needs to define a probability space on possible probability distributions/means. + +I'm asking because I think this type of argument (although slightly more convoluted) underlies Vapnik-Chervonenkis theory. In their textbooks this issue is not discussed at all. They prove a large deviation principle and then simply invert all their inequalities. How does this work? Does it?",,2013-10-11 10:55:26.637 +185731,57288,19436.0,3,,CC BY-SA 3.0,164a16f0-fe53-4555-bdff-11aba7d1604b,,,2013-10-11 10:55:26.637 +185734,57283,9074.0,5,,CC BY-SA 3.0,4c8518f1-f825-4496-b879-0f0e300c91b2,"You're asking two questions: + + 1. Is there a generic test for unimodality? + 2. Are there tests to test whether a sample is derived from a given distribution, say, a normal distribution? + +Ad 1): Yes, the Hartigan-Hartigan dip test, [Ann. Statist. 13(1):70-84][1]. + +Ad 2): There exists a number of special tests, but the [Kolmogorov-Smirnov][2] test is a general-purpose nonparametric test, although with low statistical power. + +Best, + + [1]: http://projecteuclid.org/DPubS?service=UI&version=1.0&verb=Display&handle=euclid.aos/1176346577 + [2]: http://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test",Congruence error.,2013-10-11 10:57:12.170 +185735,57287,674.0,4,,CC BY-SA 3.0,23c27ad9-b6d8-4426-b8ce-a0a1038deee9,How to decide bootstrap sample size and number of runs?,added 14 characters in body; edited title,2013-10-11 11:01:45.260 +185736,57287,674.0,5,,CC BY-SA 3.0,23c27ad9-b6d8-4426-b8ce-a0a1038deee9,"I am using bootstrap for my simulation. + +The number of the population is flexible for each case, and the sample size is decided by a certain percentage. For example, I have a 10,000 population, and I decide to use 10% for each iteration of bootstrap, so the sample size is 1,000. + +In practice, I found it is hard to decide how many times to run the bootstrap is enough. 
With less simulation, the results appears insufficiant, while with a large number of simulation they are redundant. + +May I know if there is a method that can help me to decide the number of iterations to run?",added 14 characters in body; edited title,2013-10-11 11:01:45.260 +185738,57287,21624.0,5,,CC BY-SA 3.0,5a8edc37-ec17-4b17-845e-b2a3b2193e95,"I am using bootstrap for my simulation. + +The number of the population is flexible for each case, and the sample size is decided by a certain percentage. For example, I have a 10,000 population, and I decide to use 10% for each iteration of bootstrap, so the sample size is 1,000. + +In practice, I found it is hard to decide how many times to run the bootstrap is enough. With less simulation, the results appear insufficiant, while with a large number of simulation they are redundant. + +May I know if there is a method that can help me to decide the number of iterations to run?",edited title,2013-10-11 11:07:21.687 +185737,57287,21624.0,4,,CC BY-SA 3.0,5a8edc37-ec17-4b17-845e-b2a3b2193e95,How to decide bootstrap number of runs?,edited title,2013-10-11 11:07:21.687 +185740,57290,0.0,2,,CC BY-SA 3.0,eaaa1685-5eb8-4f57-900a-a02bc70941bb,,,2013-10-11 11:26:16.130 +185739,57289,0.0,2,,CC BY-SA 3.0,1db4b3b2-ab49-42c6-a169-74624870c957,,,2013-10-11 11:26:16.130 +185743,57291,22262.0,3,,CC BY-SA 3.0,eb1818b2-f2e4-4aa4-acfb-8ecbee48234f,,,2013-10-11 11:39:19.043 +185742,57291,22262.0,1,,CC BY-SA 3.0,eb1818b2-f2e4-4aa4-acfb-8ecbee48234f,Function to find the quantile in a vector corresponding to constant $x$,,2013-10-11 11:39:19.043 +185741,57291,22262.0,2,,CC BY-SA 3.0,eb1818b2-f2e4-4aa4-acfb-8ecbee48234f,"Suppose I have constant `x=0.1` in the language `R` and I have a vector `vec = rnorm(200)`. Is there a pre-packaged function to find the quantile of `vec` that corresponds the closest to `x`? + +A solution is as follows: + + x = 0.1 + vec = rnorm(100) + percentiles = quantile(vec,seq(0,1,by=0.01)) + which(abs(x-percentiles)==min(abs(x-percentiles))) + #returns closest match +... but I would like to know if there's a pre-packaged function.",,2013-10-11 11:39:19.043 +185744,57292,19125.0,2,,CC BY-SA 3.0,77844d56-f139-4737-badd-5f0492e6b69b,"Your supervisor may very well be right and the small sample size is the problem. You might want to do a bit of reading on [*Power Analysis*](http://psych.wisc.edu/henriques/power.html). An introductory paper is that by Cohen (1992). + +In short, there is a relation between sample size, effect size and power (which is the probability that the test detects a significant effect assuming that there is one). For example, if you have an estimate of the effect size you're looking for (in your example the difference between the means of the two groups) and you want to obtain a statistically significant result regardings this effect with a certain error probability (the $\alpha$-Level), then you can compute the size of the sample that is neccessary. Generally, when you have two of the numbers, you can compute the third one. + +The difficult part is probably to get an idea of the effect size before doing the analysis. After all, ususally that is what one wants to find out about. An interesting discussion on this can be found on the [Cognitive Sciences SE site](http://cogsci.stackexchange.com/questions/3384/how-to-estimate-an-expected-effect-size-for-a-planned-psychology-study?lq=1). 
+ +One piece of free software to do power analysis is [G Power](http://www.psycho.uni-duesseldorf.de/abteilungen/aap/gpower3/download-and-register). There is also the `pwr`-package for R. + +References: + + +Cohen, J. (1992). [A power primer](http://classes.deonandan.com/hss4303/2010/cohen%201992%20sample%20size.pdf). *Psychological Bulletin*, 112(1), 155. + ",,2013-10-11 11:48:49.933 +185857,57325,22622.0,2,,CC BY-SA 3.0,9e44d147-9a69-4e49-baaf-af8f690a9818,"I am running cross-sectional regressions of the type: +Y_c = alpha + beta X_1 + gamma X_2 + delta_1 X_3 + delta_2 X_1 X_3 + delta_3 X_2 X_3 + e_c. +My theoretical model implies that delta_2 should be negative, delta_3 should be positive and the marginal effect of X_3 should be negative. My estimates imply that delta_2 is negative and significant, delta_3 is positive and insignificant, beta is significant, and gamma is insignificant. Building on this evidence, can I calculate the marginal effect of X_3 as delta_1 + delta_2 E(X_1) where E(X_1) is the muean of X_1 justifiying this procedure with the fact that all the terms incorporating X_2 are insignificant? + +Thanks for this.",,2013-10-11 20:30:29.607 +185746,57293,18198.0,2,,CC BY-SA 3.0,58b3f483-5bb1-4e7b-b60f-63dd24dabf9e,"I am testing various techniques for dealing with strong Multi-collinearity in a regression problem. + +There have been various comparison papers written between competing techniques such as Ridge Regression (RR) and Principal Components Regression (PCR). There seems to be no clear winner though with the best technique seemingly problem specific. However one thing that bothers me about the PCR approach is the somewhat arbitrary way in which one simply excludes the smallest eigenvectors as has been proven in Hadi and Ling even the smallest eigenvector may have strong predictive power while the largest eigenvectors may have none. + +""Some Cautionary notes on the use of Principal Components Regression"" by Hadi and Ling. + +They also show that the the SSE can be vastly improved by adding seemingly insignificant eigenvectors. + +In their discussion they highlght two papers the try to address this 2nd deficiency Lott(1973) and Gunst and Mason(1973) but it has been shown that the Lott technique fails to pick the ""correct"" eigenvectors in the presence of strong MC, and my problem has strong MS. Do you know of a paper that can select the optimum set of eigenvalues even in the presence of strong MC? + +Or more recent papers that compare PCR and RR?",,2013-10-11 11:56:09.380 +185745,57293,18198.0,3,,CC BY-SA 3.0,58b3f483-5bb1-4e7b-b60f-63dd24dabf9e,,,2013-10-11 11:56:09.380 +185747,57293,18198.0,1,,CC BY-SA 3.0,58b3f483-5bb1-4e7b-b60f-63dd24dabf9e,Selecting Optimal set of Eigenvectors for Principal Components Regression,,2013-10-11 11:56:09.380 +185749,57290,2081.0,5,,CC BY-SA 3.0,5c30db97-1fba-48f8-9aa7-156c098ed4bf,"Canonical correlation is between sets of variables. It is the maximized correlation between a linear combination of one set and a linear combination of the other set (the two combinations are called canonical variates). 
Canonical correlation analysis deals with that latent ""inside"" of a multivariate regression/anova.",added 318 characters in body,2013-10-11 12:01:56.470 +185748,57290,,24,,CC BY-SA 3.0,5c30db97-1fba-48f8-9aa7-156c098ed4bf,,"Proposed by 3277 approved by 2116, 930 edit id of 5596",2013-10-11 12:01:56.470 +185750,57294,11489.0,2,,CC BY-SA 3.0,d75c4039-8dfe-4e4c-8794-4603664608f3,"Yep, if had you bothered to read the manual of `quantile`, you would have found the function `ecdf` in the ""See Also"" section. + + x <- 0.1 + vec <- rnorm( 100 ) + ecdf( vec )( x ) + # or + my.ecdf <- ecdf( vec ) + my.ecdf( x ) + +`ecdf` is a function returning another function -- that in turn is the experimental distribution function of your distribution.",,2013-10-11 12:03:59.843 +185751,57295,21762.0,2,,CC BY-SA 3.0,37a8cc39-d0ec-49fe-9026-bb9b8266e077,"A data driven (and thus probably not so very good) approach + +Calculate four correlation matrices: One for each layer and one for the pooled data (three lines per sample). If they all look quite similar, run a PCA based on the correlation matrix of the pooled sample and go on with the first few PCs. + +Instead of comparing the four correlation matrices, you could also consider the four loading matrices of the corresponding PCAs and compare the loadings of the first few PCs. This is much easier if you have lots of variables.",,2013-10-11 12:05:40.297 +185752,57294,11489.0,12,,,d0f471eb-e1c7-4bb3-8dce-cb8191f65a24,"{""Voters"":[{""Id"":14803,""DisplayName"":""January""}]}",via Vote,2013-10-11 12:06:11.553 +185753,57293,674.0,5,,CC BY-SA 3.0,cf62b295-c42e-4407-9578-17de1111bb42,"I am testing various techniques for dealing with strong multi-collinearity in a regression problem. + +There have been various comparison papers written between competing techniques such as Ridge Regression (RR) and Principal Components Regression (PCR). There seems to be no clear winner though with the best technique seemingly problem specific. However one thing that bothers me about the PCR approach is the somewhat arbitrary way in which one simply excludes the smallest eigenvectors as has been proven in Hadi and Ling even the smallest eigenvector may have strong predictive power while the largest eigenvectors may have none. + +> ""[Some Cautionary notes on the use of Principal Components +> Regression][1]"" by Hadi and Ling. ([PDF][2]) + +They also show that the the SSE can be vastly improved by adding seemingly insignificant eigenvectors. + +In their discussion they highlght two papers that try to address this 2nd deficiency--Lott(1973) and Gunst and Mason(1973--but it has been shown that the Lott technique fails to pick the ""correct"" eigenvectors in the presence of strong MC, and my problem has strong MS. + +Do you know of a paper that can select the optimum set of eigenvalues even in the presence of strong MC? +Or more recent papers that compare PCR and RR? 
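+
+(For illustration only: one data-driven alternative to dropping components by eigenvalue size is to choose the number of components by cross-validated prediction error, e.g. with the `pls` package; `mydata` and `y` below are hypothetical placeholders.)
+
+    library(pls)
+    fit <- pcr(y ~ ., data = mydata, validation = ""CV"")  # cross-validated prediction error per number of components
+    validationplot(fit, val.type = ""RMSEP"")              # plot RMSEP vs. number of components and pick the low point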
+ + + [1]: http://www.jstor.org/discover/10.2307/2685559?uid=3738016&uid=2&uid=4&sid=21102750270477 + [2]: http://www.uvm.edu/~rsingle/stat380/F04/possible/Hadi+Ling-AmStat-1998_PCRegression.pdf",added 223 characters in body,2013-10-11 12:06:53.477 +185754,57293,674.0,4,,CC BY-SA 3.0,cf62b295-c42e-4407-9578-17de1111bb42,Selecting optimal set of eigenvectors for Principal Components Regression,added 223 characters in body,2013-10-11 12:06:53.477 +185755,57294,11489.0,13,,,e8860c4f-f2d9-42a6-9679-7b59ef267e93,"{""Voters"":[{""Id"":14803,""DisplayName"":""January""}]}",,2013-10-11 12:07:34.730 +185757,57296,21398.0,1,,CC BY-SA 3.0,fe559804-4816-4ff1-86a6-e6814817feac,Selection probability weight,,2013-10-11 12:11:24.820 +185758,57296,21398.0,3,,CC BY-SA 3.0,fe559804-4816-4ff1-86a6-e6814817feac,,,2013-10-11 12:11:24.820 +185756,57296,21398.0,2,,CC BY-SA 3.0,fe559804-4816-4ff1-86a6-e6814817feac,"I have a question on my selection probability weight. Is it a correct weight? + +***The research design:*** research areas were divided into strata according to size. Interviews were taken: 50 batches of 10 interviews in each area according to the relative size of strata. +Clusters were made for each stratum. In each cluster: batches of 10 interviews were sampled in fixed intervals. A random walk selected households and within these households, respondents were randomly chosen. + +***The selection probability weight:*** I had no population data on number of households. A selection probability weight was calculated for the within-household selection for each stratum. In each stratum, a weight was calculated and normalized so that the sum of the weights is 500 for each research area. The size of the eventual stratum was divided by the number of people in the stratum eligible for the survey. The result of this calculation was then multiplied by the number of eligible respondents in the household. +",,2013-10-11 12:11:24.820 +185761,57297,18198.0,2,,CC BY-SA 3.0,b510d6c8-caec-4002-917b-283ff85acac8,"I have an eigen decomposition of a 30 variable covariance matrix calculated using 5y of daily data and would like to compare it to a different 5y period to see if the eigenvalues are the same. Obviously they will not be exactly the same due to noise in the signal but can I test statistically that they are the the same? + +""An asymptotic chi-square test for the equality of two correlation matrices"" by R. Jennrich + +The closest match I have found is a paper to test the equivalence on two correlation matrices, but as I am working in the Eigenvector space I would prefer a test that is performed on the eigenvectors (plus the paper is quite old). + +Also on a similar topic what is the minimum length of time I can run a PCA analysis over for 30 variables on daily data. Clearly if i can generate more eigenvector decompositions to compare I can be more confident in my results.",,2013-10-11 12:13:44.087 +185760,57297,18198.0,3,,CC BY-SA 3.0,b510d6c8-caec-4002-917b-283ff85acac8,,,2013-10-11 12:13:44.087 +185763,57278,20144.0,5,,CC BY-SA 3.0,5ad84eb6-e4d5-4847-8dfb-13c04bbc8eba,"first of all, I want to express my apologies if the question is too broad or wrong, but I am in need of references and I have no idea of whom can I ask. 
+ +If you are interested, the question comes from a model I built, you can see some details [here](http://physics.stackexchange.com/questions/78524/boltzmann-distribution-with-interaction-between-particles) and [here](http://physics.stackexchange.com/questions/80019/grand-canonical-ensemble-with-interaction-simulation-doubts). In this model I have: +$$f(\mathbb{x}|T,\mu)=\frac{h(\mathbb{x})e^{-\frac{E(\mathbb{x})}{kT}+\mu N(x)}}{\mathcal{Z}(T,\mu)}$$ + +There, my parameters are $\mu$ and $T$, and $\mathbb{x}=(x_1,\dots,x_M)$ where $x_i\in\{0,1\}$ and I have the restriction $\forall i\in\{1,\dots,M-D+1\}$ +$$\sum_{j=0}^{D-1} x_{i+j} \leq 1$$ +This is,$h(\mathbb{x})=0$ if that condition is not held. + + +I have the ""small"" inconvenient of not knowing $\mathcal{Z}(T,\mu)$, so I used a MCMC (Metropolis-Hastings) method to approximate this function. However I face two problems. The first of them is regarding the simulation and the model, and I am on solving it (it depends too much on the initial condition). The second is that these parameters are not fully known and I have no idea of how can I estimate them. I have been reading about bayesian inference and I know a bit of estimator theory but I am no expert (furthermore I don't know if not knowing the partition function can affect). If any of you were able to give me some clue in the form of a book that I can read, I would be eternally grateful. + +Thank you very much for your help. + +Thanks to cardinal's comment, I have realized that I didn't explain one thing. It probably makes all more complex but there it goes: +The idea is that $E$ is known in each experiment, actually $E(\mathbf{x}) = \mathbf{E}\cdot\mathbf{x}$. However, $\mathbf{E}$ is not always the same, it represents an external potential for some particles. The ""good"" thing is that $T$, which accounts for the temperature, never changes whatever $\mathbf{E}$ is, so I thought that I could find a way of estimating it, given the fact that I have an empirical distribution of $x_{i}$ (so, a probability that a particle is in the position $i$) given a certain $\mathbf{E}$. So, in a way, what I have is +$$f(\mathbf{x}|T,\mu , \mathbf{E})$$, but I always know $\mathbf{E}$ and I know (can I say this?) that $T,\mu$ are independent of $\mathbf{E}$. I am sorry for not being clear enough before. I am starting to think that nothing of this makes sense...",Added information from the comments,2013-10-11 12:26:27.490 +185766,57298,,3,user30602,CC BY-SA 3.0,09f4c732-254b-4e4b-805b-8eb9318eb76a,,,2013-10-11 12:36:44.077 +185764,57298,,2,user30602,CC BY-SA 3.0,09f4c732-254b-4e4b-805b-8eb9318eb76a,"I have a function $f(x)=2ae^{-ax}(1-e^{-ax})$, for $x>0, a>0$. This is a pdf. I need to find $P(X>1)$. I have done all my work in such a way that I should get the same answer whether I use the pdf or the cdf to find this probability. However, I'm getting different answers. Can someone please help me? + +**My attempt:** + +(using pdf) $P(X>1)=\int_0^{\infty}2ae^{-ax}(1-e^{-ax})dx = 2e^{-a}-e^{-2a}$ +(using cdf) $P(X>1)= 1-P(X\leq 1) = 1 - (F_X(1)) = 1-(e^{-ax}(e^{-ax}-2))|_{x=1}=1-2e^{-a}-e^{-2a}$ + +Why are my answers different? Thanks!",,2013-10-11 12:36:44.077 +185765,57298,,1,user30602,CC BY-SA 3.0,09f4c732-254b-4e4b-805b-8eb9318eb76a,Different answers for probability density function and cumulative density function,,2013-10-11 12:36:44.077 +185767,57242,22573.0,5,,CC BY-SA 3.0,69e34ff8-77f1-4cf2-b4b3-fe462d860172,"Assume I am looking for a normal distribution $\mathcal{N}(\mu,\Sigma)$. 
For simplicity let's say we only have 2 random variables $x$ and $y$ and a known $\mu=0$. + +Is it possible to estimate $\Sigma$ by observing the variance along multiple directions? + +For example, I measure the variance $\sigma_1$ along the vector $\mathbb{v}_1 = (x_1,y_1)^T$. In another step I obtain a different variance $\sigma_2$ from a different direction $\mathbb{v}_2 = (x_2,y_2)^T$. Ideally one would continue to observe these single variations in different directions and combine them in one multivariate normal distribution. + +Does this make sense? + +EDIT: +Some additional background information might be useful: I have a sensor device with known position and orientation in 2D space (in a future step both may have an uncertainty). The sensor is able to measure only the distance of a point along its orientation. I'm also given the sensor model. So for each distance measure $d_i$, I obtain the standard error $\sigma(d_i)$ which depends on the distance. + +Since I'm not able to manipulate the sensor position to my advantage or perform a large number of measurements, I'd like to combine these variances into one covariance matrix in order to make a more reliable prediction of the position of the measured point. + +This is just a thought that is still under development with no guaranty to work out correctly. Hence my question of ""making sense""...",added background information to clarify problem,2013-10-11 12:39:25.957 +185768,57298,,5,user30602,CC BY-SA 3.0,dcd7f79b-65c2-4adc-90c4-bd6e5aebdee0,"I have a function $f(x)=2ae^{-ax}(1-e^{-ax})$, for $x>0, a>0$. This is a pdf. I need to find $P(X>1)$. I have done all my work in such a way that I should get the same answer whether I use the pdf or the cdf to find this probability. However, I'm getting different answers. Can someone please help me? + +**My attempt:** + +(using pdf) $P(X>1)=\int_1^{\infty}2ae^{-ax}(1-e^{-ax})dx = 2e^{-a}-e^{-2a}$ +(using cdf) $P(X>1)= 1-P(X\leq 1) = 1 - (F_X(1)) = 1-(e^{-ax}(e^{-ax}-2))|_{x=1}=1-2e^{-a}-e^{-2a}$ + +Why are my answers different? Thanks!",fixed grammar,2013-10-11 12:44:22.840 +185771,57299,19395.0,1,,CC BY-SA 3.0,7d28127f-c600-43ac-b3de-3beb623d882e,Before and after data: Which test for average comparison of Likert scale data?,,2013-10-11 13:10:18.780 +185770,57299,19395.0,3,,CC BY-SA 3.0,7d28127f-c600-43ac-b3de-3beb623d882e,,,2013-10-11 13:10:18.780 +185769,57299,19395.0,2,,CC BY-SA 3.0,7d28127f-c600-43ac-b3de-3beb623d882e,"I have one group of respondents which answer on a scale of 1-5 once before and once after an experiment. I want to see if the experiment made a difference to their responses. + +I was told not to use a t-test because of the Likert scale (ordinal data does not seem to fit a t-test) and because my data are not nearly normally distributed (answers to the questions lean heavily to the 1 of the scale (which is not a mistake in the design)). + +I am not sure if the Wilcoxon signed-rank test works, because it seems to be designed for differences in groups (as in ""Do men respond differently from women?""). + +Any suggestions on what could actually be used here? + +(The answer [here](http://stats.stackexchange.com/questions/20245/using-t-test-for-comparing-likert-responses-before-and-after-intervention) refers to a ""special paired t-test"", but does not explain which one)",,2013-10-11 13:10:18.780 +185772,57300,21762.0,2,,CC BY-SA 3.0,a57aeeb8-f38f-4bec-8bfe-ccab83f5821e,"Wilcoxon's signed-rank test is usually a good choice in such a situation. 
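+
+For example, with made-up pre (x) and post (y) scores for the same respondents, the base R call is simply:
+
+    x <- c(1, 2, 1, 3, 2, 1)        # before
+    y <- c(2, 3, 2, 3, 3, 2)        # after
+    wilcox.test(x, y, paired = TRUE)  # with ties like these, base R warns and uses a normal approximation
+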
It is the paired version of Wilcoxon's rank-sum test (aka Mann-Whitney-U-test). I think you are mixing these two procedures. + +Make sure you use an exact/pseudo-exact implementation of the test to account for the highly discrete distribution.",,2013-10-11 13:17:25.363 +185773,57301,22059.0,2,,CC BY-SA 3.0,9d9b1065-f9a4-4cea-adee-61e7a059d8ad,"It looks like a simple calculation error: + +\begin{align} +F_X(y)&=\int_{x=0}^y 2ae^{-ax}(1-e^{-ax})\\ +&=e^{-ax}(e^{-ax}-2)|_{x=0}^y\\ +&=e^{-2ay}-2e^{-ay}+1. +\end{align} +",,2013-10-11 13:22:52.120 +185774,57298,,5,user30602,CC BY-SA 3.0,dda38a81-a611-4837-acdd-20c4c7bc8f75,"I have a function $f(x)=2ae^{-ax}(1-e^{-ax})$, for $x>0, a>0$. This is a pdf. I need to find $P(X>1)$. I have done all my work in such a way that I should get the same answer whether I use the pdf or the cdf to find this probability. However, I'm getting different answers. Can someone please help me? + +**My attempt:** + +(using pdf) $P(X>1)=\int_1^{\infty}2ae^{-ax}(1-e^{-ax})dx = 2e^{-a}-e^{-2a}$ + +(using cdf) $P(X>1)= 1-P(X\leq 1) = 1 - (F_X(1)) = 1-(e^{-ax}(e^{-ax}-2))|_{x=1}=1-2e^{-a}-e^{-2a}$ + +Why are my answers different? Thanks!",improved formatting,2013-10-11 13:24:58.080 +185855,57325,22622.0,3,,CC BY-SA 3.0,9e44d147-9a69-4e49-baaf-af8f690a9818,,,2013-10-11 20:30:29.607 +185775,57302,750.0,2,,CC BY-SA 3.0,8c193f5c-879b-41d0-9f07-3a3f63ebf992,"You don't need extra terms in the models, less actually. It is plain to see by your plots, but if you look at `summary(m2)` you will see that the variance for random effect for `x` is really small, and the variance for the intercept is quite small as well. + +Similarly for the `m1` model, you can see from the plot that the slopes are all the same, but the intercept varies. You can use an F-test to check the model with *only* random intercepts versus the model with random slopes and intercepts you specified. + + m1 <- lmer(y~x + (x|g),data=dat1) + m1RInt <- lmer(y~x + (1|g),data=dat1) + anova(m1,m1RInt) + +Also just looking at the variance estimates of the random intercepts and effects for `summary(m1)` you would have come to the same conclusion that using random intercepts adds nothing to the model.",,2013-10-11 13:41:25.093 +185778,57303,11506.0,3,,CC BY-SA 3.0,1d68e327-b90c-4b86-a7b1-4427a21c3086,,,2013-10-11 13:50:46.157 +185777,57303,11506.0,1,,CC BY-SA 3.0,1d68e327-b90c-4b86-a7b1-4427a21c3086,Quantitative results of clustering analysis,,2013-10-11 13:50:46.157 +185776,57303,11506.0,2,,CC BY-SA 3.0,1d68e327-b90c-4b86-a7b1-4427a21c3086,"Currently, I am doing a clustering analysis for two sets of data. One smaller dataset (about 100 data) got ground truth labels, and one larger dataset (about 2000 data) has no ground truth labels. + +For the smaller dataset, obviously, I can obtain quantitative results like accuracy, sensitivity and specificity. + +However, for the larger dataset, I have no ground truth and couldn't get any useful quantitative results. + +1. The only thing I found useful is the 'mean silhouette value', which can measure the cluster performance. However, it based on some distance measure that can only tell people how separate are the clusters. I am wondering if there are other 'better' or 'more appropriate' quantitative analysis for data without labels. + +2. Because the data are without labels, I am also wondering if we can somehow have a 'uncertainty' measure about the clustering results like how confident about the cluster results? + +3. 
For the smaller dataset with labels, except accuracy, sensitivity and specificity, any other quantitative results I can get? For the classification algorithm, we can do a cross-validation, is there any method we can use to do such a cross-validation for clustering? Also, can we get ROC analysis for clustering task? + +Thanks! +A. + ",,2013-10-11 13:50:46.157 +185779,57300,21762.0,5,,CC BY-SA 3.0,a58a1e78-8a2c-4633-a9c1-7aff4fe52730,"Wilcoxon's signed-rank test is usually a good choice in such a situation. It is the paired version of Wilcoxon's rank-sum test (aka Mann-Whitney-U-test). I think you are mixing these two procedures. + +Make sure you use an exact/pseudo-exact implementation of the test to account for the highly discrete distribution. + +EDIT: How you do it in R for x (pre) and y (post) + + library(coin) + set.seed(2) + x <- sample(1:2, 20, T) + y <- sample(2:3, 20, T) + + #Basic R gives p value of 0.0007167 + wilcox.test(x-y) + + #Coin gives p value of 0.0001221 + wilcoxsign_test(x~y, distribution = exact()) + ",How you do it in R,2013-10-11 13:51:35.963 +185782,57304,9522.0,3,,CC BY-SA 3.0,6b560f2a-0d80-4352-8587-49f671a62762,,,2013-10-11 14:05:31.013 +185781,57304,9522.0,1,,CC BY-SA 3.0,6b560f2a-0d80-4352-8587-49f671a62762,Which statistic test to used?,,2013-10-11 14:05:31.013 +185780,57304,9522.0,2,,CC BY-SA 3.0,6b560f2a-0d80-4352-8587-49f671a62762,"I have performed an experiment to test the cellular sensitivity to a certain DNA damage agent. We have found 270 genes that were specifically sensitive to the drug and the total number of genes analyzed was 3668. 38 out of the 270 sensitive genes are classified as ""DNA repair genes"". If the number of ""DNA repair genes"" containing in the genome is 112 and the total number of genes in the genome in 3668, are the sensitive genes enrichment in DNA repair genes? +Which statistic may I have to used? I would appreciate if you could also tell me some tool to calculate the pvalue online because I do not much about biostatistic. +Thanks in advanced",,2013-10-11 14:05:31.013 +185783,57291,668.0,10,,,dc9f49a6-f361-4a4b-8014-0b0ac3fa8ff6,"{""Voters"":[{""Id"":8413,""DisplayName"":""Momo""},{""Id"":17230,""DisplayName"":""Scortchi""},{""Id"":1036,""DisplayName"":""Andy W""},{""Id"":7290,""DisplayName"":""gung""},{""Id"":919,""DisplayName"":""whuber""}]}",102,2013-10-11 14:07:02.583 +185786,57305,22607.0,3,,CC BY-SA 3.0,223ad603-0971-4f74-a672-2c494e949f90,,,2013-10-11 14:20:05.910 +185785,57305,22607.0,1,,CC BY-SA 3.0,223ad603-0971-4f74-a672-2c494e949f90,Interpretation of a PDF squared,,2013-10-11 14:20:05.910 +185784,57305,22607.0,2,,CC BY-SA 3.0,223ad603-0971-4f74-a672-2c494e949f90,"I have a problem where the crucial variable is the integral of the squared PDF of a random variable, i.e. + + $\int f(x)^2dx$ + +How should I interpret this property of a distribution? If $f(x)$ is gaussian, then this is inversely proportional to the variance, $\sigma^2$, but I don't think this is generally true. + +(Note that this is also equal to $\int F(x)f'(x)dx$ ). + +",,2013-10-11 14:20:05.910 +185787,57293,668.0,5,,CC BY-SA 3.0,0b28f8f5-b5f1-4632-8f81-b0c90d3ae611,"I am testing various techniques for dealing with strong multi-collinearity (MC) in a regression problem. + +There have been various comparison papers written between competing techniques such as Ridge Regression (RR) and Principal Components Regression (PCR). There seems to be no clear winner though with the best technique seemingly problem specific. 
However one thing that bothers me about the PCR approach is the somewhat arbitrary way in which one simply excludes the smallest eigenvectors as has been proven in Hadi and Ling even the smallest eigenvector may have strong predictive power while the largest eigenvectors may have none. + +> ""[Some Cautionary notes on the use of Principal Components +> Regression][1]"" by Hadi and Ling. ([PDF][2]) + +They also show that the the SSE can be vastly improved by adding seemingly insignificant eigenvectors. + +In their discussion they highlght two papers that try to address this 2nd deficiency--Lott(1973) and Gunst and Mason(1973)--but it has been shown that the Lott technique fails to pick the ""correct"" eigenvectors in the presence of strong MC, and my problem has strong MC. + +Do you know of a paper that can select the optimum set of eigenvalues even in the presence of strong MC? Or more recent papers that compare PCR and RR? + + + [1]: http://www.jstor.org/discover/10.2307/2685559?uid=3738016&uid=2&uid=4&sid=21102750270477 + [2]: http://www.uvm.edu/~rsingle/stat380/F04/possible/Hadi+Ling-AmStat-1998_PCRegression.pdf",added 5 characters in body,2013-10-11 14:20:18.940 +185856,57325,22622.0,1,,CC BY-SA 3.0,9e44d147-9a69-4e49-baaf-af8f690a9818,Marginal effect in model with interactions,,2013-10-11 20:30:29.607 +185890,57338,21746.0,2,,CC BY-SA 3.0,883f30d6-db01-40ea-b3bb-d00768bd54f3,"In Neural Nets for the regression problem, we rescale the continuous labels consistently with the output activation function, i.e. normalize them if the logistic sigmoid is used, or adjusted normalize them if tanh is used. At the end we can restore original range but renormalizing the output neurons back. + +Should we also normalize input features? And how? For example, if hidden activation differs from the output activation? E.g. if hidden activation is TANH and output activation is LOGISTIC, should the input features be normalized to lie in [0,1] or [-1,1] interval?",,2013-10-11 23:29:41.150 +185788,57306,21638.0,2,,CC BY-SA 3.0,60230186-7e87-47df-a2d8-f8290f1245fa,"Standard practice to test for enrichment of gene lists is to do a hypergeometric test or, equivalently, a one-sided [Fisher's exact test][1]. You have the following $2\times2$ contingency table: + +$$ +\array{& \text{DNA Repair} & \text{Other} \\\text{Sensitive} & 38 & 232 & 270\\\text{Not Sensitive} & 74 & 3324 & 3398 \\ & 112 & 3556} +$$ + +You can carry out the test in `R` as follows: + + fisher.test(matrix(c(38,74,232,3324),nrow=2,ncol=2),alternative=""greater"") + +Which gives a highly significant result: + + Fisher's Exact Test for Count Data + + data: matrix(c(38, 74, 232, 3324), nrow = 2, ncol = 2) + p-value < 2.2e-16 + alternative hypothesis: true odds ratio is greater than 1 + 95 percent confidence interval: + 5.062107 Inf + sample estimates: + odds ratio + 7.34918 + +Note that as we are testing for over-representation (rather than under-representation) the `alternative` parameter is set to `""greater""`. + + [1]: http://en.wikipedia.org/wiki/Fisher%27s_exact_test",,2013-10-11 14:48:42.243 +185790,57279,22262.0,5,,CC BY-SA 3.0,cdf00a11-c554-4f7f-918f-7810ee18440c,"What variable selection approach should I consider if I have thousands of predictors with clusters that are extremely correlated? + +For example I might have a predictor set $X:= \{A_1,A_2,A_3,A_4,...,A_{39},B_1,B_2,...,B_{44},C_1,C_2,...\}$ with cardinality $|X| > 2000$. Consider the case where all $\rho(A_i,A_j)$ are very high, and similarly for $B$, $C$, .... 
+ +Correlated predictors aren't correlated ""naturally""; it's a result of the feature engineering process. This is because all $A_i$ are hand engineered from the same underlying data with small variations in hand-engineering methodology, e.g. I use a thinner pass band on $A_2$ than I did for $A_1$ in my denoising approach but everything else is the same. + +My goal is to improve out of sample accuracy in my classification model. + +One approach would just be to try everything: non-negative garotte, ridge, lasso, elastic nets, random subspace learning, PCA/manifold learning, least angle regression and pick the one that's best in my out of sample dataset. But specific methods that are good at dealing with the above would be appreciated. + +Note that my out of sample data is extensive in terms of sample size. ",added 74 characters in body,2013-10-11 15:05:21.000 +185795,57307,9522.0,3,,CC BY-SA 3.0,2d649f53-aed2-41ae-a13a-2007bc0a93ca,,,2013-10-11 15:15:59.620 +185794,57307,9522.0,1,,CC BY-SA 3.0,2d649f53-aed2-41ae-a13a-2007bc0a93ca,Any online software to calculate pvalue of Fisher exact test?,,2013-10-11 15:15:59.620 +185793,57307,9522.0,2,,CC BY-SA 3.0,2d649f53-aed2-41ae-a13a-2007bc0a93ca,"I would like to do a one-sided Fisher´s exact test for an analysis. I have not any statistic software to obtain the pvalues (no SAS, no SPSS). The 2x2 tables are of this type: + +Do you know any online statistical software to calculate the pvalues? I have tried with some of them but the results indicate pvalue<0.0001 but I need to know the exact number. +The 2x2 tables are of this type: + + + + ![enter image description here][1] + + + [1]: https://i.stack.imgur.com/BUTLq.png + +Thanks a lot in advanced!",,2013-10-11 15:15:59.620 +185796,57305,,25,,,b968dc23-f4f0-4d36-8421-9cfaa1f87e3a,,http://twitter.com/#!/StackStats/status/388684734056128512,2013-10-11 15:17:16.730 +185797,57304,15827.0,4,,CC BY-SA 3.0,9e8aa6be-64d2-4ab7-bdbc-408ede818fa6,Which statistical test should be used to test for enrichment of gene lists?,more informative title; fixes to English,2013-10-11 15:20:35.360 +185798,57304,15827.0,5,,CC BY-SA 3.0,9e8aa6be-64d2-4ab7-bdbc-408ede818fa6,"I have performed an experiment to test the cellular sensitivity to a certain DNA damage agent. We have found 270 genes that were specifically sensitive to the drug and the total number of genes analyzed was 3668. 38 out of the 270 sensitive genes are classified as ""DNA repair genes"". If the number of ""DNA repair genes"" contained in the genome is 112 and the total number of genes in the genome is 3668, are the sensitive genes enrichment in DNA repair genes? +Which statistical test should be used? I would appreciate if you could also tell me some tool to calculate the p-value online. ",more informative title; fixes to English,2013-10-11 15:20:35.360 +185799,57308,22507.0,2,,CC BY-SA 3.0,d9982cf4-dc90-4cd4-94a8-c5e5c66d0fe7,"I would do the forward stepwise selection, adding predictors as long as the correlation with residuals is significant, and then do some regularization (ridge, lasso, elastic nets). There are 2-3 metaparameters: forward stepwise termination constraint, and 1 or 2 regularization parameters. These metaparameters are determined via cross-validation. + +If you want to take into account non-linearity, you could try random forest, which produces good results when there are many predictors. 
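+
+A minimal random forest sketch (assuming a hypothetical data frame df whose column y is the class label):
+
+    library(randomForest)
+    fit <- randomForest(y ~ ., data = df, ntree = 500)
+    print(fit)   # includes an out-of-bag performance estimate
+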
But it is slow.",,2013-10-11 15:33:10.407 +185800,57223,22564.0,5,,CC BY-SA 3.0,7487704b-5dae-4252-afaa-917449ff1839,"I have a problem like the following: + +1) There are six measurements for each individual with large within-subject variance + +2) There are two groups (Treatment and Control) + +3) Each group consists of 5 individuals + +4) I want to perform a significance test comparing the two groups to know if the group means are different from one another. + + +The data looks like this: +![http://s10.postimg.org/p9krg6f3t/examp.png][1] + +And I have run some simulations using this code which does t tests to compare the group means. The group means were calculated by taking the means of the individual means. **This ignores within-subject variability**: + + + n.simulations<-10000 + pvals=matrix(nrow=n.simulations,ncol=1) + for(k in 1:n.simulations){ + subject=NULL + for(i in 1:10){ + subject<-rbind(subject,as.matrix(rep(i,6))) + } + #set.seed(42) + + #Sample Subject Means + subject.means<-rnorm(10,100,2) + + #Sample Individual Measurements + values=NULL + for(sm in subject.means){ + values<-rbind(values,as.matrix(rnorm(6,sm,20))) + } + + out<-cbind(subject,values) + + #Split into GroupA and GroupB + GroupA<-out[1:30,] + GroupB<-out[31:60,] + + #Add effect size to GroupA + GroupA[,2]<-GroupA[,2]+0 + + colnames(GroupA)<-c(""Subject"", ""Value"") + colnames(GroupB)<-c(""Subject"", ""Value"") + + #Calculate Individual Means and SDS + GroupA.summary=matrix(nrow=length(unique(GroupA[,1])), ncol=2) + for(i in 1:length(unique(GroupA[,1]))){ + GroupA.summary[i,1]<-mean(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2]) + GroupA.summary[i,2]<-sd(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2]) + } + colnames(GroupA.summary)<-c(""Mean"",""SD"") + + + GroupB.summary=matrix(nrow=length(unique(GroupB[,1])), ncol=2) + for(i in 1:length(unique(GroupB[,1]))){ + GroupB.summary[i,1]<-mean(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2]) + GroupB.summary[i,2]<-sd(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2]) + } + colnames(GroupB.summary)<-c(""Mean"",""SD"") + + Summary<-rbind(cbind(1,GroupA.summary),cbind(2,GroupB.summary)) + colnames(Summary)[1]<-""Group"" + + pvals[k]<-t.test(GroupA.summary[,1],GroupB.summary[,1], var.equal=T)$p.value + } + + +And here is code for plots: + + + #Plots + par(mfrow=c(2,2)) + boxplot(GroupA[,2]~GroupA[,1], col=""Red"", main=""Group A"", + ylim=c(.9*min(out[,2]),1.1*max(out[,2])), + xlab=""Subject"", ylab=""Value"") + stripchart(GroupA[,2]~GroupA[,1], vert=T, pch=16, add=T) + #abline(h=mean(GroupA[,2]), lty=2, lwd=3) + + for(i in 1:length(unique(GroupA[,1]))){ + m<-mean(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2]) + ci<-t.test(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2])$conf.int[1:2] + + points(i-.2,m, pch=15,cex=1.5, col=""Grey"") + segments(i-.2, + ci[1],i-.2, + ci[2], lwd=4, col=""Grey"" + ) + } + legend(""topleft"", legend=c(""Individual Means +/- 95% CI""), bty=""n"", pch=15, lwd=3, col=""Grey"") + + + boxplot(GroupB[,2]~GroupB[,1], col=""Light Blue"", main=""Group B"", + ylim=c(.9*min(out[,2]),1.1*max(out[,2])), + xlab=""Subject"", ylab=""Value"") + stripchart(GroupB[,2]~GroupB[,1], vert=T, pch=16, add=T) + #abline(h=mean(GroupB[,2]), lty=2, lwd=3) + + for(i in 1:length(unique(GroupB[,1]))){ + m<-mean(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2]) + ci<-t.test(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2])$conf.int[1:2] + + points(i-.2,m, pch=15,cex=1.5, col=""Grey"") + segments(i-.2, + ci[1],i-.2, + ci[2], lwd=4, col=""Grey"" + ) + 
} + legend(""topleft"", legend=c(""Individual Means +/- 95% CI""), bty=""n"", pch=15, lwd=3, col=""Grey"") + + + boxplot(Summary[,2]~Summary[,1], col=c(""Red"",""Light Blue""), xlab=""Group"", ylab=""Average Value"", + ylim=c(.9*min(Summary[,2]),1.1*max(Summary[,2])), + main=""Individual Averages"") + stripchart(Summary[,2]~Summary[,1], vert=T, pch=16, add=T) + + points(.9, mean(GroupA.summary[,1]), pch=15,cex=1.5, col=""Grey"") + segments(.9, + t.test(GroupA.summary[,1])$conf.int[1],.9, + t.test(GroupA.summary[,1])$conf.int[2], lwd=4, col=""Grey"" + ) + + points(1.9, mean(GroupB.summary[,1]), pch=15,cex=1.5, col=""Grey"") + segments(1.9, + t.test(GroupB.summary[,1])$conf.int[1],1.9, + t.test(GroupB.summary[,1])$conf.int[2], lwd=4, col=""Grey"" + ) + legend(""topleft"", legend=c(""Group Means +/- 95% CI""), bty=""n"", pch=15, lwd=3, col=""Grey"") + + + hist(pvals, breaks=seq(0,1,by=.05), col=""Grey"", + main=c(paste(""# sims="", n.simulations), + paste(""% Sig p-values="",100*length(which(pvals<0.05))/length(pvals))) + ) + +Now, it seems to me that because each individual mean is an estimate itself, that we should be less certain about the group means than shown by the 95% confidence intervals indicated by the bottom-left panel in the figure above. Thus the p-values calculated are underestimating the true variability and should lead to increased false-positives if we wish to extrapolate to future data. + +So what is the correct way to analyze this data? + + +**Bonus:** + +The example above is a simplification. For the actual data: + +1) The within-subject variance is positively correlated with the mean. + +2) Values can only be multiples of two. + +3) The individual results are not roughly normally distributed. They suffer from zero floor effect, and have long tails at the positive end. + +4) Number of Subjects in each group are not necessarily equal. + +Previous literature has used the t-test ignoring within-subject variability and other nuances as was done for the simulations above. Are these results reliable? If I can extract some means and standard errors from the figures how would I calculate the ""correct"" p-values. + +**EDIT:** + +Ok, here is what *actual* data looks like. 
There is also three groups rather than two: + +![enter image description here][2] + +dput() of data: + + structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, + 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, + 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, + 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, + 12, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15, + 15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 18, + 18, 18, 18, 18, 18, 2, 0, 16, 2, 16, 2, 8, 10, 8, 6, 4, 4, 8, + 22, 12, 24, 16, 8, 24, 22, 6, 10, 10, 14, 8, 18, 8, 14, 8, 20, + 6, 16, 6, 6, 16, 4, 2, 14, 12, 10, 4, 10, 10, 8, 4, 10, 16, 16, + 2, 8, 4, 0, 0, 2, 16, 10, 16, 12, 14, 12, 8, 10, 12, 8, 14, 8, + 12, 20, 8, 14, 2, 4, 8, 16, 10, 14, 8, 14, 12, 8, 14, 4, 8, 8, + 10, 4, 8, 20, 8, 12, 12, 22, 14, 12, 26, 32, 22, 10, 16, 26, + 20, 12, 16, 20, 18, 8, 10, 26), .Dim = c(108L, 3L), .Dimnames = list( + NULL, c(""Group"", ""Subject"", ""Value""))) + + +**EDIT 2:** + +In response to Henrik's answer: +So if I instead perform anova followed by TukeyHSD procedure on the individual averages as shown below, I could interpret this as underestimating my p-value by about 3-4x? + +My goal with this part of the question is to understand how I, as a reader of a journal article, can better interpret previous results given their choice of analysis method. For example they have those ""stars of authority"" showing me 0.01>p>.001. So if i accept 0.05 as a reasonable cutoff I should accept their interpretation? The only additional information is mean and SEM. + + #Get Invidual Means + summary=NULL + for(i in unique(dat[,2])){ + sub<-which(dat[,2]==i) + summary<-rbind(summary,cbind( + dat[sub,1][3], + dat[sub,2][4], + mean(dat[sub,3]), + sd(dat[sub,3]) + ) + ) + } + colnames(summary)<-c(""Group"",""Subject"",""Mean"",""SD"") + + TukeyHSD(aov(summary[,3]~as.factor(summary[,1])+ (1|summary[,2]))) + + # Tukey multiple comparisons of means + # 95% family-wise confidence level + # + # Fit: aov(formula = summary[, 3] ~ as.factor(summary[, 1]) + (1 | summary[, 2])) + # + # $`as.factor(summary[, 1])` + # diff lwr upr p adj + # 2-1 -0.672619 -4.943205 3.597967 0.9124024 + # 3-1 7.507937 1.813822 13.202051 0.0098935 + # 3-2 8.180556 2.594226 13.766885 0.0046312 + + + + +**EDIT 3:** +I think we are getting close to my understanding. 
Here is the simulation described in the comments to @Stephane: + + + #Get Subject Means + means<-aggregate(Value~Group+Subject, data=dat, FUN=mean) + + #Initialize ""dat2"" dataframe + dat2<-dat + + #Initialize within-Subject sd + s<-.001 + pvals=matrix(nrow=10000,ncol=2) + + for(j in 1:10000){ + #Sample individual measurements for each subject + temp=NULL + for(i in 1:nrow(means)){ + temp<-c(temp,rnorm(6,means[i,3], s)) + } + + #Set new values + dat2[,3]<-temp + + #Take means of sampled values and fit to model + dd2 <- aggregate(Value~Group+Subject, data=dat2, FUN=mean) + fit2 <- lm(Value~Group, data=dd2) + + #Save sd and pvalue + pvals[j,]<-cbind(s,anova(fit2)[[5]][5]) + + #Update sd + s<-s+.001 + } + + plot(pvals[,1],pvals[,2], xlab=""Within-Subject SD"", ylab=""P-value"") + + +![enter image description here][6] + + + [1]: https://i.stack.imgur.com/55V9J.png + [2]: https://i.stack.imgur.com/k1xWd.png + [3]: https://i.stack.imgur.com/55V9J.png + [4]: https://i.stack.imgur.com/55V9J.png + [5]: https://i.stack.imgur.com/55V9J.png + [6]: https://i.stack.imgur.com/gMMDY.png",added simulation code and results,2013-10-11 15:48:11.567 +185803,57309,22611.0,3,,CC BY-SA 3.0,044a8796-48fe-4b04-a9b8-a58d5a39ea09,,,2013-10-11 16:00:23.980 +185802,57309,22611.0,1,,CC BY-SA 3.0,044a8796-48fe-4b04-a9b8-a58d5a39ea09,Covary two dummy variables in SEM?,,2013-10-11 16:00:23.980 +185801,57309,22611.0,2,,CC BY-SA 3.0,044a8796-48fe-4b04-a9b8-a58d5a39ea09,"I am running a structural equation model (SEM) in Amos 18, and I want to test the impact of marital status on several latent variables. Marital status is nominal, so I created three dummy variables: + +1. Mar_Single: 1 = yes, 0 = no +2. Mar_Married: 1 = yes, 0 = no +3. Mar_Other: 1 = yes, 0 = no + +I included Mar_Single and Mar_Married in the SEM, so their coefficients will be interpreted against the omitted (reference) group, Mar_Other. The modification indices suggested fit could be improved significantly if I covary Mar_Single and Mar_Married. Should I do this? In a way, this makes sense because they are perfectly correlated: If Mar_Single = 1 then Mar_Married will always = 0. + +Larry",,2013-10-11 16:00:23.980 +185804,57310,20972.0,2,,CC BY-SA 3.0,d52a1b12-de82-4883-8197-7151c98c073f,"Microsoft research have an online tool [here](http://research.microsoft.com/en-us/um/redmond/projects/mscompbio/fisherexacttest). You can also download an Excel add-in from [here](http://www.real-statistics.com/free-download/). + +Your result according to the Microsoft tool is 6.511E-017.",,2013-10-11 16:12:37.340 +185839,57320,19331.0,2,,CC BY-SA 3.0,2b808cf5-0887-42c0-b297-4d7dc397064d,"The spatial power covariance structure is a generalization of the first-order autoregressive covariance structure. Where the first-order autoregressive structure assumes the time points are equally spaced, the spatial power structure can account for a continuous time point. In reality, we could just forget the first-order autoregressive structure entirely, because if we fit the spatial power structure when the data are equally spaced we'll get the same answer as when using the first-order autoregressive structure. + +All that aside, the correlation function you're looking for is `corCAR1()`, which is the continuous first-order autoregressive structure. 
If you're looking to duplicate what you fit in SAS, then the code you're looking for is: + + gls(CD4t~T, data=df, na.action = (na.omit), method = ""REML"", + corr=corCAR1(form=~T|NUM_PAT)) + +Of course, you don't need to specify `method = ""REML""`, since, as in SAS, the default method in `gls()` is already restricted maximum likelihood.",,2013-10-11 19:13:11.093 +185805,57254,22583.0,5,,CC BY-SA 3.0,fc5a157b-f9d6-46f5-ae4e-68e95f3552d2,"here is my situation. I am weighting a packet of material that has 10 individual units in it. In the end of the day I would like to know the average weight and variance of the individual units but the problem is that I cannot weight each unit individually since I would have to destroy the packet to get to the individual units. So in lieu of this, I am trying to make an inference of the individual units from what I know about the packets. I weighed 10 packets (hence I have 100 individual units). I was able to figure out the average weight of the units but am having trouble with the variance. Here is what I have done so far: + +$$ +\begin{split} +\bar{y}&=\frac{1}{10}\sum^{10}_{i=1}y_i\\ + &=\frac{1}{10}\sum^{10}_{i=1} (x_{i,1}+x_{i,2}+...+x_{i,10})~~[since~y_i=x_{i,1}+x_{i,2}+...+x_{i,10}]\\ + &=\frac{1}{10}\sum^{100}_{j=1}x_j\\ + &=\frac{1}{10}(100~\bar{x})=10~\bar{x} +\end{split} +$$ + +thus we have the average of $x$, $\bar{x}=\frac{\bar{y}}{10}.$ But now my challenge is how to do I find variance of $x$ given the variance of $y$? Any suggestions? Thanks! + +::::UPDATE:::: + +After some thought I came up with this reasoning: +$$ +\begin{split} +\frac{1}{10}var(y)&=var(\bar{y})\\ + &=var(10~\bar{x})\\ + &=100~var(\bar{x})\\ + &=100~\frac{1}{100}var(x)~~[assuming~that~all~x~are~i.i.d.]\\ + &=var(x) +\end{split} +$$ + +thus we have $var(x)=\frac{1}{10}var(y).$ I am correct in that if we assume that all the individual units share the same common variance and are independent of each other, this result holds? +",added 486 characters in body,2013-10-11 16:15:38.403 +185806,52871,18447.0,5,,CC BY-SA 3.0,8ea92f64-b133-4850-96aa-5e9ffd28d1b2,"I want to analyze a multilevel multidimensional model in WinBUGS. the model is as below (N=2362 students responding to K=45 items of a test, students are nested within J=116 schools): + + model{ + #responses + for(i in 1:N){ + for(j in 1:K){ + logit(p[i,j])<- a1[j]*th[i,1]+a2[j]*th[i,2]-b[j] + y[i,j]~dbern(p[i,j] ) + } + th[i,1:2]~dmnorm(mu[sc[i],1:2],tau.p[1:2,1:2]) + } + #school level + for(j in 1:J){ + mu[j,1:2]~dmnorm(m[j,1:2],tau.s[1:2,1:2]) + } + + #priors + for(j in 1:J){ + m[j,1:2]~dmnorm(m0[1:2],cov[1:2,1:2]) + } + + tau.p[1:2,1:2]~dwish(cov[1:2,1:2],2) + tau.s[1:2,1:2]~dwish(cov[1:2,1:2],2) + sigma.p[1:2,1:2]<-inverse(tau.p[,]) + sigma.s[1:2,1:2]<-inverse(tau.s[,]) + s2p<-sum(sigma.p[,]) + s2s<-sum(sigma.s[,]) + rho<-(s2s)/(s2s+s2p) + + a1[1]~dlnorm(0,4) + a2[1]<-0 + b[1]~dnorm(0,1) + for(s in 2:K) { + a1[s]~dlnorm(0,4) + a2[s]~dlnorm(0,4) + b[s]~dnorm(0,1) + } + } + +I've set these functions as initial values: + + ini<-function(){ + list(tau.p=matrix(rgamma(4,100,100),2,2), + tau.s=matrix(rgamma(4,100,100),2,2), + th=rmvnorm(N,mean=c(0,0),sigma=diag(2)), + m=rmvnorm(J,mean=c(0,0),sigma=diag(2)), + mu=rmvnorm(J,mean=c(0,0),sigma=diag(2)), + a1=rlnorm(K,0, 0.4), + a2=c(NA,rlnorm(K-1,0, 0.4)), + b=rnorm(45,0,0.5)) + } +I use rube package in R to check and run my analysis and everything looks fine. When I run the model I receive ""Trap 66 (postcondition violated)"" or ""undefined real result"". 
I think the problem is from the initials but I have no idea how to solve it. + +Any idea? +",added 4 characters in body,2013-10-11 16:26:08.007 +185807,57276,22310.0,5,,CC BY-SA 3.0,4a14ee46-e043-46c6-9976-2f6d4388ebe0,"I am examining how English ivy affects the occurrence of a salamander species under cover objects (e.g., logs). Soil moisture is assumed to be the major factor that affect their occurrence. + +My hypothesized pathway: The presence/absence of salamanders under cover objects is either a direct consequence of changes in ivy-induced abioitc environment (i.e., drier soil) or an indirect result of changes in prey community that resulted from altered abiotic factors. But, there are multiple factors, other than English ivy, that affect soil moisture. + +![enter image description here][1] + +My questions are: + +1. I think that a path analysis is most suitable for testing my causal mechanisms. But, given a small sample size (n = 71), is a path analysis appropriate? + +2. Another potential problem for a path analysis is that the effects of English ivy on soil moisture seem to depend on the other factors (e.g., the number of overstory trees), as shown below. Are there any way to account for such patterns in a path analysis? + + ![The relationship between soil moisture and English ivy cover on cover objects (""the number of overstory trees"" for the left graph) for different levels of the surrounding overstory trees (""English ivy cover on cover objects"" for the left graph][2] + +3. Are there any other analyses suitable for testing my hypothesized relationships? I am considering multiple (linear and logistic) regressions, but again my sample size is small **AND** regressions do not reflect my hypothesized causal relationships accurately. + +I am using R, so any recommended code would be greatly helpful (I am a relatively new R user, though). + + + [1]: https://i.stack.imgur.com/k65Ag.jpg + [2]: https://i.stack.imgur.com/ArgZm.jpg",deleted 1 characters in body,2013-10-11 16:42:08.147 +185810,57311,22612.0,2,,CC BY-SA 3.0,bf3d2c94-b4b2-499c-9d02-838657b178d4,"dependent variables (ordinal): credit rating 1970 (cr70) and credit rating 1980 (cr80). + +Here is what i want to do: + +regress cr80-cr70 = independent vars. + +how could this be done and how could you interpret it!? +if the dependent variable is continuous it would be simple. but can you make a new var from the difference of two ordinal vars, and have that be the dependent var?",,2013-10-11 16:46:12.820 +185809,57311,22612.0,1,,CC BY-SA 3.0,bf3d2c94-b4b2-499c-9d02-838657b178d4,change in category variable,,2013-10-11 16:46:12.820 +185808,57311,22612.0,3,,CC BY-SA 3.0,bf3d2c94-b4b2-499c-9d02-838657b178d4,,,2013-10-11 16:46:12.820 +185811,57268,18040.0,5,,CC BY-SA 3.0,fca076d2-c4ab-4b54-968a-f4993e8bda16,"I have a question about how to tell two different mixed effects models apart. In the simple case both involve fitting a model with a random group effect and a covariate. I fit the model with `lme4` in `R`. Here is a visualization of the two different scenarios. 
+![enter image description here][1] + + library(ggplot2) + library(lme4) + gen_dat2 <- function(group.m,group.v,int, sl,n){ + x <- vector() + y <- vector() + g <- vector() + for(i in 1:length(group.m)){ + x.t <- rnorm(n,group.m[i],group.v[i]) + y.t <- rnorm(n,group.m[i],group.v[i])+int + sl*x.t + x <- c(x,x.t) + y <- c(y,y.t) + g <- c(g,rep(i,n)) + } + return(cbind(x,y,g)) + } + + group.m <- runif(5,1,20) + group.v <- runif(5,1,1.5) + + dat2 <- data.frame(gen_dat2(group.m,group.v,1,4,14)) + ggplot(dat2,aes(x=x,y=y,colour=as.factor(g),group=g))+geom_point()+stat_smooth(method=""lm"",se=F) + m2 <- lmer(y~x + (x|g),data=dat2) + + + +Then I can generate and fit the other scenario with similar code: + +![enter image description here][2] + + gen_dat <- function(group.m,group.v,int, sl,n){ + x <- vector() + y <- vector() + g <- vector() + for(i in 1:length(group.m)){ + x.t <- rnorm(n,0,1) + y.t <- rnorm(n,group.m[i],group.v[i])+int + sl*x.t + x <- c(x,x.t) + y <- c(y,y.t) + g <- c(g,rep(i,n)) + } + return(cbind(x,y,g)) + } + + group.m <- runif(5,1,20) + group.v <- runif(5,1,1.5) + + dat1 <- data.frame(gen_dat(group.m,group.v,1,4,14)) + ggplot(dat1,aes(x=x,y=y,colour=as.factor(g),group=g))+geom_point()+stat_smooth(method=""lm"",se=F) + m1 <- lmer(y~x + (x|g),data=dat1) + + +My central question is how do I tell these two models apart? Am I incorrectly fitting the first one, and I need an extra term in there to model the relationships between groups and the x variable as well as y? Both detect substantial between group variation in the intercept and not much in the slope as I would predict. But I need a way to tell these two apart. Any thoughts would be helpful. + + +---- +Edits: + +This has been helpful in me restating the question. So I want to re-ask the question with an example which I hope will make it clear why I want to be able to tell these two models apart. Let's imagine that Y is the average student test score at a school, and X is spending per student in that school. Our grouping variables are 5 different school districts. + +Data in the top figure shows that an increase in spending within a district means that test scores increase. It also shows that between districts there are differences is scores, but that's clearly because some districts spend more student than others. + +Data in the second figure show similarly that within a district student scores increase as spending increases. It also shows that between districts there are differences in test scores. However we don't know what is driving those differences, unlike in the first set of data. This is a pretty common situation I've encountered in building models. The former is not. + +So what I'm asking is what is the appropriate model that captures the following features from the first dataset: + +1. Test scores increase as spending per student does +2. There is also variance between districts in student test scores +3. Part of that difference between districts is because of the underlying relationship between spending and test scores, which also varies with district. + +More generally stated, how do you handle a scenario where you're building a hierarchical model where the grouping variable is correlated with one of your continuous independent variables (e.g. the first scenario). I feel like the model I've presented get's at points 1. and 2., but not point 3. So I'm really seeking a way to tease these two scenarios apart. + +Normally I might add an extra level of hierarchy if there was another group level explanatory variable. 
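+
+(For concreteness, a sketch of what adding such a group-level predictor would look like; z here is a purely hypothetical district-level covariate, merged onto the individual-level data from the second scenario:)
+
+    z_by_g <- data.frame(g = 1:5, z = rnorm(5))       # hypothetical district-level covariate
+    dat3   <- merge(dat1, z_by_g, by = ""g"")
+    m3     <- lmer(y ~ x + z + (x | g), data = dat3)  # z enters as a group-level fixed effect
+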
Continuing our example, maybe in the 2nd dataset there are differences between district because in some districts parents have more time to spend on homework with students. So we would add that as a group level predictor in a hierarchical model. But that wouldn't work in the first scenario. + + + [1]: https://i.stack.imgur.com/GgxYw.png + [2]: https://i.stack.imgur.com/pgGm7.png",added 2179 characters in body,2013-10-11 16:57:17.710 +185814,57312,18198.0,3,,CC BY-SA 3.0,ebfdaff6-5f8b-41f5-8072-dd73d86ea0be,,,2013-10-11 16:59:04.847 +185813,57312,18198.0,1,,CC BY-SA 3.0,ebfdaff6-5f8b-41f5-8072-dd73d86ea0be,Degrees of Freedom for Ridge regression without knowing the Ridge Parameter?,,2013-10-11 16:59:04.847 +185812,57312,18198.0,2,,CC BY-SA 3.0,ebfdaff6-5f8b-41f5-8072-dd73d86ea0be,"There is a very nice post here that gives a neat solution to the problem of finding the ridge parameter when the degrees of freedom are known: + +http://stats.stackexchange.com/questions/8309/how-to-calculate-regularization-parameter-in-ridge-regression-given-degrees-of-f + +My question is how can you know what the degrees of freedom are before knowing the ridge parameter value is? ( I have added a comment to the other thread but its quite old so thought it best to create a new topic). + + + +",,2013-10-11 16:59:04.847 +185886,57336,22624.0,1,,CC BY-SA 3.0,3e33c97c-5173-4d5e-8712-f1829175ea73,How to test if a result is statistically significant?,,2013-10-11 22:55:22.160 +185885,57336,22624.0,2,,CC BY-SA 3.0,3e33c97c-5173-4d5e-8712-f1829175ea73,"I am trying to determine if a certain conversion on my site is statistically significant. I remembered doing this type of stuff in school but I can't seem to remember how to do it now. + +For 1st set: n = 7297 and conversion was 2.618% +For 2nd set: n = 6107 and conversion was 2.669% + +Any tips on how to do this?",,2013-10-11 22:55:22.160 +185893,57305,1895.0,10,,,74c08ada-b6b3-4956-b263-e32229699128,"{""OriginalQuestionIds"":[9926],""Voters"":[{""Id"":2970,""DisplayName"":""cardinal""}]}",101,2013-10-11 23:34:46.400 +185815,57259,6162.0,5,,CC BY-SA 3.0,67c1a788-1182-41f1-9cd5-9ee5c43295bd,"Now, try to you write down the model: $y_{ijk} = ...$ where $y_{ijk}$ is the $k$-th value for individual $j$ of group $i$. 
Then look at what happens for the means $\bar y_{ij\bullet}$: you get a classical Gaussian linear model, with variance homogeneity because there are $6$ repeated measures for each subject: + + > xtabs(~Group+Subject, data=dat) + Subject + Group 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 + 1 6 6 6 6 6 6 6 0 0 0 0 0 0 0 0 0 0 0 + 2 0 0 0 0 0 0 0 6 6 6 6 6 6 6 6 0 0 0 + 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 6 6 + +Thus, since you are interested in mean comparisons only, **you don't need to resort to a random-effect or generalised least-squares model** - just use a classical (fixed effects) model using the means $\bar y_{ij\bullet}$ as the observations: + + tdat <- transform(dat, tvalue=f(Value)) + dd <- aggregate(tvalue~Group+Subject, data=tdat, FUN=mean) + fit3 <- lm(tvalue~Group, data=dd) + +The ANOVA provides the same answer as @Henrik's approach (and that shows that Kenward-Rogers approximation is correct): + + > anova(fit3) + Analysis of Variance Table + + Response: tvalue + Df Sum Sq Mean Sq F value Pr(>F) + Group 2 3.3799 1.68994 4.121 0.03747 * + +Then you can use `TukeyHSD()` or the `lsmeans` package for multiple comparisons: + + > TukeyHSD(aov(fit3), ""Group"") + Tukey multiple comparisons of means + 95% family-wise confidence level + + Fit: aov(formula = fit3) + + $Group + diff lwr upr p adj + 2-1 -0.07541248 -0.93627828 0.7854533 0.9719148 + 3-1 1.11885667 -0.02896441 2.2666777 0.0565628 + 3-2 1.19426915 0.06817536 2.3203629 0.0370434 + + > library(lsmeans) + > lsmeans(fit3, pairwise~Group) + + $`Group pairwise differences` + estimate SE df t.ratio p.value + 1 - 2 0.07541248 0.3314247 15 0.22754 0.97191 + 1 - 3 -1.11885667 0.4418996 15 -2.53193 0.05656 + 2 - 3 -1.19426915 0.4335348 15 -2.75472 0.03704 + p values are adjusted using the tukey method for 3 means ",added 1 characters in body,2013-10-11 17:18:12.757 +185816,57302,750.0,5,,CC BY-SA 3.0,0646fd40-6972-41a9-9736-37c0cc24afa2,"You don't need extra terms in the models, less actually. It is plain to see by your plots, but if you look at `summary(m2)` you will see that the variance for random effect for `x` is really small, and the variance for the intercept is quite small as well. + +Similarly for the `m1` model, you can see from the plot that the slopes are all the same, but the intercept varies. You can use an F-test to check the model with *only* random intercepts versus the model with random slopes and intercepts you specified. + + m1 <- lmer(y~x + (x|g),data=dat1) + m1RInt <- lmer(y~x + (1|g),data=dat1) + anova(m1,m1RInt) + +Also just looking at the variance estimates of the random intercepts and effects for `summary(m1)` you would have come to the same conclusion that using random slopes adds nothing to the model.",deleted 4 characters in body,2013-10-11 17:19:50.067 +185817,57313,12544.0,2,,CC BY-SA 3.0,46fc523c-4c82-4644-830b-63aff0ce567e,"Yes, you should. You should always correlate exogenous variables - their correlations are not part of the model, and if you don't, you're either worsening fit, or getting degrees of freedom (which appear to improve fit) when you shouldn't. + +When you do this in regression analysis, they are correlated (and there's no way of stopping them). + +Also, you say ""If Mar_Single = 1 then Mar_Married will always = 0."" Yes, but that doesn't make them perfectly correlated - because if mar_single = 0 mar_married might be 0 or might be 1. 
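+
+A two-line check in R makes this concrete (balanced made-up data with three marital-status categories):
+
+    status <- c(""single"", ""married"", ""other"", ""single"", ""married"", ""other"")
+    cor(as.numeric(status == ""single""), as.numeric(status == ""married""))  # -0.5 here, not -1, because the ""other"" rows are 0 on both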
+ +If you use Mplus, it will correlate x-variables by default and will not include these correlations in the null model which it uses to calculate CFI and NFI. ",,2013-10-11 17:21:17.923 +185818,57278,,5,,CC BY-SA 3.0,54ab29a9-ae69-49f4-ba63-6c7f6ec7ec4e,"First of all, I want to express my apologies if the question is too broad or wrong, but I am in need of references and I have no idea whom I can ask. + +If you are interested, the question comes from a model I built, you can see some details [here](http://physics.stackexchange.com/questions/78524/boltzmann-distribution-with-interaction-between-particles) and [here](http://physics.stackexchange.com/questions/80019/grand-canonical-ensemble-with-interaction-simulation-doubts). In this model I have: +$$f(\mathbb{x}|T,\mu)=\frac{h(\mathbb{x})e^{-\frac{E(\mathbb{x})}{kT}+\mu N(x)}}{\mathcal{Z}(T,\mu)}$$ + +There, my parameters are $\mu$ and $T$, and $\mathbb{x}=(x_1,\dots,x_M)$ where $x_i\in\{0,1\}$ and I have the restriction $\forall i\in\{1,\dots,M-D+1\}$ +$$\sum_{j=0}^{D-1} x_{i+j} \leq 1$$ +This is, $h(\mathbb{x})=0$ if that condition is not held. + + +I have the ""small"" inconvenience of not knowing $\mathcal{Z}(T,\mu)$, so I used a MCMC (Metropolis-Hastings) method to approximate this function. However I face two problems. + + - The first of them regards the simulation and the model and I am on solving it (it depends too much on the initial condition). + + - The second is that these parameters are not fully known and I have no idea how can I estimate them. I have been reading about Bayesian inference and I know a bit of estimation theory but I am no expert (furthermore I don't know if not knowing the partition function can affect the result). If any of you were able to give me some clue in the form of a book that I can read, I would be eternally grateful. + +Thank you very much for your help. + +Thanks to cardinal's comment, I have realized that I didn't explain one thing. It probably makes all more complex but there it goes: +The idea is that $E$ is known in each experiment, actually $E(\mathbf{x}) = \mathbf{E}\cdot\mathbf{x}$. However, $\mathbf{E}$ is not always the same, it represents an external potential for some particles. The ""good"" thing is that $T$, which accounts for the temperature, never changes whatever $\mathbf{E}$ is, so I thought that I could find a way of estimating it, given the fact that I have an empirical distribution of $x_{i}$ (so, a probability that a particle is in the position $i$) given a certain $\mathbf{E}$. So, in a way, what I have is +$$f(\mathbf{x}|T,\mu , \mathbf{E})$$, but I always know $\mathbf{E}$ and I know (can I say this?) that $T,\mu$ are independent of $\mathbf{E}$. I am sorry for not being clear enough before. I am starting to think that nothing of this makes sense...",orthography and type setting,2013-10-11 17:28:42.097 +185820,57314,3446.0,1,,CC BY-SA 3.0,62261775-d0aa-4875-bdb1-c54d43828957,Coverage rates of confidence intervals in reality,,2013-10-11 17:31:46.623 +185821,57314,3446.0,2,,CC BY-SA 3.0,62261775-d0aa-4875-bdb1-c54d43828957,"One proves mathematically that if assumptions of a model are satisfied, then the coverage rate of a $100p\%$ confidence interval is $100p\%$. But then statistics gets applied to the world, where model assumptions may not be satisfied. 
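+
+(A toy simulation, not real-world data, of the kind of gap being asked about: the empirical coverage of a nominal 95% t-interval when the data are skewed and n is small.)
+
+    set.seed(1)
+    covered <- replicate(10000, {
+      x  <- rexp(10)             # true mean is 1; the normality assumption is clearly violated
+      ci <- t.test(x)$conf.int
+      ci[1] < 1 && 1 < ci[2]
+    })
+    mean(covered)                # empirical coverage; comes out below the nominal 0.95
+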
Are there any studies comparing the coverage rates of confidence intervals applied to the real world with theoretical coverage rates?",,2013-10-11 17:31:46.623 +185819,57314,3446.0,3,,CC BY-SA 3.0,62261775-d0aa-4875-bdb1-c54d43828957,,,2013-10-11 17:31:46.623 +185824,57315,5984.0,3,,CC BY-SA 3.0,87accd3a-01a1-47cf-974d-9c515e15e2fe,,,2013-10-11 17:38:34.780 +185823,57315,5984.0,1,,CC BY-SA 3.0,87accd3a-01a1-47cf-974d-9c515e15e2fe,What does it mean that random effects are highly correlated?,,2013-10-11 17:38:34.780 +185840,57321,6162.0,2,,CC BY-SA 3.0,e112a1e8-2c33-4a31-bd80-ecbceb6f65f6,"> In the answers there (if I understood correctly) I learned that +> within-subject variance does not effect inferences made about group +> means and it is ok to simply take the averages of averages to +> calculate group mean, then calculate within-group variance and use +> that to perform significance tests. + +Let me develop this idea here. The model for the indivual observations is +$$y_{ijk}= \mu_i + \alpha_{ij} + \epsilon_{ijk}$$, where : + + - $y_{ijk}$ is the $k$-th measurement of individual $j$ of group $i$ + + - $\alpha_{ij} \sim_{\text{iid}} {\cal N}(0, \sigma^2_b)$ is the random effect for individual $j$ of group $i$ + + - $\epsilon_{ijk} \sim_{\text{iid}} {\cal N}(0, \sigma^2_w)$ is the within-error + +In [my answer to your first question][1], I have suggested you to note that one obtains a classical (fixed effects) Gaussian linear model for the group means $\bar y_{ij\bullet}$. Indeed you can easily check that $\bar y_{ij\bullet} = \mu_i + \delta_{ij}$ with $$\delta_{ij} = \alpha_{ij} + \frac{1}{K}\sum_k \epsilon_{ijk} +\sim_{\text{iid}} {\cal N}(0, \sigma^2) \quad \text{with } \quad \boxed{\sigma^2=\sigma^2_b+\frac{\sigma^2_w}{K}},$$ +assuming $K$ repeated measurements for each individual. This is nothing but the one-way ANOVA model with a fixed factor. + +And then I claimed that in order to draw inference about the $\mu_i$ you can simply consider the simple classical linear model whose observations are the group means $\bar y_{ij\bullet}$. I think I spoke too quickly, and **I'd like to know the advice of an expert about this point**. I know it works here, but is it due to the fact that the observed group means $\bar y_{ij\bullet}$ are sufficient statistics for the $\mu_i$ ? (I do not remember the theory of sufficient statistics). + +> In the answers there (if I understood correctly) I learned that +> within-subject variance does not effect inferences made about group +> means and it is ok to simply take the averages of averages to +> calculate group mean, then calculate within-group variance and use +> that to perform significance tests. I would like to use a method where +> the larger the within subject variance the less sure I am about the +> group means or understand why it does not make sense to desire that. + +As you see from the boxed formula, the within-variance $\sigma^2_w$ plays a role in the model for the observed group means. + + [1]: http://stats.stackexchange.com/a/72490/8402",,2013-10-11 19:18:23.050 +185894,57333,1895.0,33,,,81a5156c-119c-4bea-84e5-bbd86b21169a,,818,2013-10-11 23:40:09.413 +185822,57315,5984.0,2,,CC BY-SA 3.0,87accd3a-01a1-47cf-974d-9c515e15e2fe,"What does it mean when two random effects are highly or perfectly correlated? +That is, in R when you call summary on a mixed model object, under ""Random effects"" ""corr"" is 1 or -1. + + + summary(model.lmer) + Random effects: + Groups Name Variance Std.Dev. 
Corr + popu (Intercept) 2.5714e-01 0.5070912 + amdclipped 4.2505e-04 0.0206167 1.000 + nutrientHigh 7.5078e-02 0.2740042 1.000 1.000 + amdclipped:nutrientHigh 6.5322e-06 0.0025558 -1.000 -1.000 -1.000 + +I know this is bad and indicates that the random effects part of the model is too complex, but I'm trying to understand + + - 1)what is doing on statistically + - 2)what is going on practically with + the structure of the response variables. + + + + + +**Example** + +Here is an example based on ""[GLMMs in action: gene-by-environment interaction in total fruit production of wild populations of Arabidopsis thaliana][1]"" +by Bolker et al + +Download data + + download.file(url = ""http://glmm.wdfiles.com/local--files/trondheim/Banta_TotalFruits.csv"", destfile = ""Banta_TotalFruits.csv"") + dat.tf <- read.csv(""Banta_TotalFruits.csv"", header = TRUE) + +Set up factors + + dat.tf <- transform(dat.tf,X=factor(X),gen=factor(gen),rack=factor(rack),amd=factor(amd,levels=c(""unclipped"",""clipped"")),nutrient=factor(nutrient,label=c(""Low"",""High""))) + +Modeling log(total.fruits+1) with ""population"" (popu) as random effect + + model.lmer <- lmer(log(total.fruits+1) ~ nutrient*amd + (amd*nutrient|popu), data= dat.tf) + +Accessing the Correlation matrix of the random effects show that everything is perfectly correlated + + attr(VarCorr(model.lmer)$popu,""correlation"") + + (Intercept) amdclipped nutrientHigh amdclipped:nutrientHigh + (Intercept) 1 1 1 -1 + amdclipped 1 1 1 -1 + nutrientHigh 1 1 1 -1 + amdclipped:nutrientHigh -1 -1 -1 1 + + +I understand that these are the correlation coefficients of two vectors of random effects coefficients, such as + + cor(ranef(model.lmer)$popu$amdclipped, ranef(model.lmer)$popu$nutrientHigh) + +Does a high correlation mean that the two random effects contain redundant information? Is this analogous to multicollinearity in multiple regression when a model with highly correlated predictors should be simplified? + + + + + + + [1]: http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=3&cad=rja&ved=0CDYQFjAC&url=http://glmm.wdfiles.com/local--files/examples/Banta_ex.pdf&ei=hTNYUpuzBu7J4APN5YHYBg&usg=AFQjCNG65VjvqOLeYLFxJZnzmlMevgEbuA&bvm=bv.53899372,d.dmg +",,2013-10-11 17:38:34.780 +185825,57273,22596.0,5,,CC BY-SA 3.0,383215c7-62af-4ddf-a393-d3323ce80b0f,"I could really use some guided help! I'm having difficulty understanding an MCMC implementation in terms of modeling a data set. I'm working on generating parameters from stellar light curves, and was asked to look into implementing an MCMC algorithm. A large chuck on the code is written in Python, so I've been trying to use [emcee hammer](http://dan.iel.fm/emcee/) to generate parameter fits. But going through the code, it's just not ""clicking"" how the method works. + +I have a set of data (time vs flux) of two stars orbiting each other such that from our point of view, they eclipse. There are dips in the light curve to signify this. All I'm attempting to do is get the parameters of the system dependent on the characteristics of these dips. + +In the emcee implementation, there are a few functions that I understand: the posterior function which, I believe, simply generates a data set given the set of parameters. Then there's a prior function which, I assume, is the function given a previous set of parameters. Somehow the algorithm chooses whether or not the jump to the posterior parameter set is to be done? I'm guessing that's what the use of the likelihood function is? 
To describe whether or not to take the jump? + +I apologize, I'm quite confused on how this is to be implemented in terms a defined set of data.",added 4 characters in body,2013-10-11 17:58:05.320 +185828,57316,22615.0,3,,CC BY-SA 3.0,79f8a73c-1353-4fca-b24f-cf8465773f56,,,2013-10-11 18:00:38.057 +185826,57316,22615.0,2,,CC BY-SA 3.0,79f8a73c-1353-4fca-b24f-cf8465773f56,"I have two samples s1 and s2 of count data. The sample size is > 1000 each. The distributions look similar to a Poisson distribution but the variance is much larger than the mean. + +How do I test whether the mean of s1 is larger than the mean of s2?",,2013-10-11 18:00:38.057 +185827,57316,22615.0,1,,CC BY-SA 3.0,79f8a73c-1353-4fca-b24f-cf8465773f56,mean difference for count data,,2013-10-11 18:00:38.057 +185830,57317,22564.0,1,,CC BY-SA 3.0,e5bf0aba-3241-413e-9b9d-408ecc22b571,"When making inferences about group means, are credible Intervals sensitive to within-subject variance while confidence intervals are not?",,2013-10-11 18:05:20.930 +185831,57317,22564.0,2,,CC BY-SA 3.0,e5bf0aba-3241-413e-9b9d-408ecc22b571,"This is a spin off of this question: +http://stats.stackexchange.com/questions/72453/how-to-compare-two-groups-with-multiple-measurements-for-each-individual-with-r + +In the answers there (if I understood correctly) I learned that within-subject variance does not effect inferences made about group means and it is ok to simply take the averages of averages to calculate group mean, then calculate within-group variance and use that to perform significance tests. I would like to use a method where the larger the within subject variance the less sure I am about the group means or understand why it does not make sense to desire that. + +Here is a plot of the original data along with some simulated data that used the same subject means, but sampled the individual measurements for each subject from a normal distribution using those means and a small within-subject variance (sd=.1). As can be seen the group level confidence intervals (bottom row) are unaffected by this (at least the way I calculated them). + +![enter image description here][1] + + +I also used rjags to estimate the group means in three ways. +1) Use the raw original data +2) Use only the Subject means +3) Use the simulated data with small within-subject sd + +The results are below. Using this method we see that the 95% credible intervals are narrower in cases #2 and #3. This meets my intuition of what I would like to occur when making inferences about group means, but I am not sure if this is just some artifact of my model or a property of credible intervals. + +Note. To use rjags you need to first install JAGS from here: +http://sourceforge.net/projects/mcmc-jags/files/ + +![enter image description here][2] + +The various code is below. 
+ +The original data: + + structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, + 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, + 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, + 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, + 12, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15, + 15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 18, + 18, 18, 18, 18, 18, 2, 0, 16, 2, 16, 2, 8, 10, 8, 6, 4, 4, 8, + 22, 12, 24, 16, 8, 24, 22, 6, 10, 10, 14, 8, 18, 8, 14, 8, 20, + 6, 16, 6, 6, 16, 4, 2, 14, 12, 10, 4, 10, 10, 8, 4, 10, 16, 16, + 2, 8, 4, 0, 0, 2, 16, 10, 16, 12, 14, 12, 8, 10, 12, 8, 14, 8, + 12, 20, 8, 14, 2, 4, 8, 16, 10, 14, 8, 14, 12, 8, 14, 4, 8, 8, + 10, 4, 8, 20, 8, 12, 12, 22, 14, 12, 26, 32, 22, 10, 16, 26, + 20, 12, 16, 20, 18, 8, 10, 26), .Dim = c(108L, 3L), .Dimnames = list( + NULL, c(""Group"", ""Subject"", ""Value""))) + + + +Get subject Means and simulate the data with small within-subject variance: + + #Get Subject Means + means<-aggregate(Value~Group+Subject, data=dat, FUN=mean) + + #Initialize ""dat2"" dataframe + dat2<-dat + + #Sample individual measurements for each subject + temp=NULL + for(i in 1:nrow(means)){ + temp<-c(temp,rnorm(6,means[i,3], .1)) + } + + #Set Simulated values + dat2[,3]<-temp + + +The function to fit the JAGS model: + + require(rjags) + + #Jags fit function + jags.fit<-function(dat2){ + + #Create JAGS model + modelstring = "" + + model{ + for(n in 1:Ndata){ + y[n]~dnorm(mu[subj[n]],tau[subj[n]]) T(0, ) + } + + for(s in 1:Nsubj){ + mu[s]~dnorm(muG,tauG) T(0, ) + tau[s] ~ dgamma(5,5) + } + + + muG~dnorm(10,.01) T(0, ) + tauG~dgamma(1,1) + + } + "" + writeLines(modelstring,con=""model.txt"") + + ############# + + #Format Data + Ndata = nrow(dat2) + subj = as.integer( factor( dat2$Subject , + levels=unique(dat2$Subject ) ) ) + Nsubj = length(unique(subj)) + y = as.numeric(dat2$Value) + + dataList = list( + Ndata = Ndata , + Nsubj = Nsubj , + subj = subj , + y = y + ) + + #Nodes to monitor + parameters=c(""muG"",""tauG"",""mu"",""tau"") + + + #MCMC Settings + adaptSteps = 1000 + burnInSteps = 1000 + nChains = 1 + numSavedSteps= nChains*10000 + thinSteps=20 + nPerChain = ceiling( ( numSavedSteps * thinSteps ) / nChains ) + + + #Create Model + jagsModel = jags.model( ""model.txt"" , data=dataList, + n.chains=nChains , n.adapt=adaptSteps , quiet=FALSE ) + # Burn-in: + cat( ""Burning in the MCMC chain...\n"" ) + update( jagsModel , n.iter=burnInSteps ) + + # Getting DIC data: + load.module(""dic"") + + + # The saved MCMC chain: + cat( ""Sampling final MCMC chain...\n"" ) + codaSamples = coda.samples( jagsModel , variable.names=parameters , + n.iter=nPerChain , thin=thinSteps ) + + mcmcChain = as.matrix( codaSamples ) + + result = list(codaSamples=codaSamples, mcmcChain=mcmcChain) + + } + + +Fit the model to each group of each dataset: + + #Fit to raw data + groupA<-jags.fit(dat[which(dat[,1]==1),]) + groupB<-jags.fit(dat[which(dat[,1]==2),]) + groupC<-jags.fit(dat[which(dat[,1]==3),]) + + #Fit to subject mean data + groupA2<-jags.fit(means[which(means[,1]==1),]) + groupB2<-jags.fit(means[which(means[,1]==2),]) + groupC2<-jags.fit(means[which(means[,1]==3),]) + + 
#Fit to simulated raw data (within-subject sd=.1) + groupA3<-jags.fit(dat2[which(dat2[,1]==1),]) + groupB3<-jags.fit(dat2[which(dat2[,1]==2),]) + groupC3<-jags.fit(dat2[which(dat2[,1]==3),]) + + + +Credible interval/highest density interval function: + + #HDI Function + get.HDI<-function(sampleVec,credMass){ + sortedPts = sort( sampleVec ) + ciIdxInc = floor( credMass * length( sortedPts ) ) + nCIs = length( sortedPts ) - ciIdxInc + ciWidth = rep( 0 , nCIs ) + for ( i in 1:nCIs ) { + ciWidth[ i ] = sortedPts[ i + ciIdxInc ] - sortedPts[ i ] + } + HDImin = sortedPts[ which.min( ciWidth ) ] + HDImax = sortedPts[ which.min( ciWidth ) + ciIdxInc ] + HDIlim = c( HDImin , HDImax, credMass ) + return( HDIlim ) + } + + + +First Plot: + + layout(matrix(c(1,1,2,2,3,4),nrow=3,ncol=2, byrow=T)) + + boxplot(dat[,3]~dat[,2], + xlab=""Subject"", ylab=""Value"", ylim=c(0, 1.2*max(dat[,3])), + col=c(rep(""Red"",length(which(dat[,1]==unique(dat[,1])[1]))/6), + rep(""Green"",length(which(dat[,1]==unique(dat[,1])[2]))/6), + rep(""Blue"",length(which(dat[,1]==unique(dat[,1])[3]))/6) + ), + main=""Original Data"" + ) + stripchart(dat[,3]~dat[,2], vert=T, add=T, pch=16) + legend(""topleft"", legend=c(""Group A"", ""Group B"", ""Group C"", ""Individual Means +/- 95% CI""), + col=c(""Red"",""Green"",""Blue"", ""Grey""), lwd=3, bty=""n"", pch=c(15), + pt.cex=c(rep(0.1,3),1), + ncol=3) + + for(i in 1:length(unique(dat[,2]))){ + m<-mean(examp[which(dat[,2]==unique(dat[,2])[i]),3]) + ci<-t.test(dat[which(dat[,2]==unique(dat[,2])[i]),3])$conf.int[1:2] + + points(i-.3,m, pch=15,cex=1.5, col=""Grey"") + segments(i-.3, + ci[1],i-.3, + ci[2], lwd=4, col=""Grey"" + ) + } + + + + boxplot(dat2[,3]~dat2[,2], + xlab=""Subject"", ylab=""Value"", ylim=c(0, 1.2*max(dat2[,3])), + col=c(rep(""Red"",length(which(dat2[,1]==unique(dat2[,1])[1]))/6), + rep(""Green"",length(which(dat2[,1]==unique(dat2[,1])[2]))/6), + rep(""Blue"",length(which(dat2[,1]==unique(dat2[,1])[3]))/6) + ), + main=c(""Simulated Data"", ""Same Subject Means but Within-Subject SD=.1"") + ) + stripchart(dat2[,3]~dat2[,2], vert=T, add=T, pch=16) + legend(""topleft"", legend=c(""Group A"", ""Group B"", ""Group C"", ""Individual Means +/- 95% CI""), + col=c(""Red"",""Green"",""Blue"", ""Grey""), lwd=3, bty=""n"", pch=c(15), + pt.cex=c(rep(0.1,3),1), + ncol=3) + + for(i in 1:length(unique(dat2[,2]))){ + m<-mean(examp[which(dat2[,2]==unique(dat2[,2])[i]),3]) + ci<-t.test(dat2[which(dat2[,2]==unique(dat2[,2])[i]),3])$conf.int[1:2] + + points(i-.3,m, pch=15,cex=1.5, col=""Grey"") + segments(i-.3, + ci[1],i-.3, + ci[2], lwd=4, col=""Grey"" + ) + } + + + means<-aggregate(Value~Group+Subject, data=dat, FUN=mean) + + boxplot(means[,3]~means[,1], col=c(""Red"",""Green"",""Blue""), + ylim=c(0,1.2*max(means[,3])), ylab=""Value"", xlab=""Group"", + main=""Original Data"" + ) + stripchart(means[,3]~means[,1], pch=16, vert=T, add=T) + + for(i in 1:length(unique(means[,1]))){ + m<-mean(means[which(means[,1]==unique(means[,1])[i]),3]) + ci<-t.test(means[which(means[,1]==unique(means[,1])[i]),3])$conf.int[1:2] + + points(i-.3,m, pch=15,cex=1.5, col=""Grey"") + segments(i-.3, + ci[1],i-.3, + ci[2], lwd=4, col=""Grey"" + ) + } + legend(""topleft"", legend=c(""Group Means +/- 95% CI""), bty=""n"", pch=15, lwd=3, col=""Grey"") + + + means2<-aggregate(Value~Group+Subject, data=dat2, FUN=mean) + + boxplot(means2[,3]~means2[,1], col=c(""Red"",""Green"",""Blue""), + ylim=c(0,1.2*max(means2[,3])), ylab=""Value"", xlab=""Group"", + main=""Simulated Data Group Averages"" + ) + 
stripchart(means2[,3]~means2[,1], pch=16, vert=T, add=T) + + for(i in 1:length(unique(means2[,1]))){ + m<-mean(means[which(means2[,1]==unique(means2[,1])[i]),3]) + ci<-t.test(means[which(means2[,1]==unique(means2[,1])[i]),3])$conf.int[1:2] + + points(i-.3,m, pch=15,cex=1.5, col=""Grey"") + segments(i-.3, + ci[1],i-.3, + ci[2], lwd=4, col=""Grey"" + ) + } + legend(""topleft"", legend=c(""Group Means +/- 95% CI""), bty=""n"", pch=15, lwd=3, col=""Grey"") + + +Second Plot: + + + layout(matrix(c(1,2,3,4,4,4,5,5,5,6,6,6),nrow=4,ncol=3, byrow=T)) + + #Plot priors + plot(seq(0,10,by=.01),dgamma(seq(0,10,by=.01),5,5), type=""l"", lwd=4, + xlab=""Value"", ylab=""Density"", + main=""Prior on Within-Subject Precision"" + ) + plot(seq(0,10,by=.01),dgamma(seq(0,10,by=.01),1,1), type=""l"", lwd=4, + xlab=""Value"", ylab=""Density"", + main=""Prior on Within-Group Precision"" + ) + plot(seq(0,300,by=.01),dnorm(seq(0,300,by=.01),10,100), type=""l"", lwd=4, + xlab=""Value"", ylab=""Density"", + main=""Prior on Group Means"" + ) + + + #Set overall xmax value + x.max<-1.1*max(groupA$mcmcChain[,""muG""],groupB$mcmcChain[,""muG""],groupC$mcmcChain[,""muG""], + groupA2$mcmcChain[,""muG""],groupB2$mcmcChain[,""muG""],groupC2$mcmcChain[,""muG""], + groupA3$mcmcChain[,""muG""],groupB3$mcmcChain[,""muG""],groupC3$mcmcChain[,""muG""] + ) + + + #Plot result for raw data + #Set ymax + y.max<-1.1*max(density(groupA$mcmcChain[,""muG""])$y,density(groupB$mcmcChain[,""muG""])$y,density(groupC$mcmcChain[,""muG""])$y) + + plot(density(groupA$mcmcChain[,""muG""]),xlim=c(0,x.max), + ylim=c(-.1*y.max,y.max), lwd=3, col=""Red"", + main=""Group Mean Estimates: Fit to Raw Data"", xlab=""Value"" + ) + lines(density(groupB$mcmcChain[,""muG""]), lwd=3, col=""Green"") + lines(density(groupC$mcmcChain[,""muG""]), lwd=3, col=""Blue"") + + hdi<-get.HDI(groupA$mcmcChain[,""muG""], .95) + segments(hdi[1],-.033*y.max,hdi[2],-.033*y.max, lwd=3, col=""Red"") + + hdi<-get.HDI(groupB$mcmcChain[,""muG""], .95) + segments(hdi[1],-.066*y.max,hdi[2],-.066*y.max, lwd=3, col=""Green"") + + hdi<-get.HDI(groupC$mcmcChain[,""muG""], .95) + segments(hdi[1],-.099*y.max,hdi[2],-.099*y.max, lwd=3, col=""Blue"") + + #### + + #Plot result for mean data + + #x.max<-1.1*max(groupA2$mcmcChain[,""muG""],groupB2$mcmcChain[,""muG""],groupC2$mcmcChain[,""muG""]) + y.max<-1.1*max(density(groupA2$mcmcChain[,""muG""])$y,density(groupB2$mcmcChain[,""muG""])$y,density(groupC2$mcmcChain[,""muG""])$y) + + plot(density(groupA2$mcmcChain[,""muG""]),xlim=c(0,x.max), + ylim=c(-.1*y.max,y.max), lwd=3, col=""Red"", + main=""Group Mean Estimates: Fit to Subject Means"", xlab=""Value"" + ) + lines(density(groupB2$mcmcChain[,""muG""]), lwd=3, col=""Green"") + lines(density(groupC2$mcmcChain[,""muG""]), lwd=3, col=""Blue"") + + hdi<-get.HDI(groupA2$mcmcChain[,""muG""], .95) + segments(hdi[1],-.033*y.max,hdi[2],-.033*y.max, lwd=3, col=""Red"") + + hdi<-get.HDI(groupB2$mcmcChain[,""muG""], .95) + segments(hdi[1],-.066*y.max,hdi[2],-.066*y.max, lwd=3, col=""Green"") + + hdi<-get.HDI(groupC2$mcmcChain[,""muG""], .95) + segments(hdi[1],-.099*y.max,hdi[2],-.099*y.max, lwd=3, col=""Blue"") + + + + + #### + #Plot result for simulated data + #Set ymax + #x.max<-1.1*max(groupA3$mcmcChain[,""muG""],groupB3$mcmcChain[,""muG""],groupC3$mcmcChain[,""muG""]) + y.max<-1.1*max(density(groupA3$mcmcChain[,""muG""])$y,density(groupB3$mcmcChain[,""muG""])$y,density(groupC3$mcmcChain[,""muG""])$y) + + plot(density(groupA3$mcmcChain[,""muG""]),xlim=c(0,x.max), + ylim=c(-.1*y.max,y.max), lwd=3, 
col=""Red"", + main=c(""Group Mean Estimates: Fit to Simulated data"", ""(Within-Subject SD=0.1)""), xlab=""Value"" + ) + lines(density(groupB3$mcmcChain[,""muG""]), lwd=3, col=""Green"") + lines(density(groupC3$mcmcChain[,""muG""]), lwd=3, col=""Blue"") + + hdi<-get.HDI(groupA3$mcmcChain[,""muG""], .95) + segments(hdi[1],-.033*y.max,hdi[2],-.033*y.max, lwd=3, col=""Red"") + + hdi<-get.HDI(groupB3$mcmcChain[,""muG""], .95) + segments(hdi[1],-.066*y.max,hdi[2],-.066*y.max, lwd=3, col=""Green"") + + hdi<-get.HDI(groupC3$mcmcChain[,""muG""], .95) + segments(hdi[1],-.099*y.max,hdi[2],-.099*y.max, lwd=3, col=""Blue"") + + + + [1]: https://i.stack.imgur.com/eiRJ9.png + [2]: https://i.stack.imgur.com/H2rnX.png + + + + + + + + + + + +",,2013-10-11 18:05:20.930 +185829,57317,22564.0,3,,CC BY-SA 3.0,e5bf0aba-3241-413e-9b9d-408ecc22b571,,,2013-10-11 18:05:20.930 +185834,57318,18198.0,3,,CC BY-SA 3.0,ba61d32b-7b3d-4449-9ad0-6f0ab1204e89,,,2013-10-11 18:05:44.790 +185833,57318,18198.0,1,,CC BY-SA 3.0,ba61d32b-7b3d-4449-9ad0-6f0ab1204e89,Iterative method to find Ridge Regression Parameter,,2013-10-11 18:05:44.790 +185832,57318,18198.0,2,,CC BY-SA 3.0,ba61d32b-7b3d-4449-9ad0-6f0ab1204e89,"I have seen a method whereby instead of trying to estimate the ridge parameter (k) directly from the data (using one of the many many ridge parameter estimators in the literature) you solve for it iteratively. + +The method is simple enough: You simply increase k (in suitably small steps) until the condition number is reduced blow 10. + +At first blush this seems like quite a nice solution to me but I've never seen a Ridge Regression paper/book that uses it. + +Is this method theoretically sound though? Even if (as I suspect) it isn't does it really matter for the average practitioner who just want to produce more stable estimates of their Beta's (the weights in the regression) rather than having them ""blow up"" to grossly unrealistic values when they experience severe MC? + +Truly I would like to find a better method than this ideally with a solid theoretical underpinning but its hard to see from a practical view point it can be improved upon? +",,2013-10-11 18:05:44.790 +185835,57314,,25,,,b2d52d57-2e32-409a-b05f-0c290b856fab,,http://twitter.com/#!/StackStats/status/388730032749047808,2013-10-11 18:17:16.803 +185838,57319,1693.0,3,,CC BY-SA 3.0,e3cb4aac-6b0a-4fca-8b20-1c89c118df41,,,2013-10-11 18:20:07.563 +185837,57319,1693.0,1,,CC BY-SA 3.0,e3cb4aac-6b0a-4fca-8b20-1c89c118df41,How high must RSQ be for a suppressor effect to show up?,,2013-10-11 18:20:07.563 +185836,57319,1693.0,2,,CC BY-SA 3.0,e3cb4aac-6b0a-4fca-8b20-1c89c118df41,"I am modeling an outcome for hospital patients, 'RA' (whether readmitted). My predictor of interest is 'HHS' (whether referred to Home Health Services such as from a visiting nurse). Those referred readmit at a 15.2% rate; others, 9.2%, but the former are needier, sicker patients. Conventional thinking is that if we controlled for severity of illness this difference would not only be washed out but would reverse itself. In other words, holding constant the severity of illness, having HHS should mean a lower RA rate. + +With HHS as the sole predictor, B in a logistic regression = 0.6 (N ~ 25k). B is reduced to 0.2 with a group of covariates controlled, each accounting for some aspect of severity of illness, but B doesn't fall below zero. 
+ +HHS alone explains only about 1% of the variance in RA; with the other predictors, this becomes 4%.* Perhaps this is the problem--that the model is not explaining enough variance for the covariates to ""succeed"" in reversing the sign of the coefficient of interest. If this is true, is there a way to estimate how high explained variance needs to be for such a theorized suppressor effect to show up? + + +*Using either of 2 pseudo-RSQ formulas; Cox & Snell's or Menard's [-2LL0 - (-2LL1)] / [-2LL0.]",,2013-10-11 18:20:07.563 +186491,57513,10594.0,1,,CC BY-SA 3.0,96a35508-4fe5-4957-98d5-5291599c2ef8,"A categorical variable in glm shows signifcance from analysis of deviance, but each level is not significant in z-test",,2013-10-15 09:49:58.953 +185841,57321,6162.0,5,,CC BY-SA 3.0,0ef0f224-ad4f-4234-958b-5846b1dceac2,"> In the answers there (if I understood correctly) I learned that +> within-subject variance does not effect inferences made about group +> means and it is ok to simply take the averages of averages to +> calculate group mean, then calculate within-group variance and use +> that to perform significance tests. + +Let me develop this idea here. The model for the individual observations is +$$y_{ijk}= \mu_i + \alpha_{ij} + \epsilon_{ijk}$$, where : + + - $y_{ijk}$ is the $k$-th measurement of individual $j$ of group $i$ + + - $\alpha_{ij} \sim_{\text{iid}} {\cal N}(0, \sigma^2_b)$ is the random effect for individual $j$ of group $i$ + + - $\epsilon_{ijk} \sim_{\text{iid}} {\cal N}(0, \sigma^2_w)$ is the within-error + +In [my answer to your first question][1], I have suggested you to note that one obtains a classical (fixed effects) Gaussian linear model for the group means $\bar y_{ij\bullet}$. Indeed you can easily check that $$\bar y_{ij\bullet} = \mu_i + \delta_{ij}$$ with $$\delta_{ij} = \alpha_{ij} + \frac{1}{K}\sum_k \epsilon_{ijk} +\sim_{\text{iid}} {\cal N}(0, \sigma^2) \quad \text{where } \quad \boxed{\sigma^2=\sigma^2_b+\frac{\sigma^2_w}{K}},$$ +assuming $K$ repeated measurements for each individual. This is nothing but the one-way ANOVA model with a fixed factor. + +And then I claimed that in order to draw inference about the $\mu_i$ you can simply consider the simple classical linear model whose observations are the group means $\bar y_{ij\bullet}$. I think I spoke too quickly, and **I'd like to know the advice of an expert about this point**. I know it works here, but is it due to the fact that the observed group means $\bar y_{ij\bullet}$ are sufficient statistics for the $\mu_i$ ? (I do not remember the theory of sufficient statistics). + +> In the answers there (if I understood correctly) I learned that +> within-subject variance does not effect inferences made about group +> means and it is ok to simply take the averages of averages to +> calculate group mean, then calculate within-group variance and use +> that to perform significance tests. I would like to use a method where +> the larger the within subject variance the less sure I am about the +> group means or understand why it does not make sense to desire that. + +As you see from the boxed formula, the within-variance $\sigma^2_w$ plays a role in the model for the observed group means. 
+ + [1]: http://stats.stackexchange.com/a/72490/8402",added 5 characters in body,2013-10-11 19:26:55.980 +185842,57321,6162.0,5,,CC BY-SA 3.0,6eff36e6-c83c-485e-9ec8-19ca81963fbd,"> In the answers there (if I understood correctly) I learned that +> within-subject variance does not effect inferences made about group +> means and it is ok to simply take the averages of averages to +> calculate group mean, then calculate within-group variance and use +> that to perform significance tests. + +Let me develop this idea here. The model for the individual observations is +$$y_{ijk}= \mu_i + \alpha_{ij} + \epsilon_{ijk}$$, where : + + - $y_{ijk}$ is the $k$-th measurement of individual $j$ of group $i$ + + - $\alpha_{ij} \sim_{\text{iid}} {\cal N}(0, \sigma^2_b)$ is the random effect for individual $j$ of group $i$ + + - $\epsilon_{ijk} \sim_{\text{iid}} {\cal N}(0, \sigma^2_w)$ is the within-error + +In [my answer to your first question][1], I have suggested you to note that one obtains a classical (fixed effects) Gaussian linear model for the subjects means $\bar y_{ij\bullet}$. Indeed you can easily check that $$\bar y_{ij\bullet} = \mu_i + \delta_{ij}$$ with $$\delta_{ij} = \alpha_{ij} + \frac{1}{K}\sum_k \epsilon_{ijk} +\sim_{\text{iid}} {\cal N}(0, \sigma^2) \quad \text{where } \quad \boxed{\sigma^2=\sigma^2_b+\frac{\sigma^2_w}{K}},$$ +assuming $K$ repeated measurements for each individual. This is nothing but the one-way ANOVA model with a fixed factor. + +And then I claimed that in order to draw inference about the $\mu_i$ you can simply consider the simple classical linear model whose observations are the subjects means $\bar y_{ij\bullet}$. I think I spoke too quickly, and **I'd like to know the advice of an expert about this point**. I know it works here, but is it due to the fact that the observed subjects means $\bar y_{ij\bullet}$ are sufficient statistics for the $\mu_i$ ? (I do not remember the theory of sufficient statistics). + +> In the answers there (if I understood correctly) I learned that +> within-subject variance does not effect inferences made about group +> means and it is ok to simply take the averages of averages to +> calculate group mean, then calculate within-group variance and use +> that to perform significance tests. I would like to use a method where +> the larger the within subject variance the less sure I am about the +> group means or understand why it does not make sense to desire that. + +As you see from the boxed formula, the within-variance $\sigma^2_w$ plays a role in the model for the observed group means. + + [1]: http://stats.stackexchange.com/a/72490/8402",added 5 characters in body,2013-10-11 19:32:39.250 +185843,57317,6162.0,6,,CC BY-SA 3.0,d3f09106-aa38-4a26-a36c-58951f89d14c,,tag: mixed model,2013-10-11 19:33:19.393 +185844,57322,10964.0,1,,CC BY-SA 3.0,a069fd69-1082-45df-8e41-9fcab9caf0ed,Explainging p-value to a sophisticated layman,,2013-10-11 19:42:47.813 +185846,57322,10964.0,3,,CC BY-SA 3.0,a069fd69-1082-45df-8e41-9fcab9caf0ed,,,2013-10-11 19:42:47.813 +185845,57322,10964.0,2,,CC BY-SA 3.0,a069fd69-1082-45df-8e41-9fcab9caf0ed,"I Think I understand the concept of p-value but unfortunately I still have to exert a lot of brain cycles to get my arms around it. 
+ +I would like to get an explanation of the p-value that is rigorous enough for a sophisticated layman - something that would be intuitive.",,2013-10-11 19:42:47.813 +185849,57323,22618.0,3,,CC BY-SA 3.0,06c7f167-fdfe-4f10-87ac-77e31dcad6c8,,,2013-10-11 19:57:04.153 +185848,57323,22618.0,1,,CC BY-SA 3.0,06c7f167-fdfe-4f10-87ac-77e31dcad6c8,Simple recommender system - where to start?,,2013-10-11 19:57:04.153 +185847,57323,22618.0,2,,CC BY-SA 3.0,06c7f167-fdfe-4f10-87ac-77e31dcad6c8,"Without going into specifics, I'm currently working on a system that involves 20-25 questions being answered as either Green, Yellow, Orange or Red. After completing a subset of these questions (many questions can be left as defaulting to Green), the system allows our users to choose one outcome out of four, roughly corresponding to the answers they entered (OutcomeGreen, OutcomeYellow, OutcomeOrange or OutcomeRed). The answer that was selected most tends to be a good indicator as to what outcome they will select, but that's not always the case. + +After having this system in place for the last 2 years, now I've received a request to have the system itself make a recommendation as to which outcome the user should select. Using data already accumulated over this period, I'd like to get some insight as to which questions/answers tend to be most influential for specific outcomes, and possibly give them more weight when determining what to recommend. + +My main dilemma is that my last class on statistics was more than 20 years ago, and just looking through the tags here made me feel that I'm out of my depth. With the description I've provided, and the vast knowledge contained within this SE: + + - Is there anything I should be looking into (tools, subset of + CrossValidated tags) that would help gain better insight, and where I + should look for more information? + - Is there a quick way to get up-to-speed on what I'm missing? + +**Background:** I'm a developer in many programming languages, and an amateur mathematician (mostly playing around in number theory and linear programming). I'm also a quick learner; I've been learning how to use R in my spare time. I just need some indication as to where I would find info quickly that would help me move forward with this.",,2013-10-11 19:57:04.153 +185850,57324,5045.0,2,,CC BY-SA 3.0,8056a22e-d2ba-441c-b727-be0df2d7e7c2,"Take a look at the tooth brushing example at the very start of Chapter 14 of Andrew Vicker's book [What is a p-value anyway? 34 Stories to Help You Actually Understand Statistics][1]. It's starts on page 57 or you can use the table of contents button in the bottom right corner to find it. + + + [1]: http://www.pearsonhighered.com/vickers/",,2013-10-11 20:00:54.043 +185851,57322,668.0,10,,,23363b32-864a-4213-beb0-a55014c31fcc,"{""OriginalQuestionIds"":[31],""Voters"":[{""Id"":919,""DisplayName"":""whuber""}]}",101,2013-10-11 20:05:32.083 +185852,57322,5045.0,6,,CC BY-SA 3.0,04d4c461-4b0d-462c-b669-08585fd4a99b,,edited tags,2013-10-11 20:10:25.547 +185854,57322,1895.0,4,,CC BY-SA 3.0,a6f63b15-15e7-4985-9d1b-083036000919,Explaining p-value to a sophisticated layman,edited body; edited title,2013-10-11 20:28:53.333 +185853,57322,1895.0,5,,CC BY-SA 3.0,a6f63b15-15e7-4985-9d1b-083036000919,"I think I understand the concept of p-value but unfortunately I still have to exert a lot of brain cycles to get my arms around it. 
+ +I would like to get an explanation of the p-value that is rigorous enough for a sophisticated layman - something that would be intuitive.",edited body; edited title,2013-10-11 20:28:53.333 +185888,57324,15827.0,5,,CC BY-SA 3.0,6fe1dd84-7aa2-4398-b335-4e134c923c23,"Take a look at the tooth brushing example at the very start of Chapter 14 of Andrew Vickers' book [What is a p-value anyway? 34 Stories to Help You Actually Understand Statistics][1]. It starts on page 57 or you can use the table of contents button in the bottom right corner to find it. + + + [1]: http://www.pearsonhighered.com/vickers/",small fixes to spelling and punctuation,2013-10-11 23:19:18.117 +185858,57325,668.0,5,,CC BY-SA 3.0,e7229950-388a-4f5d-bfef-b0f528a007e7,"I am running cross-sectional regressions of the type + +$$Y_c = \alpha + \beta X_1 + \gamma X_2 + \delta_1 X_3 + \delta_2 X_1 X_3 + \delta_3 X_2 X_3 + e_c.$$ + +**My theoretical model** implies that + + - $\delta_2$ should be negative, + - $\delta_3$ should be positive, and + - the marginal effect of $X_3$ should be negative. + +**My estimates** imply that + + - $\widehat\delta_2$ is negative and significant, + - $\widehat\delta_3$ is positive and insignificant, + - $\widehat\beta$ is significant, and + - $\widehat\gamma$ is insignificant. + +Building on this evidence, can I calculate the marginal effect of $X_3$ as $\delta_1 + \delta_2 E(X_1)$ where $E(X_1)$ is the mean of $X_1$, justifying this procedure with the fact that all the terms incorporating $X_2$ are insignificant? +",Formatting and spelling typos.,2013-10-11 20:35:21.577 +185861,57326,20742.0,3,,CC BY-SA 3.0,44ad9cfc-3126-44ef-ad5b-d1ebf910bc9c,,,2013-10-11 20:36:25.720 +185860,57326,20742.0,1,,CC BY-SA 3.0,44ad9cfc-3126-44ef-ad5b-d1ebf910bc9c,Sweeping across multiple classifiers and choosing the best?,,2013-10-11 20:36:25.720 +185859,57326,20742.0,2,,CC BY-SA 3.0,44ad9cfc-3126-44ef-ad5b-d1ebf910bc9c,"I'm using Weka to perform classification, clustering, and some regression on a few large data sets. I'm currently trying out all the classifiers (decision tree, SVM, naive bayes, etc.). + +Is there an automated way (in Weka or other machine learning toolkit) to sweep through all the available classifier algorithms to find the one that produces the best cross-validated accuracy or other metric? I'm not talking about boosting; rather, I'm looking to just choose the best classifier using a given data set. + +I'd like to find the best clustering algorithm, too, for my other clustering problem; perhaps finding the lowest sum-of-squared-error?",,2013-10-11 20:36:25.720 +185863,57327,22623.0,1,,CC BY-SA 3.0,ee41bc56-0b27-4b12-be8b-5d1ee7e5f1f6,Binary features for prediction,,2013-10-11 20:40:31.053 +185862,57327,22623.0,2,,CC BY-SA 3.0,ee41bc56-0b27-4b12-be8b-5d1ee7e5f1f6,"I have a set of relatively long (~1000) binary features with scalar values [0-10] attached to them. My aim is to write a predictor that learns to map the features to the [0-10] interval to predict new features from given a new binary vector. I used SVM and Lasso with leave-one-out performance analysis, but both always end up predicting the mean value of the distribution (correlates to the histogram of all the feature - scalar distribution). The histograms are also rather norm / rayleigh distributions. Suggestions for algorithms / feature space mapping? My main problem is that I am dealing with binary features for the first time. 
+ +Thanks, EL",,2013-10-11 20:40:31.053 +185864,57327,22623.0,3,,CC BY-SA 3.0,ee41bc56-0b27-4b12-be8b-5d1ee7e5f1f6,,,2013-10-11 20:40:31.053 +185867,57328,13396.0,2,,CC BY-SA 3.0,c3398ae7-d890-41fd-bd59-f74c5eb2c099,"Let's say I want to generate $M$ sequences $p_j$, where $j = 1,\ldots,N$. I want $\mathbb{E}[ p_j ] \to 0$ and $\mathbb{E}[p_j \, p_k] \to \delta_{j, k}$ as $M \to +\infty$, where the expectation is taken across the $M$ samples. + +In practice, I can generate an $M \times N$ matrix of i.i.d. unit normals. For example, in MATLAB, `Z = randn(M, N)`. Then I get $p_j$ from the $j$-th column of $Z$. + +For a finite value of $M$, the sample mean $\mathbb{E}[ p_j ] \neq 0$, but I can ""fix"" the problem if I remove the sample mean by working with $q_j = p_j - \mathbb{E}[ p_j ]$. + +My question is -- how do I continue to improve my sequences, so that I get the 2nd-order moments I want, i.e., $\text{corr}(q_j, q_k) = \delta_{j,k}$ even when $M$ is finite?",,2013-10-11 20:40:44.083 +185866,57328,13396.0,1,,CC BY-SA 3.0,c3398ae7-d890-41fd-bd59-f74c5eb2c099,Improve the quality of psuedo-randomly generated uncorrelated unit normals,,2013-10-11 20:40:44.083 +185865,57328,13396.0,3,,CC BY-SA 3.0,c3398ae7-d890-41fd-bd59-f74c5eb2c099,,,2013-10-11 20:40:44.083 +185868,57328,1895.0,4,,CC BY-SA 3.0,1592c43f-94a6-4169-a889-3d588dfcf211,Improving the quality of pseudo-randomly generated uncorrelated unit normals,edited title,2013-10-11 20:43:25.953 +185871,57329,10135.0,3,,CC BY-SA 3.0,94b8d6e5-7f30-4380-8932-1296c5fa7d62,,,2013-10-11 20:54:18.747 +185869,57329,10135.0,2,,CC BY-SA 3.0,94b8d6e5-7f30-4380-8932-1296c5fa7d62,"I have two statistical models. Model 1 uses a [GLM][1] approach while model 2 uses a time series approach for fitting. I want to compare these two models. + +Model 1 (i.e. GLM) has a better out of sample performance. Model 2 has a better [BIC][2] criteria. So based on out of sample performance, I should pick up model 1 and based on BIC I should pick up model 2 as the preferred model. + +I should add that in this context and for the question I am trying to answer, Both the BIC and out of sample performance are important. The question is how to choose the best model in this case? Should I consider other criteria? Please let me know if you know any good reference with similar cases. + + + [1]: http://en.wikipedia.org/wiki/Generalized_linear_model + [2]: http://en.wikipedia.org/wiki/Bayesian_information_criterion",,2013-10-11 20:54:18.747 +185870,57329,10135.0,1,,CC BY-SA 3.0,94b8d6e5-7f30-4380-8932-1296c5fa7d62,BIC vs. Out of sample performance,,2013-10-11 20:54:18.747 +185872,57328,13396.0,5,,CC BY-SA 3.0,38868be5-f7b3-453a-81d6-4fbb0d884748,"Let's say I want to generate $N$ sequences $p_j$, where $j = 1,\ldots,N$. Each sequence has a length of $M$. I want $\mathbb{E}[ p_j ] \to 0$ and $\text{corr}(p_j, p_k) \to \delta_{j, k}$ as $M \to +\infty$. + +In practice, I can generate an $M \times N$ matrix of i.i.d. unit normals. For example, in MATLAB, `Z = randn(M, N)`. Then I get $p_j$ from the $j$-th column of $Z$. + +For a finite value of $M$, the sample mean $\mathbb{E}[ p_j ] \neq 0$, but I can ""fix"" the problem if I remove the sample mean by working with $q_j = p_j - \mathbb{E}[ p_j ]$. 
+ +My question is -- how do I continue to improve my sequences, so that I get the 2nd-order moments I want, i.e., $\text{corr}(q_j, q_k) = \delta_{j,k}$ even when $M$ is finite?",added 36 characters in body,2013-10-11 20:58:04.033 +185889,57337,2069.0,2,,CC BY-SA 3.0,41016531-0cc7-4adc-adad-25a187f7d96b,"Percents are a proportion. The traditional way to test differences between proportions is the chi-square test. Based on the information you have given me (7106 and 191 [2.62%] in one half and 5944 and 163 [2.67%] in the other), the chi-square test results in a non-significant value of .88 (p value). Your proportions are 2.67 and 2.62, so it is no surprise that these are not statistically significant, despite your large sample. +",,2013-10-11 23:20:23.397 +185892,57338,21746.0,3,,CC BY-SA 3.0,883f30d6-db01-40ea-b3bb-d00768bd54f3,,,2013-10-11 23:29:41.150 +185891,57338,21746.0,1,,CC BY-SA 3.0,883f30d6-db01-40ea-b3bb-d00768bd54f3,Rescaling input features for Neural Networks (Regression),,2013-10-11 23:29:41.150 +185895,57324,5045.0,5,,CC BY-SA 3.0,c4072c16-9f0d-41fd-b33d-38b3081b57d6,"Take a look at the tooth brushing example at the very start of Chapter 14 of Andrew Vickers' book [What is a p-value anyway? 34 Stories to Help You Actually Understand Statistics][1]. It starts on page 57 or you can use the table of contents button in the bottom *left* corner to find it. + + + [1]: http://www.pearsonhighered.com/vickers/",added 1 characters in body,2013-10-11 23:55:03.343 +185873,57330,22359.0,2,,CC BY-SA 3.0,43c6ab3b-ef95-402d-96ba-6c4fb3280ba0,"How are the data sets related? IF both data sets are drawn from the same distribution (they describe the same problem) than you can use the labeled set as a ""test set"" for the clustering. Basically you treat the clustering algorithm as a classifier. The only problem is that you must find a match between the output of the clustering algorithm and the actual labels. + +You might use some simple matching (ex: instances labeled GREEN are more often clustered in cluster 2 and BLUE in cluster 1 so cluster 1== BLUE and cluster 2 == GREEN). + +More elegantly you can compute the [Mutual Information][1] between the clustering output and actual labels. Mutual Information has a nice property, that one doesn't need to know the exact matching. MI will give high scores if most of the matching are consistent. Think of it as a correlation coefficient between (cluster <-> actual label) relation. + +Also check http://en.wikipedia.org/wiki/Cluster_analysis for some measures. The key phrase there is: + +> [...] clustering results are evaluated based on data that was not used for clustering, such as known class labels and external benchmarks. Such benchmarks consist of a set of pre-classified items, and these sets are often created by human (experts). Thus, the benchmark sets can be thought of as a gold standard for evaluation. + +For ROC usually one needs some ""*a posteriori*"" probability, outputted by the classifier, but in your case, the distance between the instance and the cluster center will work. Keep in mind that ROC is computed for a specific label at a time (i.e. one vs all). So for 5 labels you will get 4 independent AUROC values. + +IMHO I strongly advise yo to do the CV for clustering if you have labeled data! Iterate it several times and use the mean of your measure as the performance. 
+ +I would also try this: Use some percent (66% usually) of unlabeled data to perform clustering, measure performance using labeled data, repeat the experiment with different randomization (usually 5-10 times) and report mean performance. Unfortunately I don't know if this method will give a good estimate of your real performance. Is it possible that will overfit the labeled data set. This is not a textbook approach, so, use it with caution. + + + [1]: http://en.wikipedia.org/wiki/Mutual_information ""Mutual Information""",,2013-10-11 21:08:48.940 +185876,57331,22763.0,2,,CC BY-SA 3.0,a0d39d0a-882d-427b-8329-31bcb82ba913,"Say I have some normally distributed data. I have an application where I compute the percentile (or cumulative frequency less than sample) for a particular sample using a CDF function along with the mean $\mu$ and standard deviation $\sigma$ of the samples. + +so $$F_X(x) = \frac 12\left[1 + \text{erf} \left (\frac {x - \mu}{\sqrt{2 \sigma^2}}\right)\right]$$ + +Now I find myself in a situation where I want to determine the cumulative frequency of multiple samples across multiple data sets (finding something akin to an overall percentile of, say, three samples). Now assuming the variables are independent, I can sum the normals using + +($\mu$sum, $\sigma$sum) = ($\mu$x + $\mu$y + $\mu$z), ($\sigma$x + $\sigma$y + $\sigma$z). + +Can I then sum the individual samples I care about and compare them to the new summed normal to compute a percentile of the three samples compared to the sum of the normals? Something tells me this doesn't work but I'd like to be sure. So I'm thinking something like computing the CDF using the sum of the samples I'm interested in: x = (samplex + sampley + samplez) and using the $\mu$sum and $\sigma$sum in the CDF function above.",,2013-10-11 21:12:06.117 +185874,57331,22763.0,1,,CC BY-SA 3.0,a0d39d0a-882d-427b-8329-31bcb82ba913,CDF (cumulative frequency) of multiple samples in summed normals?,,2013-10-11 21:12:06.117 +185875,57331,22763.0,3,,CC BY-SA 3.0,a0d39d0a-882d-427b-8329-31bcb82ba913,,,2013-10-11 21:12:06.117 +185877,57288,,25,,,3638e3e8-e5bb-45e6-be39-ee6f62f84044,,http://twitter.com/#!/StackStats/status/388775673198030848,2013-10-11 21:18:38.317 +185879,57255,19545.0,4,,CC BY-SA 3.0,75e1a727-7a4e-4178-ad90-78067cc7c088,Why are ERR (Expected Reciprocal Ranking) scores not normalized?,added 30 characters in body; edited title,2013-10-11 21:22:00.880 +185878,57255,19545.0,5,,CC BY-SA 3.0,75e1a727-7a4e-4178-ad90-78067cc7c088,"It seems to me that normalized ERR (Expected Reciprocal Ranking) scores (ERR scores of your ranking algorithm divided by ERR score calculated for the ground truth ranking) are more useful than the unscaled ERR scores, but I have not seen normalized scores being reported in the literature. Is there a good reason that the ERR scores are reported in raw rather than normalized format?",added 30 characters in body; edited title,2013-10-11 21:22:00.880 +185880,57332,22359.0,2,,CC BY-SA 3.0,48f22cb6-72e4-479c-ac05-17efcfb2afa4,"Features extracted from image/signal processing tend to get correlated a lot! This is not a very bad thing if you have enough samples. + +From my experience, a classifier with small variance tend to work well (ex. logistic regression). They have less chances of overfitting the train data. + +Another idea that I employed is the Additive logistic regression [here][1] and [here][2] some references. They are already implemented in Weka. They are slower than the logistic models. 
In the same time they have the great advantage that they perform a feature selection while learning. Moreover, the model is human friendly so you can see what features are more relevant. + +Hope it helps + + [1]: http://people.csail.mit.edu/torralba/courses/6.869/lectures/lecture6/boosting.pdf + [2]: http://projecteuclid.org/DPubS?service=UI&version=1.0&verb=Display&handle=euclid.aos/1016218223",,2013-10-11 21:23:38.743 +185881,57255,668.0,5,,CC BY-SA 3.0,47aae167-23d7-46cf-b996-abab0dfb7ee2,"It seems to me that normalized ERR ([Expected Reciprocal Ranking](http://don-metzler.net/papers/metzler-cikm09.pdf)) scores (ERR scores of your ranking algorithm divided by ERR score calculated for the ground truth ranking) are more useful than the unscaled ERR scores, but I have not seen normalized scores being reported in the literature. Is there a good reason that the ERR scores are reported in raw rather than normalized format?",added 52 characters in body,2013-10-11 21:28:11.360 +185882,57333,22359.0,2,,CC BY-SA 3.0,6f2dce8a-a958-4911-9239-18a2e9ae1a08,"Try [this][1] paper. Your answer might be at chapter 3.2, figures 2 and 3. + +Long story short: The same performance can be obtained for different pairs of C and kernel parameters. You shouldn't try to manually tune a SVM. + + [1]: http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf",,2013-10-11 21:33:36.120 +185883,57334,594.0,2,,CC BY-SA 3.0,33626cfa-a0a9-442f-b7f3-5d1e3175bd61,"There are two issues here: + +1) If you're doing a formal hypothesis test (and if you're going as far as quoting a p-value in my book you already are), **what is the formal rejection rule?** + +When comparing test statistics to critical values, the critical value is *in the rejection region*. While this formality doesn't matter much when everything is continuous, it does matter when the distribution of the test statistic is discrete. + +When comparing p-values and significance levels, the rule is: + +>           Reject if $p\leq\alpha$ + +Please note that, even if you rounded your p-value to 0.05, indeed even if the $p$ value was exactly 0.05, formally, *you should still reject*. + +2) In terms of 'what is our p-value telling us', then assuming you can even interpret a p-value as 'evidence against the null' (let's say that opinion on that is somewhat divided), 0.0499 and 0.0501 are not really saying different things. + + +My suggestion would be to (1) formally reject the null, and perhaps point out that even if it were exactly 0.05 it should still be rejected; (2) note that there's nothing particularly *special* about $\alpha = 0.05$ and it's very close to that borderline -- even a slightly smaller significance level would not lead to rejection. +",,2013-10-11 21:33:38.517 +185884,57335,13396.0,2,,CC BY-SA 3.0,fedf1925-c675-4c62-b09a-2ef882ddcd69,"I think I got it. If $Z \sim \mathcal{N}(0, 1)$ but we want to generate $X$ such that its mean is $\mu$ and covariance matrix is $C$, we decompose $C = L L^T$, and let $X = L Z + \mu$. + +Now we just need to carry out the reverse operation.",,2013-10-11 21:39:45.703 +185887,57336,22624.0,3,,CC BY-SA 3.0,3e33c97c-5173-4d5e-8712-f1829175ea73,,,2013-10-11 22:55:22.160 +185896,57339,10987.0,2,,CC BY-SA 3.0,da4c4f67-0ec3-462b-8aba-6a8209d99ea3,"Give your large sample sizes, you could probably use a t-test on the means. If your sample sizes are equal, you are in pretty good shape whether you want to use a pooled estimate of the variance or unpooled (Welch's test). 
Do a one sided test, if you are sure that the population of s1 has a mean at least as large as the mean of the population of s2. + +Note: If the variances are much larger than the means, your counts are not Poisson. But what matters here is the distribution of the sample averages, and that should be nearly normal, unless the data are super-skewed. In that case, you could do a non-parametric test like the Kruskal-Wallis.",,2013-10-11 23:59:32.067 +185897,57340,10987.0,2,,CC BY-SA 3.0,9e263e0a-bd67-49dc-85c5-a0b5002b2325,"You could try CART (tree) classification regression. That would select a decision tree algorithm for the outcomes based on the answers to the questions. As a bi-product, it would indicate which questions are most important in predicting outcome. ",,2013-10-12 00:09:09.067 +185898,57293,,25,,,33bebd4a-5653-4f9d-b893-f4dcb92ae76f,,http://twitter.com/#!/StackStats/status/388820966710980608,2013-10-12 00:18:37.153 +185899,57335,13396.0,5,,CC BY-SA 3.0,11949f92-962b-4509-8d28-8b703f780f10,"I think I got it. If $Z \sim \mathcal{N}(0, 1)$ but we want to generate $X$ such that its mean is $\mu$ and covariance matrix is $C$, we decompose $C = L L^T$, and let $X = L Z + \mu$. + +Now we just need to carry out the reverse operations.",added 1 characters in body,2013-10-12 01:03:46.547 +185902,57341,13549.0,3,,CC BY-SA 3.0,170852f7-f141-4b3b-8fe1-685eea592d7f,,,2013-10-12 02:33:38.043 +185900,57341,13549.0,2,,CC BY-SA 3.0,170852f7-f141-4b3b-8fe1-685eea592d7f,"I have a data set where samples are collected once per year for 15 years at a number of sites. I am worried that these data are temporally autocorrelated and was trying to figure out if I need to address that. However, the only time I will be using degrees of freedom with these data is in a perMANOVA. This test calculates a pseudo F-statistic by permuting the rows. I can't figure out if the exchangebility assumption means that I don't need to worry about autocorrelation at all (i.e., permuting rows will simply destroy the temporal structure, which I am not interested in anyway) or if it means that I can't use a perMANOVA even if I accounted for autocorrelation? + + +Since I had a hard time getting a question about MANCOVA answered, I hope explaining how the test works will help someone help me! Here's the paper describing perMANOVA, just in case (it is open access): + +http://www.entsoc.org/PDF/MUVE/6_NewMethod_MANOVA1_2.pdf",,2013-10-12 02:33:38.043 +185901,57341,13549.0,1,,CC BY-SA 3.0,170852f7-f141-4b3b-8fe1-685eea592d7f,Temporal autocorrelation in perMANOVA?,,2013-10-12 02:33:38.043 +185903,57342,19681.0,2,,CC BY-SA 3.0,4a5e614c-d137-437c-86bf-f7154259e521,"You seem to be aware that the marginal effect of $X_3$ is $\delta_1 + \delta_2 X_1 + \delta_3 X_2$, which is just the derivative of the response with respect to $X_3$. + +Replacing $X_1$ with $E(X_1)$ is a reasonable way to summarize the marginal effect. + +However, discarding the final term due to statistical insignificance is nonsense. There are at least two relatively sensible alternatives: + +1. If your $n$ is so big that you believe the statistical result that $\delta_3$ is insignificant more than you believe your prior belief that $\delta_3$ should be positive, than you could get rid of the $\delta_3 X_1 X_2$ term in your model and refit the coefficients BEFORE using $\delta_1 + \delta_2 X_1$ as your marginal effect. + +2. 
If you believe that the terms involving $X_2$ need to be in the model, regardless of statistical significance, than you need to keep the $X_2$ term in your marginal effect as well.",,2013-10-12 03:15:41.107 +185904,57336,,25,,,4b58278f-58c5-4b2a-a187-2d8ea4454936,,http://twitter.com/#!/StackStats/status/388866266150490112,2013-10-12 03:18:37.353 +185905,57249,594.0,5,,CC BY-SA 3.0,3adc2d0c-9aa9-4f70-9a8c-24b69c69d5c3,"I have [read](http://en.wikipedia.org/wiki/Gamma_distribution#Summation) that the sum of gamma distributions with the same scale parameter is another gamma distribution. I've also seen the paper by [Moschopoulos](http://www.ism.ac.jp/editsec/aism/pdf/037_3_0541.pdf) describing a method for the summation of a general set of gamma distributions. I have tried implementing Moschopoulos's method but have yet to have success. + +What does the summation of a general set of gamma distributions look like? To make this question concrete, what does it look like for: + +$\text{Gamma}(3,1) + \text{Gamma}(4,2) + \text{Gamma}(5,1)$ + +If the parameters above are not particularly revealing, please suggest others.",added 21 characters in body,2013-10-12 05:29:36.093 +185906,57343,22629.0,3,,CC BY-SA 3.0,9d3d183e-fd24-4ff2-88b5-aba553850ba6,,,2013-10-12 05:40:58.913 +185908,57343,22629.0,1,,CC BY-SA 3.0,9d3d183e-fd24-4ff2-88b5-aba553850ba6,K-Means clustering after first iteration,,2013-10-12 05:40:58.913 +185907,57343,22629.0,2,,CC BY-SA 3.0,9d3d183e-fd24-4ff2-88b5-aba553850ba6,In the k-means clustering we initially pick k random centroids and assign the given data to one of these k-centroids(which ever is nearest). After this we create new centroids by taking the mean of the assigned points . However there might be case that the initially selected random centroids may not be nearest to any point in the dataset and hence no points would be assigned to these centroids. So in such case what should be done in the step of creating new centroids ?,,2013-10-12 05:40:58.913 +185910,57344,21243.0,2,,CC BY-SA 3.0,38d7a905-fbfc-4754-8a22-3e597e425023,"Actually, this isn't by most definitions a recommender system, and anything you read in the literature about recommender systems might be geared toward solving a similar but slightly different problem (namely, where the input and output space are the same set). + +This is, by most conventional definitions, a classification problem, so looking for tags related to Classification might help. + +In terms of actual approaches: as Placidia mentioned, CART and random forest methods are quite popular right now. Additionally, a classic method is [Logistic Regression][1], which might be worth checking at (I'm not an R user, but I believe R has an implementation of it provided). + +More than that is tough to provide without a knowledge of what your system is doing and what approaches might be valid. I'd recommend refreshing your basic probability and statistics and thinking about the way that your variables might be related, then taking a look at the classification methods included with R (and their respective Wikipedia pages). 
+ + [1]: http://en.wikipedia.org/wiki/Logistic_regression",,2013-10-12 06:34:05.823 +185923,57348,6136.0,3,,CC BY-SA 3.0,a87dd8a9-d788-473f-a691-552bf51253cc,,,2013-10-12 10:23:30.670 +185924,57348,6136.0,1,,CC BY-SA 3.0,a87dd8a9-d788-473f-a691-552bf51253cc,KL-divergence between two categorical/multinomial distributions gives negative values?,,2013-10-12 10:23:30.670 +186348,57467,594.0,5,,CC BY-SA 3.0,3b0c3bf3-0cb2-4da4-b05e-994ad5e43f47,"I understand that in a linear regression model like: + + +$y_i = b_0 + b_1 x_i + \epsilon_i$ + +I can have a null and an alternative hypothesis: + +$H_0: b_1 = 0$ and $H_1: b_1 \neq 0$. + +And then I can reject $H_0$ or fail to reject $H_0$. But what if I want to accept that $b_1 = 0$?",removed convolution operator,2013-10-14 22:06:52.670 +185911,57345,436.0,2,,CC BY-SA 3.0,d3c75148-75b5-4786-b081-636589fee8d5,"I am not sure if there is a ""standard"" thing to do in the case one of the initial centroids is completely off. + +You can easily test this by specifying the initial centroids and see how things evolve! + +For instance, R will just give you an error. + +Say you do: + + # Set the RNG seed to ensure reproducibility + set.seed(12345) + + # Let's create 3 visually distinct clusters + n <- c(1000, 500, 850) + classifier.1 <- c(rnorm(n[1], 10, 0.9), + rnorm(n[2], 25, 2), + rnorm(n[3], 35, 2)) + classifier.2 <- c(rnorm(n[1], 5, 1), + rnorm(n[2], 10, 0.4), + rnorm(n[3], 2, .9)) + + col = c(""blue"", ""darkgreen"", ""darkred"") + # Run k-means with 3 clusters and random initial centroids + # to check the clusters are correctly recognized + km <- kmeans(cbind(classifier.1, classifier.2), 3) + # Plot the data, colored by cluster + plot(classifier.1, classifier.2, pch=20, col=col[km$cluster]) + + # Mark the final centroids + points(km$centers, pch=20, cex=2, col=""orange"") + + # Now impose some obviously ""wrong"" starting centroids + start.x <- c(10, 25, 3000) + start.y <- c(10, 10, -10000) + km.2 <- kmeans(cbind(classifier.1, classifier.2), + centers=cbind(start.x, start.y)) + +Now, R has obviously no issue in discriminating the 3 clusters when you let it choose the initial centroids, but when you run it the second time it will just say: + + Error: empty cluster: try a better set of initial centers + +I guess that if you are implementing your own algorithm you may choose to use this behaviour or rather give the user a warning and let the algorithm choose the centroids by itself. + +Obviously, as others pointed out, there are algorithms such as [k-means++](http://en.wikipedia.org/wiki/K-means%2B%2B) that help in choosing a good set of starting centroids. + +Also, in R you can use the `nstart` parameter of the kmeans function to run several iterations with different centroids: this will improve clustering in certain situations. + +**EDIT**: also, note from the R `kmeans` help page + +> The algorithm of Hartigan and Wong (1979) is used by default. Note +> that some authors use k-means to refer to a specific algorithm rather +> than the general method: most commonly the algorithm given by MacQueen +> (1967) but sometimes that given by Lloyd (1957) and Forgy (1965). The +> Hartigan–Wong algorithm generally does a better job than either of +> those, but trying several random starts (nstart> 1) is often +> recommended. For ease of programmatic exploration, k=1 is allowed, +> notably returning the center and withinss. +> +> Except for the Lloyd–Forgy method, k clusters will always be returned +> if a number is specified. 
If an initial matrix of centres is supplied, +> it is possible that no point will be closest to one or more centres, +> which is currently an error for the Hartigan–Wong method.",,2013-10-12 07:23:28.157 +185912,57343,594.0,5,,CC BY-SA 3.0,f25bfe3e-6838-45a2-afa2-0fbcf5492c66,"In *k-means clustering* we initially pick $k$ random centroids and assign the given data to one of these $k$ centroids (which ever is nearest). After this we create new centroids by taking the mean of the assigned points. + +However there might be case that the initially selected random centroids may not be nearest to any point in the dataset and hence no points would be assigned to these centroids. So in such case what should be done in the step of creating new centroids? +",formatting etc,2013-10-12 07:33:13.560 +185917,57346,21586.0,2,,CC BY-SA 3.0,5ea62487-0fec-449b-999f-b74d65cbd750,"**It lies in the eye of the beholder.** + +Formally, if there is a strict decision rule for your problem, follow it. This means $\alpha$ is given. However, I am not aware of any problem where this is the case (though setting $\alpha=0.05$ is what many practitioners do after Statistics101). + +**So it really boils down to what AlefSin commented before. There cannot be a ""correct answer"" to your question. Report what you got, rounded or not.** + +There is a huge literature on the ""significance of significance""; see for example the recent paper of one of the leading German statisticians Walter Krämer on ""The cult of statistical significance - What economists should and should not do to make their data talk"", *Schmollers Jahrbuch* **131**, 455-468, 2011.",,2013-10-12 07:43:00.677 +185918,57126,,25,,,899d1c94-2ab6-460b-9c2c-13bc1d6a1694,,http://twitter.com/#!/StackStats/status/388956862500323328,2013-10-12 09:18:37.210 +185919,57271,20473.0,5,,CC BY-SA 3.0,41e4efec-e97f-47f8-8b3b-4d15bc82507e,"Write your system explicitly for time $t$ as (""$L$"" for ""loss"", as a positive quantity, and ""$G$"" for ""gain"") +$$ A_t - A_{t-1} = - L^A_{t-1} + G_{t-1}^{B\rightarrow A}+G_{t-1}^{C\rightarrow A}$$ + +$$ B_t - B_{t-1} = - L^B_{t-1} + G_{t-1}^{A\rightarrow B}+G_{t-1}^{C\rightarrow B}$$ + +$$ C_t - C_{t-1} = - L^C_{t-1} + G_{t-1}^{A\rightarrow C}+G_{t-1}^{B\rightarrow C}$$ + +The following three relations hold exactly: +$$ L^A_{t-1} = G_{t-1}^{A\rightarrow B} + G_{t-1}^{A\rightarrow C} $$ +$$ L^B_{t-1} = G_{t-1}^{B\rightarrow A} + G_{t-1}^{B\rightarrow C} $$ +$$ L^C_{t-1} = G_{t-1}^{C\rightarrow A} + G_{t-1}^{C\rightarrow B} $$ + +If you substitute in the first three you obtain + +$$ A_t - A_{t-1} = - G_{t-1}^{A\rightarrow B} - G_{t-1}^{A\rightarrow C} + G_{t-1}^{B\rightarrow A}+G_{t-1}^{C\rightarrow A}$$ + +$$ B_t - B_{t-1} = - G_{t-1}^{B\rightarrow A} - G_{t-1}^{B\rightarrow C} + G_{t-1}^{A\rightarrow B}+G_{t-1}^{C\rightarrow B}$$ + +$$ C_t - C_{t-1} = - G_{t-1}^{C\rightarrow A} - G_{t-1}^{C\rightarrow B} + G_{t-1}^{A\rightarrow C}+G_{t-1}^{B\rightarrow C}$$ + +You have $6$ unknown quantities to estimate _per time period_. There is just not enough information to do that. So you need assumptions that will impose structure (=restrictions) on the situation, and will permit you to estimate _something_. What? Let's say you assume that there is a relatively stable ""churn"" from one company to another, as a linear function of their market share in the previous period. This assumption brings in a set of unknown coefficients to be estimated (which will then give you an estimate of ""hidden transfers of market share""). 
Write $G_{t-1}^{A\rightarrow B} = a_bA_{t-1}$ (market share lost from $A$ to $B$ as a linear function of $A$'s market share in period $t-1$). +Your equations will become + +$$ A_t - A_{t-1} = - a_bA_{t-1} - a_cA_{t-1} + b_aB_{t-1}+c_aC_{t-1} $$ + +$$ B_t - B_{t-1} = - b_aB_{t-1} - b_cB_{t-1} + a_bA_{t-1}+c_bC_{t-1}$$ + +$$ C_t - C_{t-1} = - c_aC_{t-1} - c_bC_{t-1} + a_cA_{t-1}+ b_cB_{t-1}$$ + +We have turned a set of mathematical identities into a _model_. It is doubtful that this model will hold exactly for each $t$, so you should add a stochastic error term. Rearranging we obtain a first-order Vector Autoregression (VAR): + +$$ \left[ \begin{matrix} +A_t \\ +B_t \\ +C_t \\ +\end{matrix} \right] = \left [\begin{matrix} +1-a_b-a_c & b_a & c_a \\ +a_b & 1-b_a-b_c & c_b \\ +a_c & b_c & 1-c_a-c_b \\ +\end{matrix} \right] \left[ \begin{matrix} +A_{t-1} \\ +B_{t-1} \\ +C_{t-1} \\ +\end{matrix} \right]+ \left[ \begin{matrix} +u^A_{t} \\ +u^B_{t} \\ +u^C_{t} \\ +\end{matrix} \right] $$ + +or, to homogenize notation, + +$$ \left[ \begin{matrix} +A_t \\ +B_t \\ +C_t \\ +\end{matrix} \right] = \left [\begin{matrix} +\gamma_{11} & \gamma_{12} & \gamma_{13} \\ +\gamma_{21} & \gamma_{22} & \gamma_{23} \\ +\gamma_{31} & \gamma_{32} & \gamma_{33} \\ +\end{matrix} \right] \left[ \begin{matrix} +A_{t-1} \\ +B_{t-1} \\ +C_{t-1} \\ +\end{matrix} \right]+ \left[ \begin{matrix} +u^A_{t} \\ +u^B_{t} \\ +u^C_{t} \\ +\end{matrix} \right] $$ + +subject to the equality restrictions +$$ \begin{matrix} +\gamma_{11} + \gamma_{21} + \gamma_{31} =1 \\ +\gamma_{12} + \gamma_{22} + \gamma_{32} =1 \\ +\gamma_{13} + \gamma_{23} + \gamma_{33} =1 \\ +\end{matrix} $$ + +So you have essentially $6$ unknown coefficients and a sample of $T-1$ observations (for each company). +Note that these restrictions _imply_ the ""add up to unity"" restriction $A_t+B_t+C_t =1$ for each $t$, so this last one does not impose any additional structure on the unknown coefficients -but it does imply a relation between the error terms, namely that $u^A_{t} + u^B_{t} +u^C_{t} =0$. Any additional assumptions on the three error terms should either come from knowledge of the specific real world phenomenon under study, and/or through a statistical specification search. + +Of course you may find that such a model does not fit your data sample well - for example you expect that all estimated coefficients should be positive and smaller than or equal to unity, but the estimation procedure may not give you that. But this is what we do: we come up with specification hypotheses and we test them against the data - ""success"" is never guaranteed. Then you should try to come up with a different model. + +",Elaboration on the relation between the error terms,2013-10-12 09:38:09.133 +185920,57347,6162.0,2,,CC BY-SA 3.0,84b56408-cbde-49f3-a93a-947fd590b617,"There is a natural exact confidence interval for the grandmean in the balanced random one-way ANOVA model $$(y_{ij} \mid \mu_i) \sim_{\text{iid}} {\cal N}(\mu_i, \sigma^2_w), \quad j=1,\ldots,J, +\qquad +\mu_i \sim_{\text{iid}} {\cal N}(\mu, \sigma^2_b), \quad i=1,\ldots,I.$$ +Indeed, it is easy to check that the distribution of the observed means $\bar{y}_{i\bullet}$ is $\bar{y}_{i\bullet} \sim_{\text{iid}} {\cal N}(\mu, \tau^2)$ with $\tau^2=\sigma^2_b+\frac{\sigma^2_w}{J}$, +and it is well known that the between sum of squares $SS_b$ has distribution $$SS_b \sim J\tau^2\chi^2_{I-1}$$ and is independent of the overall observed mean $$\bar y_{\bullet\bullet} \sim {\cal N}(\mu, \frac{\tau^2}{I})$$. 
+Thus $$\frac{\bar y_{\bullet\bullet} - \mu}{\frac{1}{\sqrt{I}}\sqrt{\frac{SS_b}{J(I-1)}}}$$ has a Student $t$ distribution with $I-1$ degrees of freedom, wherefrom it is easy to get an exact confidence interval about $\mu$. + +Note that all of this is simplify equivalent to the classical interval for a Gaussian mean by considering only the group means $\bar{y}_{i\bullet}$ as the observations. +Thus the simple approach you mention: + +> The simple approach is to first compute the mean of each experiment: +> 38.0, 49.3, and 31.7, and then compute the mean, and its 95% confidence interval, of those three values. Using this method, the +> grand mean is 39.7 with the 95% confidence interval ranging from 17.4 +> to 61.9. + +is right. And your intuition about the ignored variation: + +> The problem with that approach is that it totally ignores the +> variation among triplicates. I wonder if there isn't a good way to +> account for that variation. + +is wrong. I also mention the correctness of such a simplification in [http://stats.stackexchange.com/a/72578/8402][1] + + + [1]: http://stats.stackexchange.com/a/72578/8402",,2013-10-12 10:02:03.747 +185922,57347,6162.0,5,,CC BY-SA 3.0,19fbcc15-2c0f-4fb2-858a-c5f2c9aed0c6,"There is a natural exact confidence interval for the grandmean in the balanced random one-way ANOVA model $$(y_{ij} \mid \mu_i) \sim_{\text{iid}} {\cal N}(\mu_i, \sigma^2_w), \quad j=1,\ldots,J, +\qquad +\mu_i \sim_{\text{iid}} {\cal N}(\mu, \sigma^2_b), \quad i=1,\ldots,I.$$ +Indeed, it is easy to check that the distribution of the observed means $\bar{y}_{i\bullet}$ is $\bar{y}_{i\bullet} \sim_{\text{iid}} {\cal N}(\mu, \tau^2)$ with $\tau^2=\sigma^2_b+\frac{\sigma^2_w}{J}$, +and it is well known that the between sum of squares $SS_b$ has distribution $$SS_b \sim J\tau^2\chi^2_{I-1}$$ and is independent of the overall observed mean $$\bar y_{\bullet\bullet} \sim {\cal N}(\mu, \frac{\tau^2}{I})$$. +Thus $$\frac{\bar y_{\bullet\bullet} - \mu}{\frac{1}{\sqrt{I}}\sqrt{\frac{SS_b}{J(I-1)}}}$$ has a Student $t$ distribution with $I-1$ degrees of freedom, wherefrom it is easy to get an exact confidence interval about $\mu$. + +**Note that all of this is simplify equivalent to the classical interval for a Gaussian mean by considering only the group means $\bar{y}_{i\bullet}$ as the observations**. +Thus the simple approach you mention: + +> The simple approach is to first compute the mean of each experiment: +> 38.0, 49.3, and 31.7, and then compute the mean, and its 95% confidence interval, of those three values. Using this method, the +> grand mean is 39.7 with the 95% confidence interval ranging from 17.4 +> to 61.9. + +is right. And your intuition about the ignored variation: + +> The problem with that approach is that it totally ignores the +> variation among triplicates. I wonder if there isn't a good way to +> account for that variation. + +is wrong. I also mention the correctness of such a simplification in [http://stats.stackexchange.com/a/72578/8402][1] + + + [1]: http://stats.stackexchange.com/a/72578/8402",added 4 characters in body,2013-10-12 10:21:06.487 +185925,57348,6136.0,2,,CC BY-SA 3.0,a87dd8a9-d788-473f-a691-552bf51253cc,"If + + P = [0,0.9,0,0.1] + +Q = [0,1,0,0] + +Then KL(P||Q) = 0 + ln(0.9/1)*0.9 + 0 + 0 = -0.094 + +This shouldn't be possible from the Gibbs inequality. What am I misunderstanding? 
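+
+For what it is worth, here is the calculation spelled out in R (my own sketch of the sum above):
+
+    p <- c(0, 0.9, 0, 0.1)
+    q <- c(0, 1, 0, 0)
+    keep <- p > 0 & q > 0   # note: this drops the i = 4 term, where p > 0 but q = 0
+    sum(p[keep] * log(p[keep] / q[keep]))   # approximately -0.0948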
",,2013-10-12 10:23:30.670 +185926,57318,18198.0,5,,CC BY-SA 3.0,21b48cf1-99df-4f20-844c-a1a813d0ac2d,"I have seen a method whereby instead of trying to estimate the ridge parameter (k) directly from the data (using one of the many many ridge parameter estimators in the literature) you solve for it iteratively. + +The method is simple enough: You simply increase k (in suitably small steps) until the condition number is reduced blow 10. + +At first blush this seems like quite a nice solution to me but I've never seen a Ridge Regression paper/book that uses it. + +Update OK this is basically the method suggested by Marquardt ""Generalized inverses, Ridge Regression, Biased Linear Estimation and Non-linear Estimation"" the only difference being he used VIF's to measure the MC while this method uses the condition number. McDonald and Galrneau ""A Monte-Carlo Evaluation of some Ridge-Type Estimators"" note that this method is may not be appropriate for all data sets as it does not include the y values (observations). I still have not found a paper where the Marquardt method is tested against other estimators for the ridge parameter does anybody know of such a paper? + +Is this method theoretically sound though? Even if (as I suspect) it isn't does it really matter for the average practitioner who just want to produce more stable estimates of their Beta's (the weights in the regression) rather than having them ""blow up"" to grossly unrealistic values when they experience severe MC? + +Truly I would like to find a better method than this ideally with a solid theoretical underpinning but its hard to see from a practical view point it can be improved upon? +",added 612 characters in body,2013-10-12 10:35:46.497 +185929,57349,22630.0,3,,CC BY-SA 3.0,936245dc-cf30-48ef-bfc7-f8716be94863,,,2013-10-12 10:43:14.610 +185928,57349,22630.0,1,,CC BY-SA 3.0,936245dc-cf30-48ef-bfc7-f8716be94863,"What does saying ""Mean value of each pixel over all images""?",,2013-10-12 10:43:14.610 +185927,57349,22630.0,2,,CC BY-SA 3.0,936245dc-cf30-48ef-bfc7-f8716be94863,"I was reading a paper related to Auto encoders for my project work. It is required to input images as vectors to the neural network. I couldn't understand a certain sentence due to lack of knowledge of statistics (I guess). I googled, but the problem is I don't know what it is exactly and searching the same phrase returns the same kind of documents but not their explanation. + + + +> We train on 1.6 million 32*32 color images that have been preprocessed +> by subtracting from each pixel its mean value over all images and then +> dividing by the standard deviation of all pixels over all images. + + +What does it mean by ""subtracting from each pixel its mean value over all images and then +dividing by the standard deviation of all pixels over all images"". + +I would request you to explain in laymen terms since I'm new to all this.",,2013-10-12 10:43:14.610 +185930,57321,6162.0,5,,CC BY-SA 3.0,e0dc1c31-7271-42ba-bb17-ee2016f27340,"> In the answers there (if I understood correctly) I learned that +> within-subject variance does not effect inferences made about group +> means and it is ok to simply take the averages of averages to +> calculate group mean, then calculate within-group variance and use +> that to perform significance tests. + +Let me develop this idea here. 
The model for the individual observations is +$$y_{ijk}= \mu_i + \alpha_{ij} + \epsilon_{ijk}$$, where : + + - $y_{ijk}$ is the $k$-th measurement of individual $j$ of group $i$ + + - $\alpha_{ij} \sim_{\text{iid}} {\cal N}(0, \sigma^2_b)$ is the random effect for individual $j$ of group $i$ + + - $\epsilon_{ijk} \sim_{\text{iid}} {\cal N}(0, \sigma^2_w)$ is the within-error + +In [my answer to your first question][1], I have suggested you to note that one obtains a classical (fixed effects) Gaussian linear model for the subjects means $\bar y_{ij\bullet}$. Indeed you can easily check that $$\bar y_{ij\bullet} = \mu_i + \delta_{ij}$$ with $$\delta_{ij} = \alpha_{ij} + \frac{1}{K}\sum_k \epsilon_{ijk} +\sim_{\text{iid}} {\cal N}(0, \sigma^2) \quad \text{where } \quad \boxed{\sigma^2=\sigma^2_b+\frac{\sigma^2_w}{K}},$$ +assuming $K$ repeated measurements for each individual. This is nothing but the one-way ANOVA model with a fixed factor. + +And then I claimed that in order to draw inference about the $\mu_i$ you can simply consider the simple classical linear model whose observations are the subjects means $\bar y_{ij\bullet}$. I think I spoke too quickly, and **I'd like to know the advice of an expert about this point**. I know it works here, but is it due to the fact that the observed subjects means $\bar y_{ij\bullet}$ are sufficient statistics for the $\mu_i$ ? (I do not remember the theory of sufficient statistics). + +> In the answers there (if I understood correctly) I learned that +> within-subject variance does not effect inferences made about group +> means and it is ok to simply take the averages of averages to +> calculate group mean, then calculate within-group variance and use +> that to perform significance tests. I would like to use a method where +> the larger the within subject variance the less sure I am about the +> group means or understand why it does not make sense to desire that. + +As you see from the boxed formula, the within-variance $\sigma^2_w$ plays a role in the model for the observed group means. + +## Update + +About my request in **bold**, I have now thought about it and there's no need of sufficient statistics or something like that. The principle is very general. Assume a model for a sample $y=(y_i)$ and consider the ""submodel"" for the sample $(f_i(y))$ for given functions $f_i$. If $\mu$ is a parameter appearing in both models, then a confidence interval about $\mu$ with respect to the second (sub)model obviously is a confidence interval about $\mu$ with respect to the first model. + + [1]: http://stats.stackexchange.com/a/72490/8402",added 498 characters in body,2013-10-12 10:45:10.757 +185933,57350,11772.0,1,,CC BY-SA 3.0,5f21cc50-a414-48a6-98ea-c446e2215d99,Subscript notation in expectations,,2013-10-12 11:04:38.997 +185932,57350,11772.0,3,,CC BY-SA 3.0,5f21cc50-a414-48a6-98ea-c446e2215d99,,,2013-10-12 11:04:38.997 +185931,57350,11772.0,2,,CC BY-SA 3.0,5f21cc50-a414-48a6-98ea-c446e2215d99,"What is the exact meaning of the subscript notation $\mathbb{E}_X[f(X)]$ in conditional expectations in the framework of measure theory ? These subscripts do not appear in the definition of conditional expectation, but we may see for example in [this page of wikipedia][1]. (Note that it wasn't always the case, [the same page][2] few months ago). + +What should be for example the meaning of $\mathbb{E}_X[X+Y]$ with $Y=X+1$ ? 
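+
+To make the ambiguity concrete (this is only my attempt at spelling out the two readings I can imagine, not a definition taken from any reference): since $Y=X+1$ we have $X+Y=2X+1$, so an average-over-$X$ reading would give the number
+
+$$\mathbb{E}_X[X+Y] = \int_{-\infty}^{\infty} (2x+1) f_X(x)\,dx = 2\,\mathbb{E}[X]+1,$$
+
+whereas a conditioning reading would give the random variable
+
+$$\mathbb{E}_X[X+Y] = \mathbb{E}[X+Y \mid X] = 2X+1.$$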
+ + [1]: http://en.wikipedia.org/wiki/Law_of_total_expectation + [2]: http://en.wikipedia.org/w/index.php?title=Law_of_total_expectation&oldid=548089336",,2013-10-12 11:04:38.997 +185934,57349,22630.0,5,,CC BY-SA 3.0,08cb3963-9706-4bc2-b2c8-b0a9b6cffc02,"I was reading a paper related to Auto encoders for my project work. It is required to input images as vectors to the neural network. I couldn't understand a certain sentence due to lack of knowledge of statistics (I guess). I googled, but the problem is I don't know what it is exactly and searching the same phrase returns the same kind of documents but not their explanation. + + + +> We train on 1.6 million 32*32 color images that have been preprocessed +> by subtracting from each pixel its mean value over all images and then +> dividing by the standard deviation of all pixels over all images. + + +What does it mean by ""subtracting from each pixel its mean value over all images and then +dividing by the standard deviation of all pixels over all images"". + +I would request you to explain in laymen terms since I'm new to all this.",edited title,2013-10-12 11:17:00.883 +185935,57349,22630.0,4,,CC BY-SA 3.0,08cb3963-9706-4bc2-b2c8-b0a9b6cffc02,"What does saying ""Standard deviation of all pixels over all images""?",edited title,2013-10-12 11:17:00.883 +185937,57349,15827.0,4,,CC BY-SA 3.0,a83b77be-fa60-47a6-a773-b180c33c4f95,"What does ""Standard deviation of all pixels over all images"" mean?",small fixes,2013-10-12 11:21:03.220 +185936,57349,15827.0,5,,CC BY-SA 3.0,a83b77be-fa60-47a6-a773-b180c33c4f95,"I was reading a paper related to Auto encoders for my project work. It is required to input images as vectors to the neural network. I couldn't understand a certain sentence due to lack of knowledge of statistics (I guess). I Googled, but the problem is I don't know what it is exactly and searching the same phrase returns the same kind of documents but not their explanation. + + + +> We train on 1.6 million 32*32 color images that have been preprocessed +> by subtracting from each pixel its mean value over all images and then +> dividing by the standard deviation of all pixels over all images. + + +What does it mean by ""subtracting from each pixel its mean value over all images and then +dividing by the standard deviation of all pixels over all images"". + +I would request you to explain in lay terms since I'm new to all this.",small fixes,2013-10-12 11:21:03.220 +185939,57349,22630.0,5,,CC BY-SA 3.0,72b6caf7-36bf-4b95-acb6-e41fc8c61312,"I was reading a paper related to Auto encoders for my project work. It is required to input images as vectors to the neural network. I couldn't understand a certain sentence due to lack of knowledge of statistics (I guess). I Googled, but the problem is I don't know what it is exactly and searching the same phrase returns the same kind of documents but not their explanation. + + Source: http://www.cs.toronto.edu/~hinton/absps/esann-deep-final.pdf + +> We train on 1.6 million 32*32 color images that have been preprocessed +> by subtracting from each pixel its mean value over all images and then +> dividing by the standard deviation of all pixels over all images. + + +What does it mean by ""subtracting from each pixel its mean value over all images and then +dividing by the standard deviation of all pixels over all images"". 
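+
+In code form, this is my guess at the operations being described (R, with a small random matrix standing in for the real images; the variable names are mine):
+
+    # rows = images, columns = pixel positions (a stand-in for 1.6 million 32*32 images)
+    X <- matrix(rnorm(10 * 1024), nrow = 10)
+
+    pixel_means <- colMeans(X)               # mean of each pixel position over all images
+    X_centered  <- sweep(X, 2, pixel_means)  # subtract that mean from the corresponding pixel
+    global_sd   <- sd(as.vector(X))          # one standard deviation over all pixels of all images
+    X_scaled    <- X_centered / global_sd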
+ +My interpretation is: ""Subtracting from each pixel its mean value over all images"" + It means, for a pixel position in an image, subtract the average of values of that pixel position over all images and subtract from the current pixel value. + + +Am I correct? + +It is somewhat ambiguous to me. + +Please explain in some math terms. +",Added more details,2013-10-12 11:51:10.477 +185941,57351,20473.0,2,,CC BY-SA 3.0,67c03e84-b311-41e8-8a42-e33bca29b457,"In an expression where more than one random variables are involved, the symbol $E$ alone does not clarify _with respect to which random variable_ is the expected value ""taken"". For example + +$$E[h(X,Y)] =? \int_{-\infty}^{\infty} h(X,Y) f_X(x)dx$$ +or +$$E[h(X,Y)] =? \int_{-\infty}^{\infty} h(X,Y) f_Y(y)dx$$ + +*Neither*. When many random variables are involved, and there is no subscript in the $E$ symbol, the expected value is taken with respect to their joint distribution: + +$$E[h(X,Y)] = \int_{-\infty}^{\infty}\int_{-\infty}^{\infty} h(x,y) f_{XY}(x,y)dxdy$$ + + +When a subscript is present... in some cases it tells us *on which variable we should condition*. So + +$$E_X[(h(X,Y)] = E(h(X,Y)\mid X) = \int_{-\infty}^{\infty} h(x,y) f_{h(X,Y)|X}(h(x,y)\mid x)dh $$ + +...But in other cases, it tells us which density to use for the ""averaging"" + +$$E_X[(h(X,Y)] = \int_{-\infty}^{\infty} h(x,y) f_{X}(x)dx $$ + + + +Rather confusing I would say, but who said that scientific terminology is totally free of ambiguity? You should look how each author defines the use of such symbols.",,2013-10-12 11:56:27.427 +185942,57352,15827.0,2,,CC BY-SA 3.0,9e7812a9-b0c2-418b-863c-7e7c24333d0d,"Each image is composed of 32 $\times$ 32 pixels, so for a given pixel (say row 13, column 31) something measured is averaged over all the images, and the standard deviation (SD for short) for the same something is also calculated. + +(value − mean) / SD is often called a z-score and is a way of standardizing values to take account of mean and SD. Presumably that's done for every pixel, meaning every pixel position. + +It is spelled out that they are ""dividing by the standard deviation of _all_ pixels over _all_ images"" [my emphasis] and that SD would usually be calculated with reference to the corresponding overall mean. However, division by that SD would be dividing by a constant, so it won't have any effect on the images beyond a question of units.",,2013-10-12 12:16:10.913 +185943,57350,,25,,,579e4a45-e902-4c7f-a4e5-6a325bfec545,,http://twitter.com/#!/StackStats/status/389002160866091008,2013-10-12 12:18:37.187 +185944,57353,5875.0,2,,CC BY-SA 3.0,95872aca-e303-4cfe-8014-23d44d9fb68f,"Let’s remove the two categories with probability $0$ in both distributions. Your example is $P = (0.9, 0.1)$ and $Q = (1,0)$. + +The KL divergence is $KL(P||Q) = \sum_i p_i \log\left( {p_i \over q_i }\right)$. It is not +$$ 0.9 \times \log\, 0.9 + 0 $$ +but +$$ 0.9 \times \log\, 0.9 + 0.1 \times ( +\infty ) = + \infty.$$ +",,2013-10-12 12:19:43.223 +185945,57319,1693.0,5,,CC BY-SA 3.0,f0f5db8d-5108-4371-9776-b2c303417941,"I am modeling an outcome for hospital patients, 'RA' (whether readmitted). My predictor of interest is 'HHS' (whether referred to Home Health Services such as from a visiting nurse). Those referred readmit at a 15.2% rate; others, 9.2%, but the former are needier, sicker patients. Conventional thinking is that if we controlled for severity of illness this difference would not only be washed out but would reverse itself. 
In other words, holding constant the severity of illness, having HHS should mean a lower RA rate. + +With HHS as the sole predictor, B in a logistic regression = 0.6 (N ~ 25k). B is reduced to 0.2 with a group of covariates controlled, each accounting for some aspect of severity of illness, but B doesn't fall below zero. + +HHS alone explains only about 1% of the variance in RA; with the other predictors, this becomes 4%.* Perhaps this is the problem--that the model is not explaining enough variance for the covariates to ""succeed"" in reversing the sign of the coefficient of interest. If this is true, is there a way to estimate how high explained variance needs to be for such a reversal to show up? + + +---------- + +*Using either of 2 pseudo-RSQ formulas; Cox & Snell's or Menard's [-2LL0 - (-2LL1)] / [-2LL0.]",responding to ttnphns's comment,2013-10-12 12:45:05.293 +185946,57319,1693.0,4,,CC BY-SA 3.0,f0f5db8d-5108-4371-9776-b2c303417941,How high must RSQ be for a suppressor/reversal effect to show up?,responding to ttnphns's comment,2013-10-12 12:45:05.293 +185947,57351,20473.0,5,,CC BY-SA 3.0,0dbe36e0-de54-4dfa-909e-0a7c738f0f6b,"In an expression where more than one random variables are involved, the symbol $E$ alone does not clarify _with respect to which random variable_ is the expected value ""taken"". For example + +$$E[h(X,Y)] =? \int_{-\infty}^{\infty} h(x,y) f_X(x)dx$$ +or +$$E[h(X,Y)] =? \int_{-\infty}^{\infty} h(x,y) f_Y(y)dx$$ + +*Neither*. When many random variables are involved, and there is no subscript in the $E$ symbol, the expected value is taken with respect to their joint distribution: + +$$E[h(X,Y)] = \int_{-\infty}^{\infty}\int_{-\infty}^{\infty} h(x,y) f_{XY}(x,y)dxdy$$ + + +When a subscript is present... in some cases it tells us *on which variable we should condition*. So + +$$E_X[(h(X,Y)] = E(h(X,Y)\mid X) = \int_{-\infty}^{\infty} h(x,y) f_{h(X,Y)|X}(h(x,y)\mid x)dh $$ + +...But in other cases, it tells us which density to use for the ""averaging"" + +$$E_X[(h(X,Y)] = \int_{-\infty}^{\infty} h(x,y) f_{X}(x)dx $$ + + + +Rather confusing I would say, but who said that scientific terminology is totally free of ambiguity? You should look how each author defines the use of such symbols.",edited body,2013-10-12 13:52:57.420 +185950,57354,8671.0,3,,CC BY-SA 3.0,f9231b6f-f3f0-4ad8-a8d9-dbbfedb24550,,,2013-10-12 13:58:25.187 +185948,57354,8671.0,2,,CC BY-SA 3.0,f9231b6f-f3f0-4ad8-a8d9-dbbfedb24550,"I have a problem when computing the mutual information between two variables. +Lets consider the following table: + +.................k=1 .... k=2.....k=3 + +c = 1.........10.......20.........5 + +c = 2..........5 ........3.........20 + +I want to calculate the mutual information between the class (c) and the cluster (k). + +for e.g. for MI(c=1, k=1), I will calculate the P(c,k) as follows: + +..............k=1 ...... k~=1 + +c=1 ........10/63 ........ 25/63 + +c~=1 ...... 5/63 ....... 23/63 + +and I can calculate the MI based on the following: +$MI(c,k) = \sum_{c\in\{0,1\}} \sum_{k\in\{0,1\}} p(c,k)log_2 \frac{p(c,k)}{p(c)p(k)}$ + +In this case: I am getting the same MI for (c=1, k=1) and (c=2, k=1). what is the reason for it? 
How can I calculate the MI for each (c,k) based on the first table?",,2013-10-12 13:58:25.187 +185949,57354,8671.0,1,,CC BY-SA 3.0,f9231b6f-f3f0-4ad8-a8d9-dbbfedb24550,mutual information calculations example,,2013-10-12 13:58:25.187 +185952,57355,20927.0,2,,CC BY-SA 3.0,65f12dac-758c-453a-a31d-c32136036024,"I want to test mean differences between 4(0,1,2,3) groups for 6 different cell-types. In other words, I would have 6 comparisons to do for each cell-type (0-1,0-2,0-3,1-2,1-3,2-3). I have run a repeated-measures ANOVA before but it only gives me the main effects and interactions. To correct my t-tests for multiple comparisons, I would use Bonferroni, does SPSS let me do that, I just cannot seem to find it except as part of the means-testing using ANOVA? Or do I have to do it manually and how would i do that? alpha-level here is 0.05.Thanks",,2013-10-12 14:02:41.167 +185955,57356,6630.0,2,,CC BY-SA 3.0,8dcb9dec-0a31-4b8a-9d40-4ef8a4020ad5,"MI is between two random variables. In this case, $c$ and $k$ are random variables. + +If you take $c=1$, it isn't really random anymore, and MI should be zero. Likewise for $k=1$. + +(Also, one more thing to keep in mind is that your 'plug-in' estimation of MI is biased.)",,2013-10-12 14:05:53.523 +185956,57271,20473.0,5,,CC BY-SA 3.0,8a36d51e-7879-49c2-8e4c-69e65e20c633,"Write your system explicitly for time $t$ as (""$L$"" for ""loss"", as a positive quantity, and ""$G$"" for ""gain"") +$$ A_t - A_{t-1} = - L^A_{t} + G_{t}^{B\rightarrow A}+G_{t}^{C\rightarrow A}$$ + +$$ B_t - B_{t-1} = - L^B_{t} + G_{t-1}^{A\rightarrow B}+G_{t}^{C\rightarrow B}$$ + +$$ C_t - C_{t-1} = - L^C_{t} + G_{t}^{A\rightarrow C}+G_{t}^{B\rightarrow C}$$ + +The following three relations hold exactly: +$$ L^A_{t} = G_{t}^{A\rightarrow B} + G_{t}^{A\rightarrow C} $$ +$$ L^B_{t} = G_{t}^{B\rightarrow A} + G_{t}^{B\rightarrow C} $$ +$$ L^C_{t} = G_{t}^{C\rightarrow A} + G_{t}^{C\rightarrow B} $$ + +If you substitute in the first three you obtain + +$$ A_t - A_{t-1} = - G_{t}^{A\rightarrow B} - G_{t}^{A\rightarrow C} + G_{t}^{B\rightarrow A}+G_{t}^{C\rightarrow A}$$ + +$$ B_t - B_{t-1} = - G_{t}^{B\rightarrow A} - G_{t}^{B\rightarrow C} + G_{t}^{A\rightarrow B}+G_{t}^{C\rightarrow B}$$ + +$$ C_t - C_{t-1} = - G_{t}^{C\rightarrow A} - G_{t}^{C\rightarrow B} + G_{t}^{A\rightarrow C}+G_{t}^{B\rightarrow C}$$ + +You have $6$ unknown quantities to estimate _per time period_. There is just not enough information to do that. So you need assumptions that will impose structure (=restrictions) on the situation, and will permit you to estimate _something_. What? Let's say you assume that there is a relatively stable ""churn"" from one company to another, as a linear function of their market share in the previous period. This assumption brings in a set of unknown coefficients to be estimated (which will then give you an estimate of ""hidden transfers of market share""). Write $G_{t}^{A\rightarrow B} = a_bA_{t-1}$ (market share lost from $A$ to $B$ as a linear function of $A$'s market share in period $t-1$). +Your equations will become + +$$ A_t - A_{t-1} = - a_bA_{t-1} - a_cA_{t-1} + b_aB_{t-1}+c_aC_{t-1} $$ + +$$ B_t - B_{t-1} = - b_aB_{t-1} - b_cB_{t-1} + a_bA_{t-1}+c_bC_{t-1}$$ + +$$ C_t - C_{t-1} = - c_aC_{t-1} - c_bC_{t-1} + a_cA_{t-1}+ b_cB_{t-1}$$ + +We have turned a set of mathematical identities into a _model_. It is doubtful that this model will hold exactly for each $t$, so you should add a stochastic error term. 
Rearranging we obtain a first-order Vector Autoregression (VAR): + +$$ \left[ \begin{matrix} +A_t \\ +B_t \\ +C_t \\ +\end{matrix} \right] = \left [\begin{matrix} +1-a_b-a_c & b_a & c_a \\ +a_b & 1-b_a-b_c & c_b \\ +a_c & b_c & 1-c_a-c_b \\ +\end{matrix} \right] \left[ \begin{matrix} +A_{t-1} \\ +B_{t-1} \\ +C_{t-1} \\ +\end{matrix} \right]+ \left[ \begin{matrix} +u^A_{t} \\ +u^B_{t} \\ +u^C_{t} \\ +\end{matrix} \right] $$ + +or, to homogenize notation, + +$$ \left[ \begin{matrix} +A_t \\ +B_t \\ +C_t \\ +\end{matrix} \right] = \left [\begin{matrix} +\gamma_{11} & \gamma_{12} & \gamma_{13} \\ +\gamma_{21} & \gamma_{22} & \gamma_{23} \\ +\gamma_{31} & \gamma_{32} & \gamma_{33} \\ +\end{matrix} \right] \left[ \begin{matrix} +A_{t-1} \\ +B_{t-1} \\ +C_{t-1} \\ +\end{matrix} \right]+ \left[ \begin{matrix} +u^A_{t} \\ +u^B_{t} \\ +u^C_{t} \\ +\end{matrix} \right] $$ + +subject to the equality restrictions +$$ \begin{matrix} +\gamma_{11} + \gamma_{21} + \gamma_{31} =1 \\ +\gamma_{12} + \gamma_{22} + \gamma_{32} =1 \\ +\gamma_{13} + \gamma_{23} + \gamma_{33} =1 \\ +\end{matrix} $$ + +So you have essentially $6$ unknown coefficients and a sample of $T-1$ observations (for each company). +Note that these restrictions _imply_ the ""add up to unity"" restriction $A_t+B_t+C_t =1$ for each $t$, so this last one does not impose any additional structure on the unknown coefficients -but it does imply a relation between the error terms, namely that $u^A_{t} + u^B_{t} +u^C_{t} =0$. Any additional assumptions on the three error terms should either come from knowledge of the specific real world phenomenon under study, and/or through a statistical specification search. + +Then, an estimation for a hidden transfer of market share will be, for example + +$$\hat G_{t}^{A\rightarrow B} = \hat \gamma_{21}A_{t-1}$$ + +etc. + +Of course you may find that such a model does not fit your data sample well - for example you expect that all estimated coefficients should be positive and smaller than or equal to unity, but the estimation procedure may not give you that. But this is what we do: we come up with specification hypotheses and we test them against the data - ""success"" is never guaranteed. Then you should try to come up with a different model. + +",deleted 13 characters in body,2013-10-12 14:29:18.637 +185958,57355,503.0,10,,,2abda8a7-a69a-4ebe-8616-f147c08d4c4e,"{""OriginalQuestionIds"":[60383],""Voters"":[{""Id"":7290,""DisplayName"":""gung""},{""Id"":686,""DisplayName"":""Peter Flom""}]}",101,2013-10-12 15:21:34.280 +185959,57357,14850.0,2,,CC BY-SA 3.0,48e3494a-ab05-4a35-9002-cd279513dc71,"A dependent mixture model (hidden markov model) may be of use, depending on the type of deviations expected. + +Assume that your observations come from two distributions (or states), both of which are normally distributed, but have different mean and variance. + +A number of parameters can be estimated: The initial state probabilities (2 parameters), the state transition probabilities between neighbouring data points (4 parameters) and finally the mean and variance of the two distributions (4 parameters). 
+ +In R, this model can be estimated using the depmixS4 package: + + library(depmixS4) + + set.seed(3) + y = rnorm(100) + y[30:35] <- rnorm(6,mean=4,sd=2) + plot(1:100,y,""l"") + + m <- depmix(y~1,nstates=2,ntimes=100) + fm <- fit(m) + + means <- getpars(fm)[c(7,9)] + lines(1:100,means[fm@posterior$state],lwd=2,col=2) + +![enter image description here][1] + + +See http://cran.r-project.org/web/packages/depmixS4/vignettes/depmixS4.pdf for references + + + [1]: https://i.stack.imgur.com/HTvOv.png",,2013-10-12 15:57:43.367 +185960,57354,8671.0,6,,CC BY-SA 3.0,69e6a847-3f03-4d88-bbae-399f78e71fcc,,edited tags,2013-10-12 16:19:50.917 +185961,57357,15827.0,5,,CC BY-SA 3.0,31b24a99-9d3e-4030-97b8-d97c41108eb4,"A dependent mixture model (hidden Markov model) may be of use, depending on the type of deviations expected. + +Assume that your observations come from two distributions (or states), both of which are normally distributed, but have different mean and variance. + +A number of parameters can be estimated: The initial state probabilities (2 parameters), the state transition probabilities between neighbouring data points (4 parameters) and finally the mean and variance of the two distributions (4 parameters). + +In R, this model can be estimated using the depmixS4 package: + + library(depmixS4) + + set.seed(3) + y = rnorm(100) + y[30:35] <- rnorm(6,mean=4,sd=2) + plot(1:100,y,""l"") + + m <- depmix(y~1,nstates=2,ntimes=100) + fm <- fit(m) + + means <- getpars(fm)[c(7,9)] + lines(1:100,means[fm@posterior$state],lwd=2,col=2) + +![enter image description here][1] + + +See http://cran.r-project.org/web/packages/depmixS4/vignettes/depmixS4.pdf for references + + + [1]: https://i.stack.imgur.com/HTvOv.png",edited body,2013-10-12 16:32:41.913 +185962,57356,6630.0,5,,CC BY-SA 3.0,7a7c24a9-15cc-4bd5-9fa3-e9dcee737fd4,"MI is between two random variables. In this case, $c$ and $k$ are random variables. + +If you take $c=1$, it isn't really random anymore, and MI should be zero. Likewise for $k=1$. + +(Also, one more thing to keep in mind is that your 'plug-in' estimation of MI is biased.) + +EDIT: MI between $I(c=1)$ vs $I(k=1)$ where $I$ is the indicator function, is a different matter. Your double usage of the same variable confused me. + +Now your $c$ only take 2 values, therefore, $I(c=1) = 1 - I(c=2)$. MI is invariant invertible transformation of variables, that's why $MI(I(c=1),I(k=1)) = MI(I(c=2),I(k=1))$.",added 331 characters in body,2013-10-12 16:35:12.603 +185965,57358,22637.0,3,,CC BY-SA 3.0,87dc4342-01ae-4a49-82d3-e6c76f3b2d67,,,2013-10-12 16:48:18.213 +185964,57358,22637.0,1,,CC BY-SA 3.0,87dc4342-01ae-4a49-82d3-e6c76f3b2d67,Transformations(CDF technique),,2013-10-12 16:48:18.213 +185963,57358,22637.0,2,,CC BY-SA 3.0,87dc4342-01ae-4a49-82d3-e6c76f3b2d67,"Consider the following short example of transformations. + +Let the joint density of X and Y be given by the unit square, i.e. + +f_{X,Y}(x,y) = \begin{cases} 1\ 0,,2013-10-12 17:07:43.177 +185967,57359,346.0,1,,CC BY-SA 3.0,d12c5a00-1e64-4a4a-950b-9ce915331de5,How to include a linear and quadratic term when also including interaction with those variables?,,2013-10-12 17:07:43.177 +185966,57359,346.0,2,,CC BY-SA 3.0,d12c5a00-1e64-4a4a-950b-9ce915331de5,"When adding a numeric predictor with categorical predictors and their interactions, it is usually considered necessary to center the variables at 0 beforehand. The reasoning is that the main effects are otherwise hard to interpret as they are evaluated with the numeric predictor at 0. 
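+
+To fix ideas, the kind of model I mean looks like this in R (simulated data, the variable names are mine):
+
+    set.seed(1)
+    d <- data.frame(x = rnorm(100, mean = 5),
+                    g = factor(rep(c(""a"", ""b""), each = 50)))
+    d$y <- 2 + 0.8 * d$x - 0.1 * d$x^2 + 0.5 * (d$g == ""b"") + rnorm(100)
+
+    # numeric predictor entered as a linear plus a quadratic term,
+    # both interacting with the categorical predictor g
+    m <- lm(y ~ (x + I(x^2)) * g, data = d)
+    summary(m)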
+ +My question now is how to center if one not only includes the original numeric variable (as a linear term) but also the quadratic term of this variable? Here, two different approaches are necessary: + + 1. **Centering both variables at their individual mean.** This has the unfortunate downside that the 0 now is at a different position for both variables considering the original variable. + 2. **Centering both variables at the mean of the original variable** (i.e., subtracting the mean from the original variable for the linear term and subtracting the square of the mean of the original variable from the quadratic term). With this approach the 0 would represent the same value of the original variable, but the quadratic variable would not be centered at 0 (i.e., the mean of the variable wouldn't be 0). + +I think that approach 2 seems reasonable given the reason for centering after all. However, I cannot find anything about it (also not in the related questions: [a][1] and [b][2]). + +Or is it generally a bad idea to include linear and quadratic terms and their interactions with other variables in a model? + + + [1]: http://stats.stackexchange.com/q/67512/442 + [2]: http://stats.stackexchange.com/q/47178/442",,2013-10-12 17:07:43.177 +185971,57360,22381.0,1,,CC BY-SA 3.0,29909650-ddbb-40ba-8c5f-b88f39c774a0,Does applying ARMA-GARCH require stationarity?,,2013-10-12 17:14:19.543 +186442,57496,12314.0,3,,CC BY-SA 3.0,02a56597-a917-40a1-bfe8-276fce6f92ec,,,2013-10-15 06:05:55.207 +185970,57360,22381.0,2,,CC BY-SA 3.0,29909650-ddbb-40ba-8c5f-b88f39c774a0,"I am going to use the ARMA-GARCH model for financial time series and was wondering whether the series should be stationary before applying the said model. +I know to apply ARMA model the series should be stationary, however i'm not sure for ARMA-GARCH since i'm including GARCH errors which imply volatility clustering and non-constant variance and hence non-stationary series no matter what transformation I do. + +Are financial time series usually stationary or non-stationary? +I tried applying ADF test to a few volatile series and got p-value<0.01 which seems to indicate stationarity but the principle of volatile series itself tells us that the series isn't stationary. + +Can somebody clear that up for me?I'm getting really confused",,2013-10-12 17:14:19.543 +185969,57360,22381.0,3,,CC BY-SA 3.0,29909650-ddbb-40ba-8c5f-b88f39c774a0,,,2013-10-12 17:14:19.543 +185973,57360,1406.0,6,,CC BY-SA 3.0,14d9566d-828d-43fe-8d5d-11ea1380f5dd,,edited body; edited tags,2013-10-12 17:42:16.337 +185972,57360,1406.0,5,,CC BY-SA 3.0,14d9566d-828d-43fe-8d5d-11ea1380f5dd,"I am going to use the ARMA-GARCH model for financial time series and was wondering whether the series should be stationary before applying the said model. +I know to apply ARMA model the series should be stationary, however I'm not sure for ARMA-GARCH since I'm including GARCH errors which imply volatility clustering and non-constant variance and hence non-stationary series no matter what transformation I do. + +Are financial time series usually stationary or non-stationary? +I tried applying ADF test to a few volatile series and got p-value<0.01 which seems to indicate stationarity but the principle of volatile series itself tells us that the series isn't stationary. 
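+
+For what it is worth, this is roughly the check I ran, shown here on a simulated GARCH(1,1) series rather than my real data (adf.test comes from the tseries package):
+
+    library(tseries)   # for adf.test
+
+    # hand-rolled GARCH(1,1): volatility clusters, but constant unconditional variance
+    set.seed(123)
+    n <- 2000
+    omega <- 0.1; alpha <- 0.1; beta <- 0.8
+    r <- numeric(n)
+    sigma2 <- numeric(n)
+    sigma2[1] <- omega / (1 - alpha - beta)
+    r[1] <- sqrt(sigma2[1]) * rnorm(1)
+    for (t in 2:n) {
+      sigma2[t] <- omega + alpha * r[t - 1]^2 + beta * sigma2[t - 1]
+      r[t] <- sqrt(sigma2[t]) * rnorm(1)
+    }
+
+    adf.test(r)   # small p-value: the unit-root null is rejected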
+ +Can somebody clear that up for me?I'm getting really confused",edited body; edited tags,2013-10-12 17:42:16.337 +185974,57361,20473.0,2,,CC BY-SA 3.0,369a04f4-82cc-49fa-82bb-8b0c0150d175,"Copying from the abstract of [Engle's original paper][1]: +""These are mean zero, serially uncorrelated processes with nonconstant variances conditional on the past, but constant unconditional variances. For such processes, the recent past gives information about the one-period forecast variance"". + + + [1]: http://www.jstor.org/stable/1912773",,2013-10-12 17:42:59.303 +185975,57362,1406.0,2,,CC BY-SA 3.0,28da99a9-2af5-4033-ac26-15815377fb86,"Yes the the series should be stationary. GARCH models are actually white noise processes with not trivial dependence structure. Classical GARCH(1,1) model is defined as + +$$r_t=\sigma_t\varepsilon_t,$$ + +with + +$$\sigma_t^2=\alpha_0+\alpha_1\varepsilon_{t-1}^2+\beta_1\sigma_{t-1}^2,$$ + +where $\varepsilon_t$ are independent standard normal variables with unit variance. + +Then + +$$Er_t=EE(r_t|\varepsilon_{t-1},\varepsilon_{t-2},...)=E\sigma_tE(\varepsilon_t|\varepsilon_{t-1},\varepsilon_{t-2},...)=0$$ + +and + +$$Er_tr_{t-h}=EE(r_tr_{t-h}|\varepsilon_{t-1},\varepsilon_{t-2},...)=Er_{t-h}\sigma_{t}E(\varepsilon_t|\varepsilon_{t-1},\varepsilon_{t-2},...)=0$$ + +for $h>0$. Hence $r_t$ is a white noise process. However it is possible to show that $r_t^2$ is actually a $ARMA(1,1)$ process. So GARCH(1,1) is stationary process, yet has non-constant conditional variance. +",,2013-10-12 17:56:44.900 +185976,57361,20473.0,5,,CC BY-SA 3.0,697b5fff-ed99-4197-a405-82017ef701c2,"Copying from the abstract of [Engle's original paper][1]: +""These are mean zero, serially uncorrelated processes with nonconstant variances conditional on the past, but constant unconditional variances. For such processes, the recent past gives information about the one-period forecast variance"". + +Continuing with the references, as the author who introduced GARCH shows (Bollerslev, Tim (1986). ""[Generalized Autoregressive Conditional Heteroskedasticity][2]"", Journal of Econometrics, 31:307-327) +for the GARCH(1,1) process, it suffices that $\alpha_1 + \beta_1 <1$ for 2nd-order stationarity. + + + [1]: http://www.jstor.org/stable/1912773 + [2]: http://www.sciencedirect.com/science/article/pii/0304407686900631#",added 377 characters in body,2013-10-12 18:01:48.457 +185979,57363,22639.0,1,,CC BY-SA 3.0,d9ae5e17-ced4-42c2-8ce2-70ffd3d9fde7,Machine learning algorithms/approaches for class recommendations?,,2013-10-12 18:03:28.603 +185978,57363,22639.0,2,,CC BY-SA 3.0,d9ae5e17-ced4-42c2-8ce2-70ffd3d9fde7,"I am asking a theoretical question about machine learning in terms of clustering. Is it possible, given a set of data of classes that students have taken in a semester to recommend additional classes that students should take if they selected some classes? + +I am thinking along the line of forming clusters of classes and figuring out if a particular set of picked classes match with a pre-existing set of classes. Then, recommend the class that are in the set. But I am new to machine learning, and so welcome any other suggestions of algorithms. + +In addition, this is not particularly theoretical, so feel free to ignore: but does anyone know any particular software that can accomplish this? I know LensKit is a software to handle recommendations but it seems to need ratings (which I do not have). 
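+
+To make the setup concrete, my data can be arranged roughly like this (a toy example with invented class names):
+
+    # rows = students, columns = classes, 1 = the student took that class
+    m <- matrix(c(1, 1, 0, 0,
+                  1, 0, 1, 0,
+                  0, 1, 1, 1), nrow = 3, byrow = TRUE)
+    colnames(m) <- c(""calculus"", ""statistics"", ""physics"", ""biology"")
+
+    # one crude starting point: how often each pair of classes is taken together
+    co_taken <- t(m) %*% m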
+ +I welcome any mathematical manipulations that can turn clusters into ""ratings."" Thanks.",,2013-10-12 18:03:28.603 +185977,57363,22639.0,3,,CC BY-SA 3.0,d9ae5e17-ced4-42c2-8ce2-70ffd3d9fde7,,,2013-10-12 18:03:28.603 +185980,57351,20473.0,5,,CC BY-SA 3.0,2d3aca9e-4928-4815-8891-cf8b3e4662eb,"In an expression where more than one random variables are involved, the symbol $E$ alone does not clarify _with respect to which random variable_ is the expected value ""taken"". For example + +$$E[h(X,Y)] =? \int_{-\infty}^{\infty} h(x,y) f_X(x)dx$$ +or +$$E[h(X,Y)] =? \int_{-\infty}^{\infty} h(x,y) f_Y(y)dx$$ + +*Neither*. When many random variables are involved, and there is no subscript in the $E$ symbol, the expected value is taken with respect to their joint distribution: + +$$E[h(X,Y)] = \int_{-\infty}^{\infty}\int_{-\infty}^{\infty} h(x,y) f_{XY}(x,y)dxdy$$ + + +When a subscript is present... in some cases it tells us *on which variable we should condition*. So + +$$E_X[(h(X,Y)] = E(h(X,Y)\mid X) = \int_{-\infty}^{\infty} h(x,y) f_{h(X,Y)|X}(h(x,y)\mid x)dh $$ + +...But in other cases, it tells us which density to use for the ""averaging"" + +$$E_X[(h(X,Y)] = \int_{-\infty}^{\infty} h(x,y) f_{X}(x)dx $$ + + + +Rather confusing I would say, but who said that scientific notation is totally free of ambiguity or multiple use? You should look how each author defines the use of such symbols.",added 13 characters in body,2013-10-12 18:03:55.497 +185981,57361,20473.0,5,,CC BY-SA 3.0,438c108c-b984-4ffb-affb-0d2971fb4f06,"Copying from the abstract of [Engle's original paper][1]: +""These are mean zero, serially uncorrelated processes with nonconstant variances conditional on the past, but constant unconditional variances. For such processes, the recent past gives information about the one-period forecast variance"". + +Continuing with the references, as the author who introduced GARCH shows (Bollerslev, Tim (1986). ""[Generalized Autoregressive Conditional Heteroskedasticity][2]"", Journal of Econometrics, 31:307-327) +for the GARCH(1,1) process, it suffices that $\alpha_1 + \beta_1 <1$ for 2nd-order stationarity. + +Stationarity (the one needed for estimation procedures), is defined relative to the _unconditional_ distribution and moments. + + + [1]: http://www.jstor.org/stable/1912773 + [2]: http://www.sciencedirect.com/science/article/pii/0304407686900631#",added 377 characters in body,2013-10-12 18:07:20.230 +185982,57364,10135.0,2,,CC BY-SA 3.0,ca91867f-5b21-43b4-b6f2-bf2a8d26c460,"OK, check out the following plot. ![enter image description here][1] + +You need to find the area of shaded region. So you need to take double integration over the shaded region. First you fix your $X$ and take your integration with respect to $Y$. Look at the double bar in the middle of that triangle (upper right corner). Its lower part goes from $Y=-X+z$ to its upper part $Y=1$. These are the bounds for the first integration. Now you need to move that little bar in the middle to left and right to cover all the shaded region. In other words, it means that this time, you need to take your integration with respect to $X$. So as you can see, the line $Y=-X+z$ for $1\leq z<2$ intercepts the line $Y=1$ at $X=z-1$. This is the left boundary limit for your integration. Now move that little bar to the right, it should go up to $X=1$, that gives you the upper bound. Hope that helps. 
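+
+Writing those bounds out explicitly (my own check of the setup, for $1\leq z<2$), the area of the shaded region is
+
+$$\int_{z-1}^{1}\int_{-x+z}^{1} dy\, dx = \int_{z-1}^{1} (1-z+x)\, dx = \frac{(2-z)^2}{2}.$$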
+ + [1]: https://i.stack.imgur.com/OfF8D.jpg",,2013-10-12 18:10:09.217 +185983,57212,,25,,,ba7eeb65-fdcc-4303-8892-f1d10b5eb194,,http://twitter.com/#!/StackStats/status/389092757832491008,2013-10-12 18:18:37.167 +185984,57365,22640.0,2,,CC BY-SA 3.0,449888d5-8c6a-4a70-8453-f89009bb79bb,"We have a response variable $Y$ and predictor $X$, and we draw $n$ samples $(Y_1,X_1), \ldots, (Y_n, X_n)$ from the population of interest to do a regression analysis. Under the assumptions of a simple linear regression model, my question is a conceptual one: how do we really think about the response on the $i$th unit, $Y_i$? Do we say it's drawn from the level or subpopulation of individuals with $ X = x_i $, or from the aggregate population over all the values of $X$? Moreover, while we assume that the response $Y$ in every subpopulation defined by $X$ is normal with equal variances, how do we think about the aggregate population from which $Y_i$ is drawn from? ",,2013-10-12 18:36:45.027 +185986,57365,22640.0,3,,CC BY-SA 3.0,449888d5-8c6a-4a70-8453-f89009bb79bb,,,2013-10-12 18:36:45.027 +185985,57365,22640.0,1,,CC BY-SA 3.0,449888d5-8c6a-4a70-8453-f89009bb79bb,"Drawing data from ""population"" for regression analysis",,2013-10-12 18:36:45.027 +185988,57354,674.0,5,,CC BY-SA 3.0,73870fc2-e681-446d-8328-f794a4d67f1c,"I have a problem when computing the mutual information between two variables. +Let's consider the following table: + + .................k=1 .... k=2.....k=3 + c = 1.........10.......20.........5 + c = 2..........5 ........3.........20 + +I want to calculate the mutual information between the class (c) and the cluster (k). + +E.g., for MI(c=1, k=1), I will calculate the P(c,k) as follows: + + ..............k=1 ...... k~=1 + c=1 ........10/63 ........ 25/63 + c~=1 ...... 5/63 ....... 23/63 + +and I can calculate the MI based on the following: +$MI(c,k) = \sum_{c\in\{0,1\}} \sum_{k\in\{0,1\}} p(c,k)log_2 \frac{p(c,k)}{p(c)p(k)}$ + +In this case: I am getting the same MI for (c=1, k=1) and (c=2, k=1). What is the reason for this? How can I calculate the MI for each (c,k) based on the first table? + +",added 36 characters in body; edited title,2013-10-12 18:37:57.323 +185987,57354,674.0,4,,CC BY-SA 3.0,73870fc2-e681-446d-8328-f794a4d67f1c,Computing mutual information,added 36 characters in body; edited title,2013-10-12 18:37:57.323 +185989,57366,306.0,2,,CC BY-SA 3.0,65a4ab91-1449-4132-8426-01b75e01e43f,"Stationarity is a theoretical concept which is then modified to other forms like Weak Sense Stationarity which can be tested easily. Most of the tests like adf test as you have mentioned test for linear conditions only. the ARCH effects are made for series which do not have autocorrelation in the first order but there is dependence in the squared series. + +The ARMA-GARCH process you talk about, here the second order dependence is removed using the GARCH part and then any dependence in the linear terms is captured by the ARMA process. 
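+
+A small simulated illustration of that last point (an ARCH(1) series; the particular numbers are arbitrary):
+
+    set.seed(7)
+    n <- 3000
+    z <- rnorm(n)
+    r <- numeric(n)
+    r[1] <- z[1]
+    for (t in 2:n) r[t] <- sqrt(0.2 + 0.5 * r[t - 1]^2) * z[t]
+
+    acf(r)     # little autocorrelation in the series itself
+    acf(r^2)   # clear dependence in the squared series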
+ +The way to go about is to check for the autocorrelation of the squared series, if there is dependence, then apply the GARCH models and check the residuals for any linear time series properties which can then be modelled using ARMA processes.",,2013-10-12 18:41:16.300 +185991,57367,22641.0,3,,CC BY-SA 3.0,1d66db1f-3979-4a43-8aa0-311c2700bc62,,,2013-10-12 18:47:43.087 +185992,57367,22641.0,1,,CC BY-SA 3.0,1d66db1f-3979-4a43-8aa0-311c2700bc62,"Test for differences between (among) related, but not matched, samples",,2013-10-12 18:47:43.087 +185990,57367,22641.0,2,,CC BY-SA 3.0,1d66db1f-3979-4a43-8aa0-311c2700bc62,"When two samples are related, or dependent, but the observations are not matched, are there any tests that will determine if the samples (means or otherwise) are different? I've searched extensively and have only found tests for matched samples, which is not what I need.",,2013-10-12 18:47:43.087 +185995,57368,21840.0,1,,CC BY-SA 3.0,fef043d1-3990-4e4f-a790-427b00612d5c,Compute probability,,2013-10-12 19:10:33.277 +185994,57368,21840.0,3,,CC BY-SA 3.0,fef043d1-3990-4e4f-a790-427b00612d5c,,,2013-10-12 19:10:33.277 +185993,57368,21840.0,2,,CC BY-SA 3.0,fef043d1-3990-4e4f-a790-427b00612d5c,"Suppose that X, Y and Z are i.i.d.∼ Uniform(0,1). Let $t > 0$ be a fixed constant. +(i) Compute $P(XY \leq t)$ +(ii) Compute $ P(XY \leq t)$ +(iii) Compute $ P(XY/Z \leq t)$ + +I found the solution for (i) part undermining the different values of $t$. +$ P(XY \leq t) = \int_0^1\int_0^{yt}dx dy = t/2 $ ; when $t\leq 1$ + +I am finding hard time to lake limits for different values of $t$ i.e when $t\leq 1 $ and $t>1$ + Thanks!",,2013-10-12 19:10:33.277 +185996,57368,21840.0,5,,CC BY-SA 3.0,bc8a8687-8449-42de-9506-53d4b1b9b3b2,"Suppose that X, Y and Z are i.i.d.∼ Uniform(0,1). Let $t > 0$ be a fixed constant. +(i) Compute $P(X/Y \leq t)$ +(ii) Compute $ P(XY \leq t)$ +(iii) Compute $ P(XY/Z \leq t)$ + +I found the solution for (i) part undermining the different values of $t$. +$ P(X/Y \leq t) = \int_0^1\int_0^{yt}dx dy = t/2 $ ; when $t\leq 1$ + +I am finding hard time to lake limits for different values of $t$ i.e when $t\leq 1 $ and $t>1$ + Thanks!",corrected question,2013-10-12 19:21:00.273 +185997,57369,22507.0,2,,CC BY-SA 3.0,762352f5-4845-4539-909c-941efdf021f9,"Clustering is seldom, if any, used for recommendations, since it is too crude. The most common techniques used are: + +- matrix factorization; read, for example, ""Matrix Factorization Techniques for Recommender Systems"" by Koren, Bell, and Volinsky. If you use R, there is are packages NMFN and gnmf for non-negative matrix factorization. In your case, this will be the matrix of 0's and 1's. There are many modifications and versions of this technique. + +- KNN. For each class, find classes highly correlative with it. Then predict the probability for this class as a linear regression (or, in your case, logistic regression) of the correlative classes, with relaxation. +- Restricted Boltzmann Machines. This is relatively hard to understand or implement. Read, for example, ""Restricted Boltzmann Machines for Collaborative Filtering"" by Salakhutdinov, Mnih, and Hinton. There are no Restricted Boltzmann Machine packages on R.",,2013-10-12 19:31:39.603 +185998,57370,22507.0,2,,CC BY-SA 3.0,6527c729-ca31-4d3c-a5be-64ad2608fed0,"I recommend ""The Elements of Statistical Learning"", by Hastie, Tibshirani, and Friedman. 
Don't just read it, play with some algorithms described by them (most of them are implemented in R, or you could even implement some yourself), and learn their weak and strong points.",,2013-10-12 19:39:43.643 +185999,57369,22507.0,5,,CC BY-SA 3.0,9e8cc3c0-5f25-4f43-a1f7-0fd1c1f60e9c,"Clustering is seldom, if ever, used for recommendations, since it is too crude. The most common techniques used are: + +- matrix factorization; read, for example, ""Matrix Factorization Techniques for Recommender Systems"" by Koren, Bell, and Volinsky. If you use R, there is are packages NMFN and gnmf for non-negative matrix factorization. In your case, this will be the matrix of 0's and 1's. There are many modifications and versions of this technique. +- KNN. For each class, find classes highly correlative with it. Then predict the probability for this class as a linear regression (or, in your case, logistic regression) of the correlative classes, with relaxation. +- Restricted Boltzmann Machines. This is relatively hard to understand or implement. Read, for example, ""Restricted Boltzmann Machines for Collaborative Filtering"" by Salakhutdinov, Mnih, and Hinton. There are no Restricted Boltzmann Machine packages on R. +- Often, a combination of different approaches (blending) is used, providing better results than each one separately. For example, Netflix uses a blending of Matrix Factorization and Restricted Boltzmann Machines.",added 221 characters in body,2013-10-12 19:45:19.737 +186000,57371,22507.0,2,,CC BY-SA 3.0,f2a59895-51d9-48d2-b813-28ffb4f9de02,"Calculate a correlation of two functions over a set of random examples. The two-sided Kolmogorov-Smirnov test compares one-dimensional distributions, not multidimensional functions.",,2013-10-12 20:15:29.000 +186001,57372,21947.0,2,,CC BY-SA 3.0,083d1d96-46d4-4db7-966f-c3a2987662e3,"The answer is **absolutely not**. There is no ""in the eye of the beholder"", there is no argument, the answer is **no, your data is not significant at the $p=0.05$ level**. (Ok, there is one way out, but its a very narrow path.) + +The key problem is this phrase: ""We **came across** some data..."". + +This suggests that you looked at several other statistical hypothesis, and rejected them because they did not reach your significance level. You found one hypothesis that (barely) met your standard, and you are wondering whether it is significant. Unless your $p$ value accounts for such multiple hypothesis testing, it is overly optimistic. Given that you are just three decimal points away from your threshold, considering even *one* additional hypothesis would surely push $p$ over the line. + +There is a name for this sort of statistical malfeasance: [data dredging][1]. I'm ambivalent about reporting it in the paper as an interesting hypothesis; does it have some physical reason you expect it to hold? + +**There is, however, one way out.** Perhaps you decided *a priori* to perform just this *one* test on just this *one* data set. You wrote that down in your lab notebook, in front of someone so that you could prove it later. Then you did your test. + +If you did this, then your result is valid at the $p=0.05$ level, and you can back it up to skeptics like me. Otherwise, sorry, it is not a statistically significant result. 
+ + + [1]: https://en.wikipedia.org/wiki/Data_dredging",,2013-10-12 20:45:05.873 +186003,57236,,25,,,05fbbf93-72aa-4c57-a648-fdcbf5598db8,,http://twitter.com/#!/StackStats/status/389138447711354880,2013-10-12 21:20:10.563 +186004,57358,594.0,5,,CC BY-SA 3.0,8750ffda-8d60-4e3f-9baa-f816cfb1712e,"Consider the following short example of transformations. + +Let the joint density of X and Y be given by the unit square, i.e. + +$$f_{X,Y}(x,y) = \begin{cases} 1\ 0,edited tags,2013-10-12 22:59:22.727 +186009,57358,594.0,5,,CC BY-SA 3.0,b98cd876-2088-4b96-944d-af30ce185fae,"Consider the following short example of transformations. + +Let the joint density of X and Y be given by the unit square, i.e. + +$$f_{X,Y}(x,y) = \begin{cases} 1\ \quad 0 0$ be a fixed constant. + +(i) Compute $P(X/Y \leq t)$ +(ii) Compute $ P(XY \leq t)$ +(iii) Compute $ P(XY/Z \leq t)$ + +I found the solution for (i) part undermining the different values of $t$. +$ P(X/Y \leq t) = \int_0^1\int_0^{yt}dx dy = t/2 $ ; when $t\leq 1$ + +I am finding hard time to lake limits for different values of $t$ i.e when $t\leq 1 $ and $t>1$ + +",formatting,2013-10-12 23:14:21.973 +186012,57367,594.0,6,,CC BY-SA 3.0,f2ea2cf0-2a8e-4c65-9306-797a174eaed6,,edited tags,2013-10-12 23:17:27.737 +186013,57367,594.0,6,,CC BY-SA 3.0,cbfa9d53-1499-4e46-84b2-362858ecba21,,edited tags,2013-10-13 00:13:24.583 +186014,57237,,25,,,e5f76412-bdfc-4413-9f5d-252228f36dd6,,http://twitter.com/#!/StackStats/status/389183745468952576,2013-10-13 00:20:10.317 +186036,57378,5001.0,2,,CC BY-SA 3.0,0bcada00-63ca-43a9-b7f8-5f8e34dde4cc,"I am trying to interpret one of the p-values in a one variable linear regression. + +From Microsoft Excel the linear regression formula from 90 samples of (x,y) pairs is `y = 0.514x + 0.00087` and the p-value of the first coefficient is 4e-16 (scientific notation) and for the second it is 0.0027. + +Would it be correct to say that the interpretation of the p-value of the 0.00087 term is: + +> Under the assumption that the true value of the y-intercept is zero +> and the first coefficient is 0.514, random sampling of the same number +> of (x,y) pairs, specifically 90, would result in a least squares best +> fit line with a y-intercept at least as extreme as 0.00087, with a +> probability of 0.0027. + +If not, then what would be the correct interpretation?",,2013-10-13 08:09:51.043 +186016,57354,594.0,5,,CC BY-SA 3.0,6b9c8c48-15d5-4e7e-a4c5-fe6ebeba05ce,"I have a problem when computing the mutual information between two variables. +Let's consider the following table: + + k=1 k=2 k=3 + c = 1 10 20 5 + c = 2 5 3 20 + +I want to calculate the mutual information between the class ($c$) and the cluster ($k$). + +E.g., for $\text{MI}(c=1, k=1)$, I will calculate the $P(c,k)$ as follows: + + k=1 k~=1 + c=1 10/63 25/63 + c~=1 5/63 23/63 + +and I can calculate the MI based on the following: + +$\text{MI}(c,k) = \sum_{c\in\{0,1\}} \sum_{k\in\{0,1\}} p(c,k)log_2 \frac{p(c,k)}{p(c)p(k)}$ + +In this case: I am getting the same MI for $(c=1, k=1)$ and $(c=2, k=1)$. What is the reason for this? How can I calculate the MI for each $(c,k)$ based on the first table? 
+ +",added 34 characters in body,2013-10-13 01:56:26.553 +186017,57251,,25,,,fee0bf89-cd47-4af0-bb23-7fd45d290003,,http://twitter.com/#!/StackStats/status/389229042077286400,2013-10-13 03:20:09.907 +186020,57373,17459.0,3,,CC BY-SA 3.0,e2e784d5-b1b7-4edd-94fe-6ee0a142b610,,,2013-10-13 03:46:30.167 +186018,57373,17459.0,2,,CC BY-SA 3.0,e2e784d5-b1b7-4edd-94fe-6ee0a142b610,"I am stuck by a problem and wonder if anyone can give me some suggestions? +x1, x2, x3 all follow uniform [0,1] distribution, and subject to constraint that x1+x2+x3<=1. +what's the joint distribution for p(x1,x2,x3) ? and what's the variance-covariance matrix for it ? + +I get the joint distribution by geometrical way, that the pdf should be 1/6. +However, I can't calculate the variance-covariance matrix for it. I wonder how to get it ? + +Thank you very much. +",,2013-10-13 03:46:30.167 +186019,57373,17459.0,1,,CC BY-SA 3.0,e2e784d5-b1b7-4edd-94fe-6ee0a142b610,what's the pdf and covariance for this distribution?,,2013-10-13 03:46:30.167 +186021,57373,594.0,5,,CC BY-SA 3.0,872a0d51-6505-4bb6-aa26-01c3aa9a2928,"I am stuck on a problem and wonder if anyone can give me some suggestions. + +$X_1, X_2, X_3$ all follow a $\text{Uniform}[0,1]$ distribution and are subject to the constraint $X_1+X_2+X_3\leq 1$. + +What's the joint distribution for $(X_1, X_2, X_3)$, that is, what's $p(X_1, X_2, X_3)$, and what's the variance-covariance matrix for it? + +I get the joint distribution by geometrical way, that the pdf should be $1/6$. + +However, I can't calculate the variance-covariance matrix for it. I wonder how to get it? +",formatting,2013-10-13 04:17:33.090 +186026,57374,17123.0,2,,CC BY-SA 3.0,99ca1f82-0607-4eff-aa38-aa1af3f190e7,"Suppose there is a population, with goods and bads. The bad rate of the population(=bads/(bads+goods)) is of course unknown. Now, I have a sample of N from the population and I know the bad rate of this sample as b. The question is can I calculate the confidence interval based on N and b ONLY? In other words, can I calculate the confidence interval x such that with, say, 95% confidence the population bad rate falls in [range1,range2], where range1 will be b-x and range2 will be b+x. + +Thank you.",,2013-10-13 04:36:36.017 +186024,57374,17123.0,3,,CC BY-SA 3.0,99ca1f82-0607-4eff-aa38-aa1af3f190e7,,,2013-10-13 04:36:36.017 +186025,57374,17123.0,1,,CC BY-SA 3.0,99ca1f82-0607-4eff-aa38-aa1af3f190e7,How to estimate the confidence interval using sample average and sample size ONLY?,,2013-10-13 04:36:36.017 +186029,57359,,25,,,547ab796-736a-4d15-aae3-53ee81672086,,http://twitter.com/#!/StackStats/status/389274345891434496,2013-10-13 06:20:11.157 +186030,57374,594.0,5,,CC BY-SA 3.0,b6f7b306-b4c0-4c80-a9b5-da020de0a758,"Suppose there is a population, with goods and bads. The bad rate of the population(=bads/(bads+goods)) is of course unknown. + +Now, I have a sample of $N$ from the population and I know the bad rate of this sample as $b$. The question is can I calculate the confidence interval based on $N$ and $b$ ONLY? In other words, can I calculate the confidence interval $x$ such that with, say, 95% confidence the population bad rate falls in $[\text{range}_1,\text{range}_2]$, where $\text{range}_1$ will be $b-x$ and $\text{range}_2$ will be $b+x$. + +",added 46 characters in body,2013-10-13 06:26:53.203 +186031,57375,,2,anon,CC BY-SA 3.0,7e1773e8-6d58-496a-9401-44bac6c0d313,"Exactly as Glen_b said. 
Under random sampling the confidence interval for a binomial proportion can be easily calculated with, e.g., using the normal approximation. The formula can be found from Wikipedia, among other sources (http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval). + +As an example, the 95% confidence interval for a sample of 1000 with the proportion of bad of 0.5, the confidence interval would be from 0.5-sqrt((1/1000)*0.5*0.5)*1.96 to 0.5+sqrt((1/1000)*0.5*0.5)*1.96. In other words, in this case the 95% confidence interval would be 0.469-0.530. + + + +",,2013-10-13 06:29:18.963 +186032,57376,19455.0,2,,CC BY-SA 3.0,e90faa02-1c8b-4e11-922c-51408132aa10,"I would suggest a different approach. Instead of sweeping across all possible classifiers, +stop and think about your problem. How does your feature space look like? For the case of binary classification, are there two large clusters with some boundary, or is your feature space ""segmented"" and contains many clusters? + +In the former case, an SVM would be a good choice to separate the two clusters (with the right choice of kernel), in the latter a decision tree which splits the feature space into areas would probably be a better choice. Another issue is interpretability, do you need some sort of report or methodology for classification, or simply a prediction result? Decision tree can provide you with a methodology you can follow, enabling you to debug and check if you are overfitting. From my personal experience, understanding your dataset is at least as important as the choice of algorithm. + +",,2013-10-13 07:01:11.173 +186034,57377,12756.0,1,,CC BY-SA 3.0,294c3ee3-bbd6-4ff3-880e-f19958acb9b4,Usage of Linear optimization model,,2013-10-13 07:04:53.763 +186035,57377,12756.0,2,,CC BY-SA 3.0,294c3ee3-bbd6-4ff3-880e-f19958acb9b4,"Gonna need some support. + +Young Energy operates a power plant. The power plant is a coal-fired boiler that produces steam which in turn drives a generator. The company can buy different types of coal, and then mix them to meet the demands placed on it which is fired in the boiler. The table shows the characteristics of the different types of coal are: + +![enter image description here][1] + + +The requirement to be burned in the pan is: + + - BTU/lb: 11900, + - content of the ashes max 12,2% and + - max moisture 9,4%. + + [1]: https://i.stack.imgur.com/Jg23l.png + +> How should I implement a linear optimization model in this context?",,2013-10-13 07:04:53.763 +186033,57377,12756.0,3,,CC BY-SA 3.0,294c3ee3-bbd6-4ff3-880e-f19958acb9b4,,,2013-10-13 07:04:53.763 +186038,57378,5001.0,1,,CC BY-SA 3.0,0bcada00-63ca-43a9-b7f8-5f8e34dde4cc,Interpretation of the p-value of the y-intercept coefficient in a linear regression,,2013-10-13 08:09:51.043 +186037,57378,5001.0,3,,CC BY-SA 3.0,0bcada00-63ca-43a9-b7f8-5f8e34dde4cc,,,2013-10-13 08:09:51.043 +186041,57379,22646.0,3,,CC BY-SA 3.0,d3df0456-f538-4b70-a9b8-965b946b6be4,,,2013-10-13 08:15:40.013 +186039,57379,22646.0,2,,CC BY-SA 3.0,d3df0456-f538-4b70-a9b8-965b946b6be4,"Hi I am new to R and statistis and usd to linear models. Can you please explain the output? I used it to make a growth curve. +thanks + + +Formula: length ~ a * (1 - exp(-c * est_age)) + +Parameters: + Estimate Std. Error t value Pr(>|t|) +a 1.097e+03 1.026e+01 106.966 < 2e-16 *** +c 1.539e-01 1.982e-02 7.765 2.33e-09 *** +--- +Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 + +Residual standard error: 41.74 on 38 degrees of freedom + +Number of iterations to convergence: 6 +Achieved convergence tolerance: 7.32e-07 ",,2013-10-13 08:15:40.013 +186040,57379,22646.0,1,,CC BY-SA 3.0,d3df0456-f538-4b70-a9b8-965b946b6be4,standard error of the residuals for a non-linear model,,2013-10-13 08:15:40.013 +186042,57378,5001.0,5,,CC BY-SA 3.0,9d0a3ba1-4b8c-4faa-b597-089e0d9c276c,"I am trying to interpret one of the p-values in a one variable linear regression. Some of the answers I've seen for similar questions were not worded as thoroughly as I would have liked. My interpretation is deliberately verbose because it will aid my understanding if and what way faults are found in it. + +From Microsoft Excel the linear regression formula from 90 samples of (x,y) pairs is `y = 0.514x + 0.00087` and the p-value of the first coefficient is 4e-16 (scientific notation) and for the second it is 0.0027. + +Would it be correct to say that the interpretation of the p-value of the 0.00087 term is: + +> Under the assumption that the true value of the y-intercept is zero +> and the first coefficient is 0.514, random sampling of the same number +> of (x,y) pairs, specifically 90, would result in a least squares best +> fit line with a y-intercept at least as extreme as 0.00087, with a +> probability of 0.0027. + +If not, then what would be the correct interpretation? + +Not so importantly, but just to be complete, I am also inquiring if it would be more accurate and complete to put the relevant phrase as + +> ""at least as extreme as 0.00087 in the same direction, that is, +> positive"".",added 185 characters in body,2013-10-13 08:16:04.243 +186043,57378,5001.0,5,,CC BY-SA 3.0,249f4689-fe24-4cb9-9336-d45132e5a3b4,"I am trying to interpret one of the p-values in a one variable linear regression. Some of the answers I've seen for similar questions were not worded as thoroughly as I would have liked. My interpretation is deliberately verbose because it will aid my understanding if faults are found within it. + +From Microsoft Excel the linear regression formula from 90 samples of (x,y) pairs is `y = 0.514x + 0.00087` and the p-value of the first coefficient is 4e-16 (scientific notation) and for the second it is 0.0027. + +Would it be correct to say that the interpretation of the p-value of the 0.00087 term is: + +> Under the assumption that the true value of the y-intercept is zero +> and the first coefficient is 0.514, random sampling of the same number +> of (x,y) pairs, specifically 90, would result in a least squares best +> fit line with a y-intercept at least as extreme as 0.00087, with a +> probability of 0.0027. + +If not, then what would be the correct interpretation? + +Not so importantly, but just to be complete, I am also inquiring if it would be more accurate and complete to put the relevant phrase as + +> ""at least as extreme as 0.00087 in the same direction, that is, +> positive"".",added 3 characters in body,2013-10-13 08:21:39.643 +186045,57379,594.0,5,,CC BY-SA 3.0,482e4d28-861d-4c72-a10e-a4ea65bb33df,"Hi I am new to R and statistics and used to linear models. Can you please explain the output? I used it to make a growth curve. + + + Formula: length ~ a * (1 - exp(-c * est_age)) + + Parameters: + Estimate Std. Error t value Pr(>|t|) + a 1.097e+03 1.026e+01 106.966 < 2e-16 *** + c 1.539e-01 1.982e-02 7.765 2.33e-09 *** + --- + Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 + + Residual standard error: 41.74 on 38 degrees of freedom + Number of iterations to convergence: 6 + Achieved convergence tolerance: 7.32e-07 + +",added 44 characters in body,2013-10-13 09:23:36.447 +186046,57379,594.0,6,,CC BY-SA 3.0,482e4d28-861d-4c72-a10e-a4ea65bb33df,,added 44 characters in body,2013-10-13 09:23:36.447 +186047,57185,,5,user10619,CC BY-SA 3.0,837be1ea-06e8-4188-973d-5176ca508e5d,There is some confusion with respect to the measurement error. What is the definition in statistics and definition in psychometry ? The statistics does not seem to recognize the measurement error popularly called construct bias in psychometry.,"changed the background to be specific, deleted confusing part in the main question",2013-10-13 09:32:59.457 +186048,57185,,6,user10619,CC BY-SA 3.0,837be1ea-06e8-4188-973d-5176ca508e5d,,"changed the background to be specific, deleted confusing part in the main question",2013-10-13 09:32:59.457 +186049,57333,22359.0,5,,CC BY-SA 3.0,61df977b-84aa-4e6d-afb9-79d343c6fa98,"Try [this][1] paper. Your answer might be at chapter 3.2, figures 2 and 3. + +Long story short: The same performance can be obtained for different pairs of C and kernel parameters. You shouldn't try to manually tune a SVM. + +**Edit:** Some details: + +I usually tune C (the cost parameter) when I have largely imbalanced classes. That is, one class have 10% and the other 90%. Some SVM libraries (esp. libSVM which I use) lets you specify a cost for each class. According to [libsvm][1] paper, $\frac{c_1}{c_2} = \frac{n_2}{n_1}$ where $n_2>n_1$ , $n_i$ is the volume of the i'th class. If you let $c_2 = 1$ then $c_1 = n_2/n_1$ . There is also a ""global"" C, that is multiplied with the specific $c_i$ values. + +When the learning algorithm computes the error for the current SVM parameters, it multiplies each wrongly classified instance with this cost. If the cost is the same for both classes, the lesser class errors will get diluted and your final model will tend not to predict very well (or not at all) the weakly represented class. + +Gamma acts as the $\sigma$ for a Gaussian kernel $G(x) = exp(-x^2/2\sigma^2)$. Note from the equation of RBF : $K(x,y)=exp(-\gamma||x-y||^2)$ that $\gamma$ is more or less proportional to $1/\sigma^2$. Basically $\gamma$ controls the width of the kernel. + +The intuition behind this is that a large kernel will tend to produce a smoother border between classes and a narrower kernel a more intricate border. In extreme, the former will tend to give higher bias (it learns only the general aspect of the data) and the latter will tend to overfit (it learns all the details, including the outliers and errors in the data). None of these extremes are welcome in applications. A midpoint is desired, but this midpoint cannot be computed analytically and depends on the actual data. + +This is why, the metaparameters are usually searched through cross validation. Please keep in mind that you must optimize for BOTH parameters in the same time. + +Hope it helps! + +p.s. I am not an expert in SVM so I can't give you the intuition on how exactly the values for global cost parameter C actually influences the results or the convergence speed of SVM. 
+ + [1]: http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf",Added more details and some insights from my experience,2013-10-13 09:41:06.863 +186051,57380,22648.0,2,,CC BY-SA 3.0,03eb5e40-d770-475c-8399-6eddf3575e67,"I'm trying to predict a response variable in linear regression that should be always positive (cost per click). It's a monetary amount. In adwords, you pay google for clicks on your ads, and a negative number would mean that google pays you when people clicked :P + +The predictors are all continuous values. The Rsquared and RMSE are decent when compared to other models, even out-of-sample: + +RMSE Rsquared + +1.4141477 0.8207303 + +I cannot rescale the predictions, because it's money, so even a small rescaling factor could change costs significantly. + +As far as I understand, for the regression model there's nothing special about zero and negative numbers, so it finds the best regression hyperplane no matter whether the output is partly negative. + +This is a very first attempt, using all variables I have. So there's room for refinement. + +Is there any way to tell the model that the output cannot be negative?",,2013-10-13 09:41:29.343 +186052,57380,22648.0,1,,CC BY-SA 3.0,03eb5e40-d770-475c-8399-6eddf3575e67,Negative values in predictions for an always-positive response variable in linear regression,,2013-10-13 09:41:29.343 +186050,57380,22648.0,3,,CC BY-SA 3.0,03eb5e40-d770-475c-8399-6eddf3575e67,,,2013-10-13 09:41:29.343 +186053,57379,,24,,CC BY-SA 3.0,5656219a-1981-480c-8721-c5bc33aa4d8a,,Proposed by 26338 approved by -1 edit id of 5604,2013-10-13 09:53:05.250 +186055,57379,,6,,CC BY-SA 3.0,5656219a-1981-480c-8721-c5bc33aa4d8a,,added standard error tag for clarification,2013-10-13 09:53:05.250 +186054,57379,,4,user88,CC BY-SA 3.0,7a589384-e816-404b-b754-5c2207ff5ae6,Standard error of the residuals for a non-linear model,added standard error tag for clarification,2013-10-13 09:53:05.250 +186067,57384,306.0,2,,CC BY-SA 3.0,ecc88eb5-300b-4bee-8d65-720dd1a9327a,"Assuming the coefficients to be normally distributed with the mean of 0 and an estimated standard error which you have not mentioned, the p value tells the quantile of how far the calculated value is from the mean. In the given case if you think that the value is significant at 99.73% level, even then the coefficient is different from 0. If the confidence level that you want is higher than this, then you fail to reject the hypothesis that the coefficient is different from 0.",,2013-10-13 12:29:25.737 +186068,57385,22651.0,1,,CC BY-SA 3.0,785e6034-8181-463a-aeae-f3b6db6a4b97,Testing linearity,,2013-10-13 13:09:09.757 +186056,57347,6162.0,5,,CC BY-SA 3.0,0cf1739b-4e5d-4cb6-bcf0-1884bfdf0f63,"There is a natural exact confidence interval for the grandmean in the balanced random one-way ANOVA model $$(y_{ij} \mid \mu_i) \sim_{\text{iid}} {\cal N}(\mu_i, \sigma^2_w), \quad j=1,\ldots,J, +\qquad +\mu_i \sim_{\text{iid}} {\cal N}(\mu, \sigma^2_b), \quad i=1,\ldots,I.$$ +Indeed, it is easy to check that the distribution of the observed means $\bar{y}_{i\bullet}$ is $\bar{y}_{i\bullet} \sim_{\text{iid}} {\cal N}(\mu, \tau^2)$ with $\tau^2=\sigma^2_b+\frac{\sigma^2_w}{J}$, +and it is well known that the between sum of squares $SS_b$ has distribution $$SS_b \sim J\tau^2\chi^2_{I-1}$$ and is independent of the overall observed mean $$\bar y_{\bullet\bullet} \sim {\cal N}(\mu, \frac{\tau^2}{I})$$. 
+Thus $$\frac{\bar y_{\bullet\bullet} - \mu}{\frac{1}{\sqrt{I}}\sqrt{\frac{SS_b}{J(I-1)}}}$$ has a Student $t$ distribution with $I-1$ degrees of freedom, wherefrom it is easy to get an exact confidence interval about $\mu$. + +**Note that this confidence interval is nothing but the classical interval for a Gaussian mean by considering only the group means $\bar{y}_{i\bullet}$ as the observations**. +Thus the simple approach you mention: + +> The simple approach is to first compute the mean of each experiment: +> 38.0, 49.3, and 31.7, and then compute the mean, and its 95% confidence interval, of those three values. Using this method, the +> grand mean is 39.7 with the 95% confidence interval ranging from 17.4 +> to 61.9. + +is right. And your intuition about the ignored variation: + +> The problem with that approach is that it totally ignores the +> variation among triplicates. I wonder if there isn't a good way to +> account for that variation. + +is wrong. I also mention the correctness of such a simplification in [http://stats.stackexchange.com/a/72578/8402][1] + + + [1]: http://stats.stackexchange.com/a/72578/8402",added 2 characters in body,2013-10-13 10:06:57.330 +186057,57377,15827.0,5,,CC BY-SA 3.0,2d99c0a5-59dc-4dcd-a667-8f2816d343bc,"Young Energy operates a power plant. The power plant is a coal-fired boiler that produces steam which in turn drives a generator. The company can buy different types of coal, and then mix them to meet the demands placed on it which is fired in the boiler. The table shows the characteristics of the different types of coal are: + +![enter image description here][1] + + +The requirement to be burned in the pan is: + + - BTU/lb: 11900, + - content of the ashes max 12,2% and + - max moisture 9,4%. + + [1]: https://i.stack.imgur.com/Jg23l.png + +> How should I implement a linear optimization model in this context?",removed unnecessary wording,2013-10-13 10:17:12.280 +186058,57381,8074.0,2,,CC BY-SA 3.0,11563e57-1ec8-44b5-8624-dd75b24b9edf,"I think there are a few options for showing this type of data: + +The first option would be to conduct an ""Empirical Orthogonal Functions Analysis"" (EOF) (also referred to as ""Principal Component Analysis"" (PCA) in non-climate circles). For your case, this should be conducted on a correlation matrix of your data locations. For example, your data matrix `dat` would be your spatial locations in the column dimension, and the measured parameter in the rows; So, your data matrix will consist of time series for each location. The `prcomp()` function will allow you to obtain the principle components, or dominant modes of correlation, relating to this field: + + res <- prcomp(dat, retx = TRUE, center = TRUE, scale = TRUE) # center and scale should be ""TRUE"" for an analysis of dominant correlation modes) + #res$x and res$rotation will contain the PC modes in the temporal and spatial dimension, respectively. + +The second option would be to create maps that show correlation relative to an individual location of interest: + + C <- cor(dat) + #C[,n] would be the correlation values between the nth location (e.g. dat[,n]) and all other locations. + +Hope that helps. 
+",,2013-10-13 10:19:34.250 +186060,57374,0.0,10,,,00410162-1954-4cde-98d7-a60da85911a2,"{""OriginalQuestionIds"":[4756],""Voters"":[{""Id"":805,""DisplayName"":""Glen_b""},{""Id"":601,""DisplayName"":""John""},{""Id"":21054,""DisplayName"":""COOLSerdash""},{""Id"":22047,""DisplayName"":""Nick Cox""},{""Id"":17230,""DisplayName"":""Scortchi""}]}",101,2013-10-13 11:46:42.203 +186063,57382,,2,user10619,CC BY-SA 3.0,987eb6d7-1ade-42d4-b384-d507b299197c,Gross sampling error (MSE) appears to be a composite of two errors sampling and measurement error. How do we assess measurement error ? can we find out net sampling error ? ,,2013-10-13 11:56:43.490 +186061,57382,,1,user10619,CC BY-SA 3.0,987eb6d7-1ade-42d4-b384-d507b299197c,does sampling error include measurement error?,,2013-10-13 11:56:43.490 +186062,57382,,3,user10619,CC BY-SA 3.0,987eb6d7-1ade-42d4-b384-d507b299197c,,,2013-10-13 11:56:43.490 +186064,57378,5001.0,5,,CC BY-SA 3.0,f64aff0b-4368-4d42-be62-c83e954ed432,"I am trying to interpret one of the p-values in a one variable linear regression. Some of the answers I've seen for similar questions were not worded as thoroughly as I would have liked. My interpretation is deliberately verbose because it will aid my understanding if faults are found within it. + +From Microsoft Excel the linear regression formula from 90 samples of (x,y) pairs is + +`y = 0.514x + 0.00087` + +and the p-value of the first coefficient is 4e-16 (scientific notation) and for the second it is 0.0027. + +Would it be correct to say that the interpretation of the p-value of the 0.00087 term is: + +> Under the assumption that the true value of the y-intercept is zero +> and the first coefficient is 0.514, random sampling of the same number +> of (x,y) pairs, specifically 90, would result in a least squares best +> fit line with a y-intercept at least as extreme as 0.00087, with a +> probability of 0.0027. + +If not, then what would be the correct interpretation? + +Not so importantly, but just to be complete, I am also inquiring if it would be more accurate and complete to put the relevant phrase as + +> ""at least as extreme as 0.00087 in the same direction, that is, +> positive"".",added 8 characters in body,2013-10-13 12:10:42.713 +186065,57383,,2,anon,CC BY-SA 3.0,944e979c-a2d2-4836-80bc-a758a9199821,"This depends on what you mean by a genomic location. For each cytoband this would be rather straight forward to do. Roughly: + +1) Get the cytoband locations for all genes. These are stored in the organism specific packages, e.g., org.Dm.eg.db, and are named as 'MAP' . You might need the chiptype specific annotation package to map between the probe identifiers and the genes first. + +2) Once you have the cytoband annotations for the genes, you can then test each cytoband separately with the functionality offered by, e.g., the topGO package. There is a section with the heading 'Predefined list of interesting genes' in the vignette of the topGO package that shortly shows how to do this is a similar case. + +For the smoothing approach you have thought of, it might be worth correcting the counts with the actual number of genes in any predefined window, taking into account that not all genes might be present on the chip. The exact gene locations are available in the organism specific annotation package (the same as above). Some difficulties might arise, since certain locations probably have a gene in both strands, so you just need to decide how to count them. 
+ +The cytoband based approach is available in, e.g., Chipster software (see the manual entry at http://chipster.csc.fi/manual/stat-hyperG-cytoband.html), and the source code for the analysis is available at https://github.com/chipster/chipster/blob/master/src/main/modules/microarray/R-2.12/stat-hyperG-cytoband.R, which might in some details, if you decide to use the cytobands.",,2013-10-13 12:11:47.693 +186070,57385,22651.0,2,,CC BY-SA 3.0,785e6034-8181-463a-aeae-f3b6db6a4b97,"i have data which i need to test if they exihibit a linear relationship so that i can be able to use to make prediction of a response.Kindly assist in how to handle this problem.this is the data:task is to show that there is linear relationship between bricks used and wastes generated. + +Trials 1 2 3 4 5 6 7 8 +No. of Bricks (x) 1400 1800 2100 2400 2700 3000 3500 3800 +Wastage, % (y) 10.31 12.26 13.32 15.65 15.12 18.93 20.72 19.04 +Thanks. +Lawrence",,2013-10-13 13:09:09.757 +186069,57385,22651.0,3,,CC BY-SA 3.0,785e6034-8181-463a-aeae-f3b6db6a4b97,,,2013-10-13 13:09:09.757 +186071,57385,503.0,5,,CC BY-SA 3.0,2cdf7a9d-d9ae-4c84-8982-97f162202ba9,"I have two variables and I need to test if they exhibit a linear relationship so that I will be able to predict a response. Kindly assist in how to handle this problem. This is the data:task is to show that there is linear relationship between bricks used and wastes generated. + + Trials 1 2 3 4 5 6 7 8 + No. Bricks (x) 1400 1800 2100 2400 2700 3000 3500 3800 + Wastage, % (y) 10.31 12.26 13.32 15.65 15.12 18.93 20.72 19.04 + +","fixed spelling, grammar and formatting, removed signature",2013-10-13 13:15:53.497 +186072,57386,503.0,2,,CC BY-SA 3.0,00bed328-e374-47d5-ae01-a9b2e32b0b01,"My first point would be that you do not need to have a linear relationship in order to predict a response. + +Second, if you are trying to predict a response outside the range of the data (i.e. to less than 1400 or more than 3800 bricks) be very cautious. + +To your question: The first thing I would do is make a graph. In `R` this could be done as follows: + + x <- c(1400, 1800, 2100, 2400, 2700, 3000, 3500, 3800) + y <- c(10.31, 12.26, 13.32, 15.65, 15.12, 18.93, 20.72, 19.04) + plot(x,y) + lines(lowess(x,y)) + +The last line adds a loess curve to the data. The relationship appears to be linear at the lower levels, but then flatten at higher levels of bricks. + +I would not rely on any statistical test of linearity. With only 8 points, the deviation from linearity would have to be extreme for it to be significant and a much smaller deviation might be important. ",,2013-10-13 13:23:42.913 +186073,57387,12522.0,2,,CC BY-SA 3.0,e0a5ea15-110a-4837-84dd-c73e83b52dd2,"A possible formulation of this model is as follows: + +The purpose of the optimization problem is to obtain the % of each coal type to mix in order to minimize the cost of the mix without violating any operational constraint. 
+ +$i = $ index for coal type (1 = A, 2 = B, 3 = B, 4 = D) + +$x_{i} =$ % of coal type $i$ to be included in the mix + +$c_{i} =$ cost per pound of coal of type $i$ + +$b_{i} =$ BTUs per pound of coal of type $i$ + +$a_{i} =$ % of ashes of coal of type $i$ + +$m_{i} =$ % of moisture of coal of type $i$ + +Objective Function: Minimize the cost of a pound of the mix + +Min $Z = \sum_{i=1}^{4} c_{i} \cdot x_{i}$ + +Subject to the following contraints: + +- $BTU/lb$ of the mix must be equal to 11,900: + + $\sum_{i=1}^4 b_{i} \cdot x_{i} = 11900$ + +- Content of ashes of the mix must be less than 12.2%: + + $\sum_{i=1}^4 a_{i} \cdot x{i} \leqslant 12.2\%$ + +- The percent of moisture of the mix must be less than 9.4%: + + $\sum_{i=1}^4 m_{i} \cdot x{i} \leqslant 9.4\%$ + +- The percent of each coal in the mix must add up to 100%: + + $\sum_{i=1}^4 x{i} = 100\%$ + +- Non-negativity constraint: + + $x_{i} \geqslant 0, \forall i$ + +You can implement the model in R using the Rglpk package or using the Excel Solver Add-in in MS Excel.",,2013-10-13 13:26:01.097 +186076,57388,22653.0,3,,CC BY-SA 3.0,05212a22-889b-4f40-8eff-fcced24eb259,,,2013-10-13 13:49:31.680 +186075,57388,22653.0,1,,CC BY-SA 3.0,05212a22-889b-4f40-8eff-fcced24eb259,Meta Analysis: Pooling samples or determine an average effect size,,2013-10-13 13:49:31.680 +186074,57388,22653.0,2,,CC BY-SA 3.0,05212a22-889b-4f40-8eff-fcced24eb259,"I am new to meta analysis and how I understood the terminology is that there are actually two ways of performing a meta analysis. Let's consider 5 clinical studies with fixed effects. Fixed effects in terms of the same medical treatment as well as demographic details of the participants. One way of analysing these data would be to pool all 5 studies together to obtain a very large study to increase the power to detect the effect of the medical treatment. The other would be to try to detect the effect in each analysis separately and then determine the average effect across the studies. As I understood meta analysis, both seem to be reasonable techniques. However, can anyone tell me pro's and con's for both techniques? When should I use which method? I would assume the results to be pretty similar anyhow or is that wrong to assume?",,2013-10-13 13:49:31.680 +186077,57389,166.0,2,,CC BY-SA 3.0,facc6fac-f0ed-4d70-a08b-a75b727d474a,"Your interpretation is almost right. + +A correct interpretation requires the following components: + + 1. The interpretation of p-values is in reference to either **a**) [frequentist] the probability of obtaining a value if you ran the same experiment (and statistic) many times over that you would obtain values that were as extreme or more extreme (the 'extreme' language implies a two-tailed test) if the true population value were 0 (or, in some tests that the difference is 0), i.e. if the null hypothesis were true; or **b**) the probability of obtaining that estimate of the parameter (e.g. intercept; using this statistical approach), or a more extreme value, if the population value for that parameter is 0. Your definition correctly uses the frequentist form. + 2. As you can see from point 1 above, the interpretation of p-values in a regression equation are not dependent on assumptions that the rest of the model is correct... just that the same approach was used. However, it does assume that those parameters are estimated. **So, your definition lacks** in that you say 'first coefficient is 0.514'. All you need to assert is that the first parameter is estimated... 
and that overall the model used has the same parameters estimated. The values obtained for the estimate are immaterial to the definition of the p-value for any given parameter. + 3. That we are referring to the value of y when all xs are at the value of zero. You correctly imply this point. + +You should also note that your example, in using the frequentist form, is not free from your wants and subjective beliefs. Specifically, the p-value is tied to the design of the experiment you ran. You acknowledge this when you mention using the same number of sampling pairs. + +In regards to your second question, the typical p-value reported for a regression equation is implicitly two-tailed. So, it refers to the absolute value of the parameters obtained. You didn't provide the Excel function you uses to calaculate the p-value, but I'd check there to see if Excel is calculating one-tailed (in the same direction) or two-tailed (extreme or more extreme) p-values.",,2013-10-13 13:57:06.653 +186078,57378,5001.0,5,,CC BY-SA 3.0,abb10b42-6233-449b-b741-19847b23706f,"I am trying to interpret one of the p-values in a one variable linear regression. Some of the answers I've seen for similar questions were not worded as thoroughly as I would have liked. My interpretation is deliberately verbose because it will aid my understanding if faults are found within it. + +From Microsoft Excel the linear regression formula from 90 samples of (x,y) pairs is + +`y = 0.514x + 0.00087` + +and the p-value of the first coefficient is 4e-16 (scientific notation) and for the second it is 0.0027. + +Would it be correct to say that the interpretation of the p-value of the 0.00087 term is: + +> Under the assumption that the true value of the y-intercept is zero +> and the first coefficient is 0.514, random sampling of the same number +> of (x,y) pairs, specifically 90, would result in a least squares best +> fit line with a y-intercept at least as extreme as 0.00087, with a +> probability of 0.0027. + +If not, then what would be the correct interpretation? + +Not so importantly, but just to be complete, I am also inquiring if it would be more accurate and complete to put the relevant phrase as + +> ""at least as extreme as 0.00087 in the same direction, that is, +> positive"". + +Edit: The Excel funcion is `Tools > Data Analysis > Regression` in Office 2003 with service pack 2.",added 103 characters in body,2013-10-13 14:06:47.563 +186081,57390,21985.0,1,,CC BY-SA 3.0,d29f0bee-c706-4ccd-bc0a-807c66c66f50,"Auto regressive process, maximum likelihood estimator",,2013-10-13 14:45:19.743 +186080,57390,21985.0,2,,CC BY-SA 3.0,d29f0bee-c706-4ccd-bc0a-807c66c66f50,"A first-order autoregressive process, $X_0,\dots,X_n$, is given through the following conditional distributions: +$X_i | X_{i-1},\dots,X_0 \sim \mathcal{N}(\alpha X_{i-1},1)$, +for $i = 1,2,\dots,n$ and $X_0 \sim \mathcal{N}(0,1)$. + +I know that the log-likelihood function $\ell{(\alpha)}$ is of the form: +$\ell(\alpha) = - \frac{1}{2} \sum (x_i - \alpha x_{i-1})^2 + c$. But I don't know how to show that. + +I found for $\hat{\alpha}_{ML}$ the following solution: $\hat{\alpha}_{ML} = \frac{s}{t}, \mathrm{where} \; s = \sum x_1 x_{i-1} \mathrm{and} \; t = \sum (x_{i-1})^2$. Is this right? + +Then I have to show that this is the global maximum. If I take the second derivative I get a constant. Is this the sign that I got the global maximum, because the first derivative is linear wrt to $\alpha$? Right? 
+ +",,2013-10-13 14:45:19.743 +186079,57390,21985.0,3,,CC BY-SA 3.0,d29f0bee-c706-4ccd-bc0a-807c66c66f50,,,2013-10-13 14:45:19.743 +186082,57368,21840.0,6,,CC BY-SA 3.0,ae55474d-0293-4e54-8c5d-634aad7a544e,,edited tags,2013-10-13 15:01:11.003 +186083,57391,2420.0,2,,CC BY-SA 3.0,a2a7b8f0-41ab-4ee7-a584-bb5a6dcbeef9,"I'm a software developer and I'll like to learn about neural networks. At this point I've find a problem which I'll like to solve at some point. It is about electrical load forecasting. I'm looking for similar problems and it will be great if I can find some similar examples with solutions. At this point I'm having troubles in finding the right model for the RNN, and more exactly I'm struggling with the input layer. As the output I need the forecast values for each hour. + +Any reference to books, links resources or advices are welcome and very appreciated. + +This is the problem that I'll like to solve: + +A very small factory, use a number of equipments to produce bread. Some of them are electrical equipments which means that they consume electrical power. Knowing which equipments will run on the next day, an electricity consumptions forecast can be computed. + +The equipment named E.V. is a special case of equipment. The human operator completes it's values in an empirically manner in order to have a good forecast for the next day. Those values can be positive or negative. + + + +----------------------------------------------------+ + |equipment name|power| 1h| 2h| 3h| 4h| 5h| 6h| 7h| 8h| + +----------------------------------------------------+ + |Equipment 1 | 2MW| - | - | on| on| on| - | - | - | + |Equipment 2 | 5MW| - | - | - | on| on| on| - | - | + |Equipment 3 | 1MW| on| on| on| on| on| on| on| on| + |E.V. | | .1|-.1|-.1| .1|-.2| .1| .1|-.1| + +--------------+-------------------------------------+ + |total/(forecast) |1.1| .9|2.9|8.1|7.8|6.1|1.1| .9| + +--------------+-------------------------------------+ + |real consumption | .9| .9|2.7|8.2|7.9|3.1|0.8| .7| + +--------------+-------------------------------------+ + +The problem is that the machines are not running at their maximal power, so it will be great if a more exactly forecast can be build.",,2013-10-13 15:03:39.740 +186085,57391,2420.0,3,,CC BY-SA 3.0,a2a7b8f0-41ab-4ee7-a584-bb5a6dcbeef9,,,2013-10-13 15:03:39.740 +186084,57391,2420.0,1,,CC BY-SA 3.0,a2a7b8f0-41ab-4ee7-a584-bb5a6dcbeef9,How to find the input layer and the architecture for a Neural Network,,2013-10-13 15:03:39.740 +186086,57392,5203.0,2,,CC BY-SA 3.0,5c99a260-8e91-4080-929f-7858de2bf2ef,"Yes, assuming by ""gross sampling error"" you mean mean-squared error or the $\epsilon$ term in a model like $Y=AX + \epsilon$ + +The error component of a model includes *all* sources of variability that are not explicitly included in the model. This includes sampling errors (uncertainty due to measuring only a subset of the population), measurement errors (uncertainty due to imprecisions in each measurement), and other things, like error attributable to a misspecified model (e.g., missing predictors/interactions). + +Keep in mind that these are actually types of errors. For example, there may be measurement error associated with each variable in the model, and that error might be a combination of systematic error (essentially, a bias; e.g., someone forgot that the scale reports the weight of the container + its contents) and random error. Given that, there isn't an automatic, all-purpose way of identifying the various error contributions. 
+ +One way to examine measurement errors is through **calibration**. For example, you could put a weight on the scale and compare the scale's reading to the known mass of the weight. In many cases, the phenomena causing measurement error are reasonably well understood and have a specific structure (e.g., [shot noise][1]), which allows them to be incorporated into the model. Some large-scale physics experiments take this to incredible extremes to compare an apparatus's expected performance to the real data. Surveys are sometimes **benchmarked** by comparing data collected during the survey to larger data sets. For example, you might ask participants for demographic information (e.g., age, gender, income). These values are then compared to known population values (e.g., from a census or tax records), which might tell you how representative your respondents are of the general population. + +Sampling error is much harder to measure directly. You might expect sampling error to shrink as the number of samples approaches the size of the population, whereas a systematic measurement error would remain approximately the same, regardless of sample size. + + [1]: http://en.wikipedia.org/wiki/Shot_noise",,2013-10-13 15:15:38.567 +186087,57388,,25,,,7d664d4c-5795-45eb-8b4e-e46127803609,,http://twitter.com/#!/StackStats/status/389410239004823552,2013-10-13 15:20:10.623 +186088,57381,8074.0,5,,CC BY-SA 3.0,2a35e65a-a0bd-4fc6-b8a9-51d0c9446cb0,"I think there are a few options for showing this type of data: + +The first option would be to conduct an ""Empirical Orthogonal Functions Analysis"" (EOF) (also referred to as ""Principal Component Analysis"" (PCA) in non-climate circles). For your case, this should be conducted on a correlation matrix of your data locations. For example, your data matrix `dat` would be your spatial locations in the column dimension, and the measured parameter in the rows; So, your data matrix will consist of time series for each location. The `prcomp()` function will allow you to obtain the principal components, or dominant modes of correlation, relating to this field: + + res <- prcomp(dat, retx = TRUE, center = TRUE, scale = TRUE) # center and scale should be ""TRUE"" for an analysis of dominant correlation modes) + #res$x and res$rotation will contain the PC modes in the temporal and spatial dimension, respectively. + +The second option would be to create maps that show correlation relative to an individual location of interest: + + C <- cor(dat) + #C[,n] would be the correlation values between the nth location (e.g. dat[,n]) and all other locations. + +Hope that helps. +",corrected spelling,2013-10-13 16:04:24.417 +186089,57393,14850.0,2,,CC BY-SA 3.0,0620371e-2325-4828-b420-9728b9c35820,"It appears that you're using nls to fit a logistic model. + +By typing + + ?summary.nls + +you can read about the output. + +Estimates and standard errors are estimated by the Gauss-Newton algorithm (if the nls defaults are used) + +The P-values are the results of a one sided test of whether the parameters are zero or not. + +You can check the exact calculations used to create the output shown by typing: + + stats:::summary.nls",,2013-10-13 16:17:54.617 +186090,57378,5001.0,5,,CC BY-SA 3.0,2671864b-2426-41f9-a7ae-35f58d368e46,"I am trying to interpret one of the p-values in a one variable linear regression. Some of the answers I've seen for similar questions were not worded as thoroughly as I would have liked. 
My interpretation is deliberately verbose because it will aid my understanding if faults are found within it. + +From Microsoft Excel the linear regression formula from 90 samples of (x,y) pairs is + +`y = 0.514x + 0.00087` + +and the p-value of the first coefficient is 4e-16 (scientific notation) and for the second it is 0.0027. + +Would it be correct to say that the interpretation of the p-value of the 0.00087 term is: + +> Under the assumption that the true value of the y-intercept is zero +> and the first coefficient is 0.514, random sampling of the same number +> of (x,y) pairs, specifically 90, would result in a least squares best +> fit line with a y-intercept at least as extreme as 0.00087, with a +> probability of 0.0027. + +If not, then what would be the correct interpretation? + +Not so importantly, but just to be complete, I am also inquiring if it would be more accurate and complete to put the relevant phrase as + +> ""at least as extreme as 0.00087 in the same direction, that is, +> positive"". + +Edit: The Excel funcion is `Tools > Data Analysis > Regression` in Office 2003 with service pack 2. Excel regression p-values on coefficients are 2 sided.",added 55 characters in body,2013-10-13 16:30:43.817 +186093,57394,16046.0,2,,CC BY-SA 3.0,e7e70389-f554-4886-b5ae-65780f3c30a0,"I am reading an [article][1] which is trying to justify the need for causal inference in their inferential framework. The thought experiment is as follows: + +> Suppose a statistician is asked to design a model for a simple time +> series $X_1,X_2,X_3,...$ and she decides to use a Bayesian method. +> Assume she collects a first observation $X_1 = x_1$. She computes the +> posterior probability density function (pdf) over the parameters +> $\theta$ of the model given the data using Bayes’ rule: $$p(\theta|X_1 += x_1) = \int\frac{p(X_1 = x_1|\theta)p(\theta)}{p(X_1 = x_1|\theta')p(\theta')}, $$ +> +> where $p(X_1 = x_1|θ)$ is the likelihood of $x_1$ given $\theta$ and +> p($\theta$) is the prior pdf of $\theta$. She can use the model to +> predict the next observation by drawing a sample $x_2$ from the +> predictive pdf: $$p(X_2 = x_2|X_1 = x_1) = \int p(X_2 = x_2|X_1 = + x_1,\theta)p(\theta|X_1 = x_1)d\theta,$$ +> +> where $p(X_2 = x_2|X1 = x1,\theta)$ is the likelihood of $x_2$ given +> $x_1$ and $\theta$. Note that $x_2$ is not drawn from $p(X_2 = x_2|X_1 +> = x_1, \theta)$. She understands that the nature of $x_2$ is very different from $x_1$: while $x_1$ is informative and does change the +> belief state of the Bayesian model, $x_2$ is non-informative and thus +> is a reflection of the model’s belief state. Hence, she would never +> use $x_2$ to further condition the Bayesian model. Mathematically, she +> seems to imply that: +> $$p(\theta|X_1 =x_1,X_2 =x_2)=p(\theta|X_1 =x_1)$$ +However I hardly believe that what this poor statistician should imply is: +$$p(\theta|X_1 =x_1,\text{do}(X_2 =x_2))=p(\theta|X_1 =x_1)$$ + +Am I right about this? + [1]: http://www.aaai.org/Papers/JAIR/Vol38/JAIR-3812.pdf",,2013-10-13 16:53:32.577 +186096,57378,5001.0,5,,CC BY-SA 3.0,78375e55-c472-4aaa-bad6-78b3471b1092,"I am trying to interpret one of the p-values in a one variable linear regression. Some of the answers I've seen for similar questions were not worded as thoroughly as I would have liked. My interpretation is deliberately verbose because it will aid my understanding if faults are found within it. 
+ +From Microsoft Excel the linear regression formula from 90 samples of (x,y) pairs is + +`y = 0.514x + 0.00087` + +and the p-value of the first coefficient is 4e-16 (scientific notation) and for the second it is 0.0027. + +Would it be correct to say that the interpretation of the p-value of the 0.00087 term is: + +> Under the assumption that the true value of the y-intercept is zero +> and the first coefficient is 0.514, random sampling of the same number +> of (x,y) pairs, specifically 90, would result in a least squares best +> fit line with a y-intercept at least as extreme as 0.00087, with a +> probability of 0.0027. + +If not, then what would be the correct interpretation? + +Not so importantly, but just to be complete, I am also inquiring if it would be more accurate and complete to put the relevant phrase as + +> ""at least as extreme as 0.00087 in the same direction, that is, +> positive"". + +**Edit**: The Excel funcion is `Tools > Data Analysis > Regression` in Office 2003 with service pack 2. Excel regression p-values on coefficients are 2 sided. + +**Edit**: Differentiation from this question [here][1]. The most up voted answer discusses the p-value of a hypothesis. I am not interested in that. I am interested in the p-value of a coefficient that is not the coefficient of an independent variable. + + + [1]: http://stats.stackexchange.com/questions/31/what-is-the-meaning-of-p-values-and-t-values-in-statistical-tests",added 377 characters in body,2013-10-13 16:57:56.673 +186097,57394,16046.0,5,,CC BY-SA 3.0,84dc775e-bf28-4c92-b58d-1bae19fffce8,"I am reading an [article][1] which is trying to justify the need for causal inference in their inferential framework. The thought experiment is as follows: + +> Suppose a statistician is asked to design a model for a simple time +> series $X_1,X_2,X_3,...$ and she decides to use a Bayesian method. +> Assume she collects a first observation $X_1 = x_1$. She computes the +> posterior probability density function (pdf) over the parameters +> $\theta$ of the model given the data using Bayes’ rule: $$p(\theta|X_1 += x_1) = \int\frac{p(X_1 = x_1|\theta)p(\theta)}{p(X_1 = x_1|\theta')p(\theta')}, $$ +> +> where $p(X_1 = x_1|θ)$ is the likelihood of $x_1$ given $\theta$ and +> p($\theta$) is the prior pdf of $\theta$. She can use the model to +> predict the next observation by drawing a sample $x_2$ from the +> predictive pdf: $$p(X_2 = x_2|X_1 = x_1) = \int p(X_2 = x_2|X_1 = + x_1,\theta)p(\theta|X_1 = x_1)d\theta,$$ +> +> where $p(X_2 = x_2|X1 = x1,\theta)$ is the likelihood of $x_2$ given +> $x_1$ and $\theta$. Note that $x_2$ is not drawn from $p(X_2 = x_2|X_1 +> = x_1, \theta)$. She understands that the nature of $x_2$ is very different from $x_1$: while $x_1$ is informative and does change the +> belief state of the Bayesian model, $x_2$ is non-informative and thus +> is a reflection of the model’s belief state. Hence, she would never +> use $x_2$ to further condition the Bayesian model. Mathematically, she +> seems to imply that: +> $$p(\theta|X_1 =x_1,X_2 =x_2)=p(\theta|X_1 =x_1)$$ + +However I hardly believe that what this poor statistician should imply is: +$$p(\theta|X_1 =x_1,\text{do}(X_2 =x_2))=p(\theta|X_1 =x_1)$$ + +Am I right about this? 
+ [1]: http://www.aaai.org/Papers/JAIR/Vol38/JAIR-3812.pdf",added 2 characters in body,2013-10-13 16:59:51.637 +186098,57395,668.0,2,,CC BY-SA 3.0,9114b16b-0fb1-4360-87d0-d5e7dd2fb31f,"**Such a distribution does not exist.** + +To see why not, let $0 \lt t \lt 1/2$ and notice that $X_2\gt 1-t$ entails $X_1\le t$ and $X_3\gt 1-t$ also implies $X_1\le t$, for otherwise in either situation the sum of all the $X_i$ would exceed $1.$ The latter two events are disjoint, because we cannot simultaneously have $X_2\gt 1-t \gt 1/2$ and $X_3\gt 1-t\gt 1/2.$ Consequently the chance that $X_1\le t$ is no less than the sum of the chances that $X_2\ge 1-t$ and $X_3\ge 1-t$, each of which equals $t$ by the uniform distribution assumptions. This shows that $t \ge t+t,$ which for $t\gt 0$ obviously is false. + +This contradiction forces us to give up at least one of the assumptions: if indeed $X_1+X_2+X_3\le 1$, then the only other assumptions used in this argument are that each $X_i$ has a Uniform$[0,1]$ distribution. Therefore *at least one of the $X_i$ cannot have a Uniform$[0,1]$ distribution,* QED.",,2013-10-13 17:06:36.577 +186109,57396,15827.0,5,,CC BY-SA 3.0,5b5be56c-e160-4bd5-a1b4-03226ab23dfa,"I have a large population of size $n$ from an unknown continuous random variable $X$, and I do not know the underlying distribution of $X$. Given a constant number $c$, I want to determine the minimum sample size I need to estimate the probability $P(X \le c)$ given a confidence level, $p_c$, and confidence interval, $I_c$ (I am not sure if we need them! ). How can I find the minimum sample size to estimate this probability? Thanks for your help.",more use of TeX ,2013-10-13 18:10:21.797 +186112,57398,22658.0,2,,CC BY-SA 3.0,7200c158-80c3-4841-ba53-c6fa7ab5a8d8,"There are 2 competing statistical models. Model #1 (null hypothesis, McNemar): probability correct to incorrect = probability of incorrect to correct = 0.5 or equivalent b=c. Model #2: probability correct to incorrect < probability of incorrect to correct or equivalent b > c. For model #2 we use maximum likelihood method and logistic regression to determine model parameters representing model 2. Statistical methods look different because each method reflects a different model. +",,2013-10-13 18:26:48.740 +186115,57399,6813.0,3,,CC BY-SA 3.0,9fb279df-c7cf-483c-a537-7a1203d9c279,,,2013-10-13 18:37:21.500 +186099,57378,5001.0,5,,CC BY-SA 3.0,45bc7efb-6cb5-4ae4-9a86-7cc437114f2d,"I am trying to interpret one of the p-values in a one variable linear regression. Some of the answers I've seen for similar questions were not worded as thoroughly as I would have liked. My interpretation is deliberately verbose because it will aid my understanding if faults are found within it. + +From Microsoft Excel the linear regression formula from 90 samples of (x,y) pairs is + +`y = 0.514x + 0.00087` + +and the p-value of the first coefficient is 4e-16 (scientific notation) and for the second it is 0.0027. + +Would it be correct to say that the interpretation of the p-value of the 0.00087 term is: + +> Under the assumption that the true value of the y-intercept is zero +> and the first coefficient is 0.514, random sampling of the same number +> of (x,y) pairs, specifically 90, would result in a least squares best +> fit line with a y-intercept at least as extreme as 0.00087, with a +> probability of 0.0027. + +If not, then what would be the correct interpretation? 
+ +Not so importantly, but just to be complete, I am also inquiring if it would be more accurate and complete to put the relevant phrase as + +> ""at least as extreme as 0.00087 in the same direction, that is, +> positive"". + +**Edit**: The Excel funcion is `Tools > Data Analysis > Regression` in Office 2003 with service pack 2. Excel regression p-values on coefficients are 2 sided. + +**Edit**: Regarding differentiation from this question [here][1]: The most up voted answer there discusses the p-value of a hypothesis, which seems ill defined or at least not specific. I am not interested in that. I am interested in the p-value of a coefficient that is not the coefficient of an independent variable. I am being very specific. + + + [1]: http://stats.stackexchange.com/questions/31/what-is-the-meaning-of-p-values-and-t-values-in-statistical-tests",added 15 characters in body,2013-10-13 17:07:28.077 +186102,57396,22656.0,2,,CC BY-SA 3.0,bd064baa-526d-4455-a7f3-db5d754efb34,"I have a large population of size n, but I do not know the underlying distribution of the population. I want to determine the minimum sample size I need to estimated the CDF given a confidence level and confidence interval (I am not sure if we need them! ). How can I find the minimum sample size? Thanks for your help.",,2013-10-13 17:09:46.403 +186101,57396,22656.0,1,,CC BY-SA 3.0,bd064baa-526d-4455-a7f3-db5d754efb34,Minimum Sample Size Required to Estimate CDF of a Given Population (Given a Confidence Level & Confidence Interval),,2013-10-13 17:09:46.403 +186100,57396,22656.0,3,,CC BY-SA 3.0,bd064baa-526d-4455-a7f3-db5d754efb34,,,2013-10-13 17:09:46.403 +186103,57394,16046.0,5,,CC BY-SA 3.0,248ea3c2-28f5-46e0-b8e6-c08ddc47c9bd,"I am reading an [article][1] which is trying to justify the need for causal inference in their inferential framework. The thought experiment is as follows: + +> Suppose a statistician is asked to design a model for a simple time +> series $X_1,X_2,X_3,...$ and she decides to use a Bayesian method. +> Assume she collects a first observation $X_1 = x_1$. She computes the +> posterior probability density function (pdf) over the parameters +> $\theta$ of the model given the data using Bayes’ rule: $$p(\theta|X_1 += x_1) = \int\frac{p(X_1 = x_1|\theta)p(\theta)}{p(X_1 = x_1|\theta')p(\theta')}, $$ +> +> where $p(X_1 = x_1|θ)$ is the likelihood of $x_1$ given $\theta$ and +> p($\theta$) is the prior pdf of $\theta$. She can use the model to +> predict the next observation by drawing a sample $x_2$ from the +> predictive pdf: $$p(X_2 = x_2|X_1 = x_1) = \int p(X_2 = x_2|X_1 = + x_1,\theta)p(\theta|X_1 = x_1)d\theta,$$ +> +> where $p(X_2 = x_2|X1 = x1,\theta)$ is the likelihood of $x_2$ given +> $x_1$ and $\theta$. Note that $x_2$ is not drawn from $p(X_2 = x_2|X_1 +> = x_1, \theta)$. She understands that the nature of $x_2$ is very different from $x_1$: while $x_1$ is informative and does change the +> belief state of the Bayesian model, $x_2$ is non-informative and thus +> is a reflection of the model’s belief state. Hence, she would never +> use $x_2$ to further condition the Bayesian model. Mathematically, she +> seems to imply that: +> $$p(\theta|X_1 =x_1,X_2 =x_2)=p(\theta|X_1 =x_1)$$ + +However I hardly believe that what this poor statistician should imply is: +$$p(\theta|X_1 =x_1,\text{do}(X_2 =x_2))=p(\theta|X_1 =x_1)$$ +Where ""do(or set)"" here comes from [Pearl][2]'s framework of causality which can be found [here][3] and [here][4]. +Now am I right about this? 
+ + + [1]: http://www.aaai.org/Papers/JAIR/Vol38/JAIR-3812.pdf + [2]: http://bayes.cs.ucla.edu/jp_home.html + [3]: ftp://ftp.cs.ucla.edu/pub/stat_ser/r350.pdf + [4]: http://www.amazon.com/Causality-Reasoning-Inference-Judea-Pearl/dp/052189560X/ref=dp_ob_title_bk",added 325 characters in body,2013-10-13 17:15:58.790 +186104,57397,20473.0,2,,CC BY-SA 3.0,1cb3c237-05c4-4a2c-9402-57003ff3dfee,"I assume that you are using the OLS estimator on this linear regression model. You can use the _**inequality constrained least-squares estimator**_, which will be the solution to a minimization problem under inequality constraints. Using standard matrix notation (vectors are column vectors) the minimization problem is stated as + +$$\min_{\beta} (\mathbf y-\mathbf X\beta)'(\mathbf y-\mathbf X\beta) \\s.t.-\mathbf Z\beta \le \mathbf 0 $$ + +...where $\mathbf y$ is $n \times 1$ , $\mathbf X$ is $n\times k$, $\beta$ is $k\times 1$ and $\mathbf Z$ is the $m \times k$ matrix containing the out-of-sample regressor series of length $m$ that are used for prediction. We have $m$ linear inequality constraints (and the objective function is convex, so the first order conditions are sufficient for a minimum). + +The Lagrangean of this problem is + +$$L = (\mathbf y-\mathbf X\beta)'(\mathbf y-\mathbf X\beta) -\lambda'\mathbf Z\beta = \mathbf y'\mathbf y-\mathbf y'\mathbf X\beta - \beta'\mathbf X'\mathbf y+ \beta'\mathbf X'\mathbf X\beta-\lambda'\mathbf Z\beta$$ + +$$= \mathbf y'\mathbf y - 2\beta'\mathbf X'\mathbf y+ \beta'\mathbf X'\mathbf X\beta-\lambda'\mathbf Z\beta $$ + +where $\lambda$ is a $m \times 1$ column vector of non-negative Karush -Kuhn -Tucker multipliers. The first order conditions are (you may want to review rules for matrix and vector differentiation) + +$$\frac {\partial L}{\partial \beta}= \mathbb 0\Rightarrow - 2\mathbf X'\mathbf y +2\mathbf X'\mathbf X\beta - \mathbf Z'\lambda $$ + +$$\Rightarrow \hat \beta_R = \left(\mathbf X'\mathbf X\right)^{-1}\mathbf X'\mathbf y + \frac 12\left(\mathbf X'\mathbf X\right)^{-1}\mathbf Z'\lambda = \hat \beta_{OLS}+ \left(\mathbf X'\mathbf X\right)^{-1}\mathbf Z'\xi \qquad [1]$$ + +...where $\xi = \frac 12 \lambda$, for convenience, and $\hat \beta_{OLS}$ is the estimator we would obtain from ordinary least squares estimation. + +The method is fully elaborated in [Liew (1976)][1]. + + + + [1]: http://www.jstor.org/stable/2285614",,2013-10-13 17:17:58.327 +186105,57393,14850.0,5,,CC BY-SA 3.0,5ab41cf5-b791-4eb3-be5a-341b42604931,"It appears that you're using nls to fit a logistic model. + +By typing + + ?summary.nls + +you can read about the output. + +Estimates and standard errors are estimated by the Gauss-Newton algorithm (if the nls defaults are used) + +The P-values are the results of a two sided test of whether the parameters are zero or not. + +You can check the exact calculations used to create the output shown by typing: + + stats:::summary.nls",edited body,2013-10-13 17:22:26.553 +186106,57396,22656.0,5,,CC BY-SA 3.0,e4b867a3-e4c4-4963-a887-773954408d84,"I have a large population of size n from an unknown continuous random variable X, and I do not know the underlying distribution of X. I want to determine the minimum sample size I need to estimate the *CDF* of X given a confidence level, $p_c$, and confidence interval, $I_c$ (I am not sure if we need them! ). More precisely, given a constant number *c*, I want to estimate the probability $P(X<=c)$. How can I find the minimum sample size to estimate this probability? 
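A minimal sketch of one common way to make this concrete, treating $P(X \le c)$ as a proportion estimated by the empirical CDF at $c$ and reading $I_c$ as the half-width of a normal-approximation confidence interval; the conservative bound $p(1-p) \le 1/4$ avoids needing a guess for the unknown proportion, and the R function name below is made up:

    sample_size <- function(conf = 0.95, half_width = 0.05) {
      z <- qnorm(1 - (1 - conf) / 2)        # e.g. 1.96 for 95% confidence
      ceiling(z^2 * 0.25 / half_width^2)    # worst case p(1 - p) = 1/4
    }
    sample_size(0.95, 0.05)                 # about 385 observations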
Thanks for your help.",added 176 characters in body,2013-10-13 17:26:00.743 +186108,57396,22656.0,4,,CC BY-SA 3.0,e5419645-6087-4a41-a426-810f1016097c,Minimum Sample Size Required to Estimate the Probability $P(X<=c)$ for a Constant c (Given a Confidence Level & Confidence Interval),added 176 characters in body,2013-10-13 17:31:26.440 +186107,57396,22656.0,5,,CC BY-SA 3.0,e5419645-6087-4a41-a426-810f1016097c,"I have a large population of size *n* from an unknown continuous random variable X, and I do not know the underlying distribution of X. Given a constant number *c*, I want to determine the minimum sample size I need to estimate the probability $P(X<=c)$ given a confidence level, $p_c$, and confidence interval, $I_c$ (I am not sure if we need them! ). How can I find the minimum sample size to estimate this probability? Thanks for your help.",added 176 characters in body,2013-10-13 17:31:26.440 +186110,57396,15827.0,4,,CC BY-SA 3.0,5b5be56c-e160-4bd5-a1b4-03226ab23dfa,Minimum Sample Size Required to Estimate the Probability $P(X \le c)$ for a Constant $c$ (Given a Confidence Level & Confidence Interval),more use of TeX ,2013-10-13 18:10:21.797 +186113,57399,6813.0,2,,CC BY-SA 3.0,9fb279df-c7cf-483c-a537-7a1203d9c279,"Recently, I was wondering about calculating the probability of a given individual in a given population ""knowing"" (let's say, present in individual's friend set) at least one person with a given trait A and at least one person with another given trait, B; where it is possible that one person in the given individual's friend set can possess both traits. + +For example, using genetic traits, in a given population, how could one calculate the probability that a given individual in a given population ""knows"" at least one person with grey eye colour and at least one person who is greater than 200cm tall; where, naturally, it is possible that any individual can possess grey eye colour *and* be greater than 200cm tall. + +I have developed a sort of model, but it may not be correctly specified; it is as follows: + +**Assumptions and Qualifications:** + + - First of all, for simplicity, let's assume that we define ""knowing"" as mutually connected friends on a social network. + - Secondly, the frequencies of genetic traits are (a) going to be determined by ethnicity of a given population as well as environmental factors (nutrition, healthcare) and (b) unlikely to be independently distributed across a given individual's friend set (e.g. family members will have greater genetic similarity); however, for this problem, let's adopt a simple model where both of the above conditions are violated. + - Thirdly, assume that the individual's friend set provides a microcosmic representation of society; this facilitates a frequentist generation of probabilities from the instance rate in the population. + - Finally, instances of genetic traits in the population are fabricated, but are used to generate probabilities for the examples. + +**Model Formulation:** + +I have reasoned that a binomial random distribution can be applied to the probability of each genetic trait, where a ""success"" is defined as an individual in the friend set possessing that genetic trait. 
+ +Thus for trait A, we have: + +$$P(A=k) = \binom{N}{k}p_{a}^{k}(1-p_{a})^{N-k}$$ + +and, for trait B: + +$$P(B=k) = \binom{N}{k}p_{b}^{k}(1-p_{b})^{N-k}$$ + +where $N$ is number of friends in the friend set, $k$ is number of people containing the genetic trait and $p_{x}$ is the probability associated with possessing genetic trait $X$. + +Because the scenario is concerned with the probability of *at least one* person containing trait A and trait B, it is easier to find the complement of no people in a friend set containing the trait; for both traits: + +$$P(A >= 1) = 1 - P(A = 0)$$ + +and + +$$P(B >= 1) = 1 - P(B = 0)$$ + +Furthermore, we know that probability of an intersection of events is given by: + +$$\mathbb{P}(A \cap B) = \mathbb{P}(A|B)\mathbb{P}(B) = \mathbb{P}(B|A)\mathbb{P}(A)$$ + +However, because we are assuming independence between genetic traits, $\mathbb{P}(A)$ and $\mathbb{P}(B)$ are independent, thus: + +$$\mathbb{P}(A \cap B) = \mathbb{P}(A)\mathbb{P}(B)$$ + +Combining the information above, we get the following model for the above scenario: + +$$\mathbb{P}(A >=1, B >=1) = \left(1-\binom{N}{k}p_{a}^{k}(1-p_{a})^{N-k}\right)\left(1-\binom{N}{k}p_{b}^{k}(1-p_{b})^{N-k}\right)$$ + +So, for the original example above, assuming a friend set of size $N = 300$, the instance of trait A in the population is $\frac{1}{800}$ and the instance of trait B in the population is $\frac{1}{5000}$; according to the model, we get the final probability: + +$$\mathbb{P}(A >=1, B >=1)$$ + +$$ = \left(1-\binom{300}{0}\left(\frac{1}{800}\right)^{0}\left(\frac{799}{800}\right)^{300}\right)\left(1-\binom{300}{0}\left(\frac{1}{5000}\right)^{0}\left(\frac{4999}{5000}\right)^{300}\right)$$ + +$$\approx 0.018 = 1.8\%$$ + +Does this model seem reasonable given the assumptions? + +Assuming this model is *not* correctly specified, maybe somebody could provide a more accurate representation.",,2013-10-13 18:37:21.500 +186116,57400,7155.0,2,,CC BY-SA 3.0,d82bfde8-16f1-49b4-b727-3ee57af1bc65,"Define neural network to be $f$, time-series to be $x$, lag order to be $n$ and forecast horizon to be $h$. + +$ f(x_{t-1}, x_{t-2},..,x_{t-n}) = [x_t, x_{t+1},..,x_{t+h}]$ + +Assume you have the following time series, + + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + +You define $n=2$, $h=1$. + +Your inputs for that time-series are circulant matrix like. + +x = + + [[ 1, 0], + [ 2, 1], + [ 3, 2], + [ 4, 3], + [ 5, 4], + [ 6, 5], + [ 7, 6], + [ 8, 7]] + +Your outputs are + +y = + + [2, 3, 4, 5, 6, 7, 8, 9] + +So the length of your input layer is given by $n$, the length of your output layer is given by $h$, where your first input neuron is $x_{t-1}$ and your last input in $x_{t-n}$. Same goes for the forecast horizon. + +Instead of having multiple outputs for the forecast horizon, you can use a forecast horizon of 1 then recurse on the predictions to obtain any forecast horizon you want. + +For classic parametric stationary time series models the limit of the recursive behaviour of the system is well-studied. + +Your problem is a little more involved though. You have inputs and outputs of the system and you want the predict outputs to follow some reference trajectory. + +One solution is to use Narma-L2, which approximates the system by linear feedback using two neural networks. Define control inputs to be $c$ and production outputs to be $p$. 
Define reference production outputs to be $r$ + +You train two neural networks of the forms $g(c_{t-1}, .., c_{t-n}, p_{t-1},..,p_{t-n}) = c_{t}$ and $k(c_{t-1}, .., c_{t-n}, p_{t-1},..,p_{t-n}) = p_{t}$. + +The prediction for control inputs is then $c_t = \frac{r_t - k(c_{t-1}, .., c_{t-n}, p_{t-1},..,p_{t-n})}{g(c_{t-1}, .., c_{t-n}, p_{t-1},..,p_{t-n})}$ + +Also, neural networks are a PITA. There's plenty of good nonparametric regression models that are easier to train, like Gaussian Process Regression for instance. + +See: [Neural Network NARMA Control of a Gyroscopic +Inverted Pendulum][1] + + + [1]: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.165.4988&rep=rep1&type=pdf",,2013-10-13 18:42:03.260 +186117,57400,7155.0,5,,CC BY-SA 3.0,fccab00c-9736-4b45-b698-3d47295079a1,"Define neural network to be $f$, time-series to be $x$, lag order to be $n$ and forecast horizon to be $h$. + +$ f(x_{t-1}, x_{t-2},..,x_{t-n}) = [x_t, x_{t+1},..,x_{t+h}]$ + +Assume you have the following time series, + + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + +You define $n=2$, $h=1$. + +Your inputs for that time-series are circulant matrix like. + +x = + + [[ 1, 0], + [ 2, 1], + [ 3, 2], + [ 4, 3], + [ 5, 4], + [ 6, 5], + [ 7, 6], + [ 8, 7]] + +Your outputs are + +y = + + [2, 3, 4, 5, 6, 7, 8, 9] + +So the length of your input layer is given by $n$, the length of your output layer is given by $h$, where your first input neuron is $x_{t-1}$ and your last input in $x_{t-n}$. Same goes for the forecast horizon. + +Instead of having multiple outputs for the forecast horizon, you can use a forecast horizon of 1 then recurse on the predictions to obtain any forecast horizon you want. + +For classic parametric stationary time series models the limit of the recursive behaviour of the system is well-studied. + +Your problem is a little more involved though. You have inputs and outputs of the system and you want the predict outputs to follow some reference trajectory. + +One solution is to use Narma-L2, which approximates the system by linear feedback using two neural networks. Define control inputs to be $c$ and production outputs to be $p$. Define reference production outputs to be $r$ + +You train two neural networks of the forms $g(c_{t-1}, .., c_{t-n}, p_{t-1},..,p_{t-n}) = c_{t}$ and $k(c_{t-1}, .., c_{t-n}, p_{t-1},..,p_{t-n}) = p_{t}$. + +The prediction for control inputs is then $c_t = \frac{r - k(c_{t-1}, .., c_{t-n}, p_{t-1},..,p_{t-n})}{g(c_{t-1}, .., c_{t-n}, p_{t-1},..,p_{t-n})}$ + +Also, neural networks are a PITA. There's plenty of good nonparametric regression models that are easier to train, like Gaussian Process Regression for instance. + +See: [Neural Network NARMA Control of a Gyroscopic +Inverted Pendulum][1] + + + [1]: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.165.4988&rep=rep1&type=pdf",deleted 2 characters in body,2013-10-13 18:48:04.373 +186120,57401,15377.0,3,,CC BY-SA 3.0,fb0e59dc-b1c0-4cef-bf5e-b1c54818d119,,,2013-10-13 19:40:09.530 +186119,57401,15377.0,1,,CC BY-SA 3.0,fb0e59dc-b1c0-4cef-bf5e-b1c54818d119,A question on continuous random variable,,2013-10-13 19:40:09.530 +186118,57401,15377.0,2,,CC BY-SA 3.0,fb0e59dc-b1c0-4cef-bf5e-b1c54818d119,"Let say, I have 2 continuous random variables X1 & X2. Both have same location parameters. Other parameters may be same or may not. + +Now say, the q1-th quantile of X1 is less than the q1-th quantile of x2. But the q2-th quantile of x1 is more than the q2th quantile of x2. + +My question is, is that possible? 
Is there any example of x1 & x2 which have that property? + +I will be really grateful if someone can give me some pointer. + +Best regards,",,2013-10-13 19:40:09.530 +186121,57167,21952.0,5,,CC BY-SA 3.0,1c8b7396-c86d-48f3-8bc1-1da43e0f1665,"I have the following relationships + +logY ~ logX1 + logX2 + logX3 + logX4 + logX5 + +and + +X1 ~ Z1 + Z2 + Z3 + Z4 + Z5 + +X2 ~ Z1 + Z2 + Z3 + Z4 + Z5 + +X3 ~ Z1 + Z2 + Z3 + Z4 + Z5 + + + +where Y and Z1, Z2, Z3, Z4, Z5 are endogenous (Say while the Z's play a role in determining Y, the values of Z's are fixed depending upont he values of Y - Kind of like advertising expense has an impact on sales revenue but at the same time managers determine the advertisement expense on the expected sales revenue). So all the variable are changing simultaneously. Can anyone help me on how I can estimate this relationship? I also have instruments for each of the Z's (lagged variables have been treated as instruments and I have the previous year data for the problem as well. Thank you for all your help and suggestions. +",added 20 characters in body,2013-10-13 19:43:04.407 +186122,57393,14850.0,5,,CC BY-SA 3.0,57789e06-2c0a-4cac-ac4c-67bde15ff2dd,"It appears that you're using nls. + +By typing + + ?summary.nls + +you can read about the output. + +Estimates and standard errors are estimated by the Gauss-Newton algorithm (if the nls defaults are used) + +The P-values are the results of a two sided test of whether the parameters are zero or not. + +You can check the exact calculations used to create the output shown by typing: + + stats:::summary.nls",deleted 24 characters in body,2013-10-13 20:02:59.907 +186123,57402,7229.0,2,,CC BY-SA 3.0,0ade767c-13eb-4c64-9951-623b65a1a1b3,"I don't see clearly behind the lines but it seems to me that there are too much data points. + +Since you want to show the regional homogeneity and not exactly stations, I'd suggest you firstly to group them spatially. For example, overlay by a ""fishnet"" and compute average measured value in every cell (at every time moment). If you place these average values in the cell centers this way you rasterize the data (or you can compute also mean latitude and longitude in every cell if you don't want overlaying lines). Or to average inside administrative units, whatever. Then for these new averaged ""stations"" you can calculate correlations and plot a map with smaller number of lines. + +![enter image description here][1] + +This can also remove those random single high-correlation lines going through all area. + + + [1]: https://i.stack.imgur.com/EnuPm.png",,2013-10-13 20:29:43.190 +186124,57403,20062.0,2,,CC BY-SA 3.0,f8fe78fc-d97c-41c0-b5f8-2daefa063682,"**Mantel's test widely occur in biological studies** in which is used to +examine the correlation between spatial distribution of animals (position in space) with f.e. their genetic relatedness, rate of aggression or some other attribute. 
Plenty of good journals are using it (*PNAS, Animal Behaviour, Molecular Ecology...*) + +Recently I fabricated some patterns which may occur in nature, but Mantel's test **seems to be quite useless.** + +**Example data:** *(compressed as possible)* + + r.gen<-seq(-100,100,5) + r.val<-sample(r.gen, 289, replace=TRUE) + z10<-rep(0, times=10) + z11<-rep(0, times=11) + r5<-c(5,15,25,15,5) + r71<-c(5,20,40,50,40,20,5) + r72<-c(15,40,60,75,60,40,15) + r73<-c(25,50,75,100,75,50,25) + rbPal <- colorRampPalette(c(""blue"",""red"")) + my.data<-data.frame(x = rep(1:17, times=17),y = rep(1:17, each=17), + c1 = c(rep(0,times=155),r5,z11,r71,z10,r72,z10,r73,z10,r72,z10,r71, + z11,r5,rep(0, times=27)),c2 = c(rep(0,times=19),r5,z11,r71,z10,r72, + z10,r73,z10,r72,z10,r71,z11,r5,rep(0, times=29),r5,z11,r71,z10,r72, + z10,r73,z10,r72,z10,r71,z11,r5,rep(0, times=27)),c3 = c(seq(20,100,5), + seq(15,95,5),seq(10,90,5),seq(5,85,5),seq(0,80,5),seq(-5,75,5), + seq(-10,70,5),seq(-15,65,5),seq(-20,60,5),seq(-25,55,5),seq(-30,50,5), + seq(-35,45,5),seq(-40,40,5),seq(-45,35,5),seq(-50,30,5),seq(-55,25,5), + seq(-60,20,5)),c4 = rep(c(0,100), length=289),c5 = sample(r.gen, 289, + replace=TRUE)) + + # adding colors + my.data$Col1 <- rbPal(10)[as.numeric(cut(my.data$c1,breaks = 10))] + my.data$Col2 <- rbPal(10)[as.numeric(cut(my.data$c2,breaks = 10))] + my.data$Col3 <- rbPal(10)[as.numeric(cut(my.data$c3,breaks = 10))] + my.data$Col4 <- rbPal(10)[as.numeric(cut(my.data$c4,breaks = 10))] + my.data$Col5 <- rbPal(10)[as.numeric(cut(my.data$c5,breaks = 10))] + +Creating matrix of geographical distances (for Moran's I is inversed): + + point.dists <- dist(cbind(my.data$x,my.data$y)) + point.dists.inv <- 1/point.dists + point.dists.inv<-as.matrix(point.dists.inv) + diag(point.dists.inv) <- 0 + +Plot creation: + + X11(width=12, height=6) + par(mfrow=c(2,5)) + par(mar=c(1,1,1,1)) + + + library(ape) + for (i in 3:7) { + my.res<-mantel.test(as.matrix(dist(my.data[ ,i])), as.matrix(point.dists)) + plot(my.data$x,my.data$y,pch=20,col=my.data[ ,c(i+5)], cex=2.5, xlab="""", + ylab="""", xaxt=""n"", yaxt=""n"", ylim=c(-4.5,17)) + text(4.5, -2.25, paste(""Mantel's test"", ""\n z.stat ="", round(my.res$z.stat, + 2), ""\n p.value ="", round(my.res$p, 3))) + my.res<-Moran.I(my.data[ ,i], point.dists.inv) + text(12.5, -2.25, paste(""Moran's I"", ""\n observed ="", round(my.res$observed, + 3), ""\n expected ="",round(my.res$expected,3), ""\n std.dev ="", + round(my.res$sd,3), ""\n p.value ="", round(my.res$p.value, 3))) + } + + par(mar=c(5,4,4,2)+0.1) + + for (i in 3:7) { + plot(dist(my.data[ ,i]), point.dists,pch = 20, xlab=""geographical distance"", + ylab=""behavioural distance"") + } + +***Imaginary situation:*** Suppose you have orchard (17 x 17 trees) and on each tree a crow is sitting. Levels of ""noise"" for each crow are given and you are searching for pattern in distribution. + +***There are 5 possibilities:*** + + 1. **""Birds of a feather flock together.""** The more similar crows are, the + smaller is geographical distance between them **(single cluster)**. + + 2. **""Birds of a feather flock together.""** Also the more similar crows + are, the smaller is geographical distance between them **(multiple + clusters)** but, one cluster of noisy crows has no knowledge about the + existence of second cluster (oterwise they would form one big + cluster). + + 3. **""Monotonical trend occur""**. + + 4. **""Opposites attract each other""**. Similar crows can not withstand each + other. + + 5. 
**""Random pattern""** - the level of noise has no significant effect on + spatial distribution. + +Under each case are given plot of points from which the Mantel test computes correlation (it is no surprise that its results are non-significant, I would never try to find linear association among such patterns of points). + +![enter image description here][1] + + + [1]: https://i.stack.imgur.com/TWQqa.png + +**Why scientists do not use Moran's I instead? Is there some hidden reason I do not see? +And if there is such reason,how can I know (how different the hypotheses must be constructed) to appropriately use Mantel's or Moran's I test? A real life example will be helpful.** + + +P.S. if you google each tests, in these examples are both used on the exact same data +and exact the same hypothesis...**not very helpful** + +Mantel test: http://www.ats.ucla.edu/stat/r/faq/mantel_test.htm + +Moran's I: http://www.ats.ucla.edu/stat/r/faq/morans_i.htm + + +",,2013-10-13 20:35:21.350 +186126,57403,20062.0,3,,CC BY-SA 3.0,f8fe78fc-d97c-41c0-b5f8-2daefa063682,,,2013-10-13 20:35:21.350 +186125,57403,20062.0,1,,CC BY-SA 3.0,f8fe78fc-d97c-41c0-b5f8-2daefa063682,What is the point in using Mantel's test instead of Moran's I?,,2013-10-13 20:35:21.350 +186129,57404,19547.0,3,,CC BY-SA 3.0,3ed8d6ba-09d0-4f22-968d-42ba1d8902f5,,,2013-10-13 20:50:28.997 +186127,57404,19547.0,2,,CC BY-SA 3.0,3ed8d6ba-09d0-4f22-968d-42ba1d8902f5,I was wondering if it is possible to apply the method of hypothesis testing to real life. For example if someone can use it for decision making. I have always used this method for homework problems but maybe we can use this method as an aid in decision making. Therefore we could somehow know for example the probability of rejecting wrongly an alternative decision. What do you think on that? And it would be nice if someone can give an example if he thinks that this can be done.,,2013-10-13 20:50:28.997 +186128,57404,19547.0,1,,CC BY-SA 3.0,3ed8d6ba-09d0-4f22-968d-42ba1d8902f5,Hypothesis Testing applied to real life,,2013-10-13 20:50:28.997 +186132,57405,19750.0,3,,CC BY-SA 3.0,3c4db1a9-6e40-4fdf-a292-1a88b29339b1,,,2013-10-13 21:00:08.203 +186131,57405,19750.0,1,,CC BY-SA 3.0,3c4db1a9-6e40-4fdf-a292-1a88b29339b1,Bayesian variable selection,,2013-10-13 21:00:08.203 +186130,57405,19750.0,2,,CC BY-SA 3.0,3c4db1a9-6e40-4fdf-a292-1a88b29339b1,"Chapter 13 of Kevin Murphy's book [Machine Learning: A Probabilistic Perspective][1] discusses Sparse Linear Models. After a short introduction on the benefits of sparse models, he introduces the following problem: + +![enter image description here][2] + + + [1]: http://www.amazon.com/Machine-Learning-Probabilistic-Perspective-Computation/dp/0262018020 + [2]: https://i.stack.imgur.com/AciWF.png + +How does he derive equation 13.1 above? i.e. why does it take that form, and what is $f$ supposed to represent here?",,2013-10-13 21:00:08.203 +186133,57404,668.0,10,,,c9900018-0e1d-4a42-a042-045c302a0e6a,"{""OriginalQuestionIds"":[6966],""Voters"":[{""Id"":919,""DisplayName"":""whuber""}]}",101,2013-10-13 21:02:14.880 +186134,57273,,25,,,6991772e-6c6f-48e9-b50b-1b3e185fa270,,http://twitter.com/#!/StackStats/status/389501157363957760,2013-10-13 21:21:27.260 +186135,57406,22507.0,2,,CC BY-SA 3.0,bff2b055-8c6a-4749-be10-5b1cb2ef9037,"Machine learning often deals with optimization of a function which has many local minimas. Feedforward neural networks with hidden units is a good example. 
Whether these functions are discrete or continuous, there is no method which achieves a global minimum and stops. It is easy to prove that there is no general algorithm to find a global minimum of a continuous function even if it is one-dimensional and smooth (has infinitely many derivatives). In practice, all algorithms for learning neural networks stuck into a local minimum. It is easy to check this: create a random neural network, make a big set of its responses to random inputs, then try to learn another neural network with the same architecture to copy the responses. While the perfect solution exists, neither backpropagation not any other learning algorithm will be able to discover it, starting from a random set of weights. + +Some learning methods, like simulated annealing or genetic algorithms, explore many local minimas. For continuous functions there are methods like gradient descent, which find the closest local minimum. They are much faster, thats why they are widely used in practice. But given enough time, the former group of methods outperforms the later in terms of training set error. But with reasonable time constraints, for real world problems, the latter group is usually better. + +For some models, like logistic regression, there is one local minimum, the function is convex, the minimization converges to the minimum, but the models themselves are simplistic. + +Thats the bitter truth. + +Note also that proof of convergence and proof of convergence to the best solution are two different things. K-means algorithm is an example of this. + +Finally, for some models we don't know how to learn at all. For example, if the output is an arbitrary computable function of inputs, we don't know good algorithms which, in reasonable time, find a Turing or equivalent machine implementing this function. For instance, if f(1)=2, f(2)=3, f(3)=5, f(4)=7, ..., f(10)=29 (prime numbers), we don't know any learning algorithm which would be able to predict that f(11)=31, unless it already knows the concept of prime numbers.",,2013-10-13 21:24:53.867 +186136,57324,5045.0,5,,CC BY-SA 3.0,3ba4788f-0466-485f-bd5f-20395f02ea74,"Take a look at the tooth brushing example at the very start of Chapter 14 of Andrew Vickers' book [What is a p-value anyway? 34 Stories to Help You Actually Understand Statistics][1]. It starts on page 57 or you can use the table of contents button in the bottom *left* corner to find it. + +Here's an excerpt: + +> [I]f you do nothing else, please try to remember the following +> sentence: “the $p$-value is the probability that the data would be at +> least as extreme as those observed, if the null hypothesis were true.” +> Though I’d prefer that you also understood it—about which, teeth +> brushing. +> +> I have three young children. In the evening, before we get to bedtime +> stories (bedtime stories being a nice way to end the day), we have to +> persuade them all to bathe, use the toilet, clean their teeth, change +> into pajamas, get their clothes ready for the next day and then +> actually get into bed (the persuading part being a nice way to go +> crazy). My five-year-old can often be found sitting on his bed, fully +> dressed, claiming to have clean teeth. The give-away is the bone dry +> toothbrush: he says that he has brushed his teeth, I tell him that he +> couldn’t have. +> +> My reasoning here goes like this: the toothbrush is dry; it is +> unlikely that the toothbrush would be dry if my son had cleaned his +> teeth; therefore he hasn’t cleaned his teeth. 
Or using +> statistician-speak: here are the data (a dry toothbrush); here is a +> hypothesis (my son has cleaned his teeth); the data would be unusual +> if the hypothesis were true, therefore we should reject the +> hypothesis. +> +> [...] +> +> So here is what to parrot when we run into each other at a bar and I +> still haven’t managed to work out any new party tricks: “The $p$-value +> is the probability that the data would be at least as extreme as those +> observed, if the null hypothesis were true.” When I recover from +> shock, you can explain it to me in terms of a toothbrush (“The +> probability of the toothbrush being dry if you’ve just cleaned your +> teeth”). + +The other thing I really like about this example is that it also explains that failing to reject the null does not mean the null is necessarily true. Vickers writes that his son has now worked out the trick and has taken to running his toothbrush under the tap for a second or two before heading to bed. Just because the toothbrush is wet (and the data is consistent with the null hypothesis), it does not mean that his son has cleaned his teeth. + + [1]: http://www.pearsonhighered.com/vickers/",added 2237 characters in body,2013-10-13 21:27:49.040 +186137,57406,22507.0,5,,CC BY-SA 3.0,0a2c7dfd-79e9-4f4c-ab25-e4d472e27e8c,"Machine learning often deals with optimization of a function which has many local minimas. Feedforward neural networks with hidden units is a good example. Whether these functions are discrete or continuous, there is no method which achieves a global minimum and stops. It is easy to prove that there is no general algorithm to find a global minimum of a continuous function even if it is one-dimensional and smooth (has infinitely many derivatives). In practice, all algorithms for learning neural networks stuck into a local minimum. It is easy to check this: create a random neural network, make a big set of its responses to random inputs, then try to learn another neural network with the same architecture to copy the responses. While the perfect solution exists, neither backpropagation not any other learning algorithm will be able to discover it, starting from a random set of weights. + +Some learning methods, like simulated annealing or genetic algorithms, explore many local minimas. For continuous functions there are methods like gradient descent, which find the closest local minimum. They are much faster, thats why they are widely used in practice. But given enough time, the former group of methods outperforms the later in terms of training set error. But with reasonable time constraints, for real world problems, the latter group is usually better. + +For some models, like logistic regression, there is one local minimum, the function is convex, the minimization converges to the minimum, but the models themselves are simplistic. + +Thats the bitter truth. + +Note also that proof of convergence and proof of convergence to the best solution are two different things. K-means algorithm is an example of this. + +Finally, for some models we don't know how to learn at all. For example, if the output is an arbitrary computable function of inputs, we don't know good algorithms which, in reasonable time, find a Turing or equivalent machine implementing this function. 
For instance, if f(1)=2, f(2)=3, f(3)=5, f(4)=7, ..., f(10)=29 (ten first primes), we don't know any learning algorithm which would be able to predict, in reasonable time, that f(11)=31, unless it already knows the concept of prime numbers.",added 3 characters in body,2013-10-13 21:30:45.657 +186138,57407,594.0,2,,CC BY-SA 3.0,0103da2a-a50d-48cc-8404-8a799f0b6086,"Since you appear to doubt the example offered, I have included a diagram. As Michael Mayer said, two normal distributions centered around 0, one with larger variance, is sufficient. + +In the diagram, we compare the 0.1 and the 0.9 quantiles for $\sigma=1$ (blue) and $\sigma=0.8$ (dark orange) + +![normcdfs sigma=1, sigma=0.8][1] + +Michael Mayer's example fulfills the requirements of your question with $q_1=0.1$, $q_2=0.9$ and $X_1$ being the one with larger variance. + + [1]: https://i.stack.imgur.com/pT43v.png +",,2013-10-13 22:09:36.687 +186139,57408,7155.0,2,,CC BY-SA 3.0,513c9664-a49a-4184-8b6d-f0a6752b6dca,"The divisor is just a normalizing constant, so we can ignore it for the moment. If we plug in $f(\gamma)$ it simplifies to $p(D|\gamma)p(\gamma)$, by Bayes' rule is equal to $p(\gamma|D)p(D)$. Now since $p(D)$ isn't a function of $\gamma$ it falls into the normalizing constant. Thus it simplifies to $p(\gamma|D)$. + +This expansion seems pointless until you realize that the $\gamma$ that $p(\gamma|D)$ is at its maximum is the same $\gamma$ that $f(\gamma)$ is at its minimum. So we can study $f(\gamma)$ by itself.",,2013-10-13 22:17:17.043 +186214,57428,20470.0,5,,CC BY-SA 3.0,db7a9836-5766-421a-9970-ebc244a2e876,"I am trying to implement the ID3 algorithm on a data set . However , all attributes are continuous and can have values between 1-10. I found that we have to specify the bin intervals for discretization but couldn't understand how to do this exactly . + +Can some one explain on how to do this? The [data set][1] I am using is Breast Cancer Data from Wisconsin hospitals. + + + [1]: http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",grammar / rephrasal,2013-10-14 09:23:33.433 +186213,57428,,24,,CC BY-SA 3.0,db7a9836-5766-421a-9970-ebc244a2e876,,"Proposed by 28740 approved by 805, 88 edit id of 5605",2013-10-14 09:23:33.433 +186140,57380,594.0,5,,CC BY-SA 3.0,0c5da6a6-6c47-4e94-a493-13a739ff0d52,"I'm trying to predict a response variable in linear regression that should be always positive (cost per click). It's a monetary amount. In adwords, you pay google for clicks on your ads, and a negative number would mean that google pays you when people clicked :P + +The predictors are all continuous values. The Rsquared and RMSE are decent when compared to other models, even out-of-sample: + + RMSE Rsquared + 1.4141477 0.8207303 + +I cannot rescale the predictions, because it's money, so even a small rescaling factor could change costs significantly. + +As far as I understand, for the regression model there's nothing special about zero and negative numbers, so it finds the best regression hyperplane no matter whether the output is partly negative. + +This is a very first attempt, using all variables I have. So there's room for refinement. + +Is there any way to tell the model that the output cannot be negative? +",added 20 characters in body,2013-10-13 22:19:36.263 +186142,57409,10060.0,2,,CC BY-SA 3.0,71e5fb88-481e-4227-8145-9e7b060c66dd,"If the domain all have a base of 100, a stacked bar chart may be suitable. 
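For instance, a base-R sketch with made-up percentages (the domain and category names are placeholders):

    vals <- cbind(A = c(40, 35, 25), B = c(55, 30, 15), C = c(20, 50, 30))
    rownames(vals) <- c("agree", "neutral", "disagree")
    barplot(vals, legend.text = rownames(vals),
            col = c("grey30", "grey60", "grey90"),
            ylab = "percent of domain")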
+ +![enter image description here][1] + + [1]: https://i.stack.imgur.com/BaEoD.png",,2013-10-13 22:36:21.843 +186143,57399,6813.0,5,,CC BY-SA 3.0,2c159a88-eb82-488f-ba4f-b8b95497442b,"Recently, I was wondering about calculating the probability of a given individual in a given population ""knowing"" (let's say, present in individual's friend set) at least one person with a given trait A and at least one person with another given trait, B; where it is possible that any number of people in the given individual's friend set can possess both traits. + +For example, using genetic traits, in a given population, how could one calculate the probability that a given individual in a given population ""knows"" at least one person with grey eye colour and at least one person who is greater than 200cm tall; where, naturally, it is possible that any number of people in the friend set can possess grey eye colour *and* be greater than 200cm tall. + +I have developed a sort of model, but it may not be correctly specified; it is as follows: + +**Assumptions and Qualifications:** + + - First of all, for simplicity, let's assume that we define ""knowing"" as mutually connected friends on an online social network. + - Secondly, the frequencies of genetic traits are (a) going to be determined by ethnicity of a given population as well as environmental factors (nutrition, healthcare) and (b) unlikely to be independently distributed across a given individual's friend set (e.g. family members will have greater genetic similarity); however, for this problem, let's adopt a simple model where both of the above conditions are violated. + - Thirdly, assume that the individual's friend set provides a microcosmic representation of society; this facilitates a frequentist generation of probabilities from the instance rate in the population. + - Finally, instances of genetic traits in the population are fabricated, but are used to generate probabilities for the examples. + +**Model Formulation:** + +I have reasoned that a binomial random distribution can be applied to the probability of each genetic trait, where a ""success"" is defined as an individual in the friend set possessing that genetic trait. + +Thus for trait A, we have: + +$$P(A=k) = \binom{N}{k}p_{a}^{k}(1-p_{a})^{N-k}$$ + +and, for trait B: + +$$P(B=k) = \binom{N}{k}p_{b}^{k}(1-p_{b})^{N-k}$$ + +where $N$ is number of friends in the friend set, $k$ is number of people containing the genetic trait and $p_{x}$ is the probability associated with possessing genetic trait $X$. 
+ +Because the scenario is concerned with the probability of *at least one* person possessing trait A and *at least one* person possessing trait B, it is easier to find the complement of no people in a friend set containing the trait; for both traits: + +$$P(A >= 1) = 1 - P(A = 0)$$ + +and + +$$P(B >= 1) = 1 - P(B = 0)$$ + +Furthermore, we know that probability of an intersection of events is given by: + +$$\mathbb{P}(A \cap B) = \mathbb{P}(A|B)\mathbb{P}(B) = \mathbb{P}(B|A)\mathbb{P}(A)$$ + +However, because we are assuming independence between genetic traits, $\mathbb{P}(A)$ and $\mathbb{P}(B)$ are independent, thus: + +$$\mathbb{P}(A \cap B) = \mathbb{P}(A)\mathbb{P}(B)$$ + +Combining the information above, we get the following model for the above scenario: + +$$\mathbb{P}(A >=1, B >=1) = \left(1-\binom{N}{k}p_{a}^{k}(1-p_{a})^{N-k}\right)\left(1-\binom{N}{k}p_{b}^{k}(1-p_{b})^{N-k}\right)$$ + +So, for the original example above, assuming a friend set of size $N = 300$, the instance of trait A in the population is $\frac{1}{800}$ and the instance of trait B in the population is $\frac{1}{5000}$; according to the model, we get the final probability: + +$$\mathbb{P}(A >=1, B >=1)$$ + +$$ = \left(1-\binom{300}{0}\left(\frac{1}{800}\right)^{0}\left(\frac{799}{800}\right)^{300}\right)\left(1-\binom{300}{0}\left(\frac{1}{5000}\right)^{0}\left(\frac{4999}{5000}\right)^{300}\right)$$ + +$$\approx 0.018 = 1.8\%$$ + +Does this model seem reasonable given the assumptions? + +Assuming this model is *not* correctly specified, maybe somebody could provide a more accurate representation.",added 75 characters in body,2013-10-13 22:42:12.770 +186145,57410,22662.0,2,,CC BY-SA 3.0,904f8820-3e23-4cf7-b5ac-70e3915743d9,"The joint entropy is the amount of information we get when we observe X and Y at the same time, but what would happen if we don't observe them at the same time. + +For example, when i toss a coin, if i got tails i will only observe the variable X, but if i got heads i will only observe the variable Y. How could i find the entropy?",,2013-10-13 22:51:56.027 +186146,57410,22662.0,1,,CC BY-SA 3.0,904f8820-3e23-4cf7-b5ac-70e3915743d9,Joint entropy of two random variables,,2013-10-13 22:51:56.027 +186144,57410,22662.0,3,,CC BY-SA 3.0,904f8820-3e23-4cf7-b5ac-70e3915743d9,,,2013-10-13 22:51:56.027 +186147,57195,22547.0,5,,CC BY-SA 3.0,3bc0266b-79cf-4130-8bd0-1e2718e27c6b,"I have data for a network of weather stations across the United States. This gives me a data frame that contains date, latitude, longitude, and some measured value. Assume that data are collected once per day and driven by regional-scale weather (no, we are not going to get into that discussion). + +I'd like to show graphically how simultaneously-measured values are correlated across time and space. My goal is to show the regional homogeneity (or lack thereof) of the value that is being investigated. + +## Data set ## + +To start with, I took a group of stations in the region of Massachusetts and Maine. I selected sites by latitude and longitude from an index file that is available on NOAA's FTP site. + +![enter image description here][1] + +Straight away you see one problem: there are lots of sites that have similar identifiers or are very close, but without more information we have no reason to join data sets (FWIW, I identify them using both the USAF and WBAN codes). Looking deeper in to the data, I saw that they have similar coordinates, but not identical, and the elevations of the sites differ. 
So, I have to treat them as separate stations, because I don't know better, and the data inevitably contains pairs of stations that are very close to each other. + +## Preliminary Analysis ## + +I tried grouping the data by calendar month and then calculating the ordinary least squares regression between different pairs of data. I then plot the correlation between all pairs as a line connecting the stations (below). The line color shows the value of R2 from the OLS fit. The figure then shows how the 30+ data points from January, February, etc. are correlated between different stations in the area of interest. + +![correlation between daily data during each calendar month][2] + +I've written the underlying codes so that the daily mean is only calculated if there are data points every 6-hour period, so data should be comparable across sites. + +### Problems ### +Unfortunately, there is simply too much data to make sense of on one plot. That can't be fixed by reducing the size of the lines. + +I've tried plotting the correlations between the nearest neighbors in the region, but that turns into a mess very quickly. The facets below show the network without correlation values, using $k$ nearest neighbors from a subset of the stations. This figure was just to test the concept. +![enter image description here][3] + +The network appears to be too complex, so I think I need to figure out a way to reduce the complexity, or apply some kind of spatial kernel. + +I am also not sure what is the most appropriate metric to show correlation, but for the intended (non-technical) audience, the correlation coefficient from OLS might just be the simplest to explain. I may need to present some other information like the gradient or standard error as well. + +### Questions ### + +I'm learning my way into this field and R at the same time, and would appreciate suggestions on: + + 1. What's the more formal name for what I'm trying to do? Are there some helpful terms that would let me find more literature? My searches are drawing blanks for what must be a common application. + 2. Are there more appropriate methods to show the correlation between multiple data sets separated in space? + 3. ... in particular, methods that are easy to show results from visually? + 4. Are any of these implemented in R? + 5. Do any of these approaches lend themselves to automation? + + + [1]: https://i.stack.imgur.com/aZm4N.jpg + [2]: https://i.stack.imgur.com/X4YZI.jpg + [3]: https://i.stack.imgur.com/NWzm2.jpg","Added more information, clarified question",2013-10-14 00:19:48.887 +186148,57411,20473.0,2,,CC BY-SA 3.0,b4d04a36-7744-4806-9795-04e1a324e4d1,"Entropy (joint entropy included), _is a property of the distribution_ that a random variable follows. The available sample (and hence the timing of observation) plays no role in it. + +Copying for Cover & Thomas, the joint entropy $H(X,Y)$ of two discrete random variables $X, Y,$ with joint distribution $p(x,y)$, is defined as + +$$H(X,Y) = - \sum_{S_X}\sum_{S_Y}p(x,y)\log p(x,y) $$ + +Examine the expression: the sums are taken over _all possible values_ of $X$ and $Y$, i.e. over all the values that belong to the support of each r.v. ($S_X$ and $S_Y$ respectively), irrespective of whether some of these values may not materialize or be observed in a sample. What we actually observe, or when, plays no role, in calculating entropy, and joint entropy in particular. + +Turning to your specific example: The side of a coin itself can not be modeled as a random variable. 
A random variable maps _events_ into real numbers. The side of a coin is not an event. _Observing_ one of the two sides is an event. _Not observing_ a side, is an event. So let's define a random variable $X$ by ""$X$ takes the value $1$ if heads is observed, $0$ otherwise"". And define $Y$ by ""$Y$ takes the value $1$ if tails is observed, $0$ otherwise"". Assume the coin is fair. The joint distribution of these two random variables is then described by +$$\begin{align} +P(X=1,Y=1) &= 0 \\ +P(X=1,Y=0) &= 0.5 \\ +P(X=0,Y=1) &= 0.5 \\ +P(X=0,Y=0) &= 0 +\end{align}$$ + +As usual, we consider the distribution at non-zero values, so + +$$H(X,Y) = - 0.5\log(0.5) - 0.5\log(0.5) $$ + +and using base-2 for the logarithm we get + +$$H(X,Y) = - 0.5(-1) - 0.5(-1) = 1 $$ + +Finally, you can easily find that the entropy of $X$ (and likewise for $Y$) is +$$H(X) = - \sum_{S_X}p(x)\log p(x) = - 0.5(-1) - 0.5(-1) = 1 $$ + +So in this case $H(X,Y) = H(X) = H(Y)$. But the general expression for the decomposition of joint entropy is + +$$H(X,Y) = H(X) + H(Y\mid X) = H(Y) + H(X\mid Y)$$ + +where $H(Y\mid X)$ and $H(X\mid Y)$ are conditional entropies. Then we conclude that $H(Y\mid X) = H(X\mid Y) = 0$ in this case. The intuition is straightforward: given $X$ what has happened to $Y$ is certain (and likewise in reverse), so conditional entropy is zero. + + + + +",,2013-10-14 00:19:58.667 +186149,57276,,25,,,48a1e4f1-9329-47c8-b939-6ed34c52ce28,,http://twitter.com/#!/StackStats/status/389546456044302336,2013-10-14 00:21:27.263 +186150,57195,22547.0,5,,CC BY-SA 3.0,eb0d5c7a-9887-4322-a0cf-386ff41cf4d4,"I have data for a network of weather stations across the United States. This gives me a data frame that contains date, latitude, longitude, and some measured value. Assume that data are collected once per day and driven by regional-scale weather (no, we are not going to get into that discussion). + +I'd like to show graphically how simultaneously-measured values are correlated across time and space. My goal is to show the regional homogeneity (or lack thereof) of the value that is being investigated. + +## Data set ## + +To start with, I took a group of stations in the region of Massachusetts and Maine. I selected sites by latitude and longitude from an index file that is available on NOAA's FTP site. + +![enter image description here][1] + +Straight away you see one problem: there are lots of sites that have similar identifiers or are very close. FWIW, I identify them using both the USAF and WBAN codes. Looking deeper in to the metadata I saw that they have different coordinates and elevations, and data stop at one site then start at another. So, because I don't know any better, I have to treat them as separate stations. This means the data contains pairs of stations that are very close to each other. + +## Preliminary Analysis ## + +I tried grouping the data by calendar month and then calculating the ordinary least squares regression between different pairs of data. I then plot the correlation between all pairs as a line connecting the stations (below). The line color shows the value of R2 from the OLS fit. The figure then shows how the 30+ data points from January, February, etc. are correlated between different stations in the area of interest. + +![correlation between daily data during each calendar month][2] + +I've written the underlying codes so that the daily mean is only calculated if there are data points every 6-hour period, so data should be comparable across sites. 
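Before getting to the problems below, here is a rough sketch of the grouping step just described, assuming a data frame `df` with columns `date`, `station` and `value` (those column names are made up); it gives one matrix of pairwise Pearson correlations per calendar month, the square of which is the simple-regression R2 used in the plots:

    # reshape to one column of daily values per station
    wide <- reshape(df[, c("date", "station", "value")],
                    idvar = "date", timevar = "station", direction = "wide")
    month <- format(as.Date(wide$date), "%m")
    # one station-by-station correlation matrix per calendar month
    cors <- lapply(split(wide[, -1], month), cor,
                   use = "pairwise.complete.obs")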
+ +### Problems ### +Unfortunately, there is simply too much data to make sense of on one plot. That can't be fixed by reducing the size of the lines. + +I've tried plotting the correlations between the nearest neighbors in the region, but that turns into a mess very quickly. The facets below show the network without correlation values, using $k$ nearest neighbors from a subset of the stations. This figure was just to test the concept. +![enter image description here][3] + +The network appears to be too complex, so I think I need to figure out a way to reduce the complexity, or apply some kind of spatial kernel. + +I am also not sure what is the most appropriate metric to show correlation, but for the intended (non-technical) audience, the correlation coefficient from OLS might just be the simplest to explain. I may need to present some other information like the gradient or standard error as well. + +### Questions ### + +I'm learning my way into this field and R at the same time, and would appreciate suggestions on: + + 1. What's the more formal name for what I'm trying to do? Are there some helpful terms that would let me find more literature? My searches are drawing blanks for what must be a common application. + 2. Are there more appropriate methods to show the correlation between multiple data sets separated in space? + 3. ... in particular, methods that are easy to show results from visually? + 4. Are any of these implemented in R? + 5. Do any of these approaches lend themselves to automation? + + + [1]: https://i.stack.imgur.com/aZm4N.jpg + [2]: https://i.stack.imgur.com/X4YZI.jpg + [3]: https://i.stack.imgur.com/NWzm2.jpg",Fixed an annoying paragraph,2013-10-14 00:27:35.263 +186153,57412,22665.0,3,,CC BY-SA 3.0,b5ffea2e-53f7-43f9-beca-9f08924391fa,,,2013-10-14 00:46:15.680 +186152,57412,22665.0,1,,CC BY-SA 3.0,b5ffea2e-53f7-43f9-beca-9f08924391fa,Understanding Bayesian Predictive Distributions,,2013-10-14 00:46:15.680 +186151,57412,22665.0,2,,CC BY-SA 3.0,b5ffea2e-53f7-43f9-beca-9f08924391fa,"I'm taking an Intro to Bayes course and I'm having some difficulty understanding predictive distributions. I understand why they are useful and I'm familiar with the definition, but there are some things I don't quite understand. + + +**1) How to get the right predictive distribution for a vector of new observations** + +Suppose that we have built a sampling model $p(y_i | \theta)$ for the data and a prior $p(\theta)$. Assume that the observations $y_i$ are conditionally independent given $\theta$. + +We have observed some data $\mathcal{D} = \{y_1, y_2, \, ... \, , y_k\}$, and we update our prior $p(\theta)$ to the posterior $p(\theta | \mathcal{D})$. + +If we wanted to predict a vector of new observations $\mathcal{N} = \{\tilde{y}_1, \tilde{y}_2, \, ... \, , \tilde{y}_n\}$, I think we should try to get the posterior predictive using this formula +$$ +p(\mathcal{N} | \mathcal{D}) = \int p(\theta | \mathcal{D}) p ( \mathcal{N} | \theta) \, \mathrm{d} \theta = \int p(\theta | \mathcal{D}) \prod_{i=1}^n p(\tilde{y}_i | \theta) \, \mathrm{d} \theta, +$$ +which is not equal to +$$ +\prod_{i=1}^n \int p(\theta | \mathcal{D}) p(\tilde{y}_i | \theta) \, \mathrm{d} \theta, +$$ +so the predicted observations are not independent, right? + +Say that $\theta | \mathcal{D} \sim$ Beta($a,b$) and $p(y_i | \theta) \sim$ Binomial($n, \theta$) for a fixed $n$. 
In this case, if I wanted to simulate 6 new $\tilde{y}$, if I understand this correctly, it would be wrong to simulate 6 draws independently from the Beta-Binomial distribution that corresponds to the posterior predictive for a single observation. Is this correct? I don't know how to interpret that the observations are not independent marginally, and I'm not sure I understand this correctly. + +**Simulating from posterior predictives** + +Many times when we simulate data from the posterior predictive we follow this scheme: + +For $b$ from 1 to $B$: + +1) Sample $\theta^{(b)}$ from $p(\theta | \mathcal{D})$. + +2) Then simulate new data $\mathcal{N}^{(b)}$ from $p(\mathcal{N} | \theta^{(b)})$. + +I don't quite know how to prove this scheme works, although it looks intuitive. Also, does this have a name? I tried to look up a justification and I tried different names, but I had no luck. + +**Thanks!** + + +",,2013-10-14 00:46:15.680 +186156,57413,22666.0,3,,CC BY-SA 3.0,b5fd25fe-d85b-482f-932a-91fcebcb248c,,,2013-10-14 01:03:58.377 +186154,57413,22666.0,1,,CC BY-SA 3.0,b5fd25fe-d85b-482f-932a-91fcebcb248c,time persistence in panel data,,2013-10-14 01:03:58.377 +186155,57413,22666.0,2,,CC BY-SA 3.0,b5fd25fe-d85b-482f-932a-91fcebcb248c,"I am using dynamic model with panel quarter data using Stata. And my sample contain 16 nations from 2000 to 2010. +Is there an approximated number of observations in the panel data to be considered as a time persistent process? ",,2013-10-14 01:03:58.377 +186159,57414,22667.0,3,,CC BY-SA 3.0,a03803f5-2902-4799-8446-b97c5187dad1,,,2013-10-14 01:17:13.023 +186158,57414,22667.0,1,,CC BY-SA 3.0,a03803f5-2902-4799-8446-b97c5187dad1,"I have a discrete distribution and want to know to what extent other samples differ from it, what is the right test?",,2013-10-14 01:17:13.023 +186157,57414,22667.0,2,,CC BY-SA 3.0,a03803f5-2902-4799-8446-b97c5187dad1,"This is kind of a basic stats question, but I want to make sure I am doing this right. + +I have a distribution of objects. Specifically: + `array([ 6072., 112673., 126874., 44366., 5384., 14697., 20323., 68197., 98024.,39483., 103990., 18556., 32930., 23551., 6897.])` + + +I then have a lot of samples like [1,4,0,0,0,0...] (same length) and I'd like to know how far the samples are from the distribution above. Correlation doesn't really do it. +[4,0,0,0...] should be as far away as [32,0,0,0,..] but with lower significance ",,2013-10-14 01:17:13.023 +186160,57415,22507.0,2,,CC BY-SA 3.0,c841ba15-c3ce-4c61-ae92-c6472cf5bc86,"If you want to do logistic regression, a simple approach is: + +- for each continuous feature with missing data, replace all missing values by the average or median value for this feature, and create one more boolean feature which indicates whether the data is missing or not +- for each unordered categorical feature with missing data, put all missing values into a new category",,2013-10-14 02:23:17.417 +186168,57418,19043.0,2,,CC BY-SA 3.0,1c750ec5-1aa1-4e8f-9aff-861c64148e74,"I am comparing measurements on a test group relative to a control group in three different environmental conditions. I am interested in both differences between environmental conditions and differences between test and control groups. I ran a two-way ANOVA with an interaction term and looked at pairwise comparisons when terms were significant. + +When the interaction term was significant the Tukey HSD function in R automatically outputs all comparisons. 
Comparisons between test and test groups on different environmental conditions, comparisons between test and control groups on different environmental conditions and so forth. Needless to say, this resulted in a large number of tests to correct for. + +My adviser thinks that I should only do three tests to compare test group to control group on each environmental condition (and then only adjust for three tests). I think that because I am interested in differences between environmental conditions in this study, I should run most tests. If I wasn't interested in differences in environmental conditions it should be a nested ANOVA, right? You can see from graphs that the interaction term come from differences between test and control groups on two ecological sites, but it doesn't seem valid to just only run comparisons between groups you 'suspect' will be different. The ones I'm not sure I care about are differences test and control groups on two different environmental conditions. + +**Is it valid to only run comparisons between groups you are interested in to reduce the number of tests you have to adjust the Tukey HSD p-value for or should you run comparisons on all combinations of groups.** + +Thanks for help in advance.",,2013-10-14 04:02:43.010 +186167,57418,19043.0,1,,CC BY-SA 3.0,1c750ec5-1aa1-4e8f-9aff-861c64148e74,Do only certain pairwise comparisons after significant interaction in two-way ANOVA,,2013-10-14 04:02:43.010 +186169,57418,19043.0,3,,CC BY-SA 3.0,1c750ec5-1aa1-4e8f-9aff-861c64148e74,,,2013-10-14 04:02:43.010 +186170,57418,19043.0,6,,CC BY-SA 3.0,82e1a7cf-3d64-44db-a76b-385b33fa8e00,,edited tags,2013-10-14 04:09:01.183 +186173,57419,22668.0,3,,CC BY-SA 3.0,e8e95338-ea3a-4e90-9c3b-3c0097e82f6c,,,2013-10-14 04:45:22.343 +186172,57419,22668.0,1,,CC BY-SA 3.0,e8e95338-ea3a-4e90-9c3b-3c0097e82f6c,Weightining using TraMineR,,2013-10-14 04:45:22.343 +186171,57419,22668.0,2,,CC BY-SA 3.0,e8e95338-ea3a-4e90-9c3b-3c0097e82f6c,"I have read some posts on weightining. However, I am still unclear on the sort of weights I need to use. I am using data from the Longitudinal Survey of Australian Youth (LSAY). This survey provides longitudinal weights for each survey wave (i.e. correction for sampling error and attrition). Because I have weight variables for each period (10) and I can only specify one of them in TraMiner, I am not sure which I should use. I have read that whether weights for the first or last wave should be used. I have not read any reasons why. + +Can anyone provide me some guidance on this issue? + +Regards,",,2013-10-14 04:45:22.343 +186174,57403,20062.0,5,,CC BY-SA 3.0,44667f13-691a-4752-9016-f28d60d1696e,"**Mantel's test widely occur in biological studies** in which is used to +examine the correlation between spatial distribution of animals (position in space) with for example their genetic relatedness, rate of aggression or some other attribute. 
Plenty of good journals are using it (*PNAS, Animal Behaviour, Molecular Ecology...*) + +Recently I fabricated some patterns which may occur in nature, but Mantel's test **seems to be quite useless.** + +**Example data:** *(compressed as possible)* + + r.gen<-seq(-100,100,5) + r.val<-sample(r.gen, 289, replace=TRUE) + z10<-rep(0, times=10) + z11<-rep(0, times=11) + r5<-c(5,15,25,15,5) + r71<-c(5,20,40,50,40,20,5) + r72<-c(15,40,60,75,60,40,15) + r73<-c(25,50,75,100,75,50,25) + rbPal <- colorRampPalette(c(""blue"",""red"")) + my.data<-data.frame(x = rep(1:17, times=17),y = rep(1:17, each=17), + c1 = c(rep(0,times=155),r5,z11,r71,z10,r72,z10,r73,z10,r72,z10,r71, + z11,r5,rep(0, times=27)),c2 = c(rep(0,times=19),r5,z11,r71,z10,r72, + z10,r73,z10,r72,z10,r71,z11,r5,rep(0, times=29),r5,z11,r71,z10,r72, + z10,r73,z10,r72,z10,r71,z11,r5,rep(0, times=27)),c3 = c(seq(20,100,5), + seq(15,95,5),seq(10,90,5),seq(5,85,5),seq(0,80,5),seq(-5,75,5), + seq(-10,70,5),seq(-15,65,5),seq(-20,60,5),seq(-25,55,5),seq(-30,50,5), + seq(-35,45,5),seq(-40,40,5),seq(-45,35,5),seq(-50,30,5),seq(-55,25,5), + seq(-60,20,5)),c4 = rep(c(0,100), length=289),c5 = sample(r.gen, 289, + replace=TRUE)) + + # adding colors + my.data$Col1 <- rbPal(10)[as.numeric(cut(my.data$c1,breaks = 10))] + my.data$Col2 <- rbPal(10)[as.numeric(cut(my.data$c2,breaks = 10))] + my.data$Col3 <- rbPal(10)[as.numeric(cut(my.data$c3,breaks = 10))] + my.data$Col4 <- rbPal(10)[as.numeric(cut(my.data$c4,breaks = 10))] + my.data$Col5 <- rbPal(10)[as.numeric(cut(my.data$c5,breaks = 10))] + +Creating matrix of geographical distances (for Moran's I is inversed): + + point.dists <- dist(cbind(my.data$x,my.data$y)) + point.dists.inv <- 1/point.dists + point.dists.inv<-as.matrix(point.dists.inv) + diag(point.dists.inv) <- 0 + +Plot creation: + + X11(width=12, height=6) + par(mfrow=c(2,5)) + par(mar=c(1,1,1,1)) + + + library(ape) + for (i in 3:7) { + my.res<-mantel.test(as.matrix(dist(my.data[ ,i])), as.matrix(point.dists)) + plot(my.data$x,my.data$y,pch=20,col=my.data[ ,c(i+5)], cex=2.5, xlab="""", + ylab="""", xaxt=""n"", yaxt=""n"", ylim=c(-4.5,17)) + text(4.5, -2.25, paste(""Mantel's test"", ""\n z.stat ="", round(my.res$z.stat, + 2), ""\n p.value ="", round(my.res$p, 3))) + my.res<-Moran.I(my.data[ ,i], point.dists.inv) + text(12.5, -2.25, paste(""Moran's I"", ""\n observed ="", round(my.res$observed, + 3), ""\n expected ="",round(my.res$expected,3), ""\n std.dev ="", + round(my.res$sd,3), ""\n p.value ="", round(my.res$p.value, 3))) + } + + par(mar=c(5,4,4,2)+0.1) + + for (i in 3:7) { + plot(dist(my.data[ ,i]), point.dists,pch = 20, xlab=""geographical distance"", + ylab=""behavioural distance"") + } + +***Imaginary situation:*** Suppose you have orchard (17 x 17 trees) and on each tree a crow is sitting. Levels of ""noise"" for each crow are given and you are searching for pattern in distribution. + +***There are 5 possibilities:*** + + 1. **""Birds of a feather flock together.""** The more similar crows are, the + smaller is geographical distance between them **(single cluster)**. + + 2. **""Birds of a feather flock together.""** Also the more similar crows + are, the smaller is geographical distance between them **(multiple + clusters)** but, one cluster of noisy crows has no knowledge about the + existence of second cluster (oterwise they would form one big + cluster). + + 3. **""Monotonical trend occur""**. + + 4. **""Opposites attract each other""**. Similar crows can not withstand each + other. + + 5. 
**""Random pattern""** - the level of noise has no significant effect on + spatial distribution. + +Under each case are given plot of points from which the Mantel test computes correlation (it is no surprise that its results are non-significant, I would never try to find linear association among such patterns of points). + +![enter image description here][1] + + + [1]: https://i.stack.imgur.com/TWQqa.png + +**Why scientists do not use Moran's I instead? Is there some hidden reason I do not see? +And if there is such reason,how can I know (how different the hypotheses must be constructed) to appropriately use Mantel's or Moran's I test? A real life example will be helpful.** + + +P.S. if you google each tests, in these examples are both used on the exact same data +and exact the same hypothesis...**not very helpful** + +Mantel test: http://www.ats.ucla.edu/stat/r/faq/mantel_test.htm + +Moran's I: http://www.ats.ucla.edu/stat/r/faq/morans_i.htm + + +",added 7 characters in body,2013-10-14 04:50:14.720 +186175,57420,306.0,2,,CC BY-SA 3.0,de36620c-5ba4-4198-95a5-a9fbb686d8ca,"First things first. There needs to be greater information given as this does not have a universally correct answer. Different types of distributions have to be looked at with different types of procedures. + +But just to show that yes this is possible, we assume that each of the variables that you have mentioned are normally distributed but the parameters of the normal distributions are different from each other for any given pair. + +Now we take n samples of these variables, each sample containing one instance of each of the variables. We group together each of the variables separately and then calculate the correlation coefficients for each pair of the groups. If we cannot reject the hypothesis of these correlation coefficients being zero, we hypothesize that the variables are independent of each other. So we have a set of variables which are independent from each other, but they have different probability distributions.",,2013-10-14 05:05:30.750 +186176,57419,594.0,5,,CC BY-SA 3.0,ee55be4e-5e4e-422d-9bd6-3fe8890f6109,"I have read some posts on weighting. However, I am still unclear on the sort of weights I need to use. I am using data from the Longitudinal Survey of Australian Youth (LSAY). This survey provides longitudinal weights for each survey wave (i.e. correction for sampling error and attrition). Because I have weight variables for each period (10) and I can only specify one of them in TraMiner, I am not sure which I should use. I have read that whether weights for the first or last wave should be used. I have not read any reasons why. + +Can anyone provide me some guidance on this issue? +",deleted 12 characters in body,2013-10-14 05:42:42.343 +186178,57421,22669.0,1,,CC BY-SA 3.0,e4deac19-e258-40f6-aba2-15f23c743082,"ARIMA (0,1,1) or (0,1,0) - or something else?",,2013-10-14 05:55:50.823 +186179,57421,22669.0,3,,CC BY-SA 3.0,e4deac19-e258-40f6-aba2-15f23c743082,,,2013-10-14 05:55:50.823 +186177,57421,22669.0,2,,CC BY-SA 3.0,e4deac19-e258-40f6-aba2-15f23c743082,"I've just started learning time series so please excuse me if it's painfully obvious; I haven't managed to find the answer elsewhere. + +I have a data series showing a pretty obvious trend although it's quite noisy. I can take pretty much any division of the data and run classical tests to show a highly significant difference in means. + +I decided to have a look at time series analysis to see if it could help describe the trend. 
An ARIMA(0,1,1) model comes out with AIC,BIC=34.3,37.3 (Stata), whilst an ARIMA(0,1,0) model comes out with AIC,BIC=55.1,58.1 - so I understand I'm supposed to prefer the (0,1,1) model. + +However, the coefficient for the MA(1) is displaying as -0.9999997 (and not showing any p-values). If I try the same in SPSS I get an MA(1) coefficient of 1.000 (I assume SPSS uses opposite signs) with a p-value of 0.990 - does this mean it suggests I drop the term? + +My understanding is that the effect of a MA(1) coefficient of -1 is basically to remove the old error term and convert the whole series to a linear trend. Does this mean ARIMA is totally unsuitable for my needs? On the plus side it gives me a sensible value for the trend. If I use the (0,1,0) model then I still get a reasonable value for the trend but it's not significant any more. + +Thanks for your help!",,2013-10-14 05:55:50.823 +186218,57428,,5,user88,CC BY-SA 3.0,83a273f9-8dd7-4fab-aa03-d24c7a7844f0,"I am trying to implement the ID3 algorithm on a data set. However, all attributes are continuous and can have values between 1-10. I found that we have to specify the bin intervals for discretization but couldn't understand how to do this exactly. + +Can some one explain on how to do this? The [data set][1] I am using is Breast Cancer Data from Wisconsin hospitals. + + + [1]: http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",deleted 3 characters in body; edited title,2013-10-14 09:25:05.803 +186220,57382,,4,user88,CC BY-SA 3.0,a35668d6-5197-48bc-8b7c-1f1348fbe40b,Does sampling error include measurement error?,edited title,2013-10-14 09:25:44.370 +186180,57422,9007.0,2,,CC BY-SA 3.0,d8d17543-b723-4ab9-b12a-496a57957098,"TraMineR, and more generally sequence analysis, treats each sequence as a whole. Ideally, weights should therefore correct for selection bias of the sample of sequences you consider, and longitudinal weights should be used. +The choice of the weights depends on which sequences you retain for your analysis: + +1. If you use only complete sequences until the last wave, then the choice should be the longitudinal weights associated to the last wave, which accounts attrition until the last wave). + +2. If you chose to retain all sequences complete until say the 6th wave and admit missing states for the last four waves, then you should chose the weights associated to wave 6. + +3. If you retain all sequences, even for those for which you have valid data for the first wave only, then you should use weights of the first wave. + +4. In case you select only sequences with no more than say 4 missing states, the choice may be more ambiguous. If most of the four missing states occur in the last four positions, then you could adopt the solution 2 as a good approximation. + +Hope this helps. +",,2013-10-14 05:56:44.773 +186181,57423,5637.0,2,,CC BY-SA 3.0,377b0648-7361-496e-839f-2e712619c7e0,"It is difficult to give the right answer without looking at the data. Here are some points that may help you in your modelling. + +The coefficient of MA(1) very close to 1 indicates the sign of overdifferencing. This means unit root in Moving averages. + +My suggestion would be: Check the original series is stationary (visually) or check the presence of unit root. If you observe deterministic trend (eg: linear), add the trend part with time series model. If the original series is stationary build the time series without differencing. 
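+
+A rough sketch of those checks in R, where `x` stands for the series and the `tseries` package supplies the unit-root test; treat this as an outline rather than a recipe:
+
+    library(tseries)
+    plot.ts(x)        # visual check for trend / non-stationarity
+    adf.test(x)       # augmented Dickey-Fuller unit-root test
+    # deterministic linear trend handled via a regressor instead of differencing
+    arima(x, order = c(0, 0, 1), xreg = seq_along(x))
+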
",,2013-10-14 06:12:22.867 +186182,57391,2420.0,5,,CC BY-SA 3.0,e2987fb5-fcde-4d72-bec8-fd332af0c4ba,"I'm a software developer and I'll like to learn about neural networks. At this point I've find a problem which I'll like to solve at some point. It is about electrical load forecasting. I'm looking for similar problems and it will be great if I can find some similar examples with solutions. At this point I'm having troubles in finding the right model for the RNN, and more exactly I'm struggling with the input layer. As the output I need the forecast values for each hour. + +Any reference to books, links resources or advices are welcome and very appreciated. + +This is the problem that I'll like to solve: + +A very small factory, use a number of equipments to produce bread. Some of them are electrical equipments which means that they consume electrical power. Knowing which equipments will run on the next day, an electricity consumptions forecast can be computed. + +The equipment named E.V. is a special case of equipment. The human operator completes it's values in an empirically manner in order to have a good forecast for the next day. Those values can be positive or negative. + + + +----------------------------------------------------+ + |equipment name|power| 1h| 2h| 3h| 4h| 5h| 6h| 7h| 8h| + +----------------------------------------------------+ + |Equipment 1 | 2MW| - | - | on| on| on| - | - | - | + |Equipment 2 | 5MW| - | - | - | on| on| on| - | - | + |Equipment 3 | 1MW| on| on| on| on| on| on| on| on| + |E.V. | | .1|-.1|-.1| .1|-.2| .1| .1|-.1| + +--------------+-------------------------------------+ + |total/(forecast) |1.1| .9|2.9|8.1|7.8|6.1|1.1| .9| + +--------------+-------------------------------------+ + |real consumption | .9| .9|2.7|8.2|7.9|3.1|0.8| .7| + +--------------+-------------------------------------+ + +The problem is that the machines are not running at their maximal power, so it will be great if a more exactly forecast can be build. I have data from 2 years back for every day. Also, do you think that date is a good candidate for the input layer?",date related quesion,2013-10-14 06:16:41.540 +186183,57278,,25,,,9e94a38e-f2a7-4bf9-82df-2b2d8269b230,,http://twitter.com/#!/StackStats/status/389637058559750144,2013-10-14 06:21:28.587 +186184,57391,2420.0,5,,CC BY-SA 3.0,6acfb5c0-357c-4606-b929-7b277e355f9a,"I'm a software developer and I'll like to learn about neural networks. At this point I've find a problem which I'll like to solve at some point. It is about electrical load forecasting. I'm looking for similar problems and it will be great if I can find some similar examples with solutions. At this point I'm having troubles in finding the right model for the RNN, and more exactly I'm struggling with the input layer. As the output I need the forecast values for each hour. + +Any reference to books, links resources or advices are welcome and very appreciated. + +This is the problem that I'll like to solve: + +A very small factory, use a number of equipments to produce bread. Some of them are electrical equipments which means that they consume electrical power. Knowing which equipments will run on the next day, an electricity consumptions forecast can be computed. + +The equipment named E.V. is a special case of equipment. The human operator completes it's values in an empirically manner in order to have a good forecast for the next day. Those values can be positive or negative. 
+ + + +----------------------------------------------------+ + |equipment name|power| 1h| 2h| 3h| 4h| 5h| 6h| 7h| 8h| + +----------------------------------------------------+ + |Equipment 1 | 2MW| - | - | on| on| on| - | - | - | + |Equipment 2 | 5MW| - | - | - | on| on| on| - | - | + |Equipment 3 | 1MW| on| on| on| on| on| on| on| on| + |E.V. | | .1|-.1|-.1| .1|-.2| .1| .1|-.1| + +--------------+-------------------------------------+ + |total/(forecast) |1.1|0.9|2.9|8.1|7.8|6.1|1.1|0.9| + +--------------+-------------------------------------+ + |real consumption |0.9|0.9|2.7|8.2|7.9|3.1|0.8|0.7| + +--------------+-------------------------------------+ + +The problem is that the machines are not running at their maximal power, so it will be great if a more exactly forecast can be build. I have data from 2 years back for every day. Also, do you think that date is a good candidate for the input layer?",edited body,2013-10-14 06:32:43.717 +186185,57403,20062.0,5,,CC BY-SA 3.0,5e6dfa0f-fb61-4e74-86fc-8cf4b9f8d1ca,"**Mantel's test widely occur in biological studies** in which is used to +examine the correlation between spatial distribution of animals (position in space) with for example their genetic relatedness, rate of aggression or some other attribute. Plenty of good journals are using it (*PNAS, Animal Behaviour, Molecular Ecology...*) + +Recently I fabricated some patterns which may occur in nature, but Mantel's test **seems to be quite useless.** + +**Example data:** *(compressed as possible)* + + r.gen<-seq(-100,100,5) + r.val<-sample(r.gen, 289, replace=TRUE) + z10<-rep(0, times=10) + z11<-rep(0, times=11) + r5<-c(5,15,25,15,5) + r71<-c(5,20,40,50,40,20,5) + r72<-c(15,40,60,75,60,40,15) + r73<-c(25,50,75,100,75,50,25) + rbPal <- colorRampPalette(c(""blue"",""red"")) + my.data<-data.frame(x = rep(1:17, times=17),y = rep(1:17, each=17), + c1 = c(rep(0,times=155),r5,z11,r71,z10,r72,z10,r73,z10,r72,z10,r71, + z11,r5,rep(0, times=27)),c2 = c(rep(0,times=19),r5,z11,r71,z10,r72, + z10,r73,z10,r72,z10,r71,z11,r5,rep(0, times=29),r5,z11,r71,z10,r72, + z10,r73,z10,r72,z10,r71,z11,r5,rep(0, times=27)),c3 = c(seq(20,100,5), + seq(15,95,5),seq(10,90,5),seq(5,85,5),seq(0,80,5),seq(-5,75,5), + seq(-10,70,5),seq(-15,65,5),seq(-20,60,5),seq(-25,55,5),seq(-30,50,5), + seq(-35,45,5),seq(-40,40,5),seq(-45,35,5),seq(-50,30,5),seq(-55,25,5), + seq(-60,20,5)),c4 = rep(c(0,100), length=289),c5 = sample(r.gen, 289, + replace=TRUE)) + + # adding colors + my.data$Col1 <- rbPal(10)[as.numeric(cut(my.data$c1,breaks = 10))] + my.data$Col2 <- rbPal(10)[as.numeric(cut(my.data$c2,breaks = 10))] + my.data$Col3 <- rbPal(10)[as.numeric(cut(my.data$c3,breaks = 10))] + my.data$Col4 <- rbPal(10)[as.numeric(cut(my.data$c4,breaks = 10))] + my.data$Col5 <- rbPal(10)[as.numeric(cut(my.data$c5,breaks = 10))] + +Creating matrix of geographical distances (for Moran's I is inversed): + + point.dists <- dist(cbind(my.data$x,my.data$y)) + point.dists.inv <- 1/point.dists + point.dists.inv<-as.matrix(point.dists.inv) + diag(point.dists.inv) <- 0 + +Plot creation: + + X11(width=12, height=6) + par(mfrow=c(2,5)) + par(mar=c(1,1,1,1)) + + + library(ape) + for (i in 3:7) { + my.res<-mantel.test(as.matrix(dist(my.data[ ,i])), as.matrix(point.dists)) + plot(my.data$x,my.data$y,pch=20,col=my.data[ ,c(i+5)], cex=2.5, xlab="""", + ylab="""", xaxt=""n"", yaxt=""n"", ylim=c(-4.5,17)) + text(4.5, -2.25, paste(""Mantel's test"", ""\n z.stat ="", round(my.res$z.stat, + 2), ""\n p.value ="", round(my.res$p, 3))) + my.res<-Moran.I(my.data[ ,i], 
point.dists.inv) + text(12.5, -2.25, paste(""Moran's I"", ""\n observed ="", round(my.res$observed, + 3), ""\n expected ="",round(my.res$expected,3), ""\n std.dev ="", + round(my.res$sd,3), ""\n p.value ="", round(my.res$p.value, 3))) + } + + par(mar=c(5,4,4,2)+0.1) + + for (i in 3:7) { + plot(dist(my.data[ ,i]), point.dists,pch = 20, xlab=""geographical distance"", + ylab=""behavioural distance"") + } + +***Imaginary situation:*** Suppose you have orchard (17 x 17 trees) and on each tree a crow is sitting. Levels of ""noise"" for each crow are given and you are searching for pattern in distribution. + +***There are 5 possibilities:*** + + 1. **""Birds of a feather flock together.""** The more similar crows are, the + smaller is geographical distance between them **(single cluster)**. + + 2. **""Birds of a feather flock together.""** Also the more similar crows + are, the smaller is geographical distance between them **(multiple + clusters)** but, one cluster of noisy crows has no knowledge about the + existence of second cluster (otherwise they would fuse into a one big + cluster). + + 3. **""Monotonical trend occur""**. + + 4. **""Opposites attract each other""**. Similar crows can not withstand each + other. + + 5. **""Random pattern""** - the level of noise has no significant effect on + spatial distribution. + +Under each case are given plot of points from which the Mantel test computes correlation (it is no surprise that its results are non-significant, I would never try to find linear association among such patterns of points). + +![enter image description here][1] + + + [1]: https://i.stack.imgur.com/TWQqa.png + +**Why scientists do not use Moran's I instead? Is there some hidden reason I do not see? +And if there is such reason,how can I know (how different the hypotheses must be constructed) to appropriately use Mantel's or Moran's I test? A real life example will be helpful.** + + +P.S. if you google each tests, in these examples are both used on the exact same data +and exact the same hypothesis...**not very helpful** + +Mantel test: http://www.ats.ucla.edu/stat/r/faq/mantel_test.htm + +Moran's I: http://www.ats.ucla.edu/stat/r/faq/morans_i.htm + + +",added 8 characters in body,2013-10-14 06:56:42.453 +186186,57420,306.0,5,,CC BY-SA 3.0,7ba1fcea-62cb-4447-b98e-500632d6aeef,"First things first. There needs to be greater information given as this does not have a universally correct answer. Different types of distributions have to be looked at with different types of procedures. + +But just to show that yes this is possible, we assume that each of the variables that you have mentioned are normally distributed but the parameters of the normal distributions are different from each other for any given pair. + +Now we take n samples each of these variables. Then calculate the correlation coefficients for each pair of the variables. If we cannot reject the hypothesis of these correlation coefficients being zero, we hypothesize that the variables are independent of each other. 
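+
+A toy illustration of that procedure (the sample size and the distribution parameters below are arbitrary):
+
+    set.seed(1)
+    n <- 200
+    # three variables generated independently, each with a different normal distribution
+    x1 <- rnorm(n, mean = 0,  sd = 1)
+    x2 <- rnorm(n, mean = 5,  sd = 2)
+    x3 <- rnorm(n, mean = -3, sd = 0.5)
+    # pairwise tests of zero correlation
+    cor.test(x1, x2); cor.test(x1, x3); cor.test(x2, x3)
+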
So we have a set of variables which are independent from each other, but they have different probability distributions.",deleted 109 characters in body,2013-10-14 07:13:02.163 +186187,57413,1406.0,4,,CC BY-SA 3.0,51b68dd9-d950-4e19-b0aa-279f9c39c540,Time persistence in panel data,edited title,2013-10-14 07:21:24.520 +186188,57424,1406.0,2,,CC BY-SA 3.0,7e6a7696-5fd7-41ff-aa97-c4831b878996,"Time series issues, such as unit roots, etc in panel data can be accounted for when there is enough time series dimension for single unit regression estimation. This means at least 30 observations. If you have less, you can only use ideas from time series regressions, such as doing regression on growth rates instead of levels, etc. + +In fact J. Wooldridge in his book ""Econometric Analysis of Cross Section and Panel Data"" recommends to treat all the time series issues as a question of covariance matrix of the unit error term. Translated to Stata parlance, use cluster-robust standard errors for your analysis and you should be ok, with the usual caveat that there are no magical fixes in modelling, i.e. if your model is not sound, no fancy estimation method is going to help you.",,2013-10-14 07:37:48.140 +186216,57431,12683.0,5,,CC BY-SA 3.0,a72964dc-3626-4a94-a8a2-d1f335bebd3b,"Statistics doesn't give a special meaning to 'measurement' in the way it does to 'estimate'. (As @Glen said, we 'estimate parameters'.) So it's going to depend on your area of application and on what $O$ and $\theta$ represent. + +If the variance $\sigma^2$ describes the measurement error of some instrument or procedure, and $\theta$ is some property considered rather inherent to the thing being measured, it's natural to talk about 'measuring $\theta$', and about the $O$s as 'measurements of $\theta$'. E.g. the $O$s are several measurements of the length $\theta$ of a steel shaft. + +If the variance $\sigma^2$ describes the variability of different individuals, and $\theta$ is some feature of the population considered rather contingent, it's not so natural to talk about 'measuring $\theta$'. E.g. the $O$s are single measurements of the lengths of each steel shaft from a batch, rather than measurements of the average length $\theta$ of a shaft in the batch . + +In any case 'measuring an observation' is oddly worded; 'making an observation' is usual. ",deleted 1 characters in body,2013-10-14 09:23:44.217 +186189,57394,1406.0,5,,CC BY-SA 3.0,e83e26ec-eecc-480b-a967-923e9194bd62,"I am reading an [article][1] which is trying to justify the need for causal inference in their inferential framework. The thought experiment is as follows: + +> Suppose a statistician is asked to design a model for a simple time +> series $X_1,X_2,X_3,...$ and she decides to use a Bayesian method. +> Assume she collects a first observation $X_1 = x_1$. She computes the +> posterior probability density function (pdf) over the parameters +> $\theta$ of the model given the data using Bayes’ rule: $$p(\theta|X_1 += x_1) = \int\frac{p(X_1 = x_1|\theta)p(\theta)}{p(X_1 = x_1|\theta')p(\theta')}, $$ +> +> where $p(X_1 = x_1|θ)$ is the likelihood of $x_1$ given $\theta$ and +> p($\theta$) is the prior pdf of $\theta$. She can use the model to +> predict the next observation by drawing a sample $x_2$ from the +> predictive pdf: $$p(X_2 = x_2|X_1 = x_1) = \int p(X_2 = x_2|X_1 = + x_1,\theta)p(\theta|X_1 = x_1)d\theta,$$ +> +> where $p(X_2 = x_2|X_1 = x_1,\theta)$ is the likelihood of $x_2$ given +> $x_1$ and $\theta$. 
Note that $x_2$ is not drawn from $p(X_2 = x_2|X_1 +> = x_1, \theta)$. She understands that the nature of $x_2$ is very different from $x_1$: while $x_1$ is informative and does change the +> belief state of the Bayesian model, $x_2$ is non-informative and thus +> is a reflection of the model’s belief state. Hence, she would never +> use $x_2$ to further condition the Bayesian model. Mathematically, she +> seems to imply that: +> $$p(\theta|X_1 =x_1,X_2 =x_2)=p(\theta|X_1 =x_1)$$ + +However I hardly believe that what this poor statistician should imply is: +$$p(\theta|X_1 =x_1,\text{do}(X_2 =x_2))=p(\theta|X_1 =x_1)$$ +Where ""do(or set)"" here comes from [Pearl][2]'s framework of causality which can be found [here][3] and [here][4]. +Now am I right about this? + + + [1]: http://www.aaai.org/Papers/JAIR/Vol38/JAIR-3812.pdf + [2]: http://bayes.cs.ucla.edu/jp_home.html + [3]: ftp://ftp.cs.ucla.edu/pub/stat_ser/r350.pdf + [4]: http://www.amazon.com/Causality-Reasoning-Inference-Judea-Pearl/dp/052189560X/ref=dp_ob_title_bk",added 2 characters in body,2013-10-14 07:40:23.930 +186192,57425,11117.0,3,,CC BY-SA 3.0,1a93c317-2f29-4436-926e-084ec8b0242e,,,2013-10-14 07:44:40.060 +186190,57425,11117.0,2,,CC BY-SA 3.0,1a93c317-2f29-4436-926e-084ec8b0242e,"Consider that $\theta$ is an hidden parameter and one has an observation such that $O$: +$$ +O \sim N(\theta,\sigma^2). +$$ +My question concerning vocabulary: + +do we measure $\theta$ and it gives us $O$? (so we measure the true value) + +or + +do we measure $O$ ? (so we measure the observation) + +I am looking for strong sources.",,2013-10-14 07:44:40.060 +186191,57425,11117.0,1,,CC BY-SA 3.0,1a93c317-2f29-4436-926e-084ec8b0242e,Vocabulary: do we measure actual values or observations?,,2013-10-14 07:44:40.060 +186195,57426,20470.0,3,,CC BY-SA 3.0,e8f487bd-71b5-48c1-8765-0c4484ff0625,,,2013-10-14 07:49:16.613 +186194,57426,20470.0,1,,CC BY-SA 3.0,e8f487bd-71b5-48c1-8765-0c4484ff0625,Hidden Markov Model for Predicting Event Occurrence,,2013-10-14 07:49:16.613 +186193,57426,20470.0,2,,CC BY-SA 3.0,e8f487bd-71b5-48c1-8765-0c4484ff0625,"I have a set of `108,000` observations (taken over the course of 100 days) and approximately `2000` event occurrences throughout the whole observation time-span. The data looks something like the figure below where the red columns represent event times, also indicated as `tE`s: + +![enter image description here][1] + +As shown with red rectangles in the figure, I have dissected {`tE` to `tE-5`} for each event, effectively treating these as ""pre-event windows"". + +**HMM Training:** I plan to [train][2] a Hidden Markov Model (HMM) based on all ""pre-event windows"", using the multiple observation sequences methodology as suggested by Rabiner on Pg. 273 of this [paper][3]. Then, hopefully I will have an HMM that captures the sequence patterns which lead to an event. + +**HMM Prediction:** Then I plan to use this HMM to [predict][4] `P(Observations|HMM)` on a new day, where `Observations` will be a sliding window vector, which I will update to contain the observations between the current time `t` and `t-5` as the day goes on. + +I expect to see `P(Observations|HMM)` increase for `Observations` that resemble the ""pre-event windows"". This would in effect allow me to predict the events before they happen. 
+ +**Question**: *Does the plan below sound like a plausible implementation of a Hidden Markov Model?* + + + + + + [1]: https://i.stack.imgur.com/QkIn0.png + [2]: http://en.wikipedia.org/wiki/Baum%E2%80%93Welch_algorithm + [3]: http://www.cs.cornell.edu/Courses/cs4758/2012sp/materials/hmm_paper_rabiner.pdf + [4]: http://en.wikipedia.org/wiki/Forward_algorithm",,2013-10-14 07:49:16.613 +186196,57425,11117.0,5,,CC BY-SA 3.0,d0f192e9-7cde-4e01-9f28-57737f56e217,"Consider that $\theta$ is an hidden parameter and one has an observation such that $O$: +$$ +O \sim N(\theta,\sigma^2). +$$ +My question concerning vocabulary: + +do we measure $\theta$ and it gives us $O$? (so we measure the true value) + +or + +do we measure $O$ ? (so we measure the observation) + +I am looking for unquestionable sources.",added 8 characters in body,2013-10-14 07:54:35.030 +186197,57427,20473.0,2,,CC BY-SA 3.0,4c8804a1-b223-4df3-b431-66cc93e17195,"Using the **chain rule**, the joint density here can be decomposed as (denoting $\mathbf X$ the collection of the $n+1$ random variables) + +$$f_{\mathbf X}(x_n,x_{n-1},...,x_0) = f(x_n\mid x_{n-1},...,x_0)\cdot f(x_{n-1}\mid x_{n-2},...,x_0)\cdot f(x_{n-2}\mid x_{n-3},...,x_0) \cdot...\cdot f(x_0)$$ + + + +$$=\left(\prod_{i=1}^{n}\frac {1}{\sqrt{2\pi}}\exp\left\{-\frac {(x_i-\alpha x_{i-1})^2}{2}\right\}\right)\frac {1}{\sqrt{2\pi}}\exp\left\{-\frac {x_0^2}{2}\right\}$$ + +Viewed as a likelihood function of $\alpha$, and taking its natural logarithm, we have + +$$\ln L(\alpha \mid \mathbf X) = -\frac 12\sum_{i=1}^n (x_i-\alpha x_{i-1})^2 +c$$ + +...where in $c$ is also included the density of $x_0$ (but $x_0$ affects estimation of $\alpha$ through its presence in the conditional density related to $X_1$). + +Then + +$$\frac {\partial \ln L(\alpha \mid \mathbf X)}{\partial \alpha} = \frac {\partial }{\partial \alpha} \left(-\frac 12\sum_{i=1}^n (x_i-\alpha x_{i-1})^2\right)$$ + +$$=-\frac 12\frac {\partial }{\partial \alpha} \left(\sum_{i=1}^n (x_i^2-2\alpha x_ix_{i-1}+\alpha^2x_{i-1}^2)\right) $$ + +$$=-\frac 12\frac {\partial }{\partial \alpha} \left(\sum_{i=1}^n x_i^2-2\alpha \sum_{i=1}^nx_ix_{i-1}+\alpha^2\sum_{i=1}^nx_{i-1}^2)\right) $$ + +$$=\sum_{i=1}^n x_ix_{i-1} -\alpha\sum_{i=1}^nx_{i-1}^2$$ + +Setting + +$$\frac {\partial \ln L(\alpha \mid \mathbf X)}{\partial \alpha} =0\Rightarrow \hat \alpha_{ML} = \frac {\sum_{i=1}^n x_ix_{i-1}}{\sum_{i=1}^nx_{i-1}^2}$$ + +while $$\frac {\partial^2 \ln L(\alpha \mid \mathbf X)}{\partial \alpha^2} = -\sum_{i=1}^nx_{i-1}^2 <0$$ + +which guarantees a global and unique maximum, since it is negative irrespective of $\alpha$. + + +",,2013-10-14 08:16:58.337 +186198,57287,21624.0,5,,CC BY-SA 3.0,555137da-1248-44f8-b27a-0c20a1c5f421,"I am using bootstrap for my simulation. + +The number of the population is flexible for each case, and the sample size is decided by a certain percentage. For example, I have a 10,000 population, and I decide to use 10% for each iteration of bootstrap, so the sample size is 1,000. + +In practice, I found it is hard to decide how many times to run the bootstrap is enough. With less simulation, the results appear insufficiant, while with a large number of simulation they are purely redundant. 
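+
+(A toy sketch of the kind of stopping check I have in mind, watching the Monte Carlo error of the estimate stabilise as iterations accumulate; the population and the statistic here are made up:)
+
+    set.seed(1)
+    pop <- rnorm(10000)       # placeholder population
+    B <- 5000
+    est <- replicate(B, mean(sample(pop, 1000, replace = TRUE)))
+    # running Monte Carlo standard error of the bootstrap estimate
+    idx <- seq(100, B, by = 100)
+    running.se <- sapply(idx, function(b) sd(est[1:b]) / sqrt(b))
+    plot(idx, running.se, type = ""l"")
+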
+ +May I know if there is a method that can help me to decide the number of iterations to run?",added 7 characters in body,2013-10-14 08:16:58.780 +186217,57426,,4,user88,CC BY-SA 3.0,262b3cb7-c2aa-421d-bb15-ad2eab34a1bd,Hidden Markov model for predicting event occurrence,edited title,2013-10-14 09:24:06.377 +186219,57428,,4,user88,CC BY-SA 3.0,83a273f9-8dd7-4fab-aa03-d24c7a7844f0,How to discretise continuous attributes while implementing the ID3 algorithm?,deleted 3 characters in body; edited title,2013-10-14 09:25:05.803 +186235,57435,22677.0,3,,CC BY-SA 3.0,91dad556-62a6-4f5e-8e17-f1484f4ffe6f,,,2013-10-14 13:34:50.787 +186199,57426,20470.0,5,,CC BY-SA 3.0,18fd3bbb-aaf5-45aa-8c0a-634fbd2dfab0,"I have a set of `108,000` observations (taken over the course of 100 days) and approximately `2000` event occurrences throughout the whole observation time-span. The data looks something like the figure below where the red columns represent event times, also indicated as $t_E$'s: + +![enter image description here][1] + +As shown with red rectangles in the figure, I have dissected {$t_E$ to $t_{E-5}$} for each event, effectively treating these as ""pre-event windows"". + +**HMM Training:** I plan to [train][2] a Hidden Markov Model (HMM) based on all ""pre-event windows"", using the multiple observation sequences methodology as suggested by Rabiner on Pg. 273 of this [paper][3]. Then, hopefully I will have an HMM that captures the sequence patterns which lead to an event. + +**HMM Prediction:** Then I plan to use this HMM to [predict][4] $P(Observations|HMM)$ on a new day, where $Observations$ will be a sliding window vector, which I will update to contain the observations between the current time $t$ and $t-5$ as the day goes on. + +I expect to see $P(Observations|HMM)$ increase for `Observations` that resemble the ""pre-event windows"". This would in effect allow me to predict the events before they happen. + +**Question**: *Does the plan below sound like a plausible implementation of a Hidden Markov Model?* + + + + + + [1]: https://i.stack.imgur.com/QkIn0.png + [2]: http://en.wikipedia.org/wiki/Baum%E2%80%93Welch_algorithm + [3]: http://www.cs.cornell.edu/Courses/cs4758/2012sp/materials/hmm_paper_rabiner.pdf + [4]: http://en.wikipedia.org/wiki/Forward_algorithm",added 6 characters in body,2013-10-14 08:18:53.413 +186200,57426,20470.0,5,,CC BY-SA 3.0,79e07fc6-349d-4511-8c49-e0b64d399d6c,"I have a set of `108,000` observations (taken over the course of 100 days) and approximately `2000` event occurrences throughout the whole observation time-span. The data looks something like the figure below where the red columns highlight event times, i.e. $t_E$'s: + +![enter image description here][1] + +As shown with red rectangles in the figure, I have dissected {$t_E$ to $t_{E-5}$} for each event, effectively treating these as ""pre-event windows"". + +**HMM Training:** I plan to [train][2] a Hidden Markov Model (HMM) based on all ""pre-event windows"", using the multiple observation sequences methodology as suggested by Rabiner on Pg. 273 of this [paper][3]. Then, hopefully I will have an HMM that captures the sequence patterns which lead to an event. + +**HMM Prediction:** Then I plan to use this HMM to [predict][4] $P(Observations|HMM)$ on a new day, where $Observations$ will be a sliding window vector, which I will update to contain the observations between the current time $t$ and $t-5$ as the day goes on. 
+ +I expect to see $P(Observations|HMM)$ increase for $Observations$ that resemble the ""pre-event windows"". This would in effect allow me to predict the events before they happen. + +**Question**: *Does this sound like a plausible implementation of a Hidden Markov Model?* + + + + + + [1]: https://i.stack.imgur.com/QkIn0.png + [2]: http://en.wikipedia.org/wiki/Baum%E2%80%93Welch_algorithm + [3]: http://www.cs.cornell.edu/Courses/cs4758/2012sp/materials/hmm_paper_rabiner.pdf + [4]: http://en.wikipedia.org/wiki/Forward_algorithm",deleted 14 characters in body,2013-10-14 08:24:37.727 +186203,57428,22629.0,1,,CC BY-SA 3.0,350589a0-ccf5-4589-bf50-737effc43f8b,How to discrete the continuous attributes for implementing ID3 Algorithm,,2013-10-14 08:33:17.943 +186202,57428,22629.0,3,,CC BY-SA 3.0,350589a0-ccf5-4589-bf50-737effc43f8b,,,2013-10-14 08:33:17.943 +186201,57428,22629.0,2,,CC BY-SA 3.0,350589a0-ccf5-4589-bf50-737effc43f8b,"I am trying to implement the ID3 algorithm for a data set . However , all the attributes are continuous and can have values between 1-10. I have searched on this and found that we have to find the threshold for discretization but couldn't understand on how to do this exactly . Can some one explain on how to do this .? The dataset I am using is Breast Cancer Data from Wisconsin hospitals(http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data)",,2013-10-14 08:33:17.943 +186206,57429,20740.0,3,,CC BY-SA 3.0,464c6a0b-e9e9-4e8c-b199-a0127f74d019,,,2013-10-14 08:35:58.630 +186205,57429,20740.0,2,,CC BY-SA 3.0,464c6a0b-e9e9-4e8c-b199-a0127f74d019," + +I am looking to do a linear regression on two independent variables that will be present in varying proportions. + +For example trying to do a linear regression on $Y$ which is payment behavior (payback rate) of customers based on the the quality (let's say Gini coefficient) of the new and existing customer credit scores ($X_1$ and $X_2$, respectively) adjusted for the proportion of new and existing customers in the sample. + +Existing customers will be present in proportion $p$ and new customers in proportion $1-p = q$. + +In general more new customers, $q$, has a negative effect. Better scoring ($X_1, X_2$) and more existing customers p have a positive effect. + +What is a good way to model this? + +Would something like the following be a good solution trying to use $p$ and $q$ as some sort of interaction effect? + +$Y = X_1+X_2+\frac{X_1}{q}+X_2 p$ + +Would it be better to include p and q as variables themselves as well? +",,2013-10-14 08:35:58.630 +186204,57429,20740.0,1,,CC BY-SA 3.0,464c6a0b-e9e9-4e8c-b199-a0127f74d019,Linear regression with independent variables with varying proportions,,2013-10-14 08:35:58.630 +186207,57381,8074.0,5,,CC BY-SA 3.0,dfc399a8-a6a3-4ab5-abc4-23efce60346b,"I think there are a few options for showing this type of data: + +The first option would be to conduct an ""Empirical Orthogonal Functions Analysis"" (EOF) (also referred to as ""Principal Component Analysis"" (PCA) in non-climate circles). For your case, this should be conducted on a correlation matrix of your data locations. For example, your data matrix `dat` would be your spatial locations in the column dimension, and the measured parameter in the rows; So, your data matrix will consist of time series for each location. 
The `prcomp()` function will allow you to obtain the principal components, or dominant modes of correlation, relating to this field: + + res <- prcomp(dat, retx = TRUE, center = TRUE, scale = TRUE) # center and scale should be ""TRUE"" for an analysis of dominant correlation modes) + #res$x and res$rotation will contain the PC modes in the temporal and spatial dimension, respectively. + +The second option would be to create maps that show correlation relative to an individual location of interest: + + C <- cor(dat) + #C[,n] would be the correlation values between the nth location (e.g. dat[,n]) and all other locations. + +###EDIT: additional example + +While the following example doesn't use gappy data, you could apply the same analysis to a data field following interpolation with DINEOF (see previous link for R script). The example uses monthly anomaly sea level pressure data from the following data set (http://www.esrl.noaa.gov/psd/gcos_wgsp/Gridded/data.hadslp2.html): + + #dropbox link: + #https://dl.dropboxusercontent.com/u/52403158/slp_sm_grd.csv + #https://dl.dropboxusercontent.com/u/52403158/slp_sm.csv + + slp <- read.csv(""slp_sm.csv"", row.names=1) + grd <- read.csv(""slp_sm_grd.csv"", row.names=1) + + slp <- as.matrix(slp) + time <- as.POSIXct(rownames(slp)) + + ###EOF SLP anom + + #centered and scaled data + slp.sc <- scale(slp, center=TRUE, scale=TRUE) + + #correlation matrix + COR <- cov(slp.sc) + + # Decompose matrix using eigen() to derive PC loadings + E <- eigen(COR) + E$vectors # loadings + E$values # lambda values + expl.var <- E$values / sum(E$values) # explained variance + cum.expl.var <- cumsum(expl.var) # cumulative explained variance + plot(cum.expl.var) + + # Project data on loadings to derive new coordinates (principal components) + A <- slp.sc %*% E$vectors + +###Create maps of EOF modes + + + ###Map the leading EOF mode + + #make interpolation + require(akima) + require(maps) + + eof.num <- 1 + F1 <- interp(x=grd$lon, y=grd$lat, z=E$vectors[,eof.num]) # interpolated spatial EOF mode + + + png(paste0(""EOF_mode"", eof.num, "".png""), width=7, height=6, units=""in"", res=400) + #x11(width=7, height=5) + + par(ps=10) #settings before layout + layout(matrix(c(1,2), nrow=2, ncol=1, byrow=TRUE), heights=c(4,2), widths=7) + #layout.show(2) # run to see layout; comment out to prevent plotting during .pdf + par(cex=1) # layout has the tendency change par()$cex, so this step is important for control + + par(mar=c(4,4,1,1)) # I usually set my margins before each plot + pal <- colorRampPalette(c(""blue"", ""cyan"", ""yellow"", ""red"")) + image(F1, col=pal(100)) + map(""world"", add=TRUE, lwd=2) + contour(F1, add=TRUE, col=""white"") + box() + + par(mar=c(4,4,1,1)) # I usually set my margins before each plot + plot(time, A[,eof.num], t=""l"", lwd=1, ylab="""", xlab="""") + mtext(paste0(""EOF "", eof.num, "" [expl.var = "", round(expl.var[eof.num]*100), ""%]""), side=3, line=1) + + dev.off() # closes device + +![enter image description here][1] + +###Create correlation map + + ##Correlation to target location + loc <- c(-90, 0) + target <- which(grd$lon==loc[1] & grd$lat==loc[2]) + + F1 <- interp(x=grd$lon, y=grd$lat, z=COR[,target]) # interpolated spatial EOF mode + + + png(paste0(""Correlation_map"", ""_lon"", loc[1], ""_lat"", loc[2], "".png""), width=7, height=5, units=""in"", res=400) + #x11(width=7, height=5) + + par(ps=10) #settings before layout + layout(matrix(c(1,2), nrow=2, ncol=1, byrow=TRUE), heights=c(4,1), widths=7) + #layout.show(2) # run to see layout; 
comment out to prevent plotting during .pdf + par(cex=1) # layout has the tendency change par()$cex, so this step is important for control + + par(mar=c(4,4,1,1)) # I usually set my margins before each plot + pal <- colorRampPalette(c(""blue"", ""cyan"", ""yellow"", ""red"", ""yellow"", ""cyan"", ""blue"")) + #pal <- colorRampPalette(c(""purple4"", ""white"", ""blue"")) + ncolors <- 100 + breaks <- seq(-1,1,,ncolors+1) + image(F1, col=pal(ncolors), breaks=breaks) + map(""world"", add=TRUE, lwd=2) + contour(F1, add=TRUE, col=""white"") + box() + + par(mar=c(4,4,0,1)) # I usually set my margins before each plot + levs <- breaks[-1] - diff(breaks)/2 + image(x=levs, y=1, z=as.matrix(levs), col=pal(ncolors), breaks=breaks, ylab="""", xlab="""", yaxt=""n"") + mtext(""Correlation [R]"", side=1, line=2.5) + box() + + dev.off() # closes device + + +![enter image description here][2] + + + + + + + + [1]: https://i.stack.imgur.com/oo2C0.png + [2]: https://i.stack.imgur.com/fK7St.png",Additional example provided,2013-10-14 08:44:05.013 +186208,57430,16474.0,2,,CC BY-SA 3.0,b1fa5d6c-a568-498c-8bae-a37f7381e63e,"I would stick with $p$ only, as $q$ does not add any information on top of $p$. I would add interaction terms between $X_1$ and $p$ and $X_2$ and $p$ and then include the main effects of both $X_1$, $X_2$ and $p$. So: + +$Y =\beta_0 + \underbrace{\beta_1 X_1 + \beta_2 X_2 + \beta_3 p}_{\textrm{main effects}} + \underbrace{\beta_4 X_1 p + \beta_5 X_2 p}_{\textrm{interactions}} + \varepsilon$",,2013-10-14 08:46:34.843 +186209,57381,8074.0,5,,CC BY-SA 3.0,5ed1c6a3-d400-4698-8f47-bacd44a26022,"I think there are a few options for showing this type of data: + +The first option would be to conduct an ""Empirical Orthogonal Functions Analysis"" (EOF) (also referred to as ""Principal Component Analysis"" (PCA) in non-climate circles). For your case, this should be conducted on a correlation matrix of your data locations. For example, your data matrix `dat` would be your spatial locations in the column dimension, and the measured parameter in the rows; So, your data matrix will consist of time series for each location. The `prcomp()` function will allow you to obtain the principal components, or dominant modes of correlation, relating to this field: + + res <- prcomp(dat, retx = TRUE, center = TRUE, scale = TRUE) # center and scale should be ""TRUE"" for an analysis of dominant correlation modes) + #res$x and res$rotation will contain the PC modes in the temporal and spatial dimension, respectively. + +The second option would be to create maps that show correlation relative to an individual location of interest: + + C <- cor(dat) + #C[,n] would be the correlation values between the nth location (e.g. dat[,n]) and all other locations. + +###EDIT: additional example + +While the following example doesn't use gappy data, you could apply the same analysis to a data field following interpolation with DINEOF (http://menugget.blogspot.de/2012/10/dineof-data-interpolating-empirical.html). 
The example below uses a subset of monthly anomaly sea level pressure data from the following data set (http://www.esrl.noaa.gov/psd/gcos_wgsp/Gridded/data.hadslp2.html): + + #dropbox link: + #https://dl.dropboxusercontent.com/u/52403158/slp_sm_grd.csv + #https://dl.dropboxusercontent.com/u/52403158/slp_sm.csv + + slp <- read.csv(""slp_sm.csv"", row.names=1) + grd <- read.csv(""slp_sm_grd.csv"", row.names=1) + + slp <- as.matrix(slp) + time <- as.POSIXct(rownames(slp)) + + ###EOF SLP anom + + #centered and scaled data + slp.sc <- scale(slp, center=TRUE, scale=TRUE) + + #correlation matrix + COR <- cov(slp.sc) + + # Decompose matrix using eigen() to derive PC loadings + E <- eigen(COR) + E$vectors # loadings + E$values # lambda values + expl.var <- E$values / sum(E$values) # explained variance + cum.expl.var <- cumsum(expl.var) # cumulative explained variance + plot(cum.expl.var) + + # Project data on loadings to derive new coordinates (principal components) + A <- slp.sc %*% E$vectors + +###Create maps of EOF modes + + + ###Map the leading EOF mode + + #make interpolation + require(akima) + require(maps) + + eof.num <- 1 + F1 <- interp(x=grd$lon, y=grd$lat, z=E$vectors[,eof.num]) # interpolated spatial EOF mode + + + png(paste0(""EOF_mode"", eof.num, "".png""), width=7, height=6, units=""in"", res=400) + #x11(width=7, height=5) + + par(ps=10) #settings before layout + layout(matrix(c(1,2), nrow=2, ncol=1, byrow=TRUE), heights=c(4,2), widths=7) + #layout.show(2) # run to see layout; comment out to prevent plotting during .pdf + par(cex=1) # layout has the tendency change par()$cex, so this step is important for control + + par(mar=c(4,4,1,1)) # I usually set my margins before each plot + pal <- colorRampPalette(c(""blue"", ""cyan"", ""yellow"", ""red"")) + image(F1, col=pal(100)) + map(""world"", add=TRUE, lwd=2) + contour(F1, add=TRUE, col=""white"") + box() + + par(mar=c(4,4,1,1)) # I usually set my margins before each plot + plot(time, A[,eof.num], t=""l"", lwd=1, ylab="""", xlab="""") + mtext(paste0(""EOF "", eof.num, "" [expl.var = "", round(expl.var[eof.num]*100), ""%]""), side=3, line=1) + + dev.off() # closes device + +![enter image description here][1] + +###Create correlation map + + ##Correlation to target location + loc <- c(-90, 0) + target <- which(grd$lon==loc[1] & grd$lat==loc[2]) + + F1 <- interp(x=grd$lon, y=grd$lat, z=COR[,target]) # interpolated spatial EOF mode + + + png(paste0(""Correlation_map"", ""_lon"", loc[1], ""_lat"", loc[2], "".png""), width=7, height=5, units=""in"", res=400) + #x11(width=7, height=5) + + par(ps=10) #settings before layout + layout(matrix(c(1,2), nrow=2, ncol=1, byrow=TRUE), heights=c(4,1), widths=7) + #layout.show(2) # run to see layout; comment out to prevent plotting during .pdf + par(cex=1) # layout has the tendency change par()$cex, so this step is important for control + + par(mar=c(4,4,1,1)) # I usually set my margins before each plot + pal <- colorRampPalette(c(""blue"", ""cyan"", ""yellow"", ""red"", ""yellow"", ""cyan"", ""blue"")) + #pal <- colorRampPalette(c(""purple4"", ""white"", ""blue"")) + ncolors <- 100 + breaks <- seq(-1,1,,ncolors+1) + image(F1, col=pal(ncolors), breaks=breaks) + map(""world"", add=TRUE, lwd=2) + contour(F1, add=TRUE, col=""white"") + box() + + par(mar=c(4,4,0,1)) # I usually set my margins before each plot + levs <- breaks[-1] - diff(breaks)/2 + image(x=levs, y=1, z=as.matrix(levs), col=pal(ncolors), breaks=breaks, ylab="""", xlab="""", yaxt=""n"") + mtext(""Correlation [R]"", side=1, line=2.5) + box() + + 
dev.off() # closes device + + +![enter image description here][2] + + + + + + + + [1]: https://i.stack.imgur.com/oo2C0.png + [2]: https://i.stack.imgur.com/fK7St.png",added 64 characters in body,2013-10-14 08:51:04.077 +186210,57431,12683.0,2,,CC BY-SA 3.0,a3f5abf1-75f0-4699-81e3-9214f3aee652,"Statistics doesn't give a special meaning to 'measurement' in the way it does to 'estimate'. (As @Glen said, we 'estimate parameters'.) So it's going to depend on your area of application and on what $O$ and $\theta$ represent. + +If the variance $\sigma^2$ describes the measurement error of some instrument or procedure, and $\theta$ is some property considered rather inherent to the thing being measured, it's natural to talk about 'measuring $\theta$', and about the $O$s as 'measurements of $\theta$'. E.g. the $O$s are several measurements of the length $\theta$ of a steel shaft. + +If the variance $\sigma^2$ describes the variability of different individuals, and $\theta$ is some feature of the population considered rather contingent, it's not so natural to talk about 'measuring $\theta$'. E.g. the $O$s are several measurements of the lengths of each steel shaft from a batch, rather than measurements of the average length $\theta$ of a shaft in the batch . + +In any case 'measuring an observation' is oddly worded; 'making an observation' is usual. ",,2013-10-14 09:18:20.040 +186211,57403,20062.0,5,,CC BY-SA 3.0,dffd28bc-e1d2-4308-aefc-0fe9ea06d46c,"**Mantel's test widely occur in biological studies** in which is used to +examine the correlation between spatial distribution of animals (position in space) with for example their genetic relatedness, rate of aggression or some other attribute. Plenty of good journals are using it (*PNAS, Animal Behaviour, Molecular Ecology...*) + +Recently I fabricated some patterns which may occur in nature, but Mantel's test **seems to be quite useless.** + + +---------- + + +***Imaginary situation:*** Suppose you have orchard (17 x 17 trees) and on each tree a crow is sitting. Levels of ""noise"" for each crow are given and you are searching for pattern in distribution. + +***There are 5 possibilities:*** + + 1. **""Birds of a feather flock together.""** The more similar crows are, the + smaller is geographical distance between them **(single cluster)**. + + 2. **""Birds of a feather flock together.""** Also the more similar crows + are, the smaller is geographical distance between them **(multiple + clusters)** but, one cluster of noisy crows has no knowledge about the + existence of second cluster (otherwise they would fuse into a one big + cluster). + + 3. **""Monotonical trend occur""**. + + 4. **""Opposites attract each other""**. Similar crows can not withstand each + other. + + 5. **""Random pattern""** - the level of noise has no significant effect on + spatial distribution. + +Under each case are given plot of points from which the Mantel test computes correlation (it is no surprise that its results are non-significant, I would never try to find linear association among such patterns of points). + +![enter image description here][1] + + + [1]: https://i.stack.imgur.com/TWQqa.png + +**Why scientists do not use Moran's I instead? Is there some hidden reason I do not see? +And if there is such reason,how can I know (how different the hypotheses must be constructed) to appropriately use Mantel's or Moran's I test? 
A real life example will be helpful.** + + +---------- + + +**Example data:** *(compressed as possible)* + + r.gen<-seq(-100,100,5) + r.val<-sample(r.gen, 289, replace=TRUE) + z10<-rep(0, times=10) + z11<-rep(0, times=11) + r5<-c(5,15,25,15,5) + r71<-c(5,20,40,50,40,20,5) + r72<-c(15,40,60,75,60,40,15) + r73<-c(25,50,75,100,75,50,25) + rbPal <- colorRampPalette(c(""blue"",""red"")) + my.data<-data.frame(x = rep(1:17, times=17),y = rep(1:17, each=17), + c1 = c(rep(0,times=155),r5,z11,r71,z10,r72,z10,r73,z10,r72,z10,r71, + z11,r5,rep(0, times=27)),c2 = c(rep(0,times=19),r5,z11,r71,z10,r72, + z10,r73,z10,r72,z10,r71,z11,r5,rep(0, times=29),r5,z11,r71,z10,r72, + z10,r73,z10,r72,z10,r71,z11,r5,rep(0, times=27)),c3 = c(seq(20,100,5), + seq(15,95,5),seq(10,90,5),seq(5,85,5),seq(0,80,5),seq(-5,75,5), + seq(-10,70,5),seq(-15,65,5),seq(-20,60,5),seq(-25,55,5),seq(-30,50,5), + seq(-35,45,5),seq(-40,40,5),seq(-45,35,5),seq(-50,30,5),seq(-55,25,5), + seq(-60,20,5)),c4 = rep(c(0,100), length=289),c5 = sample(r.gen, 289, + replace=TRUE)) + + # adding colors + my.data$Col1 <- rbPal(10)[as.numeric(cut(my.data$c1,breaks = 10))] + my.data$Col2 <- rbPal(10)[as.numeric(cut(my.data$c2,breaks = 10))] + my.data$Col3 <- rbPal(10)[as.numeric(cut(my.data$c3,breaks = 10))] + my.data$Col4 <- rbPal(10)[as.numeric(cut(my.data$c4,breaks = 10))] + my.data$Col5 <- rbPal(10)[as.numeric(cut(my.data$c5,breaks = 10))] + +Creating matrix of geographical distances (for Moran's I is inversed): + + point.dists <- dist(cbind(my.data$x,my.data$y)) + point.dists.inv <- 1/point.dists + point.dists.inv<-as.matrix(point.dists.inv) + diag(point.dists.inv) <- 0 + +Plot creation: + + X11(width=12, height=6) + par(mfrow=c(2,5)) + par(mar=c(1,1,1,1)) + + library(ape) + for (i in 3:7) { + my.res<-mantel.test(as.matrix(dist(my.data[ ,i])), as.matrix(point.dists)) + plot(my.data$x,my.data$y,pch=20,col=my.data[ ,c(i+5)], cex=2.5, xlab="""", + ylab="""", xaxt=""n"", yaxt=""n"", ylim=c(-4.5,17)) + text(4.5, -2.25, paste(""Mantel's test"", ""\n z.stat ="", round(my.res$z.stat, + 2), ""\n p.value ="", round(my.res$p, 3))) + my.res<-Moran.I(my.data[ ,i], point.dists.inv) + text(12.5, -2.25, paste(""Moran's I"", ""\n observed ="", round(my.res$observed, + 3), ""\n expected ="",round(my.res$expected,3), ""\n std.dev ="", + round(my.res$sd,3), ""\n p.value ="", round(my.res$p.value, 3))) + } + + par(mar=c(5,4,4,2)+0.1) + + for (i in 3:7) { + plot(dist(my.data[ ,i]), point.dists,pch = 20, xlab=""geographical distance"", + ylab=""behavioural distance"") + } + +P.S. if you google each tests, in these examples are both used on the exact same data +and exact the same hypothesis...**not very helpful** + +Mantel test: http://www.ats.ucla.edu/stat/r/faq/mantel_test.htm + +Moran's I: http://www.ats.ucla.edu/stat/r/faq/morans_i.htm",deleted 6 characters in body,2013-10-14 09:18:28.737 +186221,57432,12683.0,2,,CC BY-SA 3.0,ab4205ad-e5bf-4fa6-a338-767f22e3bf63,"In regression analysis each response $Y_i$ is modelled conditional on the observed predictor value $x_i$; as (with a normal distribution of errors) $Y_i\sim\mathcal{N}(\beta_0+\beta_1 x,\sigma^2)$ where $\beta_0$ and $\beta_1$ are the intercept & slope coefficients respectively, and $\sigma^2$ is the common error variance. Just as if the $x_i$s had been set by an experimenter rather than themselves sampled. 
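+
+A small simulated illustration of that point, with arbitrary numbers: the predictor can have any distribution at all, and the conditional model is still the one being fitted.
+
+    set.seed(1)
+    x <- rexp(300)                      # markedly non-normal predictor
+    y <- 1 + 2 * x + rnorm(300, sd = 0.5)
+    fit <- lm(y ~ x)
+    # the residuals look normal even though the marginal distribution of y does not
+    hist(residuals(fit)); hist(y)
+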
The marginal distribution of the $Y_i$s is not necessarily thought about at all.",,2013-10-14 11:06:43.030 +186222,57432,12683.0,5,,CC BY-SA 3.0,1716269b-37ec-4ab7-a62c-5813ac6ee135,"In regression analysis each response $Y_i$ is modelled conditional on the observed predictor value $x_i$; as (with a normal distribution of errors) $Y_i\sim\mathcal{N}(\beta_0+\beta_1 x_i,\sigma^2)$ where $\beta_0$ and $\beta_1$ are the intercept & slope coefficients respectively, and $\sigma^2$ is the common error variance. Just as if the $x_i$s had been set by an experimenter rather than themselves sampled. The marginal distribution of the $Y_i$s is not necessarily thought about at all.",added 2 characters in body,2013-10-14 11:25:59.807 +186224,57421,22669.0,5,,CC BY-SA 3.0,76777c45-044d-4adc-989c-bc08337d5dd5,"I've just started learning time series so please excuse me if it's painfully obvious; I haven't managed to find the answer elsewhere. + +I have a data series showing a pretty obvious trend although it's quite noisy. I can take pretty much any division of the data and run classical tests to show a highly significant difference in means. + +I decided to have a look at time series analysis to see if it could help describe the trend. An ARIMA(0,1,1) model comes out with AIC,BIC=34.3,37.3 (Stata), whilst an ARIMA(0,1,0) model comes out with AIC,BIC=55.1,58.1 - so I understand I'm supposed to prefer the (0,1,1) model. + +However, the coefficient for the MA(1) is displaying as -0.9999997 (and not showing any p-values). If I try the same in SPSS I get an MA(1) coefficient of 1.000 (I assume SPSS uses opposite signs) with a p-value of 0.990 - does this mean it suggests I drop the term? + +My understanding is that the effect of a MA(1) coefficient of -1 is basically to remove the old error term and convert the whole series to a linear trend. Does this mean ARIMA is totally unsuitable for my needs? On the plus side it gives me a sensible value for the trend. If I use the (0,1,0) model then I still get a reasonable value for the trend but it's not significant any more. + +Thanks for your help! + +EDIT: +Thanks for looking in. The trend looks like a fairly linear decrease; the data points seen to fairly noisily rattle around above and below a trend line. The ARIMA (0,1,1) model produces something that's not far off a straight line decrease which seems sensible - the (0,1,1) produces what is essentially a lagged version of the data, translated down by one monthof trend. +The data aren't stationary (due to the trend) - though the first differences seem to be. I don't think the (0,1,1) is a bad model - I'm just a little confused by the p-value seeming to suggest I should drop the MA term - or wondering if it means I should bin ARIMA entirely!",added 658 characters in body,2013-10-14 11:40:14.750 +186225,57297,,25,,,7fbf0b61-0b7a-4a51-b47b-caf471e3048f,,http://twitter.com/#!/StackStats/status/389727656163704832,2013-10-14 12:21:28.737 +186226,57426,20470.0,5,,CC BY-SA 3.0,7edd3968-28cc-4aba-be6d-5fbae8c73ce4,"I have a set of `108,000` observations (taken over the course of 100 days) and approximately `2000` event occurrences throughout the whole observation time-span. The data looks like the figure below where there are 3 observation levels and the red columns highlight event times, i.e. $t_E$'s: + +![enter image description here][1] + +As shown with red rectangles in the figure, I have dissected {$t_E$ to $t_{E-5}$} for each event, effectively treating these as ""pre-event windows"". 
+ +**HMM Training:** I plan to [train][2] a Hidden Markov Model (HMM) based on all ""pre-event windows"", using the multiple observation sequences methodology as suggested on Pg. 273 of Rabiner's [paper][3]. Hopefully, this will allow me to train an HMM that captures the sequence patterns which lead to an event. + +**HMM Prediction:** Then I plan to use this HMM to [predict][4] $P(Observations|HMM)$ on a new day, where $Observations$ will be a sliding window vector, which I will update to contain the observations between the current time $t$ and $t-5$ as the day goes on. + +I expect to see $P(Observations|HMM)$ increase for $Observations$ that resemble the ""pre-event windows"". This would in effect allow me to predict the events before they happen. + +**Question**: *Does this sound like a plausible implementation of a Hidden Markov Model?* + + + + + + [1]: https://i.stack.imgur.com/QkIn0.png + [2]: http://en.wikipedia.org/wiki/Baum%E2%80%93Welch_algorithm + [3]: http://www.cs.cornell.edu/Courses/cs4758/2012sp/materials/hmm_paper_rabiner.pdf + [4]: http://en.wikipedia.org/wiki/Forward_algorithm",added 30 characters in body,2013-10-14 12:46:02.177 +186227,57433,,2,user12555,CC BY-SA 3.0,a35c990b-1530-4ea3-99a9-0f6822f464b0,"I have experience of deploying random forests in a SQL Server environment via `User Defined Function`. The trick is to convert the `IF-THEN ELSE` rules that you get from each tree into a `CASE-WHEN END` or any other `Conditional Processing` construct (admittedly I've used JMP Pro's Bootstrap Forest implementation - 500k lines of SQL code). + +There is absolutely no reason why this cannot be achived using the `rattle` `R` package. Have a look at `randomForest2Rules` & `printRandomForests` functions in that package. Both take `random forest` object as input and visit each tree in the forest and output a set of `IF-THEN ELSE` rules. Taking this as a starting point it should not be difficult converting this logic into your desired language. + +The above, also makes it important to decide the smallest no. of trees you need in the forest to make predictions at a desired level of accuracy (hint: plot(rf.object) shows you at what point the forest predictions do not improve despite adding more trees.) in order to keep the no. of lines to represent the forest down.",,2013-10-14 13:08:34.127 +186230,57434,1945.0,3,,CC BY-SA 3.0,983812e4-e1f1-431c-92a1-455d31585f13,,,2013-10-14 13:13:22.673 +186229,57434,1945.0,1,,CC BY-SA 3.0,983812e4-e1f1-431c-92a1-455d31585f13,How does RVM achieve sparsity?,,2013-10-14 13:13:22.673 +186228,57434,1945.0,2,,CC BY-SA 3.0,983812e4-e1f1-431c-92a1-455d31585f13,"I have read several textbook descriptions on RVM and none of them provide an adequate (plain English) explanation of how RVM achieves sparsity. + +I am left feeling like the authors left out a paragraph of text that would have connected the dots and instead decided to replace (rather than supplement) it with mathematical derivations. + +Could someone please explain the basic idea as to how RVM works in relation to learning sparse regression models?",,2013-10-14 13:13:22.673 +186231,57319,1693.0,33,,,9496dc85-ae19-4aa4-b3d1-8a038ebbea8a,,821,2013-10-14 13:17:00.657 +186232,57433,,5,user12555,CC BY-SA 3.0,a00853fa-17b9-4568-800e-a1decd2956d8,"I have experience of deploying random forests in a SQL Server environment via `User Defined Function`. 
The trick is to convert the `IF-THEN ELSE` rules that you get from each tree into a `CASE-WHEN END` or any other `Conditional Processing` construct (admittedly I've used JMP Pro's Bootstrap Forest implementation - 500k lines of SQL code). + +There is absolutely no reason why this cannot be achived using the `rattle` `R` package. Have a look at `randomForest2Rules` & `printRandomForests` functions in that package. Both take `random forest` object as input and visit each tree in the forest and output a set of `IF-THEN ELSE` rules. Taking this as a starting point it should not be difficult converting this logic into your desired language in an automated way, since the output from the above mentioned function is structured text. + +The above, also makes it important to decide the smallest no. of trees you need in the forest to make predictions at a desired level of accuracy (hint: plot(rf.object) shows you at what point the forest predictions do not improve despite adding more trees.) in order to keep the no. of lines to represent the forest down.",added 91 characters in body,2013-10-14 13:25:40.947 +186234,57435,22677.0,2,,CC BY-SA 3.0,91dad556-62a6-4f5e-8e17-f1484f4ffe6f,"I ask this question out of curiosity + +earlier today when i was trying to test for heteroscedasticity in `R`, i accidentally mistook `white.test` of `tseries` package for `white.test` of `bstat` package. + +i found out later that the former tests for **Neglected Non Linearity** while the latter tests for **Heteroscedasticity** + +now this is something new, i hadn't heard about the ***""neglected""*** part before, can someone please enlighten me about the ***""neglected""***? + +",,2013-10-14 13:34:50.787 +186233,57435,22677.0,1,,CC BY-SA 3.0,91dad556-62a6-4f5e-8e17-f1484f4ffe6f,"Can someone enlighten me on what is ""Neglected Nonlinearity""?",,2013-10-14 13:34:50.787 +186236,57436,7949.0,2,,CC BY-SA 3.0,22360406-35cc-489f-a885-6b279f0c5be0,"There is no obvious relationship between $R^2$ and reversal of the sign of a regression coefficient. Assume you have data for which the true model is for example +$$ +y_i = 0+5x_i -z_z + \epsilon_i +$$ +with $\epsilon_i \sim N(0, sd_\text{error}^2)$. I show the zero to make explicit that the intercept of the true model is zero, this is just a simplification. + +When x and y are highly correlated and centered about zero then the coefficient of z when regressing over just z will be positive instead of negative. Note that the true model coefficients do not change with $sd_\text{error}$ but you can make $R^2$ vary between zero and one by changing the magnitude of the residual error. Look for example at the following R-code: + + require(MASS) + sd.error <- 1 + x.and.z <- mvrnorm(1000, c(0,0) , matrix(c(1, 0.9,0.9,1),nrow=2)) # set correlation to 0.9 + x <- x.and.y[, 1] + z <- x.and.y[, 2] + y <- 5*x - z + rnorm(1000, 0, sd.error) # true model + modell1 <- lm(y~x+z) + modell2 <- lm(y~z) + print(summary(modell1)) # coefficient of z should be negative + print(summary(modell2)) # coefficient of z should be positive + +and play a bit with sd.error. Look for example at $sd_\text{error}=50$. + +Note that with a very large sd.error the coefficient estimation will become more unstable and the reversal might not show up every time. But that's a limitation of the sample size. + +A short summary would be that the variance of the error does not affect the expectations and thus reversal. 
Therefore neither does $R^2$.",,2013-10-14 13:45:26.510 +186237,57365,12683.0,5,,CC BY-SA 3.0,06cf0d71-63ad-44cb-b46a-d861e6de8a9d,"We have a response variable $Y$ and predictor $X$, and we draw $n$ samples $(Y_1,X_1), \ldots, (Y_n, X_n)$ from the population of interest to do a regression analysis. Under the assumptions of a simple linear regression model, my question is a conceptual one: how do we really think about the response on the $i$th unit, $Y_i$? Do we say it's drawn from the level or subpopulation of individuals with $ X = x_i $, or from the aggregate population over all the values of $X$? Moreover, while we assume that the response $Y$ in every subpopulation defined by $X$ is normal with equal variances, how do we think about the aggregate population from which $Y_i$ is drawn? ",fixed typo,2013-10-14 14:09:18.437 +186238,57436,7949.0,5,,CC BY-SA 3.0,1e8b0301-099c-495f-863c-9c56e18e8d06,"There is no obvious relationship between $R^2$ and reversal of the sign of a regression coefficient. Assume you have data for which the true model is for example +$$ +y_i = 0+5x_i -z_z + \epsilon_i +$$ +with $\epsilon_i \sim N(0, sd_\text{error}^2)$. I show the zero to make explicit that the intercept of the true model is zero, this is just a simplification. + +When x and y are highly correlated and centered about zero then the coefficient of z when regressing over just z will be positive instead of negative. Note that the true model coefficients do not change with $sd_\text{error}$ but you can make $R^2$ vary between zero and one by changing the magnitude of the residual error. Look for example at the following R-code: + + require(MASS) + sd.error <- 1 + x.and.z <- mvrnorm(1000, c(0,0) , matrix(c(1, 0.9,0.9,1),nrow=2)) # set correlation to 0.9 + x <- x.and.z[, 1] + z <- x.and.z[, 2] + y <- 5*x - z + rnorm(1000, 0, sd.error) # true model + modell1 <- lm(y~x+z) + modell2 <- lm(y~z) + print(summary(modell1)) # coefficient of z should be negative + print(summary(modell2)) # coefficient of z should be positive + +and play a bit with sd.error. Look for example at $sd_\text{error}=50$. + +Note that with a very large sd.error the coefficient estimation will become more unstable and the reversal might not show up every time. But that's a limitation of the sample size. + +A short summary would be that the variance of the error does not affect the expectations and thus reversal. Therefore neither does $R^2$.",edited body,2013-10-14 14:17:13.783 +186241,57437,10147.0,3,,CC BY-SA 3.0,87609a67-2cdd-49fe-b148-5b698893ab70,,,2013-10-14 14:23:05.487 +186240,57437,10147.0,1,,CC BY-SA 3.0,87609a67-2cdd-49fe-b148-5b698893ab70,Ordered Response Variable,,2013-10-14 14:23:05.487 +186239,57437,10147.0,2,,CC BY-SA 3.0,87609a67-2cdd-49fe-b148-5b698893ab70,"For regression with ordered response variable, there are different methods, for example, discriminant analysis, probit or logit model. I am wondering what are teh different focuces of the different methods and which one is more often used. +Thanks.",,2013-10-14 14:23:05.487 +186244,57438,13385.0,3,,CC BY-SA 3.0,dcf068ac-18ea-4455-a7a6-be4ddc3f2ecb,,,2013-10-14 14:28:38.417 +186242,57438,13385.0,1,,CC BY-SA 3.0,dcf068ac-18ea-4455-a7a6-be4ddc3f2ecb,Finding parameters to maximize expected utility of random variable,,2013-10-14 14:28:38.417 +186243,57438,13385.0,2,,CC BY-SA 3.0,dcf068ac-18ea-4455-a7a6-be4ddc3f2ecb,"I'm trying to analyze some data consisting of five randomized parameters and a utility function which indirectly depends on the parameters, by experimentation. 
That is to say, the parameters of the experiment are chosen randomly, and successes and failures are counted up. I want to find parameters for which the expected utility of successes and failures is highest. + +From my days in calculus, I can see that an algorithm could consist of: + + 1. Regression to a (hopefully analytically tractable) surface + 2. Finding a maximum + 3. Finding the pre-image of my maximum (if I use any of the C libraries I've seen, which seem to focus on the maximum value, not its pre-image) + +But I'm not sure about the ""fiddly bits"" like: + + - The distribution of points (I don't have any data yet) + - Any substantive idea of the shape of the surface, though I am expecting diminishing marginal utility, so it should be non-linear and have a bump. + - Numerical stability + +This seems like it should be straight-forward, in terms of applied decision theory. So, is my plan sensible? Any pointers to literature, algorithms, C or Haskell libraries?",,2013-10-14 14:28:38.417 +186263,57444,22262.0,1,,CC BY-SA 3.0,b60ff2ee-04b5-4d89-ac6d-e8f94c271e61,Using quantile regression to predict probability of surpassing threshold,,2013-10-14 15:45:47.440 +186338,57466,22690.0,2,,CC BY-SA 3.0,12f10fd2-c133-4190-b061-d656b66ce430,"I'm trying to understand the effects of adding non-conditionally independent features to a naive Bayes classifier. Let's say I have the features vector $X = [x_1,x_2,x_3,x_4]$ and that for each value of $x_3$ I get the same value for $x_4$: + +For all $i \in \{samples\}$, $x_{3}^{i} = x_{4}^{i}$ + +I could say that the conditionally independent assumption of $x_n$ given the class $Y = y_k$ does not hold anymore since the value of $x_{3}^{i}$ foresee $x_{4}^{i}$, and that naive Bayes classifier may not produce the expected results. I'm not really sure about that explanation and I would appreciate your point of view about it.",,2013-10-14 21:58:00.910 +186245,57426,20470.0,5,,CC BY-SA 3.0,6c318c0d-15b2-46bd-98aa-b23e9dbc815a,"**Question**: *Is the set-up below look like a sensible implementation of a Hidden Markov model?* + +I have a set of `108,000` observations (taken over the course of 100 days) and approximately `2000` event occurrences throughout the whole observation time-span. The data looks like the figure below where there are 3 observation levels and the red columns highlight event times, i.e. $t_E$'s: + +![enter image description here][1] + +As shown with red rectangles in the figure, I have dissected {$t_E$ to $t_{E-5}$} for each event, effectively treating these as ""pre-event windows"". + +**HMM Training:** I plan to [train][2] a Hidden Markov Model (HMM) based on all ""pre-event windows"", using the multiple observation sequences methodology as suggested on Pg. 273 of Rabiner's [paper][3]. Hopefully, this will allow me to train an HMM that captures the sequence patterns which lead to an event. + +**HMM Prediction:** Then I plan to use this HMM to [predict][4] $P(Observations|HMM)$ on a new day, where $Observations$ will be a sliding window vector, which I will update to contain the observations between the current time $t$ and $t-5$ as the day goes on. + +I expect to see $P(Observations|HMM)$ increase for $Observations$ that resemble the ""pre-event windows"". This would in effect allow me to predict the events before they happen. 
+ + + + + + + [1]: https://i.stack.imgur.com/QkIn0.png + [2]: http://en.wikipedia.org/wiki/Baum%E2%80%93Welch_algorithm + [3]: http://www.cs.cornell.edu/Courses/cs4758/2012sp/materials/hmm_paper_rabiner.pdf + [4]: http://en.wikipedia.org/wiki/Forward_algorithm",added 13 characters in body,2013-10-14 14:37:27.677 +186246,57439,20831.0,2,,CC BY-SA 3.0,41ebf3db-4483-4538-8eeb-0f6cc762e905,"As you say, the data is not stationary, we can find the stationary transformed data by differencing, and checked by the unit root test (e.g Augmented Dickey-Fuller test, Elliott-Rothenberg-Stock test, KPSS test, Phillips-Perron test, Schmidt-Phillips test, Zivot-Andrews test...) We can talk about ARMA model only after confirming the stationarity. + +It is a classical way to identify the ARMA(p, q) by the ACF plot and PACF plot. ARMA(0,1) and ARMA(0,0) can be told here. Another method to identify p, q is about the EACF, but it is not widely used for univariate time series. + +Empirical studies show that AIC usually tends to overfitting. The advantage of using AIC is for automatic algorithm to find the best model, but it is not usually recommended in traditional time series textbook.",,2013-10-14 14:41:48.740 +186248,57403,20062.0,5,,CC BY-SA 3.0,c81ab607-7372-4c3f-b984-d563b7758365,"**Mantel's test widely occur in biological studies** in which is used to +examine the correlation between spatial distribution of animals (position in space) with for example their genetic relatedness, rate of aggression or some other attribute. Plenty of good journals are using it (*PNAS, Animal Behaviour, Molecular Ecology...*) + +Recently I fabricated some patterns which may occur in nature, but Mantel's test **seems to be quite useless.** + + +---------- + + +***Imaginary situation:*** Suppose you have orchard (17 x 17 trees) and on each tree a crow is sitting. Levels of ""noise"" for each crow are given and you are searching for pattern in distribution. + +***There are 5 possibilities:*** + + 1. **""Birds of a feather flock together.""** The more similar crows are, the + smaller is geographical distance between them **(single cluster)**. + + 2. **""Birds of a feather flock together.""** Also the more similar crows + are, the smaller is geographical distance between them **(multiple + clusters)** but, one cluster of noisy crows has no knowledge about the + existence of second cluster (otherwise they would fuse into a one big + cluster). + + 3. **""Monotonical trend occur""**. + + 4. **""Opposites attract each other""**. Similar crows can not withstand each + other. + + 5. **""Random pattern""** - the level of noise has no significant effect on + spatial distribution. + +Under each case is given plot of points from which the Mantel test computes correlation (it is no surprise that its results are non-significant, I would never try to find linear association among such patterns of points). + +![enter image description here][1] + + + [1]: https://i.stack.imgur.com/TWQqa.png + +**Why scientists do not use Moran's I instead? Is there some hidden reason I do not see? +And if there is such reason,how can I know (how different the hypotheses must be constructed) to appropriately use Mantel's or Moran's I test? 
A real life example will be helpful.** + + +---------- + + +**Example data:** *(compressed as possible)* + + r.gen<-seq(-100,100,5) + r.val<-sample(r.gen, 289, replace=TRUE) + z10<-rep(0, times=10) + z11<-rep(0, times=11) + r5<-c(5,15,25,15,5) + r71<-c(5,20,40,50,40,20,5) + r72<-c(15,40,60,75,60,40,15) + r73<-c(25,50,75,100,75,50,25) + rbPal <- colorRampPalette(c(""blue"",""red"")) + my.data<-data.frame(x = rep(1:17, times=17),y = rep(1:17, each=17), + c1 = c(rep(0,times=155),r5,z11,r71,z10,r72,z10,r73,z10,r72,z10,r71, + z11,r5,rep(0, times=27)),c2 = c(rep(0,times=19),r5,z11,r71,z10,r72, + z10,r73,z10,r72,z10,r71,z11,r5,rep(0, times=29),r5,z11,r71,z10,r72, + z10,r73,z10,r72,z10,r71,z11,r5,rep(0, times=27)),c3 = c(seq(20,100,5), + seq(15,95,5),seq(10,90,5),seq(5,85,5),seq(0,80,5),seq(-5,75,5), + seq(-10,70,5),seq(-15,65,5),seq(-20,60,5),seq(-25,55,5),seq(-30,50,5), + seq(-35,45,5),seq(-40,40,5),seq(-45,35,5),seq(-50,30,5),seq(-55,25,5), + seq(-60,20,5)),c4 = rep(c(0,100), length=289),c5 = sample(r.gen, 289, + replace=TRUE)) + + # adding colors + my.data$Col1 <- rbPal(10)[as.numeric(cut(my.data$c1,breaks = 10))] + my.data$Col2 <- rbPal(10)[as.numeric(cut(my.data$c2,breaks = 10))] + my.data$Col3 <- rbPal(10)[as.numeric(cut(my.data$c3,breaks = 10))] + my.data$Col4 <- rbPal(10)[as.numeric(cut(my.data$c4,breaks = 10))] + my.data$Col5 <- rbPal(10)[as.numeric(cut(my.data$c5,breaks = 10))] + +Creating matrix of geographical distances (for Moran's I is inversed): + + point.dists <- dist(cbind(my.data$x,my.data$y)) + point.dists.inv <- 1/point.dists + point.dists.inv<-as.matrix(point.dists.inv) + diag(point.dists.inv) <- 0 + +Plot creation: + + X11(width=12, height=6) + par(mfrow=c(2,5)) + par(mar=c(1,1,1,1)) + + library(ape) + for (i in 3:7) { + my.res<-mantel.test(as.matrix(dist(my.data[ ,i])), as.matrix(point.dists)) + plot(my.data$x,my.data$y,pch=20,col=my.data[ ,c(i+5)], cex=2.5, xlab="""", + ylab="""", xaxt=""n"", yaxt=""n"", ylim=c(-4.5,17)) + text(4.5, -2.25, paste(""Mantel's test"", ""\n z.stat ="", round(my.res$z.stat, + 2), ""\n p.value ="", round(my.res$p, 3))) + my.res<-Moran.I(my.data[ ,i], point.dists.inv) + text(12.5, -2.25, paste(""Moran's I"", ""\n observed ="", round(my.res$observed, + 3), ""\n expected ="",round(my.res$expected,3), ""\n std.dev ="", + round(my.res$sd,3), ""\n p.value ="", round(my.res$p.value, 3))) + } + + par(mar=c(5,4,4,2)+0.1) + + for (i in 3:7) { + plot(dist(my.data[ ,i]), point.dists,pch = 20, xlab=""geographical distance"", + ylab=""behavioural distance"") + } + +P.S. if you google each tests, in these examples are both used on the exact same data +and exact the same hypothesis...**not very helpful** + +Mantel test: http://www.ats.ucla.edu/stat/r/faq/mantel_test.htm + +Moran's I: http://www.ats.ucla.edu/stat/r/faq/morans_i.htm",deleted 1 characters in body,2013-10-14 14:52:45.480 +186249,57440,2666.0,2,,CC BY-SA 3.0,f6003478-3f0a-49fd-84f7-4ef3f6cfd2d3,"I don't think that discriminant analysis will be very efficient because it does not use the ordering. There are 4 commonly used families for ordinal response that are based on direct probability modeling: logistic, probit, log-log (Cox model) and complementary log-log. These are implemented in the R `rms` package `orm` function, which also handles continuous $Y$. Graphical methods can be used to choose from among the 4. 
Proportional odds is the easiest to interpret.",,2013-10-14 14:56:24.637 +186250,57237,22570.0,5,,CC BY-SA 3.0,e340690a-b502-4f76-bddb-311a253cff53,"I'm trying to generate sets of causally connected random variables and started off doing this with a monte carlo approach. + +The baseline is a 2-dimensional measured histogram from which I draw random values. + +In my concrete examples these variables are acceleration $\bf{a}$ and velocity $\bf{v}$ - so obviously +$v_{i+1} = v_{i} + a_i * dt$ +has to hold. + +My current naive approach is: + +I start with a some $v_0$. +Then I generate a random $a_0$ according to the measured probability of $\bf{a}$ for the value of $v_0$. Using this $a_0$ I can calculate $v_1$ and the whole procedure starts over again. + +So when I check the generated accelerations $\bf{a}$ in bins of $\bf{v}$ everything's fine. +But I obviously this does not at all respect the marginal distribution of $\bf{v}$. + +I'm kind of familiar with basic monte carlo methods, though lacking some theoretical background as you might guess. +I'd be fine if the two variables where *just* connected by some correlation matrix, but the causal connection between the two gives me headaches. + +I didn't manage to find an example for this kind of problem somewhere - I might be googl'ing the wrong terms. +I'd be satisfied if somebody could point me to some literature/example or promising method to get a hold on this. + +(Or tell me that's is not really possible given my inputs - that's what I'm guessing occasionally...) + + +**EDIT:** + +The actual aim of this whole procedure: +I have a set of measurements $\bf{a}$ and $\bf{v}$, represented in a two-dimensional histogram $N(a,v)$. Given this input I'd like to generate sets of random $\bf{a_r}$ and $\bf{v_r}$ that reproduce the measured distribution.",added 283 characters in body,2013-10-14 15:03:07.133 +186251,57438,13385.0,5,,CC BY-SA 3.0,d6d0f1cc-0a92-48ab-80d0-705713ce14c7,"I'm trying to analyze some data consisting of five randomized parameters and a utility function which indirectly depends on the parameters, by experimentation. That is to say, the parameters of the experiment are chosen randomly, and successes and failures are counted up. I want to find parameters for which the expected utility of successes and failures is highest. + +From my days in calculus, I can see that an algorithm could consist of: + + 1. Regression to a (hopefully analytically tractable) surface + 2. Finding a maximum + 3. Finding the pre-image of my maximum (if I use any of the C libraries I've seen, which seem to focus on the maximum value, not its pre-image) + +But I'm not sure about the ""fiddly bits"" like: + + - The distribution of points (I don't have any data yet) + - Any substantive idea of the shape of the surface, though I am expecting diminishing marginal utility, so it should be non-linear and have a bump. + - Numerical stability + +This seems like it should be straight-forward, in terms of applied decision theory. So, is my plan sensible? Any pointers to literature, algorithms, C or Haskell libraries? + +Addition in response to comment: + +I'm trying to find the ""best"" parameters in terms of student performance. The 5-tuple represents: + + 1. $b$: The ""base"" waiting time before seeing a problem again. + 2. $p_1$: A constant factor if the student says the problem was easy. + 3. $p_2$: A constant factor if the student says it was hard. + 4. $p_3$: A constant factor if the student says it was ""normal"". + 5. 
$p_4$: A constant factor if the student got it wrong. + +The waiting time for the next viewing is computed by multiplying all of the responses the student has issued, and the base waiting time, and computing $e^{b \prod p_{i,j}}$. So, for example, a wrong answer makes the waiting time much shorter. An 'easy' report makes it quite a bit longer. + +Now, if the student gets the next viewing wrong, we want to count it as a failure. If the student gets it right (regardless of the difficulty the student reports), we count it as a success. + +I want to maximize the utility function $\frac{|\text{success}|}{|\text{total}|}$ by varying the 5-tuple. I guess $\frac{|\text{success}|}{|\text{failure}|}$ would serve the same purpose.",response to comment,2013-10-14 15:16:08.880 +186271,57444,22262.0,5,,CC BY-SA 3.0,07b0186d-57eb-4941-b4dc-32f2b7cc555b,"Consider a continuous response $Y \sim N(0,\sigma^2)$ and design matrix vector $i$; $\mathbf{X}$. Suppose that I am interested in estimating the probability that $Y \leq 0.1$ given observing $\mathbf{X}$. + +I want to use quantile regression to do this - **can I confirm that this is a legitimate methodology**? + +We have quantiles $\tau \in [0,1]$ and after estimating our quantile regression for each $\tau$ we have our quantile estimates $\mathbf{q} := \{\hat{Q}(\tau) : \tau \in \{0.01,0.02,...,0.99\}\}$. I want to select the $\tau$ such that $\hat{Q}(\tau) \approx 0.1$. When I find such a $\hat{Q}(\tau)$ it seems to then follow naturally that $P(Y \leq 0.1) = \tau$. The reason is that my model has estimated the $\tau$-th quantile to be $0.1$, which is point on the x-axis in $Y$'s pdf that I need to find. + +In practice this may not work since an estimated quantile can be lower for higher $\tau$ under some $\mathbf{X}$. + +Not looking for logistic regression with a discretized response as a solution. +",added 78 characters in body,2013-10-14 16:37:14.707 +186252,57426,20470.0,5,,CC BY-SA 3.0,944c2c04-734d-428e-abbd-b7c221e35074,"**Question**: *Is the set-up below a sensible implementation of a Hidden Markov model?* + +I have a set of `108,000` observations (taken over the course of 100 days) and approximately `2000` event occurrences throughout the whole observation time-span. The data looks like the figure below where there are 3 observation levels and the red columns highlight event times, i.e. $t_E$'s: + +![enter image description here][1] + +As shown with red rectangles in the figure, I have dissected {$t_E$ to $t_{E-5}$} for each event, effectively treating these as ""pre-event windows"". + +**HMM Training:** I plan to [train][2] a Hidden Markov Model (HMM) based on all ""pre-event windows"", using the multiple observation sequences methodology as suggested on Pg. 273 of Rabiner's [paper][3]. Hopefully, this will allow me to train an HMM that captures the sequence patterns which lead to an event. + +**HMM Prediction:** Then I plan to use this HMM to [predict][4] $P(Observations|HMM)$ on a new day, where $Observations$ will be a sliding window vector, which I will update to contain the observations between the current time $t$ and $t-5$ as the day goes on. + +I expect to see $P(Observations|HMM)$ increase for $Observations$ that resemble the ""pre-event windows"". This would in effect allow me to predict the events before they happen. 
+ + + + + + + [1]: https://i.stack.imgur.com/QkIn0.png + [2]: http://en.wikipedia.org/wiki/Baum%E2%80%93Welch_algorithm + [3]: http://www.cs.cornell.edu/Courses/cs4758/2012sp/materials/hmm_paper_rabiner.pdf + [4]: http://en.wikipedia.org/wiki/Forward_algorithm",deleted 10 characters in body,2013-10-14 15:16:34.037 +186253,57319,,25,,,da5a9977-52ba-4e15-84a7-725c14cd88d8,,http://twitter.com/#!/StackStats/status/389772954042896384,2013-10-14 15:21:28.577 +186254,57441,22678.0,2,,CC BY-SA 3.0,fb956893-b1ab-46c3-82de-e2ab487cc1f3,"I would disagree on your first point. The $L_2$ regularized model is +$$ +\parallel Y-K\beta \parallel_2^2 + \lambda \beta^T R \beta +$$ +where K is the known kernel matrix and $R$ is the regularization matrix. +$K=R$ is only a good choice, when the gaussian kernel is used. +For more information please see +*A. Smola, B. Schölkopf, On a Kernel-based Method for Pattern Recognition, +Regression, Approximation, and Operator Inversion, 1997* + +@author, the discussion about ""good kernels"" is rather popular. +See this post for example: +http://stats.stackexchange.com/questions/48506/what-function-could-be-a-kernel + +However, there are ways to compute an optimized kernel based on your regularization idea. +You should find some approaches presented at NIPS. + +",,2013-10-14 15:23:57.277 +186255,57319,1693.0,5,,CC BY-SA 3.0,144b9427-a108-4d03-a359-8c24f6a65123,"I am modeling an outcome for hospital patients, 'RA' (whether readmitted). My predictor of interest is 'HHS' (whether referred to Home Health Services such as from a visiting nurse). Those referred readmit at a 15.2% rate; others, 9.2%, but the former are needier, sicker patients. Conventional thinking is that if we controlled for severity of illness this difference would not only be washed out but would reverse itself. In other words, holding constant the severity of illness, having HHS should mean a lower RA rate. + +With HHS as the sole predictor, B in a logistic regression = 0.6 (N ~ 25k). B is reduced to 0.2 with a group of covariates controlled, each accounting for some aspect of severity of illness, but B doesn't fall below zero. + +HHS alone explains only about 1% of the variance in RA; with the other predictors, this becomes 4%.* Perhaps this is the problem--that these covariates are not explaining enough variance to ""succeed"" in reversing the sign of the coefficient of interest. If this is true, is there a way to estimate how high their explained variance needs to be for such a reversal to show up? + + +---------- + +*Using either of 2 pseudo-RSQ formulas; Cox & Snell's or Menard's [-2LL0 - (-2LL1)] / [-2LL0.]",see comment 10/14/13 11:27 am,2013-10-14 15:24:08.373 +186256,57414,22667.0,5,,CC BY-SA 3.0,f792b475-1e2c-4fbe-a530-94b5b2083319,"This is kind of a basic stats question, but I want to make sure I am doing this right. + +I have a distribution of objects. Specifically: + `array([ 6072., 112673., 126874., 44366., 5384., 14697., 20323., 68197., 98024.,39483., 103990., 18556., 32930., 23551., 6897.])` + + +I then have a lot of samples like [1,4,0,0,0,0...] (same length) and I'd like to know how far the samples are from the distribution above. Correlation doesn't really do it. +[32,0,0,0,..] should be further away than [4,0,0,0...].",deleted 25 characters in body,2013-10-14 15:28:49.173 +186259,57442,22677.0,2,,CC BY-SA 3.0,fe815939-6dd1-40a1-8abb-0a5e3993091e,"I'm really sorry with this foolish question. + +I just wanted to know how to do ***Heteroscedasticity Test on a Univariate Model***? 
+ ++ ex: an univariate autoregressive model ++ ex: an univariate ARCH/GARCH model + +if it possible, how does one do that on `R` + + + + +",,2013-10-14 15:34:17.913 +186258,57442,22677.0,3,,CC BY-SA 3.0,fe815939-6dd1-40a1-8abb-0a5e3993091e,,,2013-10-14 15:34:17.913 +186257,57442,22677.0,1,,CC BY-SA 3.0,fe815939-6dd1-40a1-8abb-0a5e3993091e,How to do Univariate Heteroscedasticity Test,,2013-10-14 15:34:17.913 +186260,57411,20473.0,5,,CC BY-SA 3.0,6d981043-2a0b-4202-a989-df7844e3ca83,"Entropy (joint entropy included), _is a property of the distribution_ that a random variable follows. The available sample (and hence the timing of observation) plays no role in it. + +Copying for Cover & Thomas, the joint entropy $H(X,Y)$ of two discrete random variables $X, Y,$ with joint distribution $p(x,y)$, is defined as + +$$H(X,Y) = - \sum_{S_X}\sum_{S_Y}p(x,y)\log p(x,y) $$ + +Examine the expression: the sums are taken over _all possible values_ of $X$ and $Y$, i.e. over all the values that belong to the support of each r.v. ($S_X$ and $S_Y$ respectively), irrespective of whether some of these values may not materialize or be observed in a sample. What we actually observe, or when, plays no role, in calculating entropy, and joint entropy in particular. + +Turning to your specific example: The side of a coin itself can not be modeled as a random variable. A random variable maps _events_ into real numbers. The side of a coin is not an event. _Observing_ one of the two sides is an event. _Not observing_ a side, is an event. So let's define a random variable $X$ by ""$X$ takes the value $1$ if heads is observed, $0$ otherwise"". And define $Y$ by ""$Y$ takes the value $1$ if tails is observed, $0$ otherwise"". Assume the coin is fair. The joint distribution of these two random variables is then described by +$$\begin{align} +P(X=1,Y=1) &= 0 \\ +P(X=1,Y=0) &= 0.5 \\ +P(X=0,Y=1) &= 0.5 \\ +P(X=0,Y=0) &= 0 +\end{align}$$ + +Note that the numerical mapping we chose (the zero/one values) does not play, as numbers go, any decisive part in the probabilities assigned -we could have chosen a 5/6 mapping for $X$ and a 56/89 mapping for $Y$ (or whatever) -the allocation of probabilities in the joint distribution would have been the same (it is the underlying structure of _events_ that is the critical factor). +Next, as always, we consider the distribution at non-zero values, so + +$$H(X,Y) = - 0.5\log(0.5) - 0.5\log(0.5) $$ + +and using base-2 for the logarithm we get + +$$H(X,Y) = - 0.5(-1) - 0.5(-1) = 1 $$ + +Finally, you can easily find that the entropy of $X$ (and likewise for $Y$) is +$$H(X) = - \sum_{S_X}p(x)\log p(x) = - 0.5(-1) - 0.5(-1) = 1 $$ + +So in this case $H(X,Y) = H(X) = H(Y)$. But the general expression for the decomposition of joint entropy is + +$$H(X,Y) = H(X) + H(Y\mid X) = H(Y) + H(X\mid Y)$$ + +where $H(Y\mid X)$ and $H(X\mid Y)$ are conditional entropies. Then we conclude that $H(Y\mid X) = H(X\mid Y) = 0$ in this case. The intuition is straightforward: given $X$ what has happened to $Y$ is certain (and likewise in reverse), so conditional entropy is zero. + + + + +",added 397 characters in body,2013-10-14 15:37:05.653 +186261,57443,2149.0,2,,CC BY-SA 3.0,b547c9c5-28e8-4446-8580-23ce17815d02,"This question was answered in 1988 http://www.unc.edu/~jbhill/tsay.pdf by R.Tsay and implemented in AUTOBOX in 1990. As of this date (today) no other forecasting/time series package has implemented his elegant and creative solution. 
Simply adjust your series for time trend changes, level shift changes , seasonal pulses and pulses AND the correct ARIMA structure. Verify that the model parameters are constant over time and then search for change points in error variance as he recommends.",,2013-10-14 15:43:16.087 +186264,57444,22262.0,3,,CC BY-SA 3.0,b60ff2ee-04b5-4d89-ac6d-e8f94c271e61,,,2013-10-14 15:45:47.440 +186262,57444,22262.0,2,,CC BY-SA 3.0,b60ff2ee-04b5-4d89-ac6d-e8f94c271e61,"Consider a continuous response $Y(i) \sim N(0,\sigma^2)$ and design matrix vector $i$; $\mathbf{X}(i)$. Suppose that I am interested in estimating the probability that $Y(i+1) \leq 0.1$ given observing $\mathbf{X}(i+1)$. + +I want to use quantile regression to do this - **can I confirm that this is a legitimate methodology**? + +We have quantiles $\tau \in [0,1]$ and after estimating our quantile regression for each $\tau$ we have our quantile estimates $\mathbf{q} := \{\hat{Q}(\tau) : \tau \in \{0.01,0.02,...,0.99\}\}$. I want to select the $\tau$ such that $\hat{Q}(\tau) \approx 0.1$. When I find such a $\hat{Q}(\tau)$ it seems to then follow naturally that $P(Y(i+1) \leq 0.1) = \tau$. The reason is that my model has estimated the $\tau$-th quantile to be $0.1$, which is point on the x-axis in $Y$'s pdf that I need to find. + +In practice this may not work since an estimated quantile can be lower for higher $\tau$ under some $\mathbf{X}(i+1)$. + +As a side question that I think deserves integration into this question; **are there other methods that could be used to to solve this type of probability estimation problem** (aside from discretizing the response for a classification model input, using an ensemble classifier and taking the ratio of votes in favour of $Y(i+1) <= 0.1$, or arbitrary decision boundaries in conditional expectation models). + + + +",,2013-10-14 15:45:47.440 +186265,57445,22143.0,2,,CC BY-SA 3.0,f63ecaec-7186-43fd-9e47-8e412f96918f,"In Relevance vector machines ([RVM]( http://en.wikipedia.org/wiki/Relevance_vector_machine)) we have a prior on the weight vector $\mathbf{w}$ (which is $N+1$ dimensional, where $N$ is the number of examples) as shown in equation (5) of ([1]): +$$p(\mathbf{w}|\alpha) = \Pi_{i=0}^{N}\mathcal{N}(w_i|0,\alpha_i^{-1}),$$ +where $\mathbf{\alpha}$ is the $N+1$ dimensional vector of hyperparameters. + +This prior is supposed to ensure that the weight vector $\mathbf{w}$ (which represents the number of ""support vectors"" which are active) is ""sparse"" if we can integrate out all the nuisance parameters ($\alpha$). See paragraph preceding Section 2.2 in ([1]). + +*Potential points of confusion:* + +- the notation $\mathbf{w}$ is different from the $d$-dimensional linear model representation. Here, while comparing RVM with SVM, only think of the dual SVM formulation with the $N+1$ dimensional parameter $\mathbf{w}$. +- ""Sparse"" for (dual) SVMs means the number of support vectors is small. Do not confuse with number of non-zero coefficients in (the d-dimensional) linear models. + + + [1]:http://jmlr.org/papers/volume1/tipping01a/tipping01a.pdf",,2013-10-14 15:49:20.960 +186266,57437,15827.0,5,,CC BY-SA 3.0,f7639700-4688-40cc-ba3f-ad0f0440ce1f,"For regression with ordered response variable, there are different methods, for example, discriminant analysis, probit or logit model. I am wondering what are the different focuses of the different methods and which one is more often used. 
+",deleted 7 characters in body,2013-10-14 16:01:02.053 +186267,57446,4320.0,2,,CC BY-SA 3.0,2cde2e82-2e8c-40ba-a66b-eccd36ace0e5,"One problem with the approach you've described is you will need to define what kind of increase in $P(O)$ is meaningful, which may be difficult as $P(O)$ will always be very small in general. It may be better to train two HMMs, say HMM1 for observation sequences where the event of interest occurs and HMM2 for observation sequences where the event **doesn't** occur. Then given an observation sequence $O$ you can predict the event will occur if $P(O|HMM1) > P(O|HMM2)$. + +***Disclaimer**: What follows is based on my own personal experience, so take it for what it is.* One of the nice things about HMMs is they allow you to deal with variable length sequences and variable order effects (thanks to the hidden states). Sometimes this is necessary (like in lots of NLP applications). However, it seems like you have a priori assumed that only the last 5 observations are relevant for predicting the event of interest. If this assumption is realistic then you may have significantly more luck using traditional techniques (logistic regression, naive bayes, SVM, etc) and simply using the last 5 observations as features/independent variables. Typically these types of models will be easier to train and (in my experience) produce better results.",,2013-10-14 16:03:34.203 +186268,57444,22262.0,5,,CC BY-SA 3.0,917e042b-7e08-40ca-963d-5a74ab7f0069,"Consider a continuous response $Y(i) \sim N(0,\sigma^2)$ and design matrix vector $i$; $\mathbf{X}(i)$. Suppose that I am interested in estimating the probability that $Y(i+1) \leq 0.1$ given observing $\mathbf{X}(i+1)$. + +I want to use quantile regression to do this - **can I confirm that this is a legitimate methodology**? + +We have quantiles $\tau \in [0,1]$ and after estimating our quantile regression for each $\tau$ we have our quantile estimates $\mathbf{q} := \{\hat{Q}(\tau) : \tau \in \{0.01,0.02,...,0.99\}\}$. I want to select the $\tau$ such that $\hat{Q}(\tau) \approx 0.1$. When I find such a $\hat{Q}(\tau)$ it seems to then follow naturally that $P(Y(i+1) \leq 0.1) = \tau$. The reason is that my model has estimated the $\tau$-th quantile to be $0.1$, which is point on the x-axis in $Y$'s pdf that I need to find. + +In practice this may not work since an estimated quantile can be lower for higher $\tau$ under some $\mathbf{X}(i+1)$. + + +",deleted 411 characters in body,2013-10-14 16:04:01.923 +186269,57442,15827.0,5,,CC BY-SA 3.0,f3657eb3-2ec2-4f41-bb6a-c51e510469d0," +I just wanted to know how to do ***Heteroscedasticity Test on a Univariate Model***? + ++ ex: an univariate autoregressive model ++ ex: an univariate ARCH/GARCH model + +If it is possible, how does one do that in `R`? + + + + +","""I'm really sorry with this foolish question."" No need to apologise. If it's worth asking, ask! ",2013-10-14 16:05:14.313 +186270,57444,22262.0,5,,CC BY-SA 3.0,7ce63764-1e05-42e1-9f22-0e6b27f8ebeb,"Consider a continuous response $Y \sim N(0,\sigma^2)$ and design matrix vector $i$; $\mathbf{X}$. Suppose that I am interested in estimating the probability that $Y \leq 0.1$ given observing $\mathbf{X}$. + +I want to use quantile regression to do this - **can I confirm that this is a legitimate methodology**? + +We have quantiles $\tau \in [0,1]$ and after estimating our quantile regression for each $\tau$ we have our quantile estimates $\mathbf{q} := \{\hat{Q}(\tau) : \tau \in \{0.01,0.02,...,0.99\}\}$. 
I want to select the $\tau$ such that $\hat{Q}(\tau) \approx 0.1$. When I find such a $\hat{Q}(\tau)$ it seems to then follow naturally that $P(Y \leq 0.1) = \tau$. The reason is that my model has estimated the $\tau$-th quantile to be $0.1$, which is point on the x-axis in $Y$'s pdf that I need to find. + +In practice this may not work since an estimated quantile can be lower for higher $\tau$ under some $\mathbf{X}$. + + +",deleted 26 characters in body,2013-10-14 16:11:31.397 +186272,57447,2666.0,2,,CC BY-SA 3.0,08a44fc4-13ab-4603-8dea-6386df1a3834,"It doesn't appear that $Y$ is binary. Ordinal regression is a good choice here. With any of the ordinal models (proportional odds, proportional hazards, probit, etc.) you can compute the probability that $Y \geq y$ for all $y$. That probability will change at the unique values of $y$. The R `rms` package `orm` function implements this efficiently and has a function generator for exceedance probabilities. If you were extremely fortunate and really have Gaussian residuals you can use the maximum likelihood estimator of the exceedance probabilities, which is a simple function of $\hat{\mu}$ and $\hat{\sigma}$.",,2013-10-14 16:42:40.467 +186343,57467,16469.0,1,,CC BY-SA 3.0,c8d779be-ca7c-4724-a9e8-d9741c65c5e6,How to test (and accept) that a coefficient in a linear regression model equals zero,,2013-10-14 21:58:30.230 +186273,57443,2149.0,5,,CC BY-SA 3.0,7b81da42-3129-4abd-833e-4a2b04491995,"This question was answered in 1988 http://www.unc.edu/~jbhill/tsay.pdf by R.Tsay and implemented in AUTOBOX in 1990. As of this date (today) no other forecasting/time series package has implemented his elegant and creative solution. Simply adjust your series for time trend changes, level shift changes , seasonal pulses and pulses AND the correct ARIMA structure. Verify that the model parameters are constant over time and then search for change points in error variance as he recommends. + + +Edited to respond to Nick .. + +Nick As you may know ARCH/GARCH concerns itself with developing an ARIMA model for the squared residuals. The problem is if you have unusual(one-time) anomilies these are dealt with by inorporating pulse indicator series, yielding a zero residual for each identified point. Squaring these residiuals leads to a distribution that has long tails and is not amenable to ARIMA. When I programmed and implented ARCH/GARCH so that I could jump on the ""next new thing"" I found that it was fundamentally inconsistent with Intervention Detection schemes. Essentially ARCH/GARCH provides a possible solution for a ""change in variance"" that may well be more easily handled by Intervention Detection (violations in the expected value). Thus at this point in time my prefernces (Occam's Razor) for the most simplest solution/transformation/drug/remedy causes me to keep the soution as simple as possible but not too simple. The current release of AUTOBOX treats variance heterogeneity by identifying anomalies,parameter changes and determninistic variance changes ... If all this fails the user can square the residuals and build an arima model to construct his/her own ARCH/GARCH model . Here I stand, I can do no other !",added 1188 characters in body,2013-10-14 16:44:48.263 +186274,57421,22669.0,5,,CC BY-SA 3.0,6d07cfeb-3566-4844-8418-05cceb2b3d98,"I've just started learning time series so please excuse me if it's painfully obvious; I haven't managed to find the answer elsewhere. + +I have a data series showing a pretty obvious trend although it's quite noisy. 
I can take pretty much any division of the data and run classical tests to show a highly significant difference in means. + +I decided to have a look at time series analysis to see if it could help describe the trend. An ARIMA(0,1,1) model comes out with AIC,BIC=34.3,37.3 (Stata), whilst an ARIMA(0,1,0) model comes out with AIC,BIC=55.1,58.1 - so I understand I'm supposed to prefer the (0,1,1) model. + +However, the coefficient for the MA(1) is displaying as -0.9999997 (and not showing any p-values). If I try the same in SPSS I get an MA(1) coefficient of 1.000 (I assume SPSS uses opposite signs) with a p-value of 0.990 - does this mean it suggests I drop the term? + +My understanding is that the effect of a MA(1) coefficient of -1 is basically to remove the old error term and convert the whole series to a linear trend. Does this mean ARIMA is totally unsuitable for my needs? On the plus side it gives me a sensible value for the trend. If I use the (0,1,0) model then I still get a reasonable value for the trend but it's not significant any more. + +Thanks for your help! + +EDIT: +Thanks for looking in. The trend looks like a fairly linear decrease; the data points seen to fairly noisily rattle around above and below a trend line. The ARIMA (0,1,1) model produces something that's not far off a straight line decrease which seems sensible - the (0,1,1) produces what is essentially a lagged version of the data, translated down by one month of trend. +The data aren't stationary (due to the trend) - though the first differences seem to be. I don't think the (0,1,1) is a bad model - I'm just a little confused by the p-value seeming to suggest I should drop the MA term - or wondering if it means I should bin ARIMA entirely! + +EDIT2 +@vinux - thanks for the suggestion; that makes a lot of sense (and seems to be what the -1 MA term is trying to create?). +I've uploaded as many graphs as I could think of as people had requested. + +![tsline y - graph of raw values][1] +![tsline D.y - graph of differences][2] +![ac y - autocorrelations of y][3] +![pac y - partial autocorrelations of y][4] +![ac D.y - autocorrelations of first differences][5] +![pac D.y - partial autocorrelations of first differences][6] + + + [1]: https://i.stack.imgur.com/OR2sj.png + [2]: https://i.stack.imgur.com/xlzl2.png + [3]: https://i.stack.imgur.com/8lQ57.png + [4]: https://i.stack.imgur.com/IlqiP.png + [5]: https://i.stack.imgur.com/gUa6p.png + [6]: https://i.stack.imgur.com/KE5Sa.png",added graphs,2013-10-14 16:53:09.697 +186275,57443,2149.0,5,,CC BY-SA 3.0,f9f1a92e-2d9f-446a-bed2-40a9d31110ef,"This question was answered in 1988 http://www.unc.edu/~jbhill/tsay.pdf by R.Tsay and implemented in AUTOBOX in 1990. As of this date (today) no other forecasting/time series package has implemented his elegant and creative solution. Simply adjust your series for time trend changes, level shift changes , seasonal pulses and pulses AND the correct ARIMA structure. Verify that the model parameters are constant over time and then search for change points in error variance as he recommends. + + +Edited to respond to Nick .. + +Nick As you may know ARCH/GARCH concerns itself with developing an ARIMA model for the squared residuals. The problem is if you have unusual(one-time) anomilies these are dealt with by inorporating pulse indicator series, yielding a zero residual for each identified point. Squaring these residiuals leads to a distribution that has long tails and is not amenable to ARIMA. 
When I programmed and implented ARCH/GARCH so that I could jump on the ""next new thing"" I found that it was fundamentally inconsistent with Intervention Detection schemes. Essentially ARCH/GARCH provides a possible solution for a ""change in variance"" that may well be more easily handled by Intervention Detection (violations in the expected value). Thus at this point in time my prefernces (Occam's Razor) for the most simplest solution/transformation/drug/remedy causes me to keep the soution as simple as possible but not too simple. The current release of AUTOBOX treats variance heterogeneity by identifying anomalies,parameter changes and determninistic variance changes and no need for power transformations via Box-Cox ... If all this fails the user can square the residuals and build an arima model to construct his/her own ARCH/GARCH model . Here I stand, I can do no other !",added 50 characters in body,2013-10-14 17:04:26.240 +186276,57448,16644.0,2,,CC BY-SA 3.0,db327fdd-ad54-4cfe-b297-940579fe33ed,"The Dvoretzky-Kiefer-Wolfowitz inequality can be used here. The required sample size $b$ (I'm using $b$ to distinguish it from $n$ because you already set your population size as $n$ in the problem statement) is determined by $$b \geq \left( {1 \over 2 \epsilon^2 } \right) \mathrm{ln} \left( {2 \over \alpha} \right),$$ where $\epsilon$ is how close you want your empirical cdf to be and $1-\alpha$ is the confidence level. + +So, for example, if you want to estimate $F(c)$ within $\epsilon = 0.01$ with 95% confidence, the formula gives a sample size of $$b \geq 18444.4,$$ or $b = 18445.$ + +This will cover any and all $c,$ so it is possible you can do much better. Perhaps one of the commenters will fill in the details on a more efficient solution for a single value of $c.$ ",,2013-10-14 17:26:35.837 +186306,57396,22656.0,5,,CC BY-SA 3.0,780c3d10-d6d5-494d-bad5-284780aa2870,"I have a large population of size $n$ from an unknown continuous random variable $X$, and I do not know the underlying distribution of $X$. Given a constant number $c$, I want to determine the minimum sample size I need to estimate the probability $P(X \le c)$ given a confidence level, $p_c$, and confidence interval, $I_c$ (I am not sure if we need them! ). How can I find the minimum sample size to estimate this probability? + +I have found the following discussion in [Wikipedia][1] which is independent of the number of population. I am not sure if it is a good way to determine sample size! +![enter image description here][2] + + + + + +I have also found some methods to determine sample size for data to be analyzed by nonparametric tests.you don't have to make any assumption about the distribution of the values. That is why it is called nonparametric. Now I am confused if these nonparametric methods can be used to solve my problem or the method I found in Wikipedia is the correct way to solve my problem, or there exists a better solution. + +Thanks for your help. + + + [1]: http://en.wikipedia.org/wiki/Sample_size_determination#Estimating_proportions_and_means + [2]: https://i.stack.imgur.com/t09Kn.jpg",added 1 characters in body,2013-10-14 20:06:33.677 +186342,57467,16469.0,3,,CC BY-SA 3.0,c8d779be-ca7c-4724-a9e8-d9741c65c5e6,,,2013-10-14 21:58:30.230 +186277,57443,15827.0,5,,CC BY-SA 3.0,b1524954-3a02-4e66-bc96-c3e5fa0dd9a3,"This question was answered in 1988 http://www.unc.edu/~jbhill/tsay.pdf by R.Tsay and implemented in AUTOBOX in 1990. 
As of this date (today) no other forecasting/time series package has implemented his elegant and creative solution. Simply adjust your series for time trend changes, level shift changes, seasonal pulses and pulses AND the correct ARIMA structure. Verify that the model parameters are constant over time and then search for change points in error variance as he recommends. + + +Edited to respond to Nick .. + +As you may know ARCH/GARCH concerns itself with developing an ARIMA model for the squared residuals. The problem is if you have unusual (one-time) anomalies these are dealt with by incorporating pulse indicator series, yielding a zero residual for each identified point. Squaring these residuals leads to a distribution that has long tails and is not amenable to ARIMA. When I programmed and implemented ARCH/GARCH so that I could jump on the ""next new thing"" I found that it was fundamentally inconsistent with Intervention Detection schemes. Essentially ARCH/GARCH provides a possible solution for a ""change in variance"" that may well be more easily handled by Intervention Detection (violations in the expected value). Thus at this point in time my preferences (Occam's Razor) for the simplest solution/transformation/drug/remedy causes me to keep the solution as simple as possible but not too simple. The current release of AUTOBOX treats variance heterogeneity by identifying anomalies, parameter changes and deterministic variance changes and no need for power transformations via Box-Cox... If all this fails the user can square the residuals and build an arima model to construct his/her own ARCH/GARCH model. Here I stand, I can do no other!",small fixes,2013-10-14 17:29:22.363 +186280,57449,21884.0,3,,CC BY-SA 3.0,1ccb25bf-ad69-4d50-93c4-1bba353d9255,,,2013-10-14 17:30:14.367 +186279,57449,21884.0,1,,CC BY-SA 3.0,1ccb25bf-ad69-4d50-93c4-1bba353d9255,Covariance matrix equality,,2013-10-14 17:30:14.367 +186278,57449,21884.0,2,,CC BY-SA 3.0,1ccb25bf-ad69-4d50-93c4-1bba353d9255,"The (unbiased) sample covariance matrix + +$$\mathbf{S}=\dfrac{1}{n-1}\sum_{j=1}^{n}(\mathbf{X}_{j}-\bar{\mathbf{X}})(\mathbf{X}_{j}-\bar{\mathbf{X}})^{T}$$ +can be rewritten as + +$$\mathbf{S}=\dfrac{1}{n-1}\mathbf{X}^{T}\mathbf{X}-\dfrac{1}{n(n-1)}\mathbf{X}^{T}\mathbf{1}\mathbf{1}^{T}\mathbf{X}$$ + +where $$\mathbf{1}=\left(\begin{array}{c} +1\\ +\vdots\\ +1 +\end{array}\right)_{(n\times1)}.$$ + +One (tedious) way of proving this is to expand out the left hand side and the right hand side of the equality, and showing that the entries of the matrices match. I've done this successfully. + +My question: is there a neater / more concise way to prove such an equality?",,2013-10-14 17:30:14.367 +186283,57450,19265.0,3,,CC BY-SA 3.0,88e41e68-9a8d-43ea-a315-3dcba00b969c,,,2013-10-14 18:06:27.340 +186281,57450,19265.0,1,,CC BY-SA 3.0,88e41e68-9a8d-43ea-a315-3dcba00b969c,What is the loss function for C - Support Vector Classification?,,2013-10-14 18:06:27.340 +186282,57450,19265.0,2,,CC BY-SA 3.0,88e41e68-9a8d-43ea-a315-3dcba00b969c,"In article [LIBSVM: A Library for Support Vector Machines][1] there is written, than C-SVC uses loss function: + +$$ \frac{1}{2}w^Tw+C\sum\limits_{i=1}^l\xi_i$$ + +OK, I know, what is $w^Tw$. + +But what is $\xi_i$? I know, that it is somehow connected with misclassifications, but is it calculated exactly? + + +P.S. I don't use any non-linear kernels. 
+ + + [1]: http://www.csie.ntu.edu.tw/~cjlin/papers/libsvm.pdf",,2013-10-14 18:06:27.340 +186286,57451,21840.0,3,,CC BY-SA 3.0,891f8bfe-7da9-41a3-a3e3-dda017d0e5df,,,2013-10-14 18:11:53.353 +186285,57451,21840.0,1,,CC BY-SA 3.0,891f8bfe-7da9-41a3-a3e3-dda017d0e5df,Probability of having real roots,,2013-10-14 18:11:53.353 +186284,57451,21840.0,2,,CC BY-SA 3.0,891f8bfe-7da9-41a3-a3e3-dda017d0e5df,"Let $U,V,W$ are independent random variables with uniform(0,1) distribution. I am trying to find the probability that $Ux^{2}+Vx+W$ has real roots, that is, $P(V^{2}-4UW> 0)$ +I have solved this question using double integral but how to do this using triple integral. +My Approach: +I started with cdf: +$P(V^{2}-4UW >0) =P(V^{2} > 4UW) = P(V>2\sqrt{UW})$ = $\int\int_{2\sqrt{UW}}^1 P(V>2\sqrt{UW}) du dw$ +=$\int\int\int_{2\sqrt{UW}}^1 vdu dw dv$ + +I am finding hard time to get the limits of integral over the region in 3 dimensions. + +Thanks!",,2013-10-14 18:11:53.353 +186289,57452,22627.0,3,,CC BY-SA 3.0,c2368592-87e7-423b-a53e-c2e61888382e,,,2013-10-14 18:13:32.393 +186288,57452,22627.0,1,,CC BY-SA 3.0,c2368592-87e7-423b-a53e-c2e61888382e,"Expected maximum given population size, mean, and variance",,2013-10-14 18:13:32.393 +186287,57452,22627.0,2,,CC BY-SA 3.0,c2368592-87e7-423b-a53e-c2e61888382e,"How would one estimate the maximum given population size, a few moments, and perhaps some additional assumption on the distribution? + +Something like ""I'm going to do $N_s≫1$ measurements out of population of size $N_p≫N_s$; will record mean $μ_s$, standard deviation $σ_s$, and maximal value in the sample hs; I am willing to assume binomial (or Poisson, etc) distribution; what is the expected maximal value hp of the entire population?"" + +Related question: does one need to make the assumptions on the nature of the population distribution, or the sample statistics would be enough to estimate hp? +",,2013-10-14 18:13:32.393 +186290,57412,,25,,,2f8277b9-8db0-4204-b6fd-98d9da5d7fdf,,http://twitter.com/#!/StackStats/status/389818253478227968,2013-10-14 18:21:28.833 +186291,56768,0.0,34,,,eb0a2164-7ff0-48dc-9547-825eb8ee35a6,,806,2013-10-14 18:25:40.383 +186292,57453,22682.0,3,,CC BY-SA 3.0,0227bfcb-8f78-452f-b9d1-f86d3872233e,,,2013-10-14 18:27:13.640 +186294,57453,22682.0,1,,CC BY-SA 3.0,0227bfcb-8f78-452f-b9d1-f86d3872233e,Is there a way to remove individual trees from a forest in the randomForest package in R?,,2013-10-14 18:27:13.640 +186293,57453,22682.0,2,,CC BY-SA 3.0,0227bfcb-8f78-452f-b9d1-f86d3872233e,"I am trying to implement the ideas in this paper: http://www.sciencedirect.com/science/article/pii/S0925231212003396. + +This requires me to be able to remove individual trees from the forest and reclassify my training data for each removal. I've been using the randomForest package in R and had a comb through the manual but couldn't find any way of running the forest with a subset of trees, or even with an individual tree. There is a getTree function but that only gives a matrix of the node structure of the tree. 
+ +Is there any way to do this, either in randomForest (preferably) or via another package?",,2013-10-14 18:27:13.640 +186327,57463,22687.0,1,,CC BY-SA 3.0,61c15056-2b23-4203-b6e1-23875b097154,How does one generate the table mapping t-test values to p values?,,2013-10-14 21:14:24.377 +186341,57467,16469.0,2,,CC BY-SA 3.0,c8d779be-ca7c-4724-a9e8-d9741c65c5e6,"I understand that in a linear regression model like: + + +$y_i = b_0 + b_1 * x_i + \epsilon_i$ + +I can have a null-hypothesis: + +$H_0: b_1 = 0$ and $H_1: b_1 \neq 0$. + +And then I can reject $H_0$ or fail to reject $H_0$. But what if I want to accept that $b_1 = 0$?",,2013-10-14 21:58:30.230 +186295,57421,22669.0,5,,CC BY-SA 3.0,5e25e120-d328-48c7-ab4e-5564cd5eebf4,"I've just started learning time series so please excuse me if it's painfully obvious; I haven't managed to find the answer elsewhere. + +I have a data series showing a pretty obvious trend although it's quite noisy. I can take pretty much any division of the data and run classical tests to show a highly significant difference in means. + +I decided to have a look at time series analysis to see if it could help describe the trend. An ARIMA(0,1,1) model comes out with AIC,BIC=34.3,37.3 (Stata), whilst an ARIMA(0,1,0) model comes out with AIC,BIC=55.1,58.1 - so I understand I'm supposed to prefer the (0,1,1) model. + +However, the coefficient for the MA(1) is displaying as -0.9999997 (and not showing any p-values). If I try the same in SPSS I get an MA(1) coefficient of 1.000 (I assume SPSS uses opposite signs) with a p-value of 0.990 - does this mean it suggests I drop the term? + +My understanding is that the effect of a MA(1) coefficient of -1 is basically to remove the old error term and convert the whole series to a linear trend. Does this mean ARIMA is totally unsuitable for my needs? On the plus side it gives me a sensible value for the trend. If I use the (0,1,0) model then I still get a reasonable value for the trend but it's not significant any more. + +Thanks for your help! + +EDIT: +Thanks for looking in. The trend looks like a fairly linear decrease; the data points seen to fairly noisily rattle around above and below a trend line. The ARIMA (0,1,1) model produces something that's not far off a straight line decrease which seems sensible - the (0,1,1) produces what is essentially a lagged version of the data, translated down by one month of trend. +The data aren't stationary (due to the trend) - though the first differences seem to be. I don't think the (0,1,1) is a bad model - I'm just a little confused by the p-value seeming to suggest I should drop the MA term - or wondering if it means I should bin ARIMA entirely! + +EDIT2 +@vinux - thanks for the suggestion; that makes a lot of sense (and seems to be what the -1 MA term is trying to create?). +I've uploaded as many graphs as I could think of as people had requested. 
+ +![tsline y - graph of raw values][1] +![tsline D.y - graph of differences][2] +![ac y - autocorrelations of y][3] +![pac y - partial autocorrelations of y][4] +![ac D.y - autocorrelations of first differences][5] +![pac D.y - partial autocorrelations of first differences][6] + +I've also put the monthly data up in CSV format at [pastebin][7] + + + [1]: https://i.stack.imgur.com/OR2sj.png + [2]: https://i.stack.imgur.com/xlzl2.png + [3]: https://i.stack.imgur.com/8lQ57.png + [4]: https://i.stack.imgur.com/IlqiP.png + [5]: https://i.stack.imgur.com/gUa6p.png + [6]: https://i.stack.imgur.com/KE5Sa.png + [7]: http://pastebin.com/7ih4mhfB ""Pastebin""",added 116 characters in body,2013-10-14 18:46:44.143 +186296,57451,21840.0,5,,CC BY-SA 3.0,0b157093-6dd9-4e5b-a6e9-007dbccde960,"Let $U,V,W$ are independent random variables with uniform(0,1) distribution. I am trying to find the probability that $Ux^{2}+Vx+W$ has real roots, that is, $P(V^{2}-4UW> 0)$ +I have solved this question using double integral but how to do this using triple integral. +My Approach: +I started with cdf: +$P(V^{2}-4UW >0) =P(V^{2} > 4UW) = P(V>2\sqrt{UW})$ = $\int\int_{2\sqrt{uw}}^1 P(V>2\sqrt{UW}) dU dW$ +=$\int\int\int_{2\sqrt{uw}}^1 vdU dW dV$ + +I am finding hard time to get the limits of integral over the region in 3 dimensions. + +Thanks!",fixed question,2013-10-14 18:47:38.117 +186297,57454,22507.0,2,,CC BY-SA 3.0,6de349ea-b006-4f3d-92a1-df4a30c869a3,"One idea is, instead of creating one forest with N trees, create N ""forests"" of 1 tree each by calling randomForest() N times. Then you could manipulate them as you wish.",,2013-10-14 18:48:24.640 +186298,57451,21840.0,5,,CC BY-SA 3.0,7490454d-4e8d-46b6-9142-01dbdaf77de5,"Let $U,V,W$ are independent random variables with uniform(0,1) distribution. I am trying to find the probability that $Ux^{2}+Vx+W$ has real roots, that is, $P(V^{2}-4UW> 0)$ +I have solved this question using double integral but how to do this using triple integral. +My Approach: +I started with cdf: +$P(V^{2}-4UW >0) =P(V^{2} > 4UW) = P(V>2\sqrt{UW})$ = $\int\int_{2\sqrt{uw}}^1 P(V>2\sqrt{UW}) dU dW$ +=$\int\int\int_{2\sqrt{uw}}^1 vdU dW dV$ + +I am finding hard time to get the limits of integral over the region in 3 dimensions. + +Using double integral: +$P(V^{2}-4UW >0) =P(V^{2} > 4UW) = P(-2ln V <-ln 4 - ln U - ln W) = P(X <-ln 4 +Y)$ +where $X=-2 ln V, Y = - ln U - ln W $ +$X$ has exp(1) and $Y$ has gamma(2,1) distribution. +$P(X <-ln 4 +Y) = \int_{ln4}^\infty P(X < -ln4 +Y) f_Y(y) dy $ +$$=\int_{ln4}^\infty\int_0^{-ln4+y} \frac{1}{2} e^{-\frac{x}{2}}ye^{-y} dxdy $$ +Solving this I got 0.2545. + +Thanks!",Posted alternate solution ,2013-10-14 19:12:55.470 +186299,57453,22682.0,5,,CC BY-SA 3.0,c38813a0-5fb5-4bc3-b841-ce498a563d12,"I am trying to implement the ideas in this paper: http://www.sciencedirect.com/science/article/pii/S0925231212003396. + +This requires me to be able to remove individual trees from the forest and reclassify my training data for each removal. I've been using the randomForest package in R and had a comb through the manual but couldn't find any way of running the forest with a subset of trees, or even with an individual tree. There is a getTree function but that only gives a matrix of the node structure of the tree. + +Is there any way to do this, either in randomForest (preferably) or via another random forest implementation (e.g. 
scikit-learn)?",added 41 characters in body,2013-10-14 19:13:29.730 +186301,57455,14748.0,1,,CC BY-SA 3.0,4c0ab385-bff0-4982-a74b-ee6f80265401,Predictive algorithm validation,,2013-10-14 19:13:34.737 +186302,57455,14748.0,3,,CC BY-SA 3.0,4c0ab385-bff0-4982-a74b-ee6f80265401,,,2013-10-14 19:13:34.737 +186300,57455,14748.0,2,,CC BY-SA 3.0,4c0ab385-bff0-4982-a74b-ee6f80265401,"In putting a binary 1/0 predictive algorithm into production, what are the consequences where only the positive (1) predictions are checked, meaning only true or false positives are detected, and then fed back into training the model? Will that bias the algorithm in any way so that it progressively gets worse and worse because it never sees true or false negatives?",,2013-10-14 19:13:34.737 +186304,57396,22656.0,5,,CC BY-SA 3.0,84413c7f-4cbd-4eed-8d54-a697f1521ae5,"I have a large population of size $n$ from an unknown continuous random variable $X$, and I do not know the underlying distribution of $X$. Given a constant number $c$, I want to determine the minimum sample size I need to estimate the probability $P(X \le c)$ given a confidence level, $p_c$, and confidence interval, $I_c$ (I am not sure if we need them! ). How can I find the minimum sample size to estimate this probability? + +I have found the following discussion in [Wikipedia][1] which is independent of the number of population. I am not sure if it is a good way to determine sample size! +![enter image description here][2] + + + + + +I have also found some methods to determine sample size for data to be analyzed by nonparametric tests.you don't have to make any assumption about the distribution of the values. That is why it is called nonparametric. Now I am confused if these nonparametric methods can be used to solve my problem or the method I found in Wikipedia is the correct way to solve my problem, or there exist a better solution. + +Thanks for your help. + + + [1]: http://en.wikipedia.org/wiki/Sample_size_determination#Estimating_proportions_and_means + [2]: https://i.stack.imgur.com/t09Kn.jpg",added 740 characters in body,2013-10-14 19:53:31.000 +186305,57456,22143.0,2,,CC BY-SA 3.0,7f86091b-54f8-443e-8072-63faa27b434a,"$\xi_i$ are the slack variables. They are typically nonzero when the 2-class data is non-separable. We are trying the minimize the slack as much as possible (by minimizing their sum, since they are non-negative) along with maximizing the margin ($w^Tw$) term. + +*Exact calculation*: Well, if the convex program has been solved to optimality without any optimization error, then yes, they are calculated exactly.",,2013-10-14 20:04:52.890 +186346,57468,22693.0,2,,CC BY-SA 3.0,57b8f504-34bd-42a3-9fb0-34dc60348f08,"I have performed a log transformation on my skewed data, however on my DV it went from positive skew to negative skew after the (log) transformation, further data was missing from my DV after the transformation? Please help",,2013-10-14 22:01:18.457 +186345,57468,22693.0,3,,CC BY-SA 3.0,57b8f504-34bd-42a3-9fb0-34dc60348f08,,,2013-10-14 22:01:18.457 +186344,57468,22693.0,1,,CC BY-SA 3.0,57b8f504-34bd-42a3-9fb0-34dc60348f08,Trouble transforming data,,2013-10-14 22:01:18.457 +186307,57457,6162.0,2,,CC BY-SA 3.0,808109a5-5906-412d-a9d6-abf0d6d420b1,"Here is a solution without multiple integrals calculation (because I don't like multiple integrals). Actually it only uses three elementary simple integrals. 
+$$ +P(V^{2}-4UW \leq 0) = E\bigl[P(V^{2}-4UW \leq 0 \mid U,W)\bigr] = E\bigl[f(U,W)\bigr]$$ where $f(u,w)=P(V^{2}-4uw \leq 0)= \min\bigl\{1, 2\sqrt{uw}\bigr\}$. +$$ +E\bigl[f(U,W)\bigr] = E[g(W)] +$$ +where +$$\begin{align} +g(w) & = E\bigl[\min\bigl\{1, 2\sqrt{Uw}\bigr\}\bigr] += 1 \times \Pr(2\sqrt{Uw}>1) + E\bigl[2\sqrt{Uw} \mathbf{1}_{2\sqrt{Uw}\leq 1}\bigr] \\ +& = \Pr(U>\frac{1}{4w}) + 2\sqrt{w}E\bigl[\sqrt{U} \mathbf{1}_{U \leq \frac{1}{4w}}\bigr] \\ +& = \max\bigl\{0, 1 - \frac{1}{4w}\bigr\} + 2\sqrt{w} \times \frac{2}{3} \times \min\bigl\{1, \frac{1}{{(4w)}^{\frac{3}{2}}}\bigr\} \\ +& =\begin{cases} + 0 + \frac{4}{3}\sqrt{w} & \text{if } w \leq \frac{1}{4} \\ +1 - \frac{1}{4w} + \frac{1}{6w} & \text{if } w > \frac{1}{4} +\end{cases}, \end{align}$$ +and we get +$$ E[g(W)] = \frac{1}{9} + \frac{3}{4} - \frac{1}{12} \log 4 = \frac{31}{36}-\frac{\log 2}{6},$$ +and finally +$$P(V^{2}-4UW > 0) = \frac{5}{36} + \frac{\log 2}{6} \approx 0.2544134.$$ + + + +",,2013-10-14 20:07:35.763 +186310,57458,22684.0,3,,CC BY-SA 3.0,3adcd59c-1bd4-482c-85b7-8a6ee8ba83cf,,,2013-10-14 20:08:14.273 +186308,57458,22684.0,2,,CC BY-SA 3.0,3adcd59c-1bd4-482c-85b7-8a6ee8ba83cf," 0 down vote favorite + + +I came across a online data mining course project + +http://www.kdnuggets.com/data_mining_course/assignments/final-project.html + +The data is of samples with 7000 features as genes. Each gene is associated with a value. Some of the values are negative. The data looks like in this way: + +SNO ""U48730_at"" ""U58516_at"" ""U73738_at"" ""X06956_at"" ""X16699_at"" ""X83863_at"" + +X1 "" 27"" "" 161"" "" 0"" "" 34"" "" 2"" "" 116"" +X2 "" 27"" "" 265"" "" 0"" "" 98"" "" 2"" "" 123"" +X3 "" 24"" "" 126"" "" 0"" "" 21"" "" 0"" "" 142"" +X4 "" 27"" "" 163"" "" -1"" "" 16"" "" -1"" "" 134"" +X5 "" 41"" "" 138"" "" 1"" "" 29"" "" 1"" "" 153"" +X6 "" 55"" "" 107"" "" -1"" "" 17"" "" 0"" "" 152"" +X7 "" 27"" "" 99"" "" 0"" "" 57"" "" 1"" "" 139"" +X8 "" 2"" "" 137"" "" -1"" "" 19"" "" -3"" "" 213"" +X9 "" -5"" "" 161"" "" -3"" "" 23"" "" 2"" "" 193"" +X10 "" 0"" "" 110"" "" -3"" "" 7"" "" -1"" "" 208"" +X11 "" -7"" "" 67"" "" 1"" "" 2"" "" -2"" "" 149"" +X12 "" 4"" "" 93"" "" 3"" "" 37"" "" 2"" "" 266"" +X13 "" 2"" "" 75"" "" 3"" "" 30"" "" 6"" "" 205"" + +The professor advise the students to first do 'data cleaning'. The original sentence is Threshold both train and test data to a minimum value of 20, maximum of 16,000. + +I first thought that it is to search over each gene and if there is a value out of the bounds, then just discard this gene as a feature. However, it seems for every gene, there must be a sample with the value out of bound. + +What should I do by ""threshold this data""? Is that like if the value is below 20, then set it 20 or if the value is above 16000, then just set it as 16000? + +In fact, I did the last operation in R by + + data[data<20] <- 20 + +and it turns out that the speed of the command is very slow. (79*7070 samples) + +Thanks in advance! +",,2013-10-14 20:08:14.273 +186309,57458,22684.0,1,,CC BY-SA 3.0,3adcd59c-1bd4-482c-85b7-8a6ee8ba83cf,Questions about thresholding the data,,2013-10-14 20:08:14.273 +186311,57459,22143.0,2,,CC BY-SA 3.0,cbfd51d1-65e9-4e2e-b071-265e35f9dc2b,"> What should I do by ""threshold this data""? Is that like if the value is below 20, then set it 20 or if the value is above 16000, then just set it as 16000? + +Yes.",,2013-10-14 20:17:26.320 +186312,57460,14799.0,2,,CC BY-SA 3.0,b5cf480f-d515-4c9c-bee8-3afc1ec22aff,"This is for OLS regression. 
Consider a geometric representation of three variables -- two predictors, $X_1$ and $X_2$, and a dependent variable, $Y$. Each variable is represented by a vector from the origin. The length of the vector equals the standard deviation of the corresponding variable. The cosine of the angle between any two vectors equals the correlation of the corresponding two variables. I will take all the standard deviations to be 1. + +![enter image description here][1] + +The picture shows the plane determined by the $X_1$ and $X_2$ when they correlate positively with one another. $Y$ is a vector coming out of the screen; the dashed line is its projection into the predictor space and is the regression estimate of $Y$, $\hat{Y}$. The length of the dashed line equals the multiple correlation, $R$, of $Y$ with $X_1$ and $X_2$. + +If the projection is in any of the colored sectors then both predictors correlate positively with $Y$. The signs of the regression coefficients $\beta_1$ and $\beta_2$ are immediately apparent visually, because $\hat{Y}$ is the vector sum of $\beta_1 X_1$ and $\beta_2 X_2$. If the projection is in the yellow sector then both $\beta_1$ and $\beta_2$ are positive, but if the projection is in either the red or the blue sector then we have what appears to be suppression; that is, the sign of one of the regression weights is opposite to the sign of the corresponding simple correlation with $Y$. In the picture, $\beta_1$ is positive and $\beta_2$ is negative. + +Since the length of the projection can vary between 0 and 1 no matter where it is in the predictor space, there is no minimum $R^2$ for suppression. + + + [1]: https://i.stack.imgur.com/io9v4.png",,2013-10-14 20:20:50.680 +186315,57461,22685.0,2,,CC BY-SA 3.0,3d6be59c-08ee-4d27-b223-2cc28dcb8dd4,"On [pg. 378][1] they claim two probability distributions are e(k) close if the distance between them is at most e(k). + +What is significance of two distributions X and Y being ""close to"" or ""far from"" each other? Why would anybody care, especially in cryptography ? + + [1]: http://download.springer.com/static/pdf/305/chp%253A10.1007%252F978-3-642-22792-9_21.pdf?auth66=1381954902_7a8eccbd8188fded3878a75ad24f8c83&ext=.pdf",,2013-10-14 20:32:28.583 +186313,57461,22685.0,1,,CC BY-SA 3.0,3d6be59c-08ee-4d27-b223-2cc28dcb8dd4,What is the point of measuring statistical distance?,,2013-10-14 20:32:28.583 +186314,57461,22685.0,3,,CC BY-SA 3.0,3d6be59c-08ee-4d27-b223-2cc28dcb8dd4,,,2013-10-14 20:32:28.583 +186317,57448,,24,,CC BY-SA 3.0,4514d200-2907-4faf-a33d-ad7a7a9bdf88,,"Proposed by 30815 approved by 7290, 601 edit id of 5606",2013-10-14 20:35:33.910 +186316,57448,22143.0,5,,CC BY-SA 3.0,4514d200-2907-4faf-a33d-ad7a7a9bdf88,"The [Dvoretzky-Kiefer-Wolfowitz inequality](http://en.wikipedia.org/wiki/Dvoretzky%E2%80%93Kiefer%E2%80%93Wolfowitz_inequality) can be used here. The required sample size $b$ (I'm using $b$ to distinguish it from $n$ because you already set your population size as $n$ in the problem statement) is determined by $$b \geq \left( {1 \over 2 \epsilon^2 } \right) \mathrm{ln} \left( {2 \over \alpha} \right),$$ where $\epsilon$ is how close you want your empirical cdf to be and $1-\alpha$ is the confidence level. + +So, for example, if you want to estimate $F(c)$ within $\epsilon = 0.01$ with 95% confidence, the formula gives a sample size of $$b \geq 18444.4,$$ or $b = 18445.$ + +This will cover any and all $c,$ so it is possible you can do much better. 
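As a quick illustration of the Dvoretzky–Kiefer–Wolfowitz bound quoted just above, here is a small Python sketch (the function name is mine) that evaluates $b \geq \frac{1}{2\epsilon^2}\ln\frac{2}{\alpha}$ and reproduces the $b = 18445$ figure for $\epsilon = 0.01$ at 95% confidence:

    import math

    def dkw_sample_size(eps, alpha):
        """Smallest integer b with b >= ln(2/alpha) / (2 * eps**2)."""
        return math.ceil(math.log(2 / alpha) / (2 * eps ** 2))

    print(dkw_sample_size(0.01, 0.05))   # 18445
    print(dkw_sample_size(0.05, 0.05))   # 738 -- a looser +/-0.05 band is far cheaper

Note how strongly the requirement scales with $\epsilon$: halving the tolerance quadruples the sample size.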
Perhaps one of the commenters will fill in the details on a more efficient solution for a single value of $c.$ ",Added a link to D-K-W inequality.,2013-10-14 20:35:33.910 +186319,57458,5237.0,6,,CC BY-SA 3.0,b4c3ac5e-cd11-426f-9546-63c2bd28b344,,added tags; formatted; removed peripheral comments,2013-10-14 20:47:24.533 +186347,57467,16469.0,5,,CC BY-SA 3.0,ab8dc2fe-f835-48aa-8582-41c958cd6470,"I understand that in a linear regression model like: + + +$y_i = b_0 + b_1 * x_i + \epsilon_i$ + +I can have a null and an alternative hypothesis: + +$H_0: b_1 = 0$ and $H_1: b_1 \neq 0$. + +And then I can reject $H_0$ or fail to reject $H_0$. But what if I want to accept that $b_1 = 0$?",added 19 characters in body,2013-10-14 22:03:47.687 +186318,57458,5237.0,5,,CC BY-SA 3.0,b4c3ac5e-cd11-426f-9546-63c2bd28b344,"I came across a [data mining course project](http://www.kdnuggets.com/data_mining_course/assignments/final-project.html) online. + +The data is of samples with 7000 features as genes. Each gene is associated with a value. Some of the values are negative. The data looks like in this way: + + SNO ""U48730_at"" ""U58516_at"" ""U73738_at"" ""X06956_at"" ""X16699_at"" ""X83863_at"" + + X1 "" 27"" "" 161"" "" 0"" "" 34"" "" 2"" "" 116"" + X2 "" 27"" "" 265"" "" 0"" "" 98"" "" 2"" "" 123"" + X3 "" 24"" "" 126"" "" 0"" "" 21"" "" 0"" "" 142"" + X4 "" 27"" "" 163"" "" -1"" "" 16"" "" -1"" "" 134"" + X5 "" 41"" "" 138"" "" 1"" "" 29"" "" 1"" "" 153"" + X6 "" 55"" "" 107"" "" -1"" "" 17"" "" 0"" "" 152"" + X7 "" 27"" "" 99"" "" 0"" "" 57"" "" 1"" "" 139"" + X8 "" 2"" "" 137"" "" -1"" "" 19"" "" -3"" "" 213"" + X9 "" -5"" "" 161"" "" -3"" "" 23"" "" 2"" "" 193"" + X10 "" 0"" "" 110"" "" -3"" "" 7"" "" -1"" "" 208"" + X11 "" -7"" "" 67"" "" 1"" "" 2"" "" -2"" "" 149"" + X12 "" 4"" "" 93"" "" 3"" "" 37"" "" 2"" "" 266"" + X13 "" 2"" "" 75"" "" 3"" "" 30"" "" 6"" "" 205"" + +The professor advise the students to first do 'data cleaning'. The original sentence is Threshold both train and test data to a minimum value of 20, maximum of 16,000. + +I first thought that it is to search over each gene and if there is a value out of the bounds, then just discard this gene as a feature. However, it seems for every gene, there must be a sample with the value out of bound. + +What should I do by ""threshold this data""? Is that like if the value is below 20, then set it 20 or if the value is above 16000, then just set it as 16000? + +In fact, I did the last operation in R by + + data[data<20] <- 20 + +and it turns out that the speed of the command is very slow. (79*7070 samples) +",added tags; formatted; removed peripheral comments,2013-10-14 20:47:24.533 +186321,57461,674.0,5,,CC BY-SA 3.0,30f8528b-ec84-4928-b491-04cf40c8b1d3,"On [pg. 378][1] of ""Cryptography with Tamperable and Leaky +Memory"", Kalai et al. claim two probability distributions are $e(k)$ close if the distance between them is at most $e(k)$. + +What is significance of two distributions X and Y being ""close to"" or ""far from"" each other? Why would anybody care, especially in cryptography? + + [1]: http://download.springer.com/static/pdf/305/chp%253A10.1007%252F978-3-642-22792-9_21.pdf?auth66=1381954902_7a8eccbd8188fded3878a75ad24f8c83&ext=.pdf",added 65 characters in body,2013-10-14 20:57:16.717 +186322,57452,22627.0,5,,CC BY-SA 3.0,2e150419-f34e-494b-9ef3-456837404f4f,"How would one estimate the maximum given population size, a few moments, and perhaps some additional assumption on the distribution? 
+ +Something like ""I'm going to do $N_s≫1$ measurements out of population of size $N_p≫N_s$; will record mean $μ_s$, standard deviation $σ_s$, and maximal value in the sample $X_s$; I am willing to assume binomial (or Poisson, etc) distribution; what is the expected maximal value $X_p$ of the entire population?"" + +Related question: does one need to make the assumptions on the nature of the population distribution, or the sample statistics would be enough to estimate $X_p$? + +Edit: the background I just added in the comments may not be clear enough. So here it is: + +The end purpose it to print a set of shapes (wires, gates, etc) on a VLSI circuit that matches the designed shapes (a.k.a. targets) as well as possible. The measure of fitness of the manufactured set of shapes is the MAXIMAL difference from the target, rather than the $\sigma$ along the $~10^9$ location. The reason for evaluating the maximum difference is clear: a single short circuit is bad enough to bring down the entire chip, and then it wouldn't matter how close you were to the target in the remaining 99.999999% of the chip's location. + +The problem is that it's very costly to measure the printed shape in too many locations: you literally need to look though an electron microscope at the half-manufactured chip (that's going to get trashed after the destructive measurements), adjust for metrology errors, etc. Therefore more than $10^4$ measurements is hardly ever being done. The result of those measurement is the maximal target difference $X_s$ of the SAMPLE, as well as any other sample statistics you may wish for. + +And now one needs to estimate the maximal difference $X_p$ for the entire population... And now one wishes that he paid more attention in the statistics class back in college...",added 1281 characters in body,2013-10-14 21:00:15.407 +186323,57319,1693.0,5,,CC BY-SA 3.0,799b3975-d4b3-4a57-bbd3-45f0970c940b,"I am modeling an outcome for hospital patients, 'RA' (whether readmitted). My predictor of interest is 'HHS' (whether referred to Home Health Services such as from a visiting nurse). Those referred readmit at a 15.2% rate; others, 9.2%, but the former are needier, sicker patients. Conventional thinking is that if we controlled for severity of illness this difference would not only be washed out but would reverse itself. In other words, holding constant the severity of illness, having HHS should mean a lower RA rate. + +With HHS as the sole predictor, B in a **logistic** regression = 0.6 (N ~ 25k). B is reduced to 0.2 with a group of covariates controlled, each accounting for some aspect of severity of illness, but B doesn't fall below zero. + +HHS alone explains only about 1% of the variance in RA; with the other predictors, this becomes 4%.* Perhaps this is the problem--that these covariates are not explaining enough variance to ""succeed"" in reversing the sign of the coefficient of interest. If this is true, is there a way to estimate how high **their** explained variance needs to be for such a reversal to show up? + + +---------- + +*Using either of 2 pseudo-RSQ formulas; Cox & Snell's or Menard's [-2LL0 - (-2LL1)] / [-2LL0.]",emphasized things that seem to be getting missed,2013-10-14 21:00:27.517 +186324,57462,22143.0,2,,CC BY-SA 3.0,bf401e13-0cfd-4398-8bac-afa630655a96,"*Try 1*: + +If $X \sim U[a,b]$ (uniform, either discrete or continuous), then the MLE estimator for b (which is $\max_{x \in [a,b]} X$) is essentially $\max_{i=1,...,N_s}x_i$. 
+ +I chose uniform distribution because it is the worst case distribution in terms of entropy. This is in line with the MaxEnt (maximum entropy) principle. I also assumed a linear order in the values of the random variable. + +We can make the following claim about the estimator $\max_{i=1,...,N_s}x_i$ to *its* mean using Hoeffdings inequality (without assuming that $X \sim U[a,b]$). Assuming $x_i$ are i.i.d from some distribution with bounded support $[a,b]$, we have +\begin{align*} +\mathbb{P}_{x_1,...,x_{N_s}}\left(|\max_{i=1,...,N_s}x_i - \mathbb{E}[\max_{i=1,...,N_s}x_i]| \geq \epsilon\right) \leq 2\exp\left(\frac{-2\epsilon^2}{N_s(b-a)}\right) +\end{align*} +Here we do not need to know $b$ exactly, any rough or crude upper bound will suffice. The above concentration is only saying that the estimator is close to the expected value of the estimator which is not the same as being close to the unknown $\max_{x \in [a,b] X} = b$. + + +*Additional comment*: I would make the measurements uniformly at random over the plane/chip so that hopefully no region with high $X$ values is missed. This observation is independent of the above.",,2013-10-14 21:10:03.567 +186325,57451,,5,,CC BY-SA 3.0,2f6b0898-ce1f-4eff-97e6-e8eb77b1939f,"Let $U,V,W$ are independent random variables with $\mathrm{Uniform}(0,1)$ distribution. I am trying to find the probability that $Ux^{2}+Vx+W$ has real roots, that is, $P(V^{2}-4UW> 0)$ +I have solved this question using double integral but how to do this using triple integral. +My Approach: +I started with cdf: +$P(V^{2}-4UW >0) =P(V^{2} > 4UW) = P(V>2\sqrt{UW})$ = $\int\int_{2\sqrt{uw}}^1 P(V>2\sqrt{UW}) dU dW$ +=$\int\int\int_{2\sqrt{uw}}^1 vdU dW dV$ + +I am finding hard time to get the limits of integral over the region in 3 dimensions. + +Using double integral: +$P(V^{2}-4UW >0) =P(V^{2} > 4UW) = P(-2\ln V <-\ln 4 - \ln U - \ln W) = P(X <-\ln 4 +Y)$ +where $X=-2 \ln V, Y = - \ln U -\ln W $ +$X$ has $\exp(1)$ and $Y$ has $\mathrm{gamma}(2,1)$ distribution. +$P(X <-\ln 4 +Y) = \int_{\ln4}^\infty P(X < -\ln 4 +Y) f_Y(y) dy $ +$$=\int_{\ln 4}^\infty\int_0^{-\ln 4+y} \frac{1}{2} e^{-\frac{x}{2}}ye^{-y} dxdy $$ +Solving this I got $0.2545$. + +Thanks!",added 42 characters in body,2013-10-14 21:11:37.503 +186328,57463,22687.0,3,,CC BY-SA 3.0,61c15056-2b23-4203-b6e1-23875b097154,,,2013-10-14 21:14:24.377 +186326,57463,22687.0,2,,CC BY-SA 3.0,61c15056-2b23-4203-b6e1-23875b097154,"In the dark ages, we would map the results of a Student's t-test to a null hypothesis probability *p* by looking up *T* and degrees of freedom in a table to get an approximate result. + +What is the mathematical algorithm that generates that table? *ie*, how can I write a function to generate a precise *p* given an arbitrary *T* and *df*? + +The reason I ask is that I'm writing a piece of embedded software that continually monitors hundreds of populations with hundreds of samples each, and raises an alert if successive snapshots of a given population come to differ significantly. Currently it uses a crude *z*-score comparison, but it would be nice to use a more valid test. ",,2013-10-14 21:14:24.377 +186329,57452,668.0,6,,CC BY-SA 3.0,dab5fe07-d5c4-4682-beff-318a5b36ca8f,,edited tags,2013-10-14 21:18:11.250 +186330,57464,22507.0,2,,CC BY-SA 3.0,3577bed7-1dd0-4108-9f1d-f360f3b72378,The algorithm which never received 0's will be grossly biased and predict amost exclusively 1's. 
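On the question a little earlier about how the classical t-table is generated: in practice one evaluates the Student-$t$ tail probability directly rather than interpolating a printed table; the two-sided $p$ for a statistic $T$ with $\nu$ degrees of freedom is $P(|T_\nu| \ge |T|) = I_{\nu/(\nu+T^2)}(\nu/2, 1/2)$, a regularized incomplete beta function, which is what library routines compute. A minimal sketch with SciPy (the function name is mine):

    from scipy import stats

    def two_sided_p(t_stat, df):
        """Two-sided p-value for a Student-t statistic with df degrees of freedom."""
        return 2.0 * stats.t.sf(abs(t_stat), df)

    print(two_sided_p(2.0, 10))    # ~0.073
    print(two_sided_p(2.0, 100))   # ~0.048

For embedded software without SciPy, the incomplete beta function itself is the piece to implement (commonly done with a continued-fraction expansion).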
,,2013-10-14 21:19:18.890 +186331,57461,,25,,,1c1522b0-ff57-48d2-a5a9-db20fb291343,,http://twitter.com/#!/StackStats/status/389863932598689792,2013-10-14 21:22:59.600 +186332,57465,22143.0,2,,CC BY-SA 3.0,0d143c84-dcd7-4ca1-ae6c-63b6cb33c9fc,"I am thinking of the following two points: + + - You are observing the true labels and their associated predictors, a.k.a the pair $y_i,x_i$ only when the algorithm is predicting a label of $1$. The algorithm is updated regardless of whether it made an error or not. This means that there is no feedback on mistakes (like in online learning). We get new data irrespective of our prediction performance. + + + - The question we need to ask is then: *Does the algorithm's output influence the data source?* If the algorithm is not influencing the source, then this aspect where we 'conditionally observe new data' will not bias the algorithm by itself (everything else held constant).",,2013-10-14 21:24:50.677 +186333,57464,668.0,33,,,2dda2333-42d3-4dce-87db-f82d7f00e7f6,,822,2013-10-14 21:26:23.200 +186334,57407,594.0,5,,CC BY-SA 3.0,4548fda7-26e2-4787-ab8d-f628e5069c28,"Since you appear to doubt the example offered, I have included a diagram. As Michael Mayer said, two normal distributions centered around 0, one with larger variance, is sufficient. + +In the diagram, we compare the 0.1 and the 0.9 quantiles for $\sigma=1$ (blue) and $\sigma=0.8$ (dark orange) + +![normcdfs sigma=1, sigma=0.8][1] + +Michael Mayer's example fulfills the requirements of your question with $q_1=0.1$, $q_2=0.9$ and $X_1$ being the one with larger variance. + +--- + +Edit: + +For the case where $q_1$ and $q_2$ must both be on the same side of whatever the measure of location is, let's take two symmetric distributions. + +Let $X_1$ be $\sim \text{N}(0,1^2)$ and let $X_2$ be an equal mixture of a $\text{N}(-0.8,0.1^2)$ and a $\text{N}(0.8,0.1^2)$, and let $q_1 = 0.6$ and $q_2 = 0.9$: + +![normal 0,1 vs symmetric mixture of normals with small s.d.][2] + +This example fulfills the new requirements of your question with $q_1=0.6$, $q_2=0.9$ and $X_1$ being the one with only a single normal component (shown in blue above). + + [1]: https://i.stack.imgur.com/pT43v.png + [2]: https://i.stack.imgur.com/2dHFj.png",added 613 characters in body,2013-10-14 21:47:55.657 +186335,57462,22143.0,5,,CC BY-SA 3.0,be99e4b7-a614-49c0-bcdb-20e91dbd0300,"*Try 1*: + +If $X \sim U[a,b]$ (uniform, either discrete or continuous), then the MLE estimator for b (which is $\max_{x \in [a,b]} X$) is essentially $\max_{i=1,...,N_s}x_i$. + +I chose uniform distribution because it is the worst case distribution in terms of entropy. This is in line with the MaxEnt (maximum entropy) principle. I also assumed a linear order in the values of the random variable. + +We can make the following claim about the estimator $\max_{i=1,...,N_s}x_i$ to *its* mean using Hoeffdings inequality (without assuming that $X \sim U[a,b]$). Assuming $x_i$ are i.i.d from some distribution with bounded support $[a,b]$, we have +\begin{align*} +\mathbb{P}_{x_1,...,x_{N_s}}\left(|\max_{i=1,...,N_s}x_i - \mathbb{E}[\max_{i=1,...,N_s}x_i]| \geq \epsilon\right) \leq 2\exp\left(\frac{-2\epsilon^2}{N_s(b-a)}\right) +\end{align*} +Here we do not need to know $b$ exactly, any rough or crude upper bound will suffice. The above concentration is only saying that the estimator is close to the expected value of the estimator which is not the same as being close to the unknown $\max_{x \in [a,b]}X = b$. 
+ + +*Additional comment*: I would make the measurements uniformly at random over the plane/chip so that hopefully no region with high $X$ values is missed. This observation is independent of the above.",lowercase X to uppercase X,2013-10-14 21:47:57.387 +186336,57401,594.0,5,,CC BY-SA 3.0,4bc20b9d-f358-4cae-85d9-fd2330ce198f,"Let say, I have 2 continuous random variables X1 & X2. Both have same location parameters. Other parameters may be same or may not. + +Now say, the q1-th quantile of X1 is less than the q1-th quantile of x2. But the q2-th quantile of x1 is more than the q2th quantile of x2. + +My question is, is that possible? Is there any example of x1 & x2 which have that property? + +I will be really grateful if someone can give me some pointer. + +--- + +Edit: At this point, I realize the question I asked for was not correctly specified. + +I'm particularly interested in the case where the two quantiles being considered are on the same side of the location parameter. +",Edited in modification of question,2013-10-14 21:50:33.490 +186337,57407,594.0,5,,CC BY-SA 3.0,7c29850d-de7d-4942-b4c9-16f85e4e8aaa,"Since you appear to doubt the example offered, I have included a diagram. As Michael Mayer said, two normal distributions centered around 0, one with larger variance, is sufficient. + +In the diagram, we compare the 0.1 and the 0.9 quantiles for $\sigma=1$ (blue) and $\sigma=0.8$ (dark orange) + +![normcdfs sigma=1, sigma=0.8][1] + +Michael Mayer's example fulfills the requirements of your question with $q_1=0.1$, $q_2=0.9$ and $X_1$ being the one with larger variance. + +--- + +Edit: + +For the case where $q_1$ and $q_2$ must both be on the same side of whatever the measure of location is, let's take two symmetric distributions, which share the same mean and median. + +Let $X_1$ be $\sim \text{N}(0,1^2)$ and let $X_2$ be an equal mixture of a $\text{N}(-0.8,0.1^2)$ and a $\text{N}(0.8,0.1^2)$, and let $q_1 = 0.6$ and $q_2 = 0.9$: + +![normal 0,1 vs symmetric mixture of normals with small s.d.][2] + +This example fulfills the new requirements of your question with $q_1=0.6$, $q_2=0.9$ and $X_1$ being the one with only a single normal component (shown in blue above). + +Further, you should note that 'location parameter' isn't sufficiently specified. I could parameterize normal distributions by their 5th percentile and their standard deviation, and call the parameter based on the 5th percentile the location parameter (it's just a shift of the mean by 1.645. and still a perfectly valid location parameter). Then Michael's original example suffices even under the new conditions! + +(If that contradicts your intention, your intention needs to be specific enough to exclude it.) + + [1]: https://i.stack.imgur.com/pT43v.png + [2]: https://i.stack.imgur.com/2dHFj.png",added 613 characters in body,2013-10-14 21:56:22.453 +186340,57466,22690.0,3,,CC BY-SA 3.0,12f10fd2-c133-4190-b061-d656b66ce430,,,2013-10-14 21:58:00.910 +186339,57466,22690.0,1,,CC BY-SA 3.0,12f10fd2-c133-4190-b061-d656b66ce430,Naive Bayes with invalid independence assumption,,2013-10-14 21:58:00.910 +186349,57469,22507.0,2,,CC BY-SA 3.0,166e842b-c687-4ccc-8081-e2243825f323,"You cannot. ""Accept that $b_1=0$"" is the same as ""reject that $b_1\ne 0$"". But on what basis you could do this? No matter how many observations you have, you cannot distinguish between 0 and sufficiently small value of $b_1$. 
You can only accept that $|b_1|<\epsilon$ (the smaller $\epsilon$ the more observations you need).",,2013-10-14 22:07:54.383 +186352,57468,155.0,4,,CC BY-SA 3.0,25195edb-211a-41e2-a906-7cea371476ed,Whether to log transform variable when untransformed variable has positive skew and transformed has negative skew?,deleted 11 characters in body; edited tags; edited title,2013-10-14 22:08:55.297 +186351,57468,155.0,6,,CC BY-SA 3.0,25195edb-211a-41e2-a906-7cea371476ed,,deleted 11 characters in body; edited tags; edited title,2013-10-14 22:08:55.297 +186350,57468,155.0,5,,CC BY-SA 3.0,25195edb-211a-41e2-a906-7cea371476ed,"I have performed a log transformation on my skewed data, however on my DV it went from positive skew to negative skew after the (log) transformation, further data was missing from my DV after the transformation. Please help",deleted 11 characters in body; edited tags; edited title,2013-10-14 22:08:55.297 +186353,57470,22143.0,2,,CC BY-SA 3.0,521a17b0-41bd-4c4a-aa6a-e27917f68ab9,"*Try 2*: + +This is a heuristic and I don't know of any statistical guarantees. The procedure is as follows: + + - construct the empirical distribution function. If it looks exponential, convert the values to log scale to see a power-law tail. + - Fit a curve on this modified histogram. That is, do a 1-D regression. Hopefully the curve mimics the tail of a well-behaved distribution. + - Pick the point where the line intersects the x-axis in the interval $[\max_{i=1,...,N_s}x_i,\infty)$. + +This is another estimator of the max value of the support of the *population*.",,2013-10-14 22:17:38.120 +186354,57471,155.0,2,,CC BY-SA 3.0,636a075a-7045-445b-bdf0-fee02ec436e7,"### Additional missing data after log transformation +If you have additional missing data after log transformation, it is likely that you have data that is less than or equal to zero. (i.e., log(0), log(-1), etc. is not defined). So if you want to use a log transformation on data with negative numbers, you need to add a constant to the raw variable so that the minimum of the resulting variable is greater than zero. So your transformation should be + + $$\log(x + c)$$ + +where $x$ is your untransformed variable and $c = 1 - \textrm{min}(x)$. + +### Transformation flips the skewness +There is plenty of discussion on this site about when and whether transformations are useful. You might also like this [discussion of issues surrounding transformations](http://pareonline.net/getvn.asp?v=8&n=6). In general, if a log transformation is flipping the direction of your skewness, then there is a good chance that you did not have very much skewness to begin with. To test whether the transformation makes a substantive difference with the context of multiple regression, examine your correlations, R-squares, and standardised betas before and after transformation, and see what changes you observed. In many cases you will see that it makes little difference. + +Another point, is that the assumption pertains to the residuals of a multiple regression and not the dependent variable itself. + +If you really care about optimising the transformation to make the variable approximate a normal distribution, then you can use the [box-cox transformation](http://en.wikipedia.org/wiki/Power_transform). Or a simpler approach is just to try a range of transformations. A common set of transformations from greater to less change is: + + -1/x^2 + -1/x + log(x) + sqrt(x) + +So if `log(x)` is transforming too much, you could try `sqrt(x)`. 
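A small Python sketch of the advice above (the data and names here are illustrative only): shift the variable so its minimum is strictly positive, then compare skewness across the usual ladder of transformations instead of committing to the log up front.

    import numpy as np
    from scipy.stats import skew

    rng = np.random.default_rng(1)
    x = rng.gamma(shape=2.0, scale=3.0, size=500) - 5.0   # right-skewed, with negative values

    c = 1 - x.min()                      # shift constant, so min(x + c) == 1 > 0
    ladder = {
        "-1/x^2":  lambda z: -1.0 / z**2,
        "-1/x":    lambda z: -1.0 / z,
        "log(x)":  np.log,
        "sqrt(x)": np.sqrt,
        "none":    lambda z: z,
    }
    for name, f in ladder.items():
        print(f"{name:>8}: skewness = {skew(f(x + c)):+.2f}")

If `log(x + c)` overshoots into negative skew, as in the original question, the weaker `sqrt(x + c)` (or no transformation at all) is often the better choice.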
+ +",,2013-10-14 22:29:32.767 +186355,57468,155.0,6,,CC BY-SA 3.0,36cdac63-432c-4aae-ab87-6ef7390f5283,,edited title,2013-10-14 22:29:51.690 +186356,57468,155.0,4,,CC BY-SA 3.0,36cdac63-432c-4aae-ab87-6ef7390f5283,Whether to log transform variable when untransformed variable has positive skew and transformed has negative skew with additional missing data?,edited title,2013-10-14 22:29:51.690 +186357,57471,155.0,5,,CC BY-SA 3.0,8b3e371b-3130-4219-acaa-68fb302ddd0f,"### Additional missing data after log transformation +If you have additional missing data after log transformation, it is likely that you have data that is less than or equal to zero. (i.e., log(0), log(-1), etc. is not defined). So if you want to use a log transformation on data with negative numbers, you need to add a constant to the raw variable so that the minimum of the resulting variable is greater than zero. So your transformation could be + + $$\log(x + c)$$ + +where $x$ is your untransformed variable and $c = 1 - \textrm{min}(x)$. + +### Transformation flips the skewness +There is plenty of discussion on this site about when and whether transformations are useful. You might also like this [discussion of issues surrounding transformations](http://pareonline.net/getvn.asp?v=8&n=6). In general, if a log transformation is flipping the direction of your skewness, then there is a good chance that you did not have very much skewness to begin with. To test whether the transformation makes a substantive difference with the context of multiple regression, examine your correlations, R-squares, and standardised betas before and after transformation, and see what changes you observed. In many cases you will see that it makes little difference. + +Another point, is that the assumption pertains to the residuals of a multiple regression and not the dependent variable itself. + +If you really care about optimising the transformation to make the variable approximate a normal distribution, then you can use the [box-cox transformation](http://en.wikipedia.org/wiki/Power_transform). Or a simpler approach is just to try a range of transformations. A common set of transformations from greater to less change is: + + -1/x^2 + -1/x + log(x) + sqrt(x) + +So if `log(x)` is transforming too much, you could try `sqrt(x)`. + +",deleted 1 characters in body,2013-10-14 22:34:52.727 +186360,57472,21991.0,3,,CC BY-SA 3.0,13705cbe-734f-49e8-8367-005ffc64f799,,,2013-10-14 22:37:36.997 +186359,57472,21991.0,1,,CC BY-SA 3.0,13705cbe-734f-49e8-8367-005ffc64f799,Numerical example to understand Expectation-Maximization,,2013-10-14 22:37:36.997 +186358,57472,21991.0,2,,CC BY-SA 3.0,13705cbe-734f-49e8-8367-005ffc64f799,"I am trying to get a good grasp on the EM algorithm, to be able to implement and use it. I spent a full day reading the theory and a paper where EM is used to track an aircraft using the position information coming from a radar. Honestly, I dont think I fully understand the underlying idea. Can someone point me to a numerical example showing a few iterations (3-4) of the EM for a simpler problem (like estimating the parameters of a Gaussian distribution or a sequence of a sinusoidal series or fitting a line). + +Even if someone can point me to a piece of code (with synthetic data), I can try and step through the code. + +Thanks a lot. 
+ + + ",,2013-10-14 22:37:36.997 +186470,57498,16474.0,5,,CC BY-SA 3.0,f948319d-b751-4408-ab0c-565465769ac6,"As statisticians, we come across many distributions under the banners ""discrete"",""continuous"" and ""univariate"",""multivariate"".But can anyone, offer me a good reason behind the existence and motivation for so many distributions. How do we get them? and what can a layman understand from it? + +What is the logic behind the existence of distributions?",deleted 18 characters in body,2013-10-15 08:01:56.230 +186361,57407,594.0,5,,CC BY-SA 3.0,0ae68851-626f-47fb-b116-4176f95c786f,"Since you appear to doubt the example offered, I have included a diagram. As Michael Mayer said, two normal distributions centered around 0, one with larger variance, is sufficient. + +In the diagram, we compare the 0.1 and the 0.9 quantiles for $\sigma=1$ (blue) and $\sigma=0.8$ (dark orange) + +![normcdfs sigma=1, sigma=0.8][1] + +Michael Mayer's example fulfills the requirements of your question with $q_1=0.1$, $q_2=0.9$ and $X_1$ being the one with larger variance. + +--- + +Edit: + +For the case where $q_1$ and $q_2$ must both be on the same side of whatever the measure of location is, let's take two symmetric distributions, which share the same mean and median. + +Let $X_1$ be $\sim \text{N}(0,1^2)$ and let $X_2$ be an equal mixture of a $\text{N}(-0.8,0.1^2)$ and a $\text{N}(0.8,0.1^2)$, and let $q_1 = 0.6$ and $q_2 = 0.9$: + +![normal 0,1 vs symmetric mixture of normals with small s.d.][2] + +This example fulfills the new requirements of your question with $q_1=0.6$, $q_2=0.9$ and $X_1$ being the one with only a single normal component (shown in blue above). + +Further, you should note that 'location parameter' isn't sufficiently specified. I could parameterize normal distributions by their 5th percentile and their standard deviation, and call the parameter based on the 5th percentile the location parameter (it's just a shift of the mean by 1.645\sigma. and can work as a perfectly valid location parameter). Then Michael's original example suffices even under the new conditions. If that contradicts your intention, your intention needs to be stated specifically enough to exclude it. + + [1]: https://i.stack.imgur.com/pT43v.png + [2]: https://i.stack.imgur.com/2dHFj.png",added 18 characters in body,2013-10-14 22:45:40.010 +186362,57471,15827.0,5,,CC BY-SA 3.0,2bb59698-3f9c-4bfa-91ca-1ca5c8970d17,"### Additional missing data after log transformation +If you have additional missing data after log transformation, it is likely that you have data that is less than or equal to zero. (i.e., log(0), log(-1), etc. is not defined). So if you want to use a log transformation on data with negative numbers, you need to add a constant to the raw variable so that the minimum of the resulting variable is greater than zero. So your transformation could be + + $$\log(x + c)$$ + +where $x$ is your untransformed variable and $c = 1 - \textrm{min}(x)$. + +### Transformation flips the skewness +There is plenty of discussion on this site about when and whether transformations are useful. You might also like this [discussion of issues surrounding transformations](http://pareonline.net/getvn.asp?v=8&n=6). In general, if a log transformation is flipping the direction of your skewness, then there is a good chance that you did not have very much skewness to begin with. 
To test whether the transformation makes a substantive difference with the context of multiple regression, examine your correlations, R-squares, and standardised betas before and after transformation, and see what changes you observed. In many cases you will see that it makes little difference. + +Another point, is that the assumption pertains to the residuals of a multiple regression and not the dependent variable itself. + +If you really care about optimising the transformation to make the variable approximate a normal distribution, then you can use the [Box-Cox transformation](http://en.wikipedia.org/wiki/Power_transform). Or a simpler approach is just to try a range of transformations. A common set of transformations from greater to less change is: + + -1/x^2 + -1/x + log(x) + sqrt(x) + +So if `log(x)` is transforming too much, you could try `sqrt(x)`. + +","Box, Cox proper names ",2013-10-14 22:48:09.820 +186363,57467,16469.0,6,,CC BY-SA 3.0,fa327cb6-3f3e-4483-8587-c59b514bf44b,,edited tags,2013-10-14 22:50:41.347 +186366,57473,5643.0,3,,CC BY-SA 3.0,7be0dfba-53df-4825-b890-17af59f59aaa,,,2013-10-14 23:00:43.557 +186365,57473,5643.0,1,,CC BY-SA 3.0,7be0dfba-53df-4825-b890-17af59f59aaa,Intuition for the difference of sample means,,2013-10-14 23:00:43.557 +186364,57473,5643.0,2,,CC BY-SA 3.0,7be0dfba-53df-4825-b890-17af59f59aaa,"I read in Wilcox, 2003 p. 247 that the standard error of the difference between two sample means is (assuming the normality and homoskedasticity assumptions): + +$\sqrt{\frac{\sigma_1^2}{n_1} + \frac{\sigma_2^2}{n_2}}$ + +Rather than simply adding the two distributions standard errors as in: + +$\frac{\sigma_1^2}{\sqrt{n_1}} + \frac{\sigma_2^2}{\sqrt{n_2}}$ + +What is the intuition behind squaring the sum of the two variances divided into their respective sample size?",,2013-10-14 23:00:43.557 +186370,57474,22163.0,3,,CC BY-SA 3.0,49318068-1b21-415f-8ca4-bfc985f90017,,,2013-10-14 23:12:42.747 +186369,57474,22163.0,2,,CC BY-SA 3.0,49318068-1b21-415f-8ca4-bfc985f90017,"I want to do a logistic regression in SPSS. However, since I analyse unemployment spells the subjects are sometimes repeated (violating the independence assumption of the regression). One way of removing the within subject variation is by applying a Genlin model with the repeated subject subcommand (in essence a GEE model). Thus, I tried out a Genlin model with binomal probability and the logit link, comparing it to a standard logistic regression. I used the exact same variables in the two procedures. + +However, the results that was delivered from the Genlin procedure was inverted relative to that of the logistic regression. For instance: Exp(B) for women (of the independent variable sex/gender) was just above 2.0 in logistic regression while being at 0.49 in Genlin. The same happened with every independent variable. + + - Any suggestions to why the results of the Genlin procedure is + inverted? + - Is there any way to get Genlin results in accordance to the logistic regression?",,2013-10-14 23:12:42.747 +186368,57474,22163.0,1,,CC BY-SA 3.0,49318068-1b21-415f-8ca4-bfc985f90017,Inverse SPSS results: Logistic regression command vs. Genlin?,,2013-10-14 23:12:42.747 +186371,57475,503.0,2,,CC BY-SA 3.0,93fa2ee1-d5ef-4acd-8ec1-e4a5a18418e6,"Look into equivalence testing. See [this search][1] for lots of threads. 
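To make the equivalence-testing pointer concrete for the regression question above (can we positively conclude that $b_1$ is negligibly different from 0?), here is a hedged sketch of two one-sided tests (TOST) against a user-chosen margin $\epsilon$; the margin, the simulated data and all names are illustrative, and this is only one of several ways to set the procedure up.

    import numpy as np
    from scipy import stats
    import statsmodels.api as sm

    rng = np.random.default_rng(0)
    x = rng.normal(size=200)
    y = 1.0 + 0.02 * x + rng.normal(size=200)      # true slope is tiny but not exactly zero

    res = sm.OLS(y, sm.add_constant(x)).fit()
    b1, se, df = res.params[1], res.bse[1], res.df_resid

    eps = 0.10                                      # "practically zero" margin: a substantive choice
    p_lower = stats.t.sf((b1 + eps) / se, df)       # H0: b1 <= -eps
    p_upper = stats.t.cdf((b1 - eps) / se, df)      # H0: b1 >= +eps
    p_tost = max(p_lower, p_upper)
    print(f"b1 = {b1:.3f}, TOST p = {p_tost:.3f}")  # small p -> conclude |b1| < eps

Rejecting both one-sided nulls at the 5% level (equivalently, the 90% confidence interval for $b_1$ lying inside $(-\epsilon, \epsilon)$) is what lets you "accept" that the coefficient is practically zero, rather than merely failing to reject $b_1 = 0$.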
Also see [Esteban & Nowacki][2] + + + [1]: http://stats.stackexchange.com/search?q=equivalence%20testing + [2]: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3019319/",,2013-10-14 23:15:43.797 +186372,57476,503.0,2,,CC BY-SA 3.0,3ab9d683-82ef-4a69-a3ca-f9b555418e55,"You don't square the sum of the variances, you take the square root of thQe sum of the variances. You do this for the same reason that the standard deviation is the square root of the variance: It make the units the same as the original ones, rather than squared units. + +Although we often lose site of it while doing statistics, the square of a measure involve squaring the *measure* as well as the number of units. For example, the square of 2 meters is not 4 meters, it is 4 meters squared, more commonly called 4 square meters. The same thing happens with other units that we aren't used to thinking of in this way: e.g. if you are measuring IQ, the square of an IQ is not an IQ of 10,000; it is a squared IQ of 10,000. + +You divide by the sample size as a scaling technique. Variances (tend to) go up with sample size; you divide by n to deal with that. ",,2013-10-14 23:21:40.057 +186373,57476,15827.0,5,,CC BY-SA 3.0,6288582f-90b1-4736-ace3-d63c29d70dca,"You don't square the sum of the variances, you take the square root of the sum of the variances. You do this for the same reason that the standard deviation is the square root of the variance: It make the units the same as the original ones, rather than squared units. + +Although we often lose sight of it while doing statistics, the square of a measure involve squaring the *measure* as well as the number of units. For example, the square of 2 meters is not 4 meters, it is 4 meters squared, more commonly called 4 square meters. The same thing happens with other units that we aren't used to thinking of in this way: e.g. if you are measuring IQ, the square of an IQ is not an IQ of 10,000; it is a squared IQ of 10,000. + +You divide by the sample size as a scaling technique. Variances (tend to) go up with sample size; you divide by $n$ to deal with that. ",small fixes,2013-10-14 23:23:03.383 +186374,57477,594.0,2,,CC BY-SA 3.0,78b3e376-3e9a-4f3a-9ee2-94dd21d38cd4,"For $X,Y$ independent, $\text{Var}(\bar X-\bar Y) = \text{Var}(\bar X) + \text{Var}(\bar Y)$ + +You seem to be thinking that $\sqrt{\text{Var}(\bar X-\bar Y)} = \sqrt{\text{Var}(\bar X)} + \sqrt{\text{Var}(\bar Y)}$. + +This is not the case for independent variables. + +http://en.wikipedia.org/wiki/Variance#Basic_properties +",,2013-10-14 23:40:17.447 +186375,57477,594.0,5,,CC BY-SA 3.0,3ee355a1-e8e0-4d5f-8979-e9a468c0efe5,"You seem to be thinking that $\sqrt{\text{Var}(\bar X-\bar Y)} = \sqrt{\text{Var}(\bar X)} + \sqrt{\text{Var}(\bar Y)}$. + +This is not the case for independent variables. + +For $X,Y$ independent, $\text{Var}(\bar X-\bar Y) = \text{Var}(\bar X) + \text{Var}(\bar Y)$ + +Further, + +$\text{Var}(\bar X) = \text{Var}(\frac{1}{n}\sum_iX_i) = \frac{1}{n^2}\text{Var}(\sum_iX_i)= \frac{1}{n^2}\sum_i\text{Var}(X_i)= \frac{1}{n^2}\cdot n\cdot\sigma^2_1= \sigma^2_1/n$ + +(if the $X_i$ are independent of each other). 
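A quick simulation sketch of the identity just derived (the numbers are arbitrary): the spread of $\bar X - \bar Y$ matches $\sqrt{\sigma_1^2/n_1 + \sigma_2^2/n_2}$, not the sum of the two standard errors.

    import numpy as np

    rng = np.random.default_rng(0)
    n1, n2, s1, s2 = 30, 50, 2.0, 3.0
    reps = 100_000

    xbar = rng.normal(0.0, s1, size=(reps, n1)).mean(axis=1)
    ybar = rng.normal(0.0, s2, size=(reps, n2)).mean(axis=1)

    print(np.std(xbar - ybar))                        # empirical SE of the difference, ~0.56
    print(np.sqrt(s1**2 / n1 + s2**2 / n2))           # 0.5598..., the formula from the question
    print(s1**2 / np.sqrt(n1) + s2**2 / np.sqrt(n2))  # ~2.0, the additive version, clearly off

Variances of independent quantities add; standard errors do not.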
+ +http://en.wikipedia.org/wiki/Variance#Basic_properties + +(If the algebra is unconvincing, or you don't believe the basic properties of variance that I used, you might consider simulation, which should soon make it clear.)",added 198 characters in body,2013-10-14 23:46:12.817 +186378,57478,7860.0,3,,CC BY-SA 3.0,6ce06302-530c-47d7-b95f-30367712e271,,,2013-10-15 00:04:46.003 +186377,57478,7860.0,1,,CC BY-SA 3.0,6ce06302-530c-47d7-b95f-30367712e271,Adding errors to Gaussian kernel density estimator,,2013-10-15 00:04:46.003 +186376,57478,7860.0,2,,CC BY-SA 3.0,6ce06302-530c-47d7-b95f-30367712e271,"I'm using the [scipy.stats.gaussian_kde][1] function to generate a `KDE` from a set of $N$ points in a 2D space: $A = \{(x_1,y_1), (x_2,y_2), (x_3,y_2), ..., (x_N,y_N)\}$ + +Each one of these points has a given error attached to it. So for example, the point $(x_1,y_1)$ has errors $(e_{x_1},e_{y_1})$ and so on. I can assume the errors are normally distributed in both axis. + +The `python` function that I use to generate the `KDE` has no way to integrate these errors into the calculations and I wonder how I would even do such a thing if I did it manually. + +Ie: what is the statistically correct way to generate a `KDE` accounting for errors in the data used? + + + [1]: http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.gaussian_kde.html",,2013-10-15 00:04:46.003 +186379,57473,5643.0,5,,CC BY-SA 3.0,2b79a748-b9aa-41a9-80d5-8abfe4c490ec,"I read in Wilcox, 2003 p. 247 that the standard error of the difference between two sample means is (assuming the normality and homoskedasticity assumptions): + +$\sqrt{\frac{\sigma_1^2}{n_1} + \frac{\sigma_2^2}{n_2}}$ + +Rather than simply adding the two samples standard errors as in: + +$\frac{\sigma_1^2}{\sqrt{n_1}} + \frac{\sigma_2^2}{\sqrt{n_2}}$ + +What is the intuition behind squaring the sum of the two variances divided into their respective sample size?","Rephrased, question is about sample mean errors, not the underlying distributions",2013-10-15 00:20:57.200 +186380,57468,,25,,,54dbb17a-d61b-472e-9152-fa82fbd069a0,,http://twitter.com/#!/StackStats/status/389909221011386368,2013-10-15 00:22:57.193 +186383,57479,64247.0,2,abe3,CC BY-SA 3.0,faa20063-e539-41da-84c8-90ad3c0a7df3,"I am looking over slides for a big data class. The slides suggest doing a pairwise plot of data (if not too many variables) to evaluate the quality of output from k-means clustering -- with each data point color-coded by its cluster. The slides say: + +> If the (colored) clusters look separated in at least some of the plots. They won’t be very separated in all of the plots. + +How would this tell you if a pairwise plot is effective? You would want the colors to be mixed up in the plots to make sure that you have genuine multi-dimensional clusters and not just groups of data points that are very similar on one variable?",,2013-10-15 00:37:03.313 +186382,57479,64247.0,3,abe3,CC BY-SA 3.0,faa20063-e539-41da-84c8-90ad3c0a7df3,,,2013-10-15 00:37:03.313 +186381,57479,64247.0,1,abe3,CC BY-SA 3.0,faa20063-e539-41da-84c8-90ad3c0a7df3,How would you use pair-wise plots to test the effectiveness of k-means clustering?,,2013-10-15 00:37:03.313 +186384,57480,2121.0,2,,CC BY-SA 3.0,d6786cef-171e-4805-8863-c37895c71e51,"I think if each dataset is already weighted to your satisfaction, then you have a couple of different options. Which one is the right one may vary based on your objectives and the particulars of your existing data collection and weighting. 
+ + - Union all of the datasets, along with their pre-calculated weights, and that's it. + +> This would be the right choice if each dataset was weighted towards a proper total count and didn't over-state the importance of any individual record relative to another dataset. If one dataset was weighted to reflect Total US Population, and another dataset was weighted in place to its own total count of respondents, then this would not be the right choice. + + - Calculate a weight for each dataset to multiply by each record's existing weight + +> This would be the right choice if each of your datasets are of equal importance regardless of their size. Example below... + + - Union all of the raw data and re-calculate the weights on the new, entire dataset + +> This would be the right choice if the reasons for non-response are similar across your different surveys - it results in the simplest data for you to work with, and it's the least likely to produce extreme weights. + +Example for #2: each dataset is weighted to equal importance, with this ""dataset weight"" being multiplied by whatever weight has already been calculated within the dataset. + + > Survey 1: 100 people weight: 2 + > Survey 2: 200 people weight: 1 + > Survey 3: 300 people weight: 2/3 + > Survey 4: 150 people weight: 4/3 + > Survey 5: 250 people weight: 4/5",,2013-10-15 01:07:38.337 +186387,57481,14548.0,3,,CC BY-SA 3.0,9ff8dbd6-f8e4-4b1f-a4ff-81f0618288d4,,,2013-10-15 01:08:53.047 +186386,57481,14548.0,2,,CC BY-SA 3.0,9ff8dbd6-f8e4-4b1f-a4ff-81f0618288d4,"Having performed linear regression, I can find the confidence interval for the response conditioned on a particular x value. However, I am interested in a C.I for the *mean* response for a set of N new observations. That is, I need to combine the N prediction intervals intervals. + +The closest post I could find was http://stats.stackexchange.com/questions/8755/calculating-the-mean-using-regression-data, but it only handles the univariate case. + +I tried deriving the standard error of the mean response below, but I'm not sure if this correct. + +$\begin{align} +var(\hat{\bar{y}}) &= var \left( \frac{1}{n} \sum_i \hat{y}_i|x_1 \ldots x_n \right) \\ +&= var \left( \frac{1}{n} \sum_i \hat{y}_i|x_i \right), \quad \text{where the } \hat{y_i}|x_i \text{ are independent} \\ +&= \frac{1}{n^2} \sum_i var(\hat{y}_i|x_i) \\ +\end{align}$ + +where $var(\hat{y}_i|x_i) = \sqrt{\sigma^2 x_i^T (X^TX)^{-1}x_i}$ for $x_i$ in the training data and $var(\hat{y}_i|x^*_i) = \sqrt{\sigma^2 (1+ x_i^{*T} (X^TX)^{-1}x^*_i)}$ for $x_i$ in the test data. + +Am I right track here? Also, is there an R implementation somewhere, or should I do it from scratch? + +Thanks, + +A.",,2013-10-15 01:08:53.047 +186385,57481,14548.0,1,,CC BY-SA 3.0,9ff8dbd6-f8e4-4b1f-a4ff-81f0618288d4,Combining prediction confidence intervals in a regression,,2013-10-15 01:08:53.047 +186390,57482,22695.0,3,,CC BY-SA 3.0,2524c28b-0a79-43cd-b8dd-a276358e8fc3,,,2013-10-15 01:09:29.977 +186389,57482,22695.0,1,,CC BY-SA 3.0,2524c28b-0a79-43cd-b8dd-a276358e8fc3,Finding the full conditonal distribution when there are multiple distributions involved,,2013-10-15 01:09:29.977 +186388,57482,22695.0,2,,CC BY-SA 3.0,2524c28b-0a79-43cd-b8dd-a276358e8fc3,"6 neighboring countries have the following disease instances: $y = (y_1, y_2,...,y_n)$ with a population of $x = (x_1, x_2,...,x_n)$. 
+ +The following model and prior distributions are considered: + +$y_i|\theta_i,p_i \sim poisson(\theta_i x_i)$ + +$\theta_i | \alpha, \beta \sim gamma(\alpha, \beta)$ + +$\alpha \sim gamma(1,1)$ + +$\beta \sim gamma(10,1)$ + +a) Find the full conditional rate $p(\theta_i | \theta_{-i}, \alpha, \beta, x, y)$ + +b) Find the posterior distribution. + +Attempt: + +a) For finding the conditional rate with two variables, I would use Bayes' theory. I am not sure if this applies with multiple distributions. + +$$p(\theta_i | \theta_{-i}, \alpha, \beta, x, y) = \frac{P(\theta_i \bigcap \theta_{-i} \bigcap \alpha \bigcap \beta \bigcap x \bigcap y)}{P( \theta_{-i}, \alpha, \beta, x, y)}$$ + +$$ = \frac{P(\theta_{-i}, \alpha, \beta, x, y | \theta_i)P(\theta_i)}{\sum_{i=1}^6 P(\theta_{-i}, \alpha, \beta, x, y | \theta_i)P(\theta_i)}$$ + +b) The posterior probability is the (prior)x(likelihood). So this would be $$poision(\theta_i x_i) x L(theta_i x_i)$$ + +I'm not sure how to do the pdf of a poisson variable as it is variable. The likelihood function is $L(\theta_i y_i) = \frac{\theta_i^{\sum_{i=1}^n y_i} e^{-n \theta_i}}{y_1!,y_2!,..,y_n!}$",,2013-10-15 01:09:29.977 +186393,57483,22659.0,3,,CC BY-SA 3.0,68a5c8ea-ab92-4955-b766-da8fe9bc7aee,,,2013-10-15 01:19:13.100 +186392,57483,22659.0,1,,CC BY-SA 3.0,68a5c8ea-ab92-4955-b766-da8fe9bc7aee,Scikit-learn's Gaussian Processes: how to include multiple hyperparameters in kernel/cov function?,,2013-10-15 01:19:13.100 +186391,57483,22659.0,2,,CC BY-SA 3.0,68a5c8ea-ab92-4955-b766-da8fe9bc7aee,"I'm using the scikits-learn implementation of gaussian processes. A simple thing to do is to combine multiple kernels as a linear combination to describe your timeseries properly. So I'd like to include both the squared exponential kernel and the periodic kernel. Linear combinations of valid kernels produce valid kernels, and same goes for multiplying valid kernels (given by Rasmussen and Wiliams). + +Unfortunately I haven't figured out how to give the theta parameters properly to the model. For example, if we have + +$$ +k_{gauss}(x,x') = \exp{(\theta (x-x')^2)} +$$ + +then it is alright (this is how the squared-exponential kernel is defined in scikits-learn). But if I wanted + +$$ +k_{gauss}(x,x') = \theta_0 \exp{(\theta_1 (x-x')^2)} +$$ + +then it is impossible, it seems. The $\mathbf{\theta}$ thing is supposed to be an array, in case you have multiple dimensions/features (even though scikits-learn doesn't support multidimensional GPs, someone developed it and it will be merged soon). So there is one row with the columns being the parameter in such-and-such dimension. But you cannot have more rows, otherwise it screams at you. + +So question: has anyone actually been able to use kernels that use more than one hyperparameter? If so, what am I doing wrong? And if it is indeed not possible with the current code in scikits, does anyone have some tips on how to extend it so that it can? This is a really important feature that I need. Thanks.",,2013-10-15 01:19:13.100 +186394,57482,594.0,5,,CC BY-SA 3.0,f58526e1-91e2-4f64-8482-cdcd0b0b61ff,"6 neighboring countries have the following disease instances: $y = (y_1, y_2,...,y_n)$ with a population of $x = (x_1, x_2,...,x_n)$. 
+ +The following model and prior distributions are considered: + +$y_i|\theta_i,p_i \sim \text{Poisson}(\theta_i x_i)$ + +$\theta_i | \alpha, \beta \sim \text{gamma}(\alpha, \beta)$ + +$\alpha \sim \text{gamma}(1,1)$ + +$\beta \sim \text{gamma}(10,1)$ + +a) Find the full conditional rate $p(\theta_i | \theta_{-i}, \alpha, \beta, x, y)$ + +b) Find the posterior distribution. + +Attempt: + +a) For finding the conditional rate with two variables, I would use Bayes' theory. I am not sure if this applies with multiple distributions. + +$$p(\theta_i | \theta_{-i}, \alpha, \beta, x, y) = \frac{P(\theta_i \bigcap \theta_{-i} \bigcap \alpha \bigcap \beta \bigcap x \bigcap y)}{P( \theta_{-i}, \alpha, \beta, x, y)}$$ + +$$ = \frac{P(\theta_{-i}, \alpha, \beta, x, y | \theta_i)P(\theta_i)}{\sum_{i=1}^6 P(\theta_{-i}, \alpha, \beta, x, y | \theta_i)P(\theta_i)}$$ + +b) The posterior probability is the (prior)x(likelihood). So this would be $$\text{Poisson}(\theta_i x_i) \times L(\theta_i x_i)$$ + +I'm not sure how to do the pdf of a Poisson variable as it is variable. The likelihood function is $L(\theta_i y_i) = \frac{\theta_i^{\sum_{i=1}^n y_i} e^{-n \theta_i}}{y_1!,y_2!,..,y_n!}$",formatting,2013-10-15 01:21:50.723 +186395,57473,594.0,5,,CC BY-SA 3.0,0bc4ab9c-c6f5-4737-9603-0f5fa9867269,"I read in Wilcox, 2003 p. 247 that the standard error of the difference between two sample means is (assuming the normality and homoskedasticity assumptions): + +$\sqrt{\frac{\sigma_1^2}{n_1} + \frac{\sigma_2^2}{n_2}}$ + +Rather than simply adding the two sample standard errors as in: + +$\frac{\sigma_1}{\sqrt{n_1}} + \frac{\sigma_2}{\sqrt{n_2}}$ + +What is the intuition behind squaring the sum of the two variances divided into their respective sample size?",deleted 5 characters in body,2013-10-15 01:23:16.260 +186396,57407,594.0,5,,CC BY-SA 3.0,62d6b7f5-f1fe-4859-a46c-c0af7e5c3043,"Since you appear to doubt the example offered, I have included a diagram. As Michael Mayer said, two normal distributions centered around 0, one with larger variance, is sufficient. + +In the diagram, we compare the 0.1 and the 0.9 quantiles for $\sigma=1$ (blue) and $\sigma=0.8$ (dark orange) + +![normcdfs sigma=1, sigma=0.8][1] + +Michael Mayer's example fulfills the requirements of your question with $q_1=0.1$, $q_2=0.9$ and $X_1$ being the one with larger variance. + +--- + +Edit: + +For the case where $q_1$ and $q_2$ must both be on the same side of whatever the measure of location is, let's take two symmetric distributions, which share the same mean and median. + +Let $X_1$ be $\sim \text{N}(0,1^2)$ and let $X_2$ be an equal mixture of a $\text{N}(-0.8,0.1^2)$ and a $\text{N}(0.8,0.1^2)$, and let $q_1 = 0.6$ and $q_2 = 0.9$: + +![normal 0,1 vs symmetric mixture of normals with small s.d.][2] + +This example fulfills the new requirements of your question with $q_1=0.6$, $q_2=0.9$ and $X_1$ being the one with only a single normal component (shown in blue above). + +Further, you should note that 'location parameter' isn't sufficiently specified. I could parameterize normal distributions by their 5th percentile and their standard deviation, and call the parameter based on the 5th percentile the location parameter (it's just a shift of the mean by $1.645\sigma$. and can work as a perfectly valid location parameter). Then Michael's original example suffices even under the new conditions. If that contradicts your intention, your intention needs to be stated specifically enough to exclude it. 
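+
+If you want to check that mixture example numerically rather than just from the picture, a rough sketch (using `numpy`; the sample size and seed are arbitrary, and the component parameters are exactly the ones given above) is:
+
+    import numpy as np
+
+    rng = np.random.default_rng(1)
+    n = 10**6
+
+    x1 = rng.normal(0.0, 1.0, n)                          # X_1 ~ N(0, 1)
+    comp = rng.integers(0, 2, n)                          # equal-probability component labels
+    x2 = rng.normal(np.where(comp == 1, 0.8, -0.8), 0.1)  # the 50/50 mixture described above
+
+    for q in (0.6, 0.9):
+        print(q, round(np.quantile(x1, q), 3), round(np.quantile(x2, q), 3))
+
+Both variables are symmetric about $0$ (same mean and median), yet which of the two has the larger quantile flips between $q_1=0.6$ and $q_2=0.9$.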
+ + [1]: https://i.stack.imgur.com/pT43v.png + [2]: https://i.stack.imgur.com/2dHFj.png",added 2 characters in body,2013-10-15 01:24:42.953 +186397,57473,5643.0,5,,CC BY-SA 3.0,ecc2a928-5bd1-42cb-b396-fd330d53b2e4,"I read in Wilcox, 2003 p. 247 that the standard error of the difference between two sample means is (assuming the normality and homoskedasticity assumptions): + +$\sqrt{\frac{\sigma_1^2}{n_1} + \frac{\sigma_2^2}{n_2}}$ + +Rather than simply adding the two sample standard errors as in: + +$\frac{\sigma_1}{\sqrt{n_1}} + \frac{\sigma_2}{\sqrt{n_2}}$ + +What is the intuition behind taking the square of the sum of the two variances divided into their respective sample size, rather than the sum of the standard errors?",added 12 characters in body,2013-10-15 01:42:01.237 +186398,57481,14548.0,5,,CC BY-SA 3.0,41881012-ce7a-46db-a81c-d7ad9962b1d3,"Having performed linear regression, I can find the confidence interval for the response conditioned on a particular x value. However, I am interested in a C.I for the *mean* response for a set of N new observations. That is, I need to combine the N prediction intervals intervals. + +The closest post I could find was http://stats.stackexchange.com/questions/8755/calculating-the-mean-using-regression-data, but it only handles the univariate case. + +I tried deriving the standard error of the mean response below, but I'm not sure if this correct. + +$\begin{align} +var(\hat{\bar{y}}) &= var \left( \frac{1}{n} \sum_i \hat{y}_i|x_1 \ldots x_n \right) \\ +&= var \left( \frac{1}{n} \sum_i \hat{y}_i|x_i \right), \quad \text{where the } \hat{y_i}|x_i \text{ are independent} \\ +&= \frac{1}{n^2} \sum_i var(\hat{y}_i|x_i) \\ +\end{align}$ + +where $var(\hat{y}_i|x_i) = \sqrt{\sigma^2 x_i^T (X^TX)^{-1}x_i}$ for $x_i$ in the training data and $var(\hat{y}_i|x^*_i) = \sqrt{\sigma^2 (1+ x_i^{*T} (X^TX)^{-1}x^*_i)}$ for $x^*_i$ in the test data. + +Am I right track here? Also, is there an R implementation somewhere, or should I do it from scratch? + +Thanks, + +A.",added 2 characters in body,2013-10-15 01:51:40.657 +186400,57484,22698.0,1,,CC BY-SA 3.0,9c09643e-5a5a-453c-92c1-eb7ab9d19d61,Bound for the correlation of three random variables,,2013-10-15 01:55:03.623 +186401,57484,22698.0,3,,CC BY-SA 3.0,9c09643e-5a5a-453c-92c1-eb7ab9d19d61,,,2013-10-15 01:55:03.623 +186399,57484,22698.0,2,,CC BY-SA 3.0,9c09643e-5a5a-453c-92c1-eb7ab9d19d61,There are three random variables with the same correlation pho. What’s the tightest bound you can give for pho?,,2013-10-15 01:55:03.623 +186402,57485,7155.0,2,,CC BY-SA 3.0,7738bdad-8e94-4a84-a6b4-06e11160436e,"On scikit-learn==0.14.1. + +$\theta_0$ can be a vector. The following code works for me. + + import numpy as np + from sklearn.gaussian_process import GaussianProcess + from sklearn.datasets import make_regression + X, y = make_regression() + bad_theta = np.abs(np.random.normal(0,1,100)) + model = GaussianProcess(theta0=bad_theta) + model.fit(X,y) + +You can pass any kernel you want as the parameter corr. The following is the radial basis function that sklearn uses for Gaussian processes. + + def squared_exponential(theta, d): + """""" + Squared exponential correlation model (Radial Basis Function). + (Infinitely differentiable stochastic process, very smooth):: + + n + theta, dx --> r(theta, dx) = exp( sum - theta_i * (dx_i)^2 ) + i = 1 + + Parameters + ---------- + theta : array_like + An array with shape 1 (isotropic) or n (anisotropic) giving the + autocorrelation parameter(s). 
+ + dx : array_like + An array with shape (n_eval, n_features) giving the componentwise + distances between locations x and x' at which the correlation model + should be evaluated. + + Returns + ------- + r : array_like + An array with shape (n_eval, ) containing the values of the + autocorrelation model. + """""" + + theta = np.asarray(theta, dtype=np.float) + d = np.asarray(d, dtype=np.float) + + if d.ndim > 1: + n_features = d.shape[1] + else: + n_features = 1 + + if theta.size == 1: + return np.exp(-theta[0] * np.sum(d ** 2, axis=1)) + elif theta.size != n_features: + raise ValueError(""Length of theta must be 1 or %s"" % n_features) + else: + return np.exp(-np.sum(theta.reshape(1, n_features) * d ** 2, axis=1)) + +It looks like you're doing something pretty interesting, btw. +",,2013-10-15 01:56:23.430 +186404,57486,22677.0,1,,CC BY-SA 3.0,e8c88b1b-732d-4f19-8efb-19c08704ee6b,How does one determine what ARL0 should be used on CPM package to test for Structural Change,,2013-10-15 02:10:01.050 +186405,57486,22677.0,2,,CC BY-SA 3.0,e8c88b1b-732d-4f19-8efb-19c08704ee6b,"i'm trying to find multiple break point by using `processStream` from `CPM` package on `R` +can someone enlighten me on what is ***ARL0*** how does one determine what ***ARL0*** should be used + ++ `processStream(ret.fin.chn,""Kolmogorov-Smirnov"",ARL0=500,lambda=NA)` ++ `$changePoints` ++ `[1] 59 75 250 286 443 448 663 1037 1042 1261 1576 1842 1853 2013 2035 2621 2633` ++ `$detectionTimes` ++ `[1] 73 89 285 334 447 503 670 1040 1145 1428 1639 1951 1874 2030 2078 2632 2644` + +while + ++ `processStream(ret.fin.chn,""Kolmogorov-Smirnov"",ARL0=2000,lambda=NA)` ++ `$changePoints' ++ `[1] 59 75 663 1037 1261 1559 1842 2013 2035 2621 2633` ++ `$detectionTimes` ++ `[1] 75 90 691 1041 1480 1688 2026 2032 2266 2633 2646` + +and + ++ `processStream(ret.fin.chn,""Kolmogorov-Smirnov"",ARL0=3000,lambda=NA)` ++ `$changePoints` ++ `[1] 59 75 663 1037 1261 1559 1842 2013 2149` ++ `$detectionTimes` ++ `[1] 75 92 692 1041 1490 1690 2026 2032 2284` + +so its seems that different ARL0 will gives fewer break point detection, its that a good thing? + +*note: the times series `ret.fin.chn` contains 2749 rows* + +below are excerpts from `R` help + +>ARL0 + +>Determines the ARL_0 which the CPM should have, which corresponds to the average number of observations before a false positive occurs, assuming that the sequence does not undergo a chang. Because the thresholds of the CPM are computationally expensive to estimate, the package contains pre-computed values of the thresholds corresponding to several common values of the ARL_0. This means that only certain values for the ARL_0 are allowed. Specifically, the ARL_0 must have one of the following values: 370, 500, 600, 700, ..., 1000, 2000, 3000, ..., 10000, 20000, ..., 50000* +",,2013-10-15 02:10:01.050 +186403,57486,22677.0,3,,CC BY-SA 3.0,e8c88b1b-732d-4f19-8efb-19c08704ee6b,,,2013-10-15 02:10:01.050 +186406,57481,14548.0,5,,CC BY-SA 3.0,7f56abee-c49a-46f0-81b7-944bb8c252d0,"Having performed linear regression, I can find the confidence interval for the response conditioned on a particular x value. However, I am interested in a C.I for the *mean* response for a set of N new observations. That is, I need to combine the N prediction intervals intervals. + +The closest post I could find was http://stats.stackexchange.com/questions/8755/calculating-the-mean-using-regression-data, but it only handles the univariate case. 
+ +I tried deriving the standard error of the mean response below, but I'm not sure if this correct. + +$\begin{align} +var(\hat{\bar{y}}) &= var \left( \frac{1}{n} \sum_i \hat{y}_i|x_1 \ldots x_n \right) \\ +&= var \left( \frac{1}{n} \sum_i \hat{y}_i|x_i \right), \quad \text{where the } \hat{y_i}|x_i \text{ are independent} \\ +&= \frac{1}{n^2} \sum_i var(\hat{y}_i|x_i) \\ +\end{align}$ + +where $var(\hat{y}_i|x_i) = \sqrt{\sigma^2 x_i^T (X^TX)^{-1}x_i}$ for $x_i$ in the training data and $var(\hat{y}_i|x^*_i) = \sqrt{\sigma^2 (1+ x_i^{*T} (X^TX)^{-1}x^*_i)}$ for $x^*_i$ in the test data. + +Am I on the right track here? Also, is there an R implementation somewhere, or should I do it from scratch? + +Thanks, + +A.",fixed grammar,2013-10-15 02:15:34.537 +186409,57487,13037.0,1,,CC BY-SA 3.0,735b28ff-8b3c-4a08-bedf-85e50bf60955,Weighted Least Squares Estimate,,2013-10-15 02:16:00.033 +186408,57487,13037.0,2,,CC BY-SA 3.0,735b28ff-8b3c-4a08-bedf-85e50bf60955,"Here is a problem from a practice test. Suppose that $$X_i = \mu + \epsilon_i,\quad i=1,\ldots,n\quad \epsilon_i\sim N(0,\sigma^2_1)$$ $$Y_i = \mu + \delta_i,\quad i=1,\ldots,m\quad \delta_i\sim N(0,\sigma^2_2)$$ All $\epsilon_i$'s and $\delta_i$'s are independent. The paramters $\mu, \sigma_1^2, $ and $\sigma_2^2$ are unknown. Let $\theta=m/n$, $\rho=\sigma_2^2/\sigma_1^2$. Suppose $\rho$ is known. Show that the least squares (weighted) estimator of $\mu$ is $$ \hat{\mu} = \dfrac{\rho\bar{X} + \theta\bar{Y}}{\rho+\theta}$$ + +MY ATTEMPT: + +I can't figure out how to use the fact that $\rho$ is known. I tried $$\hat{\mu} = \text{argmin}\left\{\sum_{i=1}^n (X_i-\mu)^2 + \sum_{i=1}^m (Y_i-\mu)^2\right\}$$ and arrived that the weighted averaged $$ \hat{\mu} = \dfrac{n\bar{X} + m\bar{Y}}{n+m}$$ But again this does not use the fact that we know what the ratio $\sigma_2^2/\sigma_1^2$ is. Any ideas?",,2013-10-15 02:16:00.033 +186407,57487,13037.0,3,,CC BY-SA 3.0,735b28ff-8b3c-4a08-bedf-85e50bf60955,,,2013-10-15 02:16:00.033 +186410,57477,594.0,5,,CC BY-SA 3.0,e14e6a4e-c63b-44cd-81cf-6443b465e117,"You seem to be thinking that $\sqrt{\text{Var}(\bar X-\bar Y)} = \sqrt{\text{Var}(\bar X)} + \sqrt{\text{Var}(\bar Y)}$. + +This is not the case for independent variables. + +For $X,Y$ independent, $\text{Var}(\bar X-\bar Y) = \text{Var}(\bar X) + \text{Var}(\bar Y)$ + +Further, + +$\text{Var}(\bar X) = \text{Var}(\frac{1}{n}\sum_iX_i) = \frac{1}{n^2}\text{Var}(\sum_iX_i)= \frac{1}{n^2}\sum_i\text{Var}(X_i)= \frac{1}{n^2}\cdot n\cdot\sigma^2_1= \sigma^2_1/n$ + +(if the $X_i$ are independent of each other). + +http://en.wikipedia.org/wiki/Variance#Basic_properties + +In summary: the correct term has $\sigma^2/n$ terms because we're looking at averages and that's the variance of an average of independent random variables. It has a $+$ because the two samples are independent, so their variances add. It has a square root because we want the standard deviation of the distribution of the difference in sample means (the standard error of the difference in means). That's *every single part of the formula explained*. + +The reason why we don't just add standard errors is *standard errors don't add*. The *variances* add, though, so we can use that to work out the standard errors. 
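+
+As a small numerical illustration (the values $\sigma_1=\sigma_2=1$ and $n_1=n_2=100$ are chosen purely for concreteness):
+
+$$\sqrt{\frac{1}{100}+\frac{1}{100}}=\sqrt{0.02}\approx 0.141 \qquad \text{versus} \qquad \frac{1}{\sqrt{100}}+\frac{1}{\sqrt{100}}=0.2,$$
+
+so adding the two standard errors would overstate the standard error of the difference by a factor of about $\sqrt{2}\approx 1.41$ here.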
+ +(If the algebra is unconvincing, or you don't believe the basic properties of variance that I used, you might consider simulation, which should soon make it clear.)",added 620 characters in body,2013-10-15 02:21:47.723 +186411,57477,594.0,5,,CC BY-SA 3.0,7d6e02ab-fed7-4363-9ece-55f262151ce4,"You seem to be thinking that $\sqrt{\text{Var}(\bar X-\bar Y)} = \sqrt{\text{Var}(\bar X)} + \sqrt{\text{Var}(\bar Y)}$. + +This is not the case for independent variables. + +For $X,Y$ independent, $\text{Var}(\bar X-\bar Y) = \text{Var}(\bar X) + \text{Var}(\bar Y)$ + +Further, + +$\text{Var}(\bar X) = \text{Var}(\frac{1}{n}\sum_iX_i) = \frac{1}{n^2}\text{Var}(\sum_iX_i)= \frac{1}{n^2}\sum_i\text{Var}(X_i)= \frac{1}{n^2}\cdot n\cdot\sigma^2_1= \sigma^2_1/n$ + +(if the $X_i$ are independent of each other). + +http://en.wikipedia.org/wiki/Variance#Basic_properties + +In summary: the correct term: + +![annotated se formula][1] + +1) has $\sigma^2/n$ terms because we're looking at averages and that's the variance of an average of independent random variables. + +2) It has a $+$ because the two samples are independent, so their variances add. + +3) It has a square root because we want the standard deviation of the distribution of the difference in sample means (the standard error of the difference in means). + +That's *every single part of the formula explained*. + +The reason why we don't just add standard errors is *standard errors don't add* - the standard error of the difference in means is NOT the sum of the standard errors of the means. The variances *do* add, though, so we can use that to work out the standard errors. + +(If the algebra is unconvincing, or you don't believe the basic properties of variance that I used, you might consider simulation, which should soon make it clear.) + + + [1]: https://i.stack.imgur.com/TUqrV.png",added 98 characters in body,2013-10-15 02:41:42.597 +186412,57488,17730.0,2,,CC BY-SA 3.0,c2d5107a-1748-47d5-aa9d-7f0d0410c4ee,"I want to express the joint probability of $\Phi_A$ and $\Phi_B$: $p(\Phi_A, \Phi_B)$ conditioned that $\Phi_A$ and $\Phi_B$ are both greater than some value C. How would I express this mathematically? I guess my intuition says: + +$p(\Phi_A, \Phi_B | \Phi>C)$ + +Is this correct? Is there a better way to express this?",,2013-10-15 02:45:34.410 +186414,57488,17730.0,3,,CC BY-SA 3.0,c2d5107a-1748-47d5-aa9d-7f0d0410c4ee,,,2013-10-15 02:45:34.410 +186413,57488,17730.0,1,,CC BY-SA 3.0,c2d5107a-1748-47d5-aa9d-7f0d0410c4ee,How to express joint conditional probability with multiple conditions,,2013-10-15 02:45:34.410 +186471,57429,20740.0,5,,CC BY-SA 3.0,24e0b951-6a32-4a75-91ba-8862a89af12f," +I am looking to do a linear regression on two independent variables that will be present in varying proportions. + +For example trying to do a linear regression on $Y$ which is payment behavior (payback rate) of customers based on the the quality (let's say Gini coefficient) of the new and existing customer credit scores ($X_1$ and $X_2$, respectively) adjusted for the proportion of new and existing customers in the sample. + +Existing customers will be present in proportion $p$ and new customers in proportion $1-p = q$. + +$Y$, payback rate is the percentage of total customers who pay back. It could be expressed as the weighted average $Y = Y_1q + Y_2p$ where $Y_i$ is the payback rate of new/existing customers. + +In general more new customers, $q$, has a negative effect. Better scoring ($X_1, X_2$) and more existing customers p have a positive effect. 
+ +What is a good way to model this? + +Would something like the following be a good solution trying to use $p$ and $q$ as some sort of interaction effect? + +$Y = X_1+X_2+\frac{X_1}{q}+X_2 p$ + +Would it be better to include p and q as variables themselves as well?",clarifying Y variable,2013-10-15 08:16:02.787 +186415,57477,594.0,5,,CC BY-SA 3.0,a580f93c-7fdd-4473-8da7-54a5eb729789,"You seem to be thinking that $\sqrt{\text{Var}(\bar X-\bar Y)} = \sqrt{\text{Var}(\bar X)} + \sqrt{\text{Var}(\bar Y)}$. + +This is not the case for independent variables. + +For $X,Y$ independent, $\text{Var}(\bar X-\bar Y) = \text{Var}(\bar X) + \text{Var}(\bar Y)$ + +Further, + +$\text{Var}(\bar X) = \text{Var}(\frac{1}{n}\sum_iX_i) = \frac{1}{n^2}\text{Var}(\sum_iX_i)= \frac{1}{n^2}\sum_i\text{Var}(X_i)= \frac{1}{n^2}\cdot n\cdot\sigma^2_1= \sigma^2_1/n$ + +(if the $X_i$ are independent of each other). + +http://en.wikipedia.org/wiki/Variance#Basic_properties + +In summary: the correct term: + +![annotated se formula][1] + +$\color{red}{(1)}$ has $\sigma^2/n$ terms because we're looking at averages and that's the variance of an average of independent random variables; + +$\color{red}{(2)}$ has a $+$ because the two samples are independent, so their variances add; and + +$\color{red}{(3)}$ has a square root because we want the standard deviation of the distribution of the difference in sample means (the standard error of the difference in means). + +That's *every single part of the formula explained*. + +The reason why we don't just add standard errors is *standard errors don't add* - the standard error of the difference in means is NOT the sum of the standard errors of the means. The variances *do* add, though, so we can use that to work out the standard errors. + +(If the algebra is unconvincing, or you don't believe the basic properties of variance that I used, you might consider simulation, which should soon make it clear.) + + + [1]: https://i.stack.imgur.com/TUqrV.png",added 98 characters in body,2013-10-15 02:46:54.563 +186416,57477,594.0,5,,CC BY-SA 3.0,2f1883d2-2fd1-4d53-8447-0846c67be5d2,"You seem to be thinking that $\sqrt{\text{Var}(\bar X-\bar Y)} = \sqrt{\text{Var}(\bar X)} + \sqrt{\text{Var}(\bar Y)}$. + +This is not the case for independent variables. + +For $X,Y$ independent, $\text{Var}(\bar X-\bar Y) = \text{Var}(\bar X) + \text{Var}(\bar Y)$ + +Further, + +$\text{Var}(\bar X) = \text{Var}(\frac{1}{n}\sum_iX_i) = \frac{1}{n^2}\text{Var}(\sum_iX_i)= \frac{1}{n^2}\sum_i\text{Var}(X_i)= \frac{1}{n^2}\cdot n\cdot\sigma^2_1= \sigma^2_1/n$ + +(if the $X_i$ are independent of each other). + +http://en.wikipedia.org/wiki/Variance#Basic_properties + +In summary: the correct term: + +![annotated se formula][1] + +$\color{red}{(1)}$ has $\sigma^2/n$ terms because we're looking at averages and that's the variance of an average of independent random variables; + +$\color{red}{(2)}$ has a $+$ because the two samples are independent, so their variances add; and + +$\color{red}{(3)}$ has a square root because we want the standard deviation of the distribution of the difference in sample means (the standard error of the difference in means). The part under the bar of the square root is the variance of the difference (the square of the standard error). Taking square roots of squared standard errors gives us standard errors. + +That's *every single part of the formula explained*. 
+ +The reason why we don't just add standard errors is *standard errors don't add* - the standard error of the difference in means is NOT the sum of the standard errors of the sample means for independent samples. The variances *do* add, though, so we can use that to work out the standard errors. + +(If the algebra is unconvincing, or you don't believe the basic properties of variance that I used, you might consider simulation, which should soon make it clear.) + + + [1]: https://i.stack.imgur.com/TUqrV.png",added 98 characters in body,2013-10-15 02:52:05.747 +186417,57489,4656.0,2,,CC BY-SA 3.0,6086c157-09ec-4319-bdb9-192fccb6652e,"The common correlation $\rho$ can have value $+1$ but not $-1$. If $\rho_{X,Y}= \rho_{X,Z}=-1$, then $\rho_{Y,Z}$ cannot equal $-1$ but is in fact $+1$. +The smallest value of the common correlation of three random variables +is $-\frac{1}{2}$. More generally, +the minimum common correlation of $n$ random variables is $-\frac{1}{n-1}$ +when, regarded as vectors, they are at the vertices of a simplex (of dimension $n-1$) +in $n$-dimensional space. + +Start with $n$ uncorrelated zero-mean unit-variance +random variables $X_i$ and set +$Y_i = X_i - \frac{1}{n}\sum X_i$. Then, $E[Y_i]=0$, +$$\operatorname{var}(Y_i) = \left(\frac{n-1}{n}\right)^2 + (n-1)\left(\frac{1}{n}\right)^2 += \frac{n-1}{n}$$ +and +$$\operatorname{cov}(Y_i,Y_j) = -2\left(\frac{n-1}{n}\right)\left(\frac{1}{n}\right) + +(n-2)\left(\frac{1}{n}\right)^2 = -\frac{1}{n}$$ +giving +$$\rho_{Y_i,Y_j} += \frac{\operatorname{cov}(Y_i,Y_j)}{\sqrt{\operatorname{var}(Y_i)\operatorname{var}(Y_j)}} +=\frac{-1/n}{(n-1)/n} += -\frac{1}{n-1}.$$",,2013-10-15 02:58:22.840 +186420,57484,155.0,4,,CC BY-SA 3.0,7f7256b0-76bf-4f68-9601-89d7a19c3d22,Bound for the correlation of three random variables with the same correlation?,made consistent with the comments,2013-10-15 03:16:25.790 +186419,57484,155.0,5,,CC BY-SA 3.0,7f7256b0-76bf-4f68-9601-89d7a19c3d22,"There are three random variables, $x,y,z$. The three correlations between the three variables are the same. I.e., + +$$\rho=\textrm{cor}(x,y)=\textrm{cor}(x,z)=\textrm{cor}(y,z)$$ + +What is the tightest bound you can give for $\rho$?",made consistent with the comments,2013-10-15 03:16:25.790 +186421,57484,594.0,5,,CC BY-SA 3.0,71c200b6-6e63-4b17-ab1e-7a999f52cb05,"There are three random variables, $x,y,z$. The three correlations between the three variables are the same. That is, + +$$\rho=\textrm{cor}(x,y)=\textrm{cor}(x,z)=\textrm{cor}(y,z)$$ + +What is the tightest bound you can give for $\rho$?",added 3 characters in body; edited title,2013-10-15 03:18:13.273 +186422,57484,594.0,4,,CC BY-SA 3.0,71c200b6-6e63-4b17-ab1e-7a999f52cb05,Bound for the correlation of three random variables,added 3 characters in body; edited title,2013-10-15 03:18:13.273 +186423,55209,594.0,33,,,e201a40e-9dd3-417c-991b-702b74bb6242,,823,2013-10-15 03:20:06.157 +186424,57249,594.0,33,,,46b59258-5de9-4ce0-bbbd-87187711d00b,,824,2013-10-15 03:21:42.803 +186425,57490,22677.0,2,,CC BY-SA 3.0,c1253c55-e6fb-40b4-ae47-d3c323a760ab,"@Dail if you're more inclined to the applied rather than the theoretical behind detection of structural break, you might want try `http://cran.r-project.org/web/packages/cpm/index.html` this is the link for `CPM` package of `R`, where you can use `processStream` to find multiple break point in your time series. 
+",,2013-10-15 03:22:22.863 +186426,55209,,25,,,8fd16737-01b9-4541-8167-202142b760bf,,http://twitter.com/#!/StackStats/status/389954516902494208,2013-10-15 03:22:56.557 +186472,57507,22706.0,2,,CC BY-SA 3.0,58a9887b-bd08-4567-9144-642d46e51d93,"A penalized regression provides biased estimates and I thought that SE and confidence intervals in the frequentist ridge and lasso regressions are not useful. I would assume that the same problems exists in an Bayesian approach but Kyung, Gill, Ghaosh and Casella (2010) say that the Bayesian formulation produces valid standard errors. Does it mean that a 95% credibility intervals includes with 95% probability the true biased estimate and if yes, is this a useful information?",,2013-10-15 08:17:27.980 +186474,57507,22706.0,3,,CC BY-SA 3.0,58a9887b-bd08-4567-9144-642d46e51d93,,,2013-10-15 08:17:27.980 +186473,57507,22706.0,1,,CC BY-SA 3.0,58a9887b-bd08-4567-9144-642d46e51d93,How do I interpret the credibility interval in a Bayesian Regularized Regression?,,2013-10-15 08:17:27.980 +186428,57477,594.0,5,,CC BY-SA 3.0,dd36fcab-d87d-440e-9886-9f37c277dc48,"You seem to be thinking that $\sqrt{\text{Var}(\bar X-\bar Y)} = \sqrt{\text{Var}(\bar X)} + \sqrt{\text{Var}(\bar Y)}$. + +This is not the case for independent variables. + +For $X,Y$ independent, $\text{Var}(\bar X-\bar Y) = \text{Var}(\bar X) + \text{Var}(\bar Y)$ + +Further, + +$\text{Var}(\bar X) = \text{Var}(\frac{1}{n}\sum_iX_i) = \frac{1}{n^2}\text{Var}(\sum_iX_i)= \frac{1}{n^2}\sum_i\text{Var}(X_i)= \frac{1}{n^2}\cdot n\cdot\sigma^2_1= \sigma^2_1/n$ + +(if the $X_i$ are independent of each other). + +http://en.wikipedia.org/wiki/Variance#Basic_properties + +In summary: the correct term: + +![annotated se formula][1] + +$\color{red}{(1)}$ has $\sigma^2/n$ terms because we're looking at averages and that's the variance of an average of independent random variables; + +$\color{red}{(2)}$ has a $+$ because the two samples are independent, so their variances add; and + +$\color{red}{(3)}$ has a square root because we want the standard deviation of the distribution of the difference in sample means (the standard error of the difference in means). The part under the bar of the square root is the variance of the difference (the square of the standard error). Taking square roots of squared standard errors gives us standard errors. + +That's *every single part of the formula explained*. + +The reason why we don't just add standard errors is *standard errors don't add* - the standard error of the difference in means is NOT the sum of the standard errors of the sample means for independent samples - the sum will always be too large. The variances *do* add, though, so we can use that to work out the standard errors. + +(If the algebra is unconvincing, or you don't believe the basic properties of variance that I used, you might consider simulation, which should soon make it clear.) + + + [1]: https://i.stack.imgur.com/TUqrV.png",added 35 characters in body,2013-10-15 04:39:23.473 +186429,57491,668.0,2,,CC BY-SA 3.0,3a9a0504-70ba-438c-a7a3-671289724b38,"###Overview### + +Any correlation matrix is the covariance matrix of the standardized random variables, whence--like all correlation matrices--it must be positive semi-definite. Equivalently, its eigenvalues are non-negative. This imposes a simple condition on $\rho$: it must not be any less than $-1/2$ (and of course cannot exceed $1$). 
Conversely, any such $\rho$ actually corresponds to the correlation matrix of some trivariate distribution, proving these bounds are the tightest possible. + +----- + +###Derivation of the conditions on $\rho$### + +Consider the $n$ by $n$ correlation matrix with all off-diagonal values equal to $\rho.$ (The question concerns the case $n=3,$ but this generalization is no more difficult to analyze.) Let's call it $\mathbb{C}(\rho, n).$ By definition, $\lambda$ is an eigenvalue of provided there exists a nonzero vector $\mathbf{x}_\lambda$ such that + +$$\mathbb{C}(\rho,n) \mathbf{x}_\lambda = \lambda \mathbf{x}_\lambda.$$ + +These eigenvalues are easy to find in the present case, because + +1. Letting $\mathbf{1} = (1, 1, \ldots, 1)'$, compute that + + $$\mathbb{C}(\rho,n)\mathbf{1} = (1+(n-1)\rho)\mathbf{1}.$$ + +2. Letting $\mathbf{y}_j = (-1, 0, \ldots, 0, 1, 0, \ldots, 0)$ with a $1$ only in the $j^\text{th}$ place (for $j = 2, 3, \ldots, n$), compute that + + $$\mathbb{C}(\rho,n)\mathbf{y}_j = (1-\rho)\mathbf{y}_j.$$ + +Because the $n$ eigenvectors found so far span the full $n$ dimensional space (proof: an easy row reduction shows the absolute value of their determinant equals $n$, which is nonzero), they constitute a basis of *all* the eigenvectors. We have therefore found all the eigenvalues and determined they are either $1+(n-1)\rho$ or $1-\rho$ (the latter with multiplicity $n-1$). In addition to the well-known inequality $-1 \le \rho \le 1$ satisfied by all correlations, non-negativity of the first eigenvalue further implies + +$$\rho \ge -\frac{1}{n-1}$$ + +while the non-negativity of the second eigenvalue imposes no new conditions. + +---- + +###Proof of sufficiency of the conditions### + +The implications work in both directions: provided $-1/(n-1)\le \rho \le 1,$ the matrix $\mathbb{C}(\rho, n)$ is nonnegative-definite and therefore is a valid correlation matrix. It is, for instance, the correlation matrix for a multinormal distribution. Specifically, writing + +$$\Sigma(\rho, n) = (1 + (n-1)\rho)\mathbb{I}_n - \frac{\rho}{(1-\rho)(1+(n-1)\rho)}\mathbf{1}\mathbf{1}'$$ + +for the inverse of $\mathbb{C}(\rho, n)$ when $-1/(n-1) \lt \rho \lt 1,$ let the vector of random variables $(X_1, X_2, \ldots, X_n)$ have distribution function + +$$f_{\rho, n}(\mathbf{x}) = \frac{\exp\left(-\frac{1}{2}\mathbf{x}\Sigma(\rho, n)\mathbf{x}'\right)}{(2\pi)^{n/2}\left((1-\rho)^{n-1}(1+(n-1)\rho)\right)^{1/2}}$$ + +where $\mathbf{x} = (x_1, x_2, \ldots, x_n)$: the correlation matrix for these $n$ random variables is $\mathbb{C}(\rho, n).$ + +The special cases $\rho = -1/(n-1)$ and $\rho = 1$ can also be realized by *degenerate* distributions; I won't go into the details except to point out that in the former case the distribution can be considered supported on the hyperplane $\mathbf{x}.\mathbf{1}=0$, where it is a sum of identically distributed mean-$0$ Normal distribution, while in the latter case (perfect positive correlation) it is supported on the line generated by $\mathbf{1}'$, where it has a mean-$0$ Normal distribution. + +----- + +###More about non-degeneracy### + +A review of this analysis makes it clear that the correlation matrix $\mathbb{C}(-1/(n-1), n)$ has a rank of $n-1$ and $\mathbb{C}(1, n)$ has a rank of $1$ (because only one eigenvector has a nonzero eigenvalue). For $n\ge 2$, this makes the correlation matrix degenerate in either case. Otherwise, the existence of its inverse $\Sigma(\rho, n)$ proves it is nondegenerate. 
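+
+As a quick numerical check of the eigenvalue analysis above, here is a minimal sketch (using `numpy`; the particular values of $\rho$ tried, and the small tolerance for the exactly-zero eigenvalues, are arbitrary):
+
+    import numpy as np
+
+    def equicorr(rho, n=3):
+        # n x n correlation matrix with every off-diagonal entry equal to rho
+        return (1 - rho) * np.eye(n) + rho * np.ones((n, n))
+
+    for rho in (-0.6, -0.5, 0.0, 0.5, 1.0):
+        eigvals = np.linalg.eigvalsh(equicorr(rho))
+        print(rho, np.round(eigvals, 6), eigvals.min() >= -1e-12)
+
+Only $\rho=-0.6$ produces a negative eigenvalue, consistent with the bound $\rho \ge -1/(n-1) = -1/2$ for $n=3$.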
+ +",,2013-10-15 04:52:22.687 +186432,57492,20130.0,2,,CC BY-SA 3.0,bddab7b4-a694-4187-a25a-79d9500b647b,"There's a question for [textbooks](http://stats.stackexchange.com/questions/4612/good-econometrics-textbooks) here, but I would like to ask similar question about handbooks: What econometrics handbooks would you recommend? + +One option is Elsevier's 6 volumes of _Handbook of Econometrics_ series edited by Griliches and Instriligator. However, perhaps you would recommend other handbooks, more concise or otherwise? + +I suggest two possible formats: one is a reference-card format with minimum explanations, and the other one is a more extended format with proofs and more detailed exposition.",,2013-10-15 04:59:24.173 +186431,57492,20130.0,1,,CC BY-SA 3.0,bddab7b4-a694-4187-a25a-79d9500b647b,What are good econometrics handbooks?,,2013-10-15 04:59:24.173 +186430,57492,20130.0,3,,CC BY-SA 3.0,bddab7b4-a694-4187-a25a-79d9500b647b,,,2013-10-15 04:59:24.173 +186433,57492,668.0,16,,,6646ce27-17a8-4f10-9747-9bb92a410f93,,,2013-10-15 05:02:38.393 +186434,57492,668.0,10,,,c4100e75-afee-4758-a145-e7aba0b4655a,"{""Voters"":[{""Id"":919,""DisplayName"":""whuber""}]}",105,2013-10-15 05:02:49.377 +186435,57473,155.0,4,,CC BY-SA 3.0,53fb2689-fa60-459f-b1b9-b1261babc13c,Intuition for the standard error of the difference of sample means,edited title,2013-10-15 05:11:39.080 +186436,57493,3183.0,2,,CC BY-SA 3.0,7c9cabbc-776b-456b-b912-869a04ef1d54,"It sounds like your question has two parts: the underlying idea and a concrete example. I'll start with the underlying idea, then link to an example at the bottom. + +------ + +EM is useful in Catch-22 situations where it seems like you need to know $A$ before you can calculate $B$ and you need to know $B$ before you can calculate $A$. + +The most common case people deal with is probably mixture distributions, which can be tricky to deal with. For our example, let's look at a simple Gaussian mixture model: + +> You have two different univariate Gaussian distributions with different means and unit variance. + +>You have a bunch of data points, but you're not sure which points came from which distribution, and you're also not sure about the means of the two distributions. + +And now you're stuck: + +* If you knew which distribution each point came from, then you could estimate the two distributions' means using the sample means from the relevant points. But you don't actually know which points to assign to which distribution, so this won't work. + +* If you knew the true means, you could figure out which data points came from which Gaussian. For example, if a data point had a very high value, it probably came from the distribution with the higher mean. But you don't know what the means are, so this won't work either. + +But if you don't know either of these things, it's hard to know where to start. + +***What EM lets you do is alternate between these two tractable steps instead of tackling the whole process at once.*** + +First, you'll update the probability that each data point came from Gaussian 1 versus Gaussian 2 using the procedure from the first bullet point. Then you'll update your estimates for the distributions' means using the procedure from the second bullet point. Each time you do these updates, you're improving a lower bound on the model's likelihood. + +That's already pretty cool: even though the two suggestions in the bullet points above didn't seem like they'd work individually, you can still use them together to improve the model. 
The ***real*** magic of EM is that, after enough iterations, the lower bound will be so high that there won't be any space between it and the local maximum. As a result, and you've locally optimized the likelihood. + +So you haven't just *improved* the model, you've found the *best* possible model one can find with incremental updates. + +------ + +[This](http://en.wikipedia.org/wiki/File:Em_old_faithful.gif) page from Wikipedia shows a slightly more complicated example (two-dimensional Gaussians and unknown covariance), but the basic idea is the same. It also includes well-commented `R` code for implementing the example. + +In the code, the ""Expectation"" step (E-step) corresponds to my second bullet point: figuring out which Gaussian gets responsibility for each data point, given the current parameters for each Gaussian. The ""Maximization"" step (M-step) updates the means and covariances, given these assignments. + +As you can see in the animation, these updates quickly allow the algorithm to go from a set of terrible estimates to a set of very good ones: there really do seem to be two clouds of points centered on the two Gaussian distributions.",,2013-10-15 05:25:55.930 +186437,57494,155.0,2,,CC BY-SA 3.0,6387df5d-b4c1-4b53-89fb-034f3f30dd09,"The standard error of the mean for $n$ independent observations is $\frac{\sigma}{\sqrt{n}}$ where $\sigma$ is the standard deviation. + +So if we have two independent samples we have the standard errors for the means of group 1 and group 2. + +$$\sigma_{\mu_1}=\frac{\sigma_1}{\sqrt{n_1}}$$ +$$\sigma_{\mu_2}=\frac{\sigma_2}{\sqrt{n_2}}$$ + +If we square these values we get the variance of the mean: + +$$\sigma^2_{\mu_1}=\frac{\sigma^2_1}{n_1}$$ +$$\sigma^2_{\mu_2}=\frac{\sigma^2_2}{n_2}$$ + +The variance of the sum or difference of two independent random variables +is the sum of the two variances. Rhus, + +$$\sigma^2_{\mu_1 - \mu_2} =\sigma^2_{\mu_1} + \sigma^2_{\mu_2} = \frac{\sigma^2_1}{n_1} + \frac{\sigma^2_2}{n_2} $$ + +So if we want the standard error of the difference we take the square root of the variance: + +$$\sigma_{\mu_1 - \mu_2} =\sqrt{\sigma^2_{\mu_1} + \sigma^2_{\mu_2}} = \sqrt{\frac{\sigma^2_1}{n_1} + \frac{\sigma^2_2}{n_2}} $$ + + +",,2013-10-15 05:35:05.007 +186438,57495,11440.0,2,,CC BY-SA 3.0,c3ec3129-9bec-42cb-9670-fa7335773365,"Well, it is your choice which notation to use, but you certainly can just use logical operators: + +$p(\Phi_A, \Phi_B \; |\; \Phi_A>C \,\cap \Phi_B > C)$ + +Your current notation is not clear as $\Phi$ is not defined and not obvious what it means.",,2013-10-15 05:35:17.670 +186439,57212,22190.0,5,,CC BY-SA 3.0,7c83219f-3170-4c4c-b894-0189093e9b8b,"I am new to statistics and was asked to develop a statistical model, which I had started, they ask me to carry out concordance and discordance now, however I don't know anything about these terms except that the concordance is the probability that a pair of individuals will both have a certain characteristic, given that one of the pair has the characteristic and the opposite for discordance. +Still I don't know why I have to find them and what would be the appropriate value of both for a decent model. +",deleted 3 characters in body,2013-10-15 05:42:02.543 +186440,57494,155.0,5,,CC BY-SA 3.0,789499c8-878e-4dea-bc29-bdcdec678d48,"### Algebraic intuition +The standard error of the mean for $n$ independent observations is $\frac{\sigma}{\sqrt{n}}$ where $\sigma$ is the standard deviation. 
+ +So if we have two independent samples we have the standard errors for the means of group 1 and group 2. + +$$\sigma_{\mu_1}=\frac{\sigma_1}{\sqrt{n_1}}$$ +$$\sigma_{\mu_2}=\frac{\sigma_2}{\sqrt{n_2}}$$ + +If we square these values we get the variance of the mean: + +$$\sigma^2_{\mu_1}=\frac{\sigma^2_1}{n_1}$$ +$$\sigma^2_{\mu_2}=\frac{\sigma^2_2}{n_2}$$ + +The variance of the sum or difference of two independent random variables +is the sum of the two variances. Thus, + +$$\sigma^2_{\mu_1 - \mu_2} =\sigma^2_{\mu_1} + \sigma^2_{\mu_2} = \frac{\sigma^2_1}{n_1} + \frac{\sigma^2_2}{n_2} $$ + +So if we want the standard error of the difference we take the square root of the variance: + +$$\sigma_{\mu_1 - \mu_2} =\sqrt{\sigma^2_{\mu_1} + \sigma^2_{\mu_2}} = \sqrt{\frac{\sigma^2_1}{n_1} + \frac{\sigma^2_2}{n_2}} $$ + +So imagine this is intuitive if the component steps are intuitive. In particular it helps if you find intuitive the idea that the variance of the sum of independent variables is the sum of the variances of the component variables. + +### Fuzzy Intuition +In terms of more general intuition, if $n_1 = n_2$ and $\sigma=\sigma_1=\sigma_2$ then the standard error of the difference between means will be $\sqrt{2}\sigma_\mu\approx 1.4\times \sigma_\mu$. It makes sense that this value of approximately 1.4 is greater than 1 (i.e., the variance of a variable after adding a constant; i.e., equivalent to one sample t-test) and less than 2 (i.e., the standard deviation of the sum of two perfectly correlated variables (with equal variance) and the standard error implied by the formula you mention: $\frac{\sigma_1}{\sqrt{n_1}} + \frac{\sigma_2}{\sqrt{n_2}}$).",added 790 characters in body,2013-10-15 05:50:56.207 +186443,57496,12314.0,1,,CC BY-SA 3.0,02a56597-a917-40a1-bfe8-276fce6f92ec,Forecasting the population of a village; 1 day to a year ahead,,2013-10-15 06:05:55.207 +186441,57496,12314.0,2,,CC BY-SA 3.0,02a56597-a917-40a1-bfe8-276fce6f92ec,"Suppose that I have daily data on the population of a small village, given by $Y(t)$, as well as daily data on various factors that are relevant to the size of the population in the future, given by vector $X(t)$. These explanatory variables include untransformed variables as well as features engineered to be informative over long horizons (e.g. one of the variables captures the number of deaths over the last 30 days). I have collected this data for 10 years. + +My objective is to forecast $Y(t)$ ahead by 1,2,3,...,365 days. I expect long-run forecasts to be different to short-run forecasts. If a holiday season is coming up I might expect a downwards spike in a few months time (people visiting the city), but if someone is on their deathbed then I will expect a downwards spike in a few days. + +Since the population is sufficiently small that $\Delta Y(t+k)$ is typically in $\{-2,-1,0,1,2\}$ for the forecasting horizon under question, I will use a multiple categorical response variable classification model that will assign probabilities to the various class labels being observed. + +My question centers on the specific considerations I need to make when constructing forecasts of the change from $Y(t)$ to $Y(t+k)$ where $k$ is large (e.g. 100 days). + +Basically there will be the most hideous autocorrelation structure in $\Delta Y(t+k)$ over these time scales. If someone dies on day $2$, they are also dead on day $3, 4, ..., k$, meaning a string of $k$ or so $\Delta Y(t+k)$ will contain this same information. 
+ +These queries result: + +- What are some ways of dealing with this immense autocorrelation structure in my response. Is it even a problem? +- Are there alternative methodologies to the ones I've proposed for forecasting these horizons (aside from typical machine learning methods such as random forests which I'm already working with). +- Any other handy advice. + + + + + + + + +",,2013-10-15 06:05:55.207 +186445,57474,22163.0,4,,CC BY-SA 3.0,850cd926-bedb-49f6-b5fe-5e7113c274e8,Inverted SPSS results: Logistic regression command vs. Genlin?,added 4 characters in body,2013-10-15 06:12:48.370 +186444,57474,22163.0,5,,CC BY-SA 3.0,850cd926-bedb-49f6-b5fe-5e7113c274e8,"I want to do a logistic regression in SPSS. However, since I analyse unemployment spells the subjects are sometimes repeated (violating the independence assumption of the regression). One way of removing the within subject variation is by applying a Genlin model with the repeated subject subcommand (in essence a GEE model). Thus, I tried out a Genlin model with binomal probability and the logit link, comparing it to a standard logistic regression. I used the exact same variables in the two procedures. + +However, the results that was delivered from the Genlin procedure was inverted relative to that of the logistic regression. For instance: Exp(B) for women (of the independent variable sex/gender) was just above 2.0 in logistic regression while being at 0.49 in Genlin. The same happened with every independent variable. + + - Any suggestions to why the results of the Genlin procedure is + inverted? + - Is there any way to get the Genlin results in accordance to the logistic regression?",added 4 characters in body,2013-10-15 06:12:48.370 +186447,57496,12314.0,6,,CC BY-SA 3.0,2fea95e8-7ac4-436f-9cd8-fa3a90b5965b,,edited tags; edited title,2013-10-15 06:16:06.313 +186446,57496,12314.0,4,,CC BY-SA 3.0,2fea95e8-7ac4-436f-9cd8-fa3a90b5965b,Forecasting time-series ahead by multiple time horizons,edited tags; edited title,2013-10-15 06:16:06.313 +186448,57484,,25,,,83cb3cf9-0674-4239-9da8-cc186207a1fb,,http://twitter.com/#!/StackStats/status/389999817193426944,2013-10-15 06:22:56.990 +186450,57497,22703.0,2,,CC BY-SA 3.0,8e440619-e0db-46c8-83fd-221f7648ffcc,"Any statistical distribution is described in terms of shape, scale and location parameters. But what do these parameters mean, geometrically, statistically and for a layman with minimum statistical knowledge? + +I have explored wikipedia and still, this doubt continues to exist. + +Please help. This understanding is quite important to me.",,2013-10-15 06:36:38.530 +186451,57497,22703.0,1,,CC BY-SA 3.0,8e440619-e0db-46c8-83fd-221f7648ffcc,Parameters of a Statistical Distribution,,2013-10-15 06:36:38.530 +186449,57497,22703.0,3,,CC BY-SA 3.0,8e440619-e0db-46c8-83fd-221f7648ffcc,,,2013-10-15 06:36:38.530 +186452,57493,3183.0,5,,CC BY-SA 3.0,650487e7-d01c-447e-8cd9-8a4e429d8f75,"It sounds like your question has two parts: the underlying idea and a concrete example. I'll start with the underlying idea, then link to an example at the bottom. + +------ + +EM is useful in Catch-22 situations where it seems like you need to know $A$ before you can calculate $B$ and you need to know $B$ before you can calculate $A$. + +The most common case people deal with is probably mixture distributions, which can be tricky to deal with. For our example, let's look at a simple Gaussian mixture model: + +> You have two different univariate Gaussian distributions with different means and unit variance. 
+ +>You have a bunch of data points, but you're not sure which points came from which distribution, and you're also not sure about the means of the two distributions. + +And now you're stuck: +* If you knew the true means, you could figure out which data points came from which Gaussian. For example, if a data point had a very high value, it probably came from the distribution with the higher mean. But you don't know what the means are, so this won't work. + +* If you knew which distribution each point came from, then you could estimate the two distributions' means using the sample means of the relevant points. But you don't actually know which points to assign to which distribution, so this won't work either. + +So neither approach seems like it works: you'd need to know the answer before you can find the answer, and you're stuck. + +***What EM lets you do is alternate between these two tractable steps instead of tackling the whole process at once.*** + +First, you'll update the probability that each data point came from Gaussian 1 versus Gaussian 2 using the procedure from the first bullet point. Then you'll update your estimates for the distributions' means using the procedure from the second bullet point. Each time you do these updates, you're improving a lower bound on the model's likelihood. + +That's already pretty cool: even though the two suggestions in the bullet points above didn't seem like they'd work individually, you can still use them together to improve the model. The ***real*** magic of EM is that, after enough iterations, the lower bound will be so high that there won't be any space between it and the local maximum. As a result, and you've locally optimized the likelihood. + +So you haven't just *improved* the model, you've found the *best* possible model one can find with incremental updates. + +------ + +[This](http://en.wikipedia.org/wiki/File:Em_old_faithful.gif) page from Wikipedia shows a slightly more complicated example (two-dimensional Gaussians and unknown covariance), but the basic idea is the same. It also includes well-commented `R` code for implementing the example. + +In the code, the ""Expectation"" step (E-step) corresponds to my first bullet point: figuring out which Gaussian gets responsibility for each data point, given the current parameters for each Gaussian. The ""Maximization"" step (M-step) updates the means and covariances, given these assignments, as in my second bullet point. + +As you can see in the animation, these updates quickly allow the algorithm to go from a set of terrible estimates to a set of very good ones: there really do seem to be two clouds of points centered on the two Gaussian distributions.",swap bullet points so E step is first & everything is consistent,2013-10-15 06:37:59.930 +186455,57498,22703.0,3,,CC BY-SA 3.0,2eb6ddc4-7257-480d-bcdb-3c8bfb807e85,,,2013-10-15 06:42:30.540 +186453,57498,22703.0,2,,CC BY-SA 3.0,2eb6ddc4-7257-480d-bcdb-3c8bfb807e85,"As statisticians, we come across many distributions under the banners ""discrete"",""continuous"" and ""univariate"",""multivariate"".But can anyone, offer me a good reason behind the existence and motivation for so many distributions. How do we get them? and what can a layman understand from it? + +What is the logic behind the existence of distributions? 
+ +Please help...",,2013-10-15 06:42:30.540 +186454,57498,22703.0,1,,CC BY-SA 3.0,2eb6ddc4-7257-480d-bcdb-3c8bfb807e85,Motivation for statistical distributions,,2013-10-15 06:42:30.540 +186456,57493,3183.0,5,,CC BY-SA 3.0,fbbee6c7-5fba-4b20-a9a0-8474398d01d2,"It sounds like your question has two parts: the underlying idea and a concrete example. I'll start with the underlying idea, then link to an example at the bottom. + +------ + +EM is useful in Catch-22 situations where it seems like you need to know $A$ before you can calculate $B$ and you need to know $B$ before you can calculate $A$. + +The most common case people deal with is probably mixture distributions. For our example, let's look at a simple Gaussian mixture model: + +> You have two different univariate Gaussian distributions with different means and unit variance. + +>You have a bunch of data points, but you're not sure which points came from which distribution, and you're also not sure about the means of the two distributions. + +And now you're stuck: + +* If you knew the true means, you could figure out which data points came from which Gaussian. For example, if a data point had a very high value, it probably came from the distribution with the higher mean. But you don't know what the means are, so this won't work. + +* If you knew which distribution each point came from, then you could estimate the two distributions' means using the sample means of the relevant points. But you don't actually know which points to assign to which distribution, so this won't work either. + +So neither approach seems like it works: you'd need to know the answer before you can find the answer, and you're stuck. + +***What EM lets you do is alternate between these two tractable steps instead of tackling the whole process at once.*** + +First, you'll update the probability that each data point came from Gaussian 1 versus Gaussian 2 using the procedure from the first bullet point. Then you'll update your estimates for the distributions' means using the procedure from the second bullet point. Each time you do these updates, you're improving a lower bound on the model's likelihood. + +That's already pretty cool: even though the two suggestions in the bullet points above didn't seem like they'd work individually, you can still use them together to improve the model. The ***real*** magic of EM is that, after enough iterations, the lower bound will be so high that there won't be any space between it and the local maximum. As a result, and you've locally optimized the likelihood. + +So you haven't just *improved* the model, you've found the *best* possible model one can find with incremental updates. + +------ + +[This](http://en.wikipedia.org/wiki/File:Em_old_faithful.gif) page from Wikipedia shows a slightly more complicated example (two-dimensional Gaussians and unknown covariance), but the basic idea is the same. It also includes well-commented `R` code for implementing the example. + +In the code, the ""Expectation"" step (E-step) corresponds to my first bullet point: figuring out which Gaussian gets responsibility for each data point, given the current parameters for each Gaussian. The ""Maximization"" step (M-step) updates the means and covariances, given these assignments, as in my second bullet point. 
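+
+If a stripped-down version of the univariate example is helpful alongside that `R` code, here is a rough sketch in Python (plain `numpy`; the true means of $0$ and $4$, the sample sizes, the equal mixing proportions, the starting guesses and the number of iterations are all invented for illustration, and the variances are held fixed at $1$ as in the setup above):
+
+    import numpy as np
+
+    rng = np.random.default_rng(2)
+
+    # data from two unit-variance Gaussians with (unknown to the algorithm) means 0 and 4
+    x = np.concatenate([rng.normal(0, 1, 250), rng.normal(4, 1, 250)])
+
+    mu = np.array([-1.0, 1.0])   # deliberately poor starting guesses for the two means
+
+    for _ in range(50):
+        # E-step: probability that each point belongs to component 1 vs component 2
+        dens = np.exp(-0.5 * (x[:, None] - mu) ** 2)   # unnormalised N(mu_k, 1) densities
+        resp = dens / dens.sum(axis=1, keepdims=True)
+        # M-step: each mean becomes a responsibility-weighted average of the data
+        mu = (resp * x[:, None]).sum(axis=0) / resp.sum(axis=0)
+
+    print(mu)   # should end up close to the true means, 0 and 4
+
+Each pass through the loop is one E-step followed by one M-step, exactly the alternation described above.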
+ +As you can see in the animation, these updates quickly allow the algorithm to go from a set of terrible estimates to a set of very good ones: there really do seem to be two clouds of points centered on the two Gaussian distributions that EM finds.",minor,2013-10-15 06:43:12.093 +186457,57493,3183.0,5,,CC BY-SA 3.0,1a66876b-f5f0-4bc0-b933-3c7dc4ebd148,"It sounds like your question has two parts: the underlying idea and a concrete example. I'll start with the underlying idea, then link to an example at the bottom. + +------ + +EM is useful in Catch-22 situations where it seems like you need to know $A$ before you can calculate $B$ and you need to know $B$ before you can calculate $A$. + +The most common case people deal with is probably mixture distributions. For our example, let's look at a simple Gaussian mixture model: + +> You have two different univariate Gaussian distributions with different means and unit variance. + +>You have a bunch of data points, but you're not sure which points came from which distribution, and you're also not sure about the means of the two distributions. + +And now you're stuck: + +* If you knew the true means, you could figure out which data points came from which Gaussian. For example, if a data point had a very high value, it probably came from the distribution with the higher mean. But you don't know what the means are, so this won't work. + +* If you knew which distribution each point came from, then you could estimate the two distributions' means using the sample means of the relevant points. But you don't actually know which points to assign to which distribution, so this won't work either. + +So neither approach seems like it works: you'd need to know the answer before you can find the answer, and you're stuck. + +***What EM lets you do is alternate between these two tractable steps instead of tackling the whole process at once.*** + +You'll need to start with a guess about the two means (although your guess doesn't necessarily have to be very accurate, you do need to start somewhere). + +If your guess about the means was accurate, then you'd have enough information to carry out the step in my first bullet point above, and you could (probabilistically) assign each data point to one of the two Gaussians. Even though we know our guess is wrong, let's try this anyway. And then, given each point's assigned distributions, you could get new estimates for the means using the second bullet point. It turns out that, each time you do loop through these two steps, you're improving a lower bound on the model's likelihood. + +That's already pretty cool: even though the two suggestions in the bullet points above didn't seem like they'd work individually, you can still use them together to improve the model. The ***real*** magic of EM is that, after enough iterations, the lower bound will be so high that there won't be any space between it and the local maximum. As a result, and you've locally optimized the likelihood. + +So you haven't just *improved* the model, you've found the *best* possible model one can find with incremental updates. + +------ + +[This](http://en.wikipedia.org/wiki/File:Em_old_faithful.gif) page from Wikipedia shows a slightly more complicated example (two-dimensional Gaussians and unknown covariance), but the basic idea is the same. It also includes well-commented `R` code for implementing the example. 
+ +In the code, the ""Expectation"" step (E-step) corresponds to my first bullet point: figuring out which Gaussian gets responsibility for each data point, given the current parameters for each Gaussian. The ""Maximization"" step (M-step) updates the means and covariances, given these assignments, as in my second bullet point. + +As you can see in the animation, these updates quickly allow the algorithm to go from a set of terrible estimates to a set of very good ones: there really do seem to be two clouds of points centered on the two Gaussian distributions that EM finds.",minor clarification,2013-10-15 06:58:22.890 +186458,57499,22703.0,2,,CC BY-SA 3.0,558e7911-fca3-44f5-8eee-07ef37164a66,"Well, I would suggest you to go through a book on R by Maria L Rizzo. One of the chapters contain the use of EM algorithm with a numerical example. I remember going through the code for better understanding. + +Also, try to view it from a clustering point of view in the beginning. Work out by hand, a clustering problem where 10 observations are taken from two different normal densities. This should help.Take help from R :)",,2013-10-15 07:03:26.383 +186461,57500,22703.0,3,,CC BY-SA 3.0,db493b5e-b2c6-4b5b-8ea0-974233d502f1,,,2013-10-15 07:09:16.460 +186460,57500,22703.0,1,,CC BY-SA 3.0,db493b5e-b2c6-4b5b-8ea0-974233d502f1,Regression Methods,,2013-10-15 07:09:16.460 +186459,57500,22703.0,2,,CC BY-SA 3.0,db493b5e-b2c6-4b5b-8ea0-974233d502f1," +What is the fundamental difference between 1) Linear Regression 2) Non linear regression 3) Parametric Regression 4) Non Parametric Regression? +When do we go for each of the types? How do we know what to choose? What kind of data is of demand here? What are the assumptions unique to each? +At times, if you go through papers you get to see a combination of the names above. + +Please help",,2013-10-15 07:09:16.460 +186475,57497,15827.0,5,,CC BY-SA 3.0,828167bb-bd9d-4c05-8118-42b036de8963,"Any statistical distribution is described in terms of shape, scale and location parameters. But what do these parameters mean, geometrically, statistically and for a layman with minimum statistical knowledge? + +I have explored wikipedia and still, this doubt continues to exist. + +",personal comments unnecessary ,2013-10-15 08:18:12.107 +186478,57508,3993.0,2,,CC BY-SA 3.0,db70dac2-603d-4dc2-b845-8d57649c7408,"***TL, DR summary:*** + +Is there any theoretical or empirical basis to support the following statement being true as a general rule of thumb? + +""When estimating a mixed model, typically the estimated variances/standard deviations of random effects associated with 'higher-order' terms (e.g., random effects of two-way, three-way, and beyond interaction terms) turn out to be *smaller* than the estimated variances/standard deviations of random effects associated with 'lower-order' terms (e.g., the residual variance, variances associated with simple effects of grouping factors)."" + +The source of this claim is me. ;) + +**** + +Okay, now for the longer version... + +Typically when I sit down to start analyzing a new dataset which I know will call for a mixed model, one of the first models that I fit (after the statistical foreplay of looking through the observations in the dataset, plotting various things, cross-tabulating different factors, etc.) is one that is pretty close to the ""maximal"" random effects specification, where every random effect that is in-principle possible to estimate from the data, is estimated. 
+ +Naturally, it is not uncommon that this nearly-maximal model will have some computational problems (convergence errors, or wacky variance/covariance estimates, or etc.) and that I have to trim back this model to find one that my data can more easily support. Fine. + +In these situations, the method I have come to prefer for trimming random terms is not to rely on significance tests or likelihood ratios, but rather to just identify the random effects that seem to have the smallest standard deviations (which can admittedly be a little tricky when predictors are on very different scales, but I try to take account of this in my appraisal) and remove these terms first, sequentially in an iterative process. The idea being that I want to alter the predictions of the model as little as possible while still reducing the complexity of the model. + +One pattern that I seem to have noticed after a pretty good amount of time spent doing this is that following this method very often leads me to trim random effects associated with higher-order terms (as defined above) of the model first. This is not always true, and occasionally some of the higher-order terms explain a lot of variance, but this doesn't seem to be the general pattern. In sharp contrast, I usually find that lower-order random terms -- particularly those associated with simple effects of the grouping factors -- explain a pretty good amount of variance and are fairly essential to the model. At the extreme, the residual term commonly accounts for close to the most variance, although of course removing this term wouldn't be sensible. + +This entirely informal observation leads me to form the hypothesis that I stated at the beginning of this question. + +If it is true, then it constitutes a useful piece of advice that might be passed down to people who are less experience with this kind of model selection process. But before I begin doing so, I want to check with other, more experienced users of mixed models about their reactions to this observation. Does it seem more or less true to you? Is it roughly consistent with your experience fitting many different mixed models to many different datasets? Do you know of any sensible, theoretical reasons why we might actually *expect* this to be true in a lot of cases? Or does it just seem like bullshit? + +One possible answer here is that it is not true even in my own case, and I have simply deceived myself. Certainly a possibility that I am open to. + +Another possibility is that it might be true in my own case, but that this could simply be a kind of coincidence having to do with the kinds of datasets that I tend to work with routinely (which, FYI, are datasets in psychological / social sciences, a slight majority being experimental in origin, but also a fair proportion of non-experimental stuff). If this is the case then there is probably no good reason for expecting my observations to hold in general in other fields that handle very different kinds of data. Still, if there is a coherent non-coincidental reason for why this might be expected to be true, even if only for these particular kinds of datasets, I would love to hear it. + +And of course another possibility is that others *have* noticed similar patterns in their own data, and that it represents some kind of general rule of thumb that people find useful to keep in mind as they fit mixed models to various different data. 
If this is the case then it seems like there must be some compelling statistical-theoretical reason for why this pattern arises. But I really don't know what that reason would look like. + +I welcome anyone's thoughts and opinions about this. Note that as far as I'm concerned, totally legitimate responses to this question might be as simple as comments like ""Yeah I have noticed something similar in the data I've worked with, but I have no idea why it should be true"" or conversely ""I have noticed nothing remotely like this in the data I've worked with."" Of course I also welcome longer and more involved discussions...",,2013-10-15 08:21:54.933 +186477,57508,3993.0,1,,CC BY-SA 3.0,db70dac2-603d-4dc2-b845-8d57649c7408,Relative variances of higher-order vs. lower-order random terms in mixed models,,2013-10-15 08:21:54.933 +186462,57501,20470.0,2,,CC BY-SA 3.0,4172a0ce-6ab2-4563-9a26-ea59addbb64e,"This is a recipe to learn EM with a practical 'Coin-Toss' example: + +**1-** Read this short [EM tutorial paper][1] by Do and Batzoglou. + +**2-** You may have question marks in your head, have a look at the explanations on this maths stack exchange [page][2] (which clarified the example on the paper for me). + +**3-** Look at/run this code that I wrote in Python that simulates the solution to the coni-toss problem in the EM tutorial paper of item 1: + +***Warning :*** The code may be messy/suboptimal, but it does the job. + + import numpy as np + import math + + #### E-M Coin Toss Example as given in the EM tutorial paper by Do and Batzoglou* #### + + def get_mn_log_likelihood(obs,probs): + """""" Return the (log)likelihood of obs, given the probs"""""" + # Multinomial Distribution Log PMF + # ln (pdf) = multinomial coeff * product of probabilities + # ln[f(x|n, p)] = [ln(n!) - (ln(x1!)+ln(x2!)+...+ln(xk!))] + [x1*ln(p1)+x2*ln(p2)+...+xk*ln(pk)] + + multinomial_coeff_denom= 0 + prod_probs = 0 + for x in range(0,len(obs)): # loop through state counts in each observation + multinomial_coeff_denom = multinomial_coeff_denom + math.log(math.factorial(obs[x])) + prod_probs = prod_probs + obs[x]*math.log(probs[x]) + + multinomial_coeff = math.log(math.factorial(sum(obs))) - multinomial_coeff_denom + likelihood = multinomial_coeff + prod_probs + return likelihood + + # 1st: Coin B, {HTTTHHTHTH}, 5H,5T + # 2nd: Coin A, {HHHHTHHHHH}, 9H,1T + # 3rd: Coin A, {HTHHHHHTHH}, 8H,2T + # 4th: Coin B, {HTHTTTHHTT}, 4H,6T + # 5th: Coin A, {THHHTHHHTH}, 7H,3T + # so, from MLE: pA(heads) = 0.80 and pB(heads)=0.45 + + # represent the experiments + head_counts = np.array([5,9,8,4,7]) + tail_counts = 10-head_counts + experiments = zip(head_counts,tail_counts) + + # initialise the pA(heads) and pB(heads) + pA_heads = np.zeros(100); pA_heads[0] = 0.60 + pB_heads = np.zeros(100); pB_heads[0] = 0.50 + + # E-M begins! 
+ delta = 0.001 + j = 0 # iteration counter + improvement = float('inf') + while (improvement>delta): + expectation_A = np.zeros((5,2), dtype=float) + expectation_B = np.zeros((5,2), dtype=float) + for i in range(0,len(experiments)): + e = experiments[i] # i'th experiment + ll_A = get_mn_log_likelihood(e,np.array([pA_heads[j],1-pA_heads[j]])) # loglikelihood of e given coin A + ll_B = get_mn_log_likelihood(e,np.array([pB_heads[j],1-pB_heads[j]])) # loglikelihood of e given coin B + + weightA = math.exp(ll_A) / ( math.exp(ll_A) + math.exp(ll_B) ) # corresponding weight of A proportional to likelihood of A + weightB = math.exp(ll_B) / ( math.exp(ll_A) + math.exp(ll_B) ) # corresponding weight of B proportional to likelihood of B + + expectation_A[i] = np.dot(weightA, e) + expectation_B[i] = np.dot(weightB, e) + + pA_heads[j+1] = sum(expectation_A)[0] / sum(sum(expectation_A)); + pB_heads[j+1] = sum(expectation_B)[0] / sum(sum(expectation_B)); + + improvement = max( abs(np.array([pA_heads[j+1],pB_heads[j+1]]) - np.array([pA_heads[j],pB_heads[j]]) )) + j = j+1 + + + [1]: http://ai.stanford.edu/~chuongdo/papers/em_tutorial.pdf + [2]: http://math.stackexchange.com/questions/25111/how-does-expectation-maximization-work",,2013-10-15 07:21:33.433 +186463,57501,20470.0,5,,CC BY-SA 3.0,14a084c7-7ff5-43b0-918e-1dc5169d1673,"This is a recipe to learn EM with a practical 'Coin-Toss' example: + +**1-** Read this short [EM tutorial paper][1] by Do and Batzoglou. + +**2-** You may have question marks in your head, have a look at the explanations on this maths stack exchange [page][2] (which clarified the example on the paper for me). + +**3-** Look at/run this code that I wrote in Python that simulates the solution to the coin-toss problem in the EM tutorial paper of item 1: + +***Warning :*** The code may be messy/suboptimal, but it does the job. + + import numpy as np + import math + + #### E-M Coin Toss Example as given in the EM tutorial paper by Do and Batzoglou* #### + + def get_mn_log_likelihood(obs,probs): + """""" Return the (log)likelihood of obs, given the probs"""""" + # Multinomial Distribution Log PMF + # ln (pdf) = multinomial coeff * product of probabilities + # ln[f(x|n, p)] = [ln(n!) - (ln(x1!)+ln(x2!)+...+ln(xk!))] + [x1*ln(p1)+x2*ln(p2)+...+xk*ln(pk)] + + multinomial_coeff_denom= 0 + prod_probs = 0 + for x in range(0,len(obs)): # loop through state counts in each observation + multinomial_coeff_denom = multinomial_coeff_denom + math.log(math.factorial(obs[x])) + prod_probs = prod_probs + obs[x]*math.log(probs[x]) + + multinomial_coeff = math.log(math.factorial(sum(obs))) - multinomial_coeff_denom + likelihood = multinomial_coeff + prod_probs + return likelihood + + # 1st: Coin B, {HTTTHHTHTH}, 5H,5T + # 2nd: Coin A, {HHHHTHHHHH}, 9H,1T + # 3rd: Coin A, {HTHHHHHTHH}, 8H,2T + # 4th: Coin B, {HTHTTTHHTT}, 4H,6T + # 5th: Coin A, {THHHTHHHTH}, 7H,3T + # so, from MLE: pA(heads) = 0.80 and pB(heads)=0.45 + + # represent the experiments + head_counts = np.array([5,9,8,4,7]) + tail_counts = 10-head_counts + experiments = zip(head_counts,tail_counts) + + # initialise the pA(heads) and pB(heads) + pA_heads = np.zeros(100); pA_heads[0] = 0.60 + pB_heads = np.zeros(100); pB_heads[0] = 0.50 + + # E-M begins! 
+ delta = 0.001 + j = 0 # iteration counter + improvement = float('inf') + while (improvement>delta): + expectation_A = np.zeros((5,2), dtype=float) + expectation_B = np.zeros((5,2), dtype=float) + for i in range(0,len(experiments)): + e = experiments[i] # i'th experiment + ll_A = get_mn_log_likelihood(e,np.array([pA_heads[j],1-pA_heads[j]])) # loglikelihood of e given coin A + ll_B = get_mn_log_likelihood(e,np.array([pB_heads[j],1-pB_heads[j]])) # loglikelihood of e given coin B + + weightA = math.exp(ll_A) / ( math.exp(ll_A) + math.exp(ll_B) ) # corresponding weight of A proportional to likelihood of A + weightB = math.exp(ll_B) / ( math.exp(ll_A) + math.exp(ll_B) ) # corresponding weight of B proportional to likelihood of B + + expectation_A[i] = np.dot(weightA, e) + expectation_B[i] = np.dot(weightB, e) + + pA_heads[j+1] = sum(expectation_A)[0] / sum(sum(expectation_A)); + pB_heads[j+1] = sum(expectation_B)[0] / sum(sum(expectation_B)); + + improvement = max( abs(np.array([pA_heads[j+1],pB_heads[j+1]]) - np.array([pA_heads[j],pB_heads[j]]) )) + j = j+1 + + + [1]: http://ai.stanford.edu/~chuongdo/papers/em_tutorial.pdf + [2]: http://math.stackexchange.com/questions/25111/how-does-expectation-maximization-work",edited body,2013-10-15 07:27:05.690 +186464,57502,22704.0,2,,CC BY-SA 3.0,e7896a51-07f0-4ea1-9b17-ccb021b9c595,"Take a look at a post in Healthy Algorithm: +http://healthyalgorithms.com/2011/11/23/causal-modeling-in-python-bayesian-networks-in-pymc/ + +also in PyMC's totorial: +http://pymc-devs.github.io/pymc/tutorial.html + +Maybe you would try the following code clip (assuming you have imported pymc as mc): + + A = mc.Normal('A', mu_A, tau_A) + B = mc.Normal('B', mu_B, tau_B) + p_C = mc.Lambda('p_C', lambda A=A, B=B: <>, doc='Pr[C|AB]') + C = mc.Bernoulli('C', p_C)",,2013-10-15 07:29:18.930 +186465,57503,5637.0,2,,CC BY-SA 3.0,dee819ef-2704-4837-a2ff-5b8c33a9c5cc,"The names of the parameters are suggestive. Location, scale, and shape parameters are associated with central tendency, dispersion and skewness respectively. For eg: If you change location parameters, mostly it change only the central tendency measures. + + +Try this online tool. [Distributions][1] + +See how the distribution changes for different values of parameters. You could try this with generalized extreme value distribution. + +Not all standard distributions have all three parameters. Some distributions have only one or two of the parameters (eg: gamma distribution-shape and scale parameters) + + + [1]: http://socr.ucla.edu/htmls/SOCR_Distributions.html",,2013-10-15 07:33:28.403 +186466,57501,20470.0,5,,CC BY-SA 3.0,6b73b0a7-9218-4d3b-8333-e23230351bb4,"This is a recipe to learn EM with a practical and (in my opinion) very intuitive 'Coin-Toss' example: + +**1-** Read this short [EM tutorial paper][1] by Do and Batzoglou. This is the schema where the coin toss example is explained: + +![enter image description here][2] + +**2-** You may have question marks in your head, have a look at the explanations on this maths stack exchange [page][3] (which clarified the example on the paper for me). + +**3-** Look at/run this code that I wrote in Python that simulates the solution to the coin-toss problem in the EM tutorial paper of item 1: + +**P.S** The code may be suboptimal, but it does the job. 
+ + import numpy as np + import math + + #### E-M Coin Toss Example as given in the EM tutorial paper by Do and Batzoglou* #### + + def get_mn_log_likelihood(obs,probs): + """""" Return the (log)likelihood of obs, given the probs"""""" + # Multinomial Distribution Log PMF + # ln (pdf) = multinomial coeff * product of probabilities + # ln[f(x|n, p)] = [ln(n!) - (ln(x1!)+ln(x2!)+...+ln(xk!))] + [x1*ln(p1)+x2*ln(p2)+...+xk*ln(pk)] + + multinomial_coeff_denom= 0 + prod_probs = 0 + for x in range(0,len(obs)): # loop through state counts in each observation + multinomial_coeff_denom = multinomial_coeff_denom + math.log(math.factorial(obs[x])) + prod_probs = prod_probs + obs[x]*math.log(probs[x]) + + multinomial_coeff = math.log(math.factorial(sum(obs))) - multinomial_coeff_denom + likelihood = multinomial_coeff + prod_probs + return likelihood + + # 1st: Coin B, {HTTTHHTHTH}, 5H,5T + # 2nd: Coin A, {HHHHTHHHHH}, 9H,1T + # 3rd: Coin A, {HTHHHHHTHH}, 8H,2T + # 4th: Coin B, {HTHTTTHHTT}, 4H,6T + # 5th: Coin A, {THHHTHHHTH}, 7H,3T + # so, from MLE: pA(heads) = 0.80 and pB(heads)=0.45 + + # represent the experiments + head_counts = np.array([5,9,8,4,7]) + tail_counts = 10-head_counts + experiments = zip(head_counts,tail_counts) + + # initialise the pA(heads) and pB(heads) + pA_heads = np.zeros(100); pA_heads[0] = 0.60 + pB_heads = np.zeros(100); pB_heads[0] = 0.50 + + # E-M begins! + delta = 0.001 + j = 0 # iteration counter + improvement = float('inf') + while (improvement>delta): + expectation_A = np.zeros((5,2), dtype=float) + expectation_B = np.zeros((5,2), dtype=float) + for i in range(0,len(experiments)): + e = experiments[i] # i'th experiment + ll_A = get_mn_log_likelihood(e,np.array([pA_heads[j],1-pA_heads[j]])) # loglikelihood of e given coin A + ll_B = get_mn_log_likelihood(e,np.array([pB_heads[j],1-pB_heads[j]])) # loglikelihood of e given coin B + + weightA = math.exp(ll_A) / ( math.exp(ll_A) + math.exp(ll_B) ) # corresponding weight of A proportional to likelihood of A + weightB = math.exp(ll_B) / ( math.exp(ll_A) + math.exp(ll_B) ) # corresponding weight of B proportional to likelihood of B + + expectation_A[i] = np.dot(weightA, e) + expectation_B[i] = np.dot(weightB, e) + + pA_heads[j+1] = sum(expectation_A)[0] / sum(sum(expectation_A)); + pB_heads[j+1] = sum(expectation_B)[0] / sum(sum(expectation_B)); + + improvement = max( abs(np.array([pA_heads[j+1],pB_heads[j+1]]) - np.array([pA_heads[j],pB_heads[j]]) )) + j = j+1 + + + [1]: http://ai.stanford.edu/~chuongdo/papers/em_tutorial.pdf + [2]: https://i.stack.imgur.com/mj0nb.gif + [3]: http://math.stackexchange.com/questions/25111/how-does-expectation-maximization-work",added 128 characters in body,2013-10-15 07:35:33.550 +186467,57504,22678.0,2,,CC BY-SA 3.0,58cc8075-3c1b-41fc-93a7-c150bc326371,"Basically, it depends of the function type, you are trying to model from data: + +* Linear. $f(x)=a_1x_2+a_2x_2+ \cdots$ + +* Nonlinear: $f(x)=x_1*x_2/x_4+exp(-x_2/x_2)$ + +* Parameteric: actually, the both from top, but where you have physical/application meaning for the parameters $x_i$. e.g. splines, where the parameters of interest represent the path of a trajectory. + +* Non-Parametric: Linear model for nonlinear problems. Same as splines, but the bases are kernels. This is good, when you have a nonlinear/complex model but would like to do some kind of model selection (which abstract $x_i$ is the most important for your data e.g.). See Kernel (ridge) regression for details on this. 
+ +",,2013-10-15 07:41:50.867 +186468,57505,22678.0,2,,CC BY-SA 3.0,db2f0890-4be7-445f-bfaa-ee5038e3b14c,"You will need a robust loss function in the kernel estimation model. +However, this topic may become quite advances very fast. :) +For a good start, I would suggest the one class SVM from sklearn. +http://scikit-learn.org/stable/modules/svm.html#density-estimation-novelty-detection +",,2013-10-15 07:46:47.790 +186469,57506,5671.0,2,,CC BY-SA 3.0,451e1840-c3e4-4432-b9f9-c64af518c673,"Actually the simplest approach would be **Association Rule Mining**, aka Frequent Itemset Mining (FIM). ""Clustering"" is an attempt to uncover structure, but not so much to make recommendations. It's explorative, not predictive; the clusters will most often be something rather obvious to the domain expert. + +FIM will learn rules of the form that students, which have taken class A and B, have also taken class C with x% probability, i.e. + +$$ {A,B} \rightarrow {C} \text{ with confidence }x\%$$ + +You *really* need to go through some introductory course. APRIORI is discussed everywhere, and is an obbvious fit here. In particular as you don't have quantities to predict (you don't have users that take class A 5 times and class B 2 times and thus are likely to buy -2 times class C...) Depending on your data, FPGrowth or Eclat algorithms may be more performat though.",,2013-10-15 07:47:16.070 +186476,57508,3993.0,3,,CC BY-SA 3.0,db70dac2-603d-4dc2-b845-8d57649c7408,,,2013-10-15 08:21:54.933 +186479,57403,20062.0,5,,CC BY-SA 3.0,9cac6f26-dc33-4ea3-8231-94c3d00fb31d,"**Mantel's test widely occur in biological studies** in which is used to +examine the correlation between spatial distribution of animals (position in space) with for example their genetic relatedness, rate of aggression or some other attribute. Plenty of good journals are using it (*PNAS, Animal Behaviour, Molecular Ecology...*) + +Recently I fabricated some patterns which may occur in nature, but Mantel's test **seems to be quite useless.** *(see p-values tests under each plot)* + + +---------- + + +***Imaginary situation:*** Suppose you have orchard (17 x 17 trees) and on each tree a crow is sitting. Levels of ""noise"" for each crow are given and you are searching for pattern in distribution. + +***There are 5 possibilities:*** + + 1. **""Birds of a feather flock together.""** The more similar crows are, the + smaller is geographical distance between them **(single cluster)**. + + 2. **""Birds of a feather flock together.""** Also the more similar crows + are, the smaller is geographical distance between them **(multiple + clusters)** but, one cluster of noisy crows has no knowledge about the + existence of second cluster (otherwise they would fuse into a one big + cluster). + + 3. **""Monotonical trend occur""**. + + 4. **""Opposites attract each other""**. Similar crows can not withstand each + other. + + 5. **""Random pattern""** - the level of noise has no significant effect on + spatial distribution. + +Under each case is given plot of points from which the Mantel test computes correlation (it is no surprise that its results are non-significant, I would never try to find linear association among such patterns of points). + +![enter image description here][1] + + + [1]: https://i.stack.imgur.com/TWQqa.png + +**Why scientists do not use Moran's I instead? Is there some hidden reason I do not see? +And if there is such reason,how can I know (how different the hypotheses must be constructed) to appropriately use Mantel's or Moran's I test? 
A real life example will be helpful.** + + +---------- + + +**Example data:** *(compressed as possible)* + + r.gen<-seq(-100,100,5) + r.val<-sample(r.gen, 289, replace=TRUE) + z10<-rep(0, times=10) + z11<-rep(0, times=11) + r5<-c(5,15,25,15,5) + r71<-c(5,20,40,50,40,20,5) + r72<-c(15,40,60,75,60,40,15) + r73<-c(25,50,75,100,75,50,25) + rbPal <- colorRampPalette(c(""blue"",""red"")) + my.data<-data.frame(x = rep(1:17, times=17),y = rep(1:17, each=17), + c1 = c(rep(0,times=155),r5,z11,r71,z10,r72,z10,r73,z10,r72,z10,r71, + z11,r5,rep(0, times=27)),c2 = c(rep(0,times=19),r5,z11,r71,z10,r72, + z10,r73,z10,r72,z10,r71,z11,r5,rep(0, times=29),r5,z11,r71,z10,r72, + z10,r73,z10,r72,z10,r71,z11,r5,rep(0, times=27)),c3 = c(seq(20,100,5), + seq(15,95,5),seq(10,90,5),seq(5,85,5),seq(0,80,5),seq(-5,75,5), + seq(-10,70,5),seq(-15,65,5),seq(-20,60,5),seq(-25,55,5),seq(-30,50,5), + seq(-35,45,5),seq(-40,40,5),seq(-45,35,5),seq(-50,30,5),seq(-55,25,5), + seq(-60,20,5)),c4 = rep(c(0,100), length=289),c5 = sample(r.gen, 289, + replace=TRUE)) + + # adding colors + my.data$Col1 <- rbPal(10)[as.numeric(cut(my.data$c1,breaks = 10))] + my.data$Col2 <- rbPal(10)[as.numeric(cut(my.data$c2,breaks = 10))] + my.data$Col3 <- rbPal(10)[as.numeric(cut(my.data$c3,breaks = 10))] + my.data$Col4 <- rbPal(10)[as.numeric(cut(my.data$c4,breaks = 10))] + my.data$Col5 <- rbPal(10)[as.numeric(cut(my.data$c5,breaks = 10))] + +Creating matrix of geographical distances (for Moran's I is inversed): + + point.dists <- dist(cbind(my.data$x,my.data$y)) + point.dists.inv <- 1/point.dists + point.dists.inv<-as.matrix(point.dists.inv) + diag(point.dists.inv) <- 0 + +Plot creation: + + X11(width=12, height=6) + par(mfrow=c(2,5)) + par(mar=c(1,1,1,1)) + + library(ape) + for (i in 3:7) { + my.res<-mantel.test(as.matrix(dist(my.data[ ,i])), as.matrix(point.dists)) + plot(my.data$x,my.data$y,pch=20,col=my.data[ ,c(i+5)], cex=2.5, xlab="""", + ylab="""", xaxt=""n"", yaxt=""n"", ylim=c(-4.5,17)) + text(4.5, -2.25, paste(""Mantel's test"", ""\n z.stat ="", round(my.res$z.stat, + 2), ""\n p.value ="", round(my.res$p, 3))) + my.res<-Moran.I(my.data[ ,i], point.dists.inv) + text(12.5, -2.25, paste(""Moran's I"", ""\n observed ="", round(my.res$observed, + 3), ""\n expected ="",round(my.res$expected,3), ""\n std.dev ="", + round(my.res$sd,3), ""\n p.value ="", round(my.res$p.value, 3))) + } + + par(mar=c(5,4,4,2)+0.1) + + for (i in 3:7) { + plot(dist(my.data[ ,i]), point.dists,pch = 20, xlab=""geographical distance"", + ylab=""behavioural distance"") + } + +P.S. if you google each tests, in these examples are both used on the exact same data +and exact the same hypothesis...**not very helpful** + +Mantel test: http://www.ats.ucla.edu/stat/r/faq/mantel_test.htm + +Moran's I: http://www.ats.ucla.edu/stat/r/faq/morans_i.htm",added 39 characters in body,2013-10-15 08:23:13.427 +186480,57509,16474.0,2,,CC BY-SA 3.0,54f66405-b401-4c26-8511-b8ea573e7d12,"In many cases a distribution can be described as a result of some idealized experiment. For example if we flip a fair coin $n$ times the number of heads will follow a binomial distribution with parameters $n$ and .5. These idealized experiments are often used as models; they are used as simplified representation of how the data came to be. There are obviously many such models, and as a consequence many distributions. If you want the logic behind all distributions, then that will require a book of many volumes, e.g.: + +N. L. Johnson, S. Kotz and N. Balakrishnan (2000). 
_Continuous Multivariate Distributions_, Vol. 1 (second edition), New York: Wiley & Sons. + + N. L. Johnson, S. Kotz and N. Balakrishnan (1997). _Discrete Multivariate Distributions_. New York: John Wiley & Sons. + + N. L. Johnson, S. Kotz and N. Balakrishnan (1995). _Continuous Univariate Distributions_, Vol. 2 (second edition), New York: John Wiley & Sons. + + N. L. Johnson, S. Kotz and N. Balakrishnan (1994). _Continuous Univariate Distributions_, Vol. 1 (second edition), New York: John Wiley & Sons. + + N. L. Johnson, A. W. Kemp and S. Kotz (1992). _Univariate Discrete Distributions_ (second edition), New York: John Wiley & Sons. + +A shorter list of distributions that is more suitable/affordable for owning yourself is: + +Forbes, C., Evans, M., Hastings, N., & Peacock, B. (2011). Statistical distributions. Wiley",,2013-10-15 08:28:49.833 +186481,57492,20130.0,5,,CC BY-SA 3.0,04914208-f376-4f68-b180-47443cbc6682,"There's a question for [textbooks](http://stats.stackexchange.com/questions/4612/good-econometrics-textbooks) here, but I would like to ask similar question about handbooks: What econometrics handbooks would you recommend? + +Assumed audience is researchers and graduate-level students. It needs to include the material of Greene's _Econometric Analysis_ and Wooldridge's _Econometric Analysis of Cross Section and Panel Data_ in a denser form (without discussions and proofs), as well as mainstream techniques not mentioned in these two books. + +One option is Elsevier's 6 volumes of _Handbook of Econometrics_ series edited by Griliches and Instriligator. However, perhaps you would recommend other handbooks, more concise or otherwise? + +I suggest two possible formats: one is a reference-card format with minimum explanations, and the other one is a more extended format with proofs and more detailed exposition.",added 322 characters in body,2013-10-15 08:37:16.480 +186482,57510,19557.0,2,,CC BY-SA 3.0,2cd67842-111b-4450-b010-57955347945e,"ID3 is an [algorithm](http://www.cise.ufl.edu/~ddd/cap6635/Fall-97/Short-papers/2.htm) for building a decision tree classifier based on maximizing information gain at each level of splitting across all available attributes. It's a precursor to the [C4.5 algorithm](http://www2.cs.uregina.ca/~dbd/cs831/notes/ml/dtrees/c4.5/tutorial.html). + +The ID3 algorithm won't predict a continuous value (you typically use regression for that) but it can use continuous values as input and should segment those appropriately to classify the various instances in your training set. Without looking too closely at the data or knowing what tool you're using, I'd suggest using simple bins for the value you're attempting to predict (""high"" or ""low""). + +It depends on the distribution of values across the range 1-10 how many bins you use. The goal is to get a relatively even distribution of training samples across the bins. If you have values concentrated in one particular bin (either ""high"" or ""low""), you'll have problems with the precision and recall of your resulting model. In highly skewed data sets, always guessing the dominant class will seem the optimal result to the model. + +Most machine learning tools like [Weka](http://www.cs.waikato.ac.nz/ml/weka/) allow you to reclassify data into bins. It's also fairly simple to do this with a Python script or even a simple formula in Excel (=IF(::cell of interest:: > 5,""high"",""low"")) since this dataset is so small. 
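
If you prefer to stay in R, the same binning is a one-liner; the `score` vector below is a made-up stand-in for the 1-10 values being predicted, not data from the question:

    score <- c(2, 7, 5, 9, 1, 10, 4)                              # hypothetical example values
    ifelse(score > 5, 'high', 'low')                              # simple threshold at 5
    cut(score, breaks = c(0, 5, 10), labels = c('low', 'high'))   # same idea with cut()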
+ +However you decide to accomplish binning, make sure you look at the data first to make sure those bins make sense.",,2013-10-15 08:47:11.623 +186485,57511,8819.0,1,,CC BY-SA 3.0,00523226-1afc-440c-a428-077b71005636,Integration with respect to Multivariate normal distribution,,2013-10-15 08:47:23.500 +186484,57511,8819.0,3,,CC BY-SA 3.0,00523226-1afc-440c-a428-077b71005636,,,2013-10-15 08:47:23.500 +186483,57511,8819.0,2,,CC BY-SA 3.0,00523226-1afc-440c-a428-077b71005636,"I am working on the numerical integration of an integral of the following functional form: + +$$ \int\limits_{R^{G}} F(x_{1},x_{2},\text{…}x_{G})d\text{Φ}_{\text{Σ}}(x_{1},x_{2},\text{…},x_{G}) $$ + +Here +$$ \text{Φ}_{\text{Σ}}(x_{1},x_{2},\text{…},x_{G}) $$ + +is the G-dimensional multivariate normal distribution with correlation matrix $\Sigma$ and F is some function of the constituent marginals. + +What I am essentially doing is calculating the expectation of a function over a correlated multivariate normal distribution. Practically, G is expected to be equal to or less than 4 and most often just 2 or 3. + +Can some one know-how share any of the fundamental references that tackles the issue.My research yielded some information, and it appears that Gaussian quadrature is one of the preferred ways to approach the problem. I am referring to the book **Applied Computational +Economics and Finance** by Miranda and Fackler for addressing the implementation aspects of the algorithm. + +But, I wanted to get some help from the expert community here on if I am on the right track. + +Sorry if it is a repeat, however I searched on the site, and was not able to find a question that matches with what I had.",,2013-10-15 08:47:23.500 +186486,57511,,25,,,98ff56cb-661f-4da7-9f71-50572a9a8643,,http://twitter.com/#!/StackStats/status/390045115139784704,2013-10-15 09:22:56.877 +186487,57496,12314.0,5,,CC BY-SA 3.0,56107172-1e2b-409a-8a30-1b988fa357cd,"Suppose that I have daily data on the population of a small village, given by $Y(t)$, as well as daily data on various factors that are relevant to the size of the population in the future, given by vector $X(t)$. These explanatory variables include untransformed variables as well as features engineered to be informative over long horizons (e.g. one of the variables captures the number of deaths over the last 30 days). I have collected this data for 8 years. + +My objective is to forecast $Y(t)$ ahead by 1,2,3,...,365 days. I expect long-run forecasts to be different to short-run forecasts. If a holiday season is coming up I might expect a downwards spike in a few months time (people visiting the city), but if someone is on their deathbed then I will expect a downwards spike in a few days. + +Since the population is sufficiently small that $\Delta Y(t+k)$ is typically in $\{-2,-1,0,1,2\}$ for the forecasting horizon under question, I will use a multiple categorical response variable classification model that will assign probabilities to the various class labels being observed. + +My question centers on the specific considerations I need to make when constructing forecasts of the change from $Y(t)$ to $Y(t+k)$ where $k$ is large (e.g. 100 days). + +Basically there will be the most hideous autocorrelation structure in $\Delta Y(t+k)$ over these time scales. If someone dies on day $2$, they are also dead on day $3, 4, ..., k$, meaning a string of $k$ or so $\Delta Y(t+k)$ will contain this same information. 
+ +These queries result: + +- What are some ways of dealing with this immense autocorrelation structure in my response. Is it even a problem? +- Are there alternative methodologies to the ones I've proposed for forecasting these horizons (aside from typical machine learning methods such as random forests which I'm already working with). +- Any other handy advice. + + + + + + + + +",deleted 1 characters in body,2013-10-15 09:23:12.893 +186488,57512,22707.0,2,,CC BY-SA 3.0,b7ed8621-0a12-4c24-a4a6-d5cae062c956,"The output of TANH is already between -1 and 1. So, if you normalise the inpu, be sure to normalise for the hidden activation functions. In theory it is not required to normalise, because tanh(1000) is mathematically different from tanh(10000). But in practice these are the same, so you should indeed normalise the input in most applications.",,2013-10-15 09:31:11.227 +186490,57513,10594.0,2,,CC BY-SA 3.0,96a35508-4fe5-4957-98d5-5291599c2ef8,"I am conducting a glm. The explanatory variable is categorical with three levels (control, treat1, treat2). The response variable is 0 or 1. +The response rate for each treatment level is ploted as the figure below (from left to right: control, treat1, treat2): + +![enter image description here][1] + +There seems to be a big treatment effect between treat1 vs. control and treat2 vs. control. I applied glm: + +`fit <- glm(response ~ treatment, family = binomial, data = dat)` + + Coefficients: + Estimate Std. Error z value Pr(>|z|) + (Intercept) -21.57 6536.57 -0.003 0.997 + treat1 23.76 6536.57 0.004 0.997 + treat2 43.13 9364.95 0.005 0.996 + +The z-test shows that neither treat1 nor treat2 is significant compared to the reference level control. + +However, the analysis of deviance confirmed that the treatment factor as a whole is highly significant: + + drop1(M2, test=""Chisq"") + response ~ treatment + Df Deviance AIC LRT Pr(>Chi) + 13.003 19.003 + treatment 2 77.936 79.936 64.932 7.946e-15 *** + +How shall I interpret such strange result? Why the individual z-test does not give me any significant result, while according to the plot there is obvious an effect between treat1 and control, and between treat2 and control? + +Thanks + + + + + [1]: https://i.stack.imgur.com/Abi8S.png",,2013-10-15 09:49:58.953 +186493,57513,10594.0,5,,CC BY-SA 3.0,a205df0c-ff86-4268-aaa7-7c1e43aa77b3,"I am conducting a glm. The explanatory variable is categorical with three levels (control, treat1, treat2). The response variable is 0 or 1. +The response rate for each treatment level is ploted as the figure below (from left to right: control, treat1, treat2): + +![enter image description here][1] + +There seems to be a big treatment effect between treat1 vs. control and treat2 vs. control. I applied glm: + +`fit <- glm(response ~ treatment, family = binomial, data = dat)` + + Coefficients: + Estimate Std. Error z value Pr(>|z|) + (Intercept) -21.57 6536.57 -0.003 0.997 + treat1 23.76 6536.57 0.004 0.997 + treat2 43.13 9364.95 0.005 0.996 + +The z-test shows that neither treat1 nor treat2 is significant compared to the reference level control. + +However, the analysis of deviance confirmed that the treatment factor as a whole is highly significant: + + drop1(M2, test=""Chisq"") + + response ~ treatment + Df Deviance AIC LRT Pr(>Chi) + 13.003 19.003 + treatment 2 77.936 79.936 64.932 7.946e-15 *** + +How shall I interpret such strange result? 
Why the individual z-test does not give me any significant result, while according to the plot there is obvious an effect between treat1 and control, and between treat2 and control? + +Thanks + + + + + [1]: https://i.stack.imgur.com/Abi8S.png",added 18 characters in body,2013-10-15 09:57:39.077 +186496,57514,14874.0,3,,CC BY-SA 3.0,ee3b9636-f7b1-4cff-a940-8cfb978839b9,,,2013-10-15 10:00:24.350 +186495,57514,14874.0,1,,CC BY-SA 3.0,ee3b9636-f7b1-4cff-a940-8cfb978839b9,Constructing a bivariate distribution from two gamma-distributed random variables with nonlinear dependence?,,2013-10-15 10:00:24.350 +186494,57514,14874.0,2,,CC BY-SA 3.0,ee3b9636-f7b1-4cff-a940-8cfb978839b9,"I've got 2 gamma-distributed random variables $(X,Y)$ with arbitrary scale and shape parameters. Further, $Y$ should be a non-linear function of $X$, lets say $Y=\sqrt{X}$. What I am interested in is the joint probability $F_{X,Y}(\cdot)$. + +All suggestions or general comments are welcome. + +Thank you in advance + +",,2013-10-15 10:00:24.350 +186497,57403,20062.0,5,,CC BY-SA 3.0,bf03f4b5-03a4-4ba2-8e69-2386289dcdc5,"**Mantel's test widely occur in biological studies** in which is used to +examine the correlation between spatial distribution of animals (position in space) with for example their genetic relatedness, rate of aggression or some other attribute. Plenty of good journals are using it (*PNAS, Animal Behaviour, Molecular Ecology...*) + +Recently I fabricated some patterns which may occur in nature, but Mantel's test **seems to be quite useless.** *(see p-values under each plot)* + + +---------- + + +***Imaginary situation:*** Suppose you have orchard (17 x 17 trees) and on each tree a crow is sitting. Levels of ""noise"" for each crow are given and you are searching for pattern in distribution. + +***There are 5 possibilities:*** + + 1. **""Birds of a feather flock together.""** The more similar crows are, the + smaller is geographical distance between them **(single cluster)**. + + 2. **""Birds of a feather flock together.""** Also the more similar crows + are, the smaller is geographical distance between them **(multiple + clusters)** but, one cluster of noisy crows has no knowledge about the + existence of second cluster (otherwise they would fuse into a one big + cluster). + + 3. **""Monotonical trend occur""**. + + 4. **""Opposites attract each other""**. Similar crows can not withstand each + other. + + 5. **""Random pattern""** - the level of noise has no significant effect on + spatial distribution. + +Under each case is given plot of points from which the Mantel test computes correlation (it is no surprise that its results are non-significant, I would never try to find linear association among such patterns of points). + +![enter image description here][1] + + + [1]: https://i.stack.imgur.com/TWQqa.png + +**Why scientists do not use Moran's I instead? Is there some hidden reason I do not see? +And if there is such reason,how can I know (how different the hypotheses must be constructed) to appropriately use Mantel's or Moran's I test? 
A real life example will be helpful.** + + +---------- + + +**Example data:** *(compressed as possible)* + + r.gen<-seq(-100,100,5) + r.val<-sample(r.gen, 289, replace=TRUE) + z10<-rep(0, times=10) + z11<-rep(0, times=11) + r5<-c(5,15,25,15,5) + r71<-c(5,20,40,50,40,20,5) + r72<-c(15,40,60,75,60,40,15) + r73<-c(25,50,75,100,75,50,25) + rbPal <- colorRampPalette(c(""blue"",""red"")) + my.data<-data.frame(x = rep(1:17, times=17),y = rep(1:17, each=17), + c1 = c(rep(0,times=155),r5,z11,r71,z10,r72,z10,r73,z10,r72,z10,r71, + z11,r5,rep(0, times=27)),c2 = c(rep(0,times=19),r5,z11,r71,z10,r72, + z10,r73,z10,r72,z10,r71,z11,r5,rep(0, times=29),r5,z11,r71,z10,r72, + z10,r73,z10,r72,z10,r71,z11,r5,rep(0, times=27)),c3 = c(seq(20,100,5), + seq(15,95,5),seq(10,90,5),seq(5,85,5),seq(0,80,5),seq(-5,75,5), + seq(-10,70,5),seq(-15,65,5),seq(-20,60,5),seq(-25,55,5),seq(-30,50,5), + seq(-35,45,5),seq(-40,40,5),seq(-45,35,5),seq(-50,30,5),seq(-55,25,5), + seq(-60,20,5)),c4 = rep(c(0,100), length=289),c5 = sample(r.gen, 289, + replace=TRUE)) + + # adding colors + my.data$Col1 <- rbPal(10)[as.numeric(cut(my.data$c1,breaks = 10))] + my.data$Col2 <- rbPal(10)[as.numeric(cut(my.data$c2,breaks = 10))] + my.data$Col3 <- rbPal(10)[as.numeric(cut(my.data$c3,breaks = 10))] + my.data$Col4 <- rbPal(10)[as.numeric(cut(my.data$c4,breaks = 10))] + my.data$Col5 <- rbPal(10)[as.numeric(cut(my.data$c5,breaks = 10))] + +Creating matrix of geographical distances (for Moran's I is inversed): + + point.dists <- dist(cbind(my.data$x,my.data$y)) + point.dists.inv <- 1/point.dists + point.dists.inv<-as.matrix(point.dists.inv) + diag(point.dists.inv) <- 0 + +Plot creation: + + X11(width=12, height=6) + par(mfrow=c(2,5)) + par(mar=c(1,1,1,1)) + + library(ape) + for (i in 3:7) { + my.res<-mantel.test(as.matrix(dist(my.data[ ,i])), as.matrix(point.dists)) + plot(my.data$x,my.data$y,pch=20,col=my.data[ ,c(i+5)], cex=2.5, xlab="""", + ylab="""", xaxt=""n"", yaxt=""n"", ylim=c(-4.5,17)) + text(4.5, -2.25, paste(""Mantel's test"", ""\n z.stat ="", round(my.res$z.stat, + 2), ""\n p.value ="", round(my.res$p, 3))) + my.res<-Moran.I(my.data[ ,i], point.dists.inv) + text(12.5, -2.25, paste(""Moran's I"", ""\n observed ="", round(my.res$observed, + 3), ""\n expected ="",round(my.res$expected,3), ""\n std.dev ="", + round(my.res$sd,3), ""\n p.value ="", round(my.res$p.value, 3))) + } + + par(mar=c(5,4,4,2)+0.1) + + for (i in 3:7) { + plot(dist(my.data[ ,i]), point.dists,pch = 20, xlab=""geographical distance"", + ylab=""behavioural distance"") + } + +P.S. 
if you google each tests, in these examples are both used on the exact same data +and exact the same hypothesis...**not very helpful** + +Mantel test: http://www.ats.ucla.edu/stat/r/faq/mantel_test.htm + +Moran's I: http://www.ats.ucla.edu/stat/r/faq/morans_i.htm",deleted 6 characters in body,2013-10-15 10:06:31.947 +186498,57515,22677.0,2,,CC BY-SA 3.0,a1d78672-68e5-4d0e-856e-20f5a7ea7b35,"Nevermind, apparently the $ARL0=\frac1\alpha$ where alpha is ***false alarm probability*** + +a further reading wold be +>Nonparametric monitoring of data streams for changes in location and scale +GJ Ross, DK Tasoulis, NM Adams - Technometrics, 2011 - Taylor & Francis",,2013-10-15 10:13:54.107 +186499,57426,20470.0,5,,CC BY-SA 3.0,d998e12c-3396-4e85-86d8-20c15af34373,"**Question**: *Is the set-up below a sensible implementation of a Hidden Markov model?* + +I have a set of `108,000` observations (taken over the course of 100 days) and approximately `2000` event occurrences throughout the whole observation time-span. The data looks like the figure below where there are 3 observation levels and the red columns highlight event times, i.e. $t_E$'s: + +![enter image description here][1] + +As shown with red rectangles in the figure, I have dissected {$t_E$ to $t_{E-5}$} for each event, effectively treating these as ""pre-event windows"". + +**HMM Training:** I plan to [train][2] a Hidden Markov Model (HMM) based on all ""pre-event windows"", using the multiple observation sequences methodology as suggested on Pg. 273 of Rabiner's [paper][3]. Hopefully, this will allow me to train an HMM that captures the sequence patterns which lead to an event. + +**HMM Prediction:** Then I plan to use this HMM to [predict][4] $P(Observations|HMM)$ on a new day, where $Observations$ will be a sliding window vector, which I will update to contain the observations between the current time $t$ and $t-5$ as the day goes on. + +I expect to see $log[P(Observations|HMM)]$ increase for $Observations$ that resemble the ""pre-event windows"". This should in effect allow me to predict the events before they happen. + + + + + + + [1]: https://i.stack.imgur.com/QkIn0.png + [2]: http://en.wikipedia.org/wiki/Baum%E2%80%93Welch_algorithm + [3]: http://www.cs.cornell.edu/Courses/cs4758/2012sp/materials/hmm_paper_rabiner.pdf + [4]: http://en.wikipedia.org/wiki/Forward_algorithm",added 6 characters in body,2013-10-15 10:32:57.617 +186500,57479,0.0,36,,,a9ce8885-4a10-45ea-b0b9-8c8275db45c5,,from http://programmers.stackexchange.com/questions/214426/how-would-you-use-pair-wise-plots-to-test-the-effectiveness-of-k-means-clusterin,2013-10-15 10:39:06.057 +186503,57516,21884.0,2,,CC BY-SA 3.0,7ece545a-e6b3-4db3-9afe-5d8e1cd67990,"Is it true that (and if so, how does one prove) the following. + $$E\left|\hat{Var}_{n}(X)-Var(X)\right|^{2}=O(n^{-1})$$ + where: + +• $X$ is a random variable with mean $\mu$ and variance $\sigma^{2}$ + + +• $\hat{Var}_{n}(X)$ is the sample variance of $X$ from $n$ + i.i.d. random variables $X_{1},\cdots,X_{n}$ with mean $\mu$ + and variance $\sigma^{2}$. + +Many thanks in advance. 
(Feel free to change my notation).",,2013-10-15 10:40:17.880 diff --git a/examples/csv_examples/post_links.csv b/examples/csv_examples/post_links.csv new file mode 100644 index 00000000..9c42f49e --- /dev/null +++ b/examples/csv_examples/post_links.csv @@ -0,0 +1,57 @@ +id,related_post_id,post_id,link_type_id,creation_date +190,414,1248,1,2010-10-23 01:35:01.237 +1219,3646,16313,1,2011-11-09 02:39:58.990 +1957,414,4187,1,2012-04-05 13:12:17.893 +6235,541,50739,1,2013-07-14 15:38:00.373 +7217,57195,57228,1,2013-10-10 16:08:47.950 +7219,8529,57244,1,2013-10-10 19:26:46.907 +7226,5015,57220,1,2013-10-11 10:23:28.123 +7232,57304,57307,1,2013-10-11 15:18:07.647 +7233,57317,57223,1,2013-10-11 18:06:51.590 +7234,57223,57317,1,2013-10-11 18:09:17.457 +7241,10911,57317,1,2013-10-12 01:03:28.597 +7247,57317,10911,1,2013-10-12 10:04:51.897 +7299,57390,57631,1,2013-10-16 18:22:09.447 +7302,57359,57578,1,2013-10-17 06:08:23.457 +7304,57192,57237,1,2013-10-17 08:09:13.697 +7320,57192,37981,1,2013-10-18 19:41:23.763 +7328,57538,57778,1,2013-10-18 21:56:33.543 +7348,57778,57538,1,2013-10-20 23:41:03.453 +7354,28,57869,1,2013-10-21 13:50:07.057 +7358,57976,57970,3,2013-10-22 00:01:40.250 +7370,57830,55209,1,2013-10-22 20:34:57.207 +7377,54637,58081,1,2013-10-23 10:49:22.693 +7379,58081,54637,1,2013-10-23 12:04:48.980 +7387,58149,57317,1,2013-10-24 04:24:12.760 +7388,10911,58149,1,2013-10-24 04:33:09.597 +7389,57317,58149,1,2013-10-24 04:33:09.597 +7390,57895,58154,1,2013-10-24 12:15:12.887 +7394,58090,58180,1,2013-10-24 20:28:21.067 +7395,58180,58090,1,2013-10-24 20:28:52.160 +7396,58183,58187,1,2013-10-24 21:34:20.433 +7413,58268,58265,1,2013-10-26 13:24:21.497 +7426,58269,58341,1,2013-10-28 08:12:34.160 +7428,57680,58357,1,2013-10-28 12:59:30.020 +7431,58308,58307,1,2013-10-28 15:31:19.840 +7433,58125,58369,1,2013-10-28 17:51:19.100 +7437,58027,58408,1,2013-10-29 05:11:18.327 +7445,2509,58453,1,2013-10-29 19:38:43.767 +7450,57990,58467,1,2013-10-30 12:15:40.743 +7454,58529,58290,1,2013-10-30 18:26:31.227 +7455,58534,58467,1,2013-10-30 19:14:14.273 +7468,58485,58373,1,2013-10-31 18:03:47.743 +7472,58515,58620,1,2013-11-01 00:05:59.307 +7478,14729,58650,1,2013-11-01 19:23:21.433 +7482,58179,58678,1,2013-11-02 07:37:47.103 +7489,58521,58720,1,2013-11-02 19:17:43.203 +7520,58678,58951,1,2013-11-06 03:24:17.310 +7521,58609,58951,1,2013-11-06 03:24:17.310 +7522,58951,58179,1,2013-11-06 03:27:09.100 +7544,59040,59119,1,2013-11-08 03:38:56.173 +7551,58818,59172,1,2013-11-08 21:00:03.390 +7558,59092,59124,1,2013-11-09 12:20:01.910 +7611,58678,59092,1,2013-11-13 19:20:55.760 +7649,57317,58445,1,2013-11-18 07:24:08.027 +14852,14729,58650,3,2015-01-18 17:48:06.900 +16962,47981,414,1,2015-04-14 13:49:32.550 +20431,16313,3646,1,2015-09-08 08:13:25.983 diff --git a/examples/csv_examples/posts.csv b/examples/csv_examples/posts.csv new file mode 100644 index 00000000..154eb115 --- /dev/null +++ b/examples/csv_examples/posts.csv @@ -0,0 +1,34976 @@ +id,owner_user_id,post_type_id,accepted_answer_id,parent_id,owner_display_name,title,tags,content_license,body,creation_date +57184,1741.0,2,,57086.0,,,,CC BY-SA 3.0,"

Here are some thoughts:

  1. All black-box models might be inspected in some way. For example, you can compute the variable importance of each feature, or plot the predicted response against the actual one for each feature (link); a small example sketch follows below.
  2. You might think about some pruning of the ensemble. Not all the trees in the forest are necessary and you might use just a few. Paper: [Search for the Smallest Random Forest, Zhang]. Otherwise just Google ""ensemble pruning"" and have a look at ""Ensemble Methods: Foundations and Algorithms"", Chapter 6.
  3. You can build a single model by feature selection, as you said. Otherwise you can also try to use Domingos' method in [Knowledge acquisition from examples via multiple models], which consists in building a new dataset from the black-box predictions and fitting a decision tree on top of it.
  4. As mentioned in this Stack Exchange answer, a tree model might seem interpretable but it is prone to large changes under small perturbations of the training data, so it is better to use a black-box model. The final aim of an end user is to understand why a new record is classified as a particular class, and you might compute feature importances just for that particular record.

I would go for 1. or 2.
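
To make point 1 concrete, here is a minimal, hypothetical sketch of the variable-importance idea in R, assuming the black box is a random forest fitted with the randomForest package (the data frame `d` and the factor response `y` are placeholders, not objects from the question):

    library(randomForest)
    # d: hypothetical data frame with predictors and a factor response y
    fit <- randomForest(y ~ ., data = d, importance = TRUE)
    importance(fit)   # importance measures for each feature
    varImpPlot(fit)   # quick visual ranking of the features

The same fitted object can also be used for the predicted-response-versus-feature view mentioned in point 1 (partial dependence plots).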

+",2013-10-10 00:04:43.820 +57185,,1,,,user10619,What is the difference between the concept and treatment of measurement error in psychometry and in statistics?,,CC BY-SA 3.0,"

There is some confusion with respect to measurement error. What is its definition in statistics, and what is its definition in psychometry? Statistics does not seem to recognize the kind of measurement error popularly called construct bias in psychometry.

+",2013-10-10 00:46:42.743 +57186,22542.0,1,,,,Sample sizes for differences between three groups,,CC BY-SA 3.0,"

I am doing a study of two schools, and at each school I am sampling three groups. I am trying to determine why the person chose to go to that school and not another. I will be asking each group various questions about the school and options they may have had for other schools, etc. I want to be able to detect:

  1. a difference of 20% in responses between the schools (e.g., the proportion of students for whom the current school was the first school of preference, or the proportion for whom there was no other choice), and
  2. within each school, a 20% difference in responses between the three different groups (similar questions, but for students who enrolled in different eras).

In both instances, I would like power of 80% for 0.05 significance.

+ +

So, if I do what I think is the appropriate calculation, I effectively come up with: +\begin{align} +n &= \frac{0.5(0.84 + 1.96)^2}{0.2^2} \\ + &= 98 +\end{align} +Can I just assume then that I need three groups of 99 within each school? I guess I'm confused because nobody ever seems to talk about calculating sample sizes when comparing more than two groups.

+ +

Furthermore, is there anything wrong with sampling the three groups of 98 at each school and assuming that the total sample of 294 at each school will be sufficient to detect the 20% difference between the two schools?
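
For what it is worth, here is a small R sketch of the same calculation. The proportions 50% versus 70% are only an illustrative pair differing by 20 percentage points, not values taken from the study:

    # hand formula from above: roughly 98 per group
    (qnorm(0.80) + qnorm(0.975))^2 * 0.5 / 0.2^2

    # built-in two-sample version: 20-point difference, 80% power, 5% two-sided level
    power.prop.test(p1 = 0.5, p2 = 0.7, sig.level = 0.05, power = 0.80)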

+",2013-10-10 01:51:45.710 +57187,9792.0,1,,,,Centering when using splines in R,,CC BY-SA 3.0,"

I am having trouble understanding why centering seems to only work with simple linear models and not with splines for example. I am using centering to report the estimated group differences at different $x$, but also statistical values (ignoring multiple comparisons for the moment).

+ +
set.seed(1)
+
+# simulate data
+N <- 10
+x <- rep(seq(0.2,1,0.2),N)
+group <- factor(rep(c('I','II'),each=length(x)/N))
+y <- -x^2 + 2*x*as.numeric(group) + rnorm(length(x),mean=0,sd=0.1)
+d <- data.frame(group,x,y)
+
+# fit a linear model with x-group interaction
+l <- lm(y~x*group,data=d)
+d$lmfit <- fitted(l)
+coef(l)['groupII'] # group difference at x==0
+#     groupII 
+#  -0.1097071 
+
+library(ggplot2)
+ggplot(d,aes(x,y,colour=group)) + geom_point() + geom_line(aes(x,lmfit,colour=group))
+
+ +

The plot confirms the reported small group difference groupII of 0.05 at $x=0$ if we were to extrapolate back to 0.

+ +

Now let us centre the data at $x=1$ and estimate the group difference there.

+ +
# center data at x==1 and refit
+l <- lm(y~I(x-1)*group,data=d)
+coef(l)['groupII'] # group difference at x==1
+#   groupII 
+#  2.08525 
+
+ +

In agreement with the plot the difference is about 2.

+ +

Now let us fit a spline model.

+ +
# fit data with splines
+library(splines)
+l <- lm(y~ns(x,2)*group,data=d)
+d$lmsplinefit <- fitted(l)
+coef(l)['groupII'] # group difference at x==0.2
+#     groupII 
+#  0.2987893 
+# compare to: d$lmsplinefit[6] - d$lmsplinefit[1]
+
+ggplot(d,aes(x,y,colour=group)) + geom_point() + geom_line(aes(x,lmsplinefit,colour=group))
+
+ +

Interestingly, the spline fit reports the group difference at the first $x$, i.e. $x=0.2$.

+ +

If we try to centre at $x=1$ we get the same result, i.e. the difference at $x=0.2$.

+ +
l <- lm(y~ns(I(x-1),2)*group,data=d)
+coef(l)['groupII']
+# same result as un-centered data, i.e. 0.2987893
+
+ +

Why is that? And is there a way to show the group difference at a different $x$? Btw, centering $x$ manually before the model fit does not make a difference.

+",2013-10-10 02:44:04.330 +57188,594.0,2,,56684.0,,,,CC BY-SA 3.0,"

Here's the probability function and distribution of the proportion of True (plus the normal approximation at which the chi-square will be exact) in a sample of size 10,000 and a proportion of True of only 1% (right below the low end of your suggested total sample size, and with expected number of True only one fifth of your suggested minimum):

+ +

+ +

Don't forget that you'll have about five times the expected successes shown here; your approximation will be much better than this.

+ +

A straight two-sample proportions test or a chi-square test should do just fine. Indeed, one tenth of your proportion of Trues would still be fine. At one hundredth, you'd just go to exact methods.

+",2013-10-10 02:45:38.470 +57189,22545.0,1,,,,Theoretical expected value and variance,,CC BY-SA 3.0,"

Let $X$ be a random variable having expected value $\mu$ and variance $\sigma^2$. Find the Expected Value and Variance of $Y = \frac{X−\mu}{\sigma}$.

+ +

I would like to show some progress I've made so far, but honestly I've been thinking about this problem for the past few days but just have no idea where to start. Any hint or insight on a starting point would be much appreciated.

+ +

Thanks!

+",2013-10-10 04:00:27.487 +57190,449.0,2,,57183.0,,,,CC BY-SA 3.0,"

You can use multi-level logistic regression. You've only got one dependent variable, correctness. You have multiple independent variables nested within student. In R you can use lmer to construct the model. It would look something like this:

+ +
m <- lmer( answer ~ treatment * Q + (treatment * Q | student), family = 'binomial', data = mydata)
+
+ +

That would allow for there to be random effects of question and treatment within student as well as overall correctness variability within student but you would also be able to assess fixed effects of treatment and question. What you seem to really want to know is all of the treatment by question interactions and that model provides them.

+ +

In order to analyze all of the questions with any kind of reliability you really should have a lot of students taking the test (hundreds). The general effect of treatment could be assessed with fewer. Also, if you know the categories, the kinds of questions you think differ, then you could replace the individual question variable with that. It would be much more sensible and make this look much less like a fishing expedition.

+",2013-10-10 04:04:00.123 +57191,594.0,2,,57189.0,,,,CC BY-SA 3.0,"

Have you seen the following basic properties of expectation and variance?

+ +

(I'd be very surprised if some version of these hadn't been discussed)

+ +

$\text{E}(aX+b) = a\text{E}(X)+b$

+ +

$\text{Var}(aX+b) = a^2\text{Var}(X)$

+ +

http://en.wikipedia.org/wiki/Expected_value#Linearity

+ +

http://en.wikipedia.org/wiki/Variance#Basic_properties

+ +

If you apply these properties, or better, the versions you'll already have been given, the problem is trivial.

+ +

If you still can't see it, try finding $\text{E}(X-\mu)$ first and work from there.

+",2013-10-10 04:06:18.590 +57192,22031.0,1,,,,How to check for normal distribution using Excel for performing a t-test?,,CC BY-SA 3.0,"

I want to know how to check a data set for normality in Excel, just to verify that the requirements for using a t-test are being met.

+ +

For the right tail, is it appropriate to just calculate the mean and standard deviation, add 1, 2, and 3 standard deviations to the mean to create a range, and then compare that to the normal 68/95/99.7 rule for the standard normal distribution, after using the NORM.DIST function in Excel to test each standard deviation value?

+ +

Or is there a better way to test for normality?

+",2013-10-10 04:41:49.360 +57193,1717.0,2,,57175.0,,,,CC BY-SA 3.0,"

When you use EM to obtain maximum likelihood estimates, you need a variable that describes your observations $x_{n}$, latent variables $z_{n}$ that are in some way related to your observations (e.g. in coin tossing experiments, $\{H, T\}$ are the latent variables and in gaussian mixtures, the mixing coefficients $\pi_{i}$ take the role of latent variables) and the parameters $\theta$ that you are trying to estimate.

+ +

At the risk of not answering your question at all, I think you want a maximum likelihood estimate of $\theta$ using EM based on known observations $x_{n}$ that are given by the following equation:

+ +

$$x_{t} = s_{t}(\theta_{0}) + n_{t}$$

+ +

If that is correct, a general idea is the following. Since $n_{t}$ is white noise $N(0, \sigma)$, $x_{t}$ can be described by a Gaussian $p(x_{t}|s_{t},\theta) = N(s_{t}(\theta), \sigma)$. In the EM formulation, $x_{t}$'s are known variables, $s_{t}$'s are latent variables and $\theta$ is the parameter. It is customary to group the variables $x_{n}$ in a variable $X$ and likewise, latent variables $s_{n}$ are grouped in a variable $S$.

+ +

As you should know, the EM algorithm consists of 2 steps: expectation and maximization. In the expectation step, we use an expression $Q$ as a proxy for the likelihood $L(\theta|X) = p(X|\theta)$, that is, the probability of +getting the known data $X$ given a parameter $\theta$. This is the same likelihood used to obtain maximum likelihood estimates. However, in EM we use this $Q$ instead:

+ +

$$Q(\theta|\theta^{\text{old}}) = E_{S|X, \theta^{\text{old}}} \log p(X,S|\theta)$$

+ +

This odd-looking expression is actually a lower bound of the likelihood $L(\theta|X)$. Bishop's book contains a good derivation of $Q$.

+ +

In order to start the EM magic, you have to choose a random $\theta^{\text{old}}$ and calculate this expectation. Notice that you need $p(X,S|\theta)$ and $p(S|X,\theta^{\text{old}})$. $p(X,S|\theta)$ is equal to $p(X|S,\theta)p(S|\theta)$ and using Bayes' theorem, $p(S|X,\theta^{\text{old}})$ is +proportional to $p(X|S,\theta^{\text{old}})p(S|\theta^{\text{old}})$.

+ +

At this point, I hope it is clear that $p(X|S,\theta)=\prod_{t} p(x_{t}|s_{t},\theta)$, so that part is not hard to calculate. However, $p(S|\theta)$, that is, $\prod_{t}p(s_{t}|\theta)$ is required. I don't know what distribution could be appropriate since this depends on the specifics of your problem so I will assume you know.

+ +

By now, you can calculate $Q(\theta|\theta^{\text{old}})$.

+ +

The maximization step is simply:

+ +

$$\theta = \text{arg max}_{\theta} Q(\theta|\theta^{\text{old}})$$

+ +

This is the new $\theta$ to be used in the expectation step again until convergence.

+ +

That is a general idea of how EM could work in this case. However, maybe you don't know a distribution for $s_{t}$ or it is difficult to calculate the expectation or the maximization step.

+ +

For the big picture, take a look at this nice explanation.

+ +

UPDATE

+ +

I think you changed the question quite a bit. Are you asking how to calculate maximum likelihood estimates? Basically, you take the derivative of the likelihood with respect to the parameter you want to estimate:

+ +

$$\frac{\partial}{\partial \theta}L(\theta|X) = 0$$

+ +

solve it and that's pretty much it. See more examples here.
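
+ +

As a concrete illustration of that last step (a generic example I am adding, not tied to the signal model above): if $x_{1},\dots,x_{n}$ are i.i.d. $N(\theta,\sigma^{2})$ with $\sigma$ known, then $$\frac{\partial}{\partial \theta}\sum_{t}\log p(x_{t}|\theta) = \sum_{t}\frac{x_{t}-\theta}{\sigma^{2}} = 0 \quad\Rightarrow\quad \hat{\theta} = \frac{1}{n}\sum_{t}x_{t},$$ i.e. the maximum likelihood estimate is simply the sample mean.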

+",2013-10-10 05:41:08.343 +57194,155.0,2,,57192.0,,,,CC BY-SA 3.0,"

You could plot a histogram using the data analysis toolpack in Excel. Graphical approaches are more likely to communicate the degree of non-normality, which is typically more relevant for assumption testing (see this discussion of normality).

+ +

The data analysis toolpack in Excel will also give you skewness and kurtosis if you ask for descriptive statistics and choose the ""summary statistics"" option. You might, for example, consider values of skewness beyond plus or minus one to be a form of substantive non-normality.

+ +

That said, the assumption with t-tests is that the residuals are normally distributed, not the variable itself. Furthermore, t-tests are also quite robust, such that even with fairly large amounts of non-normality, p-values are still fairly valid.

+",2013-10-10 05:50:58.150 +57195,22547.0,1,57381.0,,,Showing spatial and temporal correlation on maps,,CC BY-SA 3.0,"

I have data for a network of weather stations across the United States. This gives me a data frame that contains date, latitude, longitude, and some measured value. Assume that data are collected once per day and driven by regional-scale weather (no, we are not going to get into that discussion).

+ +

I'd like to show graphically how simultaneously-measured values are correlated across time and space. My goal is to show the regional homogeneity (or lack thereof) of the value that is being investigated.

+ +

Data set

+ +

To start with, I took a group of stations in the region of Massachusetts and Maine. I selected sites by latitude and longitude from an index file that is available on NOAA's FTP site.

+ +

+ +

Straight away you see one problem: there are lots of sites that have similar identifiers or are very close. FWIW, I identify them using both the USAF and WBAN codes. Looking deeper into the metadata I saw that they have different coordinates and elevations, and data stop at one site then start at another. So, because I don't know any better, I have to treat them as separate stations. This means the data contain pairs of stations that are very close to each other.

+ +

Preliminary Analysis

+ +

I tried grouping the data by calendar month and then calculating the ordinary least squares regression between different pairs of data. I then plot the correlation between all pairs as a line connecting the stations (below). The line color shows the value of R2 from the OLS fit. The figure then shows how the 30+ data points from January, February, etc. are correlated between different stations in the area of interest.

+ +

+ +

I've written the underlying codes so that the daily mean is only calculated if there are data points every 6-hour period, so data should be comparable across sites.

+ +

Problems

+ +

Unfortunately, there is simply too much data to make sense of on one plot. That can't be fixed by reducing the size of the lines.

+ +

I've tried plotting the correlations between the nearest neighbors in the region, but that turns into a mess very quickly. The facets below show the network without correlation values, using $k$ nearest neighbors from a subset of the stations. This figure was just to test the concept. +

+ +

The network appears to be too complex, so I think I need to figure out a way to reduce the complexity, or apply some kind of spatial kernel.

+ +

I am also not sure what is the most appropriate metric to show correlation, but for the intended (non-technical) audience, the correlation coefficient from OLS might just be the simplest to explain. I may need to present some other information like the gradient or standard error as well.

+ +

Questions

+ +

I'm learning my way into this field and R at the same time, and would appreciate suggestions on:

+ +
    +
  1. What's the more formal name for what I'm trying to do? Are there some helpful terms that would let me find more literature? My searches are drawing blanks for what must be a common application.
  2. +
  3. Are there more appropriate methods to show the correlation between multiple data sets separated in space?
  4. +
  5. ... in particular, methods that are easy to show results from visually?
  6. +
  7. Are any of these implemented in R?
  8. +
  9. Do any of these approaches lend themselves to automation?
  10. +
+",2013-10-10 05:52:03.253 +57196,668.0,2,,57192.0,,,,CC BY-SA 4.0,"

You have the right idea. This can be done systematically, comprehensively, and with relatively simple calculations. A graph of the results is called a normal probability plot (or sometimes a P-P plot). From it you can see much more detail than appears in other graphical representations, especially histograms, and with a little practice you can even learn to determine ways to re-express your data to make them closer to Normal in situations where that is warranted.

+ +

Here is an example:

+ +

+ +

Data are in column A (and named Data). The rest is all calculation, although you can control the ""hinge rank"" value used to fit a reference line to the plot.

+ +

This plot is a scatterplot comparing the data to values that would be attained by numbers drawn independently from a standard Normal distribution. When the points line up along the diagonal, they are close to Normal; horizontal departures (along the data axis) indicate departures from normality. In this example the points are remarkably close to the reference line; the largest departure occurs at the highest value, which is about $1.5$ units to the left of the line. Thus we see at a glance that these data are very close to Normally distributed but perhaps have a slightly ""light"" right tail. This is perfectly fine for applying a t-test.

+ +

The comparison values on the vertical axis are computed in two steps. First each data value is ranked from $1$ through $n$, the amount of data (shown in the Count field in cell F2). These are proportionally converted to values in the range $0$ to $1$. A good formula to use is $\left(\text{rank}-1/6\right)/\left(n+2/3\right).$ (See http://www.quantdec.com/envstats/notes/class_02/characterizing_distributions.htm for where that comes from.) Then these are converted to standard Normal values via the NormSInv function. These values appear in the Normal score column. The plot at the right is an XY scatterplot of Normal Score against the data. (In some references you will see the transpose of this plot, which perhaps is more natural, but Excel prefers to place the leftmost column on the horizontal axis and the rightmost column on the vertical axis, so I have let it do what it prefers.)
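
+ +

If you prefer to check the same two-step calculation outside of a spreadsheet, it only takes a few lines of R (a sketch of the idea, not the workbook itself; here Data stands for your own data vector):

+ +
x <- sort(Data)                            # your data values
+p <- (rank(x) - 1/6) / (length(x) + 2/3)   # plotting positions
+z <- qnorm(p)                              # Normal scores (the NormSInv step)
+plot(x, z)                                 # a near-linear pattern suggests normality
+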

+ +

+ +

(As you can see, I simulated these data with independent random draws from a Normal distribution with mean $5$ and standard deviation $2$. It is therefore no surprise that the probability plot looks so nice.) There really are only two formulas to type in, which you propagate downward to match the data: they appear in cells B2:C2 and rely on the Count value computed in cell F2. That's really all there is to it, apart from the plotting.

+ +

The rest of this sheet is not necessary but it's helpful for judging the plot: it provides a robust estimate of a reference line. This is done by picking two points equally far in from the left and right of the plot and connecting them with a line. In the example these points are the third lowest and third highest, as determined by the $3$ in the Hinge Rank cell, F3. As a bonus, its slope and intercept are robust estimates of the standard deviation and mean of the data, respectively.

+ +

To plot the reference line, two extreme points are computed and added to the plot: their calculation occurs in columns I:J, labeled X and Y.

+ +

+",2013-10-10 06:11:44.377 +57197,22507.0,2,,57161.0,,,,CC BY-SA 3.0,"

If you select an equal number of cases and non-cases, it will bias the model. For example, suppose that the features have zero correlation with the outcome and the dataset is very large. The model will predict the same probability of lung cancer for all patients. If you select an equal number of positive and negative examples, that predicted probability will be 0.5, while in reality it is 0.1.
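
+ +

A quick way to see this is a toy simulation (my own illustration, not from the original answer):

+ +
set.seed(1)
+n <- 100000
+d <- data.frame(x = rnorm(n), y = rbinom(n, 1, 0.1))   # feature unrelated to the outcome
+bal <- rbind(d[d$y == 1, ], d[sample(which(d$y == 0), sum(d$y == 1)), ])   # balanced sample
+fit <- glm(y ~ x, family = binomial, data = bal)
+mean(predict(fit, newdata = d, type = ""response""))     # close to 0.5
+mean(d$y)                                               # true rate is about 0.1
+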

+",2013-10-10 06:21:56.940 +57198,22548.0,1,57215.0,,,What is random error in OLS regression? And how is it related to Gaussian noise?,,CC BY-SA 4.0,"

In OLS regression:

+ +

$$Y=\beta_0+\beta_1 X_1+ \beta_2 X_2+\beta_3 X_3 + \beta_4 X_4+\beta_5 X_5+\beta_6 X_6 + \varepsilon,$$

+ +

what is $\varepsilon$? Is it Gaussian noise or random error? What is the difference? Why do we add it to the multiple regression model? In most papers, authors refer to it as random error, but without clarification.

+ +

I need a simple and good reason why authors add it to their model.

+",2013-10-10 06:34:08.830 +57199,22551.0,2,,47846.0,,,,CC BY-SA 3.0,"

Poisson/Negative binomial can also be used with a binary outcome with offset equal to one. Of course it necessitates that the data be from a prospective design (cohort, rct, etc). Poisson or NB regression gives the more appropriate effect measure (IRR) versus odds ratio from logistic regression.

+ +

NB regression is ""safer"" to run than Poisson regression because even if the overdispersion parameter (alpha in Stata) is not statistically significant, the results will be exactly the same as its Poisson regression form.

+",2013-10-10 07:32:59.240 +57200,15563.0,1,,,,SVM prediction accuracy drops when using Test data,,CC BY-SA 3.0,"

I am using the Kaggle Scikit data to learn R.

+ +

I am using the R e1071 SVM function to predict classes.

+ +

When I use:

+ +
svm(train, trainLabels, scale = TRUE, type = NULL, kernel = ""polynomial"")
+
+ +

I obtain this level of accuracy on a sample of the Train data:

+ +
> table(pred, trainLabels)
+    trainLabels
+pred   0   1
+   0 478   8
+1   12 502
+
+ +

which I interpret as being 98% accurate, i.e. an error rate of (8+12) / (478+8+12+502) = 2%.

+ +

Though when I use the same prediction model on the Test data, Kaggle returns a 0.82 score, based on classification accuracy.

+ +

Can you explain why I can get such a different accuracy level?

+",2013-10-10 08:09:22.140 +57201,21762.0,2,,57015.0,,,,CC BY-SA 3.0,"

A simple approach would be the following:

+ +

1) Take all observations sampled at random under condition A and obtain the relevant one sided p-value from Wilcoxon's rank sum test.

+ +

2) Do the same for the observations sampled under condition B.

+ +

3) If the smaller of the two p-values is below the level $\alpha/2$ and the other p-value is below $\alpha$, then your claim holds at the $\alpha$ level. (This would be the Bonferroni-Holm correction for multiple testing.)

+ +

Since the sample sizes are extremely low, you will get a ""significant"" result only if the signal is very strong.
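
+ +

A rough R sketch of steps 1-3 (the observation vectors below are placeholders):

+ +
pA <- wilcox.test(treatedA, controlA, alternative = ""greater"")$p.value
+pB <- wilcox.test(treatedB, controlB, alternative = ""greater"")$p.value
+alpha <- 0.05
+min(pA, pB) < alpha/2 && max(pA, pB) < alpha   # Bonferroni-Holm for two tests
+# equivalently: all(p.adjust(c(pA, pB), method = ""holm"") < alpha)
+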

+",2013-10-10 08:18:09.707 +57208,13889.0,1,57214.0,,,Test to rank methods by AUCs on various benchmarks,,CC BY-SA 3.0,"

Suppose I have N methods and M benchmarks. I have an AUC statistic (and some other similar statistics) for each combination of method with benchmark. What test should I use to test if one method is better than the rest? I have seen some authors do pairwise comparisons using a one-sided Wilcoxon signed-rank test but I would prefer to test all methods at once. In any case I'm not sure the assumptions for the one-sided Wilcoxon signed-rank test hold. If the average AUC for each benchmark varies widely can you say the samples are from the same population? Also I'm not sure the distribution of the AUCs is symmetric around the median. Any advice would be welcome.

+",2013-10-10 10:34:56.943 +57311,22612.0,1,,,,Regressing a difference of ordinal variables?,,CC BY-SA 4.0,"

Dependent variables (ordinal): credit rating 1970 (cr70) and credit rating 1980 (cr80).

+

Here is what I want to do:

+

Regress cr80-cr70 = independent vars.

+

How could this be done and how could you interpret it!?

+

If the dependent variable is continuous it would be simple. But can you make a new var from the difference of two ordinal vars, and have that be the dependent var?

+",2013-10-11 16:46:12.820 +57202,14799.0,2,,5015.0,,,,CC BY-SA 3.0,"

This may be a problem of interpretation, a misunderstanding of what a so-called ""direct effect"" coefficient really is.

+ +

In regression models with continuous predictor variables and no interaction terms -- that is, with no terms that are constructed as the product of other terms -- each variable's coefficient is the slope of the regression surface in the direction of that variable. It is constant, regardless of the values of the variables, and is obviously a measure of the effect of that variable.

+ +

In models with interactions -- that is, with terms that are constructed as the products of other terms -- that interpretation can be made without further qualification only for variables that are not involved in any interactions. The coefficient of a variable that is involved in interactions is the slope of the regression surface in the direction of that variable when the values of all the variables that interact with the variable in question are zero, and the significance test of the coefficient refers to the slope of the regression surface only in that region of the predictor space. Since there is no requirement that there actually be data in that region of the space, the apparent direct effect coefficient may bear little resemblance to the slope of the regression surface in the region of the predictor space where data were actually observed. There is no true ""direct effect"" in such cases; the best substitute is probably the ""average effect"": the slope of the regression surface in the direction of the variable in question, taken at each data point and averaged over all data points. For more on this, see Why could centering independent variables change the main effects with moderation?
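
+ +

A small R sketch of that ""average effect"" (model and variable names here are placeholders, not from the original question):

+ +
fit <- lm(y ~ x * z, data = d)
+slope_x <- coef(fit)[""x""] + coef(fit)[""x:z""] * d$z   # slope in the x direction at each data point
+mean(slope_x)                                          # average effect of x
+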

+",2013-10-10 08:31:00.453 +57203,633.0,2,,57164.0,,,,CC BY-SA 3.0,"

Yes, the gamma distribution is the maximum entropy distribution for which the mean $E(X)$ and mean-log $E(\log X)$ are fixed. As with all exponential family distributions, it is the unique maximum entropy distribution for a fixed expected sufficient statistic.
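
+ +

(Sketch of why, added for completeness: maximizing entropy on $x>0$ subject to fixed $E(X)$ and $E(\log X)$ yields a density of the form $p(x) \propto \exp\{\lambda_{1}x + \lambda_{2}\log x\} = x^{\lambda_{2}}e^{\lambda_{1}x}$, which is exactly the gamma family once the Lagrange multipliers are identified with the usual shape and rate parameters.)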

+ +

To answer your question about physical processes that generate these distributions: The lognormal distribution arises when the logarithm of X is normally distributed, for example, if X is the product of very many small factors. If X is gamma distributed, it is the sum of many exponentially-distributed variates. For example, the waiting time for many events of a Poisson process.

+",2013-10-10 09:03:03.037 +57204,4831.0,2,,57160.0,,,,CC BY-SA 3.0,"

You are correct that there's currently no good way to do seasonal ARIMA in statsmodels. Currently, I only have a half-baked solution for doing non-consecutive lags, but it's not public anywhere. It's a bit heavy, computations-wise. Unfortunately, I doubt I'll be able to work on this anytime soon (unless someone would be willing to fund the enhancement...). Contributions in this area would be very welcome.

+ +

https://github.com/statsmodels/statsmodels/issues/247 +https://github.com/statsmodels/statsmodels/issues/232

+",2013-10-10 09:22:00.363 +57205,22558.0,2,,20561.0,,,,CC BY-SA 3.0,"

I would not touch the data at all. Use this for autocorrelation with NaNs:

+ +

http://www.mathworks.com/matlabcentral/fileexchange/43840-autocorrelation-and-partial-autocorrelation-with-nans/content/nanautocorr.m

+ +

""not touch the data"" means not to remove any data or time-step or replace with 0 or the mean, it would compromise the information about the specific-time-lag linear dependence. I would also avoid simulating the values in the gaps, if you are interested in the ""SAMPLE"" autocorrelation, anyway even the best simulation technique will not add any more information about the autocorrelation, being based on the data themselves. +I partially recoded the matlab (link above) autocorrelation and partial autocorrelation functions to deal with NaNs: any data couples including NaNs is excluded from the computation. This is done for each lag. It worked for me. Any suggestion is well accepted.

+",2013-10-10 09:58:27.403 +57206,22555.0,2,,57192.0,,,,CC BY-SA 3.0,"

This question borders on statistics theory too - testing for normality with limited data may be questionable (although we all have done this from time to time).

+ +

As an alternative, you can look at kurtosis and skewness coefficients. From Hahn and Shapiro: Statistical Models in Engineering some background is provided on the properties Beta1 and Beta2 (pages 42 to 49) and the Fig 6-1 of Page 197. Additional theory behind this can be found on Wikipedia (see Pearson Distribution).

+ +

Basically you need to calculate the so-called properties Beta1 and Beta2. A Beta1 = 0 and Beta2 = 3 suggests that the data set approaches normality. This is a rough test but with limited data it could be argued that any test could be considered a rough one.

+ +

Beta1 is defined from moments 2 and 3 (variance and skewness): Beta1 = mu3^2/mu2^3, which is just the squared standardized skewness. Excel's SKEW function already standardizes by the standard deviation, so where ... is your data array, the formula is simply:

+ +
Beta1 = SKEW(...)^2
+
+ +

Beta2 is defined from moments 2 and 4 (variance and kurtosis): Beta2 = mu4/mu2^2. Excel's KURT function returns excess kurtosis (kurtosis minus 3), so, ignoring small-sample corrections, where ... is your data array, the formula is:

+ +
Beta2 = KURT(...) + 3
+
+ +

Then you can check these against the values of 0 and 3, respectively. This has the advantage of potentially identifying other distributions (including Pearson Distributions I, I(U), I(J), II, II(U), III, IV, V, VI, VII). For example, many of the commonly used distributions such as Uniform, Normal, Student's t, Beta, Gamma, Exponential, and Log-Normal can be indicated from these properties:

+ +
Where:   0 <= Beta1 <= 4
+         1 <= Beta2 <= 10 
+
+Uniform:        [0,1.8]                                 [point]
+Exponential:    [4,9]                                   [point] 
+Normal:         [0,3]                                   [point]
+Students-t:     (0,3) to [0,10]                         [line]
+Lognormal:      (0,3) to [3.6,10]                       [line]
+Gamma:          (0,3) to (4,9)                          [line]
+Beta:           (0,3) to (4,9), (0,1.8) to (4,9)        [area]
+Beta J:         (0,1.8) to (4,9), (0,1.8) to [4,6*]     [area]
+Beta U:         (0,1.8) to (4,6), [0,1] to [4.5)        [area]
+Impossible:     (0,1) to (4.5), (0,1) to (4,1]          [area]
+Undefined:      (0,3) to (3.6,10), (0,10) to (3.6,10)   [area]
+
+Values of Beta1, Beta2 where brackets mean:
+
+[ ] : includes (closed)
+( ) : approaches but does not include (open)
+ *  : approximate 
+
+ +

These are illustrated in Hahn and Shapiro Fig 6-1.

+ +

Granted this is a very rough test (with some issues) but you may want to consider it as a preliminary check before going to a more rigorous method.

+ +

There are also adjustment mechanisms to the calculation of Beta1 and Beta2 where data is limited - but that is beyond this post.

+",2013-10-10 09:59:06.940 +57207,503.0,2,,57185.0,,,,CC BY-SA 3.0,"

For measurement error there really isn't a difference in the definitions. Psychometry defines ""true score"" as ""measured score"" + ""error"" and this is the same thing as the statistical definition. The confusion may come from different terminology; that developed because psychometry deals with tests while statistics can deal with almost anything.

+ +

""Bias"" is a bit more complex. @NickCox gave the definition in statistics. In psychometry, it is used (at least some of the time) in a slightly different way, again due to the specialized nature of the subject. A test is biased for/against a group if its predictions work differently in another setting. So, e.g. if we are using SAT scores to predict college GPA, bias would be that one group gets lower/higher GPA with the same SAT score.

+ +

In statistics, a scale could be biased against everyone - e.g. if my scale estimates everyone's weight as 5 pounds less than the actual value, that's bias. In the psychometrics definition, that can't be bias.

+ +

BUT psychometricians often use ""bias"" in the statistical sense as well.

+",2013-10-10 10:19:23.717 +57528,22705.0,2,,57110.0,,,,CC BY-SA 3.0,"

I'm having a fabulous run with UCM. You could model this with daily seasonality and an annual cycle. You also have a very evident trend.

+ +

Post back here if you succeed with UCM (PROC UCM).

+",2013-10-15 13:43:10.627 +57209,21896.0,2,,57137.0,,,,CC BY-SA 3.0,"

Following the suggestion of @Momo I will answer the question myself. What I had forgotten yesterday when I posted this question, is that I can just see what glm.nb does by typing ""glm.nb"" into the console. From the code it returns it can be inferred that indeed the variance equals $\mu + \mu^2/\theta$ so that $\theta = 1/\kappa$.

+ +

Also I'd like to use the opportunity to advertise this article I found since then, also addressing these matters.

+",2013-10-10 10:37:03.140 +57210,22560.0,1,,,,Why eigenvalues are greater than 1 in factor analysis?,,CC BY-SA 3.0,"

Why do we take eigenvalues greater than 1 in factor analysis to retain factors?
+ And how can we decide which variables are to be chosen as factors?

+",2013-10-10 10:46:06.893 +57211,503.0,2,,57210.0,,,,CC BY-SA 3.0,"

Using eigenvalues > 1 is only one indication of how many factors to retain. Other reasons include the scree test, getting a reasonable proportion of variance explained and (most importantly) substantive sense.

+ +

That said, the rule came about because the average eigenvalue will be 1, so > 1 is ""higher than average"".
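
+ +

You can see that ""average of 1"" directly in R (a tiny illustration; X stands for your numeric data matrix):

+ +
ev <- eigen(cor(X))$values   # eigenvalues of the correlation matrix
+mean(ev)                     # equals 1, since the trace of a correlation matrix is its dimension
+sum(ev > 1)                  # the usual ""eigenvalue > 1"" count
+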

+ +

On your second question: Are you asking how to know how many factors (latent variables) to retain? Or are you asking about which observed variables to retain?

+ +

If the former, see above and see any book on factor analysis. If the latter, each factor is a linear combination of all the observed variables (although some contribute very little).

+",2013-10-10 10:52:34.780 +57212,22190.0,1,,,,Concordance and Discordance role in modelling,,CC BY-SA 3.0,"

I am new to statistics and was asked to develop a statistical model, which I have started. Now I have been asked to carry out concordance and discordance, but I don't know anything about these terms, except that concordance is the probability that a pair of individuals will both have a certain characteristic given that one of the pair has it, and the opposite for discordance.
+Still, I don't know why I have to compute them, or what values of each would indicate a decent model.

+",2013-10-10 10:53:56.520 +57213,503.0,2,,57212.0,,,,CC BY-SA 3.0,"

In this paper I cover concordance and discordance. The paper is about PROC LOGISTIC in SAS but the section on concordance is more general. Briefly: Look at all possible pairs of observations. A pair is concordant if the observation with the higher observed value also has the higher predicted value.
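
+ +

For a binary outcome this is easy to compute directly; a rough R illustration (mine, not from the paper):

+ +
concordance <- function(y, p) {
+  d <- outer(p[y == 1], p[y == 0], ""-"")   # compare every event with every non-event
+  c(concordant = mean(d > 0), discordant = mean(d < 0), tied = mean(d == 0))
+}
+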

+",2013-10-10 11:17:56.730 +57214,1428.0,2,,57208.0,,,,CC BY-SA 3.0,"

If the M benchmarks are supposed to yield identically distributed score estimates (e.g. cross-validation folds), then maybe you can estimate confidence intervals for the mean AUC score of each method by bootstrapping on the M benchmarks of that method, and then compare methods by considering non-overlapping confidence intervals. As a bootstrapped confidence interval is a non-parametric method, you do not make any assumption about the symmetry of AUCs around the median.
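
+ +

A minimal sketch of that bootstrap (my own; auc_scores is a placeholder vector of one method's M benchmark AUCs):

+ +
set.seed(1)
+boot_means <- replicate(10000, mean(sample(auc_scores, replace = TRUE)))
+quantile(boot_means, c(0.025, 0.975))   # percentile 95% CI; repeat for each method
+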

+",2013-10-10 11:28:17.020 +57215,449.0,2,,57198.0,,,,CC BY-SA 3.0,"

The wikipedia definition is a fine definition that you can use for your paper if you need one but I think you're missing something.

+ +

The $\epsilon$ is random error, which is synonymous with noise. In practice, the random error can be Gaussian distributed, in which case it is Gaussian noise, but it could take on other distributions. If the distribution of $\epsilon$ happens to be Gaussian then you've met one of the theoretical assumptions of the model and things like interval estimation are better justified. If it's not Gaussian then, like Glen_b said, you still have that it's best linear unbiased.

+ +

Theoretically, the random error (noise) is supposed to be Gaussian distributed but the outcome could be anything. So, in order to answer your question you'd need to state whether you want to know the distribution of your particular noise or what the distribution of the noise should be. For the former you'd need data.

+",2013-10-10 11:41:48.027 +57216,22561.0,2,,57128.0,,,,CC BY-SA 3.0,"

The total variance for combined regression results can be estimated using the same approach as in multiple imputations. In the attached file, the formulas for combining the regression results and total variance are presented.
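
+ +

For reference, the usual pooling formulas (Rubin's rules) for $m$ imputations, with per-imputation estimate $\hat{Q}_{i}$ and its variance $U_{i}$, are $$\bar{Q} = \frac{1}{m}\sum_{i=1}^{m}\hat{Q}_{i}, \qquad \bar{U} = \frac{1}{m}\sum_{i=1}^{m}U_{i}, \qquad B = \frac{1}{m-1}\sum_{i=1}^{m}\left(\hat{Q}_{i}-\bar{Q}\right)^{2}, \qquad T = \bar{U} + \left(1+\frac{1}{m}\right)B,$$ where $T$ is the total variance of the pooled estimate $\bar{Q}$ (the attachment may present these in slightly different notation).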

+ +

+",2013-10-10 12:00:04.637 +57217,15563.0,1,99688.0,,,R e1071 tune plot does not give me the best gamma?,,CC BY-SA 3.0,"

I am using the R e1071 library for the SVM (Support Vector Machine) algorithm. +And I used tune to find out the best Cost and gamma parameters.

+ +

Though the plot doesn't seem to provide the actual best prediction.

+ +

Here is some details:

+ +
gammalist <- c(0.005,0.01,0.015,0.02,0.025,0.03,0.035,0.04,0.045,0.05)
+obj <- tune(svm, Class~., data = trainData, ranges = list(gamma = gammalist, cost = 2^(2:4)), tunecontrol = tune.control(sampling = ""fix"") )
+plot(obj)
+
+ +

Here is the plot obtained: +

+ +

The plot leads me to believe 0.02 is roundabout the best gamma. +But I actually tested manually several others and find better results for 0.042.

+ +

On a 200 sample, I get 23 errors with gamma=0.042, and 26 errors with gamma=0.02.

+ +

How do you explain this?

+",2013-10-10 12:05:58.447 +57218,21398.0,1,57225.0,,,Multiple imputation - original model,,CC BY-SA 3.0,"

I have a question. I have a dataset with some missing values that were not MCAR. I imputed them using iterations of the fully conditional specification method. I then executed my analysis on the basis of the imputed dataset. The results of the original model (with missing values: listwise deletion) did not change much in the eventual pooled model. My idea would be to go back to the missing-values dataset. What do you think?

+",2013-10-10 12:44:27.567 +57229,22567.0,1,57233.0,,,Dealing with missing data in the prediction set only,,CC BY-SA 3.0,"

I have a regression problem where the independent variables are all factors (categorical). I've been looking at the literature on missing data, and so far it all seems concerned with missing training data. I was wondering if there is a standard way of dealing with missing data in the prediction set. That is, you have all the data you need to train, but then you need to be able to make a prediction with only partial data. This must have been a studied problem.

+ +

My initial thought is to use an average of the dummy encoded variables, according to how common they are. As a quick example, say we have a three level factor dummy encoded as

+ +
level 1: [1 0]
+level 2: [0 1]
+level 3: [0 0]
+
+ +

Say level $i$ occurs fraction $f_i$ of the time in the training data (so $\sum_i{f_i}=1$), and the regression has the two coefficients $\beta_1$ and $\beta_2$.

+ +

Then a missing value in this factor might be estimated as: +$$ \beta_1*f_1 + \beta_2*f_2 + 0*f_3 $$ +But given that the ""default"" level encodings are shared across factors, I'm not sure I'm handling level 3 correctly in this case.

+",2013-10-10 15:23:53.377 +57219,10547.0,2,,57167.0,,,,CC BY-SA 3.0,"

If all the $Z$s are also determined by the $Y$, the system of equations you have proposed cannot be identified. What you need to do is reduce the equations such that the coefficients can be identified.

+ +

I recommend reading:

+ +
+

Wooldridge, Introductory Econometrics, 3d ed. Chapter 16: Simultaneous + equations

+
+ +

Here, your kind of problems gets explained, and there are some pretty nice examples.

+ +

I also recommend reading:

+ +
Rummery,Vella,Verbeek (1998) - Estimating the Returns to Education for Australian Youth via Rank-Order Instrumental Variables
+
+ +

and

+ +
Vella,Verbeek (1997) - Using Rank Order As An Instrumental Variable - An Applicaton To The Return To Schooling
+
+ +

Vella and Verbeek (and also Rummery) estimate something like:

+ +

$y_i = x_i\beta + z_i\delta + e_i, \ \ \ \ i = 1,...,N$

+ +

Here $x_i$ is a $K$ vector of exogenous variables whereas $z_i$ is assumed to be endogenous. Hence the reduced form equation of $z_i$ is given by:

+ +

$z_i = x_i\alpha + v_i$

+ +

The advantage of this approach is, that you dont need any exclusion restrictions for the $x_i$, which are necessary to make 2SLS/3SLS work.

+ +

I've used this approach to solve a three-equation system, i.e., I have three equations, and in each of them there are two endogenous regressors which are also the dependent variable in some other equation.

+ +

I also applied a plug-in style of approach to deal with potential heteroscedasticity.

+ +

There are some issues which are not presented within these papers, but I would be happy to talk to you about them.

+",2013-10-10 12:59:56.353 +57220,22562.0,1,,,,Interaction wipes out my direct effects in regression (non zero variable),,CC BY-SA 3.0,"

I have the following regression

+ +

$children = \beta_0 + \beta_1 \log(earnings) + \beta_2 grandparents + \epsilon$

+ +

and $\beta_1>0$ with $p$=0.01 and $\beta_2>0$ with $p$=0.01, and N is large (N>10.000) and grandparents takes values 0,1,2,3,4.

+ +

Then I add the interaction term ($\log(earnings)*grandparents$) to equation 1, such that:

+ +

$children = \beta_0 + \beta_1 \log( earnings) + \beta_2 grandparents+ \beta_3 \log( earnings)*grandparents + \epsilon$

+ +

and $\beta_1>0$ with $p$=0.01, $\beta_2$ is no longer statistically significant and also $\beta_3$ is not statistically significant.

+ +

I do not understand how to interpret the results and if the interaction term wipes out the direct effect of grandparents since $\log(earnings)$ is always different from 0.

+ +
+
+
+
+

Is there a way to test the statistical significance of the effect of grandparents in the interacted model? (Thanks Maarten for your previous answer.)

+
+
+
+
+",2013-10-10 13:11:58.220 +57221,22563.0,1,,,,What's the approximate distribution? Replace the true mean with sample mean,,CC BY-SA 3.0,"

If say for a random variable $X$, I have observation of $x_1,x_2,x_3,\ldots,x_n$. Let $m$ be the sample mean, and $s$ be the sample standard deviation. Does the new random variable $(X-m)/s$ follow some distribution? It's not $t$-distribution I guess, since for it to be $t$ distributed, $m$ needs to be replaced by true mean.

+ +

Can statistics expert shed some light on this?

+",2013-10-10 13:28:20.653 +57222,16474.0,2,,57220.0,,,,CC BY-SA 3.0,"

$\beta_2$ in equation 2 is the effect of $grandparents$ when $\log(earnings) = 0$, i.e. $earnings = 1$. This is apparently outside the range of your data, so it is an extrapolation. The easiest way around that is to center $earnings$, before taking the logarithm or creating the interaction term, at some meaningful value within the range of the data, for example the median. That way the main effect of $grandparents$ will be the effect of grandparents when one has a median income instead of a fictional income of 1.
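
+ +

For example, a sketch in R (assuming your data are in a data frame d with those variable names):

+ +
d$log_earn_c <- log(d$earnings) - log(median(d$earnings))   # log earnings centered at the median
+fit <- lm(children ~ log_earn_c * grandparents, data = d)
+summary(fit)   # the grandparents main effect is now its effect at median earnings
+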

+",2013-10-10 13:29:02.837 +57223,22564.0,1,57245.0,,,How to compare two groups with multiple measurements for each individual with R?,,CC BY-SA 3.0,"

I have a problem like the following:

+ +

1) There are six measurements for each individual with large within-subject variance

+ +

2) There are two groups (Treatment and Control)

+ +

3) Each group consists of 5 individuals

+ +

4) I want to perform a significance test comparing the two groups to know if the group means are different from one another.

+ +

The data looks like this: +

+ +

And I have run some simulations using this code which does t tests to compare the group means. The group means were calculated by taking the means of the individual means. This ignores within-subject variability:

+ +
 n.simulations<-10000
+    pvals=matrix(nrow=n.simulations,ncol=1)
+    for(k in 1:n.simulations){
+      subject=NULL
+      for(i in 1:10){
+        subject<-rbind(subject,as.matrix(rep(i,6)))
+      }
+      #set.seed(42)
+
+      #Sample Subject Means
+      subject.means<-rnorm(10,100,2)
+
+      #Sample Individual Measurements
+      values=NULL
+      for(sm in subject.means){
+        values<-rbind(values,as.matrix(rnorm(6,sm,20)))
+      }
+
+      out<-cbind(subject,values)
+
+      #Split into GroupA and GroupB
+      GroupA<-out[1:30,]
+      GroupB<-out[31:60,]
+
+      #Add effect size to GroupA
+      GroupA[,2]<-GroupA[,2]+0
+
+      colnames(GroupA)<-c(""Subject"", ""Value"")
+      colnames(GroupB)<-c(""Subject"", ""Value"")
+
+      #Calculate Individual Means and SDS
+      GroupA.summary=matrix(nrow=length(unique(GroupA[,1])), ncol=2)
+      for(i in 1:length(unique(GroupA[,1]))){
+        GroupA.summary[i,1]<-mean(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2])
+        GroupA.summary[i,2]<-sd(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2])
+      }
+      colnames(GroupA.summary)<-c(""Mean"",""SD"")
+
+
+      GroupB.summary=matrix(nrow=length(unique(GroupB[,1])), ncol=2)
+      for(i in 1:length(unique(GroupB[,1]))){
+        GroupB.summary[i,1]<-mean(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2])
+        GroupB.summary[i,2]<-sd(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2])
+      }
+      colnames(GroupB.summary)<-c(""Mean"",""SD"")
+
+      Summary<-rbind(cbind(1,GroupA.summary),cbind(2,GroupB.summary))
+      colnames(Summary)[1]<-""Group""
+
+      pvals[k]<-t.test(GroupA.summary[,1],GroupB.summary[,1], var.equal=T)$p.value
+    }
+
+ +

And here is code for plots:

+ +
#Plots
+par(mfrow=c(2,2))
+boxplot(GroupA[,2]~GroupA[,1], col=""Red"", main=""Group A"", 
+        ylim=c(.9*min(out[,2]),1.1*max(out[,2])),
+        xlab=""Subject"", ylab=""Value"")
+stripchart(GroupA[,2]~GroupA[,1], vert=T, pch=16, add=T)
+#abline(h=mean(GroupA[,2]), lty=2, lwd=3)
+
+for(i in 1:length(unique(GroupA[,1]))){
+  m<-mean(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2])
+  ci<-t.test(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2])$conf.int[1:2]
+
+  points(i-.2,m, pch=15,cex=1.5, col=""Grey"")
+  segments(i-.2,
+           ci[1],i-.2,
+           ci[2], lwd=4, col=""Grey""
+  )
+}
+legend(""topleft"", legend=c(""Individual Means +/- 95% CI""), bty=""n"", pch=15, lwd=3, col=""Grey"")
+
+
+boxplot(GroupB[,2]~GroupB[,1], col=""Light Blue"", main=""Group B"", 
+        ylim=c(.9*min(out[,2]),1.1*max(out[,2])),
+        xlab=""Subject"", ylab=""Value"")
+stripchart(GroupB[,2]~GroupB[,1], vert=T, pch=16, add=T)
+#abline(h=mean(GroupB[,2]), lty=2, lwd=3)
+
+for(i in 1:length(unique(GroupB[,1]))){
+  m<-mean(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2])
+  ci<-t.test(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2])$conf.int[1:2]
+
+  points(i-.2,m, pch=15,cex=1.5, col=""Grey"")
+  segments(i-.2,
+           ci[1],i-.2,
+           ci[2], lwd=4, col=""Grey""
+  )
+}
+legend(""topleft"", legend=c(""Individual Means +/- 95% CI""), bty=""n"", pch=15, lwd=3, col=""Grey"")
+
+
+boxplot(Summary[,2]~Summary[,1], col=c(""Red"",""Light Blue""), xlab=""Group"", ylab=""Average Value"",
+        ylim=c(.9*min(Summary[,2]),1.1*max(Summary[,2])),
+        main=""Individual Averages"")
+stripchart(Summary[,2]~Summary[,1], vert=T, pch=16, add=T)
+
+points(.9, mean(GroupA.summary[,1]), pch=15,cex=1.5, col=""Grey"")
+segments(.9,
+         t.test(GroupA.summary[,1])$conf.int[1],.9,
+         t.test(GroupA.summary[,1])$conf.int[2], lwd=4, col=""Grey""
+)
+
+points(1.9, mean(GroupB.summary[,1]), pch=15,cex=1.5, col=""Grey"")
+segments(1.9,
+         t.test(GroupB.summary[,1])$conf.int[1],1.9,
+         t.test(GroupB.summary[,1])$conf.int[2], lwd=4, col=""Grey""
+)
+legend(""topleft"", legend=c(""Group Means +/- 95% CI""), bty=""n"", pch=15, lwd=3, col=""Grey"")
+
+
+hist(pvals, breaks=seq(0,1,by=.05), col=""Grey"",
+     main=c(paste(""# sims="", n.simulations),
+            paste(""% Sig p-values="",100*length(which(pvals<0.05))/length(pvals)))
+)
+
+ +

Now, it seems to me that because each individual mean is itself an estimate, we should be less certain about the group means than is suggested by the 95% confidence intervals in the bottom-left panel of the figure above. Thus the calculated p-values understate the true variability and should lead to increased false positives if we wish to extrapolate to future data.

+ +

So what is the correct way to analyze this data?

+ +

Bonus:

+ +

The example above is a simplification. For the actual data:

+ +

1) The within-subject variance is positively correlated with the mean.

+ +

2) Values can only be multiples of two.

+ +

3) The individual results are not roughly normally distributed. They suffer from zero floor effect, and have long tails at the positive end.

+ +

4) Number of Subjects in each group are not necessarily equal.

+ +

Previous literature has used the t-test while ignoring within-subject variability and other nuances, as was done for the simulations above. Are these results reliable? If I can extract some means and standard errors from the figures, how would I calculate the ""correct"" p-values?

+ +

EDIT:

+ +

Ok, here is what actual data looks like. There is also three groups rather than two:

+ +

+ +

dput() of data:

+ +
structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
+1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
+2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
+2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
+3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 
+3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 
+6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 
+10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 
+12, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15, 
+15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 18, 
+18, 18, 18, 18, 18, 2, 0, 16, 2, 16, 2, 8, 10, 8, 6, 4, 4, 8, 
+22, 12, 24, 16, 8, 24, 22, 6, 10, 10, 14, 8, 18, 8, 14, 8, 20, 
+6, 16, 6, 6, 16, 4, 2, 14, 12, 10, 4, 10, 10, 8, 4, 10, 16, 16, 
+2, 8, 4, 0, 0, 2, 16, 10, 16, 12, 14, 12, 8, 10, 12, 8, 14, 8, 
+12, 20, 8, 14, 2, 4, 8, 16, 10, 14, 8, 14, 12, 8, 14, 4, 8, 8, 
+10, 4, 8, 20, 8, 12, 12, 22, 14, 12, 26, 32, 22, 10, 16, 26, 
+20, 12, 16, 20, 18, 8, 10, 26), .Dim = c(108L, 3L), .Dimnames = list(
+    NULL, c(""Group"", ""Subject"", ""Value"")))
+
+ +

EDIT 2:

+ +

In response to Henrik's answer: +So if I instead perform an ANOVA followed by the TukeyHSD procedure on the individual averages, as shown below, could I interpret this as underestimating my p-value by about 3-4x?

+ +

My goal with this part of the question is to understand how I, as a reader of a journal article, can better interpret previous results given their choice of analysis method. For example, they have those ""stars of authority"" showing me 0.01 > p > 0.001. So if I accept 0.05 as a reasonable cutoff, should I accept their interpretation? The only additional information is the mean and SEM.

+ +
#Get Invidual Means
+summary=NULL
+for(i in unique(dat[,2])){
+sub<-which(dat[,2]==i)
+summary<-rbind(summary,cbind(
+dat[sub,1][3],
+dat[sub,2][4],
+mean(dat[sub,3]),
+sd(dat[sub,3])
+)
+)
+}
+colnames(summary)<-c(""Group"",""Subject"",""Mean"",""SD"")
+
+TukeyHSD(aov(summary[,3]~as.factor(summary[,1])+ (1|summary[,2])))
+
+#      Tukey multiple comparisons of means
+#        95% family-wise confidence level
+#    
+#    Fit: aov(formula = summary[, 3] ~ as.factor(summary[, 1]) + (1 | summary[, 2]))
+#    
+#    $`as.factor(summary[, 1])`
+#             diff       lwr       upr     p adj
+#    2-1 -0.672619 -4.943205  3.597967 0.9124024
+#    3-1  7.507937  1.813822 13.202051 0.0098935
+#    3-2  8.180556  2.594226 13.766885 0.0046312
+
+ +

EDIT 3: +I think we are getting close to my understanding. Here is the simulation described in the comments to @Stephane:

+ +
#Get Subject Means
+means<-aggregate(Value~Group+Subject, data=dat, FUN=mean)
+
+#Initialize ""dat2"" dataframe
+dat2<-dat
+
+#Initialize within-Subject sd
+s<-.001
+pvals=matrix(nrow=10000,ncol=2)
+
+for(j in 1:10000){
+#Sample individual measurements for each subject
+temp=NULL
+for(i in 1:nrow(means)){
+temp<-c(temp,rnorm(6,means[i,3], s))
+}
+
+#Set new values
+dat2[,3]<-temp
+
+#Take means of sampled values and fit to model
+dd2 <- aggregate(Value~Group+Subject, data=dat2, FUN=mean)
+fit2 <- lm(Value~Group, data=dd2)
+
+#Save sd and pvalue
+pvals[j,]<-cbind(s,anova(fit2)[[5]][5])
+
+#Update sd
+s<-s+.001
+}
+
+plot(pvals[,1],pvals[,2], xlab=""Within-Subject SD"", ylab=""P-value"")
+
+ +

+",2013-10-10 13:47:37.470 +57224,3731.0,2,,52910.0,,,,CC BY-SA 3.0,"

To close the loop for those who don't want to follow the comment thread above:

+ +

No. People do not normally compile these things because they are specific to both the particular model chosen and the particular data onto which the model is fit. To talk of a ""typical"" number is not well posed.

+ +

If someone would like to post a more comprehensive answer, I will be happy to ""unaccept"" this answer and accept theirs instead.

+",2013-10-10 14:09:23.710 +57225,503.0,2,,57218.0,,,,CC BY-SA 3.0,"

I think the choice depends on the audience that will read whatever you write.

+ +

If they are mostly statistically unsophisticated, I'd say you could use the original data set and put a footnote about how multiple imputation did not change things much. If they are more sophisticated, I'd go with the MI analysis. Even if things don't change ""much"" they change some and the MI is a better approach.

+ +

Also, be careful that you looked at all the output for what changed (or didn't). Not just parameter estimates but their standard errors (or whatever your analysis involves - you didn't say what analysis you did, so it's hard to say what might be affected).

+",2013-10-10 14:16:53.693 +57226,22566.0,2,,57128.0,,,,CC BY-SA 3.0,"

The formulas presented above are available in the SPSS help: Help > Algorithms > Multiple Imputation: Pooling Algorithms > Rubin's Rules (multiple imputation algorithms) > Combining Results after Multiple Imputation.

+",2013-10-10 14:44:17.203 +57227,9792.0,2,,57187.0,,,,CC BY-SA 3.0,"

I think the question why splines cannot be centered arose out of a misunderstanding of how splines function. It seems that splines don't model an intercept and thus centering is impossible. It would, however, be great if someone had another solution to estimating the group differences at different time points when modelling more complex dynamics.

+",2013-10-10 15:02:05.507 +57228,22568.0,1,,,,How to create a GIS basemap in R?,,CC BY-SA 3.0,"

I am an expert GIS user moving towards R more and more. I have been using R for some basic regressions and such, but I would like to begin to use and manipulate GIS data in R.

+ +

How can I create a basemap graphic similar to the one in this post: +Showing spatial and temporal correlation on maps

+ +

Again, I am a beginner in R and haven't found any other related thread here.

+",2013-10-10 15:23:08.293 +57230,22569.0,1,57409.0,,,Represent data across multiple categories and sub categories,,CC BY-SA 3.0,"

The data contain category and sub-category distributions.

+ +

The categories are topics in a quiz such as: Music, Sports, Business.

+ +

Each category has three levels to choose from: Basic, Standard and Advanced.

+ +

For example: A user might take a quiz on Music across different levels. Say the number of questions attempted is 100. The user would have answered them across levels. 40 for basic, 40 for standard and 20 for advanced. The data consist of counts of the questions attempted within each category for each user.

+ +

What is the best way to represent these data on a graph? Each graph would contain up to 5 main categories.

+",2013-10-10 15:25:25.017 +57231,22381.0,1,,,,Raw return vs. percentage return to calculate volatility,,CC BY-SA 3.0,"

I am using the squared return as a proxy for volatility; however, I'm not sure whether to use raw returns or percentage returns. Under raw returns all return estimates are below 1, whereas under percentage returns there is a mix of returns greater than 1 and less than 1. A percentage return below 1 would end up as a volatility figure less than the percentage return itself; on the other hand, a percentage return above 1 would end up as a volatility figure greater than the percentage return.

+ +

My question is: doesn't this pose a problem when calculating volatility, in that there is an overestimation when the return is above 1?

+ +

I am going to use the data to fit an ARMA-GARCH model, would there be any difference if I used percentage or absolute values?

+",2013-10-10 15:42:26.310 +57232,2873.0,2,,57187.0,,,,CC BY-SA 3.0,"

The ns function (and other spline functions) does its own ""centering"" of the data. Consider this example:

+ +
> library(splines)
+> 
+> s1 <- ns( 1:10, 3 )
+> s2 <- ns( (1:10)-5, 3 )
+> 
+> all.equal(s1,s2)
+[1] ""Attributes: < Component 1: Mean relative difference: 0.9090909 >""
+[2] ""Attributes: < Component 7: Mean relative difference: 0.9090909 >""
+> all.equal(as.vector(s1),as.vector(s2))
+[1] TRUE
+
+ +

So the centering of the data leads to the same splines as the uncentered data (other than the knot information in the attributes). So centering your variable before computing a spline has no effect. If you want to compare the values at a point other than 0 then just use the predict function to get the actual predictions at the point of interest and compare (subtract).
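
+ +

For example, reusing the fitted spline model l from the question (a sketch):

+ +
nd <- data.frame(x = c(1, 1), group = factor(c(""I"", ""II"")))
+pr <- predict(l, newdata = nd)
+unname(pr[2] - pr[1])   # estimated group difference at x == 1
+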

+",2013-10-10 15:46:38.177 +57233,5237.0,2,,57229.0,,,,CC BY-SA 3.0,"

(I'll let someone else address the estimation of the missing data. You may want to directly model the probability that the observation is each level of the unknown factor using knowledge of other covariate values, and possibly outside information, e.g., priors etc. There are strategies such as propensity scores that you might be able to use for this type of thing. However, at first glance your approach looks reasonable to me.)

+ +

One note is that I can't tell from your description if you are weighting by raw frequencies. If so, you want to divide these by $N$ to get the marginal probabilities instead.

+ +

You are right that you are not handling level 3 correctly. The coding scheme that you use in your question setup is known as reference level coding. To use this approach correctly, you need to have an intercept (i.e., $\beta_0$), which estimates the mean of level 3. I suspect you do have one, even though you didn't list it. In this case, you would just add the intercept to your final equation. That is: +$$ \beta_0 + \beta_1\!*\!f_1 + \beta_2\!*\!f_2 $$ +Note that the intercept (which encodes the reference level) enters with weight 1, because it is part of the prediction for every level; only the dummy coefficients are weighted by their marginal probabilities.

+",2013-10-10 15:56:30.450 +57234,2873.0,2,,57228.0,,,,CC BY-SA 3.0,"

You should read through the Spatial and possibly SpatialTemporal Taskviews on CRAN. Those will give you an idea of what packages are available and gives brief descriptions of what they do and how they compare.

+",2013-10-10 16:05:27.410 +57235,10060.0,2,,57228.0,,,,CC BY-SA 3.0,"

R by itself does not handle GIS type of work but with different add-ons it can be a quite potent GIS device. You'd need to understand the idea of ""package"" (user-contributed scripts) and how to use install.packages(""whateverPackage"") command to install them.

+ +

I don't use R in GIS enough to show you the whole topography (pun totally intended), but the most commonly used packages I have seen are maps, ggmap, ggplot2, RgoogleMaps, and plotGoogleMaps.
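
+ +

For instance, a very small base map sketch with the maps package (just an illustration, not the code behind the figure in the linked question; stations is a hypothetical data frame of site coordinates):

+ +
library(maps)
+map(""state"", regions = c(""maine"", ""massachusetts"", ""new hampshire"", ""vermont"", ""connecticut"", ""rhode island""))
+points(stations$lon, stations$lat, pch = 16, col = ""red"")
+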

+ +

Also, check out some sites and tutorials about this topic: 1, 2, 3, and 4. These got me started and within a day I could make some silly maps.

+ +

Lastly, this pdf probably contains some codes pertinent to the map you wish to create. Its $\LaTeX$ format is a bit off, but you can still get some general functionality and key commands.

+ +

Good luck!

+",2013-10-10 16:08:02.663 +57236,22572.0,1,,,,"Kaplan Meier - Can I use to assess recovery of function, not just loss?",,CC BY-SA 3.0,"

In my experience and readings, Kaplan-Meier has always been used to calculate differential survival between a certain number of groups. However, I'm looking to assess time to recovery from a certain event, as measured by activity levels. At time zero, everyone is essentially ""dead"" (non-mobile), and with time they regain mobility. +This seems like a ""negative"" Kaplan-Meier; is that possible? Or should I be looking at a different modeling strategy?

+",2013-10-10 16:10:12.377 +57237,22570.0,1,,,,Generating causally dependent random variables,,CC BY-SA 3.0,"

I'm trying to generate sets of causally connected random variables and started off doing this with a Monte Carlo approach.

+ +

The baseline is a 2-dimensional measured histogram from which I draw random values.

+ +

In my concrete example these variables are acceleration $\bf{a}$ and velocity $\bf{v}$ - so obviously $v_{i+1} = v_{i} + a_i \cdot dt$ has to hold.

+ +

My current naive approach is:

+ +

I start with some $v_0$. Then I generate a random $a_0$ according to the measured probability of $\bf{a}$ for the value of $v_0$. Using this $a_0$ I can calculate $v_1$, and the whole procedure starts over again.
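In code, the naive procedure I describe looks roughly like this (a sketch; H, v.mid and a.mid are a hypothetical 2-D count histogram and its bin centres, and I ignore empty rows of H):

simulate_track <- function(H, v.mid, a.mid, v0, n, dt = 1) {
  v <- numeric(n)
  a <- numeric(n)
  v[1] <- v0
  for (i in 1:n) {
    vbin <- which.min(abs(v.mid - v[i]))      # locate the current v bin
    p    <- H[vbin, ] / sum(H[vbin, ])        # conditional P(a | v) from that slice
    a[i] <- sample(a.mid, 1, prob = p)        # draw a for this step
    if (i < n) v[i + 1] <- v[i] + a[i] * dt   # propagate the velocity
  }
  data.frame(v = v, a = a)
}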

+ +

So when I check the generated accelerations $\bf{a}$ in bins of $\bf{v}$, everything's fine. But obviously this does not at all respect the marginal distribution of $\bf{v}$.

+ +

I'm kind of familiar with basic Monte Carlo methods, though lacking some theoretical background, as you might guess. I'd be fine if the two variables were just connected by some correlation matrix, but the causal connection between the two gives me headaches.

+ +

I didn't manage to find an example of this kind of problem anywhere - I might be googling the wrong terms. I'd be satisfied if somebody could point me to some literature, an example, or a promising method to get a hold on this.

+ +

(Or tell me that it's not really possible given my inputs - that's what I'm guessing occasionally...)

+ +

EDIT:

+ +

The actual aim of this whole procedure: +I have a set of measurements $\bf{a}$ and $\bf{v}$, represented in a two-dimensional histogram $N(a,v)$. Given this input I'd like to generate sets of random $\bf{a_r}$ and $\bf{v_r}$ that reproduce the measured distribution.

+",2013-10-10 16:26:50.467 +57724,9175.0,2,,57668.0,,,,CC BY-SA 3.0,"

I had misunderstood the question originally. I have edited the question now and here is a brief answer since I have already put up the question.

+ +

I used the following transformations

+ +

$U =\frac{Y}{X}$

+ +

$V = X$

+ +

Then use the standard bivariate transformation short cut for one-to-one functions

+",2013-10-17 18:40:14.853 +57238,8414.0,1,,,,Simulating data to fit a mediation model,,CC BY-SA 3.0,"

I am interested in finding a procedure to simulate data that are consistent with a specified mediation model. According to the general linear structural equation model framework for testing mediation models first outlined by Baron and Kenny (1986) and described elsewhere such as Judd, Yzerbyt, & Muller (2013), mediation models for outcome $Y$, mediator $\newcommand{\med}{\rm med} \med$, and predictor $X$ are governed by the following three regression equations:
\begin{align}
Y &= b_{11} + b_{12}X + e_1 \tag{1} \\
\med &= b_{21} + b_{22}X + e_2 \tag{2} \\
Y &= b_{31} + b_{32}X + b_{33} \med + e_3 \tag{3}
\end{align}
The indirect effect or mediation effect of $X$ on $Y$ through $\med$ can either be defined as $b_{22}b_{33}$ or, equivalently, as $b_{12}-b_{32}$. Under the old framework of testing for mediation, mediation was established by testing $b_{12}$ in equation 1, $b_{22}$ in equation 2, and $b_{33}$ in equation 3.

+ +

So far, I have attempted to simulate values of $\med$ and $Y$ that are consistent with values of the various regression coefficients using rnorm in R, such as the code below:

+ +
x   <- rep(c(-.5, .5), 50)
+med <- 4 + .7 * x + rnorm(100, sd = 1) 
+
+# Check the relationship between x and med
+mod <- lm(med ~ x)
+summary(mod)
+
+y <- 2.5 + 0 * x + .4 * med + rnorm(100, sd = 1)
+
+# Check the relationships between x, med, and y
+mod <- lm(y ~ x + med)
+summary(mod)
+
+# Check the relationship between x and y -- not present
+mod <- lm(y ~ x)
+summary(mod)
+
+ +

However, it seems that sequentially generating $\med$ and $Y$ using equations 2 and 3 is not enough, since I am left with no relationship between $X$ and $Y$ in regression equation 1 (which models a simple bivariate relationship between $X$ and $Y$) using this approach. This is important because one definition of the indirect (i.e., mediation) effect is $b_{12}-b_{32}$, as I describe above.

+ +

Can anyone help me find a procedure in R to generate variables $X$, $\med$, and $Y$ that satisfy constraints that I set using equations 1, 2, and 3?

+",2013-10-10 16:41:50.710 +57239,22571.0,1,,,,Quantifying the relationship between two disparate time series,,CC BY-SA 3.0,"

I have two time series that have a roughly similar trend, though both variables are noisy. This graph shows means and standard errors throughout a season of measurements.

+ +

+ +

I'd like to be able to make a quantitative statement about the relationship between these two data sets.

+ +

While the two data sets were collected from the same experimental plots, the individual samples from which the means and standard errors were calculated are not meaningfully paired with one another, and you can see that the carbohydrate data set was measured more frequently.

+ +

By taking a subset of the carbohydrate measurements that are closest to the microbial biomass measurement dates, I can make a scatterplot showing the means and standard errors that I think gives a fair visual representation of the relationship (TRS.ml is the carbohydrates):

+ +

+ +

This is where I am stuck. I'm not sure how to estimate regression coefficients or calculate an r2 value for a regression of this sort where I have estimates of uncertainty for both variables. Here are some approaches I have been considering:

+ +
  1. Deming regression. I'm not sure that this would be the right approach. It seems to be more for data sets in which the same technique was used for both variables. If it is, my question is how would I calculate the variance ratio based on the information I have?

  2. Regression of all underlying data points. This doesn't really work because the data are not meaningfully paired, so of the 80 or so microbial biomass measurements that underlie the data shown in the graphs here, I can't directly match them to individual measurements of carbohydrates. Matching them arbitrarily seems bad.

  3. Regression of carbohydrate means by date against microbial biomass means by date. Basically regress the points in my scatterplot above but throw out the information about the uncertainty. This gives a high r2 driven by the coinciding peaks on July 1st, but to me, seems to overestimate the strength of the relationship.

  4. Regression of all microbial biomass values against carbohydrate means by date or vice versa. This allows more of the underlying uncertainty to be incorporated while not forcing the pairing of unrelated data points in an arbitrary way. Again though, it does not incorporate the uncertainty in both variables.
+ +

My question is which of these approaches, or any other unlisted approaches, would you recommend for quantifying the relationship between these two time series?

+",2013-10-10 16:58:21.140 +57240,22527.0,2,,40870.0,,,,CC BY-SA 3.0,"

I believe you multiply by 2 because you need to control for your database being twice as large. There are other ways to do this calculation such as

+ +

decoy spectra identified / target spectra identified

+ +

The use of the term FDR for this calculation is totally confusing and is why people have started calling it target/decoy rate in the last year or so. It's also doubly confusing as people often fail to specify if they are using a spectra target/decoy or a peptide target/decoy...and there is no way to consistently calculate a protein target/decoy as different programs will weigh peptide --> protein evidence differently...It's a mess...Having said that I will always make this calculation just to double check I or the software has not done something stupid. For that it is very useful.
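As a tiny numeric illustration of the two conventions (made-up counts):

targets <- 9500   # spectra matched to target sequences at a given score cut-off
decoys  <-  500   # spectra matched to decoy sequences at the same cut-off

2 * decoys / (targets + decoys)   # estimate with the factor of 2 (concatenated database)
decoys / targets                  # simple target/decoy rate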

+",2013-10-10 16:59:34.360 +57241,5237.0,2,,57238.0,,,,CC BY-SA 4.0,"

This is quite straightforward. The reason you have no relationship between $x$ and $y$ using your approach is because of the code:

+ +
y <- 2.5 + 0 * x + .4 * med + rnorm(100, sd = 1)
+
+ +

If you want some relationship between $x$ and $y$ even when ${\rm med}$ is included (that is, you want partial mediation), you would simply use a non-zero value for $b_{32}$ instead. For example, you could substitute the following code for the above:

+ +
y <- 2.5 + 3 * x + .4 * med + rnorm(100, sd = 1)
+
+ +

Thus, $b_{32}$ has been changed from $0$ to $3$. (Of course some other, specific value would probably be more relevant, depending on your situation, I just picked $3$ off the top of my head.)

+ +
+ +

Edit:
+With respect to the marginal $x\rightarrow y$ relationship being non-significant, that is just a function of statistical power. Since the causal force of $x$ is passed entirely through ${\rm med}$ in your original setup, you have lower power than you might otherwise. Nonetheless, the effect is still real in some sense. When I ran your original code (after having set the seed using 90 as a value that I again just picked off the top of my head), I did get a significant effect:

+ +
set.seed(90)
+x <- rep(c(-.5, .5), 50)
+med <- 4 + .7 * x + rnorm(100, sd = 1) 
+
+# Check the relationship between x and med
+mod <- lm(med ~ x)
+summary(mod)
+
+y <- 2.5 + 0 * x + .4 * med + rnorm(100, sd = 1)
+
+# Check the relationships between x, med, and y
+mod <- lm(y ~ x + med)
+summary(mod)
+
+# Check the relationship between x and y -- not present
+mod <- lm(y ~ x)
+summary(mod)
+
+...
+Coefficients:
+            Estimate Std. Error t value Pr(>|t|)    
+(Intercept)   3.8491     0.1151  33.431   <2e-16 ***
+x             0.5315     0.2303   2.308   0.0231 *  
+---
+Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 
+
+...
+
+ +

To get more power, you can increase the $N$ you are using, or use smaller error values (i.e., use sd= values less than the default 1 in the rnorm() calls).

+",2013-10-10 17:09:45.623 +57242,22573.0,1,,,,Estimating multivariate normal distribution by observing variance in different directions,,CC BY-SA 3.0,"

Assume I am looking for a normal distribution $\mathcal{N}(\mu,\Sigma)$. For simplicity let's say we only have 2 random variables $x$ and $y$ and a known $\mu=0$.

+ +

Is it possible to estimate $\Sigma$ by observing the variance along multiple directions?

+ +

For example, I measure the variance $\sigma_1$ along the vector $\mathbb{v}_1 = (x_1,y_1)^T$. In another step I obtain a different variance $\sigma_2$ from a different direction $\mathbb{v}_2 = (x_2,y_2)^T$. Ideally one would continue to observe these single variations in different directions and combine them in one multivariate normal distribution.

+ +

Does this make sense?

+ +

EDIT: +Some additional background information might be useful: I have a sensor device with known position and orientation in 2D space (in a future step both may have an uncertainty). The sensor is able to measure only the distance of a point along its orientation. I'm also given the sensor model. So for each distance measure $d_i$, I obtain the standard error $\sigma(d_i)$ which depends on the distance.

+ +

Since I'm not able to manipulate the sensor position to my advantage or perform a large number of measurements, I'd like to combine these variances into one covariance matrix in order to make a more reliable prediction of the position of the measured point.

+ +

This is just a thought that is still under development, with no guarantee that it will work out correctly. Hence my question of ""making sense""...

+",2013-10-10 18:29:33.973 +58449,2164.0,2,,54234.0,,,,CC BY-SA 3.0,"

You should be using ARIMA, but you need to consider deterministic impacts like outliers, level shifts, changes in trend, changes in seasonality, changes in parameters and changes in variance. Look to the work of Ruey Tsay, Nathan Balke, Gregory Chow, Fox, Box, I. Chang.

+",2013-10-29 18:32:41.107 +57243,20473.0,2,,57221.0,,,,CC BY-SA 3.0,"

It appears you are confusing results that hold for a collection of random variables with the case of one random variable.

+ +

When you have a series of observations, $x_1,...,x_n$, then if they are contemporaneous, ($x_{1t},...,x_{nt}$) they are considered as realizations of n distinct random variables (that may be identically and independently distributed, or not). You cannot, in this case, assume that all are realizations of the same random variable, because a random variable is a real-valued function: this means that at a specific point in time, it can have only one realization (take one value), otherwise it wouldn't be a function but a correspondence: this is why when we have a cross-sectional sample of size $n$, we say that ""it is comprised of the realization of $n$ random variables"", and not ""$n$ realizations of the same random variable"". Note carefully that ""same"" does not just mean ""identically distributed"", but ontologically equal.

+ +

Assume now that you have a time-series, and the index $1,...,n$ represents different points in time. Can you say that they are all realizations of the same random variable? Well in principle you can, but here too, we tend to view a time series as a stochastic process of distinct random variables (one for each point in time), that, again, may be identically distributed.

+ +

So in general, when looking at a sample, be it cross-sectional or time series, it is advisable to think of it as a collection of realizations of many random variables.

+ +

Now, when we subtract the mean from a random variable, and divide by the standard deviation, we create the ""standardized"" version of the variable, that has mean zero and variance (and standard deviation) unity. This is irrespective of the distribution that this variable follows, because, by standard universal properties of these distribution moments +$$Z = \frac {X-\mu}{\sigma} \Rightarrow E(Z) = \frac {1}{\sigma}E(X) - \frac {\mu}{\sigma} = 0$$

+ +

and

+ +

$$ \text {Var}(Z) = \text {Var}\left(\frac {X-\mu}{\sigma}\right) = \frac {1}{\sigma^2}\text {Var}(X) = \frac {\sigma^2}{\sigma^2} =1$$

+ +

The standardized version $Z$ of one random variable $X$ follows a distribution that belongs to the same family as the distribution of $X$, with different values of said parameters - the distribution family does not change. So if you don't know the distribution by other means, the distribution of the standardized version will remain unknown.

+ +

Now consider the random variable $S_n = \frac 1n\sum_{i=1}^nX_i$. It is quantities like this that the various Central Limit Theorems are about: they tell us that, suitably centered and scaled, such averages approach a normal distribution asymptotically.
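A quick simulation sketch of that distinction: the individual observations keep their (here, exponential) distribution, while the averages $S_n$ look approximately normal:

set.seed(1)
x.single <- rexp(1e4, rate = 1)                        # draws of X itself: skewed
s.n      <- replicate(1e4, mean(rexp(50, rate = 1)))   # draws of S_n with n = 50

par(mfrow = c(1, 2))
hist(x.single, freq = FALSE, main = "X")
hist(s.n,      freq = FALSE, main = "S_n, n = 50")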

+",2013-10-10 18:52:09.813 +57244,20179.0,1,,,,SNP genotype coding in regression,,CC BY-SA 3.0,"

I would like to conduct some analysis of biological traits with a regression model. The response variable is continuous. One important independent variable is the SNP genotype (wild type, heterozygous, or homozygous). There are different ways to code it: it can be treated as a nominal or an ordinal variable (like 1, 2, 3). Is anyone familiar with the difference, and are there any classical references about it? Thank you for any suggestions.

+",2013-10-10 18:54:07.593 +57245,346.0,2,,57223.0,,,,CC BY-SA 3.0,"

I take the liberty of answering the question in the title: how would I analyze these data?

+ +

Given that we have replicates within the samples, mixed models immediately come to mind, which should estimate the variability within each individual and control for it.

+ +

Hence I fit the model using lmer from lme4. However, as we are interested in p-values, I use mixed from afex, which obtains those via pbkrtest (i.e., the Kenward-Roger approximation for degrees of freedom). (afex also already sets the contrasts to contr.sum, which I would use in such a case anyway.)

+ +

To deal with the zero floor effect (i.e., positive skew), I fit two alternative versions, transforming the dependent variable either with sqrt for mild skew or with log for stronger skew.

+ +
require(afex)
+
+# read the dput() in as dat <- ...    
+dat <- as.data.frame(dat)
+dat$Group <- factor(dat$Group)
+dat$Subject <- factor(dat$Subject)
+
+(model <- mixed(Value ~ Group + (1|Subject), dat))
+##        Effect    stat ndf ddf F.scaling p.value
+## 1 (Intercept) 237.730   1  15         1  0.0000
+## 2       Group   7.749   2  15         1  0.0049
+
+(model.s <- mixed(sqrt(Value) ~ Group + (1|Subject), dat))
+##        Effect    stat ndf ddf F.scaling p.value
+## 1 (Intercept) 418.293   1  15         1  0.0000
+## 2       Group   4.121   2  15         1  0.0375
+
+(model.l <- mixed(log1p(Value) ~ Group + (1|Subject), dat))
+##        Effect    stat ndf ddf F.scaling p.value
+## 1 (Intercept) 458.650   1  15         1  0.0000
+## 2       Group   2.721   2  15         1  0.0981
+
+ +

The effect is significant for the untransformed and sqrt dv. But are these models sensible? Let's plot the residuals.

+ +
png(""qq.png"", 800, 300, units = ""px"", pointsize = 12)
+par(mfrow = c(1, 3))
+par(cex = 1.1)
+par(mar = c(2, 2, 2, 1)+0.1)
+qqnorm(resid(model[[2]]), main = ""original"")
+qqline(resid(model[[2]]))
+qqnorm(resid(model.s[[2]]), main = ""sqrt"")
+qqline(resid(model.s[[2]]))
+qqnorm(resid(model.l[[2]]), main = ""log"")
+qqline(resid(model.l[[2]]))
+dev.off()
+
+ +

+ +

It seems that the model with sqrt transformation provides a reasonable fit (there still seems to be one outlier, but I will ignore it). So, let's further inspect this model using multcomp to get the comparisons among groups:

+ +
require(multcomp)
+
+# using bonferroni-holm correction of multiple comparison
+summary(glht(model.s[[2]], linfct = mcp(Group = ""Tukey"")), test = adjusted(""holm""))
+##          Simultaneous Tests for General Linear Hypotheses
+## 
+## Multiple Comparisons of Means: Tukey Contrasts
+## 
+## 
+## Fit: lmer(formula = sqrt(Value) ~ Group + (1 | Subject), data = data)
+## 
+## Linear Hypotheses:
+##            Estimate Std. Error z value Pr(>|z|)  
+## 2 - 1 == 0  -0.0754     0.3314   -0.23    0.820  
+## 3 - 1 == 0   1.1189     0.4419    2.53    0.023 *
+## 3 - 2 == 0   1.1943     0.4335    2.75    0.018 *
+## ---
+## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+## (Adjusted p values reported -- holm method)
+
+# using default multiple comparison correction (which I don't understand)
+summary(glht(model.s[[2]], linfct = mcp(Group = ""Tukey"")))
+##          Simultaneous Tests for General Linear Hypotheses
+## 
+## Multiple Comparisons of Means: Tukey Contrasts
+## 
+## 
+## Fit: lmer(formula = sqrt(Value) ~ Group + (1 | Subject), data = data)
+## 
+## Linear Hypotheses:
+##            Estimate Std. Error z value Pr(>|z|)  
+## 2 - 1 == 0  -0.0754     0.3314   -0.23    0.972  
+## 3 - 1 == 0   1.1189     0.4419    2.53    0.030 *
+## 3 - 2 == 0   1.1943     0.4335    2.75    0.016 *
+## ---
+## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+## (Adjusted p values reported -- single-step method)
+
+ +

Punchline: group 3 differs from the other two groups which do not differ among each other.

+",2013-10-10 18:54:30.263 +57246,22059.0,2,,57242.0,,,,CC BY-SA 3.0,"

You could, but it doesn't make too much sense to do it this way. Since there are ${n+1}\choose 2$ independent dimensions in a covariance matrix, you'd need to measure the variance in ${n+1}\choose 2$ different directions, and then do some linear algebra to reconstruct the covariance matrix. It's more straightforward just to measure the covariance matrix all at once.

+ +

For example, in $2$ dimensions, you could measure the variance along $(1,0)$, $(0,1)$ and $(1,1)$; then, since $v_{(1,1)}=\Sigma_{11}+\Sigma_{22}+2\Sigma_{12}$, you would end up with
$$\Sigma=\begin{pmatrix} v_{(1,0)} & \frac{v_{(1,1)}-v_{(1,0)}-v_{(0,1)}}{2}\\ \frac{v_{(1,1)}-v_{(1,0)}-v_{(0,1)}}{2} & v_{(0,1)} \end{pmatrix}$$
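A minimal simulation sketch of this reconstruction in 2 dimensions (the $(1,1)$ direction is deliberately left unnormalised so that $v_{(1,1)}=\Sigma_{11}+\Sigma_{22}+2\Sigma_{12}$):

library(MASS)
set.seed(2)
Sigma <- matrix(c(2, 0.8, 0.8, 1), 2)
X <- mvrnorm(5000, mu = c(0, 0), Sigma = Sigma)

v10 <- var(X %*% c(1, 0))   # variance along (1, 0)
v01 <- var(X %*% c(0, 1))   # variance along (0, 1)
v11 <- var(X %*% c(1, 1))   # variance along (1, 1), unnormalised

Sigma.hat <- matrix(c(v10, (v11 - v10 - v01) / 2,
                      (v11 - v10 - v01) / 2, v01), 2)
Sigma.hat   # close to Sigma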

+ +

I suppose if somehow you were constrained in such a way that you couldn't sample from the entire distribution, but you were only able to sample from projections onto a single dimension at a time, then this approach could be useful.

+",2013-10-10 19:06:15.007 +57247,20286.0,2,,57012.0,,,,CC BY-SA 3.0,"

Unless you have reason to believe that LabA and LabB would systematically provide different results if they ever measured the same set of samples, your data from the 2 Labs ""are on the same scale"" as far as you can tell from these data. The problem is that the less-sensitive LabA will be unable to report a value for some samples that would have had values reported if LabB had instead done the analysis.

+ +

Perhaps the best way to proceed would be to define a class of results called ""<0.2"", and include in that class all 0 readings from LabA and all readings <0.2 from LabB. How you proceed from there depends on ""What is the question of interest?"" as @Glen_b put it in a comment.

+ +

All will be much more useful and reliable if it is possible to cross-compare a set of samples analyzed by both Labs, because there may be systematic differences between the 2 Labs' results that you don't suspect.

+",2013-10-10 19:08:35.603 +57248,10278.0,2,,57244.0,,,,CC BY-SA 3.0,"

If you treat the variable as ordinal you are assuming a gene-dosage effect. +This is essentially a one degree of freedom test since you are testing whether the slope of the regression line is significantly different from $0$. If you treat the variable as nominal you are not assuming any gene-dosage effect and instead you are doing a one way ANOVA with 3 groups so that's a two degrees of freedom test. +The gene-dosage model (treating genotypes as ordinal) is more powerful because you are using information about the genotype groups (whether the group has 0, 1 or 2 copies of the wild type allele) whereas in the categorical approach your model knows nothing about the genotype groups (they could just be called A, B and C). Treating the genotype as ordinal is the preferred approach. Also I should mention that if you believe that for example the wild-type allele is dominant then you can merge the heterozygous individuals into the wild-type homozygous group and treat them as one group.
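If it helps, here is a small sketch contrasting the two codings on simulated data (the truth here is additive, and geno counts copies of one allele):

set.seed(3)
n    <- 300
geno <- sample(0:2, n, replace = TRUE, prob = c(0.49, 0.42, 0.09))
y    <- 1 + 0.5 * geno + rnorm(n)

fit.add <- lm(y ~ geno)           # ordinal / gene-dosage coding: 1 df for genotype
fit.cat <- lm(y ~ factor(geno))   # nominal coding: 2 df for genotype
anova(fit.add, fit.cat)           # tests the departure from additivity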

+",2013-10-10 19:25:29.377 +57249,19264.0,1,57256.0,,,Generic sum of Gamma random variables,,CC BY-SA 3.0,"

I have read that the sum of Gamma random variables with the same scale parameter is another Gamma random variable. I've also seen the paper by Moschopoulos describing a method for the summation of a general set of Gamma random variables. I have tried implementing Moschopoulos' method but have yet to have success.

+ +

What does the summation of a general set of Gamma random variables look like? To make this question concrete, what does it look like for:

+ +

$\text{Gamma}(3,1) + \text{Gamma}(4,2) + \text{Gamma}(5,1)$

+ +

If the parameters above are not particularly revealing, please suggest others.

+",2013-10-10 19:49:21.903 +57255,19545.0,1,,,,Why are ERR (Expected Reciprocal Ranking) scores not normalized?,,CC BY-SA 3.0,"

It seems to me that normalized ERR (Expected Reciprocal Ranking) scores (ERR scores of your ranking algorithm divided by ERR score calculated for the ground truth ranking) are more useful than the unscaled ERR scores, but I have not seen normalized scores being reported in the literature. Is there a good reason that the ERR scores are reported in raw rather than normalized format?

+",2013-10-10 20:53:13.273 +57283,9074.0,2,,57282.0,,,,CC BY-SA 4.0,"

You're asking two questions:

+
  1. Is there a generic test for unimodality?
  2. Are there tests of whether a sample is derived from a given distribution, say, a normal distribution?
+

Ad 1): Yes, the Hartigan-Hartigan dip test, Ann. Statist. 13(1):70-84.

+

Ad 2): There exist a number of specialized tests, but the Kolmogorov-Smirnov test is a general-purpose nonparametric test, although one with low statistical power.
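For reference, a minimal usage sketch in R (the dip test lives in the diptest package; note that estimating the normal's mean and sd from the sample makes the plain K-S p-value optimistic):

library(diptest)
x <- c(rnorm(100, 0), rnorm(100, 4))      # a clearly bimodal sample
dip.test(x)                               # Hartigan & Hartigan dip test of unimodality
ks.test(as.vector(scale(x)), "pnorm")     # K-S test against a standard normal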

+",2013-10-11 08:43:06.977 +57250,22577.0,1,,,,How to determine degree of freedom for a certain test of interaction?,,CC BY-SA 3.0,"

The scenario is like this:

+ +

I have a cohort with 2000 people, half of them taking DRUG, the other half not taking it. I would like to check interactions between DRUG and the other variables in the model:

+ +
    +
  • Method 1:

    + +

    Firstly I fit an original model: y1 = a1*AGE + b1*BMI + c1*DRUG [DRUG is binary: yes = 1, no = 0]; I got likelihood 1.

    + +

    If I want to test the interaction of AGE, BMI and DRUG, I need another model: y2=a2*AGE+b2*BMI+c2*DRUG+d*(DRUG*AGE)+e*(DRUG*BMI); I got a likelihood 2;

    + +

    Then I compare the likelihoods of these two models using a chi-square test (df = 2), and see whether the difference (likelihood 2 minus likelihood 1) is significant.

  • +
  • Method 2:

    + +

    Stratify people into two groups according to DRUG status:

    + +

    Group 1: for people taking DRUG (n=1000), model 1: y1=a1*AGE+b1*BMI, I got a likelihood 1 (L1);

    + +

    Group 2: for people not taking DRUG (n=1000), model 2: y2=a2*AGE+b2*BMI, likelihood 2 (L2);

    + +

    Then I use all the people (n=2000), model 3: y3=a3*AGE+b3*BMI+d*(DRUG*AGE)+e*(DRUG*BMI), likelihood 3 (L3);

  • +
+ +

So in order to test the interactions, chi-square=L3/(L1*L2). But the question is: What is the degree of freedom (df)?

+ +

Can anyone help? I cannot get the answer.
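For concreteness, Method 1 is essentially a likelihood-ratio test like the sketch below (with a hypothetical data frame d and a binary outcome y; my actual outcome model may differ):

fit0 <- glm(y ~ AGE + BMI + DRUG, family = binomial, data = d)
fit1 <- glm(y ~ AGE + BMI + DRUG + DRUG:AGE + DRUG:BMI, family = binomial, data = d)
anova(fit0, fit1, test = "Chisq")   # df = 2, one per added interaction term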

+",2013-10-10 19:57:14.333 +57251,22578.0,1,,,,Convergence theorem for Gibbs sampling,,CC BY-SA 4.0,"

The convergence theorem for Gibbs sampling states:

+ +

Given a random vector $X$ with components $X_1,X_2,\dots,X_K$ and knowledge of the conditional distribution of each $X_k$, we can recover the actual distribution by applying Gibbs sampling infinitely often.

+ +

The exact theorem as stated by book (Neural Networks and Learning Machines):

+ +
+

The random variable $X_k^{(n)}$ + converges in distribution to the true probability distributions of + $X_k$ for k=1,2,...,K as n approaches infinity

+ +

$\lim_{n \rightarrow \infty}P(X^{(n)}_k \leq x \mid X(0)) = P_{X_k}(x)$ for $k = 1,2,\dots,K$

+ +

where $P_{X_k}(x)$ is the marginal cumulative distribution function + of $X_k$

+
+ +

While doing research on this, for a deeper understanding, I ran across this answer, which explains quite well how to pick a single sample using the method, but I am not able to extend/modify it to fit the convergence theorem, as the result of the given example is one sample (spell) and not a final/actual probability distribution.
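To have something concrete to experiment with, I wrote this minimal Gibbs sampler for a bivariate normal with correlation rho (each true marginal is N(0,1)); the empirical CDF of one component does seem to approach the true marginal CDF, which I think is what the theorem describes:

set.seed(4)
rho <- 0.8
n   <- 10000
ch  <- matrix(0, n, 2)
for (i in 2:n) {
  ch[i, 1] <- rnorm(1, mean = rho * ch[i - 1, 2], sd = sqrt(1 - rho^2))  # X1 | X2
  ch[i, 2] <- rnorm(1, mean = rho * ch[i, 1],     sd = sqrt(1 - rho^2))  # X2 | X1
}
burn <- 1000
plot(ecdf(ch[-(1:burn), 1]))               # empirical marginal CDF of X1 from the chain
curve(pnorm(x), add = TRUE, col = "red")   # true marginal CDF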

+ +

Therefore, how do I have to modify that example to fit the convergence theorem?

+",2013-10-10 19:59:40.663 +57252,22580.0,1,,,,What are the pros and cons of standardizing variable in presence of an interaction?,,CC BY-SA 3.0,"

I am asking this question because, while reading about the benefits of standardizing explanatory variables (or not), I came across good but contrasting opinions about standardizing when there are interactions in the model.

+ +

Some talk about how problems of collinearity are removed when standardizing (e.g. Collinearity diagnostics problematic only when the interaction term is included), which is basically the case of my GLMM. However, others claim that standard errors and p-values of interactions of standardized models are not reliable... (e.g.Variables are often adjusted (e.g. standardised) before making a model - when is this a good idea, and when is it a bad one? or http://quantpsy.org/interact/interactions.htm)
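For the collinearity part at least, a quick simulation seems to make the point (this only concerns the numerics of the product term, not whether the coefficients or p-values change their meaning):

set.seed(5)
x1 <- rnorm(200, mean = 10)
x2 <- rnorm(200, mean = 10)
cor(x1, x1 * x2)        # raw product term: substantially correlated with x1

x1c <- x1 - mean(x1)
x2c <- x2 - mean(x2)
cor(x1c, x1c * x2c)     # centered product term: correlation near 0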

+ +

So, any ideas on what is the right thing to do?

+",2013-10-10 19:59:51.410 +57253,22582.0,1,,,,Box Cox Transformation with swift,,CC BY-SA 3.0,"

I am trying to do a Box-Cox transformation with a shift. I have a dependent variable, annual foreign sales of companies (in US\$ thousands), which contains zeros, for a set of panel data. I have been advised to add a small amount, for example 0.00001, to the annual foreign sales figures so that I can take the log, but I think a Box-Cox transformation will produce a more appropriate constant than 0.00001. I have done a Box-Cox transformation in R with the code below, but it has given me a very large lambda2 of 31162.8.

+ +
library(geoR)
+boxcoxfit(bornp$ForeignSales, lambda2 = TRUE)
+#R output - Fitted parameters:
+# lambda lambda2 beta sigmasq 
+# -1.023463e+00 3.116280e+04 9.770577e-01 7.140328e-11
+
+ +

My hunch is that the above value of lambda2 is very large, so I am not sure if I need to run the boxcoxfit with my independent variables like below:

+ +
boxcoxfit(bornp$ForeignSales, xmat = cbind(bornp$family, bornp$roa, bornp$solvencyratio), lambda2 = TRUE)
+
+ +

I am still trying to identify the best set of independent variables, so I am not sure if using the boxcoxfit with independent variables at this stage will work or is best.

+ +

Here's the description of the two lambda parameters from the help:

+ +

lambda       numerical value(s) for the transformation parameter $\lambda$. Used as the initial value
+             in the function for parameter estimation. If not provided, default values are
+             assumed. If multiple values are passed, the one with the highest likelihood is
+             used as the initial value.
+lambda2      logical or numerical value(s) for the additional transformation (see DETAILS
+             below). Defaults to NULL. If TRUE this parameter is also estimated and the initial
+             value is set to the absolute value of the minimum data. If a numerical value is
+             provided it is used as the initial value. Multiple values are allowed as for
+             lambda.

+",2013-10-10 20:15:15.787 +57254,22583.0,1,,,,variance of summation/compound variable?,,CC BY-SA 3.0,"

here is my situation. I am weighing a packet of material that has 10 individual units in it. At the end of the day I would like to know the average weight and variance of the individual units, but the problem is that I cannot weigh each unit individually, since I would have to destroy the packet to get to the individual units. So in lieu of this, I am trying to make an inference about the individual units from what I know about the packets. I weighed 10 packets (hence I have 100 individual units). I was able to figure out the average weight of the units but am having trouble with the variance. Here is what I have done so far:

+ +

$$ +\begin{split} +\bar{y}&=\frac{1}{10}\sum^{10}_{i=1}y_i\\ + &=\frac{1}{10}\sum^{10}_{i=1} (x_{i,1}+x_{i,2}+...+x_{i,10})~~[since~y_i=x_{i,1}+x_{i,2}+...+x_{i,10}]\\ + &=\frac{1}{10}\sum^{100}_{j=1}x_j\\ + &=\frac{1}{10}(100~\bar{x})=10~\bar{x} +\end{split} +$$

+ +

thus we have the average of $x$, $\bar{x}=\frac{\bar{y}}{10}.$ But now my challenge is how to do I find variance of $x$ given the variance of $y$? Any suggestions? Thanks!

+ +

::::UPDATE::::

+ +

After some thought I came up with this reasoning: +$$ +\begin{split} +\frac{1}{10}var(y)&=var(\bar{y})\\ + &=var(10~\bar{x})\\ + &=100~var(\bar{x})\\ + &=100~\frac{1}{100}var(x)~~[assuming~that~all~x~are~i.i.d.]\\ + &=var(x) +\end{split} +$$

+ +

thus we have $var(x)=\frac{1}{10}var(y)$. Am I correct that, if we assume that all the individual units share the same common variance and are independent of each other, this result holds?
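A quick simulation also seems to support this (a sketch with made-up numbers; true unit sd of 2, i.e. unit variance 4):

set.seed(6)
units   <- matrix(rnorm(10 * 1e4, mean = 50, sd = 2), ncol = 10)  # 10 units per packet
packets <- rowSums(units)                                         # 10,000 packet weights
var(packets) / 10                                                 # close to 4 = var(x)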

+",2013-10-10 20:48:01.520 +57256,668.0,2,,57249.0,,,,CC BY-SA 4.0,"

First, combine any sums having the same scale factor: a $\Gamma(n, \beta)$ plus a $\Gamma(m,\beta)$ variate form a $\Gamma(n+m,\beta)$ variate.

+

Next, observe that the characteristic function (cf) of $\Gamma(n, \beta)$ is $(1-i \beta t)^{-n}$, whence the cf of a sum of these distributions is the product

+

$$\prod_{j} \frac{1}{(1-i \beta_j t)^{n_j}}.$$

+

When the $n_j$ are all integral, this product expands as a partial fraction into a linear combination of $(1-i \beta_j t)^{-\nu}$ where the $\nu$ are integers between $1$ and $n_j$. In the example with $\beta_1 = 1, n_1=8$ (from the sum of $\Gamma(3,1)$ and $\Gamma(5,1)$) and $\beta_2 = 2, n_2=4$ we find

+

$$\begin{aligned}&\frac{1}{(1-i t)^{8}}\frac{1}{(1- 2i t)^{4}} = \\ +&\frac{1}{(t+i)^8}-\frac{8 i}{(t+i)^7}-\frac{40}{(t+i)^6}+\frac{160 i}{(t+i)^5}+\frac{560}{(t+i)^4}-\frac{1792 i}{(t+i)^3}\\ +&-\frac{5376}{(t+i)^2}+\frac{15360 i}{t+i}+\frac{256}{(2t+i)^4}+\frac{2048 i}{(2 t+i)^3}-\frac{9216}{(2t+i)^2}-\frac{30720 i}{2t+i}. +\end{aligned}$$

+

The inverse of taking the cf is the inverse Fourier Transform, which is linear: that means we may apply it term by term. Each term is recognizable as a multiple of the cf of a Gamma distribution and so is readily inverted to yield the PDF. In the example we obtain

+

$$\begin{aligned} +&\frac{e^{-t} t^7}{5040}+\frac{1}{90} e^{-t} t^6+\frac{1}{3} e^{-t} t^5+\frac{20}{3} e^{-t} t^4+\frac{8}{3} e^{-\frac{t}{2}} t^3+\frac{280}{3} e^{-t} t^3\\ +&-128 e^{-\frac{t}{2}} t^2+896 e^{-t} t^2+2304 e^{-\frac{t}{2}} t+5376 e^{-t} t-15360 e^{-\frac{t}{2}}+15360 e^{-t} +\end{aligned}$$

+

for the PDF of the sum.

+

This is a finite mixture of Gamma distributions having scale factors equal to those within the sum and shape factors less than or equal to those within the sum. Except in special cases (where some cancellation might occur), the number of terms is given by the total shape parameter $n_1 + n_2 + \cdots$ (assuming all the $n_j$ are different).

+
+

As a test, here is a histogram of $10^4$ results obtained by adding independent draws from the $\Gamma(8,1)$ and $\Gamma(4,2)$ distributions. On it is superimposed the graph of $10^4$ times the preceding function. The fit is very good.
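The check can be reproduced along these lines (a sketch; the density f below is just the expression displayed above, and rgamma is used with its shape/scale parameterisation):

set.seed(17)
x <- rgamma(1e4, shape = 8, scale = 1) + rgamma(1e4, shape = 4, scale = 2)

f <- function(t)
  exp(-t)   * (t^7/5040 + t^6/90 + t^5/3 + 20*t^4/3 + 280*t^3/3 +
               896*t^2 + 5376*t + 15360) +
  exp(-t/2) * (8*t^3/3 - 128*t^2 + 2304*t - 15360)

hist(x, breaks = 50, freq = FALSE)
curve(f, from = 0, to = max(x), add = TRUE, lwd = 2)
integrate(f, 0, Inf)   # numerically 1, as a density should be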

+

+
+

Moschopoulos carries this idea one step further by expanding the cf of the sum into an infinite series of Gamma characteristic functions whenever one or more of the $n_i$ is non-integral, and then terminates the infinite series at a point where it is reasonably well approximated.

+",2013-10-10 20:58:56.470 +57257,10570.0,2,,55043.0,,,,CC BY-SA 4.0,"

Your question is not really possible to answer unless you have additional information about the situation you are applying this to.

+

Indistinguishable situations

+

For the purposes of this, we'll assume that $X$, $Y$, and $Z$ are 0-mean multivariate normal distributions in $\mathbb{R}^d$, and we're interested in one or more spectrum $\sigma_i$ (a vector of size $d$ with decreasing values, yada yada). I refer to the components of the spectrum as eigenvalues, without specifying that they're the eigenvalues of the covariance matrix.

+
  1. The true distribution is $X$ which has spectrum $\sigma_X$ with all non-zero values. There is no error, and we draw a large number of samples, estimating everything very accurately. Clearly all of the "small" eigenvalues still have "information" and aren't noise.

  2. The true distribution is $Y$ which has a spectrum $\sigma_Y$ with only 3 non-zero eigenvalues. There's noise, though, so we measure $Y+Z$, where $\sigma_Z$ does have all non-zero eigenvalues. Let's suppose $Y$ and $Z$ are such that $\sigma_{Y+Z} = \sigma_X$. Here, it's obvious that all but the top 3 eigenvalues are "merely noise".
+

My point is just that which parts of the spectrum can be attributed to "noise" is not a property of the sample.

+

External criteria

+

There potentially are external criteria that can help you distinguish the above situations, but they're sort of problem specific. For instance, in the Netflix Challenge, a very successful technique for predicting movie ratings was based on SVD (which is also the basis of PCA). When using SVD-based algorithms for a prediction task, one is confronted with the same challenge you have: "How many non-zero components do I consider? How far do I reduce the dimensionality?" The answer is basically cross validation. The more components you consider, the lower your training error is, but the more risk of overfitting. The validation error is a proxy for generalization error. So, you generally get a chart like:

+

+

If you're not doing a predictive problem, I don't really have useful advice, but I do imagine there might be something you want to measure that can help you define what it means for something to be "signal" vs "noise" in your application.

+",2013-10-10 21:11:46.330 +57258,6162.0,2,,57223.0,,,,CC BY-SA 3.0,"

For information, the random-effect model given by @Henrik:

+ +
> f <- function(x) sqrt(x)
+> library(lme4)
+> ( fit1 <- lmer(f(Value) ~ Group + (1|Subject), data=dat) )
+Linear mixed model fit by REML ['lmerMod']
+Formula: f(Value) ~ Group + (1 | Subject) 
+   Data: dat 
+REML criterion at convergence: 296.3579 
+Random effects:
+ Groups   Name        Std.Dev.
+ Subject  (Intercept) 0.5336  
+ Residual             0.8673  
+Number of obs: 108, groups: Subject, 18
+Fixed Effects:
+(Intercept)       Group2       Group3  
+    3.03718     -0.07541      1.11886  
+
+ +

is equivalent to a generalized least-squares model with an exchangeable correlation structure for subjects:

+ +
> library(nlme)
+> fit2 <-  gls(f(Value) ~ Group, data=dat, na.action=na.omit, correlation=corCompSymm(form= ~  1 | Subject)) 
+
+ +

The fitted variance matrix is then:

+ +
> getVarCov(fit2)
+Marginal variance covariance matrix
+        [,1]    [,2]    [,3]    [,4]    [,5]    [,6]
+[1,] 1.03690 0.28471 0.28471 0.28471 0.28471 0.28471
+[2,] 0.28471 1.03690 0.28471 0.28471 0.28471 0.28471
+[3,] 0.28471 0.28471 1.03690 0.28471 0.28471 0.28471
+[4,] 0.28471 0.28471 0.28471 1.03690 0.28471 0.28471
+[5,] 0.28471 0.28471 0.28471 0.28471 1.03690 0.28471
+[6,] 0.28471 0.28471 0.28471 0.28471 0.28471 1.03690
+  Standard Deviations: 1.0183 1.0183 1.0183 1.0183 1.0183 1.0183 
+
+ +

As you can see, the diagonal entry corresponds to the total variance in the first model:

+ +
> VarCorr(fit1)
+ Groups   Name        Std.Dev.
+ Subject  (Intercept) 0.53358 
+ Residual             0.86731 
+> 0.53358^2+0.86731^2
+[1] 1.036934
+
+ +

and the covariance corresponds to the between-subject variance:

+ +
> 0.53358^2
+[1] 0.2847076
+
+ +

Actually the gls model is more general because it allows a negative covariance. The advantage of nlme is that you can more generally use other repeated correlation structures and also you can specify different variances per group with the weights argument.

+ +

I think that the residuals are different because they are constructed with the random effects in the first model. In order to get multiple comparisons you can use the lsmeans and multcomp packages, but the $p$-values of the hypothesis tests are anticonservative with the default (too high) degrees of freedom. Unfortunately, the pbkrtest package does not apply to gls/lme models.

+",2013-10-10 21:31:42.880 +57259,6162.0,2,,57223.0,,,,CC BY-SA 3.0,"

Now, try to write down the model: $y_{ijk} = ...$ where $y_{ijk}$ is the $k$-th value for individual $j$ of group $i$. Then look at what happens for the means $\bar y_{ij\bullet}$: you get a classical Gaussian linear model, with variance homogeneity because there are $6$ repeated measures for each subject:

+ +
> xtabs(~Group+Subject, data=dat)
+     Subject
+Group 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
+    1 6 6 6 6 6 6 6 0 0  0  0  0  0  0  0  0  0  0
+    2 0 0 0 0 0 0 0 6 6  6  6  6  6  6  6  0  0  0
+    3 0 0 0 0 0 0 0 0 0  0  0  0  0  0  0  6  6  6
+
+ +

Thus, since you are interested in mean comparisons only, you don't need to resort to a random-effect or generalised least-squares model - just use a classical (fixed effects) model using the means $\bar y_{ij\bullet}$ as the observations:

+ +
tdat <- transform(dat, tvalue=f(Value))
+dd <- aggregate(tvalue~Group+Subject, data=tdat, FUN=mean)
+fit3 <- lm(tvalue~Group, data=dd)
+
+ +

I think this approach always works correctly when we average the data over the levels of a random effect (I show on my blog how this fails in an example with a fixed effect).

+ +

The ANOVA provides the same answer as @Henrik's approach (and that shows that the Kenward-Roger approximation is correct):

+ +
> anova(fit3)
+Analysis of Variance Table
+
+Response: tvalue
+          Df Sum Sq Mean Sq F value  Pr(>F)  
+Group      2 3.3799 1.68994   4.121 0.03747 *
+
+ +

Then you can use TukeyHSD() or the lsmeans package for multiple comparisons:

+ +
> TukeyHSD(aov(fit3), ""Group"")
+  Tukey multiple comparisons of means
+    95% family-wise confidence level
+
+Fit: aov(formula = fit3)
+
+$Group
+           diff         lwr       upr     p adj
+2-1 -0.07541248 -0.93627828 0.7854533 0.9719148
+3-1  1.11885667 -0.02896441 2.2666777 0.0565628
+3-2  1.19426915  0.06817536 2.3203629 0.0370434
+
+> library(lsmeans)
+> lsmeans(fit3, pairwise~Group)
+
+$`Group pairwise differences`
+         estimate        SE df  t.ratio p.value
+1 - 2  0.07541248 0.3314247 15  0.22754 0.97191
+1 - 3 -1.11885667 0.4418996 15 -2.53193 0.05656
+2 - 3 -1.19426915 0.4335348 15 -2.75472 0.03704
+    p values are adjusted using the tukey method for 3 means 
+
+",2013-10-10 21:54:03.493 +57260,4779.0,2,,57065.0,,,,CC BY-SA 3.0,"

Looking at this as an outlier problem seems wrong to me. If ""< 10% of users spend at all"", you need to model that aspect. Tobit or Heckman regression would be two possibilities.
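For instance, a Tobit-type fit with left-censoring at zero might look like the following sketch (dat, spend, x1 and x2 are placeholders; as far as I know, AER's tobit() and the sampleSelection package are common alternatives for Tobit and Heckman models):

library(survival)
# spend == 0 is treated as left-censored at zero
fit <- survreg(Surv(spend, spend > 0, type = "left") ~ x1 + x2,
               data = dat, dist = "gaussian")
summary(fit)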

+",2013-10-10 21:55:57.283 +57267,22591.0,1,,,,Extract important features,,CC BY-SA 3.0,"

Here is my situation:

- a huge amount of data
- 600 features
- only one class is provided

Now, my question is: how can I reduce the number of features to the important ones? In other words, all of these features (with the data) are intended to predict only one class, but some of the features have a larger impact on the prediction (i.e., their variation corresponds to a higher probability).

+",2013-10-10 23:28:22.503 +57268,18040.0,1,,,,Difference between two mixed effects models,,CC BY-SA 3.0,"

I have a question about how to tell two different mixed effects models apart. In the simple case both involve fitting a model with a random group effect and a covariate. I fit the model with lme4 in R. Here is a visualization of the two different scenarios.
+

+ +
library(ggplot2)
+library(lme4)
+gen_dat2 <- function(group.m,group.v,int, sl,n){
+      x <- vector()
+      y <- vector()
+      g <- vector()
+         for(i in 1:length(group.m)){
+         x.t <- rnorm(n,group.m[i],group.v[i])
+         y.t <- rnorm(n,group.m[i],group.v[i])+int + sl*x.t 
+         x <- c(x,x.t)
+         y <- c(y,y.t)
+         g <- c(g,rep(i,n))
+        }
+     return(cbind(x,y,g))
+}
+
+group.m <- runif(5,1,20)
+group.v <- runif(5,1,1.5)
+
+dat2 <- data.frame(gen_dat2(group.m,group.v,1,4,14))
+ggplot(dat2,aes(x=x,y=y,colour=as.factor(g),group=g))+geom_point()+stat_smooth(method=""lm"",se=F)
+m2 <- lmer(y~x + (x|g),data=dat2)
+
+ +

Then I can generate and fit the other scenario with similar code:

+ +

+ +
 gen_dat <- function(group.m,group.v,int, sl,n){
+      x <- vector()
+      y <- vector()
+      g <- vector()
+         for(i in 1:length(group.m)){
+         x.t <- rnorm(n,0,1)
+         y.t <- rnorm(n,group.m[i],group.v[i])+int + sl*x.t 
+         x <- c(x,x.t)
+         y <- c(y,y.t)
+         g <- c(g,rep(i,n))
+        }
+     return(cbind(x,y,g))
+}
+
+group.m <- runif(5,1,20)
+group.v <- runif(5,1,1.5)
+
+dat1 <- data.frame(gen_dat(group.m,group.v,1,4,14))
+ggplot(dat1,aes(x=x,y=y,colour=as.factor(g),group=g))+geom_point()+stat_smooth(method=""lm"",se=F)
+m1 <- lmer(y~x + (x|g),data=dat1)
+
+ +

My central question is how do I tell these two models apart? Am I incorrectly fitting the first one, and I need an extra term in there to model the relationships between groups and the x variable as well as y? Both detect substantial between group variation in the intercept and not much in the slope as I would predict. But I need a way to tell these two apart. Any thoughts would be helpful.

+ +
+ +

Edits:

+ +

This has been helpful in me restating the question. So I want to re-ask the question with an example which I hope will make it clear why I want to be able to tell these two models apart. Let's imagine that Y is the average student test score at a school, and X is spending per student in that school. Our grouping variables are 5 different school districts.

+ +

Data in the top figure shows that an increase in spending within a district means that test scores increase. It also shows that between districts there are differences in scores, but that's clearly because some districts spend more per student than others.

+ +

Data in the second figure show similarly that within a district student scores increase as spending increases. It also shows that between districts there are differences in test scores. However we don't know what is driving those differences, unlike in the first set of data. This is a pretty common situation I've encountered in building models. The former is not.

+ +

So what I'm asking is what is the appropriate model that captures the following features from the first dataset:

+ +
  1. Test scores increase as spending per student does.
  2. There is also variance between districts in student test scores.
  3. Part of that difference between districts is because of the underlying relationship between spending and test scores, which also varies with district.
+ +

More generally stated, how do you handle a scenario where you're building a hierarchical model where the grouping variable is correlated with one of your continuous independent variables (e.g. the first scenario). I feel like the model I've presented get's at points 1. and 2., but not point 3. So I'm really seeking a way to tease these two scenarios apart.

+ +

Normally I might add an extra level of hierarchy if there were another group-level explanatory variable. Continuing our example, maybe in the 2nd dataset there are differences between districts because in some districts parents have more time to spend on homework with students. So we would add that as a group-level predictor in a hierarchical model. But that wouldn't work in the first scenario.

+",2013-10-10 23:42:09.300 +57261,16703.0,1,,,,Dealing with 0 values when calculating the mle for a Dirichlet distribution,,CC BY-SA 3.0,"

I have $N$ pmfs, and for each of them $L$ samples. Each sample has a variable number of $x$ values, but the $x$ values that they have can be matched. So, for example:

+ +

$$sample_1 \rightarrow\ x_1 = 0, x_2 = 0, x_3 = 0.2, x_4 = 0.4, x_5 = 0.4$$ +$$sample_2 \rightarrow\ x_1 = 0.3,x_2=0, x_3 = 0.4, x_4 = 0.3,x_5=0$$

+ +

I'm using a python program to calculate the mle from the samples which is a port of Thomas P. Minka's Matlab Fastfit code (Estimating a Dirichlet Distribution).

+ +

The problem is that, for fitting, it sums over $\log p+\psi(\sum^k a_k) - \log p$. Since some of the $x$ values are 0, some $\log p$ values are $-\infty$. Therefore, summing over this makes everything $-\infty$.

+ +

How can I deal with 0 values when calculating the mle for a Dirichlet distribution?

+",2013-10-10 22:12:17.237 +57262,22587.0,1,,,,Poisson distribution vs multiplying probabilities,,CC BY-SA 3.0,"

I am a TA for a stats course for engineers, and I had a really good question from a student today, which I don't know the answer to.

+ +

We were going through the following word problem:

+ +

""4 computers run continuously for the Toronto Stock Exchange. The probability of a computer failure in a day is estimated at 5%. Assuming differing computers fail independently, what is the probability that all 4 computers fail in a day?""

+ +

Since the sampling takes place over an interval, the way I would approach this is using the Poisson distribution, with the average number of computers failing on a day $\equiv\lambda = 0.05$. If four computers fail, then $k = 4$. Thus, +\begin{align*} + P(k; \lambda) &= \frac{\lambda^{k} e^{-\lambda}}{k!} \\ + P(k=4; \lambda = 0.05) &= \frac{0.05^{4} e^{-0.05}}{4!} \\ + & = 2.477\times 10^{-7} +\end{align*}

+ +

However, a student asked why it would not be appropriate to just multiply the probability of each computer failing. Since the probability of each computer failing each day $\equiv p = 0.05$, and since each computer failure is independent, he argued that,

+ +

\begin{align*} + P(k=4) &= p^4 \\ + &= 0.05^4 = 6.25\times 10^{-6} +\end{align*}

+ +

Which one of these approaches is wrong given the question? And why? What underlying assumption of the wrong approach is violated by the question?

+ +

Thank you for your help.

+ +

UPDATE: I left out some information in the problem the first time this was posted, and I apologize.

+",2013-10-10 22:54:50.603 +57263,22585.0,1,,,,How to separate out the regression effect vs treatment effect without a control group?,,CC BY-SA 4.0,"

I'm looking at a dataset that has pre-post test measurements of users' stress, depression and anxiety levels collected from a website's online health assessment. On average, the healthier participants at baseline got worse over time, the sicker participants at baseline got much better, and the middle group got a little better. There's definitely a regression effect going on here, but also a treatment effect too.

+

As this data was collected based on website usage, there isn't really a control group (all of the "post" measurements come from people that have used the online program). There are probably ways that I could synthesize a control group using the people who I can guess didn't make much use out of the treatment (based on number of logins or length of time between logins), but is there a way to separate out the treatment effect from the regression effect when you can't use difference-in-difference techniques using a control group or anything like that?

+",2013-10-10 23:01:51.663 +57264,633.0,2,,57262.0,,,,CC BY-SA 3.0,"

The Poisson process that you're using assumes that 0.05 is the expected number of computers failing in one day in an unknown number of total computers (your answer also assumes that this rate is fixed after a computer fails, which implies that computers can fail multiple times, or are replaced immediately, or there are so many of them that this is negligible).

+ +

The independent probability that the student is using assumes that there are exactly four computers each of which has a 5% chance of failing.

+ +

The wording makes it sound to me like 5% is the chance of any individual computer failing (so the second interpretation). In that case, we want to know the total number of computers and apply a binomial distribution. Since the question doesn't give the total number of computers, it can't be answered.

+ +

Another possibility is that 5% is the probability that exactly one computer fails, and yet another possibility is that 5% is the probability that at least one computer fails. In either case you can deduce the Poisson process intensity that gives this value. For the first of these, I get 4.4997552907483822; for the second, I get an intensity of 0.051293294149203306. From there you could calculate similarly to how you did.

+ +
+ +

Per your update: You can eliminate the Poisson process since you don't have a fixed rate. You still have to decide whether 5% is the probability of a given computer failing, in which case the student is right. If it's the probability of at least one computer failing, or the probability of exactly one computer failing, you'll have to reason back from that number to the probability of any individual computer failing before reasoning forwards.
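A quick numeric comparison of the interpretations discussed above (base R only):

dbinom(4, size = 4, prob = 0.05)   # four computers, each fails with probability 0.05
0.05^4                             # the same number
dpois(4, lambda = 0.05)            # Poisson with mean 0.05 failures per day
dpois(4, lambda = -log(0.95))      # rate chosen so that P(at least one failure) = 0.05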

+",2013-10-10 23:04:27.343 +57265,2490.0,1,,,,Simple experimental design - should I counterbalance?,,CC BY-SA 3.0,"

I'm designing a pretty simple experiment that goes like this. Participants will be shown a series of stimuli and after viewing each one they will answer a few questions where they will make judgments about the stimulus - all Likert items. There are two kinds of stimuli. Probably obvious, but the hypothesis is that there will be a difference between answers for A vs B stimuli. There will be 30 or so stimuli, with an equal number of A and B stimuli. All participants will see all the stimuli (within-subjects).

+ +

I'm wondering if there would be a benefit to counterbalancing the order in which they receive the items, vs just showing everyone the same randomized sequence of stimuli (which is easier to setup).

+ +

If there's a better method I need to consider, I'd be interested in hearing about it. I also looked into blocking designs, but this is so simple that I don't think those apply here. I'm planning to analyze with t-tests or Mann-Whitney-Wilcoxon.

+",2013-10-10 23:12:54.873 +57266,18040.0,2,,52871.0,,,,CC BY-SA 3.0,"

I've run into this problem before. Oftentimes it's due to a problem with passing negative values to a distribution that doesn't allow them. So perhaps your log-normal or Wishart distributions are getting negative values somehow.

+ +

Another issue is that you are giving priors on your variance parameters that are quite big. This can cause unexpected values to be passed to your log-normal and Wishart variables. I'd start by severely constraining the variance priors and slowly expand the parameter space. In my experience, fitting BUGS models is both an art and a science.

+",2013-10-10 23:22:27.973 +57282,22601.0,1,57283.0,,,Generic test for unimodality given sample,,CC BY-SA 3.0,"

Are there any generic tests to validate if a given sample follows a unimodal distribution, like a Gaussian, Cauchy, Student's t or a chi-square?

+",2013-10-11 08:38:55.517 +57269,1741.0,2,,57156.0,,,,CC BY-SA 3.0,"

Given that your model exhibits good accuracy, you can just use it to predict the class labels of records in the unlabeled dataset. However, you cannot evaluate its performance on unlabeled data.

+ +

Be careful that you should assess the quality of your model on the labeled data by cross-validation. It is not enough to check the training error rate.

+ +

If your model is not accurate enough you might think about semi-supervised learning. The unlabeled data is used in order to improve the quality of your model via inductive learning. The accuracy should always be computed by cross-validation on your labeled data.

+ +

Have a look at [Criminisi et al., Decision Forests: A Unified Framework for Classification, Regression, Density Estimation, Manifold Learning and Semi-Supervised Learning], Chapter 7 about semi-supervised learning and Section 7.4 about induction with semi-supervised learning.

+",2013-10-11 00:17:06.150 +57270,22593.0,1,,,,Simulating groups different with respect to the orthogonal complement in R,,CC BY-SA 3.0,"

In a paper in the Journal of Chemometrics (Naes & Mevik 2001: Understanding the collinearity problem in regression and discriminant analysis), the authors propose to run simulations by creating two groups which differ with respect to the direction of the eigenvector with the smallest eigenvalue.

+ +
+

Here the groups are different with respect to the orthogonal complement to the five ‘NIR loadings’. This is achieved in the following way. The constant 0.18 is multiplied by a sixth loading vector (orthogonal to the other five) and added to group 2. Both groups had initially the same means as group 1.

+
+ +

How can I run such a simulation in R? The goal is to obtain group differences which are tied to the ""small eigenvectors"" space, and then to check whether the LDA results are better when using the whole dataset or when using only the last components as variables.

+ +
+ +

Edit
+I am trying to get the same results as these authors by simulation, but unfortunately...

+ +

I have tried to make two groups in a set of correlated data by adding differences between groups in the last eigen-vector (as it was suggested to me):

+ +

Here is a try:

+ +
require(MASS)
+R=matrix(0.9,10,10);
+diag(R)=1;
+random.normal=mvrnorm(n=20, rep(1,10), Sigma=R) ;
+V <- var(random.normal) ## vcv matrix
+U <- eigen(V)$vectors ## eigen vectors
+Y <- random.normal %*% U
+group=rep(1:2,each=10)
+Y10 <- Y[,10] ## temporary data
+Y10[group==1] <- Y10[group==1]+1 ## add 1 to group 1
+Y10 <- Y10*sqrt(var(Y[,10])/var(Y10)) ## scaling to the variance of Y[,10]
+Y[,10] <- Y10-mean(Y10)+mean(Y[,10]) ## center
+
+X2 <- Y %*% t(U) #data back transformed
+
+## Then I try LDA with MASS
+
+test=lda(X2,group,CV=T)
+tab=table(test$class,group)
+sum(diag(tab))/sum(tab)
+
+ +

But unfortunately I never get the poor classification results that Naes & Mevik (2001) found when I use the complete dataset in the LDA instead of only the last PC (it's the same with more variables). I find that results are even better with the first components (e.g. 1:5 for ten PCs):

+ +
test=lda(prcomp(X2)$x[,1:5],group,CV=T)
+    tab=table(test$class,group)
+sum(diag(tab))/sum(tab)
+
+ +

Any suggestions for reproducing the effect these authors describe? Thanks a lot.

+",2013-10-11 00:34:33.700 +57271,20473.0,2,,56768.0,,,,CC BY-SA 3.0,"

Write your system explicitly for time $t$ as (""$L$"" for ""loss"", as a positive quantity, and ""$G$"" for ""gain"") +$$ A_t - A_{t-1} = - L^A_{t} + G_{t}^{B\rightarrow A}+G_{t}^{C\rightarrow A}$$

+ +

$$ B_t - B_{t-1} = - L^B_{t} + G_{t}^{A\rightarrow B}+G_{t}^{C\rightarrow B}$$

+ +

$$ C_t - C_{t-1} = - L^C_{t} + G_{t}^{A\rightarrow C}+G_{t}^{B\rightarrow C}$$

+ +

The following three relations hold exactly: +$$ L^A_{t} = G_{t}^{A\rightarrow B} + G_{t}^{A\rightarrow C} $$ +$$ L^B_{t} = G_{t}^{B\rightarrow A} + G_{t}^{B\rightarrow C} $$ +$$ L^C_{t} = G_{t}^{C\rightarrow A} + G_{t}^{C\rightarrow B} $$

+ +

If you substitute in the first three you obtain

+ +

$$ A_t - A_{t-1} = - G_{t}^{A\rightarrow B} - G_{t}^{A\rightarrow C} + G_{t}^{B\rightarrow A}+G_{t}^{C\rightarrow A}$$

+ +

$$ B_t - B_{t-1} = - G_{t}^{B\rightarrow A} - G_{t}^{B\rightarrow C} + G_{t}^{A\rightarrow B}+G_{t}^{C\rightarrow B}$$

+ +

$$ C_t - C_{t-1} = - G_{t}^{C\rightarrow A} - G_{t}^{C\rightarrow B} + G_{t}^{A\rightarrow C}+G_{t}^{B\rightarrow C}$$

+ +

You have $6$ unknown quantities to estimate per time period. There is just not enough information to do that. So you need assumptions that will impose structure (=restrictions) on the situation, and will permit you to estimate something. What? Let's say you assume that there is a relatively stable ""churn"" from one company to another, as a linear function of their market share in the previous period. This assumption brings in a set of unknown coefficients to be estimated (which will then give you an estimate of ""hidden transfers of market share""). Write $G_{t}^{A\rightarrow B} = a_bA_{t-1}$ (market share lost from $A$ to $B$ as a linear function of $A$'s market share in period $t-1$). +Your equations will become

+ +

$$ A_t - A_{t-1} = - a_bA_{t-1} - a_cA_{t-1} + b_aB_{t-1}+c_aC_{t-1} $$

+ +

$$ B_t - B_{t-1} = - b_aB_{t-1} - b_cB_{t-1} + a_bA_{t-1}+c_bC_{t-1}$$

+ +

$$ C_t - C_{t-1} = - c_aC_{t-1} - c_bC_{t-1} + a_cA_{t-1}+ b_cB_{t-1}$$

+ +

We have turned a set of mathematical identities into a model. It is doubtful that this model will hold exactly for each $t$, so you should add a stochastic error term. Rearranging we obtain a first-order Vector Autoregression (VAR):

+ +

$$ \left[ \begin{matrix} +A_t \\ +B_t \\ +C_t \\ +\end{matrix} \right] = \left [\begin{matrix} +1-a_b-a_c & b_a & c_a \\ +a_b & 1-b_a-b_c & c_b \\ +a_c & b_c & 1-c_a-c_b \\ +\end{matrix} \right] \left[ \begin{matrix} +A_{t-1} \\ +B_{t-1} \\ +C_{t-1} \\ +\end{matrix} \right]+ \left[ \begin{matrix} +u^A_{t} \\ +u^B_{t} \\ +u^C_{t} \\ +\end{matrix} \right] $$

+ +

or, to homogenize notation,

+ +

$$ \left[ \begin{matrix} +A_t \\ +B_t \\ +C_t \\ +\end{matrix} \right] = \left [\begin{matrix} +\gamma_{11} & \gamma_{12} & \gamma_{13} \\ +\gamma_{21} & \gamma_{22} & \gamma_{23} \\ +\gamma_{31} & \gamma_{32} & \gamma_{33} \\ +\end{matrix} \right] \left[ \begin{matrix} +A_{t-1} \\ +B_{t-1} \\ +C_{t-1} \\ +\end{matrix} \right]+ \left[ \begin{matrix} +u^A_{t} \\ +u^B_{t} \\ +u^C_{t} \\ +\end{matrix} \right] $$

+ +

subject to the equality restrictions +$$ \begin{matrix} +\gamma_{11} + \gamma_{21} + \gamma_{31} =1 \\ +\gamma_{12} + \gamma_{22} + \gamma_{32} =1 \\ +\gamma_{13} + \gamma_{23} + \gamma_{33} =1 \\ +\end{matrix} $$

+ +

So you have essentially $6$ unknown coefficients and a sample of $T-1$ observations (for each company).
+Note that these restrictions imply the ""add up to unity"" restriction $A_t+B_t+C_t =1$ for each $t$, so this last one does not impose any additional structure on the unknown coefficients -but it does imply a relation between the error terms, namely that $u^A_{t} + u^B_{t} +u^C_{t} =0$. Any additional assumptions on the three error terms should either come from knowledge of the specific real world phenomenon under study, and/or through a statistical specification search.

+ +

Then, an estimation for a hidden transfer of market share will be, for example

+ +

$$\hat G_{t}^{A\rightarrow B} = \hat \gamma_{21}A_{t-1}$$

+ +

etc.

+ +
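As a rough illustration (my own sketch on simulated shares; it estimates the unrestricted VAR equation by equation with lm() and ignores the adding-up restrictions, which would require constrained or system estimation):

set.seed(1)
n_periods <- 60
S <- matrix(NA, n_periods, 3, dimnames = list(NULL, c('A', 'B', 'C')))
S[1, ] <- c(0.5, 0.3, 0.2)
Gamma <- matrix(c(0.85, 0.10, 0.05,
                  0.10, 0.80, 0.10,
                  0.05, 0.05, 0.90), 3, 3)   # true coefficients, columns sum to one
for (t in 2:n_periods) {
  s <- Gamma %*% S[t - 1, ] + rnorm(3, 0, 0.01)
  S[t, ] <- s / sum(s)                       # keep the shares adding up to one
}
Y <- S[-1, ]                                 # shares at time t
X <- S[-n_periods, ]                         # shares at time t-1
fitB <- lm(Y[, 'B'] ~ 0 + X)                 # equation for B, no intercept
gamma21_hat <- coef(fitB)['XA']              # estimate of a_b
G_AtoB_hat  <- gamma21_hat * X[, 'A']        # estimated hidden transfer A -> B per period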

Of course you may find that such a model does not fit your data sample well - for example you expect that all estimated coefficients should be positive and smaller than or equal to unity, but the estimation procedure may not give you that. But this is what we do: we come up with specification hypotheses and we test them against the data - ""success"" is never guaranteed. Then you should try to come up with a different model.

+",2013-10-11 01:09:15.643 +57272,22595.0,2,,412.0,,,,CC BY-SA 3.0,"

The following are textbooks I used for my MSEE coursework and research, and I found them to be pretty good.

+ +
  1. Probability, Statistics and Random Processes for Engineers by Henry Stark and John W. Woods (detailed explanation of concepts, good for Communications and Signal Processing people).
  2. Schaum's Outline of Probability, Random Variables and Random Processes by Hwei Hsu (concise explanation of concepts, has a good number of solved examples).
+",2013-10-11 01:37:35.967 +57273,22596.0,1,,,,Difficulty with MCMC implementation,,CC BY-SA 3.0,"

I could really use some guided help! I'm having difficulty understanding an MCMC implementation in terms of modeling a data set. I'm working on generating parameters from stellar light curves, and was asked to look into implementing an MCMC algorithm. A large chunk of the code is written in Python, so I've been trying to use the emcee package to generate parameter fits. But going through the code, it's just not ""clicking"" how the method works.

+ +

I have a set of data (time vs flux) of two stars orbiting each other such that from our point of view, they eclipse. There are dips in the light curve to signify this. All I'm attempting to do is get the parameters of the system dependent on the characteristics of these dips.

+ +

In the emcee implementation, there are a few functions, as I understand them: the posterior function, which, I believe, simply generates a data set given the set of parameters. Then there's a prior function, which, I assume, is the function given a previous set of parameters. Somehow the algorithm chooses whether or not the jump to the posterior parameter set is to be made? I'm guessing that's what the likelihood function is used for? To decide whether or not to take the jump?

+ +

I apologize, I'm quite confused about how this is to be implemented in terms of a defined set of data.

+",2013-10-11 02:05:57.810 +57274,22594.0,1,,,,Is mixed measures ANOVA the correct test for my data?,,CC BY-SA 3.0,"

I'm fairly new to statistics and I'm still trying to figure out the best way to analyse the data I have. The experiment has 2 groups of participants who perform 2 repetitions of a task that consists of 5 stages. All participants completed both repetitions for all stages, but one group had 8 participants while the other group only had 6. I have about 100 dependent variables that I wish to examine, so my data looks a bit like this:

+ +
ID   Group    Repetition    Stage   DV1    DV2     ...
+1    A        1             1       212.9  179.9   ...
+1    A        2             1       144.8  134.7   ...
+2    B        1             1       146.3  156.8   ...
+2    B        2             1       128.6  178.2   ...
+
+ +

Group is a between-subjects factor while Repetition and Stage are within-subjects factors. I would like to determine whether Group and Repetition have a significant effect on each dependent variable within each stage (I am not interested in the effect of stage itself). I'm doing the analysis in R so I have the following code:

+ +
options(contrasts=c(""contr.sum"",""contr.poly""))
+mydata            = read.csv(""data.csv"",header=TRUE)
+mydata$Group      = factor(mydata$Group)
+mydata$Repetition = factor(mydata$Repetition)
+mydata$Stage      = factor(mydata$Stage)
+# for each stage
+mydata = mydata[mydata$Stage==1,]
+for (i in 5:(ncol(mydata))) 
+{
+   fit = aov(formula=as.formula(paste(names(mydata)[i], 
+                                ""~ Group * Repetition + Error(ID/Repetition)"")), 
+             data=mydata)
+}
+
+ +

My questions are:

+ +
  1. Is mixed measures ANOVA a valid test for this data? What's the correct way to test whether my data fits the assumptions of ANOVA in R? If this is not a reliable test, what's a possible alternative?
  2. Have I defined the mixed measures ANOVA in R correctly? The various tutorials I've read define it in different ways so I'm a bit confused.
+",2013-10-11 02:25:06.173 +57275,22598.0,1,,,,Training one class SVM using LibSVM,,CC BY-SA 3.0,"

I hope to use the one-class SVM of LIBSVM to train on a set of training samples so as to obtain a model. Then, I will use the model to predict whether new test data is of the same type as the training data or not. In the training process, I have some questions, as follows:

+ +
  • Should the training samples all be positive examples or not?
  • Which kernel function gets better results, the linear kernel or the RBF kernel?
  • What is the effect of nu's value on the model?
+",2013-10-11 02:49:49.953 +57284,,1,,,user14650,How to set confidence level for wilcoxsign_test (package coin)?,,CC BY-SA 3.0,"

In R, the function wilcox.test takes the argument conf.level = 0.95 (for example). Giving the same argument to the function wilcoxsign_test from the coin package returns a warning:

+ +
additional arguments conf.level will be ignored
+
+ +

What default confidence level does wilcoxsign_test use, and how can I change it?

+ +

Or : Why do I not need a confidence level for this function?

+",2013-10-11 09:06:48.623 +57276,22310.0,1,,,,"Path analysis, sample sizes, and alternative analysis",,CC BY-SA 3.0,"

I am examining how English ivy affects the occurrence of a salamander species under cover objects (e.g., logs). Soil moisture is assumed to be the major factor that affects their occurrence.

+

My hypothesized pathway: The presence/absence of salamanders under cover objects is either a direct consequence of changes in the ivy-induced abiotic environment (i.e., drier soil) or an indirect result of changes in the prey community that resulted from altered abiotic factors. But there are multiple factors, other than English ivy, that affect soil moisture.

+

+

My questions are:

+
  1. I think that a path analysis is most suitable for testing my causal mechanisms. But, given a small sample size (n = 71), is a path analysis appropriate?
  2. Another potential problem for a path analysis is that the effects of English ivy on soil moisture seem to depend on the other factors (e.g., the number of overstory trees), as shown below. Is there any way to account for such patterns in a path analysis?
  3. Are there any other analyses suitable for testing my hypothesized relationships? I am considering multiple (linear and logistic) regressions, but again my sample size is small AND regressions do not reflect my hypothesized causal relationships accurately.
+

I am using R, so any recommended code would be greatly helpful (I am a relatively new R user, though).

+",2013-10-11 02:51:27.387 +57277,20603.0,2,,57275.0,,,,CC BY-SA 3.0,"
+

Should the training samples all be positive examples or not?

+
+ +

Yes, in a one-class SVM (and any other outlier detection algorithm) you need just one class. Whether it is positive or negative depends on your naming convention, but it is more probable that you will be looking for positive examples, which are underrepresented.

+ +
+

Which kernel function can get better result, linear kernel or RBF kernel?

+
+ +

""There is no free lunch"". There is no general answer, the reason behind having many kernels (not just linear and rbf) is that they work well in different applications. It is data dependant decision, so you will have to test at least those two.

+ +
+

What is the effect of nu's values to the model?

+
+ +

It corresponds to bounds on the fraction of points becoming support vectors, so it limits the model's complexity (the smaller the number of SVs, the simpler the model and the less prone it is to overfitting, yet the more prone to underfitting). As in the http://www.cms.livjm.ac.uk/library/archive/Grid%20Computing/NoveltyDetection/sch00support.pdf paper, it directly corresponds to the following (see also the small sketch after the list):

+ +
  • ""an upper bound on the fraction of outliers""
  • ""a lower bound on the fraction of SVs"".
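
A small sketch of this (my own addition, using the e1071 R interface to LIBSVM rather than the command-line tools, on simulated one-class data), showing how nu changes the number of support vectors:

library(e1071)
set.seed(1)
train <- matrix(rnorm(200 * 2), ncol = 2)    # simulated one-class training data
for (nu in c(0.01, 0.1, 0.5)) {
  fit <- svm(train, y = NULL, type = 'one-classification',
             kernel = 'radial', nu = nu)
  cat('nu =', nu, '-> number of SVs =', nrow(fit$SV), '\n')
}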
+",2013-10-11 05:28:45.147 +57278,20144.0,1,,,,"Exponential family parameter estimation and fitting, references",,CC BY-SA 3.0,"

First of all, I want to express my apologies if the question is too broad or wrong, but I am in need of references and I have no idea whom I can ask.

+ +

If you are interested, the question comes from a model I built; you can see some details here and here. In this model I have: $$f(\mathbb{x}|T,\mu)=\frac{h(\mathbb{x})e^{-\frac{E(\mathbb{x})}{kT}+\mu N(x)}}{\mathcal{Z}(T,\mu)}$$

+ +

There, my parameters are $\mu$ and $T$, and $\mathbb{x}=(x_1,\dots,x_M)$ where $x_i\in\{0,1\}$ and I have the restriction $\forall i\in\{1,\dots,M-D+1\}$ +$$\sum_{j=0}^{D-1} x_{i+j} \leq 1$$ +This is, $h(\mathbb{x})=0$ if that condition is not held.

+ +

I have the ""small"" inconvenience of not knowing $\mathcal{Z}(T,\mu)$, so I used a MCMC (Metropolis-Hastings) method to approximate this function. However I face two problems.

+ +
  • The first of them regards the simulation and the model, and I am working on solving it (it depends too much on the initial condition).
  • The second is that these parameters are not fully known and I have no idea how I can estimate them. I have been reading about Bayesian inference and I know a bit of estimation theory, but I am no expert (furthermore I don't know if not knowing the partition function can affect the result). If any of you were able to give me some clue in the form of a book that I can read, I would be eternally grateful.
+ +

Thank you very much for your help.

+ +

Thanks to cardinal's comment, I have realized that I didn't explain one thing. It probably makes everything more complex, but here it goes: the idea is that $E$ is known in each experiment; actually $E(\mathbf{x}) = \mathbf{E}\cdot\mathbf{x}$. However, $\mathbf{E}$ is not always the same; it represents an external potential for some particles. The ""good"" thing is that $T$, which accounts for the temperature, never changes whatever $\mathbf{E}$ is, so I thought that I could find a way of estimating it, given that I have an empirical distribution of $x_{i}$ (so, a probability that a particle is in position $i$) for a given $\mathbf{E}$. So, in a way, what I have is $$f(\mathbf{x}|T,\mu , \mathbf{E}),$$ but I always know $\mathbf{E}$ and I know (can I say this?) that $T,\mu$ are independent of $\mathbf{E}$. I am sorry for not being clear enough before. I am starting to think that none of this makes sense...

+",2013-10-11 06:43:41.090 +57279,22262.0,1,,,,Variable selection with groups of predictors that are highly correlated,,CC BY-SA 3.0,"

What variable selection approach should I consider if I have thousands of predictors with clusters that are extremely correlated?

+ +

For example I might have a predictor set $X:= \{A_1,A_2,A_3,A_4,...,A_{39},B_1,B_2,...,B_{44},C_1,C_2,...\}$ with cardinality $|X| > 2000$. Consider the case where all $\rho(A_i,A_j)$ are very high, and similarly for $B$, $C$, ....

+ +

Correlated predictors aren't correlated ""naturally""; it's a result of the feature engineering process. This is because all $A_i$ are hand engineered from the same underlying data with small variations in hand-engineering methodology, e.g. I use a thinner pass band on $A_2$ than I did for $A_1$ in my denoising approach but everything else is the same.

+ +

My goal is to improve out of sample accuracy in my classification model.

+ +

One approach would just be to try everything: non-negative garrote, ridge, lasso, elastic nets, random subspace learning, PCA/manifold learning, least angle regression, and pick the one that performs best on my out-of-sample dataset. But specific methods that are good at dealing with the above would be appreciated.

+ +

Note that my out of sample data is extensive in terms of sample size.

+",2013-10-11 06:57:56.613 +57280,22600.0,1,57281.0,,,Binomial Conditional Probability of a an event,,CC BY-SA 3.0,"

Determining the Binomial Conditional Probability of a Random Sample

+ +

I have a question about binomial probability involving a conditional event. This problem keeps tripping me up because, while I know how to calculate the binomial probability that a random variable is a failure, I don't know how to calculate the conditional probability of that variable.

+ +
+ +

My question is as follows:

+ +

70% of the total shipments come from factory A, of which 10% are defective.

+ +

30% of the total shipments come from factory B, of which 5% are defective.

+ +

A random shipment comes in, and a sample of 20 pints is taken, and 1 of the pints is defective.

+ +

What is the probability that this shipment came from Factory A?

+",2013-10-11 07:24:30.003 +57281,9074.0,2,,57280.0,,,,CC BY-SA 3.0,"

Edit 1: didn't pay attention to the question. Will edit answer later today.

+ +

Edit 2: I've attempted to provide an answer below, however I might be mistaken. Feel free to correct me if I am in error.

+ +

$P(\text{1 defective}\mid A) \approx 0.270, \quad P(\text{1 defective}\mid B) \approx 0.377 \\ P(A) = 0.7, \quad P(B) = 0.3 \\ P(1D) = 0.7 \times 0.270 + 0.3 \times 0.377 = 0.189 + 0.113 = 0.302 \\ P(A\mid 1D) = P(1D\mid A)\,P(A)/P(1D) = 0.270 \times 0.7/0.302 \approx 0.626$
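
A quick numerical check of these figures (my own sketch in R):

pA <- 0.7; pB <- 0.3
p1dA <- dbinom(1, size = 20, prob = 0.10)   # P(1 defective in 20 | A), about 0.270
p1dB <- dbinom(1, size = 20, prob = 0.05)   # P(1 defective in 20 | B), about 0.377
p1d  <- pA * p1dA + pB * p1dB               # total probability, about 0.302
pA * p1dA / p1d                             # posterior P(A | 1 defective), about 0.626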

+",2013-10-11 07:39:11.260 +57285,221.0,2,,56372.0,,,,CC BY-SA 3.0,"

The topic is called Association Rule Learning, which is one of the most basic (and rather old-fashioned) ways to build a recommender system. The most widely known algorithms are called Apriori and FP-Growth. Every good book about Data Mining should contain a chapter about it.

+ +
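As a small illustration (my own addition; the arules package is one R implementation of Apriori, and the confidence of a rule is exactly the conditional probability discussed below):

library(arules)
# Toy transactions: each element is one user's basket
baskets <- list(c('A', 'B'), c('B'), c('A', 'B', 'C'), c('B', 'C'), c('A'))
trans   <- as(baskets, 'transactions')
rules   <- apriori(trans, parameter = list(supp = 0.2, conf = 0.5, minlen = 2))
inspect(rules)   # confidence of {B} => {A} equals count(A and B) / count(B)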

However, the formula seems to be wrong.

+ +

$P(A|B)$ means Probability of A given B, so

+ +
P(A|B)=count_users(bought(A,B)) / count_users(bought_B)
+
+ +

is correct.

+ +

Furthermore, the mentioned algorithms do not take into account something like $P(\neg A|B)$, because the fact that a user has not bought A could have multiple meanings

+ +
  • the user does not like A
  • the user does not know that A exists or is sold here
  • the user did not buy A although he likes it, for one of the thousand seemingly arbitrary motivators of human behavior.
+ +

because not buying something is an implicit preference. If the user had stated explicitly that he does not like A (maybe in a survey), it would be called an explicit preference. In the case of implicit negative preferences, the negative preferences are often excluded from the model.

+ +

If explicit preferences are given, the overall formula $\frac{P(A|B)}{P(\neg A|B)}$ would make sense and represent the Odds Ratio.

+",2013-10-11 09:40:49.687 +57286,503.0,2,,57284.0,,,,CC BY-SA 3.0,"

You don't need to (and in fact can't) set a confidence level in wilcoxsign_test because the output of the function includes a p-value, e.g. the first example in the help file for the function:

+ +
RoundingTimes <- data.frame(
+  times = c(5.40, 5.50, 5.55,
+            5.85, 5.70, 5.75,
+            5.20, 5.60, 5.50,
+            5.55, 5.50, 5.40,
+            5.90, 5.85, 5.70,
+            5.45, 5.55, 5.60,
+            5.40, 5.40, 5.35,
+            5.45, 5.50, 5.35,
+            5.25, 5.15, 5.00,
+            5.85, 5.80, 5.70,
+            5.25, 5.20, 5.10,
+            5.65, 5.55, 5.45,
+            5.60, 5.35, 5.45,
+            5.05, 5.00, 4.95,
+            5.50, 5.50, 5.40,
+            5.45, 5.55, 5.50,
+            5.55, 5.55, 5.35,
+            5.45, 5.50, 5.55,
+            5.50, 5.45, 5.25,
+            5.65, 5.60, 5.40,
+            5.70, 5.65, 5.55,
+            6.30, 6.30, 6.25),
+  methods = factor(rep(c(""Round Out"", ""Narrow Angle"", ""Wide Angle""), 22)),
+  block = factor(rep(1:22, rep(3, 22))))
+
+### classical global test
+friedman_test(times ~ methods | block, data = RoundingTimes)
+
+ +

gives as output

+ +
Asymptotic Friedman Test
+
+data:  times by
+     methods (Narrow Angle, Round Out, Wide Angle) 
+     stratified by block
+chi-squared = 11.1429, df = 2, p-value =  0.003805
+
+ +

so, since p = 0.0038, you know it is significant at p = 0.05 (and, indeed, much below that).

+",2013-10-11 10:26:57.610 +57287,21624.0,1,,,,How to decide bootstrap number of runs?,,CC BY-SA 3.0,"

I am using bootstrap for my simulation.

+ +

The population size is flexible for each case, and the sample size is decided by a certain percentage. For example, I have a population of 10,000, and I decide to use 10% for each bootstrap iteration, so the sample size is 1,000.

+ +

In practice, I find it hard to decide how many bootstrap runs are enough. With too few simulations the results appear insufficient, while with a very large number of simulations they become purely redundant.

+ +

May I know if there is a method that can help me to decide the number of iterations to run?

+",2013-10-11 10:37:53.160 +57288,19436.0,1,,,,Reversing Chebyshev inequality argument,,CC BY-SA 3.0,"

One way one could state Chebyshev's inequality is

+ +
+

The probability that a realization deviates from the mean more + than $k$ standard deviations is at most $\frac{1}{k^2}$.

+
+ +

My question is: can one rigorously reverse this logic and make a statement about the probability that the actual mean is close to the observation? One immediate technical problem is that one needs to define a probability space on possible probability distributions/means.

+ +

I'm asking because I think this type of argument (although slightly more convoluted) underlies Vapnik-Chervonenkis theory. In their textbooks this issue is not discussed at all. They prove a large deviation principle and then simply invert all their inequalities. How does this work? Does it?

+",2013-10-11 10:55:26.637 +57289,20410.0,5,,,,,,CC BY-SA 3.0,"

Canonical correlation analysis (CCA) is a multivariate statistical technique that analyzes two sets of variables and looks for correlations between them. CCA finds linear combinations of variables in each set such that their correlation is maximal. These two linear combinations form one pair of ""canonical variates"". CCA then proceeds to find subsequent pairs, constrained to be uncorrelated with the previous ones.
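
A tiny illustration (an addition to this summary; base R's cancor() on simulated data):

set.seed(1)
X  <- matrix(rnorm(100 * 3), 100, 3)          # first set of variables
Y  <- cbind(X[, 1] + rnorm(100), rnorm(100))  # second set, partly related to X
cc <- cancor(X, Y)
cc$cor     # canonical correlations, largest first
cc$xcoef   # coefficients defining the canonical variates of the first set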

+",2013-10-11 11:26:16.130 +57290,2081.0,4,,,,,,CC BY-SA 3.0,Canonical correlation analysis (CCA) is a method to analyze correlations between two sets of variables. It finds linear combinations of variables in each set such that their correlation is maximal.,2013-10-11 11:26:16.130 +57291,22262.0,1,57294.0,,,Function to find the quantile in a vector corresponding to constant $x$,,CC BY-SA 3.0,"

Suppose I have a constant x=0.1 in the language R and I have a vector vec = rnorm(200). Is there a pre-packaged function to find the quantile of vec that corresponds most closely to x?

+ +

A solution is as follows:

+ +
x = 0.1
+vec = rnorm(100)
+percentiles = quantile(vec,seq(0,1,by=0.01))
+which(abs(x-percentiles)==min(abs(x-percentiles))) 
+#returns closest match
+
+ +

... but I would like to know if there's a pre-packaged function.

+",2013-10-11 11:39:19.043 +57292,19125.0,2,,48597.0,,,,CC BY-SA 3.0,"

Your supervisor may very well be right and the small sample size is the problem. You might want to do a bit of reading on Power Analysis. An introductory paper is that by Cohen (1992).

+ +

In short, there is a relation between sample size, effect size and power (which is the probability that the test detects a significant effect, assuming that there is one). For example, if you have an estimate of the effect size you're looking for (in your example the difference between the means of the two groups) and you want to obtain a statistically significant result regarding this effect with a certain error probability (the $\alpha$-level), then you can compute the size of the sample that is necessary. Generally, when you have two of these quantities, you can compute the third one.

+ +

The difficult part is probably to get an idea of the effect size before doing the analysis. After all, usually that is what one wants to find out about. An interesting discussion on this can be found on the Cognitive Sciences SE site.

+ +

One piece of free software to do power analysis is G*Power. There is also the pwr package for R.

+ +
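For instance, with the pwr package one can compute the required sample size for a two-sample t-test (a small sketch; the effect size d = 0.5 and the 80% power target are just assumed example values):

library(pwr)
# n per group needed to detect d = 0.5 at alpha = 0.05 with 80% power
pwr.t.test(d = 0.5, sig.level = 0.05, power = 0.80, type = 'two.sample')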

References:

+ +

+Cohen, J. (1992). A power primer. Psychological Bulletin, 112(1), 155. +

+",2013-10-11 11:48:49.933 +57293,18198.0,1,,,,Selecting optimal set of eigenvectors for Principal Components Regression,,CC BY-SA 3.0,"

I am testing various techniques for dealing with strong multi-collinearity (MC) in a regression problem.

+ +

There have been various comparison papers written between competing techniques such as Ridge Regression (RR) and Principal Components Regression (PCR). There seems to be no clear winner, though, with the best technique seemingly problem-specific. However, one thing that bothers me about the PCR approach is the somewhat arbitrary way in which one simply excludes the smallest eigenvectors; as has been proven by Hadi and Ling, even the smallest eigenvector may have strong predictive power while the largest eigenvectors may have none.

+ +
+

""Some Cautionary notes on the use of Principal Components + Regression"" by Hadi and Ling. (PDF)

+
+ +

They also show that the SSE can be vastly improved by adding seemingly insignificant eigenvectors.

+ +

In their discussion they highlight two papers that try to address this second deficiency--Lott (1973) and Gunst and Mason (1973)--but it has been shown that the Lott technique fails to pick the ""correct"" eigenvectors in the presence of strong MC, and my problem has strong MC.

+ +

Do you know of a paper that can select the optimum set of eigenvectors even in the presence of strong MC? Or more recent papers that compare PCR and RR?

+",2013-10-11 11:56:09.380 +57294,11489.0,2,,57291.0,,,,CC BY-SA 3.0,"

Yep, if you had bothered to read the manual of quantile, you would have found the function ecdf in the ""See Also"" section.

+ +
x <- 0.1
+vec <- rnorm( 100 )
+ecdf( vec )( x )
+# or
+my.ecdf <- ecdf( vec )
+my.ecdf( x )
+
+ +

ecdf is a function returning another function -- that, in turn, is the empirical (cumulative) distribution function of your sample.

+",2013-10-11 12:03:59.843 +57295,21762.0,2,,35249.0,,,,CC BY-SA 3.0,"

A data driven (and thus probably not so very good) approach

+ +

Calculate four correlation matrices: One for each layer and one for the pooled data (three lines per sample). If they all look quite similar, run a PCA based on the correlation matrix of the pooled sample and go on with the first few PCs.

+ +

Instead of comparing the four correlation matrices, you could also consider the four loading matrices of the corresponding PCAs and compare the loadings of the first few PCs. This is much easier if you have lots of variables.

+",2013-10-11 12:05:40.297 +57296,21398.0,1,,,,Selection probability weight,,CC BY-SA 3.0,"

I have a question on my selection probability weight. Is it a correct weight?

+ +

The research design: research areas were divided into strata according to size. Interviews were conducted: 50 batches of 10 interviews in each area, according to the relative size of the strata. Clusters were made for each stratum. In each cluster, batches of 10 interviews were sampled at fixed intervals. A random walk selected households, and within these households respondents were randomly chosen.

+ +

The selection probability weight: I had no population data on number of households. A selection probability weight was calculated for the within-household selection for each stratum. In each stratum, a weight was calculated and normalized so that the sum of the weights is 500 for each research area. The size of the eventual stratum was divided by the number of people in the stratum eligible for the survey. The result of this calculation was then multiplied by the number of eligible respondents in the household.

+",2013-10-11 12:11:24.820 +57297,18198.0,1,,,,Testing whether two Eigen decompositions are equal,,CC BY-SA 3.0,"

I have an eigen decomposition of a 30-variable covariance matrix calculated using 5y of daily data and would like to compare it to a different 5y period to see if the eigenvalues are the same. Obviously they will not be exactly the same due to noise in the signal, but can I test statistically whether they are the same?

+ +

""An asymptotic chi-square test for the equality of two correlation matrices"" by R. Jennrich

+ +

The closest match I have found is a paper that tests the equivalence of two correlation matrices, but as I am working in the eigenvector space I would prefer a test that is performed on the eigenvectors (plus the paper is quite old).

+ +

Also, on a similar topic, what is the minimum length of time I can run a PCA over for 30 variables of daily data? Clearly, if I can generate more eigenvector decompositions to compare, I can be more confident in my results.

+",2013-10-11 12:13:44.087 +57298,,1,57301.0,,user30602,Different answers for probability density function and cumulative density function,,CC BY-SA 3.0,"

I have a function $f(x)=2ae^{-ax}(1-e^{-ax})$, for $x>0, a>0$. This is a pdf. I need to find $P(X>1)$. I have done all my work in such a way that I should get the same answer whether I use the pdf or the cdf to find this probability. However, I'm getting different answers. Can someone please help me?

+ +

My attempt:

+ +

(using pdf) $P(X>1)=\int_1^{\infty}2ae^{-ax}(1-e^{-ax})dx = 2e^{-a}-e^{-2a}$

+ +

(using cdf) $P(X>1)= 1-P(X\leq 1) = 1 - (F_X(1)) = 1-(e^{-ax}(e^{-ax}-2))|_{x=1}=1-2e^{-a}-e^{-2a}$

+ +

Why are my answers different? Thanks!

+",2013-10-11 12:36:44.077 +57299,19395.0,1,57300.0,,,Before and after data: Which test for average comparison of Likert scale data?,,CC BY-SA 3.0,"

I have one group of respondents which answer on a scale of 1-5 once before and once after an experiment. I want to see if the experiment made a difference to their responses.

+ +

I was told not to use a t-test because of the Likert scale (ordinal data does not seem to fit a t-test) and because my data are not nearly normally distributed (answers to the questions lean heavily toward the 1 end of the scale, which is not a mistake in the design).

+ +

I am not sure if the Wilcoxon signed-rank test works, because it seems to be designed for differences in groups (as in ""Do men respond differently from women?"").

+ +

Any suggestions on what could actually be used here?

+ +

(The answer here refers to a ""special paired t-test"", but does not explain which one)

+",2013-10-11 13:10:18.780 +57300,21762.0,2,,57299.0,,,,CC BY-SA 3.0,"

Wilcoxon's signed-rank test is usually a good choice in such a situation. It is the paired version of Wilcoxon's rank-sum test (aka the Mann-Whitney U test). I think you are mixing up these two procedures.

+ +

Make sure you use an exact/pseudo-exact implementation of the test to account for the highly discrete distribution.

+ +

EDIT: How you do it in R for x (pre) and y (post)

+ +
library(coin)
+set.seed(2)
+x <- sample(1:2, 20, T)
+y <- sample(2:3, 20, T)
+
+#Basic R gives p value of 0.0007167
+wilcox.test(x-y)                    
+
+#Coin gives p value of 0.0001221
+wilcoxsign_test(x~y, distribution = exact())
+
+",2013-10-11 13:17:25.363 +57301,22059.0,2,,57298.0,,,,CC BY-SA 3.0,"

It looks like a simple calculation error:

+ +

\begin{align} +F_X(y)&=\int_{x=0}^y 2ae^{-ax}(1-e^{-ax})\\ +&=e^{-ax}(e^{-ax}-2)|_{x=0}^y\\ +&=e^{-2ay}-2e^{-ay}+1. +\end{align}
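
A quick numerical check (my own addition, with an arbitrary value a = 2):

a <- 2
f <- function(x) 2 * a * exp(-a * x) * (1 - exp(-a * x))   # the pdf
integrate(f, lower = 1, upper = Inf)$value   # about 0.2524
2 * exp(-a) - exp(-2 * a)                    # 1 - F(1), also about 0.2524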

+",2013-10-11 13:22:52.120 +57302,750.0,2,,57268.0,,,,CC BY-SA 3.0,"

You don't need extra terms in the models; fewer, actually. It is plain to see from your plots, but if you look at summary(m2) you will see that the variance of the random effect for x is really small, and the variance of the random intercept is quite small as well.

+ +

Similarly for the m1 model, you can see from the plot that the slopes are all the same, but the intercept varies. You can use a likelihood ratio test to compare the model with only random intercepts against the model with random slopes and intercepts that you specified.

+ +
m1 <- lmer(y~x + (x|g),data=dat1)
+m1RInt <- lmer(y~x + (1|g),data=dat1)
+anova(m1,m1RInt)
+
+ +

Also, just looking at the variance estimates of the random intercepts and slopes from summary(m1), you would have come to the same conclusion that using random slopes adds nothing to the model.

+",2013-10-11 13:41:25.093 +57303,11506.0,1,,,,Quantitative results of cluster analysis,,CC BY-SA 3.0,"

Currently, I am doing clustering for two datasets. One smaller dataset (about 100 observations) has ground truth labels, and one larger dataset (about 2000 observations) has no ground truth labels.

+ +

For the smaller dataset, obviously, I can obtain quantitative results like accuracy, sensitivity and specificity.

+ +

However, for the larger dataset, I have no ground truth and couldn't get any useful quantitative results.

+ +
  1. The only thing I found useful is the 'mean silhouette value', which can measure the clustering performance. However, it is based on some distance measure and can only tell people how well separated the clusters are. I am wondering if there are other 'better' or 'more appropriate' quantitative analyses for data without labels (a small silhouette sketch follows this list).
  2. Because the data are without labels, I am also wondering if we can somehow have an 'uncertainty' measure for the clustering results, i.e. how confident we are about the cluster assignments.
  3. For the smaller dataset with labels, apart from accuracy, sensitivity and specificity, are there any other quantitative results I can get? For classification algorithms we can do cross-validation; is there any method we can use to do such a cross-validation for clustering? Also, can we get an ROC analysis for a clustering task?
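
Regarding the mean silhouette value in point 1, a minimal sketch (my own illustration on simulated data, using the cluster package) of how it can be computed:

library(cluster)
set.seed(1)
X <- rbind(matrix(rnorm(100, mean = 0), ncol = 2),
           matrix(rnorm(100, mean = 3), ncol = 2))   # two artificial groups
km  <- kmeans(X, centers = 2)
sil <- silhouette(km$cluster, dist(X))
mean(sil[, 'sil_width'])   # mean silhouette value for this clustering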
+",2013-10-11 13:50:46.157 +57304,9522.0,1,57306.0,,,Which statistical test should be used to test for enrichment of gene lists?,,CC BY-SA 3.0,"

I have performed an experiment to test the cellular sensitivity to a certain DNA damage agent. We have found 270 genes that were specifically sensitive to the drug, and the total number of genes analyzed was 3668. 38 out of the 270 sensitive genes are classified as ""DNA repair genes"". If the number of ""DNA repair genes"" contained in the genome is 112 and the total number of genes in the genome is 3668, are the sensitive genes enriched in DNA repair genes? Which statistical test should be used? I would appreciate it if you could also tell me a tool to calculate the p-value online.

+",2013-10-11 14:05:31.013 +57305,22607.0,1,,,,Interpretation of a PDF squared,,CC BY-SA 3.0,"

I have a problem where the crucial variable is the integral of the squared PDF of a random variable, i.e.

+ +

$\int f(x)^2dx$

+ +

How should I interpret this property of a distribution? If $f(x)$ is gaussian, then this is inversely proportional to the variance, $\sigma^2$, but I don't think this is generally true.

+ +

(Note that this is also equal to $\int F(x)f'(x)dx$ ).

+",2013-10-11 14:20:05.910 +57306,21638.0,2,,57304.0,,,,CC BY-SA 3.0,"

Standard practice to test for enrichment of gene lists is to do a hypergeometric test or, equivalently, a one-sided Fisher's exact test. You have the following $2\times2$ contingency table:

+ +

$$
\begin{array}{l|cc|c}
 & \text{DNA Repair} & \text{Other} & \\
\hline
\text{Sensitive} & 38 & 232 & 270 \\
\text{Not Sensitive} & 74 & 3324 & 3398 \\
\hline
 & 112 & 3556 &
\end{array}
$$

+ +

You can carry out the test in R as follows:

+ +
fisher.test(matrix(c(38,74,232,3324),nrow=2,ncol=2),alternative=""greater"")
+
+ +

Which gives a highly significant result:

+ +
Fisher's Exact Test for Count Data
+
+data:  matrix(c(38, 74, 232, 3324), nrow = 2, ncol = 2) 
+p-value < 2.2e-16
+alternative hypothesis: true odds ratio is greater than 1 
+95 percent confidence interval:
+5.062107      Inf 
+sample estimates:
+odds ratio 
+7.34918
+
+ +

Note that as we are testing for over-representation (rather than under-representation) the alternative parameter is set to ""greater"".

+",2013-10-11 14:48:42.243 +57307,9522.0,1,57310.0,,,Any online software to calculate pvalue of Fisher exact test?,,CC BY-SA 3.0,"

I would like to do a one-sided Fisher's exact test for an analysis. I do not have any statistical software to obtain the p-values (no SAS, no SPSS). The 2x2 tables are of this type:

+ +

Do you know any online statistical software to calculate the p-values? I have tried some of them, but the results only indicate p-value < 0.0001 and I need to know the exact number. The 2x2 tables are of this type:

+ +

+ +

Thanks a lot in advance!

+",2013-10-11 15:15:59.620 +57308,22507.0,2,,57279.0,,,,CC BY-SA 3.0,"

I would do the forward stepwise selection, adding predictors as long as the correlation with residuals is significant, and then do some regularization (ridge, lasso, elastic nets). There are 2-3 metaparameters: forward stepwise termination constraint, and 1 or 2 regularization parameters. These metaparameters are determined via cross-validation.

+ +
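As a minimal sketch of the regularization step (my own illustration on simulated data; glmnet is one common implementation, not necessarily what was meant above), the penalty is chosen by cross-validation:

library(glmnet)
set.seed(1)
x <- matrix(rnorm(200 * 50), 200, 50)                  # 200 observations, 50 predictors
y <- rbinom(200, 1, plogis(x[, 1] - x[, 2]))           # binary outcome
cvfit <- cv.glmnet(x, y, family = 'binomial', alpha = 0.5)   # alpha = 0.5: elastic net
coef(cvfit, s = 'lambda.min')                          # coefficients at the CV-chosen lambda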

If you want to take into account non-linearity, you could try random forest, which produces good results when there are many predictors. But it is slow.

+",2013-10-11 15:33:10.407 +57309,22611.0,1,,,,Covary two dummy variables in SEM?,,CC BY-SA 3.0,"

I am running a structural equation model (SEM) in Amos 18, and I want to test the impact of marital status on several latent variables. Marital status is nominal, so I created three dummy variables:

+ +
  1. Mar_Single: 1 = yes, 0 = no
  2. Mar_Married: 1 = yes, 0 = no
  3. Mar_Other: 1 = yes, 0 = no
+ +

I included Mar_Single and Mar_Married in the SEM, so their coefficients will be interpreted against the omitted (reference) group, Mar_Other. The modification indices suggested that fit could be improved significantly if I covaried Mar_Single and Mar_Married. Should I do this? In a way, this makes sense because they are perfectly correlated: If Mar_Single = 1 then Mar_Married will always = 0.

+ +

Larry

+",2013-10-11 16:00:23.980 +57310,20972.0,2,,57307.0,,,,CC BY-SA 3.0,"

Microsoft Research has an online tool here. You can also download an Excel add-in from here.

+ +

Your result according to the Microsoft tool is 6.511E-017.

+",2013-10-11 16:12:37.340 +57415,22507.0,2,,55260.0,,,,CC BY-SA 3.0,"

If you want to do logistic regression, a simple approach is the following (a small sketch is given after the list):

+ +
  • for each continuous feature with missing data, replace all missing values by the average or median value for this feature, and create one more boolean feature which indicates whether the data is missing or not
  • for each unordered categorical feature with missing data, put all missing values into a new category
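
A minimal sketch of this recipe (my own illustration on simulated data with hypothetical features x1 and x2):

set.seed(1)
n  <- 100
x1 <- rnorm(n);  x1[sample(n, 10)] <- NA                     # continuous feature with NAs
x2 <- factor(sample(c('a', 'b', 'c'), n, TRUE)); x2[sample(n, 10)] <- NA
y  <- rbinom(n, 1, 0.5)
df <- data.frame(x1, x2, y)

# Continuous feature: add a missingness indicator, then impute the median
df$x1_missing <- as.integer(is.na(df$x1))
df$x1[is.na(df$x1)] <- median(df$x1, na.rm = TRUE)

# Unordered categorical feature: missing values become their own category
df$x2 <- addNA(df$x2)
levels(df$x2)[is.na(levels(df$x2))] <- 'missing'

fit <- glm(y ~ x1 + x1_missing + x2, data = df, family = binomial)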
+",2013-10-14 02:23:17.417 +57312,18198.0,1,,,,Degrees of Freedom for Ridge regression without knowing the Ridge Parameter?,,CC BY-SA 3.0,"

There is a very nice post here that gives a neat solution to the problem of finding the ridge parameter when the degrees of freedom are known:

+ +

How to calculate regularization parameter in ridge regression given degrees of freedom and input matrix?

+ +

My question is: how can you know what the degrees of freedom are before knowing the ridge parameter value? (I have added a comment to the other thread, but it's quite old, so I thought it best to create a new topic.)

+",2013-10-11 16:59:04.847 +57313,12544.0,2,,57309.0,,,,CC BY-SA 3.0,"

Yes, you should. You should always correlate exogenous variables - their correlations are not part of the model, and if you don't, you're either worsening fit, or getting degrees of freedom (which appear to improve fit) when you shouldn't.

+ +

When you do this in regression analysis, they are correlated (and there's no way of stopping them).

+ +

Also, you say ""If Mar_Single = 1 then Mar_Married will always = 0."" Yes, but that doesn't make them perfectly correlated - because if mar_single = 0 mar_married might be 0 or might be 1.

+ +

If you use Mplus, it will correlate x-variables by default and will not include these correlations in the null model which it uses to calculate CFI and NFI.

+",2013-10-11 17:21:17.923 +57314,3446.0,1,,,,Coverage rates of confidence intervals in reality,,CC BY-SA 3.0,"

One proves mathematically that if assumptions of a model are satisfied, then the coverage rate of a $100p\%$ confidence interval is $100p\%$. But then statistics gets applied to the world, where model assumptions may not be satisfied. Are there any studies comparing the coverage rates of confidence intervals applied to the real world with theoretical coverage rates?

+",2013-10-11 17:31:46.623 +57315,5984.0,1,,,,What does it mean that random effects are highly correlated?,,CC BY-SA 3.0,"

What does it mean when two random effects are highly or perfectly correlated?
+That is, in R when you call summary on a mixed model object, under ""Random effects"" ""corr"" is 1 or -1.

+ +
summary(model.lmer) 
+Random effects:
+Groups   Name                    Variance   Std.Dev.  Corr                 
+popu     (Intercept)             2.5714e-01 0.5070912                      
+          amdclipped              4.2505e-04 0.0206167  1.000               
+          nutrientHigh            7.5078e-02 0.2740042  1.000  1.000        
+          amdclipped:nutrientHigh 6.5322e-06 0.0025558 -1.000 -1.000 -1.000
+
+ +

I know this is bad and indicates that the random effects part of the model is too complex, but I'm trying to understand

+ +
  • 1) what is going on statistically, and
  • 2) what is going on practically with the structure of the response variables.
+ +

Example

+ +

Here is an example based on ""GLMMs in action: gene-by-environment interaction in total fruit production of wild populations of Arabidopsis thaliana"" +by Bolker et al

+ +

Download data

+ +
download.file(url = ""http://glmm.wdfiles.com/local--files/trondheim/Banta_TotalFruits.csv"", destfile = ""Banta_TotalFruits.csv"")
+dat.tf <- read.csv(""Banta_TotalFruits.csv"", header = TRUE)
+
+ +

Set up factors

+ +
dat.tf <- transform(dat.tf,X=factor(X),gen=factor(gen),rack=factor(rack),amd=factor(amd,levels=c(""unclipped"",""clipped"")),nutrient=factor(nutrient,label=c(""Low"",""High"")))
+
+ +

Modeling log(total.fruits+1) with ""population"" (popu) as random effect

+ +
model.lmer <- lmer(log(total.fruits+1) ~ nutrient*amd + (amd*nutrient|popu), data= dat.tf)
+
+ +

Accessing the Correlation matrix of the random effects show that everything is perfectly correlated

+ +
attr(VarCorr(model.lmer)$popu,""correlation"")
+
+                         (Intercept) amdclipped nutrientHigh amdclipped:nutrientHigh
+(Intercept)                       1          1            1                      -1
+amdclipped                        1          1            1                      -1
+nutrientHigh                      1          1            1                      -1
+amdclipped:nutrientHigh          -1         -1           -1                       1
+
+ +

I understand that these are the correlation coefficients of two vectors of random effects coefficients, such as

+ +
cor(ranef(model.lmer)$popu$amdclipped, ranef(model.lmer)$popu$nutrientHigh)
+
+ +

Does a high correlation mean that the two random effects contain redundant information? Is this analogous to multicollinearity in multiple regression, where a model with highly correlated predictors should be simplified?

+",2013-10-11 17:38:34.780 +57316,22615.0,1,,,,Mean difference for count data,,CC BY-SA 4.0,"

I have two samples $s_1$ and $s_2$ of count data. The sample size is > 1000 each. The distributions look similar to a Poisson distribution but the variance is much larger than the mean.

+ +

How do I test whether the mean of $s_1$ is larger than the mean of $s_2$?

+",2013-10-11 18:00:38.057 +57317,22564.0,1,57321.0,,,"When making inferences about group means, are credible Intervals sensitive to within-subject variance while confidence intervals are not?",,CC BY-SA 3.0,"

This is a spin off of this question: +How to compare two groups with multiple measurements for each individual with R?

+ +

In the answers there (if I understood correctly) I learned that within-subject variance does not affect inferences made about group means, and that it is OK to simply take the average of averages to calculate the group mean, then calculate the within-group variance and use that to perform significance tests. I would like to use a method where the larger the within-subject variance, the less sure I am about the group means, or else understand why it does not make sense to desire that.

+ +

Here is a plot of the original data along with some simulated data that used the same subject means, but sampled the individual measurements for each subject from a normal distribution using those means and a small within-subject variance (sd=.1). As can be seen the group level confidence intervals (bottom row) are unaffected by this (at least the way I calculated them).

+ +

+ +

I also used rjags to estimate the group means in three ways: 1) using the raw original data, 2) using only the subject means, and 3) using the simulated data with small within-subject sd.

+ +

The results are below. Using this method we see that the 95% credible intervals are narrower in cases #2 and #3. This meets my intuition of what I would like to occur when making inferences about group means, but I am not sure if this is just some artifact of my model or a property of credible intervals.

+ +

Note. To use rjags you need to first install JAGS from here: +http://sourceforge.net/projects/mcmc-jags/files/

+ +

+ +

The various code is below.

+ +

The original data:

+ +
structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
+1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
+2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
+2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
+3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 
+3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 
+6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 
+10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 
+12, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15, 
+15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 18, 
+18, 18, 18, 18, 18, 2, 0, 16, 2, 16, 2, 8, 10, 8, 6, 4, 4, 8, 
+22, 12, 24, 16, 8, 24, 22, 6, 10, 10, 14, 8, 18, 8, 14, 8, 20, 
+6, 16, 6, 6, 16, 4, 2, 14, 12, 10, 4, 10, 10, 8, 4, 10, 16, 16, 
+2, 8, 4, 0, 0, 2, 16, 10, 16, 12, 14, 12, 8, 10, 12, 8, 14, 8, 
+12, 20, 8, 14, 2, 4, 8, 16, 10, 14, 8, 14, 12, 8, 14, 4, 8, 8, 
+10, 4, 8, 20, 8, 12, 12, 22, 14, 12, 26, 32, 22, 10, 16, 26, 
+20, 12, 16, 20, 18, 8, 10, 26), .Dim = c(108L, 3L), .Dimnames = list(
+    NULL, c(""Group"", ""Subject"", ""Value"")))
+
+ +

Get subject Means and simulate the data with small within-subject variance:

+ +
#Get Subject Means
+means<-aggregate(Value~Group+Subject, data=dat, FUN=mean)
+
+#Initialize ""dat2"" dataframe
+dat2<-dat
+
+#Sample individual measurements for each subject
+temp=NULL
+for(i in 1:nrow(means)){
+  temp<-c(temp,rnorm(6,means[i,3], .1))
+}
+
+#Set Simulated values
+dat2[,3]<-temp
+
+ +

The function to fit the JAGS model:

+ +
 require(rjags) 
+
+#Jags fit function
+jags.fit<-function(dat2){
+
+  #Create JAGS model
+  modelstring = ""
+
+  model{
+  for(n in 1:Ndata){
+  y[n]~dnorm(mu[subj[n]],tau[subj[n]]) T(0, )
+  }
+
+  for(s in 1:Nsubj){
+  mu[s]~dnorm(muG,tauG) T(0, )
+  tau[s] ~ dgamma(5,5)
+  }
+
+
+  muG~dnorm(10,.01) T(0, )
+  tauG~dgamma(1,1)
+
+  }
+  ""
+  writeLines(modelstring,con=""model.txt"")
+
+#############  
+
+  #Format Data
+  Ndata = nrow(dat2)
+  subj = as.integer( factor( dat2$Subject ,
+                             levels=unique(dat2$Subject ) ) )
+  Nsubj = length(unique(subj))
+  y = as.numeric(dat2$Value)
+
+  dataList = list(
+    Ndata = Ndata ,
+    Nsubj = Nsubj ,
+    subj = subj ,
+    y = y
+  )
+
+  #Nodes to monitor
+  parameters=c(""muG"",""tauG"",""mu"",""tau"")
+
+
+  #MCMC Settings
+  adaptSteps = 1000             
+  burnInSteps = 1000            
+  nChains = 1                   
+  numSavedSteps= nChains*10000          
+  thinSteps=20                      
+  nPerChain = ceiling( ( numSavedSteps * thinSteps ) / nChains )            
+
+
+  #Create Model
+  jagsModel = jags.model( ""model.txt"" , data=dataList, 
+                          n.chains=nChains , n.adapt=adaptSteps , quiet=FALSE )
+  # Burn-in:
+  cat( ""Burning in the MCMC chain...\n"" )
+  update( jagsModel , n.iter=burnInSteps )
+
+  # Getting DIC data:
+  load.module(""dic"")
+
+
+  # The saved MCMC chain:
+  cat( ""Sampling final MCMC chain...\n"" )
+  codaSamples = coda.samples( jagsModel , variable.names=parameters , 
+                              n.iter=nPerChain , thin=thinSteps )  
+
+  mcmcChain = as.matrix( codaSamples )
+
+  result = list(codaSamples=codaSamples, mcmcChain=mcmcChain)
+
+}
+
+ +

Fit the model to each group of each dataset:

+ +
#Fit to raw data
+groupA<-jags.fit(dat[which(dat[,1]==1),])
+groupB<-jags.fit(dat[which(dat[,1]==2),])
+groupC<-jags.fit(dat[which(dat[,1]==3),])
+
+#Fit to subject mean data
+groupA2<-jags.fit(means[which(means[,1]==1),])
+groupB2<-jags.fit(means[which(means[,1]==2),])
+groupC2<-jags.fit(means[which(means[,1]==3),])
+
+#Fit to simulated raw data (within-subject sd=.1)
+groupA3<-jags.fit(dat2[which(dat2[,1]==1),])
+groupB3<-jags.fit(dat2[which(dat2[,1]==2),])
+groupC3<-jags.fit(dat2[which(dat2[,1]==3),])
+
+ +

Credible interval/highest density interval function:

+ +
#HDI Function
+get.HDI<-function(sampleVec,credMass){ 
+  sortedPts = sort( sampleVec )
+  ciIdxInc = floor( credMass * length( sortedPts ) )
+  nCIs = length( sortedPts ) - ciIdxInc
+  ciWidth = rep( 0 , nCIs )
+  for ( i in 1:nCIs ) {
+    ciWidth[ i ] = sortedPts[ i + ciIdxInc ] - sortedPts[ i ]
+  }
+  HDImin = sortedPts[ which.min( ciWidth ) ]
+  HDImax = sortedPts[ which.min( ciWidth ) + ciIdxInc ]
+  HDIlim = c( HDImin , HDImax, credMass )
+  return( HDIlim )
+}
+
+ +

First Plot:

+ +
layout(matrix(c(1,1,2,2,3,4),nrow=3,ncol=2, byrow=T))
+
+boxplot(dat[,3]~dat[,2], 
+xlab=""Subject"", ylab=""Value"", ylim=c(0, 1.2*max(dat[,3])),
+col=c(rep(""Red"",length(which(dat[,1]==unique(dat[,1])[1]))/6),
+rep(""Green"",length(which(dat[,1]==unique(dat[,1])[2]))/6),
+rep(""Blue"",length(which(dat[,1]==unique(dat[,1])[3]))/6)
+),
+main=""Original Data""
+)
+stripchart(dat[,3]~dat[,2], vert=T, add=T, pch=16)
+legend(""topleft"", legend=c(""Group A"", ""Group B"", ""Group C"", ""Individual Means +/- 95% CI""),
+col=c(""Red"",""Green"",""Blue"", ""Grey""), lwd=3, bty=""n"", pch=c(15),
+pt.cex=c(rep(0.1,3),1),
+ncol=3)
+
+for(i in 1:length(unique(dat[,2]))){
+  m<-mean(examp[which(dat[,2]==unique(dat[,2])[i]),3])
+  ci<-t.test(dat[which(dat[,2]==unique(dat[,2])[i]),3])$conf.int[1:2]
+
+  points(i-.3,m, pch=15,cex=1.5, col=""Grey"")
+  segments(i-.3,
+           ci[1],i-.3,
+           ci[2], lwd=4, col=""Grey""
+  )
+}
+
+
+
+boxplot(dat2[,3]~dat2[,2], 
+xlab=""Subject"", ylab=""Value"", ylim=c(0, 1.2*max(dat2[,3])),
+col=c(rep(""Red"",length(which(dat2[,1]==unique(dat2[,1])[1]))/6),
+rep(""Green"",length(which(dat2[,1]==unique(dat2[,1])[2]))/6),
+rep(""Blue"",length(which(dat2[,1]==unique(dat2[,1])[3]))/6)
+),
+main=c(""Simulated Data"", ""Same Subject Means but Within-Subject SD=.1"")
+)
+stripchart(dat2[,3]~dat2[,2], vert=T, add=T, pch=16)
+legend(""topleft"", legend=c(""Group A"", ""Group B"", ""Group C"", ""Individual Means +/- 95% CI""),
+col=c(""Red"",""Green"",""Blue"", ""Grey""), lwd=3, bty=""n"", pch=c(15),
+pt.cex=c(rep(0.1,3),1),
+ncol=3)
+
+for(i in 1:length(unique(dat2[,2]))){
+  m<-mean(examp[which(dat2[,2]==unique(dat2[,2])[i]),3])
+  ci<-t.test(dat2[which(dat2[,2]==unique(dat2[,2])[i]),3])$conf.int[1:2]
+
+  points(i-.3,m, pch=15,cex=1.5, col=""Grey"")
+  segments(i-.3,
+           ci[1],i-.3,
+           ci[2], lwd=4, col=""Grey""
+  )
+}
+
+
+means<-aggregate(Value~Group+Subject, data=dat, FUN=mean)
+
+boxplot(means[,3]~means[,1], col=c(""Red"",""Green"",""Blue""),
+ylim=c(0,1.2*max(means[,3])), ylab=""Value"", xlab=""Group"",
+main=""Original Data""
+)
+stripchart(means[,3]~means[,1], pch=16, vert=T, add=T)
+
+for(i in 1:length(unique(means[,1]))){
+  m<-mean(means[which(means[,1]==unique(means[,1])[i]),3])
+  ci<-t.test(means[which(means[,1]==unique(means[,1])[i]),3])$conf.int[1:2]
+
+  points(i-.3,m, pch=15,cex=1.5, col=""Grey"")
+  segments(i-.3,
+           ci[1],i-.3,
+           ci[2], lwd=4, col=""Grey""
+  )
+}
+legend(""topleft"", legend=c(""Group Means +/- 95% CI""), bty=""n"", pch=15, lwd=3, col=""Grey"")
+
+
+means2<-aggregate(Value~Group+Subject, data=dat2, FUN=mean)
+
+boxplot(means2[,3]~means2[,1], col=c(""Red"",""Green"",""Blue""),
+ylim=c(0,1.2*max(means2[,3])), ylab=""Value"", xlab=""Group"",
+main=""Simulated Data Group Averages""
+)
+stripchart(means2[,3]~means2[,1], pch=16, vert=T, add=T)
+
+for(i in 1:length(unique(means2[,1]))){
+  m<-mean(means[which(means2[,1]==unique(means2[,1])[i]),3])
+  ci<-t.test(means[which(means2[,1]==unique(means2[,1])[i]),3])$conf.int[1:2]
+
+  points(i-.3,m, pch=15,cex=1.5, col=""Grey"")
+  segments(i-.3,
+           ci[1],i-.3,
+           ci[2], lwd=4, col=""Grey""
+  )
+}
+legend(""topleft"", legend=c(""Group Means +/- 95% CI""), bty=""n"", pch=15, lwd=3,   col=""Grey"")
+
+ +

Second Plot:

+ +
layout(matrix(c(1,2,3,4,4,4,5,5,5,6,6,6),nrow=4,ncol=3, byrow=T))
+
+#Plot priors
+plot(seq(0,10,by=.01),dgamma(seq(0,10,by=.01),5,5), type=""l"", lwd=4,
+     xlab=""Value"", ylab=""Density"",
+     main=""Prior on Within-Subject Precision""
+)
+plot(seq(0,10,by=.01),dgamma(seq(0,10,by=.01),1,1), type=""l"", lwd=4,
+     xlab=""Value"", ylab=""Density"",
+     main=""Prior on Within-Group Precision""
+)
+plot(seq(0,300,by=.01),dnorm(seq(0,300,by=.01),10,100), type=""l"", lwd=4,
+     xlab=""Value"", ylab=""Density"",
+     main=""Prior on Group Means""
+)
+
+
+#Set overall xmax value
+x.max<-1.1*max(groupA$mcmcChain[,""muG""],groupB$mcmcChain[,""muG""],groupC$mcmcChain[,""muG""],
+               groupA2$mcmcChain[,""muG""],groupB2$mcmcChain[,""muG""],groupC2$mcmcChain[,""muG""],
+               groupA3$mcmcChain[,""muG""],groupB3$mcmcChain[,""muG""],groupC3$mcmcChain[,""muG""]
+)
+
+
+#Plot result for raw data
+#Set ymax
+y.max<-1.1*max(density(groupA$mcmcChain[,""muG""])$y,density(groupB$mcmcChain[,""muG""])$y,density(groupC$mcmcChain[,""muG""])$y)
+
+plot(density(groupA$mcmcChain[,""muG""]),xlim=c(0,x.max), 
+     ylim=c(-.1*y.max,y.max), lwd=3, col=""Red"",
+     main=""Group Mean Estimates: Fit to Raw Data"", xlab=""Value""
+)
+lines(density(groupB$mcmcChain[,""muG""]), lwd=3, col=""Green"")
+lines(density(groupC$mcmcChain[,""muG""]), lwd=3, col=""Blue"")
+
+hdi<-get.HDI(groupA$mcmcChain[,""muG""], .95)
+segments(hdi[1],-.033*y.max,hdi[2],-.033*y.max, lwd=3, col=""Red"")
+
+hdi<-get.HDI(groupB$mcmcChain[,""muG""], .95)
+segments(hdi[1],-.066*y.max,hdi[2],-.066*y.max, lwd=3, col=""Green"")
+
+hdi<-get.HDI(groupC$mcmcChain[,""muG""], .95)
+segments(hdi[1],-.099*y.max,hdi[2],-.099*y.max, lwd=3, col=""Blue"")
+
+####
+
+#Plot result for mean data
+
+#x.max<-1.1*max(groupA2$mcmcChain[,""muG""],groupB2$mcmcChain[,""muG""],groupC2$mcmcChain[,""muG""])
+y.max<-1.1*max(density(groupA2$mcmcChain[,""muG""])$y,density(groupB2$mcmcChain[,""muG""])$y,density(groupC2$mcmcChain[,""muG""])$y)
+
+plot(density(groupA2$mcmcChain[,""muG""]),xlim=c(0,x.max), 
+     ylim=c(-.1*y.max,y.max), lwd=3, col=""Red"",
+     main=""Group Mean Estimates: Fit to Subject Means"", xlab=""Value""
+)
+lines(density(groupB2$mcmcChain[,""muG""]), lwd=3, col=""Green"")
+lines(density(groupC2$mcmcChain[,""muG""]), lwd=3, col=""Blue"")
+
+hdi<-get.HDI(groupA2$mcmcChain[,""muG""], .95)
+segments(hdi[1],-.033*y.max,hdi[2],-.033*y.max, lwd=3, col=""Red"")
+
+hdi<-get.HDI(groupB2$mcmcChain[,""muG""], .95)
+segments(hdi[1],-.066*y.max,hdi[2],-.066*y.max, lwd=3, col=""Green"")
+
+hdi<-get.HDI(groupC2$mcmcChain[,""muG""], .95)
+segments(hdi[1],-.099*y.max,hdi[2],-.099*y.max, lwd=3, col=""Blue"")
+
+
+
+
+####
+#Plot result for simulated data
+#Set ymax
+#x.max<-1.1*max(groupA3$mcmcChain[,""muG""],groupB3$mcmcChain[,""muG""],groupC3$mcmcChain[,""muG""])
+y.max<-1.1*max(density(groupA3$mcmcChain[,""muG""])$y,density(groupB3$mcmcChain[,""muG""])$y,density(groupC3$mcmcChain[,""muG""])$y)
+
+plot(density(groupA3$mcmcChain[,""muG""]),xlim=c(0,x.max), 
+     ylim=c(-.1*y.max,y.max), lwd=3, col=""Red"",
+     main=c(""Group Mean Estimates: Fit to Simulated data"", ""(Within-Subject SD=0.1)""), xlab=""Value""
+)
+lines(density(groupB3$mcmcChain[,""muG""]), lwd=3, col=""Green"")
+lines(density(groupC3$mcmcChain[,""muG""]), lwd=3, col=""Blue"")
+
+hdi<-get.HDI(groupA3$mcmcChain[,""muG""], .95)
+segments(hdi[1],-.033*y.max,hdi[2],-.033*y.max, lwd=3, col=""Red"")
+
+hdi<-get.HDI(groupB3$mcmcChain[,""muG""], .95)
+segments(hdi[1],-.066*y.max,hdi[2],-.066*y.max, lwd=3, col=""Green"")
+
+hdi<-get.HDI(groupC3$mcmcChain[,""muG""], .95)
+segments(hdi[1],-.099*y.max,hdi[2],-.099*y.max, lwd=3, col=""Blue"")
+
+ +

EDIT with my personal version of the answer from @StéphaneLaurent

+ +

I used the model he described to sample from a normal distribution with mean = 0, between-subject variance = 1 and within-subject error/variance = 0.1, 1, 10, 100. A subset of the confidence intervals is shown in the left panels, while the distribution of their widths is shown in the corresponding right panels. This has convinced me that he is 100% correct. However, I am still confused by my example above, but will follow this up with a new, more focused question.

+ +

+ +

The code for the above simulation and charts:

+ +
dev.new()
+par(mfrow=c(4,2))
+
+
+num.sims<-10000
+sigmaWvals<-c(.1,1,10,100)
+muG<-0  #Grand Mean
+sigma.between<-1  #Between Experiment sd
+
+for(sigma.w in sigmaWvals){
+
+  sigma.within<-sigma.w #Within Experiment sd
+
+  out=matrix(nrow=num.sims,ncol=2)
+  for(i in 1:num.sims){
+
+    #Sample the three experiment means (mui, i=1:3)
+    mui<-rnorm(3,muG,sigma.between)
+
+    #Sample the three observations for each experiment (yij, i=1:3, j=1:3)
+    y1j<-rnorm(3,mui[1],sigma.within)
+    y2j<-rnorm(3,mui[2],sigma.within)
+    y3j<-rnorm(3,mui[3],sigma.within)
+
+
+    #Put results in data frame
+    d<-as.data.frame(cbind(
+      c(rep(1,3),rep(2,3),rep(3,3)),
+      c(y1j, y2j, y3j )
+    ))
+    d[,1]<-as.factor(d[,1])
+
+    #Calculate means for each experiment
+    dmean<-aggregate(d[,2]~d[,1], data=d, FUN=mean)
+
+    #Add new confidence interval data to output
+    out[i,]<-t.test(dmean[,2])$conf.int[1:2]
+
+  }
+
+  #Calculate % of intervals that contained muG
+  cover<-matrix(nrow=nrow(out),ncol=1)
+  for(i in 1:nrow(out)){
+    cover[i]<-out[i,1]<muG & out[i,2]>muG
+  }
+
+
+
+  sub<-floor(seq(1,nrow(out),length=100))
+  plot(out[sub,1], ylim=c(min(out[sub,1]),max(out[sub,2])),
+       xlab=""Simulation #"", ylab=""Value"", xaxt=""n"",
+       main=c(paste(""# of Sims="",num.sims),
+              paste(""% CIs Including muG="",100*round(length(which(cover==T))/nrow(cover),3)))
+  )
+  axis(side=1, at=1:100, labels=sub)
+  points(out[sub,2])
+
+  cnt<-1
+  for(i in sub){
+    segments(cnt, out[i,1],cnt,out[i,2])
+    cnt<-cnt+1
+  }
+  abline(h=0, col=""Red"", lwd=3)
+
+  hist(out[,2]-out[,1], freq=F, xlab=""Width of 95% CI"",
+       main=c(paste(""muG="", muG), 
+              paste(""Sigma Between="",sigma.between), 
+              paste(""Sigma Within="",sigma.within))
+  )
+
+}
+
+",2013-10-11 18:05:20.930 +57318,18198.0,1,,,,Iterative method to find Ridge Regression Parameter,,CC BY-SA 3.0,"

I have seen a method whereby instead of trying to estimate the ridge parameter (k) directly from the data (using one of the many many ridge parameter estimators in the literature) you solve for it iteratively.

+ +

The method is simple enough: you simply increase k (in suitably small steps) until the condition number is reduced below 10.

+ +
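For concreteness, here is a minimal R sketch of the procedure as I understand it; the correlation form of X'X, the step size of 0.001 and the threshold of 10 are my own arbitrary choices, not part of any published recipe.

# Minimal sketch of the iterative procedure described above (assumptions noted in the text)
set.seed(1)
n <- 100; p <- 5
X <- matrix(rnorm(n * p), n, p)
X[, 5] <- X[, 4] + rnorm(n, sd = 0.01)        # induce severe multicollinearity

R <- cor(X)                                   # predictors in correlation form
k <- 0
repeat {
  cn <- kappa(R + k * diag(p), exact = TRUE)  # condition number of R + k*I
  if (cn < 10) break
  k <- k + 0.001
}
k                                             # the chosen ridge parameter
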

At first blush this seems like quite a nice solution to me but I've never seen a Ridge Regression paper/book that uses it.

+ +

Update: OK, this is basically the method suggested by Marquardt in ""Generalized Inverses, Ridge Regression, Biased Linear Estimation and Non-linear Estimation"", the only difference being that he used VIFs to measure the multicollinearity while this method uses the condition number. McDonald and Galarneau, in ""A Monte Carlo Evaluation of Some Ridge-Type Estimators"", note that this method may not be appropriate for all data sets as it does not include the y values (observations). I still have not found a paper where the Marquardt method is tested against other estimators of the ridge parameter; does anybody know of such a paper?

+ +

Is this method theoretically sound, though? Even if (as I suspect) it isn't, does it really matter for the average practitioner who just wants to produce more stable estimates of their betas (the weights in the regression) rather than having them ""blow up"" to grossly unrealistic values when there is severe multicollinearity?

+ +

Truly, I would like to find a better method than this, ideally with a solid theoretical underpinning, but it's hard to see from a practical viewpoint how it can be improved upon.

+",2013-10-11 18:05:44.790 +57339,10987.0,2,,57316.0,,,,CC BY-SA 3.0,"

Given your large sample sizes, you could probably use a t-test on the means. If your sample sizes are equal, you are in pretty good shape whether you use a pooled estimate of the variance or an unpooled one (Welch's test). Do a one-sided test if you are sure that the population of s1 has a mean at least as large as the mean of the population of s2.

+ +

Note: If the variances are much larger than the means, your counts are not Poisson. But what matters here is the distribution of the sample averages, and that should be nearly normal, unless the data are super-skewed. In that case, you could do a non-parametric test like the Kruskal-Wallis.
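For instance, a hedged sketch in R, with made-up Poisson-like counts standing in for the two samples:

# Made-up counts standing in for s1 and s2
set.seed(42)
s1 <- rpois(500, lambda = 5.3)
s2 <- rpois(450, lambda = 5.0)

# Welch's t-test (unpooled variances), one-sided with H1: mean(s1) > mean(s2)
t.test(s1, s2, alternative = 'greater', var.equal = FALSE)

# Non-parametric fallback if the data are very skewed
kruskal.test(list(s1, s2))
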

+",2013-10-11 23:59:32.067 +57340,10987.0,2,,57323.0,,,,CC BY-SA 3.0,"

You could try CART (classification and regression tree) methods. That would build a decision tree for the outcomes based on the answers to the questions. As a by-product, it would indicate which questions are most important in predicting the outcome.

+",2013-10-12 00:09:09.067 +57416,22507.0,2,,57414.0,,,,CC BY-SA 3.0,"

Multinomial test, if I understand you correctly.

+",2013-10-14 02:36:19.860 +57319,1693.0,1,,,,How high must logistic covariates' predictive accuracy be for a reversal effect to show up?,,CC BY-SA 4.0,"

I am modeling an outcome for hospital patients, 'RA' (whether readmitted; 0=No, 1=Yes). My predictor of interest is 'HHS' (whether referred to Home Health Services such as from a visiting nurse; 0=No, 1=Yes). Those referred readmit at a 15.2% rate; others, 9.2%, but the former are needier, sicker patients. Conventional thinking is that if we controlled for severity of illness this difference would not only be washed out but would reverse itself. In other words, holding constant the severity of illness, having HHS should mean a lower RA rate.

+ +

With HHS as the sole predictor, its coefficient (B) in a logistic regression = 0.6 (N ~ 25k). B is reduced to 0.2 with a group of covariates controlled, each accounting for some aspect of severity of illness, but B doesn't fall below zero.

+ +

HHS alone explains only about 1% of the variance in RA; with the other predictors, this becomes 4%.* Perhaps this is the problem--that these covariates are not explaining enough variance to ""succeed"" in reversing the sign of the coefficient of interest. If this is true, is there a way to estimate how high their explained variance needs to be for such a reversal to show up?

+ +

EDIT: Alecos Papadopoulos has come up with an impressive solution that answers this question, soon to be published in The American Statistician. See https://www.tandfonline.com/doi/full/10.1080/00031305.2019.1704873

+ +
+ +

*Using either of 2 pseudo-R-squared formulas: Cox & Snell's, or Menard's $[-2LL_0 - (-2LL_1)] / [-2LL_0]$.

+",2013-10-11 18:20:07.563 +57320,19331.0,2,,37182.0,,,,CC BY-SA 3.0,"

The spatial power covariance structure is a generalization of the first-order autoregressive covariance structure. Where the first-order autoregressive structure assumes the time points are equally spaced, the spatial power structure can account for a continuous time point. In reality, we could just forget the first-order autoregressive structure entirely, because if we fit the spatial power structure when the data are equally spaced we'll get the same answer as when using the first-order autoregressive structure.

+ +

All that aside, the correlation function you're looking for is corCAR1(), which is the continuous first-order autoregressive structure. If you're looking to duplicate what you fit in SAS, then the code you're looking for is:

+ +
gls(CD4t~T, data=df, na.action = (na.omit), method = ""REML"",
+    corr=corCAR1(form=~T|NUM_PAT))
+
+ +

Of course, you don't need to specify method = ""REML"", since, as in SAS, the default method in gls() is already restricted maximum likelihood.

+",2013-10-11 19:13:11.093 +57321,6162.0,2,,57317.0,,,,CC BY-SA 3.0,"
+

In the answers there (if I understood correctly) I learned that within-subject variance does not affect inferences made about group means and it is OK to simply take the averages of averages to calculate the group mean, then calculate the within-group variance and use that to perform significance tests.

+
+ +

Let me develop this idea here. The model for the individual observations is $$y_{ijk}= \mu_i + \alpha_{ij} + \epsilon_{ijk},$$ where:

+ +
    +
  • $y_{ijk}$ is the $k$-th measurement of individual $j$ of group $i$

  • +
  • $\alpha_{ij} \sim_{\text{iid}} {\cal N}(0, \sigma^2_b)$ is the random effect for individual $j$ of group $i$

  • +
  • $\epsilon_{ijk} \sim_{\text{iid}} {\cal N}(0, \sigma^2_w)$ is the within-error

  • +
+ +

In my answer to your first question, I suggested that you note that one obtains a classical (fixed-effects) Gaussian linear model for the subject means $\bar y_{ij\bullet}$. Indeed you can easily check that $$\bar y_{ij\bullet} = \mu_i + \delta_{ij}$$ with $$\delta_{ij} = \alpha_{ij} + \frac{1}{K}\sum_k \epsilon_{ijk} \sim_{\text{iid}} {\cal N}(0, \sigma^2) \quad \text{where } \quad \boxed{\sigma^2=\sigma^2_b+\frac{\sigma^2_w}{K}},$$ assuming $K$ repeated measurements for each individual. This is nothing but the one-way ANOVA model with a fixed factor.

+ +

And then I claimed that in order to draw inference about the $\mu_i$ you can simply consider the simple classical linear model whose observations are the subject means $\bar y_{ij\bullet}$. Update 12/04/2014: some examples of this idea are now written on my blog: Reducing a model to get confidence intervals. I'm under the impression that this always works when we average the data over the levels of a random effect.
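For instance, here is a small made-up illustration in R of this reduction (3 groups, 8 subjects per group, 5 repeated measurements per subject; all numbers are arbitrary):

# Simulate the model y_ijk = mu_i + alpha_ij + eps_ijk, then collapse to subject means
set.seed(1)
d <- expand.grid(rep = 1:5, subject = 1:8, group = factor(c('A', 'B', 'C')))
subj <- interaction(d$group, d$subject, drop = TRUE)       # 24 subjects in total
subj_eff <- rnorm(nlevels(subj), 0, 1)                     # alpha_ij
d$y <- c(10, 12, 14)[as.integer(d$group)] +                # mu_i
       subj_eff[as.integer(subj)] +
       rnorm(nrow(d), 0, 2)                                # epsilon_ijk

m <- aggregate(y ~ group + subject, data = d, FUN = mean)  # the subject means
fit <- lm(y ~ group, data = m)                             # classical one-way ANOVA
anova(fit); confint(fit)
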

+ +
+

In the answers there (if I understood correctly) I learned that within-subject variance does not affect inferences made about group means and it is OK to simply take the averages of averages to calculate the group mean, then calculate the within-group variance and use that to perform significance tests. I would like to use a method where the larger the within-subject variance, the less sure I am about the group means, or to understand why it does not make sense to desire that.

+
+ +

As you see from the boxed formula, the within-variance $\sigma^2_w$ plays a role in the model for the observed group means.

+",2013-10-11 19:18:23.050 +57322,10964.0,1,,,,Explaining p-value to a sophisticated layman,,CC BY-SA 3.0,"

I think I understand the concept of p-value but unfortunately I still have to exert a lot of brain cycles to get my arms around it.

+ +

I would like to get an explanation of the p-value that is rigorous enough for a sophisticated layman - something that would be intuitive.

+",2013-10-11 19:42:47.813 +57323,22618.0,1,57344.0,,,Simple recommender system - where to start?,,CC BY-SA 3.0,"

Without going into specifics, I'm currently working on a system that involves 20-25 questions being answered as either Green, Yellow, Orange or Red. After completing a subset of these questions (many questions can be left as defaulting to Green), the system allows our users to choose one outcome out of four, roughly corresponding to the answers they entered (OutcomeGreen, OutcomeYellow, OutcomeOrange or OutcomeRed). The answer that was selected most tends to be a good indicator as to what outcome they will select, but that's not always the case.

+ +

After having this system in place for the last 2 years, now I've received a request to have the system itself make a recommendation as to which outcome the user should select. Using data already accumulated over this period, I'd like to get some insight as to which questions/answers tend to be most influential for specific outcomes, and possibly give them more weight when determining what to recommend.

+ +

My main dilemma is that my last class on statistics was more than 20 years ago, and just looking through the tags here made me feel that I'm out of my depth. With the description I've provided, and the vast knowledge contained within this SE:

+ +
    +
  • Is there anything I should be looking into (tools, subset of +CrossValidated tags) that would help gain better insight, and where I +should look for more information?
  • +
  • Is there a quick way to get up-to-speed on what I'm missing?
  • +
+ +

Background: I'm a developer in many programming languages, and an amateur mathematician (mostly playing around in number theory and linear programming). I'm also a quick learner; I've been learning how to use R in my spare time. I just need some indication as to where I would find info quickly that would help me move forward with this.

+",2013-10-11 19:57:04.153 +57350,11772.0,1,57351.0,,,Subscript notation in expectations,,CC BY-SA 3.0,"

What is the exact meaning of the subscript notation $\mathbb{E}_X[f(X)]$ in conditional expectations in the framework of measure theory? These subscripts do not appear in the definition of conditional expectation, but we may see them, for example, on this page of Wikipedia. (Note that this wasn't always the case; compare the same page a few months ago.)

+ +

What should be for example the meaning of $\mathbb{E}_X[X+Y]$ with $X\sim\mathcal{N}(0,1)$ and $Y=X+1$ ?

+",2013-10-12 11:04:38.997 +57324,5045.0,2,,57322.0,,,,CC BY-SA 3.0,"

Take a look at the tooth brushing example at the very start of Chapter 14 of Andrew Vickers' book What is a p-value anyway? 34 Stories to Help You Actually Understand Statistics. It starts on page 57 or you can use the table of contents button in the bottom left corner to find it.

+ +

Here's an excerpt:

+ +
+

[I]f you do nothing else, please try to remember the following + sentence: “the $p$-value is the probability that the data would be at + least as extreme as those observed, if the null hypothesis were true.” + Though I’d prefer that you also understood it—about which, teeth + brushing.

+ +

I have three young children. In the evening, before we get to bedtime + stories (bedtime stories being a nice way to end the day), we have to + persuade them all to bathe, use the toilet, clean their teeth, change + into pajamas, get their clothes ready for the next day and then + actually get into bed (the persuading part being a nice way to go + crazy). My five-year-old can often be found sitting on his bed, fully + dressed, claiming to have clean teeth. The give-away is the bone dry + toothbrush: he says that he has brushed his teeth, I tell him that he + couldn’t have.

+ +

My reasoning here goes like this: the toothbrush is dry; it is + unlikely that the toothbrush would be dry if my son had cleaned his + teeth; therefore he hasn’t cleaned his teeth. Or using + statistician-speak: here are the data (a dry toothbrush); here is a + hypothesis (my son has cleaned his teeth); the data would be unusual + if the hypothesis were true, therefore we should reject the + hypothesis.

+ +

[...]

+ +

So here is what to parrot when we run into each other at a bar and I + still haven’t managed to work out any new party tricks: “The $p$-value + is the probability that the data would be at least as extreme as those + observed, if the null hypothesis were true.” When I recover from + shock, you can explain it to me in terms of a toothbrush (“The + probability of the toothbrush being dry if you’ve just cleaned your + teeth”).

+
+ +

The other thing I really like about this example is that it also explains that failing to reject the null does not mean the null is necessarily true. Vickers writes that his son has now worked out the trick and has taken to running his toothbrush under the tap for a second or two before heading to bed. Just because the toothbrush is wet (and the data is consistent with the null hypothesis), it does not mean that his son has cleaned his teeth.

+",2013-10-11 20:00:54.043 +57325,22622.0,1,,,,Marginal effect in model with interactions,,CC BY-SA 3.0,"

I am running cross-sectional regressions of the type

+ +

$$Y_c = \alpha + \beta X_1 + \gamma X_2 + \delta_1 X_3 + \delta_2 X_1 X_3 + \delta_3 X_2 X_3 + e_c.$$

+ +

My theoretical model implies that

+ +
    +
  • $\delta_2$ should be negative,
  • +
  • $\delta_3$ should be positive, and
  • +
  • the marginal effect of $X_3$ should be negative.
  • +
+ +

My estimates imply that

+ +
    +
  • $\widehat\delta_2$ is negative and significant,
  • +
  • $\widehat\delta_3$ is positive and insignificant,
  • +
  • $\widehat\beta$ is significant, and
  • +
  • $\widehat\gamma$ is insignificant.
  • +
+ +

Building on this evidence, can I calculate the marginal effect of $X_3$ as $\delta_1 + \delta_2 E(X_1)$ where $E(X_1)$ is the mean of $X_1$, justifying this procedure with the fact that all the terms incorporating $X_2$ are insignificant?

+",2013-10-11 20:30:29.607 +57326,20742.0,1,,,,Sweeping across multiple classifiers and choosing the best?,,CC BY-SA 3.0,"

I'm using Weka to perform classification, clustering, and some regression on a few large data sets. I'm currently trying out all the classifiers (decision tree, SVM, naive bayes, etc.).

+ +

Is there an automated way (in Weka or other machine learning toolkit) to sweep through all the available classifier algorithms to find the one that produces the best cross-validated accuracy or other metric? I'm not talking about boosting; rather, I'm looking to just choose the best classifier using a given data set.

+ +

I'd like to find the best clustering algorithm, too, for my other clustering problem; perhaps finding the lowest sum-of-squared-error?

+",2013-10-11 20:36:25.720 +57327,22623.0,1,,,,Binary features for prediction,,CC BY-SA 3.0,"

I have a set of relatively long ($\sim 1000$-dimensional) binary feature vectors with scalar values in $[0-10]$ attached to them. My aim is to write a predictor that learns to map the features to the $[0-10]$ interval, so as to predict the value for a new binary vector. I used SVM and Lasso with leave-one-out performance analysis, but both always end up predicting the mean value of the distribution (it correlates with the histogram of the overall feature-scalar distribution). The histograms also look roughly normal / Rayleigh shaped. Any suggestions for algorithms / feature space mappings? My main problem is that I am dealing with binary features for the first time.

+",2013-10-11 20:40:31.053 +57328,13396.0,1,,,,Improving the quality of pseudo-randomly generated uncorrelated unit normals,,CC BY-SA 3.0,"

Let's say I want to generate $N$ sequences $p_j$, where $j = 1,\ldots,N$. Each sequence has a length of $M$. I want $\mathbb{E}[ p_j ] \to 0$ and $\text{corr}(p_j, p_k) \to \delta_{j, k}$ as $M \to +\infty$.

+ +

In practice, I can generate an $M \times N$ matrix of i.i.d. unit normals. For example, in MATLAB, Z = randn(M, N). Then I get $p_j$ from the $j$-th column of $Z$.

+ +

For a finite value of $M$, the sample mean $\mathbb{E}[ p_j ] \neq 0$, but I can ""fix"" the problem if I remove the sample mean by working with $q_j = p_j - \mathbb{E}[ p_j ]$.

+ +

My question is -- how do I continue to improve my sequences, so that I get the 2nd-order moments I want, i.e., $\text{corr}(q_j, q_k) = \delta_{j,k}$ even when $M$ is finite?

+",2013-10-11 20:40:44.083 +57329,10135.0,1,,,,BIC vs. Out of sample performance,,CC BY-SA 3.0,"

I have two statistical models. Model 1 uses a GLM approach while model 2 uses a time series approach for fitting. I want to compare these two models.

+ +

Model 1 (i.e., the GLM) has better out-of-sample performance. Model 2 has a better BIC. So based on out-of-sample performance I should pick model 1, and based on BIC I should pick model 2 as the preferred model.

+ +

I should add that, in this context and for the question I am trying to answer, both the BIC and out-of-sample performance are important. The question is how to choose the best model in this case. Should I consider other criteria? Please let me know if you know of any good reference covering similar cases.

+",2013-10-11 20:54:18.747 +57349,22630.0,1,57352.0,,,"What does ""Mean of each pixel over all images"" mean?",,CC BY-SA 3.0,"

I was reading a paper related to Auto encoders for my project work. It is required to input images as vectors to the neural network. I couldn't understand a certain sentence due to lack of knowledge of statistics (I guess). I Googled, but the problem is I don't know what it is exactly and searching the same phrase returns the same kind of documents but not their explanation.

+ +

Source: http://www.cs.toronto.edu/~hinton/absps/esann-deep-final.pdf

+ +
+

We train on 1.6 million 32*32 color images that have been preprocessed + by subtracting from each pixel its mean value over all images and then + dividing by the standard deviation of all pixels over all images.

+
+ +

What is meant by ""subtracting from each pixel its mean value over all images and then dividing by the standard deviation of all pixels over all images""?

+ +

My interpretation of ""subtracting from each pixel its mean value over all images"" is: for a given pixel position, take the average of the values at that position over all images and subtract it from the current pixel value.

+ +

Am I correct?

+ +

It is somewhat ambiguous to me.

+ +

Please explain in some math terms.

+",2013-10-12 10:43:14.610 +57330,22359.0,2,,57303.0,,,,CC BY-SA 3.0,"

How are the data sets related? If both data sets are drawn from the same distribution (they describe the same problem), then you can use the labeled set as a ""test set"" for the clustering. Basically you treat the clustering algorithm as a classifier. The only problem is that you must find a match between the output of the clustering algorithm and the actual labels.

+ +

You might use some simple matching (ex: instances labeled GREEN are more often clustered in cluster 2 and BLUE in cluster 1 so cluster 1== BLUE and cluster 2 == GREEN).

+ +

More elegantly you can compute the Mutual Information between the clustering output and actual labels. Mutual Information has a nice property, that one doesn't need to know the exact matching. MI will give high scores if most of the matching are consistent. Think of it as a correlation coefficient between (cluster <-> actual label) relation.

+ +

Also check http://en.wikipedia.org/wiki/Cluster_analysis for some measures. The key phrase there is:

+ +
+

[...] clustering results are evaluated based on data that was not used for clustering, such as known class labels and external benchmarks. Such benchmarks consist of a set of pre-classified items, and these sets are often created by human (experts). Thus, the benchmark sets can be thought of as a gold standard for evaluation.

+
+ +

For ROC usually one needs some ""a posteriori"" probability, outputted by the classifier, but in your case, the distance between the instance and the cluster center will work. Keep in mind that ROC is computed for a specific label at a time (i.e. one vs all). So for 5 labels you will get 4 independent AUROC values.

+ +

IMHO I strongly advise you to do the CV for clustering if you have labeled data! Iterate it several times and use the mean of your measure as the performance.

+ +

I would also try this: use some percentage (usually 66%) of the unlabeled data to perform clustering, measure performance using the labeled data, repeat the experiment with different randomizations (usually 5-10 times) and report the mean performance. Unfortunately I don't know if this method will give a good estimate of your real performance. It is possible that it will overfit the labeled data set. This is not a textbook approach, so use it with caution.

+",2013-10-11 21:08:48.940 +57331,22763.0,1,,,mikepk,CDF (cumulative frequency) of multiple samples in summed normals?,,CC BY-SA 3.0,"

Say I have some normally distributed data. I have an application where I compute the percentile (or cumulative frequency less than sample) for a particular sample using a CDF function along with the mean $\mu$ and standard deviation $\sigma$ of the samples.

+ +

so $$F_x(x) = \frac 12\left[1 + \text{erf} \left (\frac {x - \mu}{\sqrt{2 \sigma^2}}\right)\right]$$

+ +

Now I find myself in a situation where I want to determine the cumulative frequency of multiple samples across multiple data sets (finding something akin to an overall percentile of, say, three samples). Now assuming the variables are independent, I can sum the normals using

+ +

$$(\mu_\text{sum}, \sigma_\text{sum}) = (\mu_x + \mu_y + \mu_z), (\sqrt{σ^2_x + σ^2_y + σ^2_z})$$

+ +

Can I then sum the individual samples I care about and compare them to the new summed normal to compute a percentile of the three samples compared to the sum of the normals? Something tells me this doesn't work but I'd like to be sure. So I'm thinking something like computing the CDF using the sum of the samples I'm interested in:

+ +

$$F_x(x_x + x_y + x_z)$$

+ +

and using the $\mu$sum and $\sigma$sum in the CDF function above.

+",2013-10-11 21:12:06.117 +57332,22359.0,2,,57279.0,,,,CC BY-SA 4.0,"

Features extracted from image/signal processing tend to get correlated a lot! This is not a very bad thing if you have enough samples.

+

From my experience, classifiers with small variance tend to work well (e.g. logistic regression). They have a lower chance of overfitting the training data.

+

Another idea that I have employed is additive logistic regression; see here and here for some references. It is already implemented in Weka. These models are slower than plain logistic models, but at the same time they have the great advantage that they perform feature selection while learning. Moreover, the model is human friendly, so you can see which features are most relevant.

+

Hope it helps

+",2013-10-11 21:23:38.743 +57333,22359.0,2,,57026.0,,,,CC BY-SA 3.0,"

Try this paper. Your answer might be at chapter 3.2, figures 2 and 3.

+ +

Long story short: the same performance can be obtained for different pairs of C and kernel parameters. You shouldn't try to manually tune an SVM.

+ +

Edit: Some details:

+ +

I usually tune C (the cost parameter) when I have largely imbalanced classes, that is, one class has 10% of the data and the other 90%. Some SVM libraries (esp. libSVM, which I use) let you specify a cost for each class. According to the libSVM paper, $\frac{c_1}{c_2} = \frac{n_2}{n_1}$ where $n_2>n_1$ and $n_i$ is the size of the $i$-th class. If you let $c_2 = 1$ then $c_1 = n_2/n_1$. There is also a ""global"" C that is multiplied with the class-specific $c_i$ values.

+ +

When the learning algorithm computes the error for the current SVM parameters, it multiplies each wrongly classified instance by this cost. If the cost is the same for both classes, the errors on the smaller class will get diluted and your final model will tend to predict the weakly represented class poorly (or not at all).

+ +

Gamma plays the role of $\sigma$ in a Gaussian kernel $G(x) = \exp(-x^2/2\sigma^2)$. Note from the equation of the RBF kernel, $K(x,y)=\exp(-\gamma\|x-y\|^2)$, that $\gamma$ is more or less proportional to $1/\sigma^2$. Basically $\gamma$ controls the width of the kernel.

+ +

The intuition behind this is that a wide kernel will tend to produce a smoother border between classes and a narrow kernel a more intricate border. In the extreme, the former will tend to give higher bias (it learns only the general aspect of the data) and the latter will tend to overfit (it learns all the details, including the outliers and errors in the data). Neither of these extremes is welcome in applications. A midpoint is desired, but this midpoint cannot be computed analytically and depends on the actual data.

+ +

This is why the metaparameters are usually searched for through cross-validation. Please keep in mind that you must optimize BOTH parameters at the same time.
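If it helps, here is a minimal sketch of such a joint search using the e1071 R package (my choice here; it wraps libSVM) on made-up data:

# Joint cross-validated grid search over cost (C) and gamma
library(e1071)

set.seed(1)
x <- matrix(rnorm(600), ncol = 2)
y <- factor(ifelse(x[, 1] + x[, 2] + rnorm(300) > 0, 'a', 'b'))

tuned <- tune.svm(x, y, gamma = 10^(-3:1), cost = 10^(-1:2))
tuned$best.parameters

# Final model; class.weights handles imbalanced classes as described above
fit <- svm(x, y,
           gamma = tuned$best.parameters$gamma,
           cost  = tuned$best.parameters$cost,
           class.weights = c(a = 1, b = 1))
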

+ +

The cost parameter C.

+ +

The theory says that SVM is a large margin classifier. In layman terms this means that it tries to find a border that is somehow as far away as possible from both classes. See the figure below (wikipedia).

+ +

+ +

Both H2 and H3 are OK, but H3 is better: if new samples arrive, they are more likely to be misclassified by H2 than by H3, because H2 passes close to the black group.

+ +

The math behind SVM ensures that if a border is found, it is the one that gives the largest gap between the two groups of data.

+ +

And now the tricky part: in the data you have outliers, errors, etc., basically points that are labeled as white but reside near the black group, and vice versa. You have two choices: move the border so that you minimize the number of samples that will be learned wrong, OR ignore a few samples here and there but ensure that the border gives you a large separation between the classes.

+ +

The cost parameter C ""tunes"" the algorithm between better fitting the available data and giving a larger margin. If I'm not mistaken, a small C means that you prefer a larger margin.

+ +

Hope it helps!

+ +

P.S. I am not an expert in SVM, so I can't give you the intuition on how exactly the value of the global cost parameter C influences the results or the convergence speed of the SVM.

+",2013-10-11 21:33:36.120 +57334,594.0,2,,47981.0,,,,CC BY-SA 4.0,"

There are two issues here:

+ +

1) If you're doing a formal hypothesis test (and if you're going as far as quoting a p-value in my book you already are), what is the formal rejection rule?

+ +

When comparing test statistics to critical values, the critical value is in the rejection region. While this formality doesn't matter much when everything is continuous, it does matter when the distribution of the test statistic is discrete.

+ +

Correspondingly, when comparing p-values and significance levels, the rule is:

+ +
+

          Reject if $p\leq\alpha$

+
+ +

Please note that, even if you rounded your p-value up to 0.05, indeed even if the $p$ value was exactly 0.05, formally, you should still reject.

+ +

2) In terms of 'what is our p-value telling us', then assuming you can even interpret a p-value as 'evidence against the null' (let's say that opinion on that is somewhat divided), 0.0499 and 0.0501 are not really saying different things about the data (effect sizes would tend to be almost identical).

+ +

My suggestion would be to (1) formally reject the null, and perhaps point out that even if it were exactly 0.05 it should still be rejected; (2) note that there's nothing particularly special about $\alpha = 0.05$ and it's very close to that borderline -- even a slightly smaller significance threshold would not lead to rejection.

+",2013-10-11 21:33:38.517 +57335,13396.0,2,,57328.0,,,,CC BY-SA 3.0,"

I think I got it. If $Z \sim \mathcal{N}(0, 1)$ but we want to generate $X$ such that its mean is $\mu$ and covariance matrix is $C$, we decompose $C = L L^T$, and let $X = L Z + \mu$.

+ +

Now we just need to carry out the reverse operations.
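A minimal sketch in R (rather than MATLAB) of that reverse operation, i.e. whitening the generated matrix so that its sample mean is exactly 0 and its sample covariance (hence correlation) is exactly the identity:

# Undo the empirical colouring transform of the generated normals
set.seed(1)
M <- 200; N <- 5
Z <- matrix(rnorm(M * N), M, N)

Q <- sweep(Z, 2, colMeans(Z))   # remove the sample means
U <- chol(cov(Q))               # sample covariance C = U'U (U upper triangular)
Q <- Q %*% solve(U)             # now cov(Q) is exactly the identity matrix

round(colMeans(Q), 12)
round(cov(Q), 12)
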

+",2013-10-11 21:39:45.703 +57336,22624.0,1,57337.0,,,How to test if a result is statistically significant?,,CC BY-SA 3.0,"

I am trying to determine if a certain conversion on my site is statistically significant. I remembered doing this type of stuff in school but I can't seem to remember how to do it now.

+ +

For the 1st set: n = 7297 and the conversion rate was 2.618%. For the 2nd set: n = 6107 and the conversion rate was 2.669%.

+ +

Any tips on how to do this?

+",2013-10-11 22:55:22.160 +57337,2069.0,2,,57336.0,,,,CC BY-SA 3.0,"

Percents are a proportion. The traditional way to test differences between proportions is the chi-square test. Based on the information you have given me (7106 and 191 [2.62%] in one half and 5944 and 163 [2.67%] in the other), the chi-square test results in a non-significant value of .88 (p value). Your proportions are 2.67 and 2.62, so it is no surprise that these are not statistically significant, despite your large sample.
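For example, in R (with the counts reconstructed from the reported percentages):

# 2.618% of 7297 is about 191 conversions; 2.669% of 6107 is about 163
prop.test(x = c(191, 163), n = c(7297, 6107))
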

+",2013-10-11 23:20:23.397 +57338,21746.0,1,,,,Rescaling input features for neural networks regression,,CC BY-SA 3.0,"

In neural nets for the regression problem, we rescale the continuous labels consistently with the output activation function, i.e. normalize them if the logistic sigmoid is used, or adjusted-normalize them if tanh is used. At the end we can restore the original range by renormalizing the outputs back.

+ +

Should we also normalize the input features? And how, for example, if the hidden activation differs from the output activation? E.g. if the hidden activation is tanh and the output activation is logistic, should the input features be normalized to lie in the [0,1] or the [-1,1] interval?

+",2013-10-11 23:29:41.150 +57341,13549.0,1,63675.0,,,Temporal autocorrelation in perMANOVA?,,CC BY-SA 4.0,"

I have a data set where samples are collected once per year for 15 years at a number of sites. I am worried that these data are temporally autocorrelated and was trying to figure out if I need to address that. However, the only time I will be using degrees of freedom with these data is in a perMANOVA. This test calculates a pseudo F-statistic by permuting the rows. I can't figure out if the exchangeability assumption means that I don't need to worry about autocorrelation at all (i.e., permuting rows will simply destroy the temporal structure, which I am not interested in anyway) or if it means that I can't use a perMANOVA even if I accounted for autocorrelation?

+ +

Edit: I am editing this in the hopes that clarification will help get it answered. The perMANOVA user's guide says:

+ +

""Recall that for traditional one-way ANOVA, the assumptions are that the errors are independent, that they are normally distributed with a mean of zero and a common variance, and that the treatment effects are additive. In the case of a one-way analysis, the PERMANOVA test using permutations assumes only that the observation units are exchangeable under a true null hypothesis. There are no explicit assumptions regarding the distributions of the original variables; they are certainly not assumed to be normally distributed. However, implicit in the notion of exchangeability is the notion of independence, for if observations are correlated with one another (e.g., temporally or +spatially), then randomly shuffling them will destroy this kind of inherent structure, if it is there. Thus, in general, we would assume that the observation units are independent of one another.""

+ +

The meaning of this is ambiguous to me for the reasons stated in the first paragraph. I can't find any techniques for testing/correcting autocorrelation with perMANOVA, which maybe means that it isn't a problem to worry about?

+ +

User's guide: https://web.archive.org/web/20180806183841/https://pdfs.semanticscholar.org/4d0c/430f6129b427e48fb407e59ac79ee29b4cae.pdf

+ +

Original 2001 paper describing technique: https://web.archive.org/web/20180806184058/https://pdfs.semanticscholar.org/038e/8869b676aa365f2afdea935edf3f2003324d.pdf

+",2013-10-12 02:33:38.043 +57342,19681.0,2,,57325.0,,,,CC BY-SA 3.0,"

You seem to be aware that the marginal effect of $X_3$ is $\delta_1 + \delta_2 X_1 + \delta_3 X_2$, which is just the derivative of the response with respect to $X_3$.

+ +

Replacing $X_1$ with $E(X_1)$ is a reasonable way to summarize the marginal effect.
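For instance, with a fitted lm object this is just (made-up data; fit stands in for your actual regression):

# Marginal effect of X3 evaluated at the sample means of X1 and X2
set.seed(1)
n  <- 500
X1 <- rnorm(n); X2 <- rnorm(n); X3 <- rnorm(n)
Y  <- 1 + X1 + 0.5 * X2 - 0.8 * X3 - 0.3 * X1 * X3 + 0.1 * X2 * X3 + rnorm(n)
fit <- lm(Y ~ X1 + X2 + X3 + X1:X3 + X2:X3)

b <- coef(fit)
b['X3'] + b['X1:X3'] * mean(X1) + b['X2:X3'] * mean(X2)
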

+ +

However, discarding the final term due to statistical insignificance is nonsense. There are at least two relatively sensible alternatives:

+ +
    +
  1. If your $n$ is so big that you believe the statistical result that $\delta_3$ is insignificant more than you believe your prior belief that $\delta_3$ should be positive, than you could get rid of the $\delta_3 X_1 X_2$ term in your model and refit the coefficients BEFORE using $\delta_1 + \delta_2 X_1$ as your marginal effect.

  2. +
  3. If you believe that the terms involving $X_2$ need to be in the model, regardless of statistical significance, than you need to keep the $X_2$ term in your marginal effect as well.

  4. +
+",2013-10-12 03:15:41.107 +57343,22629.0,1,57345.0,,,K-Means clustering after first iteration,,CC BY-SA 3.0,"

In k-means clustering we initially pick $k$ random centroids and assign the given data to one of these $k$ centroids (whichever is nearest). After this we create new centroids by taking the mean of the assigned points.

+ +

However, it might be the case that an initially selected random centroid is not nearest to any point in the dataset, and hence no points would be assigned to it. In such a case, what should be done in the step of creating new centroids?

+",2013-10-12 05:40:58.913 +57344,21243.0,2,,57323.0,,,,CC BY-SA 3.0,"

Actually, this isn't by most definitions a recommender system, and anything you read in the literature about recommender systems might be geared toward solving a similar but slightly different problem (namely, where the input and output space are the same set).

+ +

This is, by most conventional definitions, a classification problem, so looking for tags related to Classification might help.

+ +

In terms of actual approaches: as Placidia mentioned, CART and random forest methods are quite popular right now. Additionally, a classic method is logistic regression, which might be worth checking out (I'm not an R user, but I believe R provides an implementation of it).

+ +

More than that is tough to provide without a knowledge of what your system is doing and what approaches might be valid. I'd recommend refreshing your basic probability and statistics and thinking about the way that your variables might be related, then taking a look at the classification methods included with R (and their respective Wikipedia pages).

+",2013-10-12 06:34:05.823 +57345,436.0,2,,57343.0,,,,CC BY-SA 3.0,"

I am not sure if there is a ""standard"" thing to do in the case one of the initial centroids is completely off.

+ +

You can easily test this by specifying the initial centroids and see how things evolve!

+ +

For instance, R will just give you an error.

+ +

Say you do:

+ +
# Set the RNG seed to ensure reproducibility
+set.seed(12345)
+
+# Let's create 3 visually distinct clusters
+n <- c(1000, 500, 850)
+classifier.1 <- c(rnorm(n[1], 10, 0.9), 
+                  rnorm(n[2], 25, 2),
+                  rnorm(n[3], 35, 2))
+classifier.2 <- c(rnorm(n[1], 5, 1),
+                  rnorm(n[2], 10, 0.4),
+                  rnorm(n[3], 2, .9))
+
+col = c(""blue"", ""darkgreen"", ""darkred"")
+# Run k-means with 3 clusters and random initial centroids 
+# to check the clusters are correctly recognized
+km <- kmeans(cbind(classifier.1, classifier.2), 3)
+# Plot the data, colored by cluster
+plot(classifier.1, classifier.2, pch=20, col=col[km$cluster])
+
+# Mark the final centroids
+points(km$centers, pch=20, cex=2, col=""orange"")
+
+# Now impose some obviously ""wrong"" starting centroids
+start.x <- c(10, 25, 3000)
+start.y <- c(10, 10, -10000)
+km.2 <- kmeans(cbind(classifier.1, classifier.2), 
+               centers=cbind(start.x, start.y))
+
+ +

Now, R has obviously no issue in discriminating the 3 clusters when you let it choose the initial centroids, but when you run it the second time it will just say:

+ +
Error: empty cluster: try a better set of initial centers
+
+ +

I guess that if you are implementing your own algorithm you may choose to use this behaviour or rather give the user a warning and let the algorithm choose the centroids by itself.

+ +

Obviously, as others pointed out, there are algorithms such as k-means++ that help in choosing a good set of starting centroids.

+ +

Also, in R you can use the nstart parameter of the kmeans function to run several iterations with different centroids: this will improve clustering in certain situations.

+ +

EDIT: also, note from the R kmeans help page

+ +
+

The algorithm of Hartigan and Wong (1979) is used by default. Note + that some authors use k-means to refer to a specific algorithm rather + than the general method: most commonly the algorithm given by MacQueen + (1967) but sometimes that given by Lloyd (1957) and Forgy (1965). The + Hartigan–Wong algorithm generally does a better job than either of + those, but trying several random starts (nstart> 1) is often + recommended. For ease of programmatic exploration, k=1 is allowed, + notably returning the center and withinss.

+ +

Except for the Lloyd–Forgy method, k clusters will always be returned + if a number is specified. If an initial matrix of centres is supplied, + it is possible that no point will be closest to one or more centres, + which is currently an error for the Hartigan–Wong method.

+
+",2013-10-12 07:23:28.157 +57346,21586.0,2,,47981.0,,,,CC BY-SA 3.0,"

It lies in the eye of the beholder.

+ +

Formally, if there is a strict decision rule for your problem, follow it. This means $\alpha$ is given. However, I am not aware of any problem where this is the case (though setting $\alpha=0.05$ is what many practitioners do after Statistics101).

+ +

So it really boils down to what AlefSin commented before. There cannot be a ""correct answer"" to your question. Report what you got, rounded or not.

+ +

There is a huge literature on the ""significance of significance""; see for example the recent paper of one of the leading German statisticians Walter Krämer on ""The cult of statistical significance - What economists should and should not do to make their data talk"", Schmollers Jahrbuch 131, 455-468, 2011.

+",2013-10-12 07:43:00.677 +57347,6162.0,2,,10911.0,,,,CC BY-SA 3.0,"

There is a natural exact confidence interval for the grand mean in the balanced random one-way ANOVA model $$(y_{ij} \mid \mu_i) \sim_{\text{iid}} {\cal N}(\mu_i, \sigma^2_w), \quad j=1,\ldots,J, \qquad \mu_i \sim_{\text{iid}} {\cal N}(\mu, \sigma^2_b), \quad i=1,\ldots,I.$$ Indeed, it is easy to check that the distribution of the observed means $\bar{y}_{i\bullet}$ is $\bar{y}_{i\bullet} \sim_{\text{iid}} {\cal N}(\mu, \tau^2)$ with $\tau^2=\sigma^2_b+\frac{\sigma^2_w}{J}$, and it is well known that the between sum of squares $SS_b$ has distribution $$SS_b \sim J\tau^2\chi^2_{I-1}$$ and is independent of the overall observed mean $$\bar y_{\bullet\bullet} \sim {\cal N}\left(\mu, \frac{\tau^2}{I}\right).$$ Thus $$\frac{\bar y_{\bullet\bullet} - \mu}{\frac{1}{\sqrt{I}}\sqrt{\frac{SS_b}{J(I-1)}}}$$ has a Student $t$ distribution with $I-1$ degrees of freedom, wherefrom it is easy to get an exact confidence interval about $\mu$.

+ +

Note that this confidence interval is nothing but the classical interval for a Gaussian mean by considering only the group means $\bar{y}_{i\bullet}$ as the observations. +Thus the simple approach you mention:

+ +
+

The simple approach is to first compute the mean of each experiment: + 38.0, 49.3, and 31.7, and then compute the mean, and its 95% confidence interval, of those three values. Using this method, the + grand mean is 39.7 with the 95% confidence interval ranging from 17.4 + to 61.9.

+
+ +

is right. And your intuition about the ignored variation:

+ +
+

The problem with that approach is that it totally ignores the + variation among triplicates. I wonder if there isn't a good way to + account for that variation.

+
+ +

is wrong. I also mention the correctness of such a simplification in https://stats.stackexchange.com/a/72578/8402
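A quick made-up illustration in R of this simplification (3 experiments, 4 replicates each; the numbers are arbitrary):

# Simulate between- and within-experiment variation, then use only the group means
set.seed(1)
experiment <- gl(3, 4)
y <- rnorm(3, 40, 8)[experiment] + rnorm(12, 0, 3)

ybar <- tapply(y, experiment, mean)   # the observed group means
t.test(ybar)$conf.int                 # the exact interval derived above
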

+ +

Update 12/04/2014

+ +

Some details are now written on my blog: Reducing a model to get confidence intervals.

+",2013-10-12 10:02:03.747 +57348,6136.0,1,57353.0,,,KL-divergence between two categorical/multinomial distributions gives negative values?,,CC BY-SA 3.0,"

If

+ +

$$P = [0,0.9,0,0.1]$$

+ +

$$Q = [0,1,0,0]$$

+ +

Then $$KL(P||Q) = 0 + \ln(0.9/1)\cdot0.9 + 0 + 0 = -0.094$$

+ +

This shouldn't be possible from the Gibbs inequality. What am I misunderstanding?

+",2013-10-12 10:23:30.670 +57417,22425.0,1,,,,Independent but not identically distributed,,CC BY-SA 3.0,"

Let $X_1, X_2,\ldots ,X_n$ be discrete random variables.

+ +

I'm looking for a way to prove the random variables are independent but not identically distributed.

+ +

Can anyone suggest some ideas ?

+",2013-10-14 03:02:05.353 +57351,20473.0,2,,57350.0,,,,CC BY-SA 4.0,"

In an expression where more than one random variables are involved, the symbol $E$ alone does not clarify with respect to which random variable is the expected value ""taken"". For example

+ +

$$E[h(X,Y)] =\text{?} \int_{-\infty}^{\infty} h(x,y) f_X(x)\,dx$$ +or +$$E[h(X,Y)] = \text{?} \int_{-\infty}^\infty h(x,y) f_Y(y)\,dy$$

+ +

Neither. When many random variables are involved, and there is no subscript in the $E$ symbol, the expected value is taken with respect to their joint distribution:

+ +

$$E[h(X,Y)] = \int_{-\infty}^\infty \int_{-\infty}^\infty h(x,y) f_{XY}(x,y) \, dx \, dy$$

+ +

When a subscript is present... in some cases it tells us on which variable we should condition. So

+ +

$$E_X[h(X,Y)] = E[h(X,Y)\mid X] = \int_{-\infty}^\infty h(x,y) f_{h(X,Y)\mid X}(h(x,y)\mid x)\,dy $$

+ +

Here, we ""integrate out"" the $Y$ variable, and we are left with a function of $X$.

+ +

...But in other cases, it tells us which marginal density to use for the ""averaging""

+ +

$$E_X[h(X,Y)] = \int_{-\infty}^\infty h(x,y) f_{X}(x) \, dx $$

+ +

Here, we ""average over"" the $X$ variable, and we are left with a function of $Y$.

+ +

Rather confusing I would say, but who said that scientific notation is totally free of ambiguity or multiple use? You should look how each author defines the use of such symbols.

+",2013-10-12 11:56:27.427 +57352,15827.0,2,,57349.0,,,,CC BY-SA 3.0,"

Each image is composed of 32 $\times$ 32 pixels, so for a given pixel (say row 13, column 31) something measured is averaged over all the images, and the standard deviation (SD for short) for the same something is also calculated.

+ +

(value − mean) / SD is often called a z-score and is a way of standardizing values to take account of mean and SD. Presumably that's done for every pixel, meaning every pixel position.

+ +

It is spelled out that they are ""dividing by the standard deviation of all pixels over all images"" [my emphasis] and that SD would usually be calculated with reference to the corresponding overall mean. However, division by that SD would be dividing by a constant, so it won't have any effect on the images beyond a question of units.
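In code terms, a minimal sketch (a made-up matrix stands in for the image data):

# 'imgs' is an (n_images x n_pixels) matrix; values here are made up
set.seed(1)
imgs <- matrix(runif(100 * 3072, 0, 255), nrow = 100)

pixel_means <- colMeans(imgs)        # mean of each pixel position over all images
overall_sd  <- sd(as.vector(imgs))   # one SD over all pixels of all images

imgs_std <- sweep(imgs, 2, pixel_means) / overall_sd
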

+",2013-10-12 12:16:10.913 +57353,5875.0,2,,57348.0,,,,CC BY-SA 3.0,"

Let’s remove the two categories with probability $0$ in both distributions. Your example is $P = (0.9, 0.1)$ and $Q = (1,0)$.

+ +

The KL divergence is $KL(P||Q) = \sum_i p_i \log\left( {p_i \over q_i }\right)$. It is not +$$ 0.9 \times \log\, 0.9 + 0 $$ +but +$$ 0.9 \times \log\, 0.9 + 0.1 \times ( +\infty ) = + \infty.$$
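A quick check in R, using the conventions $0\log(0/q)=0$ and $p\log(p/0)=+\infty$:

kl <- function(p, q) {
  # plug-in KL divergence with the usual zero conventions
  stopifnot(isTRUE(all.equal(sum(p), 1)), isTRUE(all.equal(sum(q), 1)))
  sum(ifelse(p == 0, 0, p * log(p / q)))
}

kl(c(0, 0.9, 0, 0.1), c(0, 1, 0, 0))   # Inf, as it should be
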

+",2013-10-12 12:19:43.223 +57354,8671.0,1,57356.0,,,Computing mutual information,,CC BY-SA 3.0,"

I have a problem when computing the mutual information between two variables. +Let's consider the following table:

+ +
             k=1       k=2       k=3    
+c = 1         10        20         5    
+c = 2          5         3        20
+
+ +

I want to calculate the mutual information between the class ($c$) and the cluster ($k$).

+ +

E.g., for $\text{MI}(c=1, k=1)$, I will calculate the $P(c,k)$ as follows:

+ +
             k=1            k~=1    
+c=1        10/63           25/63    
+c~=1        5/63           23/63
+
+ +

and I can calculate the MI based on the following:

+ +

$\text{MI}(c,k) = \sum_{c\in\{0,1\}} \sum_{k\in\{0,1\}} p(c,k)log_2 \frac{p(c,k)}{p(c)p(k)}$

+ +

In this case: I am getting the same MI for $(c=1, k=1)$ and $(c=2, k=1)$. What is the reason for this? How can I calculate the MI for each $(c,k)$ based on the first table?

+",2013-10-12 13:58:25.187 +57355,20927.0,1,,,,SPSS-independent-sample-t-test,,CC BY-SA 3.0,"

I want to test mean differences between 4 groups (0,1,2,3) for 6 different cell types. In other words, I would have 6 comparisons to do for each cell type (0-1, 0-2, 0-3, 1-2, 1-3, 2-3). I have run a repeated-measures ANOVA before, but it only gives me the main effects and interactions. To correct my t-tests for multiple comparisons I would use Bonferroni; does SPSS let me do that? I just cannot seem to find it except as part of the means testing using ANOVA. Or do I have to do it manually, and how would I do that? The alpha level here is 0.05. Thanks.

+",2013-10-12 14:02:41.167 +57356,6630.0,2,,57354.0,,,,CC BY-SA 3.0,"

MI is between two random variables. In this case, $c$ and $k$ are random variables.

+ +

If you take $c=1$, it isn't really random anymore, and MI should be zero. Likewise for $k=1$.

+ +

(Also, one more thing to keep in mind is that your 'plug-in' estimation of MI is biased.)

+ +

EDIT: MI between $I(c=1)$ vs $I(k=1)$ where $I$ is the indicator function, is a different matter. Your double usage of the same variable confused me.

+ +

Now your $c$ only takes 2 values, therefore $I(c=1) = 1 - I(c=2)$. MI is invariant under invertible transformations of the variables, which is why $MI(I(c=1),I(k=1)) = MI(I(c=2),I(k=1))$.
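A quick plug-in computation from the collapsed 2x2 tables makes the symmetry visible (this is only the naive plug-in estimate mentioned above):

# Plug-in MI (in bits) from a 2x2 table of counts
mi <- function(tab) {
  p  <- tab / sum(tab)
  px <- rowSums(p); py <- colSums(p)
  sum(ifelse(p == 0, 0, p * log2(p / outer(px, py))))
}

tab_c1 <- matrix(c(10, 25,
                    5, 23), 2, byrow = TRUE)   # rows: c == 1, c != 1; cols: k == 1, k != 1
tab_c2 <- tab_c1[2:1, ]                        # rows: c == 2, c != 2 (just a row swap)

mi(tab_c1); mi(tab_c2)                         # identical
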

+",2013-10-12 14:05:53.523 +57357,14850.0,2,,56911.0,,,,CC BY-SA 3.0,"

A dependent mixture model (hidden Markov model) may be of use, depending on the type of deviations expected.

+ +

Assume that your observations come from two distributions (or states), both of which are normally distributed, but have different mean and variance.

+ +

A number of parameters can be estimated: The initial state probabilities (2 parameters), the state transition probabilities between neighbouring data points (4 parameters) and finally the mean and variance of the two distributions (4 parameters).

+ +

In R, this model can be estimated using the depmixS4 package:

+ +
library(depmixS4)
+
+set.seed(3)
+y = rnorm(100)
+y[30:35] <- rnorm(6,mean=4,sd=2)
+plot(1:100,y,""l"")
+
+m <- depmix(y~1,nstates=2,ntimes=100)
+fm <- fit(m)
+
+means <- getpars(fm)[c(7,9)]
+lines(1:100,means[fm@posterior$state],lwd=2,col=2)
+
+ +

+ +

See http://cran.r-project.org/web/packages/depmixS4/vignettes/depmixS4.pdf for references

+",2013-10-12 15:57:43.367 +57358,22637.0,1,57364.0,,,Transformations(CDF technique),,CC BY-SA 3.0,"

Consider the following short example of transformations.

+ +

Let the joint density of X and Y be uniform on the unit square, i.e.

+ +

$$f_{X,Y}(x,y) = \begin{cases} 1\ \quad 0<x<1\ \text{ and }\ 0<y<1 \\ 0 \quad \text{elsewhere} \end{cases}$$

+ +

Then the Cumulative Distribution Function of $Z=X+Y$ is given by:

+ +

$$ +F_Z = \begin{cases}\begin{array}{ll} 0\ & \text{ for }\ z<0 \\ \int_0^{z} \int_0^{z-x} dy\,dx\ & \text{ for }\ 0\leq z <1 \\1-\int_{z-1}^1 \int_{z-x}^1 dy\,dx\ & \text{ for }\ 1\leq z<2 \\1\ & \text{ for }\ 2\leq{z} \end{array}\end{cases} +$$

+ +

I understand why we have to partition our CDF; what I am having trouble figuring out is why the integral takes that specific form on the interval $[1,2)$. What is the intuition here? Thanks.

+",2013-10-12 16:48:18.213 +57359,346.0,1,57687.0,,,How to include a linear and quadratic term when also including interaction with those variables?,,CC BY-SA 3.0,"

When adding a numeric predictor with categorical predictors and their interactions, it is usually considered necessary to center the variables at 0 beforehand. The reasoning is that the main effects are otherwise hard to interpret as they are evaluated with the numeric predictor at 0.

+ +

My question now is how to center if one not only includes the original numeric variable (as a linear term) but also the quadratic term of this variable? Here, two different approaches are necessary:

+ +
    +
  1. Centering both variables at their individual mean. This has the unfortunate downside that the 0 now is at a different position for both variables considering the original variable.
  2. +
  3. Centering both variables at the mean of the original variable (i.e., subtracting the mean from the original variable for the linear term and subtracting the square of the mean of the original variable from the quadratic term). With this approach the 0 would represent the same value of the original variable, but the quadratic variable would not be centered at 0 (i.e., the mean of the variable wouldn't be 0).
  4. +
+ +

I think that approach 2 seems reasonable given the reason for centering after all. However, I cannot find anything about it (also not in the related questions: a and b).

+ +

Or is it generally a bad idea to include linear and quadratic terms and their interactions with other variables in a model?

+",2013-10-12 17:07:43.177 +57360,22381.0,1,,,,Does applying ARMA-GARCH require stationarity?,,CC BY-SA 3.0,"

I am going to use the ARMA-GARCH model for financial time series and was wondering whether the series should be stationary before applying that model. I know that to apply an ARMA model the series should be stationary; however, I'm not sure about ARMA-GARCH, since I'm including GARCH errors, which imply volatility clustering and non-constant variance and hence a non-stationary series no matter what transformation I apply.

+ +

Are financial time series usually stationary or non-stationary? I tried applying the ADF test to a few volatile series and got p-value < 0.01, which seems to indicate stationarity, but the very idea of a volatile series suggests to me that the series isn't stationary.

+ +

Can somebody clear that up for me? I'm getting really confused.

+",2013-10-12 17:14:19.543 +57361,20473.0,2,,57360.0,,,,CC BY-SA 3.0,"

Copying from the abstract of Engle's original paper:
+""These are mean zero, serially uncorrelated processes with nonconstant variances conditional on the past, but constant unconditional variances. For such processes, the recent past gives information about the one-period forecast variance"".

+ +

Continuing with the references, as the author who introduced GARCH shows (Bollerslev, Tim (1986). ""Generalized Autoregressive Conditional Heteroskedasticity"", Journal of Econometrics, 31:307-327) +for the GARCH(1,1) process, it suffices that $\alpha_1 + \beta_1 <1$ for 2nd-order stationarity.

+ +

Stationarity (the one needed for estimation procedures), is defined relative to the unconditional distribution and moments.

+ +

ADDENDUM
+To summarize here discussion in the comments, the GARCH modeling approach is an ingenious way to model suspected heteroskedasticity over time, i.e. of some form of heterogeneity of the process (which would render the process non-stationary) as an observed feature that comes from the existence of memory of the process, in essence inducing stationarity at the unconditional level.

+ +

In other words, we took our two ""great opponents"" in stochastic process analysis (heterogeneity and memory), and used the one to neutralize the other -and this is indeed an inspired strategy.

+",2013-10-12 17:42:59.303 +57362,1406.0,2,,57360.0,,,,CC BY-SA 3.0,"

Yes, the series should be stationary. GARCH models are actually white noise processes with a non-trivial dependence structure. The classical GARCH(1,1) model is defined as

+ +

$$r_t=\sigma_t\varepsilon_t,$$

+ +

with

+ +

$$\sigma_t^2=\alpha_0+\alpha_1\varepsilon_{t-1}^2+\beta_1\sigma_{t-1}^2,$$

+ +

where the $\varepsilon_t$ are independent standard normal variables (zero mean, unit variance).

+ +

Then

+ +

$$Er_t=EE(r_t|\varepsilon_{t-1},\varepsilon_{t-2},...)=E\sigma_tE(\varepsilon_t|\varepsilon_{t-1},\varepsilon_{t-2},...)=0$$

+ +

and

+ +

$$Er_tr_{t-h}=EE(r_tr_{t-h}|\varepsilon_{t-1},\varepsilon_{t-2},...)=Er_{t-h}\sigma_{t}E(\varepsilon_t|\varepsilon_{t-1},\varepsilon_{t-2},...)=0$$

+ +

for $h>0$. Hence $r_t$ is a white noise process. However, it is possible to show that $r_t^2$ is actually an $ARMA(1,1)$ process. So GARCH(1,1) is a stationary process, yet it has non-constant conditional variance.
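A quick simulation makes this visible (standard GARCH(1,1) recursion; the parameter values are my own arbitrary choices):

# Simulated GARCH(1,1): uncorrelated returns, strongly autocorrelated squares
set.seed(1)
n <- 5000
omega <- 0.1; alpha <- 0.1; beta <- 0.8      # alpha + beta < 1
r <- sigma2 <- numeric(n)
sigma2[1] <- omega / (1 - alpha - beta)      # unconditional variance
r[1] <- sqrt(sigma2[1]) * rnorm(1)
for (t in 2:n) {
  sigma2[t] <- omega + alpha * r[t - 1]^2 + beta * sigma2[t - 1]
  r[t] <- sqrt(sigma2[t]) * rnorm(1)
}

acf(r)     # essentially no autocorrelation in r itself
acf(r^2)   # strong, persistent autocorrelation in the squared series
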

+",2013-10-12 17:56:44.900 +57363,22639.0,1,57506.0,,,Machine learning algorithms/approaches for class recommendations?,,CC BY-SA 3.0,"

I am asking a theoretical question about machine learning in terms of clustering. Given data on the classes students have taken in a semester, is it possible to recommend additional classes a student should take once they have selected some classes?

+ +

I am thinking along the lines of forming clusters of classes and figuring out whether a particular set of picked classes matches a pre-existing cluster of classes, and then recommending the classes that are in that cluster. But I am new to machine learning, so I welcome any other suggestions of algorithms.

+ +

In addition, this is not particularly theoretical, so feel free to ignore it: does anyone know of any particular software that can accomplish this? I know LensKit is software for handling recommendations, but it seems to need ratings (which I do not have).

+ +

I welcome any mathematical manipulations that can turn clusters into ""ratings."" Thanks.

+",2013-10-12 18:03:28.603 +57364,10135.0,2,,57358.0,,,,CC BY-SA 3.0,"

OK, check out the following plot.

+ +

You need to find the area of shaded region. So you need to take double integration over the shaded region. First you fix your $X$ and take your integration with respect to $Y$. Look at the double bar in the middle of that triangle (upper right corner). Its lower part goes from $Y=-X+z$ to its upper part $Y=1$. These are the bounds for the first integration. Now you need to move that little bar in the middle to left and right to cover all the shaded region. In other words, it means that this time, you need to take your integration with respect to $X$. So as you can see, the line $Y=-X+z$ for $1\leq z<2$ intercepts the line $Y=1$ at $X=z-1$. This is the left boundary limit for your integration. Now move that little bar to the right, it should go up to $X=1$, that gives you the upper bound. Hope that helps.

+",2013-10-12 18:10:09.217 +57469,22507.0,2,,57467.0,,,,CC BY-SA 3.0,"

You cannot. ""Accept that $b_1=0$"" is the same as ""reject that $b_1\ne 0$"". But on what basis you could do this? No matter how many observations you have, you cannot distinguish between 0 and sufficiently small value of $b_1$. You can only accept that $|b_1|<\epsilon$ (the smaller $\epsilon$ the more observations you need).

+",2013-10-14 22:07:54.383 +57365,22640.0,1,,,,"Drawing data from ""population"" for regression analysis",,CC BY-SA 3.0,"

We have a response variable $Y$ and predictor $X$, and we draw $n$ samples $(Y_1,X_1), \ldots, (Y_n, X_n)$ from the population of interest to do a regression analysis. Under the assumptions of a simple linear regression model, my question is a conceptual one: how do we really think about the response on the $i$th unit, $Y_i$? Do we say it's drawn from the level or subpopulation of individuals with $ X = x_i $, or from the aggregate population over all the values of $X$? Moreover, while we assume that the response $Y$ in every subpopulation defined by $X$ is normal with equal variances, how do we think about the aggregate population from which $Y_i$ is drawn?

+",2013-10-12 18:36:45.027 +57366,306.0,2,,57360.0,,,,CC BY-SA 3.0,"

Stationarity is a theoretical concept, which is then modified to other forms like weak-sense stationarity that can be tested more easily. Most of the tests, like the ADF test you mentioned, test for linear conditions only. ARCH effects are designed for series which have no first-order autocorrelation but do show dependence in the squared series.

+ +

In the ARMA-GARCH process you talk about, the second-order dependence is removed by the GARCH part, and any remaining dependence in the linear terms is captured by the ARMA part.

+ +

The way to go about it is to check the autocorrelation of the squared series; if there is dependence, apply a GARCH model, and then check the residuals for any linear time-series properties, which can be modelled using ARMA processes.

+",2013-10-12 18:41:16.300 +57367,22641.0,1,,,,"Test for differences between (among) related, but not matched, samples",,CC BY-SA 3.0,"

When two samples are related, or dependent, but the observations are not matched, are there any tests that will determine if the samples (means or otherwise) are different? I've searched extensively and have only found tests for matched samples, which is not what I need.

+",2013-10-12 18:47:43.087 +57368,21840.0,1,,,,Compute probability,,CC BY-SA 3.0,"

Suppose that $X$, $Y$ and $Z$ are $\text{i.i.d.} \sim \text{Uniform}(0,1)$. Let $t > 0$ be a fixed constant.

+ +

(i) Compute $P(X/Y \leq t)$
+(ii) Compute $ P(XY \leq t)$
+(iii) Compute $ P(XY/Z \leq t)$

+ +

I found the solution for part (i) by considering the different values of $t$:
+$ P(X/Y \leq t) = \int_0^1\int_0^{yt}dx\, dy = t/2 $ when $t\leq 1$.

+ +

I am finding it hard to take the limits for the different values of $t$, i.e. when $t\leq 1$ and when $t>1$.

+",2013-10-12 19:10:33.277 +57369,22507.0,2,,57363.0,,,,CC BY-SA 3.0,"

Clustering is seldom, if ever, used for recommendations, since it is too crude. The most common techniques used are:

+ +
    +
  • matrix factorization; read, for example, ""Matrix Factorization Techniques for Recommender Systems"" by Koren, Bell, and Volinsky. If you use R, there are packages NMFN and gnmf for non-negative matrix factorization. In your case, the input will be the matrix of 0's and 1's. There are many modifications and versions of this technique (see the small sketch after this list).
  • +
  • KNN. For each class, find the classes most correlated with it. Then predict the probability for this class as a linear regression (or, in your case, a logistic regression) of those correlated classes, with relaxation.
  • +
  • Restricted Boltzmann Machines. This is relatively hard to understand or implement. Read, for example, ""Restricted Boltzmann Machines for Collaborative Filtering"" by Salakhutdinov, Mnih, and Hinton. There are no Restricted Boltzmann Machine packages on R.
  • +
  • Often, a combination of different approaches (blending) is used, providing better results than each one separately. For example, Netflix uses a blending of Matrix Factorization and Restricted Boltzmann Machines.
  • +
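
+ +

For the matrix-factorization option above, here is a minimal base-R sketch on a fabricated 0/1 student-by-class matrix; it uses a plain truncated SVD as a stand-in for the NMF packages mentioned, purely to illustrate scoring classes a student has not yet taken.

+ +
set.seed(1)
+M <- matrix(rbinom(20 * 8, 1, 0.3), nrow = 20, ncol = 8)   # toy enrollment matrix: 20 students, 8 classes
+s <- svd(M)                                                # plain SVD as a stand-in for NMF
+k <- 3                                                     # number of latent factors
+fit <- s$u[, 1:k] %*% diag(s$d[1:k]) %*% t(s$v[, 1:k])     # low-rank reconstruction = predicted affinity
+cand <- which(M[1, ] == 0)                                 # classes student 1 has not taken
+cand[order(fit[1, cand], decreasing = TRUE)]               # recommend the highest-scoring ones
+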
+",2013-10-12 19:31:39.603 +57370,22507.0,2,,15542.0,,,,CC BY-SA 3.0,"

I recommend ""The Elements of Statistical Learning"", by Hastie, Tibshirani, and Friedman. Don't just read it, play with some algorithms described by them (most of them are implemented in R, or you could even implement some yourself), and learn their weak and strong points.

+",2013-10-12 19:39:43.643 +57371,22507.0,2,,45804.0,,,,CC BY-SA 3.0,"

Calculate a correlation of two functions over a set of random examples. The two-sided Kolmogorov-Smirnov test compares one-dimensional distributions, not multidimensional functions.

+",2013-10-12 20:15:29.000 +57372,21947.0,2,,47981.0,,,,CC BY-SA 3.0,"

The answer is absolutely not. There is no ""in the eye of the beholder"", there is no argument; the answer is no, your data is not significant at the $p=0.05$ level. (OK, there is one way out, but it's a very narrow path.)

+ +

The key problem is this phrase: ""We came across some data..."".

+ +

This suggests that you looked at several other statistical hypotheses, and rejected them because they did not reach your significance level. You found one hypothesis that (barely) met your standard, and you are wondering whether it is significant. Unless your $p$ value accounts for such multiple hypothesis testing, it is overly optimistic. Given that you are just three decimal points away from your threshold, considering even one additional hypothesis would surely push $p$ over the line.
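
+ +

To make the multiple-testing point concrete, a tiny R sketch with made-up p-values: correcting for even one additional test (Bonferroni here) pushes a borderline value well past 0.05.

+ +
p.adjust(c(0.0497, 0.20), method = ""bonferroni"")
+# returns 0.0994 and 0.4000: the borderline result is no longer significant at 0.05
+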

+ +

There is a name for this sort of statistical malfeasance: data dredging. I'm ambivalent about reporting it in the paper as an interesting hypothesis; is there some physical reason you expect it to hold?

+ +

There is, however, one way out. Perhaps you decided a priori to perform just this one test on just this one data set. You wrote that down in your lab notebook, in front of someone so that you could prove it later. Then you did your test.

+ +

If you did this, then your result is valid at the $p=0.05$ level, and you can back it up to skeptics like me. Otherwise, sorry, it is not a statistically significant result.

+",2013-10-12 20:45:05.873 +57373,17459.0,1,,,,what's the pdf and covariance for this distribution?,,CC BY-SA 3.0,"

I am stuck on a problem and wonder if anyone can give me some suggestions.

+ +

$X_1, X_2, X_3$ all follow a $\text{Uniform}[0,1]$ distribution and are subject to the constraint $X_1+X_2+X_3\leq 1$.

+ +

What's the joint distribution for $(X_1, X_2, X_3)$, that is, what's $p(X_1, X_2, X_3)$, and what's the variance-covariance matrix for it?

+ +

I obtained the joint distribution by a geometric argument, namely that the pdf should be $1/6$.

+ +

However, I can't calculate the variance-covariance matrix for it. I wonder how to get it?

+",2013-10-13 03:46:30.167 +57374,17123.0,1,,,,How to estimate the confidence interval using sample average and sample size ONLY?,,CC BY-SA 3.0,"

Suppose there is a population with goods and bads. The bad rate of the population (= bads/(bads+goods)) is of course unknown.

+ +

Now, I have a sample of $N$ from the population and I know the bad rate of this sample, $b$. The question is: can I calculate a confidence interval based on $N$ and $b$ ONLY? In other words, can I calculate a half-width $x$ such that, with, say, 95% confidence, the population bad rate falls in $[\text{range}_1,\text{range}_2]$, where $\text{range}_1$ is $b-x$ and $\text{range}_2$ is $b+x$?

+",2013-10-13 04:36:36.017 +57375,,2,,57374.0,anon,,,CC BY-SA 3.0,"

Exactly as Glen_b said. Under random sampling, the confidence interval for a binomial proportion can easily be calculated using, e.g., the normal approximation. The formula can be found in Wikipedia, among other sources (http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval).

+ +

As an example, for a sample of 1000 with a bad proportion of 0.5, the 95% confidence interval would run from 0.5-sqrt((1/1000)*0.5*0.5)*1.96 to 0.5+sqrt((1/1000)*0.5*0.5)*1.96. In other words, in this case the 95% confidence interval would be approximately 0.469-0.531.
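
+ +

The same normal-approximation calculation in R, for the record:

+ +
p <- 0.5; n <- 1000
+se <- sqrt(p * (1 - p) / n)
+p + c(-1, 1) * qnorm(0.975) * se   # approximately 0.469 to 0.531
+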

+",2013-10-13 06:29:18.963 +57376,19455.0,2,,57326.0,,,,CC BY-SA 3.0,"

I would suggest a different approach. Instead of sweeping across all possible classifiers, stop and think about your problem. What does your feature space look like? For the case of binary classification, are there two large clusters with some boundary, or is your feature space ""segmented"" into many clusters?

+ +

In the former case, an SVM would be a good choice to separate the two clusters (with the right choice of kernel); in the latter, a decision tree which splits the feature space into areas would probably be a better choice. Another issue is interpretability: do you need some sort of report or methodology for the classification, or simply a prediction result? A decision tree can provide you with a methodology you can follow, enabling you to debug and check whether you are overfitting. From my personal experience, understanding your dataset is at least as important as the choice of algorithm.

+",2013-10-13 07:01:11.173 +57377,12756.0,1,57387.0,,,Usage of Linear optimization model,,CC BY-SA 3.0,"

Young Energy operates a power plant. The power plant is a coal-fired boiler that produces steam which in turn drives a generator. The company can buy different types of coal and mix them; the mix is then fired in the boiler to meet the demands placed on the plant. The table shows the characteristics of the different types of coal:

+ +

+ +

The requirements for the mix burned in the boiler are:

+ +
    +
  • BTU/lb: 11900,
  • +
  • ash content at most 12.2%, and
  • +
  • moisture at most 9.4%.
  • +
+ +
+

How should I implement a linear optimization model in this context?

+
+",2013-10-13 07:04:53.763 +57378,5001.0,1,57389.0,,,Interpretation of the p-value of the y-intercept coefficient in a linear regression,,CC BY-SA 3.0,"

I am trying to interpret one of the p-values in a one variable linear regression. Some of the answers I've seen for similar questions were not worded as thoroughly as I would have liked. My interpretation is deliberately verbose because it will aid my understanding if faults are found within it.

+ +

From Microsoft Excel the linear regression formula from 90 samples of (x,y) pairs is

+ +

y = 0.514x + 0.00087

+ +

and the p-value of the first coefficient is 4e-16 (scientific notation) and for the second it is 0.0027.

+ +

Would it be correct to say that the interpretation of the p-value of the 0.00087 term is:

+ +
+

Under the assumption that the true value of the y-intercept is zero and the first coefficient is 0.514, random sampling of the same number of (x,y) pairs, specifically 90, would result in a least squares best fit line with a y-intercept at least as extreme as 0.00087, with a probability of 0.0027.

+
+ +

If not, then what would be the correct interpretation?

+ +

Not so importantly, but just to be complete, I am also inquiring if it would be more accurate and complete to put the relevant phrase as

+ +
+

""at least as extreme as 0.00087 in the same direction, that is, + positive"".

+
+ +

Edit: The Excel function is Tools > Data Analysis > Regression in Office 2003 with Service Pack 2. Excel regression p-values on coefficients are two-sided.

+ +

Edit: Regarding differentiation from this question here: the most upvoted answer there discusses the p-value of a hypothesis, which seems ill-defined or at least not specific. I am not interested in that. I am interested in the p-value of a coefficient that is not the coefficient of an independent variable. I am being very specific.

+",2013-10-13 08:09:51.043 +57379,22646.0,1,,,,Standard error of the residuals for a non-linear model,,CC BY-SA 3.0,"

Hi, I am new to R and statistics and am used to linear models. Can you please explain the output below? I used it to fit a growth curve.

+ +
Formula: length ~ a * (1 - exp(-c * est_age))
+
+Parameters:
+   Estimate Std. Error t value Pr(>|t|)    
+a 1.097e+03  1.026e+01 106.966  < 2e-16 ***
+c 1.539e-01  1.982e-02   7.765 2.33e-09 ***
+---
+Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 
+
+Residual standard error: 41.74 on 38 degrees of freedom
+Number of iterations to convergence: 6 
+Achieved convergence tolerance: 7.32e-07 
+
+",2013-10-13 08:15:40.013 +57380,22648.0,1,,,,Negative values in predictions for an always-positive response variable in linear regression,,CC BY-SA 3.0,"

I'm trying to predict a response variable in a linear regression that should always be positive (cost per click). It's a monetary amount. In AdWords, you pay Google for clicks on your ads, and a negative number would mean that Google pays you when people click :P

+ +

The predictors are all continuous values. The Rsquared and RMSE are decent when compared to other models, even out-of-sample:

+ +
  RMSE        Rsquared 
+1.4141477     0.8207303
+
+ +

I cannot rescale the predictions, because it's money, so even a small rescaling factor could change costs significantly.

+ +

As far as I understand, for the regression model there's nothing special about zero and negative numbers, so it finds the best regression hyperplane no matter whether the output is partly negative.

+ +

This is a very first attempt, using all variables I have. So there's room for refinement.

+ +

Is there any way to tell the model that the output cannot be negative?

+",2013-10-13 09:41:29.343 +57381,8074.0,2,,57195.0,,,,CC BY-SA 3.0,"

I think there are a few options for showing this type of data:

+ +

The first option would be to conduct an ""Empirical Orthogonal Functions Analysis"" (EOF) (also referred to as ""Principal Component Analysis"" (PCA) in non-climate circles). For your case, this should be conducted on a correlation matrix of your data locations. For example, your data matrix dat would be your spatial locations in the column dimension, and the measured parameter in the rows; So, your data matrix will consist of time series for each location. The prcomp() function will allow you to obtain the principal components, or dominant modes of correlation, relating to this field:

+ +
res <- prcomp(dat, retx = TRUE, center = TRUE, scale = TRUE) # center and scale should be ""TRUE"" for an analysis of dominant correlation modes)
+#res$x and res$rotation will contain the PC modes in the temporal and spatial dimension, respectively.
+
+ +

The second option would be to create maps that show correlation relative to an individual location of interest:

+ +
C <- cor(dat)
+#C[,n] would be the correlation values between the nth location (e.g. dat[,n]) and all other locations. 
+
+ +

EDIT: additional example

+ +

While the following example doesn't use gappy data, you could apply the same analysis to a data field following interpolation with DINEOF (http://menugget.blogspot.de/2012/10/dineof-data-interpolating-empirical.html). The example below uses a subset of monthly anomaly sea level pressure data from the following data set (http://www.esrl.noaa.gov/psd/gcos_wgsp/Gridded/data.hadslp2.html):

+ +
library(sinkr) # https://github.com/marchtaylor/sinkr
+
+# load data
+data(slp)
+
+grd <- slp$grid
+time <- slp$date
+field <- slp$field
+
+# make anomaly dataset
+slp.anom <- fieldAnomaly(field, time)
+
+# EOF/PCA of SLP anom
+P <- prcomp(slp.anom, center = TRUE, scale. = TRUE)
+
+expl.var <- P$sdev^2 / sum(P$sdev^2) # explained variance
+cum.expl.var <- cumsum(expl.var) # cumulative explained variance
+plot(cum.expl.var)
+
+ +

Map the leading EOF mode

+ +
# make interpolation
+require(akima)
+require(maps)
+
+eof.num <- 1
+F1 <- interp(x=grd$lon, y=grd$lat, z=P$rotation[,eof.num]) # interpolated spatial EOF mode
+
+
+png(paste0(""EOF_mode"", eof.num, "".png""), width=7, height=6, units=""in"", res=400)
+op <- par(ps=10) #settings before layout
+layout(matrix(c(1,2), nrow=2, ncol=1, byrow=TRUE), heights=c(4,2), widths=7)
+#layout.show(2) # run to see layout; comment out to prevent plotting during .pdf
+par(cex=1) # layout has the tendency change par()$cex, so this step is important for control
+
+par(mar=c(4,4,1,1)) # I usually set my margins before each plot
+pal <- jetPal
+image(F1, col=pal(100))
+map(""world"", add=TRUE, lwd=2)
+contour(F1, add=TRUE, col=""white"")
+box()
+
+par(mar=c(4,4,1,1)) # I usually set my margins before each plot
+plot(time, P$x[,eof.num], t=""l"", lwd=1, ylab="""", xlab="""")
+plotRegionCol()
+abline(h=0, lwd=2, col=8)
+abline(h=seq(par()$yaxp[1], par()$yaxp[2], len=par()$yaxp[3]+1), col=""white"", lty=3)
+abline(v=seq.Date(as.Date(""1800-01-01""), as.Date(""2100-01-01""), by=""10 years""), col=""white"", lty=3)
+box()
+lines(time, P$x[,eof.num])
+mtext(paste0(""EOF "", eof.num, "" [expl.var = "", round(expl.var[eof.num]*100), ""%]""), side=3, line=1) 
+
+par(op)
+dev.off() # closes device
+
+ +

+ +

Create correlation map

+ +
loc <- c(-90, 0)
+target <- which(grd$lon==loc[1] & grd$lat==loc[2])
+COR <- cor(slp.anom)
+F1 <- interp(x=grd$lon, y=grd$lat, z=COR[,target]) # interpolated spatial EOF mode
+
+
+png(paste0(""Correlation_map"", ""_lon"", loc[1], ""_lat"", loc[2], "".png""), width=7, height=5, units=""in"", res=400)
+
+op <- par(ps=10) #settings before layout
+layout(matrix(c(1,2), nrow=2, ncol=1, byrow=TRUE), heights=c(4,1), widths=7)
+#layout.show(2) # run to see layout; comment out to prevent plotting during .pdf
+par(cex=1) # layout has the tendency change par()$cex, so this step is important for control
+
+par(mar=c(4,4,1,1)) # I usually set my margins before each plot
+pal <- colorRampPalette(c(""blue"", ""cyan"", ""yellow"", ""red"", ""yellow"", ""cyan"", ""blue""))
+ncolors <- 100
+breaks <- seq(-1,1,,ncolors+1)
+image(F1, col=pal(ncolors), breaks=breaks)
+map(""world"", add=TRUE, lwd=2)
+contour(F1, add=TRUE, col=""white"")
+box()
+
+par(mar=c(4,4,0,1)) # I usually set my margins before each plot
+imageScale(F1, col=pal(ncolors), breaks=breaks, axis.pos = 1)
+mtext(""Correlation [R]"", side=1, line=2.5)
+box()
+
+par(op)
+
+dev.off() # closes device
+
+ +

+",2013-10-13 10:19:34.250 +57382,,1,57392.0,,user10619,Does sampling error include measurement error?,,CC BY-SA 3.0,"

Gross sampling error (MSE) appears to be a composite of two errors: sampling error and measurement error. How do we assess measurement error? Can we find out the net sampling error?

+",2013-10-13 11:56:43.490 +57390,21985.0,1,57427.0,,,"Auto regressive process, maximum likelihood estimator",,CC BY-SA 3.0,"

A first-order autoregressive process, $X_0,\dots,X_n$, is given through the following conditional distributions: +$X_i | X_{i-1},\dots,X_0 \sim \mathcal{N}(\alpha X_{i-1},1)$, +for $i = 1,2,\dots,n$ and $X_0 \sim \mathcal{N}(0,1)$.

+ +

I know that the log-likelihood function $\ell(\alpha)$ is of the form $\ell(\alpha) = - \frac{1}{2} \sum_{i=1}^n (x_i - \alpha x_{i-1})^2 + c$, but I don't know how to show that.

+ +

I found the following solution for $\hat{\alpha}_{ML}$: $\hat{\alpha}_{ML} = s/t$, where $s = \sum_{i=1}^n x_i x_{i-1}$ and $t = \sum_{i=1}^n x_{i-1}^2$. Is this right?

+ +

Then I have to show that this is the global maximum. If I take the second derivative I get a constant, $-t$, which is non-positive. Is this enough to conclude that I found the global maximum, given that the first derivative is linear in $\alpha$?
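
+ +

As a sanity check (not a proof), here is a quick R simulation with an arbitrarily chosen true value $\alpha = 0.6$; the candidate estimator $s/t$ lands close to it:

+ +
set.seed(1)
+n <- 10000; alpha <- 0.6
+x <- numeric(n + 1)
+x[1] <- rnorm(1)                                  # x_0
+for (i in 2:(n + 1)) x[i] <- alpha * x[i - 1] + rnorm(1)
+sum(x[-1] * x[-(n + 1)]) / sum(x[-(n + 1)]^2)     # close to the true alpha
+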

+",2013-10-13 14:45:19.743 +57540,22715.0,1,,,,Is there a statistical application that requires strong consistency?,,CC BY-SA 3.0,"

I was wondering whether there exists an application in statistics in which strong consistency of an estimator is required instead of weak consistency; that is, an application for which strong consistency is essential and which would not work with weak consistency alone.

+",2013-10-15 15:00:30.140 +57383,,2,,44370.0,anon,,,CC BY-SA 3.0,"

This depends on what you mean by a genomic location. For each cytoband this would be rather straight forward to do. Roughly:

+ +

1) Get the cytoband locations for all genes. These are stored in the organism specific packages, e.g., org.Dm.eg.db, and are named as 'MAP' . You might need the chiptype specific annotation package to map between the probe identifiers and the genes first.

+ +

2) Once you have the cytoband annotations for the genes, you can then test each cytoband separately with the functionality offered by, e.g., the topGO package. There is a section with the heading 'Predefined list of interesting genes' in the vignette of the topGO package that briefly shows how to do this in a similar case.

+ +

For the smoothing approach you have thought of, it might be worth correcting the counts with the actual number of genes in any predefined window, taking into account that not all genes might be present on the chip. The exact gene locations are available in the organism specific annotation package (the same as above). Some difficulties might arise, since certain locations probably have a gene in both strands, so you just need to decide how to count them.

+ +

The cytoband-based approach is available in, e.g., the Chipster software (see the manual entry at http://chipster.csc.fi/manual/stat-hyperG-cytoband.html), and the source code for the analysis is available at https://github.com/chipster/chipster/blob/master/src/main/modules/microarray/R-2.12/stat-hyperG-cytoband.R, which might help with some details if you decide to use the cytobands.

+",2013-10-13 12:11:47.693 +57384,306.0,2,,57378.0,,,,CC BY-SA 3.0,"

Assuming the coefficient estimate to be normally distributed with a mean of 0 (under the null) and the estimated standard error which you have not mentioned, the p-value tells you how far out in that distribution the calculated value lies. In the given case, the coefficient is different from 0 even at the 99.73% confidence level. If the confidence level that you want is higher than this, then you fail to reject the hypothesis that the coefficient is 0.

+",2013-10-13 12:29:25.737 +57385,22651.0,1,,,,Testing linearity,,CC BY-SA 3.0,"

I have two variables and I need to test whether they exhibit a linear relationship so that I will be able to predict a response. Kindly assist with how to handle this problem. This is the data; the task is to show that there is a linear relationship between the number of bricks used and the waste generated.

+ +
Trials            1       2       3       4       5       6       7        8
+No. Bricks (x)  1400    1800    2100    2400    2700    3000    3500    3800
+Wastage, % (y)  10.31   12.26   13.32   15.65   15.12   18.93   20.72   19.04
+
+",2013-10-13 13:09:09.757 +57386,503.0,2,,57385.0,,,,CC BY-SA 3.0,"

My first point would be that you do not need to have a linear relationship in order to predict a response.

+ +

Second, if you are trying to predict a response outside the range of the data (i.e. to less than 1400 or more than 3800 bricks) be very cautious.

+ +

To your question: The first thing I would do is make a graph. In R this could be done as follows:

+ +
x <- c(1400, 1800, 2100, 2400, 2700, 3000, 3500, 3800)
+y <- c(10.31, 12.26, 13.32, 15.65, 15.12, 18.93, 20.72, 19.04)
+plot(x,y)
+lines(lowess(x,y))
+
+ +

The last line adds a lowess smooth to the plot. The relationship appears to be linear at the lower levels, but then flattens at higher numbers of bricks.

+ +

I would not rely on any statistical test of linearity. With only 8 points, the deviation from linearity would have to be extreme for it to be significant and a much smaller deviation might be important.

+",2013-10-13 13:23:42.913 +57387,12522.0,2,,57377.0,,,,CC BY-SA 3.0,"

A possible formulation of this model is as follows:

+ +

The purpose of the optimization problem is to obtain the % of each coal type to mix in order to minimize the cost of the mix without violating any operational constraint.

+ +

$i = $ index for coal type (1 = A, 2 = B, 3 = C, 4 = D)

+ +

$x_{i} =$ % of coal type $i$ to be included in the mix

+ +

$c_{i} =$ cost per pound of coal of type $i$

+ +

$b_{i} =$ BTUs per pound of coal of type $i$

+ +

$a_{i} =$ % of ashes of coal of type $i$

+ +

$m_{i} =$ % of moisture of coal of type $i$

+ +

Objective Function: Minimize the cost of a pound of the mix

+ +

Min $Z = \sum_{i=1}^{4} c_{i} \cdot x_{i}$

+ +

Subject to the following constraints:

+ +
    +
  • $BTU/lb$ of the mix must be equal to 11,900:

    + +

    $\sum_{i=1}^4 b_{i} \cdot x_{i} = 11900$

  • +
  • Content of ashes of the mix must be less than 12.2%:

    + +

    $\sum_{i=1}^4 a_{i} \cdot x_{i} \leqslant 12.2\%$

  • +
  • The percent of moisture of the mix must be less than 9.4%:

    + +

    $\sum_{i=1}^4 m_{i} \cdot x_{i} \leqslant 9.4\%$

  • +
  • The percent of each coal in the mix must add up to 100%:

    + +

    $\sum_{i=1}^4 x_{i} = 100\%$

  • +
  • Non-negativity constraint:

    + +

    $x_{i} \geqslant 0, \forall i$

  • +
+ +

You can implement the model in R using the Rglpk package or using the Excel Solver Add-in in MS Excel.
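
+ +

For illustration only, here is a rough Rglpk sketch of the model; the coal characteristics below are made up, since the original table is not reproduced here, so replace them with the real costs, BTU, ash and moisture figures.

+ +
library(Rglpk)
+# hypothetical per-pound data for coal types A-D (replace with the real table)
+cost  <- c(31, 26, 34, 29)              # c_i, cost per pound
+btu   <- c(12300, 11500, 12800, 11900)  # b_i, BTU per pound
+ash   <- c(11, 13, 10, 12.5)            # a_i, % of ashes
+moist <- c(8, 10, 7, 9)                 # m_i, % of moisture
+mat  <- rbind(btu, ash, moist, rep(1, 4))
+dirs <- c(""=="", ""<="", ""<="", ""=="")
+rhs  <- c(11900, 12.2, 9.4, 1)          # x_i expressed as fractions summing to 1
+sol <- Rglpk_solve_LP(obj = cost, mat = mat, dir = dirs, rhs = rhs, max = FALSE)
+sol$solution                            # optimal mix proportions
+sol$optimum                             # minimum cost per pound of the mix
+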

+",2013-10-13 13:26:01.097 +57388,22653.0,1,,,,Meta Analysis: Pooling samples or determine an average effect size,,CC BY-SA 3.0,"

I am new to meta analysis and how I understood the terminology is that there are actually two ways of performing a meta analysis. Let's consider 5 clinical studies with fixed effects. Fixed effects in terms of the same medical treatment as well as demographic details of the participants. One way of analysing these data would be to pool all 5 studies together to obtain a very large study to increase the power to detect the effect of the medical treatment. The other would be to try to detect the effect in each analysis separately and then determine the average effect across the studies. As I understood meta analysis, both seem to be reasonable techniques. However, can anyone tell me pro's and con's for both techniques? When should I use which method? I would assume the results to be pretty similar anyhow or is that wrong to assume?

+",2013-10-13 13:49:31.680 +57389,166.0,2,,57378.0,,,,CC BY-SA 4.0,"

Your interpretation is almost right.

+ +

A right interpretation should contain the following information:

+ +
    +
  1. There are two approaches to interpreting p-values:

    + +
      +
    • The Frequentist interpretation, which your answer correctly used: The p-value is the probability of observing a value (in your case, the association between y-intercept and response) as extreme or more ('extreme' implies a two-tailed test), if the null hypothesis is true (in your case that is, the association between y-intercept and response is truly absent in the population, i.e. y-intercept = 0. In some tests it can mean the difference is 0);

    • +
    • or, the probability of obtaining that estimate of the parameter (e.g. intercept; using this statistical approach), or a more extreme value, if the population value for that parameter is 0. Your definition correctly uses the frequentist form.

    • +
  2. +
  3. As you can see from point 1, you do not need to assume the other coefficients are correct when interpreting p-values in a regression model... just that the same approach was used. However, it does assume that those parameters are estimated. So, your definition errs in saying that the 'first coefficient is 0.514'. All you need to assert is that the first coefficient is being estimated, i.e. '...the true value of the y-intercept is zero, in the presence of x.'. The values of other coefficients are immaterial to the definition of the p-value of any coefficient.
  4. +
  5. The y-intercept refers to the value of y when all xs are zero. You correctly implied this point.
  6. +
+ +

You should also note that your example, in using the frequentist approach, is not free from your wants and subjective beliefs. Specifically, the p-value is tied to the design of the experiment you ran. You acknowledged this when you mention using the same number of sampling pairs.

+ +

With regard to your second question, the typical p-value reported for a regression equation is implicitly two-tailed, so it refers to the absolute value of the parameters obtained. You didn't provide the Excel function you used to calculate the p-value, but I'd check there to see whether Excel is calculating one-tailed (in the same direction) or two-tailed (extreme or more extreme) p-values.
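
+ +

As a cross-check outside Excel, a small R sketch on simulated data (not your actual numbers) shows the same kind of output: the intercept row of summary(lm(...)) reports an estimate, a standard error, a t value and a two-sided p-value.

+ +
set.seed(1)
+x <- runif(90)
+y <- 0.514 * x + 0.001 + rnorm(90, sd = 0.01)
+summary(lm(y ~ x))$coefficients   # ""(Intercept)"" row: Estimate, Std. Error, t value, two-sided Pr(>|t|)
+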

+",2013-10-13 13:57:06.653 +57391,2420.0,1,,,,How to find the input layer and the architecture for a Neural Network,,CC BY-SA 3.0,"

I'm a software developer and I'd like to learn about neural networks. At this point I've found a problem which I'd like to solve at some point. It is about electrical load forecasting. I'm looking for similar problems, and it would be great if I could find some similar examples with solutions. At this point I'm having trouble finding the right model for the RNN, and more exactly I'm struggling with the input layer. As the output I need the forecast values for each hour.

+ +

Any references to books, links, resources or advice are welcome and very much appreciated.

+ +

This is the problem that I'd like to solve:

+ +

A very small factory uses a number of machines to produce bread. Some of them are electrical, which means that they consume electrical power. Knowing which machines will run on the next day, an electricity consumption forecast can be computed.

+ +

The equipment named E.V. is a special case. The human operator fills in its values empirically in order to get a good forecast for the next day. Those values can be positive or negative. The 'on' entries in the table below represent the status of the machines.

+ +
+----------------------------------------------------+  
+|equipment name|power| 1h| 2h| 3h| 4h| 5h| 6h| 7h| 8h|  
++----------------------------------------------------+  
+|Equipment 1   |  2MW| - | - | on| on| on| - | - | - |
+|Equipment 2   |  5MW| - | - | - | on| on| on| - | - |
+|Equipment 3   |  1MW| on| on| on| on| on| on| on| on|
+|E.V.          |     | .1|-.1|-.1| .1|-.2| .1| .1|-.1|
++--------------+-------------------------------------+
+|total/(forecast)    |1.1|0.9|2.9|8.1|7.8|6.1|1.1|0.9|
++--------------+-------------------------------------+
+|real consumption    |0.9|0.9|2.7|8.2|7.9|3.1|0.8|0.7|  
++--------------+-------------------------------------+
+
+ +

The problem is that the machines are not running at their maximal power, so it would be great if a more exact forecast could be built.

+ +

I have data for every day going back 2 years. Also, do you think that the date or the weekday is a good candidate for the input layer?

+ +

I'm not very good at understanding an answer that takes a math-only approach. Any example more closely tied to my problem would be very much appreciated.

+",2013-10-13 15:03:39.740 +57392,5203.0,2,,57382.0,,,,CC BY-SA 3.0,"

Yes, assuming by ""gross sampling error"" you mean mean-squared error or the $\epsilon$ term in a model like $Y=AX + \epsilon$

+ +

The error component of a model includes all sources of variability that are not explicitly included in the model. This includes sampling errors (uncertainty due to measuring only a subset of the population), measurement errors (uncertainty due to imprecisions in each measurement), and other things, like error attributable to a misspecified model (e.g., missing predictors/interactions).

+ +

Keep in mind that these are actually types of errors. For example, there may be measurement error associated with each variable in the model, and that error might be a combination of systematic error (essentially, a bias; e.g., someone forgot that the scale reports the weight of the container + its contents) and random error. Given that, there isn't an automatic, all-purpose way of identifying the various error contributions.

+ +

One way to examine measurement errors is through calibration. For example, you could put a weight on the scale and compare the scale's reading to the known mass of the weight. In many cases, the phenomena causing measurement error are reasonably well understood and have a specific structure (e.g., shot noise), which allows them to be incorporated into the model. Some large-scale physics experiments take this to incredible extremes to compare an apparatus's expected performance to the real data. Surveys are sometimes benchmarked by comparing data collected during the survey to larger data sets. For example, you might ask participants for demographic information (e.g., age, gender, income). These values are then compared to known population values (e.g., from a census or tax records), which might tell you how representative your respondents are of the general population.

+ +

Sampling error is much harder to measure directly. You might expect sampling error to shrink as the number of samples approaches the size of the population, whereas a systematic measurement error would remain approximately the same, regardless of sample size.

+",2013-10-13 15:15:38.567 +57393,14850.0,2,,57379.0,,,,CC BY-SA 3.0,"

It appears that you're using nls.

+ +

By typing

+ +
?summary.nls
+
+ +

you can read about the output.

+ +

Estimates and standard errors are estimated by the Gauss-Newton algorithm (if the nls defaults are used)

+ +

The p-values are the results of a two-sided test of whether the parameters are zero or not.

+ +

You can check the exact calculations used to create the output shown by typing:

+ +
stats:::summary.nls
+
+",2013-10-13 16:17:54.617 +57394,16046.0,1,,,,Causality in Time Series,,CC BY-SA 3.0,"

I am reading an article which is trying to justify the need for causal inference in their inferential framework. The thought experiment is as follows:

+ +
+

Suppose a statistician is asked to design a model for a simple time series $X_1,X_2,X_3,\dots$ and she decides to use a Bayesian method. Assume she collects a first observation $X_1 = x_1$. She computes the posterior probability density function (pdf) over the parameters $\theta$ of the model given the data using Bayes' rule: $$p(\theta|X_1 = x_1) = \frac{p(X_1 = x_1|\theta)p(\theta)}{\int p(X_1 = x_1|\theta')p(\theta')\,d\theta'},$$

+ +

where $p(X_1 = x_1|\theta)$ is the likelihood of $x_1$ given $\theta$ and $p(\theta)$ is the prior pdf of $\theta$. She can use the model to predict the next observation by drawing a sample $x_2$ from the predictive pdf: $$p(X_2 = x_2|X_1 = x_1) = \int p(X_2 = x_2|X_1 = x_1,\theta)p(\theta|X_1 = x_1)d\theta,$$

+ +

where $p(X_2 = x_2|X_1 = x_1,\theta)$ is the likelihood of $x_2$ given $x_1$ and $\theta$. Note that $x_2$ is not drawn from $p(X_2 = x_2|X_1 = x_1, \theta)$. She understands that the nature of $x_2$ is very different from $x_1$: while $x_1$ is informative and does change the belief state of the Bayesian model, $x_2$ is non-informative and thus is a reflection of the model's belief state. Hence, she would never use $x_2$ to further condition the Bayesian model. Mathematically, she seems to imply that: $$p(\theta|X_1 =x_1,X_2 =x_2)=p(\theta|X_1 =x_1)$$

+
+ +

However, what I believe this poor statistician should really imply is: $$p(\theta|X_1 =x_1,\text{do}(X_2 =x_2))=p(\theta|X_1 =x_1)$$ where ""do"" (or ""set"") here comes from Pearl's framework of causality, which can be found here and here. Now, am I right about this?

+",2013-10-13 16:53:32.577 +57395,668.0,2,,57373.0,,,,CC BY-SA 3.0,"

Such a distribution does not exist.

+ +

To see why not, let $0 \lt t \lt 1/2$ and notice that $X_2\gt 1-t$ entails $X_1\le t$ and $X_3\gt 1-t$ also implies $X_1\le t$, for otherwise in either situation the sum of all the $X_i$ would exceed $1.$ The latter two events are disjoint, because we cannot simultaneously have $X_2\gt 1-t \gt 1/2$ and $X_3\gt 1-t\gt 1/2.$ Consequently the chance that $X_1\le t$ is no less than the sum of the chances that $X_2\ge 1-t$ and $X_3\ge 1-t$, each of which equals $t$ by the uniform distribution assumptions. This shows that $t \ge t+t,$ which for $t\gt 0$ obviously is false.

+ +

This contradiction forces us to give up at least one of the assumptions: if indeed $X_1+X_2+X_3\le 1$, then the only other assumptions used in this argument are that each $X_i$ has a Uniform$[0,1]$ distribution. Therefore at least one of the $X_i$ cannot have a Uniform$[0,1]$ distribution, QED.
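
+ +

A purely numerical companion to the argument (an illustration, not a proof): sampling the constrained region uniformly by rejection gives marginals that are visibly non-uniform, consistent with the impossibility shown above.

+ +
set.seed(1)
+n <- 1e5
+u <- matrix(runif(3 * n), ncol = 3)          # candidate (X1, X2, X3) in the unit cube
+keep <- u[rowSums(u) <= 1, ]                 # keep points with X1 + X2 + X3 <= 1
+hist(keep[, 1], freq = FALSE, breaks = 50)   # clearly not Uniform(0,1)
+mean(rowSums(u) <= 1)                        # about 1/6, the volume of the region
+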

+",2013-10-13 17:06:36.577 +57396,22656.0,1,57448.0,,,Minimum Sample Size Required to Estimate the Probability $P(X \le c)$ for a Constant $c$ (Given a Confidence Level & Confidence Interval),,CC BY-SA 3.0,"

I have a large population of size $n$ from an unknown continuous random variable $X$, and I do not know the underlying distribution of $X$. Given a constant number $c$, I want to determine the minimum sample size I need to estimate the probability $P(X \le c)$ given a confidence level, $p_c$, and confidence interval, $I_c$ (I am not sure if we need them! ). How can I find the minimum sample size to estimate this probability?

+ +

I have found the following discussion on Wikipedia, which is independent of the population size. I am not sure whether it is a good way to determine the sample size!

+ +

I have also found some methods to determine the sample size for data to be analyzed by nonparametric tests. With these you don't have to make any assumption about the distribution of the values; that is why they are called nonparametric. Now I am confused about whether these nonparametric methods can be used to solve my problem, whether the method I found on Wikipedia is the correct way to solve it, or whether there exists a better solution.

+ +

Thanks for your help.

+",2013-10-13 17:09:46.403 +57397,20473.0,2,,57380.0,,,,CC BY-SA 3.0,"

I assume that you are using the OLS estimator on this linear regression model. You can use the inequality constrained least-squares estimator, which will be the solution to a minimization problem under inequality constraints. Using standard matrix notation (vectors are column vectors) the minimization problem is stated as

+ +

$$\min_{\beta} (\mathbf y-\mathbf X\beta)'(\mathbf y-\mathbf X\beta) \\s.t.-\mathbf Z\beta \le \mathbf 0 $$

+ +

...where $\mathbf y$ is $n \times 1$ , $\mathbf X$ is $n\times k$, $\beta$ is $k\times 1$ and $\mathbf Z$ is the $m \times k$ matrix containing the out-of-sample regressor series of length $m$ that are used for prediction. We have $m$ linear inequality constraints (and the objective function is convex, so the first order conditions are sufficient for a minimum).

+ +

The Lagrangean of this problem is

+ +

$$L = (\mathbf y-\mathbf X\beta)'(\mathbf y-\mathbf X\beta) -\lambda'\mathbf Z\beta = \mathbf y'\mathbf y-\mathbf y'\mathbf X\beta - \beta'\mathbf X'\mathbf y+ \beta'\mathbf X'\mathbf X\beta-\lambda'\mathbf Z\beta$$

+ +

$$= \mathbf y'\mathbf y - 2\beta'\mathbf X'\mathbf y+ \beta'\mathbf X'\mathbf X\beta-\lambda'\mathbf Z\beta $$

+ +

where $\lambda$ is an $m \times 1$ column vector of non-negative Karush-Kuhn-Tucker multipliers. The first order conditions are (you may want to review the rules for matrix and vector differentiation)

+ +

$$\frac {\partial L}{\partial \beta}= \mathbf 0\Rightarrow - 2\mathbf X'\mathbf y +2\mathbf X'\mathbf X\beta - \mathbf Z'\lambda = \mathbf 0 $$

+ +

$$\Rightarrow \hat \beta_R = \left(\mathbf X'\mathbf X\right)^{-1}\mathbf X'\mathbf y + \frac 12\left(\mathbf X'\mathbf X\right)^{-1}\mathbf Z'\lambda = \hat \beta_{OLS}+ \left(\mathbf X'\mathbf X\right)^{-1}\mathbf Z'\xi \qquad [1]$$

+ +

...where $\xi = \frac 12 \lambda$, for convenience, and $\hat \beta_{OLS}$ is the estimator we would obtain from ordinary least squares estimation.

+ +

The method is fully elaborated in Liew (1976).
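
+ +

Numerically, the same inequality-constrained least-squares problem can be handed to an off-the-shelf quadratic programming routine; the sketch below uses the quadprog package on simulated data (all names and sizes are arbitrary), rather than the closed form in $[1]$.

+ +
library(quadprog)
+set.seed(1)
+n <- 100; k <- 3
+X <- cbind(1, matrix(rnorm(n * (k - 1)), n))   # simulated design: intercept + 2 regressors
+y <- drop(X %*% c(0.2, 1, -1) + rnorm(n))
+Z <- cbind(1, matrix(rnorm(5 * (k - 1)), 5))   # out-of-sample rows whose predictions must be >= 0
+fit <- solve.QP(Dmat = crossprod(X), dvec = drop(crossprod(X, y)),
+                Amat = t(Z), bvec = rep(0, nrow(Z)))
+fit$solution               # inequality-constrained coefficient estimates
+drop(Z %*% fit$solution)   # the constrained predictions are all non-negative
+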

+",2013-10-13 17:17:58.327 +57398,22658.0,2,,49906.0,,,,CC BY-SA 3.0,"

There are 2 competing statistical models. Model #1 (null hypothesis, McNemar): probability correct to incorrect = probability of incorrect to correct = 0.5 or equivalent b=c. Model #2: probability correct to incorrect < probability of incorrect to correct or equivalent b > c. For model #2 we use maximum likelihood method and logistic regression to determine model parameters representing model 2. Statistical methods look different because each method reflects a different model.

+",2013-10-13 18:26:48.740 +57399,6813.0,1,,,,Modelling probabilties within friend sets,,CC BY-SA 3.0,"

Recently, I was wondering about calculating the probability of a given individual in a given population ""knowing"" (let's say, present in individual's friend set) at least one person with a given trait A and at least one person with another given trait, B; where it is possible that any number of people in the given individual's friend set can possess both traits.

+ +

For example, using genetic traits, in a given population, how could one calculate the probability that a given individual in a given population ""knows"" at least one person with grey eye colour and at least one person who is greater than 200cm tall; where, naturally, it is possible that any number of people in the friend set can possess grey eye colour and be greater than 200cm tall.

+ +

I have developed a sort of model, but it may not be correctly specified; it is as follows:

+ +

Assumptions and Qualifications:

+ +
    +
  • First of all, for simplicity, let's assume that we define ""knowing"" as mutually connected friends on an online social network.
  • +
  • Secondly, the frequencies of genetic traits are (a) going to be determined by ethnicity of a given population as well as environmental factors (nutrition, healthcare) and (b) unlikely to be independently distributed across a given individual's friend set (e.g. family members will have greater genetic similarity); however, for this problem, let's adopt a simple model where both of the above conditions are violated.
  • +
  • Thirdly, assume that the individual's friend set provides a microcosmic representation of society; this facilitates a frequentist generation of probabilities from the instance rate in the population.
  • +
  • Finally, instances of genetic traits in the population are fabricated, but are used to generate probabilities for the examples.
  • +
+ +

Model Formulation:

+ +

I have reasoned that a binomial random distribution can be applied to the probability of each genetic trait, where a ""success"" is defined as an individual in the friend set possessing that genetic trait.

+ +

Thus for trait A, we have:

+ +

$$P(A=k) = \binom{N}{k}p_{a}^{k}(1-p_{a})^{N-k}$$

+ +

and, for trait B:

+ +

$$P(B=k) = \binom{N}{k}p_{b}^{k}(1-p_{b})^{N-k}$$

+ +

where $N$ is number of friends in the friend set, $k$ is number of people containing the genetic trait and $p_{x}$ is the probability associated with possessing genetic trait $X$.

+ +

Because the scenario is concerned with the probability of at least one person possessing trait A and at least one person possessing trait B, it is easier to find the complement of no people in a friend set containing the trait; for both traits:

+ +

$$P(A >= 1) = 1 - P(A = 0)$$

+ +

and

+ +

$$P(B >= 1) = 1 - P(B = 0)$$

+ +

Furthermore, we know that probability of an intersection of events is given by:

+ +

$$\mathbb{P}(A \cap B) = \mathbb{P}(A|B)\mathbb{P}(B) = \mathbb{P}(B|A)\mathbb{P}(A)$$

+ +

However, because we are assuming independence between genetic traits, $\mathbb{P}(A)$ and $\mathbb{P}(B)$ are independent, thus:

+ +

$$\mathbb{P}(A \cap B) = \mathbb{P}(A)\mathbb{P}(B)$$

+ +

Combining the information above, we get the following model for the above scenario:

+ +

$$\mathbb{P}(A >=1, B >=1) = \left(1-\binom{N}{k}p_{a}^{k}(1-p_{a})^{N-k}\right)\left(1-\binom{N}{k}p_{b}^{k}(1-p_{b})^{N-k}\right)$$

+ +

So, for the original example above, assuming a friend set of size $N = 300$, the instance of trait A in the population is $\frac{1}{800}$ and the instance of trait B in the population is $\frac{1}{5000}$; according to the model, we get the final probability:

+ +

$$\mathbb{P}(A >=1, B >=1)$$

+ +

$$ = \left(1-\binom{300}{0}\left(\frac{1}{800}\right)^{0}\left(\frac{799}{800}\right)^{300}\right)\left(1-\binom{300}{0}\left(\frac{1}{5000}\right)^{0}\left(\frac{4999}{5000}\right)^{300}\right)$$

+ +

$$\approx 0.018 = 1.8\%$$

+ +

Does this model seem reasonable given the assumptions?

+ +

Assuming this model is not correctly specified, maybe somebody could provide a more accurate representation.
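
+ +

As a quick internal-consistency check of the closed-form number (under exactly the same independence assumptions, so it cannot validate the assumptions themselves), a small Monte Carlo simulation in R agrees with the 1.8% figure:

+ +
set.seed(1)
+N <- 300; pa <- 1/800; pb <- 1/5000; reps <- 1e5
+A <- rbinom(reps, N, pa)                 # friends with trait A
+B <- rbinom(reps, N, pb)                 # friends with trait B
+mean(A >= 1 & B >= 1)                    # Monte Carlo estimate
+(1 - (1 - pa)^N) * (1 - (1 - pb)^N)      # closed form, about 0.018
+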

+",2013-10-13 18:37:21.500 +57400,7155.0,2,,57391.0,,,,CC BY-SA 3.0,"

Define neural network to be $f$, time-series to be $x$, lag order to be $n$ and forecast horizon to be $h$.

+ +

$ f(x_{t-1}, x_{t-2},..,x_{t-n}) = [x_t, x_{t+1},..,x_{t+h}]$

+ +

Assume you have the following time series,

+ +
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+ +

You define $n=2$, $h=1$.

+ +

Your inputs for that time series then form a circulant-like (lagged) matrix.

+ +

x =

+ +
[[ 1, 0],
+ [ 2, 1],
+ [ 3, 2],
+ [ 4, 3],
+ [ 5, 4],
+ [ 6, 5],
+ [ 7, 6],
+ [ 8, 7]]
+
+ +

Your outputs are

+ +

y =

+ +
[2, 3, 4, 5, 6, 7, 8, 9]
+
+ +

So the length of your input layer is given by $n$ and the length of your output layer is given by $h$, where your first input neuron is $x_{t-1}$ and your last input is $x_{t-n}$. The same goes for the forecast horizon.
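
+ +

If it helps, the lagged input matrix above can be built in a couple of lines of R with embed(); this is just the data-preparation step, and the network itself is fit separately.

+ +
x <- 0:9
+n <- 2                 # lag order
+E <- embed(x, n + 1)   # rows are (x_t, x_{t-1}, x_{t-2})
+X <- E[, -1]           # inputs, identical to the matrix above
+y <- E[, 1]            # targets
+X
+y
+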

+ +

Instead of having multiple outputs for the forecast horizon, you can use a forecast horizon of 1 and then recurse on the predictions to obtain any forecast horizon you want.

+ +

For classic parametric stationary time series models the limit of the recursive behaviour of the system is well-studied.

+ +

Your problem is a little more involved though. You have inputs and outputs of the system and you want the predict outputs to follow some reference trajectory.

+ +

One solution is to use Narma-L2, which approximates the system by linear feedback using two neural networks. Define control inputs to be $c$ and production outputs to be $p$. Define reference production outputs to be $r$

+ +

You train two neural networks of the forms $g(c_{t-1}, .., c_{t-n}, p_{t-1},..,p_{t-n}) = c_{t}$ and $k(c_{t-1}, .., c_{t-n}, p_{t-1},..,p_{t-n}) = p_{t}$.

+ +

The prediction for control inputs is then $c_t = \frac{r - k(c_{t-1}, .., c_{t-n}, p_{t-1},..,p_{t-n})}{g(c_{t-1}, .., c_{t-n}, p_{t-1},..,p_{t-n})}$

+ +

Also, neural networks are a PITA. There are plenty of good nonparametric regression models that are easier to train, like Gaussian process regression for instance.

+ +

See: Neural Network NARMA Control of a Gyroscopic Inverted Pendulum

+",2013-10-13 18:42:03.260 +57401,15377.0,1,57407.0,,,A question on continuous random variable,,CC BY-SA 3.0,"

Let's say I have 2 continuous random variables, $X_1$ and $X_2$. Both have the same location parameter; the other parameters may or may not be the same.

+ +

Now say the $q_1$-th quantile of $X_1$ is less than the $q_1$-th quantile of $X_2$, but the $q_2$-th quantile of $X_1$ is greater than the $q_2$-th quantile of $X_2$.

+ +

My question is: is that possible? Is there any example of $X_1$ and $X_2$ which have that property?

+ +

I will be really grateful if someone can give me some pointer.

+ +
+ +

Edit: At this point, I realize the question I asked was not correctly specified.

+ +

I'm particularly interested in the case where the two quantiles being considered are on the same side of the location parameter.

+",2013-10-13 19:40:09.530 +57402,7229.0,2,,57195.0,,,,CC BY-SA 3.0,"

I can't see clearly behind the lines, but it seems to me that there are too many data points.

+ +

Since you want to show regional homogeneity rather than the individual stations, I'd suggest first grouping them spatially. For example, overlay a ""fishnet"" grid and compute the average measured value in every cell (at every time step). If you place these average values at the cell centers, you rasterize the data this way (or you can also compute the mean latitude and longitude in every cell if you don't want overlapping lines). Or average inside administrative units, whatever suits. Then, for these new averaged ""stations"", you can calculate correlations and plot a map with a smaller number of lines.

+ +

+ +

This can also remove those random single high-correlation lines going across the whole area.

+",2013-10-13 20:29:43.190 +57403,20062.0,1,58506.0,,,Why is Mantel's test preferred over Moran's I?,,CC BY-SA 3.0,"

Mantel's test is widely used in biological studies to examine the correlation between the spatial distribution of animals (their position in space) and, for example, their genetic relatedness, rate of aggression or some other attribute. Plenty of good journals use it (PNAS, Animal Behaviour, Molecular Ecology...).

+ +

I fabricated some patterns which may occur in nature, but Mantel's test seems to be quite useless for detecting them. On the other hand, Moran's I had better results (see the p-values under each plot).

+ +
+

Why don't scientists use Moran's I instead? Is there some hidden reason I do not see? And if there is some reason, how can I know (how the hypotheses must be constructed differently) to appropriately use Mantel's or Moran's I test? A real-life example will be helpful.

+
+ +

Imagine this situation: there is an orchard (17 x 17 trees) with a crow sitting on each tree. The level of ""noise"" for each crow is available, and you want to know whether the spatial distribution of the crows is determined by the noise they make.

+ +

There are (at least) 5 possibilities:

+ +
    +
  1. ""Birds of a feather flock together."" The more similar crows are, the smaller the geographical distance between them (single cluster).

  2. +
  3. ""Birds of a feather flock together."" Again, the more similar crows are, the smaller the geographical distance between them, (multiple clusters) but one cluster of noisy crows has no knowledge about the existence of second cluster (otherwise they would fuse into one big cluster).

  4. +
  5. ""Monotonic trend.""

  6. +
  7. ""Opposites attract."" Similar crows cannot stand each other.

  8. +
  9. ""Random pattern."" The level of noise has no significant effect on spatial distribution.

  10. +
+ +

For each case, I created a plot of points and used the Mantel test to compute a correlation (it is no surprise that its results are non-significant; I would never try to find a linear association among such patterns of points).

+ +

+ +
+ +

Example data: (compressed as possible)

+ +
r.gen   <- seq(-100,100,5)
+r.val   <- sample(r.gen, 289, replace=TRUE)
+z10     <- rep(0, times=10)
+z11     <- rep(0, times=11)
+r5      <- c(5,15,25,15,5)
+r71     <- c(5,20,40,50,40,20,5)
+r72     <- c(15,40,60,75,60,40,15)
+r73     <- c(25,50,75,100,75,50,25)
+rbPal   <- colorRampPalette(c(""blue"",""red""))
+my.data <- data.frame(x = rep(1:17, times=17),y = rep(1:17, each=17),
+             c1=c(rep(0,times=155),r5,z11,r71,z10,r72,z10,r73,z10,r72,z10,r71,
+             z11,r5,rep(0, times=27)),c2 = c(rep(0,times=19),r5,z11,r71,z10,r72,
+             z10,r73,z10,r72,z10,r71,z11,r5,rep(0, times=29),r5,z11,r71,z10,r72,
+             z10,r73,z10,r72,z10,r71,z11,r5,rep(0, times=27)),c3 = c(seq(20,100,5),
+             seq(15,95,5),seq(10,90,5),seq(5,85,5),seq(0,80,5),seq(-5,75,5),
+             seq(-10,70,5),seq(-15,65,5),seq(-20,60,5),seq(-25,55,5),seq(-30,50,5),
+             seq(-35,45,5),seq(-40,40,5),seq(-45,35,5),seq(-50,30,5),seq(-55,25,5),
+             seq(-60,20,5)),c4 = rep(c(0,100), length=289),c5 = sample(r.gen, 289, 
+             replace=TRUE))
+
+# adding colors
+my.data$Col1 <- rbPal(10)[as.numeric(cut(my.data$c1,breaks = 10))]
+my.data$Col2 <- rbPal(10)[as.numeric(cut(my.data$c2,breaks = 10))]
+my.data$Col3 <- rbPal(10)[as.numeric(cut(my.data$c3,breaks = 10))]
+my.data$Col4 <- rbPal(10)[as.numeric(cut(my.data$c4,breaks = 10))]
+my.data$Col5 <- rbPal(10)[as.numeric(cut(my.data$c5,breaks = 10))]
+
+ +

Creating the matrix of geographical distances (for Moran's I it is inverted):

+ +
point.dists           <- dist(cbind(my.data$x, my.data$y))
+point.dists.inv       <- 1/point.dists
+point.dists.inv       <- as.matrix(point.dists.inv)
+diag(point.dists.inv) <- 0
+
+ +

Plot creation:

+ +
X11(width=12, height=6)
+par(mfrow=c(2,5))
+par(mar=c(1,1,1,1))
+
+library(ape)
+for (i in 3:7) {
+  my.res <- mantel.test(as.matrix(dist(my.data[ ,i])), as.matrix(point.dists))
+  plot(my.data$x,my.data$y,pch=20,col=my.data[ ,c(i+5)], cex=2.5, xlab="""", 
+       ylab="""", xaxt=""n"", yaxt=""n"", ylim=c(-4.5,17))
+  text(4.5, -2.25, paste(""Mantel's test"", ""\n z.stat ="", round(my.res$z.stat, 
+   2), ""\n p.value ="", round(my.res$p, 3)))
+
+  my.res <- Moran.I(my.data[ ,i], point.dists.inv)
+  text(12.5, -2.25, paste(""Moran's I"", ""\n observed ="", round(my.res$observed, 
+   3), ""\n expected ="",round(my.res$expected,3), ""\n std.dev ="", 
+       round(my.res$sd,3), ""\n p.value ="", round(my.res$p.value, 3)))
+}
+
+par(mar=c(5,4,4,2)+0.1)
+
+for (i in 3:7) {
+  plot(dist(my.data[ ,i]), point.dists,pch = 20, xlab=""geographical distance"", 
+       ylab=""behavioural distance"")
+}
+
+ +

P.S. in the examples on UCLA's statistics help website, both tests are used on the exact same data and the exact same hypothesis, which is not very helpful (cf., Mantel test, Moran's I).

+ +

Response to I.M. You wrote:

+ +
+

...it [Mantel] tests whether quiet crows are located near other quiet crows, while noisy crows have noisy neighbors.

+
+ +

I think that such a hypothesis could NOT be tested by the Mantel test. On both plots the hypothesis is valid. But if you suppose that one cluster of non-noisy crows may have no knowledge of the existence of a second cluster of non-noisy crows, the Mantel test is again useless. Such separation should be very probable in nature (mainly when you are collecting data on a larger scale).

+ +

+",2013-10-13 20:35:21.350 +57404,19547.0,1,,,,Hypothesis Testing applied to real life,,CC BY-SA 3.0,"

I was wondering whether it is possible to apply the method of hypothesis testing to real life; for example, whether someone can use it for decision making. I have always used this method for homework problems, but maybe we can use it as an aid in decision making, so that we could somehow know, for example, the probability of wrongly rejecting an alternative decision. What do you think about that? It would also be nice if someone could give an example, if they think this can be done.

+",2013-10-13 20:50:28.997 +57405,19750.0,1,57408.0,,,Bayesian variable selection,,CC BY-SA 3.0,"

Chapter 13 of Kevin Murphy's book Machine Learning: A Probabilistic Perspective discusses Sparse Linear Models. After a short introduction on the benefits of sparse models, he introduces the following problem:

+ +

+ +

How does he derive equation 13.1 above? That is, why does it take that form, and what is $f$ supposed to represent here?

+",2013-10-13 21:00:08.203 +57413,22666.0,1,,,,Time persistence in panel data,,CC BY-SA 3.0,"

I am estimating a dynamic model with quarterly panel data in Stata, and my sample contains 16 nations from 2000 to 2010. Is there an approximate number of observations at which the panel data can be considered a time-persistent process?

+",2013-10-14 01:03:58.377 +57414,22667.0,1,,,,"I have a discrete distribution and want to know to what extent other samples differ from it, what is the right test?",,CC BY-SA 3.0,"

This is kind of a basic stats question, but I want to make sure I am doing this right.

+ +

I have a distribution of objects. Specifically: array([ 6072., 112673., 126874., 44366., 5384., 14697., 20323., 68197., 98024., 39483., 103990., 18556., 32930., 23551., 6897.])

+ +

I then have a lot of samples like [1,4,0,0,0,0...] (same length) and I'd like to know how far each sample is from the distribution above. Correlation doesn't really do it: [32,0,0,0,..] should be further away than [4,0,0,0...].

+",2013-10-14 01:17:13.023 +57406,22507.0,2,,20234.0,,,,CC BY-SA 3.0,"

Machine learning often deals with optimization of a function which has many local minima. Feedforward neural networks with hidden units are a good example. Whether these functions are discrete or continuous, there is no method which achieves a global minimum and stops. It is easy to prove that there is no general algorithm to find a global minimum of a continuous function, even if it is one-dimensional and smooth (has infinitely many derivatives). In practice, all algorithms for learning neural networks get stuck in a local minimum. It is easy to check this: create a random neural network, make a big set of its responses to random inputs, then try to learn another neural network with the same architecture to copy the responses. While the perfect solution exists, neither backpropagation nor any other learning algorithm will be able to discover it, starting from a random set of weights.
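As a rough illustration of that check (a sketch only: the nnet package plays the student network, the 'teacher' is a hand-rolled random one-hidden-layer net, and the sizes and iteration counts are arbitrary):

library(nnet)
set.seed(1)
n <- 500; d <- 5; h <- 10
X  <- matrix(runif(n * d, -1, 1), n, d)
W1 <- matrix(rnorm(d * h), d, h); W2 <- rnorm(h)
y  <- tanh(X %*% W1) %*% W2                  # responses of the random 'teacher' net
fit <- nnet(X, y, size = h, linout = TRUE, maxit = 2000, trace = FALSE)
mean((predict(fit, X) - y)^2)                # training MSE: typically stays above the
                                             # exact zero-error solution that exists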

+ +

Some learning methods, like simulated annealing or genetic algorithms, explore many local minima. For continuous functions there are methods like gradient descent, which find the closest local minimum. They are much faster, and that's why they are widely used in practice. Given enough time, the former group of methods outperforms the latter in terms of training set error. But with reasonable time constraints, for real world problems, the latter group is usually better.

+ +

For some models, like logistic regression, there is one local minimum, the function is convex, the minimization converges to the minimum, but the models themselves are simplistic.

+ +

That's the bitter truth.

+ +

Note also that proof of convergence and proof of convergence to the best solution are two different things. K-means algorithm is an example of this.

+ +

Finally, for some models we don't know how to learn at all. For example, if the output is an arbitrary computable function of the inputs, we don't know good algorithms which, in reasonable time, find a Turing machine (or equivalent) implementing this function. For instance, if f(1)=2, f(2)=3, f(3)=5, f(4)=7, ..., f(10)=29 (the first ten primes), we don't know any learning algorithm which would be able to predict, in reasonable time, that f(11)=31, unless it already knows the concept of prime numbers.

+",2013-10-13 21:24:53.867 +57407,594.0,2,,57401.0,,,,CC BY-SA 3.0,"

Since you appear to doubt the example offered, I have included a diagram. As Michael Mayer said, two normal distributions centered around 0, one with larger variance, are sufficient.

+ +

In the diagram, we compare the 0.1 and the 0.9 quantiles for $\sigma=1$ (blue) and $\sigma=0.8$ (dark orange)

+ +

+ +

Michael Mayer's example fulfills the requirements of your question with $q_1=0.1$, $q_2=0.9$ and $X_1$ being the one with larger variance.
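For reference, the quantiles behind the diagram can be reproduced in base R:

qnorm(c(0.1, 0.9), mean = 0, sd = 1)     # -1.28  1.28
qnorm(c(0.1, 0.9), mean = 0, sd = 0.8)   # -1.03  1.03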

+ +
+ +

Edit:

+ +

For the case where $q_1$ and $q_2$ must both be on the same side of whatever the measure of location is, let's take two symmetric distributions, which share the same mean and median.

+ +

Let $X_1$ be $\sim \text{N}(0,1^2)$ and let $X_2$ be an equal mixture of a $\text{N}(-0.8,0.1^2)$ and a $\text{N}(0.8,0.1^2)$, and let $q_1 = 0.6$ and $q_2 = 0.9$:

+ +

+ +

This example fulfills the new requirements of your question with $q_1=0.6$, $q_2=0.9$ and $X_1$ being the one with only a single normal component (shown in blue above).
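Again, a quick numerical check of the two pairs of quantiles in base R, using the mixture CDF defined above:

qnorm(c(0.6, 0.9))                                          # 0.25  1.28
Fmix <- function(x) 0.5 * pnorm(x, -0.8, 0.1) + 0.5 * pnorm(x, 0.8, 0.1)
sapply(c(0.6, 0.9), function(p)
  uniroot(function(x) Fmix(x) - p, c(-2, 2))$root)          # 0.72  0.88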

+ +

Further, you should note that 'location parameter' isn't sufficiently specified. I could parameterize normal distributions by their 5th percentile and their standard deviation, and call the parameter based on the 5th percentile the location parameter (it's just a shift of the mean by $1.645\sigma$, and can work as a perfectly valid location parameter). Then Michael's original example suffices even under the new conditions. If that contradicts your intention, your intention needs to be stated specifically enough to exclude it.

+",2013-10-13 22:09:36.687 +57408,7155.0,2,,57405.0,,,,CC BY-SA 3.0,"

The divisor is just a normalizing constant, so we can ignore it for the moment. If we plug in $f(\gamma)$ it simplifies to $p(D|\gamma)p(\gamma)$, which by Bayes' rule is equal to $p(\gamma|D)p(D)$. Now since $p(D)$ isn't a function of $\gamma$ it falls into the normalizing constant. Thus it simplifies to $p(\gamma|D)$.

+ +

This expansion seems pointless until you realize that the $\gamma$ at which $p(\gamma|D)$ is at its maximum is the same $\gamma$ at which $f(\gamma)$ is at its minimum. So we can study $f(\gamma)$ by itself.

+",2013-10-13 22:17:17.043 +57409,10060.0,2,,57230.0,,,,CC BY-SA 3.0,"

If the domains all have a base of 100, a stacked bar chart may be suitable.

+ +

+",2013-10-13 22:36:21.843 +57410,22662.0,1,,,,Joint entropy of two random variables,,CC BY-SA 3.0,"

The joint entropy is the amount of information we get when we observe X and Y at the same time, but what would happen if we don't observe them at the same time?

+ +

For example, when I toss a coin, if I get tails I will only observe the variable X, but if I get heads I will only observe the variable Y. How could I find the entropy?

+",2013-10-13 22:51:56.027 +57411,20473.0,2,,57410.0,,,,CC BY-SA 3.0,"

Entropy (joint entropy included), is a property of the distribution that a random variable follows. The available sample (and hence the timing of observation) plays no role in it.

+ +

Copying from Cover & Thomas, the joint entropy $H(X,Y)$ of two discrete random variables $X, Y,$ with joint distribution $p(x,y)$, is defined as

+ +

$$H(X,Y) = - \sum_{S_X}\sum_{S_Y}p(x,y)\log p(x,y) $$

+ +

Examine the expression: the sums are taken over all possible values of $X$ and $Y$, i.e. over all the values that belong to the support of each r.v. ($S_X$ and $S_Y$ respectively), irrespective of whether some of these values may not materialize or be observed in a sample. What we actually observe, or when, plays no role, in calculating entropy, and joint entropy in particular.

+ +

Turning to your specific example: The side of a coin itself can not be modeled as a random variable. A random variable maps events into real numbers. The side of a coin is not an event. Observing one of the two sides is an event. Not observing a side, is an event. So let's define a random variable $X$ by ""$X$ takes the value $1$ if heads is observed, $0$ otherwise"". And define $Y$ by ""$Y$ takes the value $1$ if tails is observed, $0$ otherwise"". Assume the coin is fair. The joint distribution of these two random variables is then described by
$$\begin{align} P(X=1,Y=1) &= 0 \\ P(X=1,Y=0) &= 0.5 \\ P(X=0,Y=1) &= 0.5 \\ P(X=0,Y=0) &= 0 \end{align}$$

+ +

Note that the numerical mapping we chose (the zero/one values) does not play, as numbers go, any decisive part in the probabilities assigned -we could have chosen a 5/6 mapping for $X$ and a 56/89 mapping for $Y$ (or whatever) -the allocation of probabilities in the joint distribution would have been the same (it is the underlying structure of events that is the critical factor).
+Next, as always, we consider the distribution at non-zero values, so

+ +

$$H(X,Y) = - 0.5\log(0.5) - 0.5\log(0.5) $$

+ +

and using base-2 for the logarithm we get

+ +

$$H(X,Y) = - 0.5(-1) - 0.5(-1) = 1 $$

+ +

Finally, you can easily find that the entropy of $X$ (and likewise for $Y$) is +$$H(X) = - \sum_{S_X}p(x)\log p(x) = - 0.5(-1) - 0.5(-1) = 1 $$

+ +

So in this case $H(X,Y) = H(X) = H(Y)$. But the general expression for the decomposition of joint entropy is

+ +

$$H(X,Y) = H(X) + H(Y\mid X) = H(Y) + H(X\mid Y)$$

+ +

where $H(Y\mid X)$ and $H(X\mid Y)$ are conditional entropies. Then we conclude that $H(Y\mid X) = H(X\mid Y) = 0$ in this case. The intuition is straightforward: given $X$ what has happened to $Y$ is certain (and likewise in reverse), so conditional entropy is zero.

+",2013-10-14 00:19:58.667 +57412,22665.0,1,,,,Understanding Bayesian Predictive Distributions,,CC BY-SA 3.0,"

I'm taking an Intro to Bayes course and I'm having some difficulty understanding predictive distributions. I understand why they are useful and I'm familiar with the definition, but there are some things I don't quite understand.

+ +

1) How to get the right predictive distribution for a vector of new observations

+ +

Suppose that we have built a sampling model $p(y_i | \theta)$ for the data and a prior $p(\theta)$. Assume that the observations $y_i$ are conditionally independent given $\theta$.

+ +

We have observed some data $\mathcal{D} = \{y_1, y_2, \, ... \, , y_k\}$, and we update our prior $p(\theta)$ to the posterior $p(\theta | \mathcal{D})$.

+ +

If we wanted to predict a vector of new observations $\mathcal{N} = \{\tilde{y}_1, \tilde{y}_2, \, ... \, , \tilde{y}_n\}$, I think we should try to get the posterior predictive using this formula
$$p(\mathcal{N} | \mathcal{D}) = \int p(\theta | \mathcal{D}) p ( \mathcal{N} | \theta) \, \mathrm{d} \theta = \int p(\theta | \mathcal{D}) \prod_{i=1}^n p(\tilde{y}_i | \theta) \, \mathrm{d} \theta,$$
which is not equal to
$$\prod_{i=1}^n \int p(\theta | \mathcal{D}) p(\tilde{y}_i | \theta) \, \mathrm{d} \theta,$$
so the predicted observations are not independent, right?

+ +

Say that $\theta | \mathcal{D} \sim$ Beta($a,b$) and $p(y_i | \theta) \sim$ Binomial($n, \theta$) for a fixed $n$. In this case, if I wanted to simulate 6 new $\tilde{y}$, if I understand this correctly, it would be wrong to simulate 6 draws independently from the Beta-Binomial distribution that corresponds to the posterior predictive for a single observation. Is this correct? I don't know how to interpret that the observations are not independent marginally, and I'm not sure I understand this correctly.
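To make the contrast concrete, this is the kind of simulation I have in mind (a sketch; $a$, $b$ and $n$ are just placeholder values):

a <- 2; b <- 3; n <- 10; B <- 1e5
theta <- rbeta(B, a, b)
joint <- matrix(rbinom(6 * B, size = n, prob = rep(theta, each = 6)),
                ncol = 6, byrow = TRUE)   # 6 draws sharing one theta per replicate
indep <- matrix(rbinom(6 * B, size = n, prob = rbeta(6 * B, a, b)),
                ncol = 6, byrow = TRUE)   # 6 separate Beta-Binomial draws
cor(joint[, 1], joint[, 2])               # clearly positive
cor(indep[, 1], indep[, 2])               # approximately zero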

+ +

Simulating from posterior predictives

+ +

Many times when we simulate data from the posterior predictive we follow this scheme:

+ +

For $b$ from 1 to $B$:

+ +

1) Sample $\theta^{(b)}$ from $p(\theta | \mathcal{D})$.

+ +

2) Then simulate new data $\mathcal{N}^{(b)}$ from $p(\mathcal{N} | \theta^{(b)})$.

+ +

I don't quite know how to prove this scheme works, although it looks intuitive. Also, does this have a name? I tried to look up a justification and I tried different names, but I had no luck.

+ +

Thanks!

+",2013-10-14 00:46:15.680 +57418,19043.0,1,,,,Do only certain pairwise comparisons after significant interaction in two-way ANOVA,,CC BY-SA 3.0,"

I am comparing measurements on a test group relative to a control group in three different environmental conditions. I am interested in both differences between environmental conditions and differences between test and control groups. I ran a two-way ANOVA with an interaction term and looked at pairwise comparisons when terms were significant.

+ +

When the interaction term was significant the Tukey HSD function in R automatically outputs all comparisons. Comparisons between test and test groups on different environmental conditions, comparisons between test and control groups on different environmental conditions and so forth. Needless to say, this resulted in a large number of tests to correct for.

+ +

My adviser thinks that I should only do three tests to compare the test group to the control group in each environmental condition (and then only adjust for three tests). I think that because I am interested in differences between environmental conditions in this study, I should run most of the tests. If I wasn't interested in differences between environmental conditions it should be a nested ANOVA, right? You can see from graphs that the interaction term comes from differences between test and control groups at two ecological sites, but it doesn't seem valid to only run comparisons between groups you 'suspect' will be different. The ones I'm not sure I care about are the differences between test and control groups under two different environmental conditions.

+ +

Is it valid to only run comparisons between the groups you are interested in, to reduce the number of tests you have to adjust the Tukey HSD p-value for, or should you run comparisons on all combinations of groups?

+ +

Thanks for help in advance.

+",2013-10-14 04:02:43.010 +57419,22668.0,1,,,,Weightining using TraMineR,,CC BY-SA 3.0,"

I have read some posts on weighting. However, I am still unclear on the sort of weights I need to use. I am using data from the Longitudinal Survey of Australian Youth (LSAY). This survey provides longitudinal weights for each survey wave (i.e. corrections for sampling error and attrition). Because I have weight variables for each period (10) and I can only specify one of them in TraMineR, I am not sure which I should use. I have read that either the weights for the first or for the last wave should be used, but I have not read any reasons why.

+ +

Can anyone provide me some guidance on this issue?

+",2013-10-14 04:45:22.343 +57420,306.0,2,,57417.0,,,,CC BY-SA 3.0,"

First things first. There needs to be greater information given as this does not have a universally correct answer. Different types of distributions have to be looked at with different types of procedures.

+ +

But just to show that yes this is possible, we assume that each of the variables that you have mentioned are normally distributed but the parameters of the normal distributions are different from each other for any given pair.

+ +

Now we take n samples of each of these variables. Then we calculate the correlation coefficients for each pair of variables. If we cannot reject the hypothesis that these correlation coefficients are zero, we hypothesize that the variables are independent of each other. So we have a set of variables which are independent of each other, but which have different probability distributions.
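A minimal sketch of that pairwise check in R (the parameters here are made up):

set.seed(1)
x1 <- rnorm(200, mean = 0, sd = 1)
x2 <- rnorm(200, mean = 5, sd = 2)
cor.test(x1, x2)    # large p-value: no evidence against zero correlation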

+",2013-10-14 05:05:30.750 +57421,22669.0,1,57423.0,,,"ARIMA (0,1,1) or (0,1,0) - or something else?",,CC BY-SA 3.0,"

I've just started learning time series so please excuse me if it's painfully obvious; I haven't managed to find the answer elsewhere.

+ +

I have a data series showing a pretty obvious trend although it's quite noisy. I can take pretty much any division of the data and run classical tests to show a highly significant difference in means.

+ +

I decided to have a look at time series analysis to see if it could help describe the trend. An ARIMA(0,1,1) model comes out with AIC,BIC=34.3,37.3 (Stata), whilst an ARIMA(0,1,0) model comes out with AIC,BIC=55.1,58.1 - so I understand I'm supposed to prefer the (0,1,1) model.

+ +

However, the coefficient for the MA(1) is displaying as -0.9999997 (and not showing any p-values). If I try the same in SPSS I get an MA(1) coefficient of 1.000 (I assume SPSS uses opposite signs) with a p-value of 0.990 - does this mean it suggests I drop the term?

+ +

My understanding is that the effect of a MA(1) coefficient of -1 is basically to remove the old error term and convert the whole series to a linear trend. Does this mean ARIMA is totally unsuitable for my needs? On the plus side it gives me a sensible value for the trend. If I use the (0,1,0) model then I still get a reasonable value for the trend but it's not significant any more.

+ +

Thanks for your help!

+ +

EDIT: Thanks for looking in. The trend looks like a fairly linear decrease; the data points seem to rattle around fairly noisily above and below a trend line. The ARIMA(0,1,1) model produces something that's not far off a straight-line decrease, which seems sensible - the (0,1,0) produces what is essentially a lagged version of the data, translated down by one month of trend. The data aren't stationary (due to the trend) - though the first differences seem to be. I don't think the (0,1,1) is a bad model - I'm just a little confused by the p-value seeming to suggest I should drop the MA term - or wondering if it means I should bin ARIMA entirely!

+ +

EDIT2: @vinux - thanks for the suggestion; that makes a lot of sense (and seems to be what the -1 MA term is trying to create?). I've uploaded as many graphs as I could think of, as people had requested.

+ +

+ + + + +

+ +

I've also put the monthly data up in CSV format at pastebin

+",2013-10-14 05:55:50.823 +57422,9007.0,2,,57419.0,,,,CC BY-SA 3.0,"

TraMineR, and more generally sequence analysis, treats each sequence as a whole. Ideally, weights should therefore correct for the selection bias of the sample of sequences you consider, and longitudinal weights should be used. The choice of weights depends on which sequences you retain for your analysis:

+ +
  1. If you use only complete sequences up to the last wave, then you should choose the longitudinal weights associated with the last wave, which account for attrition up to the last wave.

  2. If you choose to retain all sequences that are complete until, say, the 6th wave and admit missing states for the last four waves, then you should choose the weights associated with wave 6.

  3. If you retain all sequences, even those for which you have valid data for the first wave only, then you should use the weights of the first wave.

  4. In case you select only sequences with no more than, say, 4 missing states, the choice may be more ambiguous. If most of the four missing states occur in the last four positions, then you could adopt solution 2 as a good approximation.

+ +

Hope this helps.

+",2013-10-14 05:56:44.773 +57423,5637.0,2,,57421.0,,,,CC BY-SA 3.0,"

It is difficult to give the right answer without looking at the data. Here are some points that may help you in your modelling.

+ +

An MA(1) coefficient very close to 1 (in absolute value) is a sign of overdifferencing. This means there is a unit root in the moving-average part.

+ +

My suggestion would be: check whether the original series is stationary (visually) or test for the presence of a unit root. If you observe a deterministic trend (e.g. linear), add the trend term to the time series model. If the original series is stationary, build the time series model without differencing.
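A sketch of those checks in R (here y stands for your original, undifferenced series; the tseries package is assumed to be available for the ADF test):

library(tseries)
adf.test(y)                         # unit-root test on the original series
t_idx <- seq_along(y)
fit <- lm(y ~ t_idx)                # deterministic linear trend
acf(resid(fit)); pacf(resid(fit))   # then model the de-trended remainder, e.g. with arima()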

+",2013-10-14 06:12:22.867 +57541,2149.0,2,,57539.0,,,,CC BY-SA 3.0,"

You could order the nob values from low to high and then use a search procedure to identify when and if the local mean(s) changed significantly via Intervention Detection (trial and error). ID is essentially a single-dimension (single-characteristic) cluster analysis. Alternatively you could pre-specify the number of groups (classes) that you wish to have (n) and then find the n-1 breakpoints which optimally classify the nob values. I have never done this but it might be worth a try.

+",2013-10-15 15:11:16.427 +57424,1406.0,2,,57413.0,,,,CC BY-SA 3.0,"

Time series issues, such as unit roots, etc in panel data can be accounted for when there is enough time series dimension for single unit regression estimation. This means at least 30 observations. If you have less, you can only use ideas from time series regressions, such as doing regression on growth rates instead of levels, etc.

+ +

In fact J. Wooldridge in his book ""Econometric Analysis of Cross Section and Panel Data"" recommends treating all the time series issues as a question of the covariance matrix of the unit's error term. Translated into Stata parlance: use cluster-robust standard errors for your analysis and you should be OK, with the usual caveat that there are no magical fixes in modelling, i.e. if your model is not sound, no fancy estimation method is going to help you.

+",2013-10-14 07:37:48.140 +57425,11117.0,1,57431.0,,,Vocabulary: do we measure actual values or observations?,,CC BY-SA 3.0,"

Consider that $\theta$ is a hidden parameter and one has an observation $O$ such that
$$O \sim N(\theta,\sigma^2).$$
My question concerns vocabulary:

+ +

do we measure $\theta$ and it gives us $O$? (so we measure the true value)

+ +

or

+ +

do we measure $O$ ? (so we measure the observation)

+ +

I am looking for unquestionable sources.

+",2013-10-14 07:44:40.060 +57426,20470.0,1,57446.0,,,Hidden Markov model for event prediction,,CC BY-SA 3.0,"

Question: Is the set-up below a sensible implementation of a Hidden Markov model?

+ +

I have a data set of 108,000 observations (taken over the course of 100 days) and approximately 2000 events throughout the whole observation time-span. The data looks like the figure below where the observed variable can take 3 discrete values $[1,2,3]$ and the red columns highlight event times, i.e. $t_E$'s:

+ +

+ +

As shown with red rectangles in the figure, I have dissected {$t_E$ to $t_{E-5}$} for each event, effectively treating these as ""pre-event windows"".

+ +

HMM Training: I plan to train a Hidden Markov Model (HMM) based on all ""pre-event windows"", using the multiple observation sequences methodology as suggested on Pg. 273 of Rabiner's paper. Hopefully, this will allow me to train an HMM that captures the sequence patterns which lead to an event.

+ +

HMM Prediction: Then I plan to use this HMM to predict $log[P(Observations|HMM)]$ on a new day, where $Observations$ will be a sliding window vector, updated in real-time to contain the observations between the current time $t$ and $t-5$ as the day goes on.

+ +

I expect to see $log[P(Observations|HMM)]$ increase for $Observations$ that resemble the ""pre-event windows"". This should in effect allow me to predict the events before they happen.

+",2013-10-14 07:49:16.613 +57427,20473.0,2,,57390.0,,,,CC BY-SA 3.0,"

Using the chain rule, the joint density here can be decomposed as (denoting $\mathbf X$ the collection of the $n+1$ random variables)

+ +

$$f_{\mathbf X}(x_n,x_{n-1},...,x_0) = f(x_n\mid x_{n-1},...,x_0)\cdot f(x_{n-1}\mid x_{n-2},...,x_0)\cdot f(x_{n-2}\mid x_{n-3},...,x_0) \cdot...\cdot f(x_0)$$

+ +

$$=\left(\prod_{i=1}^{n}\frac {1}{\sqrt{2\pi}}\exp\left\{-\frac {(x_i-\alpha x_{i-1})^2}{2}\right\}\right)\frac {1}{\sqrt{2\pi}}\exp\left\{-\frac {x_0^2}{2}\right\}$$

+ +

Viewed as a likelihood function of $\alpha$, and taking its natural logarithm, we have

+ +

$$\ln L(\alpha \mid \mathbf X) = -\frac 12\sum_{i=1}^n (x_i-\alpha x_{i-1})^2 +c$$

+ +

...where in $c$ is also included the density of $x_0$ (but $x_0$ affects estimation of $\alpha$ through its presence in the conditional density related to $X_1$).

+ +

Then

+ +

$$\frac {\partial \ln L(\alpha \mid \mathbf X)}{\partial \alpha} = \frac {\partial }{\partial \alpha} \left(-\frac 12\sum_{i=1}^n (x_i-\alpha x_{i-1})^2\right)$$

+ +

$$=-\frac 12\frac {\partial }{\partial \alpha} \left(\sum_{i=1}^n (x_i^2-2\alpha x_ix_{i-1}+\alpha^2x_{i-1}^2)\right) $$

+ +

$$=-\frac 12\frac {\partial }{\partial \alpha} \left(\sum_{i=1}^n x_i^2-2\alpha \sum_{i=1}^nx_ix_{i-1}+\alpha^2\sum_{i=1}^nx_{i-1}^2)\right) $$

+ +

$$=\sum_{i=1}^n x_ix_{i-1} -\alpha\sum_{i=1}^nx_{i-1}^2$$

+ +

Setting

+ +

$$\frac {\partial \ln L(\alpha \mid \mathbf X)}{\partial \alpha} =0\Rightarrow \hat \alpha_{ML} = \frac {\sum_{i=1}^n x_ix_{i-1}}{\sum_{i=1}^nx_{i-1}^2}$$

+ +

while $$\frac {\partial^2 \ln L(\alpha \mid \mathbf X)}{\partial \alpha^2} = -\sum_{i=1}^nx_{i-1}^2 <0$$

+ +

which guarantees a global and unique maximum, since it is negative irrespective of $\alpha$.
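A quick numerical check of this estimator, simulating the model above with a made-up $\alpha$:

set.seed(1)
n <- 5000; alpha <- 0.6
x <- numeric(n + 1); x[1] <- rnorm(1)           # x[1] plays the role of x_0
for (i in 2:(n + 1)) x[i] <- alpha * x[i - 1] + rnorm(1)
sum(x[-1] * x[-(n + 1)]) / sum(x[-(n + 1)]^2)   # close to the true alpha = 0.6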

+",2013-10-14 08:16:58.337 +57428,22629.0,1,,,,How to discretise continuous attributes while implementing the ID3 algorithm?,,CC BY-SA 3.0,"

I am trying to implement the ID3 algorithm on a data set. However, all attributes are continuous and can have values between 1 and 10. I found that we have to specify the bin intervals for discretization, but I couldn't understand how to do this exactly.
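To make the question concrete: is specifying the bin intervals just something like the following in R (x being one attribute column; the cut points below are arbitrary examples)?

bins  <- cut(x, breaks = c(0, 2.5, 5, 7.5, 10), include.lowest = TRUE)     # equal width
bins2 <- cut(x, breaks = unique(quantile(x, probs = seq(0, 1, 0.25))),
             include.lowest = TRUE)                                        # equal frequency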

+ +

Can someone explain how to do this? The data set I am using is the Breast Cancer Data from Wisconsin hospitals.

+",2013-10-14 08:33:17.943 +57429,20740.0,1,57544.0,,,Linear regression with independent variables with varying proportions,,CC BY-SA 3.0,"

I am looking to do a linear regression on two independent variables that will be present in varying proportions.

+ +

For example trying to do a linear regression on $Y$ which is payment behavior (payback rate) of customers based on the the quality (let's say Gini coefficient) of the new and existing customer credit scores ($X_1$ and $X_2$, respectively) adjusted for the proportion of new and existing customers in the sample.

+ +

Existing customers will be present in proportion $p$ and new customers in proportion $1-p = q$.

+ +

$Y$, payback rate is the percentage of total customers who pay back. It could be expressed as the weighted average $Y = Y_1q + Y_2p$ where $Y_i$ is the payback rate of new/existing customers.

+ +

In general more new customers, $q$, has a negative effect. Better scoring ($X_1, X_2$) and more existing customers p have a positive effect.

+ +

What is a good way to model this?

+ +

Would something like the following be a good solution trying to use $p$ and $q$ as some sort of interaction effect?

+ +

$Y = X_1+X_2+\frac{X_1}{q}+X_2 p$

+ +

Would it be better to include p and q as variables themselves as well?

+",2013-10-14 08:35:58.630 +57430,16474.0,2,,57429.0,,,,CC BY-SA 3.0,"

I would stick with $p$ only, as $q$ does not add any information on top of $p$. I would add interaction terms between $X_1$ and $p$ and between $X_2$ and $p$, and then include the main effects of $X_1$, $X_2$ and $p$. So:

+ +

$Y =\beta_0 + \underbrace{\beta_1 X_1 + \beta_2 X_2 + \beta_3 p}_{\textrm{main effects}} + \underbrace{\beta_4 X_1 p + \beta_5 X_2 p}_{\textrm{interactions}} + \varepsilon$
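In R formula notation this is simply (dat standing for your data frame):

fit <- lm(Y ~ X1 * p + X2 * p, data = dat)   # X1*p expands to X1 + p + X1:p, likewise X2*p
summary(fit)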

+",2013-10-14 08:46:34.843 +57431,12683.0,2,,57425.0,,,,CC BY-SA 3.0,"

Statistics doesn't give a special meaning to 'measurement' in the way it does to 'estimate'. (As @Glen said, we 'estimate parameters'.) So it's going to depend on your area of application and on what $O$ and $\theta$ represent.

+ +

If the variance $\sigma^2$ describes the measurement error of some instrument or procedure, and $\theta$ is some property considered rather inherent to the thing being measured, it's natural to talk about 'measuring $\theta$', and about the $O$s as 'measurements of $\theta$'. E.g. the $O$s are several measurements of the length $\theta$ of a steel shaft.

+ +

If the variance $\sigma^2$ describes the variability of different individuals, and $\theta$ is some feature of the population considered rather contingent, it's not so natural to talk about 'measuring $\theta$'. E.g. the $O$s are single measurements of the lengths of each steel shaft from a batch, rather than measurements of the average length $\theta$ of a shaft in the batch .

+ +

In any case 'measuring an observation' is oddly worded; 'making an observation' is usual.

+",2013-10-14 09:18:20.040 +57432,12683.0,2,,57365.0,,,,CC BY-SA 3.0,"

In regression analysis each response $Y_i$ is modelled conditional on the observed predictor value $x_i$; as (with a normal distribution of errors) $Y_i\sim\mathcal{N}(\beta_0+\beta_1 x_i,\sigma^2)$ where $\beta_0$ and $\beta_1$ are the intercept & slope coefficients respectively, and $\sigma^2$ is the common error variance. Just as if the $x_i$s had been set by an experimenter rather than themselves sampled. The marginal distribution of the $Y_i$s is not necessarily thought about at all; but can be obtained using the conditional model, for any assumed distribution of the $X_i$s.

+ +

Marginal models are sometimes used for panel/longitudinal data instead of conditional multi-level models with random effects. See Lee & Nelder (2004), ""Conditional and Marginal Models: Another View"", Statistical Science, 19, 2 for a (rather critical) account.

+",2013-10-14 11:06:43.030 +57433,,2,,57086.0,user12555,,,CC BY-SA 3.0,"

I have experience of deploying random forests in a SQL Server environment via User Defined Function. The trick is to convert the IF-THEN ELSE rules that you get from each tree into a CASE-WHEN END or any other Conditional Processing construct (admittedly I've used JMP Pro's Bootstrap Forest implementation - 500k lines of SQL code).

+ +

There is absolutely no reason why this cannot be achieved using the rattle R package. Have a look at the randomForest2Rules and printRandomForests functions in that package. Both take a random forest object as input, visit each tree in the forest, and output a set of IF-THEN ELSE rules. Taking this as a starting point, it should not be difficult to convert this logic into your desired language in an automated way, since the output from the above-mentioned functions is structured text.
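If you prefer to stay inside the randomForest package itself, getTree already gives you a per-tree node table that can be walked to produce the same kind of rules (a sketch on the iris data, not the rattle route described above):

library(randomForest)
rf    <- randomForest(Species ~ ., data = iris, ntree = 50)
tree1 <- getTree(rf, k = 1, labelVar = TRUE)   # one tree as a node table
head(tree1)
# each row is a node (left/right daughter, split variable, split point, prediction);
# walking this table mechanically yields the nested IF-THEN-ELSE / CASE WHEN rules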

+ +

The above also makes it important to decide on the smallest number of trees you need in the forest to make predictions at the desired level of accuracy, in order to keep down the number of lines needed to represent the forest (hint: plot(rf.object) shows you the point beyond which the forest predictions do not improve despite adding more trees).

+",2013-10-14 13:08:34.127 +57434,1945.0,1,,,,How does RVM achieve sparsity?,,CC BY-SA 3.0,"

I have read several textbook descriptions on RVM and none of them provide an adequate (plain English) explanation of how RVM achieves sparsity.

+ +

I am left feeling like the authors left out a paragraph of text that would have connected the dots and instead decided to replace (rather than supplement) it with mathematical derivations.

+ +

Could someone please explain the basic idea as to how RVM works in relation to learning sparse regression models?

+",2013-10-14 13:13:22.673 +57435,22677.0,1,,,,"Can someone enlighten me on what is ""Neglected Nonlinearity""?",,CC BY-SA 3.0,"

I ask this question out of curiosity

+ +

Earlier today, when I was trying to test for heteroscedasticity in R, I accidentally mistook white.test of the tseries package for white.test of the bstat package.

+ +

I found out later that the former tests for neglected nonlinearity while the latter tests for heteroscedasticity.

+ +

Now this is something new; I hadn't heard about the ""neglected"" part before. Can someone please enlighten me about the ""neglected""?

+",2013-10-14 13:34:50.787 +57436,7949.0,2,,57319.0,,,,CC BY-SA 3.0,"

There is no obvious relationship between $R^2$ and reversal of the sign of a regression coefficient. Assume you have data for which the true model is, for example,
$$y_i = 0 + 5x_i - z_i + \epsilon_i$$
with $\epsilon_i \sim N(0, sd_\text{error}^2)$. I show the zero to make explicit that the intercept of the true model is zero; this is just a simplification.

+ +

When x and z are highly correlated and centered about zero, the coefficient of z when regressing on z alone will be positive instead of negative. Note that the true model coefficients do not change with $sd_\text{error}$, but you can make $R^2$ vary between zero and one by changing the magnitude of the residual error. Look for example at the following R code:

+ +
require(MASS)
+sd.error <- 1
+x.and.z <- mvrnorm(1000, c(0,0) , matrix(c(1, 0.9,0.9,1),nrow=2)) # set correlation to 0.9
+x <- x.and.z[, 1]
+z <- x.and.z[, 2]
+y <- 5*x - z + rnorm(1000, 0, sd.error) # true model
+modell1 <- lm(y~x+z)
+modell2 <- lm(y~z)
+print(summary(modell1)) # coefficient of z should be negative
+print(summary(modell2)) # coefficient of z should be positive   
+
+ +

and play a bit with sd.error. Look for example at $sd_\text{error}=50$.

+ +

Note that with a very large sd.error the coefficient estimation will become more unstable and the reversal might not show up every time. But that's a limitation of the sample size.

+ +

A short summary would be that the variance of the error does not affect the expectations and thus reversal. Therefore neither does $R^2$.

+",2013-10-14 13:45:26.510 +57437,10147.0,1,,,,Ordered Response Variable,,CC BY-SA 3.0,"

For regression with an ordered response variable there are different methods, for example discriminant analysis and probit or logit models. I am wondering what the different focuses of these methods are and which one is used more often.

+",2013-10-14 14:23:05.487 +57470,22143.0,2,,57452.0,,,,CC BY-SA 3.0,"

Try 2:

+ +

This is a heuristic and I don't know of any statistical guarantees. The procedure is as follows:

+ +
  • construct the empirical distribution function. If it looks exponential, convert the values to log scale to see a power-law tail.

  • Fit a curve on this modified histogram. That is, do a 1-D regression. Hopefully the curve mimics the tail of a well-behaved distribution.

  • Pick the point where the line intersects the x-axis in the interval $[\max_{i=1,...,N_s}x_i,\infty)$.
+ +

This is another estimator of the max value of the support of the population.

+",2013-10-14 22:17:38.120 +57438,13385.0,1,,,,Finding parameters to maximize expected utility of random variable,,CC BY-SA 3.0,"

I'm trying to analyze some data consisting of five randomized parameters and a utility function which indirectly depends on the parameters, by experimentation. That is to say, the parameters of the experiment are chosen randomly, and successes and failures are counted up. I want to find parameters for which the expected utility of successes and failures is highest.

+ +

From my days in calculus, I can see that an algorithm could consist of:

+ +
  1. Regression to a (hopefully analytically tractable) surface

  2. Finding a maximum

  3. Finding the pre-image of my maximum (if I use any of the C libraries I've seen, which seem to focus on the maximum value, not its pre-image)
+ +

But I'm not sure about the ""fiddly bits"" like:

+ +
  • The distribution of points (I don't have any data yet)

  • Any substantive idea of the shape of the surface, though I am expecting diminishing marginal utility, so it should be non-linear and have a bump.

  • Numerical stability
+ +

This seems like it should be straight-forward, in terms of applied decision theory. So, is my plan sensible? Any pointers to literature, algorithms, C or Haskell libraries?

+ +

Addition in response to comment:

+ +

I'm trying to find the ""best"" parameters in terms of student performance. The 5-tuple represents:

+ +
  1. $b$: The ""base"" waiting time before seeing a problem again.

  2. $p_1$: A constant factor if the student says the problem was easy.

  3. $p_2$: A constant factor if the student says it was hard.

  4. $p_3$: A constant factor if the student says it was ""normal"".

  5. $p_4$: A constant factor if the student got it wrong.
+ +

The waiting time for the next viewing is computed by multiplying together the base waiting time and all of the factors from the responses the student has issued, i.e. computing $e^{b \prod p_{i,j}}$. So, for example, a wrong answer makes the waiting time much shorter. An 'easy' report makes it quite a bit longer.
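In code, the waiting-time rule I just described is simply (with made-up numbers):

wait_time <- function(b, p) exp(b * prod(p))
wait_time(b = 1.2, p = c(2.0, 0.7))   # e.g. one 'easy' factor and one 'wrong' factor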

+ +

Now, if the student gets the next viewing wrong, we want to count it as a failure. If the student gets it right (regardless of the difficulty the student reports), we count it as a success.

+ +

I want to maximize the utility function $\frac{|\text{success}|}{|\text{total}|}$ by varying the 5-tuple. I guess $\frac{|\text{success}|}{|\text{failure}|}$ would serve the same purpose.

+",2013-10-14 14:28:38.417 +57439,20831.0,2,,57421.0,,,,CC BY-SA 3.0,"

As you say, the data are not stationary. We can find a stationary transformed series by differencing and check it with a unit root test (e.g. the Augmented Dickey-Fuller test, Elliott-Rothenberg-Stock test, KPSS test, Phillips-Perron test, Schmidt-Phillips test, Zivot-Andrews test, ...). We can talk about an ARMA model only after confirming stationarity.

+ +

The classical way to identify the ARMA(p, q) orders is via the ACF and PACF plots; ARMA(0,1) and ARMA(0,0) can be distinguished this way. Another method to identify p and q is the EACF, but it is not widely used for univariate time series.

+ +

Empirical studies show that AIC usually tends to overfit. The advantage of using AIC is that it allows an automatic algorithm to find the best model, but this is not usually recommended in traditional time series textbooks.

+",2013-10-14 14:41:48.740 +57440,2666.0,2,,57437.0,,,,CC BY-SA 3.0,"

I don't think that discriminant analysis will be very efficient because it does not use the ordering. There are 4 commonly used families for ordinal response that are based on direct probability modeling: logistic, probit, log-log (Cox model) and complementary log-log. These are implemented in the R rms package orm function, which also handles continuous $Y$. Graphical methods can be used to choose from among the 4. Proportional odds is the easiest to interpret.

+",2013-10-14 14:56:24.637 +57441,22678.0,2,,48125.0,,,,CC BY-SA 3.0,"

I would disagree with your first point. The $L_2$-regularized model is
$$\parallel Y-K\beta \parallel_2^2 + \lambda \beta^T R \beta$$
where $K$ is the known kernel matrix and $R$ is the regularization matrix. $K=R$ is only a good choice when the Gaussian kernel is used. For more information please see A. Smola, B. Schölkopf, On a Kernel-based Method for Pattern Recognition, Regression, Approximation, and Operator Inversion, 1997.

+ +

@author, the discussion about ""good kernels"" is rather popular. See this post for example: What function could be a kernel?

+ +

However, there are ways to compute an optimized kernel based on your regularization idea. You should find some approaches presented at NIPS.

+",2013-10-14 15:23:57.277 +57442,22677.0,1,,,,How to do Univariate Heteroscedasticity Test,,CC BY-SA 3.0,"

I just wanted to know how to do a heteroscedasticity test on a univariate model, for example:

+ +
  • a univariate autoregressive model

  • a univariate ARCH/GARCH model
+ +

If it is possible, how does one do that in R?

+",2013-10-14 15:34:17.913 +57443,2149.0,2,,57442.0,,,,CC BY-SA 3.0,"

This question was answered in 1988 http://www.unc.edu/~jbhill/tsay.pdf by R.Tsay and implemented in AUTOBOX in 1990. As of this date (today) no other forecasting/time series package has implemented his elegant and creative solution. Simply adjust your series for time trend changes, level shift changes, seasonal pulses and pulses AND the correct ARIMA structure. Verify that the model parameters are constant over time and then search for change points in error variance as he recommends.

+ +

Edited to respond to Nick ..

+ +

As you may know ARCH/GARCH concerns itself with developing an ARIMA model for the squared residuals. The problem is if you have unusual (one-time) anomalies these are dealt with by incorporating pulse indicator series, yielding a zero residual for each identified point. Squaring these residuals leads to a distribution that has long tails and is not amenable to ARIMA. When I programmed and implemented ARCH/GARCH so that I could jump on the ""next new thing"" I found that it was fundamentally inconsistent with Intervention Detection schemes. Essentially ARCH/GARCH provides a possible solution for a ""change in variance"" that may well be more easily handled by Intervention Detection (violations in the expected value). Thus at this point in time my preferences (Occam's Razor) for the simplest solution/transformation/drug/remedy causes me to keep the solution as simple as possible but not too simple. The current release of AUTOBOX treats variance heterogeneity by identifying anomalies, parameter changes and deterministic variance changes and no need for power transformations via Box-Cox... If all this fails the user can square the residuals and build an arima model to construct his/her own ARCH/GARCH model. Here I stand, I can do no other!

+",2013-10-14 15:43:16.087 +57444,22262.0,1,,,,Using quantile regression to predict probability of surpassing threshold,,CC BY-SA 3.0,"

Consider a continuous response $Y$ and design matrix vector $\mathbf{X}$. These are related through some function $f(X) = Y$. Suppose that I am interested in estimating the probability that $Y \leq 0.1$ conditional on observing $\mathbf{X}$.

+ +

I want to use quantile regression to do this - can I confirm that this is a legitimate methodology?

+ +

We have quantiles $\tau \in [0,1]$ and after estimating our quantile regression for each $\tau$ we have our quantile estimates $\mathbf{q} := \{\hat{Q}(\tau) : \tau \in \{0.01,0.02,...,0.99\}\}$. I want to select the $\tau$ such that $\hat{Q}(\tau) \approx 0.1$. When I find such a $\hat{Q}(\tau)$ it seems to then follow naturally that $P(Y \leq 0.1) = \tau$. The reason is that my model has estimated the $\tau$-th quantile to be $0.1$, which is the point on the x-axis of $Y$'s pdf that I need in order to determine $P(Y \leq 0.1)$.
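In code, the procedure I have in mind looks roughly like this (a sketch with the quantreg package; train, x_new and the model formula are placeholders):

library(quantreg)
taus <- seq(0.01, 0.99, by = 0.01)
fits <- lapply(taus, function(tau) rq(y ~ ., tau = tau, data = train))
qhat <- sapply(fits, function(f) predict(f, newdata = x_new))
taus[which(qhat >= 0.1)[1]]   # first tau whose estimated quantile reaches 0.1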

+ +

In practice this may not work since an estimated quantile can be lower for higher $\tau$ under some $\mathbf{X}$.

+ +

Not looking for logistic regression with a discretized response as a solution (since I already know about this).

+",2013-10-14 15:45:47.440 +57445,22143.0,2,,57434.0,,,,CC BY-SA 3.0,"

In Relevance vector machines (RVM) we have a prior on the weight vector $\mathbf{w}$ (which is $N+1$ dimensional, where $N$ is the number of examples) as shown in equation (5) of (1):
$$p(\mathbf{w}|\alpha) = \Pi_{i=0}^{N}\mathcal{N}(w_i|0,\alpha_i^{-1}),$$
where $\mathbf{\alpha}$ is the $N+1$ dimensional vector of hyperparameters.

+ +

This prior is supposed to ensure that the weight vector $\mathbf{w}$ (which represents the number of ""support vectors"" which are active) is ""sparse"" if we can integrate out all the nuisance parameters ($\alpha$). See paragraph preceding Section 2.2 in (1).

+ +

Potential points of confusion:

+ +
  • the notation $\mathbf{w}$ is different from the $d$-dimensional linear model representation. Here, while comparing RVM with SVM, only think of the dual SVM formulation with the $N+1$ dimensional parameter $\mathbf{w}$.

  • ""Sparse"" for (dual) SVMs means the number of support vectors is small. Do not confuse with number of non-zero coefficients in (the d-dimensional) linear models.
+",2013-10-14 15:49:20.960 +57446,4320.0,2,,57426.0,,,,CC BY-SA 3.0,"

One problem with the approach you've described is you will need to define what kind of increase in $P(O)$ is meaningful, which may be difficult as $P(O)$ will always be very small in general. It may be better to train two HMMs, say HMM1 for observation sequences where the event of interest occurs and HMM2 for observation sequences where the event doesn't occur. Then given an observation sequence $O$ you have
$$\begin{align*} P(HMM1|O) &= \frac{P(O|HMM1)P(HMM1)}{P(O)} \\ &\varpropto P(O|HMM1)P(HMM1) \end{align*}$$
and likewise for HMM2. Then you can predict the event will occur if
$$\begin{align*} P(HMM1|O) &> P(HMM2|O) \\ \implies \frac{P(HMM1)P(O|HMM1)}{P(O)} &> \frac{P(HMM2)P(O|HMM2)}{P(O)} \\ \implies P(HMM1)P(O|HMM1) &> P(HMM2)P(O|HMM2). \end{align*}$$

+ +

Disclaimer: What follows is based on my own personal experience, so take it for what it is. One of the nice things about HMMs is they allow you to deal with variable length sequences and variable order effects (thanks to the hidden states). Sometimes this is necessary (like in lots of NLP applications). However, it seems like you have a priori assumed that only the last 5 observations are relevant for predicting the event of interest. If this assumption is realistic then you may have significantly more luck using traditional techniques (logistic regression, naive bayes, SVM, etc) and simply using the last 5 observations as features/independent variables. Typically these types of models will be easier to train and (in my experience) produce better results.

+",2013-10-14 16:03:34.203 +57447,2666.0,2,,57444.0,,,,CC BY-SA 3.0,"

It doesn't appear that $Y$ is binary. Ordinal regression is a good choice here. With any of the ordinal models (proportional odds, proportional hazards, probit, etc.) you can compute the probability that $Y \geq y$ for all $y$. That probability will change at the unique values of $y$. The R rms package orm function implements this efficiently and has a function generator for exceedance probabilities. If you were extremely fortunate and really have Gaussian residuals you can use the maximum likelihood estimator of the exceedance probabilities, which is a simple function of $\hat{\mu}$ and $\hat{\sigma}$.

+",2013-10-14 16:42:40.467 +57448,16644.0,2,,57396.0,,,,CC BY-SA 3.0,"

The Dvoretzky-Kiefer-Wolfowitz inequality can be used here. The required sample size $b$ (I'm using $b$ to distinguish it from $n$ because you already set your population size as $n$ in the problem statement) is determined by $$b \geq \left( {1 \over 2 \epsilon^2 } \right) \mathrm{ln} \left( {2 \over \alpha} \right),$$ where $\epsilon$ is how close you want your empirical cdf to be and $1-\alpha$ is the confidence level.

+ +

So, for example, if you want to estimate $F(c)$ within $\epsilon = 0.01$ with 95% confidence, the formula gives a sample size of $$b \geq 18444.4,$$ or $b = 18445.$
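The arithmetic in R:

eps <- 0.01; alpha <- 0.05
(1 / (2 * eps^2)) * log(2 / alpha)   # 18444.4, so take b = 18445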

+ +

This will cover any and all $c,$ so it is possible you can do much better. Perhaps one of the commenters will fill in the details on a more efficient solution for a single value of $c.$

+",2013-10-14 17:26:35.837 +57449,21884.0,1,,,,Covariance matrix equality,,CC BY-SA 3.0,"

The (unbiased) sample covariance matrix

+ +

$$\mathbf{S}=\dfrac{1}{n-1}\sum_{j=1}^{n}(\mathbf{X}_{j}-\bar{\mathbf{X}})(\mathbf{X}_{j}-\bar{\mathbf{X}})^{T}$$
can be rewritten as

+ +

$$\mathbf{S}=\dfrac{1}{n-1}\mathbf{X}^{T}\mathbf{X}-\dfrac{1}{n(n-1)}\mathbf{X}^{T}\mathbf{1}\mathbf{1}^{T}\mathbf{X}$$

+ +

where $$\mathbf{1}=\left(\begin{array}{c} 1\\ \vdots\\ 1 \end{array}\right)_{(n\times1)}.$$

+ +

One (tedious) way of proving this is to expand out the left-hand side and the right-hand side of the equality and show that the entries of the matrices match. I've done this successfully.
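(A quick numerical check in R, not a proof, also confirms the identity:)

set.seed(1)
n <- 50; p <- 3
X <- matrix(rnorm(n * p), n, p)
ones <- matrix(1, n, 1)
S_alt <- (t(X) %*% X - t(X) %*% ones %*% t(ones) %*% X / n) / (n - 1)
max(abs(S_alt - cov(X)))   # ~ 1e-16: both expressions give the same matrix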

+ +

My question: is there a neater / more concise way to prove such an equality?

+",2013-10-14 17:30:14.367 +57450,19265.0,1,,,,What is the loss function for C - Support Vector Classification?,,CC BY-SA 4.0,"

In article LIBSVM: A Library for Support Vector Machines it is written that C-SVC uses the loss function

+

$$ \frac{1}{2}w^Tw+C\sum\limits_{i=1}^l\xi_i$$

+

I know what $w^Tw$ is.

+

But what is $\xi_i$? I know that it is somehow connected with misclassifications, but how is it calculated exactly?

+

P.S. I don't use any non-linear kernels.

+",2013-10-14 18:06:27.340 +57451,21840.0,1,57457.0,,,Probability of having real roots,,CC BY-SA 3.0,"

Let $U,V,W$ be independent random variables with $\mathrm{Uniform}(0,1)$ distribution. I am trying to find the probability that $Ux^{2}+Vx+W$ has real roots, that is, $P(V^{2}-4UW> 0)$. I have solved this question using a double integral, but how can I do this using a triple integral?
My approach: I started with the cdf:
$P(V^{2}-4UW >0) =P(V^{2} > 4UW) = P(V>2\sqrt{UW})$ $= \int\int_{2\sqrt{uw}}^1 P(V>2\sqrt{UW})\, dU\, dW$ $=\int\int\int_{2\sqrt{uw}}^1 v\, dU\, dW\, dV$

+ +

I am finding it hard to get the limits of the integral over the region in 3 dimensions.

+ +

Using a double integral:
$P(V^{2}-4UW >0) =P(V^{2} > 4UW) = P(-2\ln V <-\ln 4 - \ln U - \ln W) = P(X <-\ln 4 +Y)$
where $X=-2 \ln V,\; Y = - \ln U -\ln W$; $X$ has an $\exp(1)$ and $Y$ has a $\mathrm{gamma}(2,1)$ distribution.
$$P(X <-\ln 4 +Y) = \int_{\ln4}^\infty P(X < -\ln 4 +y) f_Y(y) dy =\int_{\ln 4}^\infty\int_0^{-\ln 4+y} \frac{1}{2} e^{-\frac{x}{2}}ye^{-y} dxdy $$
Solving this I got $0.2545$.

+ +

Thanks!

+",2013-10-14 18:11:53.353 +57567,22729.0,1,100174.0,,,Heteroscedasticity-consistent F-test,,CC BY-SA 3.0,"

Why is the F-test for overall significance (OLS regression analysis) invalid when residuals are heteroscedastic? Is there a way to calculate it in a consistent way under heteroscedasticity? Is there any function in R to accomplish that?

+",2013-10-15 20:02:04.407 +57452,22627.0,1,57462.0,,,"Expected maximum given population size, mean, and variance",,CC BY-SA 3.0,"

How would one estimate the maximum given population size, a few moments, and perhaps some additional assumption on the distribution?

+ +

Something like ""I'm going to do $N_s≫1$ measurements out of population of size $N_p≫N_s$; will record mean $μ_s$, standard deviation $σ_s$, and maximal value in the sample $X_s$; I am willing to assume binomial (or Poisson, etc) distribution; what is the expected maximal value $X_p$ of the entire population?""

+ +

Related question: does one need to make the assumptions on the nature of the population distribution, or the sample statistics would be enough to estimate $X_p$?

+ +

Edit: the background I just added in the comments may not be clear enough. So here it is:

+ +

The end purpose is to print a set of shapes (wires, gates, etc.) on a VLSI circuit that matches the designed shapes (a.k.a. targets) as well as possible. The measure of fitness of the manufactured set of shapes is the MAXIMAL difference from the target, rather than the $\sigma$ across the $\sim 10^9$ locations. The reason for evaluating the maximum difference is clear: a single short circuit is bad enough to bring down the entire chip, and then it wouldn't matter how close you were to the target at the remaining 99.999999% of the chip's locations.

+ +

The problem is that it's very costly to measure the printed shapes in too many locations: you literally need to look through an electron microscope at the half-manufactured chip (which is going to get trashed after the destructive measurements), adjust for metrology errors, etc. Therefore more than $10^4$ measurements are hardly ever done. The result of those measurements is the maximal target difference $X_s$ of the SAMPLE, as well as any other sample statistics you may wish for.

+ +

And now one needs to estimate the maximal difference $X_p$ for the entire population... And now one wishes that he paid more attention in the statistics class back in college...

+",2013-10-14 18:13:32.393 +57453,22682.0,1,57454.0,,,Is there a way to remove individual trees from a forest in the randomForest package in R?,,CC BY-SA 3.0,"

I am trying to implement the ideas in this paper: http://www.sciencedirect.com/science/article/pii/S0925231212003396.

+ +

This requires me to be able to remove individual trees from the forest and reclassify my training data for each removal. I've been using the randomForest package in R and had a comb through the manual but couldn't find any way of running the forest with a subset of trees, or even with an individual tree. There is a getTree function but that only gives a matrix of the node structure of the tree.

+ +

Is there any way to do this, either in randomForest (preferably) or via another random forest implementation (e.g. scikit-learn)?

+",2013-10-14 18:27:13.640 +57454,22507.0,2,,57453.0,,,,CC BY-SA 3.0,"

One idea is, instead of creating one forest with N trees, create N ""forests"" of 1 tree each by calling randomForest() N times. Then you could manipulate them as you wish.
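A rough sketch of that idea, using iris as a stand-in for your data (randomForest::combine is used to merge the single-tree 'forests' back together after dropping the ones you don't want):

library(randomForest)
single_trees <- lapply(1:100, function(i)
  randomForest(Species ~ ., data = iris, ntree = 1))
pruned <- do.call(randomForest::combine, single_trees[-7])   # forest without 'tree' 7
table(predict(pruned, iris), iris$Species)                   # reclassify the training data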

+",2013-10-14 18:48:24.640 +57455,14748.0,1,57465.0,,,Predictive algorithm validation,,CC BY-SA 3.0,"

In putting a binary 1/0 predictive algorithm into production, what are the consequences where only the positive (1) predictions are checked, meaning only true or false positives are detected, and then fed back into training the model? Will that bias the algorithm in any way so that it progressively gets worse and worse because it never sees true or false negatives?

+",2013-10-14 19:13:34.737 +57456,22143.0,2,,57450.0,,,,CC BY-SA 3.0,"

$\xi_i$ are the slack variables. They are typically nonzero when the 2-class data are non-separable. We are trying to minimize the slack as much as possible (by minimizing the sum of the slacks, since they are non-negative) along with maximizing the margin (i.e., keeping the $w^Tw$ term small).
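Concretely, in the standard soft-margin formulation with labels $y_i \in \{-1,+1\}$ (this is the textbook form, not anything LIBSVM-specific), each slack at the optimum equals the hinge loss of its training point:
$$\xi_i = \max\bigl(0,\ 1 - y_i(w^T x_i + b)\bigr)$$
so $\xi_i = 0$ for points on the correct side of the margin, and it grows linearly with the size of the margin violation.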

+ +

Exact calculation: Well, if the convex program has been solved to optimality without any optimization error, then yes, they are calculated exactly.

+",2013-10-14 20:04:52.890 +57457,6162.0,2,,57451.0,,,,CC BY-SA 3.0,"

Here is a solution without multiple integrals calculation (because I don't like multiple integrals). Actually it only uses three elementary simple integrals.
$$P(V^{2}-4UW \leq 0) = E\bigl[P(V^{2}-4UW \leq 0 \mid U,W)\bigr] = E\bigl[f(U,W)\bigr]$$ where $f(u,w)=P(V^{2}-4uw \leq 0)= \min\bigl\{1, 2\sqrt{uw}\bigr\}$.
$$E\bigl[f(U,W)\bigr] = E[g(W)]$$
where
$$\begin{align} g(w) & = E\bigl[\min\bigl\{1, 2\sqrt{Uw}\bigr\}\bigr] = 1 \times \Pr(2\sqrt{Uw}>1) + E\bigl[2\sqrt{Uw} \mathbf{1}_{2\sqrt{Uw}\leq 1}\bigr] \\ & = \Pr(U>\frac{1}{4w}) + 2\sqrt{w}E\bigl[\sqrt{U} \mathbf{1}_{U \leq \frac{1}{4w}}\bigr] \\ & = \max\bigl\{0, 1 - \frac{1}{4w}\bigr\} + 2\sqrt{w} \times \frac{2}{3} \times \min\bigl\{1, \frac{1}{{(4w)}^{\frac{3}{2}}}\bigr\} \\ & =\begin{cases} 0 + \frac{4}{3}\sqrt{w} & \text{if } w \leq \frac{1}{4} \\ 1 - \frac{1}{4w} + \frac{1}{6w} & \text{if } w > \frac{1}{4} \end{cases}, \end{align}$$
and we get
$$ E[g(W)] = \frac{1}{9} + \frac{3}{4} - \frac{1}{12} \log 4 = \frac{31}{36}-\frac{\log 2}{6},$$
and finally
$$P(V^{2}-4UW > 0) = \frac{5}{36} + \frac{\log 2}{6} \approx 0.2544134.$$
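A quick Monte Carlo check in R agrees with this value:

set.seed(7)
N <- 1e6
u <- runif(N); v <- runif(N); w <- runif(N)
mean(v^2 - 4 * u * w > 0)   # ~ 0.2544
5/36 + log(2)/6             # 0.2544134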

+",2013-10-14 20:07:35.763 +57458,22684.0,1,,,,Questions about thresholding the data,,CC BY-SA 3.0,"

I came across a data mining course project online.

+ +

The data is of samples with 7000 features as genes. Each gene is associated with a value. Some of the values are negative. The data looks like in this way:

+ +
SNO ""U48730_at"" ""U58516_at"" ""U73738_at"" ""X06956_at"" ""X16699_at"" ""X83863_at""
+
+X1 "" 27"" "" 161"" "" 0"" "" 34"" "" 2"" "" 116""
+X2 "" 27"" "" 265"" "" 0"" "" 98"" "" 2"" "" 123""
+X3 "" 24"" "" 126"" "" 0"" "" 21"" "" 0"" "" 142""
+X4 "" 27"" "" 163"" "" -1"" "" 16"" "" -1"" "" 134""
+X5 "" 41"" "" 138"" "" 1"" "" 29"" "" 1"" "" 153""
+X6 "" 55"" "" 107"" "" -1"" "" 17"" "" 0"" "" 152""
+X7 "" 27"" "" 99"" "" 0"" "" 57"" "" 1"" "" 139""
+X8 "" 2"" "" 137"" "" -1"" "" 19"" "" -3"" "" 213""
+X9 "" -5"" "" 161"" "" -3"" "" 23"" "" 2"" "" 193""
+X10 "" 0"" "" 110"" "" -3"" "" 7"" "" -1"" "" 208""
+X11 "" -7"" "" 67"" "" 1"" "" 2"" "" -2"" "" 149""
+X12 "" 4"" "" 93"" "" 3"" "" 37"" "" 2"" "" 266""
+X13 "" 2"" "" 75"" "" 3"" "" 30"" "" 6"" "" 205""
+
+ +

The professor advises the students to first do 'data cleaning'. The original sentence is: Threshold both train and test data to a minimum value of 20, maximum of 16,000.

+ +

I first thought that it meant searching over each gene and, if there is a value out of bounds, just discarding this gene as a feature. However, it seems that for every gene there must be a sample with a value out of bounds.

+ +

What should I do by ""threshold this data""? Is that like if the value is below 20, then set it 20 or if the value is above 16000, then just set it as 16000?

+ +

In fact, I did the last operation in R by

+ +
data[data<20] <- 20
+
+ +

and it turns out that the speed of the command is very slow. (79*7070 samples)

+",2013-10-14 20:08:14.273 +57459,22143.0,2,,57458.0,,,,CC BY-SA 3.0,"
+

What should I do by ""threshold this data""? Is that like if the value is below 20, then set it 20 or if the value is above 16000, then just set it as 16000?

+
+ +

Yes.
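That is, something along these lines (a sketch, assuming the values sit in a numeric matrix; use as.matrix() on the data frame first):

data[data < 20]    <- 20
data[data > 16000] <- 16000
# equivalently in one pass: data <- pmin(pmax(data, 20), 16000)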

+",2013-10-14 20:17:26.320 +57479,64247.0,1,,,abe3,How would you use pair-wise plots to test the effectiveness of k-means clustering?,,CC BY-SA 3.0,"

I am looking over slides for a big data class. The slides suggest doing a pairwise plot of data (if not too many variables) to evaluate the quality of output from k-means clustering -- with each data point color-coded by its cluster. The slides say:

+ +
+

If the (colored) clusters look separated in at least some of the plots. They won’t be very separated in all of the plots.

+
+ +

How would this tell you whether the clustering is effective? Would you want the colors to be mixed up in the plots, to make sure that you have genuine multi-dimensional clusters and not just groups of data points that are very similar on one variable?

+",2013-10-15 00:37:03.313 +57460,14799.0,2,,57319.0,,,,CC BY-SA 3.0,"

This is for OLS regression. Consider a geometric representation of three variables -- two predictors, $X_1$ and $X_2$, and a dependent variable, $Y$. Each variable is represented by a vector from the origin. The length of the vector equals the standard deviation of the corresponding variable. The cosine of the angle between any two vectors equals the correlation of the corresponding two variables. I will take all the standard deviations to be 1.

+ +

+ +

The picture shows the plane determined by the $X_1$ and $X_2$ when they correlate positively with one another. $Y$ is a vector coming out of the screen; the dashed line is its projection into the predictor space and is the regression estimate of $Y$, $\hat{Y}$. The length of the dashed line equals the multiple correlation, $R$, of $Y$ with $X_1$ and $X_2$.

+ +

If the projection is in any of the colored sectors then both predictors correlate positively with $Y$. The signs of the regression coefficients $\beta_1$ and $\beta_2$ are immediately apparent visually, because $\hat{Y}$ is the vector sum of $\beta_1 X_1$ and $\beta_2 X_2$. If the projection is in the yellow sector then both $\beta_1$ and $\beta_2$ are positive, but if the projection is in either the red or the blue sector then we have what appears to be suppression; that is, the sign of one of the regression weights is opposite to the sign of the corresponding simple correlation with $Y$. In the picture, $\beta_1$ is positive and $\beta_2$ is negative.

+ +

Since the length of the projection can vary between 0 and 1 no matter where it is in the predictor space, there is no minimum $R^2$ for suppression.

+",2013-10-14 20:20:50.680 +57461,22685.0,1,,,,What is the point of measuring statistical distance?,,CC BY-SA 3.0,"

On pg. 378 of ""Cryptography with Tamperable and Leaky Memory"", Kalai et al. claim two probability distributions are $e(k)$ close if the distance between them is at most $e(k)$.

+ +

What is the significance of two distributions X and Y being ""close to"" or ""far from"" each other? Why would anybody care, especially in cryptography?

+",2013-10-14 20:32:28.583 +57462,22143.0,2,,57452.0,,,,CC BY-SA 3.0,"

Try 1:

+ +

If $X \sim U[a,b]$ (uniform, either discrete or continuous), then the MLE estimator for b (which is $\max_{x \in [a,b]} X$) is essentially $\max_{i=1,...,N_s}x_i$.

+ +

I chose uniform distribution because it is the worst case distribution in terms of entropy. This is in line with the MaxEnt (maximum entropy) principle. I also assumed a linear order in the values of the random variable.

+ +

We can make the following claim about the concentration of the estimator $\max_{i=1,...,N_s}x_i$ around its mean using Hoeffding's inequality (without assuming that $X \sim U[a,b]$). Assuming the $x_i$ are i.i.d. from some distribution with bounded support $[a,b]$, we have \begin{align*} \mathbb{P}_{x_1,...,x_{N_s}}\left(|\max_{i=1,...,N_s}x_i - \mathbb{E}[\max_{i=1,...,N_s}x_i]| \geq \epsilon\right) \leq 2\exp\left(\frac{-2\epsilon^2}{N_s(b-a)}\right) \end{align*} Here we do not need to know $b$ exactly; any rough or crude upper bound will suffice. The above concentration result only says that the estimator is close to the expected value of the estimator, which is not the same as being close to the unknown $\max_{x \in [a,b]}X = b$.

+ +

Additional comment: I would make the measurements uniformly at random over the plane/chip so that hopefully no region with high $X$ values is missed. This observation is independent of the above.
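As a small illustrative simulation (my own addition, not part of the original answer): drawing i.i.d. samples from a uniform distribution and using the sample maximum as the estimate of the upper endpoint shows how the estimate approaches the true maximum as the number of measurements grows.

+import numpy as np
+
+rng = np.random.default_rng(42)
+a, b = 0.0, 100.0                      # true (unknown) support; b is what we want to estimate
+for n_s in (10, 100, 1000, 10000):
+    x = rng.uniform(a, b, size=n_s)    # measurements taken uniformly at random
+    print(n_s, x.max())                # the sample maximum creeps up towards b = 100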

+",2013-10-14 21:10:03.567 +57463,22687.0,1,58052.0,,,How does one generate the table mapping t-test values to p values?,,CC BY-SA 3.0,"

In the dark ages, we would map the results of a Student's t-test to a null hypothesis probability p by looking up T and degrees of freedom in a table to get an approximate result.

+ +

What is the mathematical algorithm that generates that table? ie, how can I write a function to generate a precise p given an arbitrary T and df?

+ +

The reason I ask is that I'm writing a piece of embedded software that continually monitors hundreds of populations with hundreds of samples each, and raises an alert if successive snapshots of a given population come to differ significantly. Currently it uses a crude z-score comparison, but it would be nice to use a more valid test.
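Not an authoritative answer, but as a sketch of the standard relationship: the two-sided p-value is twice the upper-tail area of the t distribution, and that tail area can be written in terms of the regularized incomplete beta function, which is what table-generating code typically evaluates. Assuming SciPy is available:

+from scipy import stats, special
+
+def p_value(T, df):
+    # two-sided p-value: twice the upper-tail area of the t distribution
+    return 2.0 * stats.t.sf(abs(T), df)          # sf is the survival function, 1 - CDF
+
+def p_value_betainc(T, df):
+    # the same quantity via the regularized incomplete beta function I_x(a, b)
+    return special.betainc(df / 2.0, 0.5, df / (df + T * T))
+
+print(p_value(2.086, 20), p_value_betainc(2.086, 20))   # both roughly 0.05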

+",2013-10-14 21:14:24.377 +57464,22507.0,2,,57455.0,,,,CC BY-SA 3.0,"

The algorithm which never received 0's will be grossly biased and will predict almost exclusively 1's.

+",2013-10-14 21:19:18.890 +57465,22143.0,2,,57455.0,,,,CC BY-SA 3.0,"

I am thinking of the following two points:

+ +
    +
  • You are observing the true labels and their associated predictors, a.k.a the pair $y_i,x_i$ only when the algorithm is predicting a label of $1$. The algorithm is updated regardless of whether it made an error or not. This means that there is no feedback on mistakes (like in online learning). We get new data irrespective of our prediction performance.

  • +
  • The question we need to ask is then: Does the algorithm's output influence the data source? If the algorithm is not influencing the source, then this aspect where we 'conditionally observe new data' will not bias the algorithm by itself (everything else held constant).

  • +
+",2013-10-14 21:24:50.677 +57466,22690.0,1,,,,Naive Bayes with invalid independence assumption,,CC BY-SA 3.0,"

I'm trying to understand the effects of adding non-conditionally independent features to a naive Bayes classifier. Let's say I have the features vector $X = [x_1,x_2,x_3,x_4]$ and that for each value of $x_3$ I get the same value for $x_4$:

+ +

For all $i \in \{samples\}$, $x_{3}^{i} = x_{4}^{i}$

+ +

I could say that the conditional independence assumption of the $x_n$ given the class $Y = y_k$ does not hold anymore, since the value of $x_{3}^{i}$ determines $x_{4}^{i}$, and that the naive Bayes classifier may not produce the expected results. I'm not really sure about that explanation and I would appreciate your point of view on it.
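Your intuition that the duplicated feature gets double-counted can be checked numerically. A tiny sketch (all probabilities are made up for illustration) shows how adding an exact copy of $x_3$ as $x_4$ pushes the naive Bayes posterior towards over-confidence:

+# prior and a single binary feature x3
+p_y1, p_y0 = 0.5, 0.5
+p_x3_given_y1, p_x3_given_y0 = 0.8, 0.3
+
+# posterior P(Y=1 | x3=1) using x3 once
+num = p_y1 * p_x3_given_y1
+den = num + p_y0 * p_x3_given_y0
+print(num / den)                       # ~0.727
+
+# posterior when x4 is an exact copy of x3 but treated as conditionally independent
+num2 = p_y1 * p_x3_given_y1 ** 2
+den2 = num2 + p_y0 * p_x3_given_y0 ** 2
+print(num2 / den2)                     # ~0.877 -- the same evidence counted twice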

+",2013-10-14 21:58:00.910 +57467,16469.0,1,57475.0,,,How to test (and accept) that a coefficient in a linear regression model equals zero,,CC BY-SA 3.0,"

I understand that in a linear regression model like:

+ +

$y_i = b_0 + b_1 x_i + \epsilon_i$

+ +

I can have a null and an alternative hypothesis:

+ +

$H_0: b_1 = 0$ and $H_1: b_1 \neq 0$.

+ +

And then I can reject $H_0$ or fail to reject $H_0$. But what if I want to accept that $b_1 = 0$?

+",2013-10-14 21:58:30.230 +57468,22693.0,1,,,,Whether to log transform variable when untransformed variable has positive skew and transformed has negative skew with additional missing data?,,CC BY-SA 3.0,"

I have performed a log transformation on my skewed data; however, my DV went from positive skew to negative skew after the (log) transformation. Furthermore, additional data were missing from my DV after the transformation. Please help.

+",2013-10-14 22:01:18.457 +57471,155.0,2,,57468.0,,,,CC BY-SA 3.0,"

Additional missing data after log transformation

+ +

If you have additional missing data after log transformation, it is likely that you have data that is less than or equal to zero. (i.e., log(0), log(-1), etc. is not defined). So if you want to use a log transformation on data with negative numbers, you need to add a constant to the raw variable so that the minimum of the resulting variable is greater than zero. So your transformation could be

+ +

$$\log(x + c)$$

+ +

where $x$ is your untransformed variable and $c = 1 - \textrm{min}(x)$.
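As an illustration only (the variable values below are made up), the shift-then-log transformation in Python/numpy would look like this:

+import numpy as np
+
+x = np.array([-3.0, 0.0, 1.5, 8.0, 40.0])   # hypothetical raw DV containing zero/negative values
+c = 1 - x.min()                              # shift so the minimum becomes 1 (> 0)
+x_log = np.log(x + c)                        # the log is now defined for every observation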

+ +

Transformation flips the skewness

+ +

There is plenty of discussion on this site about when and whether transformations are useful. You might also like this discussion of issues surrounding transformations. In general, if a log transformation is flipping the direction of your skewness, then there is a good chance that you did not have very much skewness to begin with. To test whether the transformation makes a substantive difference within the context of multiple regression, examine your correlations, R-squares, and standardised betas before and after transformation, and see what changes you observe. In many cases you will see that it makes little difference.

+ +

Another point is that the distributional assumption pertains to the residuals of a multiple regression and not to the dependent variable itself.

+ +

If you really care about optimising the transformation to make the variable approximate a normal distribution, then you can use the Box-Cox transformation. Or a simpler approach is just to try a range of transformations. A common set of transformations from greater to less change is:

+ +
-1/x^2
+-1/x
+log(x)
+sqrt(x)
+
+ +

So if log(x) is transforming too much, you could try sqrt(x).

+",2013-10-14 22:29:32.767 +57472,21991.0,1,57501.0,,,Numerical example to understand Expectation-Maximization,,CC BY-SA 3.0,"

I am trying to get a good grasp on the EM algorithm, to be able to implement and use it. I spent a full day reading the theory and a paper where EM is used to track an aircraft using the position information coming from a radar. Honestly, I don't think I fully understand the underlying idea. Can someone point me to a numerical example showing a few iterations (3-4) of the EM for a simpler problem (like estimating the parameters of a Gaussian distribution or a sequence of a sinusoidal series or fitting a line).

+ +

Even if someone can point me to a piece of code (with synthetic data), I can try to step through the code.

+",2013-10-14 22:37:36.997 +57473,5643.0,1,57477.0,,,Intuition for the standard error of the difference of sample means,,CC BY-SA 3.0,"

I read in Wilcox, 2003 p. 247 that the standard error of the difference between two sample means is (assuming the normality and homoskedasticity assumptions):

+ +

$\sqrt{\frac{\sigma_1^2}{n_1} + \frac{\sigma_2^2}{n_2}}$

+ +

Rather than simply adding the two sample standard errors as in:

+ +

$\frac{\sigma_1}{\sqrt{n_1}} + \frac{\sigma_2}{\sqrt{n_2}}$

+ +

What is the intuition behind taking the square of the sum of the two variances divided into their respective sample size, rather than the sum of the standard errors?

+",2013-10-14 23:00:43.557 +57474,22163.0,1,,,,Inverted SPSS results: Logistic regression command vs. Genlin?,,CC BY-SA 3.0,"

I want to do a logistic regression in SPSS. However, since I analyse unemployment spells, the subjects are sometimes repeated (violating the independence assumption of the regression). One way of removing the within-subject variation is by applying a Genlin model with the repeated subject subcommand (in essence a GEE model). Thus, I tried out a Genlin model with binomial probability and the logit link, comparing it to a standard logistic regression. I used the exact same variables in the two procedures.

+ +

However, the results that were delivered from the Genlin procedure were inverted relative to those of the logistic regression. For instance: Exp(B) for women (of the independent variable sex/gender) was just above 2.0 in the logistic regression while being 0.49 in Genlin. The same happened with every independent variable.

+ +
    +
  • Any suggestions as to why the results of the Genlin procedure are inverted?
  • +
  • Is there any way to get the Genlin results in accordance with the logistic regression?
  • +
+",2013-10-14 23:12:42.747 +57475,503.0,2,,57467.0,,,,CC BY-SA 3.0,"

Look into equivalence testing. See this search for lots of threads. Also see Esteban & Nowacki

+",2013-10-14 23:15:43.797 +57476,503.0,2,,57473.0,,,,CC BY-SA 3.0,"

You don't square the sum of the variances, you take the square root of the sum of the variances. You do this for the same reason that the standard deviation is the square root of the variance: It makes the units the same as the original ones, rather than squared units.

+ +

Although we often lose sight of it while doing statistics, the square of a measure involves squaring the units as well as the number. For example, the square of 2 meters is not 4 meters, it is 4 meters squared, more commonly called 4 square meters. The same thing happens with other units that we aren't used to thinking of in this way: e.g. if you are measuring IQ, the square of an IQ is not an IQ of 10,000; it is a squared IQ of 10,000.

+ +

You divide by the sample size as a scaling technique. Variances (tend to) go up with sample size; you divide by $n$ to deal with that.

+",2013-10-14 23:21:40.057 +57477,594.0,2,,57473.0,,,,CC BY-SA 3.0,"

You seem to be thinking that $\sqrt{\text{Var}(\bar X-\bar Y)} = \sqrt{\text{Var}(\bar X)} + \sqrt{\text{Var}(\bar Y)}$.

+ +

This is not the case for independent variables.

+ +

For $X,Y$ independent, $\text{Var}(\bar X-\bar Y) = \text{Var}(\bar X) + \text{Var}(\bar Y)$

+ +

Further,

+ +

$\text{Var}(\bar X) = \text{Var}(\frac{1}{n}\sum_iX_i) = \frac{1}{n^2}\text{Var}(\sum_iX_i)= \frac{1}{n^2}\sum_i\text{Var}(X_i)= \frac{1}{n^2}\cdot n\cdot\sigma^2_1= \sigma^2_1/n$

+ +

(if the $X_i$ are independent of each other).

+ +

http://en.wikipedia.org/wiki/Variance#Basic_properties

+ +

In summary: the correct term:

+ +

+ +

$\color{red}{(1)}$ has $\sigma^2/n$ terms because we're looking at averages and that's the variance of an average of independent random variables;

+ +

$\color{red}{(2)}$ has a $+$ because the two samples are independent, so their variances (of the averages) add; and

+ +

$\color{red}{(3)}$ has a square root because we want the standard deviation of the distribution of the difference in sample means (the standard error of the difference in means). The part under the bar of the square root is the variance of the difference (the square of the standard error). Taking square roots of squared standard errors gives us standard errors.

+ +

The reason why we don't just add standard errors is standard errors don't add - the standard error of the difference in means is NOT the sum of the standard errors of the sample means for independent samples - the sum will always be too large. The variances do add, though, so we can use that to work out the standard errors.

+ +
+ +

Here's some intuition about why it's variances that add, rather than standard deviations.

+ +

To make things a little simpler, just consider adding random variables.

+ +

If $Z = X+Y$, why is $\sigma_Z < \sigma_X+\sigma_Y$?

+ +

Imagine $Y = kX$ (for $k\neq 0$); that is, $X$ and $Y$ are perfectly linearly dependent. That is, they always 'move together' in the same direction and in proportion.

+ +

Then $Z = (k+1)X$ - which is simply a rescaling. Clearly $\sigma_Z = (k+1)\sigma_X = \sigma_X+\sigma_Y$.

+ +

That is, when $X$ and $Y$ are perfectly positively linearly dependent, always moving up or down together, standard deviations add.

+ +

When they don't always move up or down together, sometimes they move opposite directions. That means that their movements partly 'cancel out', yielding a smaller standard deviation than the direct sum.

+",2013-10-14 23:40:17.447 +57478,7860.0,1,,,,Adding errors to Gaussian kernel density estimator,,CC BY-SA 3.0,"

I'm using the scipy.stats.gaussian_kde function to generate a KDE from a set of $N$ points in a 2D space: $A = \{(x_1,y_1), (x_2,y_2), (x_3,y_3), ..., (x_N,y_N)\}$

+ +

Each one of these points has a given error attached to it. So for example, the point $(x_1,y_1)$ has errors $(e_{x_1},e_{y_1})$ and so on. I can assume the errors are normally distributed along both axes.

+ +

The python function that I use to generate the KDE has no way to integrate these errors into the calculations and I wonder how I would even do such a thing if I did it manually.

+ +

Ie: what is the statistically correct way to generate a KDE accounting for errors in the data used?

+",2013-10-15 00:04:46.003 +57480,2121.0,2,,56955.0,,,,CC BY-SA 3.0,"

I think if each dataset is already weighted to your satisfaction, then you have a couple of different options. Which one is the right one may vary based on your objectives and the particulars of your existing data collection and weighting.

+ +
    +
  • (#1) Union all of the datasets, along with their pre-calculated weights, and that's it.
  • +
+ +
+

This would be the right choice if each dataset was weighted towards a proper total count and didn't over-state the importance of any individual record relative to another dataset. If one dataset was weighted to reflect Total US Population, and another dataset was weighted in place to its own total count of respondents, then this would not be the right choice.

+
+ +
    +
  • (#2) Calculate a weight for each dataset to multiply by each record's existing weight
  • +
+ +
+

This would be the right choice if each of your datasets are of equal importance regardless of their size. Example below...

+
+ +
    +
  • (#3) Union all of the raw data and re-calculate the weights on the new, entire dataset
  • +
+ +
+

This would be the right choice if the reasons for non-response are similar across your different surveys - it results in the simplest data for you to work with, and it's the least likely to produce extreme weights.

+
+ +

Example for #2: each dataset is weighted to equal importance, with this ""dataset weight"" being multiplied by whatever weight has already been calculated within the dataset.

+ +
> Survey 1: 100 people   weight:  2
+> Survey 2: 200 people   weight:  1
+> Survey 3: 300 people   weight:  2/3
+> Survey 4: 150 people   weight:  4/3
+> Survey 5: 250 people   weight:  4/5
+
+",2013-10-15 01:07:38.337 +57481,14548.0,1,,,,Combining prediction intervals in regression,,CC BY-SA 4.0,"

Having performed a linear regression, I can find the confidence interval for the response conditioned on a particular x value. However, I am interested in a C.I for the mean response for a set of N new observations. That is, I need to combine the N prediction intervals.

+ +

The closest post I could find was Calculating the mean using regression data, but it only handles the univariate case.

+ +

I tried deriving the standard error of the mean response below, but I'm not sure if this correct.

+ +

$\begin{align} var(\hat{\bar{y}}) &= var \left( \frac{1}{n} \sum_i \hat{y}_i|x_1 \ldots x_n \right) \\ &= var \left( \frac{1}{n} \sum_i \hat{y}_i|x_i \right), \quad \text{where the } \hat{y_i}|x_i \text{ are independent} \\ &= \frac{1}{n^2} \sum_i var(\hat{y}_i|x_i) \end{align}$

+ +

where $var(\hat{y}_i|x_i) = \sqrt{\sigma^2 x_i^T (X^TX)^{-1}x_i}$ for $x_i$ in the training data and $var(\hat{y}_i|x^*_i) = \sqrt{\sigma^2 (1+ x_i^{*T} (X^TX)^{-1}x^*_i)}$ for $x^*_i$ in the test data.

+ +

Am I on the right track here? Also, is there an R implementation somewhere, or should I do it from scratch?

+ +

Edit: I am also reading up on Bayesian regression methods which specify the predictive distribution $P(y_i|x_i^*)$, and a credible interval for the response. We face a similar problem here, namely, how to compute the predictive distribution for the mean response $P(\overline{y}|x_1^* \ldots x_n^*)$?

+",2013-10-15 01:08:53.047 +57482,22695.0,1,,,,Finding the full conditonal distribution when there are multiple distributions involved,,CC BY-SA 3.0,"

6 neighboring countries have the following disease instances: $y = (y_1, y_2,...,y_n)$ with a population of $x = (x_1, x_2,...,x_n)$.

+ +

The following model and prior distributions are considered:

+ +

$y_i|\theta_i,p_i \sim \text{Poisson}(\theta_i x_i)$

+ +

$\theta_i | \alpha, \beta \sim \text{gamma}(\alpha, \beta)$

+ +

$\alpha \sim \text{gamma}(1,1)$

+ +

$\beta \sim \text{gamma}(10,1)$

+ +

a) Find the full conditional rate $p(\theta_i | \theta_{-i}, \alpha, \beta, x, y)$

+ +

b) Find the posterior distribution.

+ +

Attempt:

+ +

a) For finding the conditional rate with two variables, I would use Bayes' theory. I am not sure if this applies with multiple distributions.

+ +

$$p(\theta_i | \theta_{-i}, \alpha, \beta, x, y) = \frac{P(\theta_i \bigcap \theta_{-i} \bigcap \alpha \bigcap \beta \bigcap x \bigcap y)}{P( \theta_{-i}, \alpha, \beta, x, y)}$$

+ +

$$ = \frac{P(\theta_{-i}, \alpha, \beta, x, y | \theta_i)P(\theta_i)}{\sum_{i=1}^6 P(\theta_{-i}, \alpha, \beta, x, y | \theta_i)P(\theta_i)}$$

+ +

b) The posterior probability is the (prior)x(likelihood). So this would be $$\text{Poisson}(\theta_i x_i) \times L(\theta_i x_i)$$

+ +

I'm not sure how to do the pdf of a Poisson variable as it is variable. The likelihood function is $L(\theta_i y_i) = \frac{\theta_i^{\sum_{i=1}^n y_i} e^{-n \theta_i}}{y_1!,y_2!,..,y_n!}$
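As a sketch for part (a) (treating gamma$(\alpha,\beta)$ as a shape-rate parameterization, which is an assumption about what is intended): keep only the factors of the joint density that involve $\theta_i$,

$$p(\theta_i \mid \theta_{-i}, \alpha, \beta, x, y) \propto \underbrace{\theta_i^{y_i} e^{-\theta_i x_i}}_{\text{Poisson}(y_i \mid \theta_i x_i)} \times \underbrace{\theta_i^{\alpha-1} e^{-\beta\theta_i}}_{\text{gamma}(\theta_i \mid \alpha,\beta)} \propto \theta_i^{\alpha + y_i - 1} e^{-(\beta + x_i)\theta_i},$$

which is the kernel of a gamma$(\alpha + y_i,\ \beta + x_i)$ distribution.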

+",2013-10-15 01:09:29.977 +57483,22659.0,1,,,,Scikit-learn's Gaussian Processes: How to include multiple hyperparameters in kernel/cov function?,,CC BY-SA 3.0,"

I'm using the scikit-learn's implementation of Gaussian processes. A simple thing to do is to combine multiple kernels as a linear combination to describe your time series properly. So I'd like to include both the squared exponential kernel and the periodic kernel. Linear combinations of valid kernels produce valid kernels, and same goes for multiplying valid kernels (given by Rasmussen and Williams).

+ +

Unfortunately I haven't figured out how to give the theta parameters properly to the model. For example, if we have:

+ +

$$k_{Gauss}(x,x') = \exp\left(-\theta (x-x')^2\right)$$

+ +

then it is alright (this is how the squared-exponential kernel is defined in scikit-learn). But if I wanted:

+ +

$$k_{Gauss}(x,x') = \theta_0 \exp\left(-\theta_1 (x-x')^2\right)$$

+ +

then it is impossible, it seems. The $\mathbf{\theta}$ thing is supposed to be an array, in case you have multiple dimensions/features (even though scikit-learn doesn't support multidimensional GPs, someone developed it, and it will be merged soon). So there is one row with the columns being the parameter in such-and-such dimension. But you cannot have more rows, otherwise it screams at you.

+ +

So question: has anyone actually been able to use kernels that use more than one hyperparameter? If so, what am I doing wrong? And if it is indeed not possible with the current code in scikit, does anyone have some tips on how to extend it so that it can? This is a really important feature that I need. Thanks.
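Independent of scikit-learn's API, here is a small sketch of the kind of combined kernel being described (an amplitude-scaled squared-exponential plus a periodic term); all hyperparameter values are arbitrary, and the eigenvalue check at the end is just a numerical sanity check of the "sums and products of valid kernels are valid" statement:

+import numpy as np
+
+def sq_exp(x1, x2, amp, ell):
+    return amp * np.exp(-ell * (x1 - x2) ** 2)
+
+def periodic(x1, x2, amp, ell, period):
+    return amp * np.exp(-ell * np.sin(np.pi * (x1 - x2) / period) ** 2)
+
+x = np.linspace(0, 10, 50)
+K = (sq_exp(x[:, None], x[None, :], amp=1.0, ell=0.5)
+     + periodic(x[:, None], x[None, :], amp=0.3, ell=2.0, period=1.0))
+
+print(np.linalg.eigvalsh(K).min() > -1e-8)   # True: the combined Gram matrix is (numerically) PSD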

+",2013-10-15 01:19:13.100 +57484,22698.0,1,,,,Bound for the correlation of three random variables,,CC BY-SA 3.0,"

There are three random variables, $x,y,z$. The three correlations between the three variables are the same. That is,

+ +

$$\rho=\textrm{cor}(x,y)=\textrm{cor}(x,z)=\textrm{cor}(y,z)$$

+ +

What is the tightest bound you can give for $\rho$?

+",2013-10-15 01:55:03.623 +57503,5637.0,2,,57497.0,,,,CC BY-SA 3.0,"

The names of the parameters are suggestive. Location and scale parameters are associated with central tendency and dispersion, respectively. For example, if you change a location parameter, it mostly changes only the central tendency measures.

+ +

Try this online tool. Distributions

+ +

See how the distribution changes for different values of parameters. You could try this with generalized extreme value distribution.

+ +

Not all standard distributions have all three parameters. Some distributions have only one or two of them (e.g., the gamma distribution has shape and scale parameters).
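If it helps, scipy.stats exposes exactly these kinds of parameters, so you can see their effect numerically (the values below are arbitrary):

+from scipy import stats
+
+base    = stats.gamma(a=2.0)                       # a is the shape parameter
+shifted = stats.gamma(a=2.0, loc=5.0)              # location: shifts the whole distribution
+spread  = stats.gamma(a=2.0, scale=3.0)            # scale: stretches it
+
+print(base.mean(), base.std())       # 2.0, ~1.414
+print(shifted.mean(), shifted.std()) # 7.0, ~1.414  (only the centre moved)
+print(spread.mean(), spread.std())   # 6.0, ~4.243  (the spread changed too)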

+",2013-10-15 07:33:28.403 +57485,7155.0,2,,57483.0,,,,CC BY-SA 3.0,"

On scikit-learn==0.14.1.

+ +

$\theta_0$ can be a vector. The following code works for me.

+ +
import numpy as np
+from sklearn.gaussian_process import GaussianProcess
+from sklearn.datasets import make_regression
+X, y = make_regression()
+bad_theta = np.abs(np.random.normal(0,1,100))
+model = GaussianProcess(theta0=bad_theta)
+model.fit(X,y)
+
+ +

You can pass any kernel you want as the parameter corr. The following is the radial basis function that sklearn uses for Gaussian processes.

+ +
def squared_exponential(theta, d):
+    """"""
+    Squared exponential correlation model (Radial Basis Function).
+    (Infinitely differentiable stochastic process, very smooth)::
+
+                                            n
+        theta, dx --> r(theta, dx) = exp(  sum  - theta_i * (dx_i)^2 )
+                                        i = 1
+
+    Parameters
+    ----------
+    theta : array_like
+        An array with shape 1 (isotropic) or n (anisotropic) giving the
+        autocorrelation parameter(s).
+
+    dx : array_like
+        An array with shape (n_eval, n_features) giving the componentwise
+        distances between locations x and x' at which the correlation model
+        should be evaluated.
+
+    Returns
+    -------
+    r : array_like
+        An array with shape (n_eval, ) containing the values of the
+        autocorrelation model.
+    """"""
+
+    theta = np.asarray(theta, dtype=np.float)
+    d = np.asarray(d, dtype=np.float)
+
+    if d.ndim > 1:
+        n_features = d.shape[1]
+    else:
+        n_features = 1
+
+    if theta.size == 1:
+        return np.exp(-theta[0] * np.sum(d ** 2, axis=1))
+    elif theta.size != n_features:
+        raise ValueError(""Length of theta must be 1 or %s"" % n_features)
+    else:
+        return np.exp(-np.sum(theta.reshape(1, n_features) * d ** 2, axis=1))
+
+ +

It looks like you're doing something pretty interesting, btw.

+",2013-10-15 01:56:23.430 +57486,22677.0,1,,,,How does one determine what ARL0 should be used on CPM package to test for Structural Change,,CC BY-SA 3.0,"

I'm trying to find multiple break points by using processStream from the cpm package in R. Can someone enlighten me on what ARL0 is, and how one determines which ARL0 should be used?

+ +
processStream(ret.fin.chn,""Kolmogorov-Smirnov"",ARL0=500,lambda=NA)
+$changePoints
+    [1]   59   75  250  286  443  448  663 1037 1042 1261 1576 1842 1853 2013 2035 2621 2633
+    $detectionTimes
+[1]   73   89  285  334  447  503  670 1040 1145 1428 1639 1951 1874 2030 2078 2632 2644
+
+ +

while

+ +
processStream(ret.fin.chn,""Kolmogorov-Smirnov"",ARL0=2000,lambda=NA)
+$changePoints
+    [1]   59   75  663 1037 1261 1559 1842 2013 2035 2621 2633
+    $detectionTimes
+[1]   75   90  691 1041 1480 1688 2026 2032 2266 2633 2646
+
+ +

and

+ +
processStream(ret.fin.chn,""Kolmogorov-Smirnov"",ARL0=3000,lambda=NA)
+$changePoints
+    [1]   59   75  663 1037 1261 1559 1842 2013 2149
+    $detectionTimes
+[1]   75   92  692 1041 1490 1690 2026 2032 2284
+
+ +

It seems that larger values of ARL0 give fewer break point detections; is that a good thing?

+ +

Note: the time series ret.fin.chn contains 2749 rows.

+ +

Below are excerpts from R help:

+ +
+

ARL0

+ +

Determines the ARL_0 which the CPM should have, which corresponds to the average number of observations before a false positive occurs, assuming that the sequence does not undergo a chang. Because the thresholds of the CPM are computationally expensive to estimate, the package contains pre-computed values of the thresholds corresponding to several common values of the ARL_0. This means that only certain values for the ARL_0 are allowed. Specifically, the ARL_0 must have one of the following values: 370, 500, 600, 700, ..., 1000, 2000, 3000, ..., 10000, 20000, ..., 50000

+
+",2013-10-15 02:10:01.050 +57487,13037.0,1,62181.0,,,Weighted Least Squares Estimate,,CC BY-SA 3.0,"

Here is a problem from a practice test. Suppose that $$X_i = \mu + \epsilon_i,\quad i=1,\ldots,n\quad \epsilon_i\sim N(0,\sigma^2_1)$$ $$Y_i = \mu + \delta_i,\quad i=1,\ldots,m\quad \delta_i\sim N(0,\sigma^2_2)$$ All $\epsilon_i$'s and $\delta_i$'s are independent. The parameters $\mu$, $\sigma_1^2$, and $\sigma_2^2$ are unknown. Let $\theta=m/n$, $\rho=\sigma_2^2/\sigma_1^2$. Suppose $\rho$ is known. Show that the least squares (weighted) estimator of $\mu$ is $$ \hat{\mu} = \dfrac{\rho\bar{X} + \theta\bar{Y}}{\rho+\theta}$$

+ +

MY ATTEMPT:

+ +

I can't figure out how to use the fact that $\rho$ is known. I tried $$\hat{\mu} = \text{argmin}\left\{\sum_{i=1}^n (X_i-\mu)^2 + \sum_{i=1}^m (Y_i-\mu)^2\right\}$$ and arrived at the weighted average $$ \hat{\mu} = \dfrac{n\bar{X} + m\bar{Y}}{n+m}.$$ But again this does not use the fact that we know what the ratio $\sigma_2^2/\sigma_1^2$ is. Any ideas?
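Hint (my own sketch, so double-check it): the "weighted" part means weighting each squared residual by the inverse of its error variance, which is where $\rho$ enters:

$$\hat{\mu} = \text{argmin}\left\{\frac{1}{\sigma_1^2}\sum_{i=1}^n (X_i-\mu)^2 + \frac{1}{\sigma_2^2}\sum_{i=1}^m (Y_i-\mu)^2\right\} = \frac{n\bar{X}/\sigma_1^2 + m\bar{Y}/\sigma_2^2}{n/\sigma_1^2 + m/\sigma_2^2} = \frac{\rho\bar{X} + \theta\bar{Y}}{\rho+\theta},$$

where the last step divides numerator and denominator by $n/\sigma_2^2$, so only the known ratio $\rho = \sigma_2^2/\sigma_1^2$ (together with $\theta = m/n$) is needed.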

+",2013-10-15 02:16:00.033 +57488,17730.0,1,57495.0,,,How to express joint conditional probability with multiple conditions,,CC BY-SA 3.0,"

I want to express the joint probability of $\Phi_A$ and $\Phi_B$, $p(\Phi_A, \Phi_B)$, conditioned on $\Phi_A$ and $\Phi_B$ both being greater than some value C. How would I express this mathematically? I guess my intuition says:

+ +

$p(\Phi_A, \Phi_B | \bf{\Phi} >C)$

+ +

Is this correct? Is there a better way to express this?

+",2013-10-15 02:45:34.410 +57489,4656.0,2,,57484.0,,,,CC BY-SA 3.0,"

The common correlation $\rho$ can have value $+1$ but not $-1$. If $\rho_{X,Y}= \rho_{X,Z}=-1$, then $\rho_{Y,Z}$ cannot equal $-1$ but is in fact $+1$. The smallest value of the common correlation of three random variables is $-\frac{1}{2}$. More generally, the minimum common correlation of $n$ random variables is $-\frac{1}{n-1}$ when, regarded as vectors, they are at the vertices of a simplex (of dimension $n-1$) in $n$-dimensional space.

+ +

Consider the variance of the sum of $n$ unit variance random variables $X_i$. We have that $$\begin{align*} \operatorname{var}\left(\sum_{i=1}^n X_i\right) &= \sum_{i=1}^n \operatorname{var}(X_i) + \sum_{i=1}^n\sum_{j\neq i}^n \operatorname{cov}(X_i,X_j)\\ &= n + \sum_{i=1}^n\sum_{j\neq i}^n \rho_{X_i,X_j}\\ &= n + n(n-1)\bar{\rho} \tag{1} \end{align*}$$ where $\bar{\rho}$ is the average value of the $\binom{n}{2}$ correlation coefficients. But since $\operatorname{var}\left(\sum_i X_i\right) \geq 0$, we readily get from $(1)$ that $$\bar{\rho} \geq -\frac{1}{n-1}.$$

+ +

So, the average value of a correlation coefficient is at least $-\frac{1}{n-1}$. If all the correlation coefficients have the same value $\rho$, then their average also equals $\rho$ and so we have that $$\rho \geq -\frac{1}{n-1}.$$ Is it possible to have random variables for which the common correlation value $\rho$ equals $-\frac{1}{n-1}$? Yes. Suppose that the $X_i$ are uncorrelated unit-variance random variables and set $Y_i = X_i - \frac{1}{n}\sum_{j=1}^n X_j = X_i -\bar{X}$. Then, $E[Y_i]=0$, while $$\operatorname{var}(Y_i) = \left(\frac{n-1}{n}\right)^2 + (n-1)\left(\frac{1}{n}\right)^2 = \frac{n-1}{n}$$ and $$\operatorname{cov}(Y_i,Y_j) = -2\left(\frac{n-1}{n}\right)\left(\frac{1}{n}\right) + (n-2)\left(\frac{1}{n}\right)^2 = -\frac{1}{n}$$ giving $$\rho_{Y_i,Y_j} = \frac{\operatorname{cov}(Y_i,Y_j)}{\sqrt{\operatorname{var}(Y_i)\operatorname{var}(Y_j)}} = \frac{-1/n}{(n-1)/n} = -\frac{1}{n-1}.$$ Thus the $Y_i$ are random variables achieving the minimum common correlation value of $-\frac{1}{n-1}$. Note, incidentally, that $\sum_i Y_i = 0$, and so, regarded as vectors, the random variables lie in a $(n-1)$-dimensional hyperplane of $n$-dimensional space.
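A quick numerical check of this construction (my addition, not part of the original answer):

+import numpy as np
+
+rng = np.random.default_rng(0)
+n = 3
+X = rng.normal(size=(1000000, n))          # uncorrelated unit-variance variables
+Y = X - X.mean(axis=1, keepdims=True)      # Y_i = X_i - Xbar
+
+print(np.corrcoef(Y.T))                    # off-diagonal entries are all close to -1/(n-1) = -0.5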

+",2013-10-15 02:58:22.840 +57490,22677.0,2,,15281.0,,,,CC BY-SA 3.0,"

@Dail if you're more inclined to the applied rather than the theoretical behind detection of structural break, you might want try http://cran.r-project.org/web/packages/cpm/index.html this is the link for CPM package of R, where you can use processStream to find multiple break point in your time series.

+",2013-10-15 03:22:22.863 +57491,668.0,2,,57484.0,,,,CC BY-SA 3.0,"

The tightest possible bound is $-1/2 \le \rho \le 1$. All such values can actually appear--none are impossible.

+ +

To show there is nothing especially deep or mysterious about the result, this answer first presents a completely elementary solution, requiring only the obvious fact that variances--being the expected values of squares--must be non-negative. This is followed by a general solution (which uses slightly more sophisticated algebraic facts).

+ +

Elementary solution

+ +

The variance of any linear combination of $x,y,z$ must be non-negative. Let the variances of these variables be $\sigma^2, \tau^2,$ and $\upsilon^2$, respectively. All are nonzero (for otherwise some of the correlations would not be defined). Using the basic properties of variances we may compute

+ +

$$0 \le \text{Var}(\alpha x/\sigma + \beta y/\tau + \gamma z/\upsilon) = \alpha^2 +\beta^2+\gamma^2 + 2\rho(\alpha\beta+\beta\gamma+\gamma\alpha)$$

+ +

for all real numbers $(\alpha, \beta, \gamma)$.

+ +

Assuming $\alpha+\beta+\gamma\ne 0$, a little algebraic manipulation implies this is equivalent to

+ +

$$\frac{-\rho}{1-\rho} \le \frac{1}{3} \left(\frac{\sqrt{(\alpha^2+\beta^2+\gamma^2)/3}}{(\alpha+\beta+\gamma)/3}\right)^2.$$

+ +

The squared term on the right hand side is the ratio of two power means of $(\alpha, \beta, \gamma)$. The elementary power-mean inequality (with weights $(1/3, 1/3, 1/3)$) asserts that ratio cannot exceed $1$ (and will equal $1$ when $\alpha=\beta=\gamma\ne 0$). A little more algebra then implies

+ +

$$\rho \ge -1/2.$$

+ +

The explicit example of $n=3$ below (involving trivariate Normal variables $(x,y,z)$) shows that all such values, $-1/2 \le \rho \le 1$, actually do arise as correlations. This example uses only the definition of multivariate Normals, but otherwise invokes no results of Calculus or Linear Algebra.

+ +

General solution

+ +

Overview

+ +

Any correlation matrix is the covariance matrix of the standardized random variables, whence--like all correlation matrices--it must be positive semi-definite. Equivalently, its eigenvalues are non-negative. This imposes a simple condition on $\rho$: it must not be any less than $-1/2$ (and of course cannot exceed $1$). Conversely, any such $\rho$ actually corresponds to the correlation matrix of some trivariate distribution, proving these bounds are the tightest possible.

+ +
+ +

Derivation of the conditions on $\rho$

+ +

Consider the $n$ by $n$ correlation matrix with all off-diagonal values equal to $\rho.$ (The question concerns the case $n=3,$ but this generalization is no more difficult to analyze.) Let's call it $\mathbb{C}(\rho, n).$ By definition, $\lambda$ is an eigenvalue of $\mathbb{C}(\rho, n)$ provided there exists a nonzero vector $\mathbf{x}_\lambda$ such that

+ +

$$\mathbb{C}(\rho,n) \mathbf{x}_\lambda = \lambda \mathbf{x}_\lambda.$$

+ +

These eigenvalues are easy to find in the present case, because

+ +
    +
  1. Letting $\mathbf{1} = (1, 1, \ldots, 1)'$, compute that

    + +

    $$\mathbb{C}(\rho,n)\mathbf{1} = (1+(n-1)\rho)\mathbf{1}.$$

  2. +
  3. Letting $\mathbf{y}_j = (-1, 0, \ldots, 0, 1, 0, \ldots, 0)$ with a $1$ only in the $j^\text{th}$ place (for $j = 2, 3, \ldots, n$), compute that

    + +

    $$\mathbb{C}(\rho,n)\mathbf{y}_j = (1-\rho)\mathbf{y}_j.$$

  4. +
+ +

Because the $n$ eigenvectors found so far span the full $n$ dimensional space (proof: an easy row reduction shows the absolute value of their determinant equals $n$, which is nonzero), they constitute a basis of all the eigenvectors. We have therefore found all the eigenvalues and determined they are either $1+(n-1)\rho$ or $1-\rho$ (the latter with multiplicity $n-1$). In addition to the well-known inequality $-1 \le \rho \le 1$ satisfied by all correlations, non-negativity of the first eigenvalue further implies

+ +

$$\rho \ge -\frac{1}{n-1}$$

+ +

while the non-negativity of the second eigenvalue imposes no new conditions.

+ +
+ +

Proof of sufficiency of the conditions

+ +

The implications work in both directions: provided $-1/(n-1)\le \rho \le 1,$ the matrix $\mathbb{C}(\rho, n)$ is nonnegative-definite and therefore is a valid correlation matrix. It is, for instance, the correlation matrix for a multinormal distribution. Specifically, write

+ +

$$\Sigma(\rho, n) = \frac{1}{(1-\rho)\left(1+(n-1)\rho\right)}\Bigl((1 + (n-1)\rho)\,\mathbb{I}_n - \rho\,\mathbf{1}\mathbf{1}'\Bigr)$$

+ +

for the inverse of $\mathbb{C}(\rho, n)$ when $-1/(n-1) \lt \rho \lt 1.$ For example, when $n=3$

+ +

$$\color{gray}{\Sigma(\rho, 3) = \frac{1}{(1-\rho)(1+2\rho)} \left( \begin{array}{ccc} \rho+1 & -\rho & -\rho \\ -\rho & \rho+1 & -\rho \\ -\rho & -\rho & \rho+1 \\ \end{array} \right)}.$$

+ +

Let the vector of random variables $(X_1, X_2, \ldots, X_n)$ have distribution function

+ +

$$f_{\rho, n}(\mathbf{x}) = \frac{\exp\left(-\frac{1}{2}\mathbf{x}\Sigma(\rho, n)\mathbf{x}'\right)}{(2\pi)^{n/2}\left((1-\rho)^{n-1}(1+(n-1)\rho)\right)^{1/2}}$$

+ +

where $\mathbf{x} = (x_1, x_2, \ldots, x_n)$. For example, when $n=3$ this equals

+ +

$$\color{gray}{\frac{1}{\sqrt{(2\pi)^{3}(1-\rho)^2(1+2\rho)}} \exp\left(-\frac{(1+\rho)(x^2+y^2+z^2) - 2\rho(xy+yz+zx)}{2(1-\rho)(1+2\rho)}\right)}.$$

+ +

The correlation matrix for these $n$ random variables is $\mathbb{C}(\rho, n).$

+ +

+ +

Contours of the density functions $f_{\rho,3}.$ From left to right, $\rho=-4/10, 0, 4/10, 8/10$. Note how the density shifts from being concentrated near the plane $x+y+z=0$ to being concentrated near the line $x=y=z$.

+ +

The special cases $\rho = -1/(n-1)$ and $\rho = 1$ can also be realized by degenerate distributions; I won't go into the details except to point out that in the former case the distribution can be considered supported on the hyperplane $\mathbf{x}.\mathbf{1}=0$, where it is a sum of identically distributed mean-$0$ Normal distribution, while in the latter case (perfect positive correlation) it is supported on the line generated by $\mathbf{1}'$, where it has a mean-$0$ Normal distribution.

+ +
+ +

More about non-degeneracy

+ +

A review of this analysis makes it clear that the correlation matrix $\mathbb{C}(-1/(n-1), n)$ has a rank of $n-1$ and $\mathbb{C}(1, n)$ has a rank of $1$ (because only one eigenvector has a nonzero eigenvalue). For $n\ge 2$, this makes the correlation matrix degenerate in either case. Otherwise, the existence of its inverse $\Sigma(\rho, n)$ proves it is nondegenerate.

+",2013-10-15 04:52:22.687 +57492,20130.0,1,,,,Good econometrics handbooks?,,CC BY-SA 3.0,"

There's a question for textbooks here, but I would like to ask a similar question about handbooks: what econometrics handbooks would you recommend?

+ +

Assumed audience is researchers and graduate-level students. It needs to include the material of Greene's Econometric Analysis and Wooldridge's Econometric Analysis of Cross Section and Panel Data in a denser form (without discussions and proofs), as well as mainstream techniques not mentioned in these two books.

+ +

One option is Elsevier's six-volume Handbook of Econometrics series edited by Griliches and Intriligator. However, perhaps you would recommend other handbooks, more concise or otherwise?

+ +

I suggest two possible formats: one is a reference-card format with minimum explanations, and the other one is a more extended format with proofs and more detailed exposition.

+",2013-10-15 04:59:24.173 +57493,3183.0,2,,57472.0,,,,CC BY-SA 3.0,"

It sounds like your question has two parts: the underlying idea and a concrete example. I'll start with the underlying idea, then link to an example at the bottom.

+
+

EM is useful in Catch-22 situations where it seems like you need to know $A$ before you can calculate $B$ and you need to know $B$ before you can calculate $A$.

+

The most common case people deal with is probably mixture distributions. For our example, let's look at a simple Gaussian mixture model:

+
+

You have two different univariate Gaussian distributions with different means and unit variance.

+

You have a bunch of data points, but you're not sure which points came from which distribution, and you're also not sure about the means of the two distributions.

+
+

And now you're stuck:

+
    +
  • If you knew the true means, you could figure out which data points came from which Gaussian. For example, if a data point had a very high value, it probably came from the distribution with the higher mean. But you don't know what the means are, so this won't work.

    +
  • +
  • If you knew which distribution each point came from, then you could estimate the two distributions' means using the sample means of the relevant points. But you don't actually know which points to assign to which distribution, so this won't work either.

    +
  • +
+

So neither approach seems like it works: you'd need to know the answer before you can find the answer, and you're stuck.

+

What EM lets you do is alternate between these two tractable steps instead of tackling the whole process at once.

+

You'll need to start with a guess about the two means (although your guess doesn't necessarily have to be very accurate, you do need to start somewhere).

+

If your guess about the means was accurate, then you'd have enough information to carry out the step in my first bullet point above, and you could (probabilistically) assign each data point to one of the two Gaussians. Even though we know our guess is wrong, let's try this anyway. And then, given each point's assigned distributions, you could get new estimates for the means using the second bullet point. It turns out that, each time you loop through these two steps, you're improving a lower bound on the model's likelihood.

+

That's already pretty cool: even though the two suggestions in the bullet points above didn't seem like they'd work individually, you can still use them together to improve the model. The real magic of EM is that, after enough iterations, the lower bound will be so high that there won't be any space between it and the local maximum. As a result, you've locally optimized the likelihood.

+

So you haven't just improved the model, you've found the best possible model one can find with incremental updates.

+
+

This page from Wikipedia shows a slightly more complicated example (two-dimensional Gaussians and unknown covariance), but the basic idea is the same. It also includes well-commented R code for implementing the example.

+

In the code, the "Expectation" step (E-step) corresponds to my first bullet point: figuring out which Gaussian gets responsibility for each data point, given the current parameters for each Gaussian. The "Maximization" step (M-step) updates the means and covariances, given these assignments, as in my second bullet point.

+

As you can see in the animation, these updates quickly allow the algorithm to go from a set of terrible estimates to a set of very good ones: there really do seem to be two clouds of points centered on the two Gaussian distributions that EM finds.
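To make the two bullet-point steps concrete, here is a hedged, minimal sketch in Python (separate from the R code on the Wikipedia page mentioned above) of EM for two unit-variance univariate Gaussians; the data and starting guesses are made up:

+import numpy as np
+
+rng = np.random.default_rng(0)
+data = np.concatenate([rng.normal(-2, 1, 200), rng.normal(3, 1, 300)])
+
+mu = np.array([0.0, 1.0])          # rough initial guesses for the two means
+pi = np.array([0.5, 0.5])          # mixing weights
+
+for _ in range(20):
+    # E-step: responsibility of each Gaussian for each point (unit variance assumed)
+    dens = np.exp(-0.5 * (data[:, None] - mu[None, :]) ** 2) / np.sqrt(2 * np.pi)
+    resp = pi * dens
+    resp /= resp.sum(axis=1, keepdims=True)
+
+    # M-step: update means and mixing weights from the responsibilities
+    mu = (resp * data[:, None]).sum(axis=0) / resp.sum(axis=0)
+    pi = resp.mean(axis=0)
+
+print(mu, pi)   # the means end up near -2 and 3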

+",2013-10-15 05:25:55.930 +57494,155.0,2,,57473.0,,,,CC BY-SA 3.0,"

Algebraic intuition

+ +

The standard error of the mean for $n$ independent observations is $\frac{\sigma}{\sqrt{n}}$ where $\sigma$ is the standard deviation.

+ +

So if we have two independent samples we have the standard errors for the means of group 1 and group 2.

+ +

$$\sigma_{\mu_1}=\frac{\sigma_1}{\sqrt{n_1}}, \qquad \sigma_{\mu_2}=\frac{\sigma_2}{\sqrt{n_2}}$$

+ +

If we square these values we get the variance of the mean:

+ +

$$\sigma^2_{\mu_1}=\frac{\sigma^2_1}{n_1}, \qquad \sigma^2_{\mu_2}=\frac{\sigma^2_2}{n_2}$$

+ +

The variance of the sum or difference of two independent random variables +is the sum of the two variances. Thus,

+ +

$$\sigma^2_{\mu_1 - \mu_2} =\sigma^2_{\mu_1} + \sigma^2_{\mu_2} = \frac{\sigma^2_1}{n_1} + \frac{\sigma^2_2}{n_2} $$

+ +

So if we want the standard error of the difference we take the square root of the variance:

+ +

$$\sigma_{\mu_1 - \mu_2} =\sqrt{\sigma^2_{\mu_1} + \sigma^2_{\mu_2}} = \sqrt{\frac{\sigma^2_1}{n_1} + \frac{\sigma^2_2}{n_2}} $$

+ +

So I imagine this is intuitive if the component steps are intuitive. In particular it helps if you find intuitive the idea that the variance of the sum of independent variables is the sum of the variances of the component variables.

+ +

Fuzzy Intuition

+ +

In terms of more general intuition, if $n_1 = n_2$ and $\sigma=\sigma_1=\sigma_2$ then the standard error of the difference between means will be $\sqrt{2}\sigma_\mu\approx 1.4\times \sigma_\mu$. It makes sense that this value of approximately 1.4 is greater than 1 (i.e., the variance of a variable after adding a constant; i.e., equivalent to one sample t-test) and less than 2 (i.e., the standard deviation of the sum of two perfectly correlated variables (with equal variance) and the standard error implied by the formula you mention: $\frac{\sigma_1}{\sqrt{n_1}} + \frac{\sigma_2}{\sqrt{n_2}}$).
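A quick simulation (my own addition, not part of the answer) backs this up: the empirical standard deviation of the difference in sample means matches $\sqrt{\sigma_1^2/n_1 + \sigma_2^2/n_2}$ and is smaller than the sum of the two standard errors.

+import numpy as np
+
+rng = np.random.default_rng(1)
+n1, n2, s1, s2 = 30, 50, 2.0, 3.0
+diffs = [rng.normal(0, s1, n1).mean() - rng.normal(0, s2, n2).mean() for _ in range(100000)]
+
+print(np.std(diffs))                          # empirical SE of the difference
+print(np.sqrt(s1**2 / n1 + s2**2 / n2))       # formula from the answer: ~0.56
+print(s1 / np.sqrt(n1) + s2 / np.sqrt(n2))    # naive sum of SEs: ~0.79 (too large)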

+",2013-10-15 05:35:05.007 +57495,11440.0,2,,57488.0,,,,CC BY-SA 3.0,"

Well, it is your choice which notation to use, but you certainly can just use logical operators:

+ +

$p(\Phi_A, \Phi_B \; |\; \Phi_A>C \,\cap \Phi_B > C)$

+ +

Your current notation is not clear, as $\Phi$ is not defined and it is not obvious what it means.

+",2013-10-15 05:35:17.670 +57496,12314.0,1,,,,Forecasting time-series ahead by multiple time horizons,,CC BY-SA 3.0,"

Suppose that I have daily data on the population of a small village, given by $Y(t)$, as well as daily data on various factors that are relevant to the size of the population in the future, given by vector $X(t)$. These explanatory variables include untransformed variables as well as features engineered to be informative over long horizons (e.g. one of the variables captures the number of deaths over the last 30 days). I have collected this data for 8 years.

+ +

My objective is to forecast $Y(t)$ ahead by 1,2,3,...,365 days. I expect long-run forecasts to be different to short-run forecasts. If a holiday season is coming up I might expect a downwards spike in a few months time (people visiting the city), but if someone is on their deathbed then I will expect a downwards spike in a few days.

+ +

Since the population is sufficiently small that $\Delta Y(t+k)$ is typically in $\{-2,-1,0,1,2\}$ for the forecasting horizon under question, I will use a multiple categorical response variable classification model that will assign probabilities to the various class labels being observed.

+ +

My question centers on the specific considerations I need to make when constructing forecasts of the change from $Y(t)$ to $Y(t+k)$ where $k$ is large (e.g. 100 days).

+ +

Basically there will be the most hideous autocorrelation structure in $\Delta Y(t+k)$ over these time scales. If someone dies on day $2$, they are also dead on day $3, 4, ..., k$, meaning a string of $k$ or so $\Delta Y(t+k)$ will contain this same information.

+ +

These queries result:

+ +
    +
  • What are some ways of dealing with this immense autocorrelation structure in my response. Is it even a problem?
  • +
  • Are there alternative methodologies to the ones I've proposed for forecasting these horizons (aside from typical machine learning methods such as random forests which I'm already working with).
  • +
  • Any other handy advice.
  • +
+",2013-10-15 06:05:55.207 +57497,22703.0,1,,,,Parameters of a Statistical Distribution,,CC BY-SA 3.0,"

Any statistical distribution is described in terms of shape, scale and location parameters. But what do these parameters mean, geometrically, statistically and for a layman with minimum statistical knowledge?

+ +

I have explored wikipedia and still, this doubt continues to exist.

+",2013-10-15 06:36:38.530 +57498,22703.0,1,57509.0,,,Motivation for statistical distributions,,CC BY-SA 3.0,"

As statisticians, we come across many distributions under the banners ""discrete"" or ""continuous"", and ""univariate"" or ""multivariate"". But can anyone provide a good reason behind the existence and motivation for so many distributions? How do we get them? And what can a layman understand from it?

+ +

What is the logic behind the existence of distributions?

+",2013-10-15 06:42:30.540 +57499,22703.0,2,,57472.0,,,,CC BY-SA 3.0,"

Well, I would suggest you go through a book on R by Maria L. Rizzo. One of the chapters contains a use of the EM algorithm with a numerical example. I remember going through the code for better understanding.

+ +

Also, try to view it from a clustering point of view in the beginning. Work out by hand a clustering problem where 10 observations are taken from two different normal densities. This should help. Take help from R :)

+",2013-10-15 07:03:26.383 +57500,22703.0,1,,,,Regression methods,,CC BY-SA 3.0,"

What is the fundamental difference between:

+ +
    +
  1. Linear regression
  2. +
  3. Non linear regression
  4. +
  5. Parametric regression, and
  6. +
  7. Non-parametric regression?
  8. +
+ +

+ +

When should we use each type? How do we know what to choose? What kind of data are required? What are the assumptions unique to each?

+ +

At times, if you go through papers you get to see a combination of the names above.

+ +
+ +

Well, the ideas presented above have led me to the following conclusions:

+ +

1) Linear Regression : Regression methods associated with a linear model, linear with regard to the parameters of interest

+ +

2) Non-Linear Regression : Regression methods associated with a non-linear model, non linear with regard to the parameters of interest.

+ +

3) Parametric Regression: Regression methods associated with a linear model/non-linear model (accordingly called Linear Parametric / Non-linear Parametric), where the basic assumptions of regression, including those associated with the errors, have to hold true.

+ +

4) Non-Parametric Regression: Regression methods associated with a linear model/non-linear model (accordingly called Linear Non-Parametric / Non-linear Non-Parametric), where the basic assumptions of regression, including those associated with the errors, do not hold.

+ +

Am I right ? Is there an error or misleading idea here? Please respond.

+",2013-10-15 07:09:16.460 +57501,20470.0,2,,57472.0,,,,CC BY-SA 3.0,"

This is a recipe to learn EM with a practical and (in my opinion) very intuitive 'Coin-Toss' example:

+ + + +
    +
  1. Read this short EM tutorial paper by Do and Batzoglou. This is the schema where the coin toss example is explained:

    + +

  2. +
  3. You may have question marks in your head, especially regarding where the probabilities in the Expectation step come from. Please have a look at the explanations on this maths stack exchange page.

  4. +
  5. Look at/run this code that I wrote in Python that simulates the solution to the coin-toss problem in the EM tutorial paper of item 1:

    + +
    import numpy as np
    +import math
    +import matplotlib.pyplot as plt
    +
    +## E-M Coin Toss Example as given in the EM tutorial paper by Do and Batzoglou* ##
    +
    +def get_binomial_log_likelihood(obs,probs):
    +    """""" Return the (log)likelihood of obs, given the probs""""""
    +    # Binomial Distribution Log PDF
    +    # ln (pdf)      = Binomial Coeff * product of probabilities
    +    # ln[f(x|n, p)] =   comb(N,k)    * num_heads*ln(pH) + (N-num_heads) * ln(1-pH)
    +
    +    N = sum(obs);#number of trials  
    +    k = obs[0] # number of heads
    +    binomial_coeff = math.factorial(N) / (math.factorial(N-k) * math.factorial(k))
    +    prod_probs = obs[0]*math.log(probs[0]) + obs[1]*math.log(1-probs[0])
+    log_lik = math.log(binomial_coeff) + prod_probs  # log of the coefficient so the return value is a true log-likelihood
    +
    +    return log_lik
    +
    +# 1st:  Coin B, {HTTTHHTHTH}, 5H,5T
    +# 2nd:  Coin A, {HHHHTHHHHH}, 9H,1T
    +# 3rd:  Coin A, {HTHHHHHTHH}, 8H,2T
    +# 4th:  Coin B, {HTHTTTHHTT}, 4H,6T
    +# 5th:  Coin A, {THHHTHHHTH}, 7H,3T
    +# so, from MLE: pA(heads) = 0.80 and pB(heads)=0.45
    +
    +# represent the experiments
    +head_counts = np.array([5,9,8,4,7])
    +tail_counts = 10-head_counts
+experiments = list(zip(head_counts,tail_counts))  # wrap in list() so len() and indexing also work on Python 3
    +
    +# initialise the pA(heads) and pB(heads)
    +pA_heads = np.zeros(100); pA_heads[0] = 0.60
    +pB_heads = np.zeros(100); pB_heads[0] = 0.50
    +
    +# E-M begins!
    +delta = 0.001  
    +j = 0 # iteration counter
    +improvement = float('inf')
    +while (improvement>delta):
    +    expectation_A = np.zeros((len(experiments),2), dtype=float) 
    +    expectation_B = np.zeros((len(experiments),2), dtype=float)
    +    for i in range(0,len(experiments)):
    +        e = experiments[i] # i'th experiment
    +          # loglikelihood of e given coin A:
    +        ll_A = get_binomial_log_likelihood(e,np.array([pA_heads[j],1-pA_heads[j]])) 
    +          # loglikelihood of e given coin B
    +        ll_B = get_binomial_log_likelihood(e,np.array([pB_heads[j],1-pB_heads[j]])) 
    +
    +          # corresponding weight of A proportional to likelihood of A 
    +        weightA = math.exp(ll_A) / ( math.exp(ll_A) + math.exp(ll_B) ) 
    +
    +          # corresponding weight of B proportional to likelihood of B
    +        weightB = math.exp(ll_B) / ( math.exp(ll_A) + math.exp(ll_B) ) 
    +
    +        expectation_A[i] = np.dot(weightA, e) 
    +        expectation_B[i] = np.dot(weightB, e)
    +
    +    pA_heads[j+1] = sum(expectation_A)[0] / sum(sum(expectation_A)); 
    +    pB_heads[j+1] = sum(expectation_B)[0] / sum(sum(expectation_B)); 
    +
    +    improvement = ( max( abs(np.array([pA_heads[j+1],pB_heads[j+1]]) - 
    +                    np.array([pA_heads[j],pB_heads[j]]) )) )
    +    j = j+1
    +
    +plt.figure();
    +plt.plot(range(0,j),pA_heads[0:j], 'r--')
    +plt.plot(range(0,j),pB_heads[0:j])
    +plt.show()
    +
  6. +
+",2013-10-15 07:21:33.433 +57502,22704.0,2,,48658.0,,,,CC BY-SA 3.0,"

Take a look at a post on Healthy Algorithms: http://healthyalgorithms.com/2011/11/23/causal-modeling-in-python-bayesian-networks-in-pymc/

+ +

and also at PyMC's tutorial: http://pymc-devs.github.io/pymc/tutorial.html

+ +

Maybe you would try the following code clip (assuming you have imported pymc as mc):

+ +
A = mc.Normal('A', mu_A, tau_A)
+B = mc.Normal('B', mu_B, tau_B)
+p_C = mc.Lambda('p_C', lambda A=A, B=B: <<dependency spec goes here>>, doc='Pr[C|AB]')
+C = mc.Bernoulli('C', p_C)
+
+",2013-10-15 07:29:18.930 +57504,22678.0,2,,57500.0,,,,CC BY-SA 3.0,"

Basically, it depends on the function type you are trying to model from data:

+ +
    +
  • Linear: $f(x)=a_1x_1+a_2x_2+ \cdots$, where the $a_i$ are the parameters of interest.

  • +
  • Nonlinear: $f(x)=x_1a_1 \frac{a_2}{a_4}+\exp(-a_2/(a_1*x_2))$ $a_i$ are also here the parameters of interest, but they form a nonlinear term now.

  • +
  • Parametric: actually both of the above, but where the parameters $a_i$ have a physical/application meaning; e.g. splines, where the parameters of interest represent the path of a trajectory.

  • +
  • Non-Parametric: Linear model for nonlinear problems. Same as splines, but the bases are called kernels. This is good, when you have a nonlinear/complex model but would like to do some kind of model selection (which abstract $x_i$ is the most important for your data e.g.). See Kernel (ridge) regression for details on this.

  • +
+ +

Edit: Thanks to whuber's comments.
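For instance (a made-up example, not tied to any particular dataset), the practical difference between the first two cases in the list above is whether ordinary least squares applies directly or an iterative optimizer is needed:

+import numpy as np
+from scipy.optimize import curve_fit
+
+rng = np.random.default_rng(0)
+x = np.linspace(1, 10, 50)
+y = 2.0 * x + 1.5 * np.exp(-0.3 * x) + rng.normal(0, 0.1, x.size)
+
+# linear in the parameters: closed-form least squares
+b1, b0 = np.polyfit(x, y, 1)
+
+# nonlinear in the parameters: fitted iteratively from a starting guess
+def model(x, a1, a2, a3):
+    return a1 * x + a2 * np.exp(-a3 * x)
+
+params, _ = curve_fit(model, x, y, p0=[1.0, 1.0, 0.1])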

+",2013-10-15 07:41:50.867 +57505,22678.0,2,,57478.0,,,,CC BY-SA 3.0,"

You will need a robust loss function in the kernel estimation model. However, this topic may become quite advanced very fast. :) For a good start, I would suggest the one-class SVM from sklearn: http://scikit-learn.org/stable/modules/svm.html#density-estimation-novelty-detection

+",2013-10-15 07:46:47.790 +57506,5671.0,2,,57363.0,,,,CC BY-SA 3.0,"

Actually the simplest approach would be Association Rule Mining, aka Frequent Itemset Mining (FIM). ""Clustering"" is an attempt to uncover structure, but not so much to make recommendations. It's explorative, not predictive; the clusters will most often be something rather obvious to the domain expert.

+ +

FIM will learn rules of the form that students who have taken classes A and B have also taken class C with x% probability, i.e.

+ +

$$\{A,B\} \rightarrow \{C\} \text{ with confidence }x\%$$

+ +

You really need to go through some introductory course. APRIORI is discussed everywhere, and is an obvious fit here, in particular as you don't have quantities to predict (you don't have users that take class A 5 times and class B 2 times and thus are likely to buy -2 times class C...). Depending on your data, the FPGrowth or Eclat algorithms may be more performant, though.
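Just to make the rule notation concrete, the support and confidence of a rule like $\{A,B\} \rightarrow \{C\}$ can be computed directly from the enrolment records (toy data below; any FIM library merely does this search efficiently):

+# each set is the list of classes one student has taken (toy data)
+students = [{'A', 'B', 'C'}, {'A', 'B'}, {'A', 'B', 'C'}, {'B', 'C'}, {'A', 'C'}]
+
+n_ab  = sum({'A', 'B'} <= s for s in students)        # students who took A and B
+n_abc = sum({'A', 'B', 'C'} <= s for s in students)   # ... and also C
+
+support    = n_abc / len(students)    # 2/5 = 0.4
+confidence = n_abc / n_ab             # 2/3 ~ 0.67, i.e. A,B -> C with 67% confidence
+print(support, confidence)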

+",2013-10-15 07:47:16.070 +57507,22706.0,1,,,,How do I interpret the credibility interval in a Bayesian Regularized Regression?,,CC BY-SA 3.0,"

A penalized regression provides biased estimates of the regression coefficients (bias-variance trade-off principle). Therefore, standard errors and confidence intervals are regarded as not very meaningful for those biased estimates arising from (frequentist) penalized regression methods; see e.g. the discussion Estimating R-squared and statistical significance from penalized regression model. I would assume that the same problem exists in a Bayesian approach, but Kyung, Gill, Ghosh and Casella (2010) say that the Bayesian formulation produces valid standard errors. Does this mean that a 95% credibility interval includes the true biased estimate with 95% probability, and if yes, is this useful information?

+",2013-10-15 08:17:27.980 +57508,3993.0,1,59166.0,,,Relative variances of higher-order vs. lower-order random terms in mixed models,,CC BY-SA 4.0,"

TL, DR summary:

+

Is there any theoretical or empirical basis to support the following statement being true as a general rule of thumb?

+

"When estimating a mixed model, typically the estimated variances/standard deviations of random effects associated with 'higher-order' terms (e.g., random effects of two-way, three-way, and beyond interaction terms) turn out to be smaller than the estimated variances/standard deviations of random effects associated with 'lower-order' terms (e.g., the residual variance, variances associated with simple effects of grouping factors)."

+

The source of this claim is me. ;)

+
+

Okay, now for the longer version ...

+

Typically when I sit down to start analyzing a new dataset which I know will call for a mixed model, one of the first models that I fit (after the statistical foreplay of looking through the observations in the dataset, plotting various things, cross-tabulating different factors, etc.) is one that is pretty close to the "maximal" random effects specification, where every random effect that is in-principle possible to estimate from the data, is estimated.

+

Naturally, it is not uncommon that this nearly-maximal model will have some computational problems (convergence errors, or wacky variance/covariance estimates, or etc.) and that I have to trim back this model to find one that my data can more easily support. Fine.

+

In these situations, the method I have come to prefer for trimming random terms is not to rely on significance tests or likelihood ratios, but rather to just identify the random effects that seem to have the smallest standard deviations (which can admittedly be a little tricky when predictors are on very different scales, but I try to take account of this in my appraisal) and remove these terms first, sequentially in an iterative process. The idea being that I want to alter the predictions of the model as little as possible while still reducing the complexity of the model.
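
+

As a concrete illustration of this screening step, a minimal lme4 sketch (using the package's built-in sleepstudy data purely for illustration, not one of my own datasets):

+

library(lme4)
+
+# fit correlated random intercepts and slopes, then read off the estimated
+# random-effect standard deviations; these are the quantities compared when trimming
+fit <- lmer(Reaction ~ Days + (Days | Subject), data = sleepstudy)
+print(VarCorr(fit), comp = ""Std.Dev."")
+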

+

One pattern that I seem to have noticed after a pretty good amount of time spent doing this is that following this method very often leads me to trim random effects associated with higher-order terms (as defined above) of the model first. This is not always true, and occasionally some of the higher-order terms explain a lot of variance, but this doesn't seem to be the general pattern. In sharp contrast, I usually find that lower-order random terms -- particularly those associated with simple effects of the grouping factors -- explain a pretty good amount of variance and are fairly essential to the model. At the extreme, the residual term commonly accounts for close to the most variance, although of course removing this term wouldn't be sensible.

+

This entirely informal observation leads me to form the hypothesis that I stated at the beginning of this question.

+

If it is true, then it constitutes a useful piece of advice that might be passed down to people who are less experienced with this kind of model selection process. But before I begin doing so, I want to check with other, more experienced users of mixed models about their reactions to this observation. Does it seem more or less true to you? Is it roughly consistent with your experience fitting many different mixed models to many different datasets? Do you know of any sensible, theoretical reasons why we might actually expect this to be true in a lot of cases? Or does it just seem like bullshit?

+

One possible answer here is that it is not true even in my own case, and I have simply deceived myself. Certainly a possibility that I am open to.

+

Another possibility is that it might be true in my own case, but that this could simply be a kind of coincidence having to do with the kinds of datasets that I tend to work with routinely (which, FYI, are datasets in psychological / social sciences, a slight majority being experimental in origin, but also a fair proportion of non-experimental stuff). If this is the case then there is probably no good reason for expecting my observations to hold in general in other fields that handle very different kinds of data. Still, if there is a coherent non-coincidental reason for why this might be expected to be true, even if only for these particular kinds of datasets, I would love to hear it.

+

And of course another possibility is that others have noticed similar patterns in their own data, and that it represents some kind of general rule of thumb that people find useful to keep in mind as they fit mixed models to various different data. If this is the case then it seems like there must be some compelling statistical-theoretical reason for why this pattern arises. But I really don't know what that reason would look like.

+

I welcome anyone's thoughts and opinions about this. Note that as far as I'm concerned, totally legitimate responses to this question might be as simple as comments like "Yeah I have noticed something similar in the data I've worked with, but I have no idea why it should be true" or conversely "I have noticed nothing remotely like this in the data I've worked with." Of course I also welcome longer and more involved discussions ...

+",2013-10-15 08:21:54.933 +57509,16474.0,2,,57498.0,,,,CC BY-SA 3.0,"

In many cases a distribution can be described as the result of some idealized experiment. For example, if we flip a fair coin $n$ times, the number of heads will follow a binomial distribution with parameters $n$ and .5. These idealized experiments are often used as models; they are simplified representations of how the data came to be. There are obviously many such models, and as a consequence many distributions. If you want the logic behind all distributions, that will require a book of many volumes, e.g.:

+ +

N. L. Johnson, S. Kotz and N. Balakrishnan (2000). Continuous Multivariate Distributions, Vol. 1 (second edition), New York: John Wiley & Sons.

+ +

N. L. Johnson, S. Kotz and N. Balakrishnan (1997). Discrete Multivariate Distributions. New York: John Wiley & Sons.

+ +

N. L. Johnson, S. Kotz and N. Balakrishnan (1995). Continuous Univariate Distributions, Vol. 2 (second edition), New York: John Wiley & Sons.

+ +

N. L. Johnson, S. Kotz and N. Balakrishnan (1994). Continuous Univariate Distributions, Vol. 1 (second edition), New York: John Wiley & Sons.

+ +

N. L. Johnson, A. W. Kemp and S. Kotz (1992). Univariate Discrete Distributions (second edition), New York: John Wiley & Sons.

+ +

A shorter list of distributions that is more suitable/affordable for owning yourself is:

+ +

Forbes, C., Evans, M., Hastings, N., & Peacock, B. (2011). Statistical distributions. Wiley

+",2013-10-15 08:28:49.833 +57510,19557.0,2,,57428.0,,,,CC BY-SA 3.0,"

ID3 is an algorithm for building a decision tree classifier based on maximizing information gain at each level of splitting across all available attributes. It's a precursor to the C4.5 algorithm.

+ +

With this data, the task is to correctly classify each instance as either benign or malignant. Since each attribute takes on whole integer values in the range 1-10, strictly speaking the values aren't continuous, in that they can't take decimal values. For each integer value of each attribute, you'll need to calculate which split provides the most homogeneous grouping of instances at each level of splitting. This is done by calculating the information gain for each possible split and selecting the greatest (ID3 is known as a greedy algorithm).
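
+ +

To make the arithmetic concrete, here is a small illustrative R sketch of entropy and information gain (the function and variable names are made up, not part of ID3 itself):

+ +

# entropy of a vector of class labels
+entropy <- function(y) {
+  p <- table(y) / length(y)
+  -sum(p * log2(p))
+}
+
+# information gain from splitting the labels y by a logical condition cond
+info_gain <- function(y, cond) {
+  n <- length(y)
+  entropy(y) - sum(sapply(split(y, cond), function(s) (length(s) / n) * entropy(s)))
+}
+
+# e.g. info_gain(class_labels, some_attribute <= 5) for one candidate split
+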

+ +

You can do this by hand, but it's obviously better to run the algorithm in a tool such as Weka or R. If you're creating your own implementation, then you'll need to test each possible split and select the one with the greatest information gain, assuming you don't already have a homogenous group (in which case you'd assign the class attribute and change the node to a leaf).

+",2013-10-15 08:47:11.623 +57568,,1,57573.0,,user30490,Sum of autoregressive processes?,,CC BY-SA 3.0,"

I am working on a research topic where I need to add together two AR processes and I was wondering if the distribution of these processes is of a recognizable form/structure. More formally, if $x_t$ is a AR(p) process with characteristic polynomial $\Phi_x(u)$ and $y_t$ is a AR(q) process with characteristic polynomial $\Phi_y(u)$, then what is the structure of $z_t=x_t+y_t$?

+",2013-10-15 20:10:19.377 +58691,23276.0,2,,58689.0,,,,CC BY-SA 3.0,"

Note that for $\def\N{\mathbb N}n \in \N$ we have $$\{T\le n\} = \bigcup_{i=1}^n \{X_i \in B\} = \bigcup_{i=1}^n X_i^{-1}(B) $$ +As $X_i$ is $\mathcal F_i$-measurable for $i \le n$ by adaptedness, and $\mathcal F_i \subseteq \mathcal F_n$, we have $X_i^{-1}(B)\in \mathcal F_n$ for $i \le n$ and hence $\{T \le n\}\in \mathcal F_n$. That is, $T$ is an $(\mathcal F_n)$-stopping time.

+",2013-11-02 09:29:15.103 +57511,8819.0,1,,,,Integration with respect to Multivariate normal distribution,,CC BY-SA 3.0,"

I am working on the numerical integration of an integral of the following functional form:

+ +

$$ \int_{\mathbb{R}^{G}} F(x_{1},x_{2},\ldots,x_{G})\, d\Phi_{\Sigma}(x_{1},x_{2},\ldots,x_{G}) $$

+ +

Here
+$$ \Phi_{\Sigma}(x_{1},x_{2},\ldots,x_{G}) $$

+ +

is the G-dimensional multivariate normal distribution with correlation matrix $\Sigma$ and F is some function of the constituent marginals.

+ +

What I am essentially doing is calculating the expectation of a function over a correlated multivariate normal distribution. Practically, G is expected to be equal to or less than 4 and most often just 2 or 3.
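
+ +

For what it is worth, a quick Monte Carlo benchmark of such an expectation (not the quadrature approach itself) can be written with the mvtnorm package; Sigma and Ffun below are placeholders rather than my actual problem:

+ +

library(mvtnorm)
+
+# Monte Carlo benchmark for G = 2
+Sigma <- matrix(c(1, 0.5, 0.5, 1), nrow = 2)
+Ffun  <- function(x) pnorm(x[1]) * pnorm(x[2])   # some function of the marginals
+draws <- rmvnorm(1e5, sigma = Sigma)
+mean(apply(draws, 1, Ffun))                      # estimate of the expectation
+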

+ +

Could someone in the know share any of the fundamental references that tackle the issue? My research yielded some information, and it appears that Gaussian quadrature is one of the preferred ways to approach the problem. I am referring to the book Applied Computational +Economics and Finance by Miranda and Fackler for the implementation aspects of the algorithm.

+ +

But I wanted to get some help from the expert community here on whether I am on the right track.

+ +

Sorry if it is a repeat; however, I searched the site and was not able to find a question that matches what I had.

+",2013-10-15 08:47:23.500 +57512,22707.0,2,,57338.0,,,,CC BY-SA 3.0,"

The output of TANH is already between -1 and 1. So, if you normalise the input, be sure to normalise for the hidden activation functions as well. In theory it is not required to normalise, because tanh(1000) is mathematically different from tanh(10000). But in practice these are numerically the same, so you should indeed normalise the input in most applications.
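
+ +

For example, in R (double precision):

+ +

tanh(1000)    # evaluates to 1
+tanh(10000)   # also evaluates to 1: the theoretical difference is lost to saturation
+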

+",2013-10-15 09:31:11.227 +57513,10594.0,1,57534.0,,,"A categorical variable in glm shows significance from analysis of deviance, but each level is not significant in z-test",,CC BY-SA 3.0,"

I am fitting a generalized linear model (glm). The explanatory variable is categorical with three levels (control, treat1, treat2). The response variable is 0 or 1. +The response rate for each treatment level is plotted in the figure below (from left to right: control, treat1, treat2):

+ +

+ +

There seems to be a big treatment effect between treat1 vs. control and treat2 vs. control. I applied glm:

+ +

fit <- glm(response ~ treatment, family = binomial, data = dat)

+ +
Coefficients:
+             Estimate Std. Error z value Pr(>|z|)        
+(Intercept)   -21.57    6536.57  -0.003    0.997
+treat1        23.76    6536.57   0.004    0.997
+treat2        43.13    9364.95   0.005    0.996
+
+ +

The z-test shows that neither treat1 nor treat2 is significant compared to the reference level control.

+ +

However, the analysis of deviance confirmed that the treatment factor as a whole is highly significant:

+ +
drop1(fit, test=""Chisq"")
+
+response ~ treatment
+            Df   Deviance    AIC    LRT  Pr(>Chi)    
+ <none>          13.003    19.003                     
+ treatment   2   77.936    79.936 64.932 7.946e-15 ***
+
+ +

How shall I interpret such a strange result? Why does the individual z-test not give me any significant result, while according to the plot there is obviously an effect between treat1 and control, and between treat2 and control?

+",2013-10-15 09:49:58.953 +57514,14874.0,1,57517.0,,,Constructing a bivariate distribution from two gamma-distributed random variables with nonlinear dependence?,,CC BY-SA 3.0,"

I've got 2 gamma-distributed random variables $(X,Y)$ with arbitrary scale and shape parameters. Further, $Y$ should be a non-linear function of $X$, let's say $Y=\sqrt{X}$. What I am interested in is the joint distribution $F_{X,Y}(\cdot)$.

+ +

All suggestions or general comments are welcome.

+ +

Thank you in advance

+",2013-10-15 10:00:24.350 +57515,22677.0,2,,57486.0,,,,CC BY-SA 3.0,"

Never mind; apparently $ARL_0=\frac{1}{\alpha}$, where $\alpha$ is the false alarm probability.

+ +

Further reading would be:

+ +
+

Nonparametric monitoring of data streams for changes in location and scale + GJ Ross, DK Tasoulis, NM Adams - Technometrics, 2011 - Taylor & Francis

+
+",2013-10-15 10:13:54.107 +57516,21884.0,1,57519.0,,,Sample variance order,,CC BY-SA 3.0,"

Is the following true (and if so, how does one prove it)? + $$E\left|\hat{Var}_{n}(X)-Var(X)\right|^{2}=O(n^{-1})$$ + where:

+ +

• $X$ is a random variable with mean $\mu$ and variance $\sigma^{2}$

+ +

• $\hat{Var}_{n}(X)$ is the sample variance of $X$ from $n$ + i.i.d. random variables $X_{1},\cdots,X_{n}$ with mean $\mu$ + and variance $\sigma^{2}$.

+ +

Many thanks in advance. (Feel free to change my notation).

+",2013-10-15 10:40:17.880 +57517,17328.0,2,,57514.0,,,,CC BY-SA 3.0,"
+

OP wrote: I've got 2 gamma-distributed random variables (X,Y) with ... say $Y=\sqrt{X}$.

+
+ +

Your question is internally inconsistent. In particular, if $X$~Gamma$(a,b)$ with pdf $f(x)$, say:

+ +

$$f(x) =\frac{x^{a-1} e^{-\frac{x}{b}}}{b^a \Gamma (a)}, \text{ for } x > 0 $$

+ +

... and $Y =\sqrt{X}$, then the pdf of $Y$, say $g(y)$, is:

+ +

$$g(y) = \frac{2 b^{-a} y^{2 a-1} e^{-\frac{y^2}{b}}}{\Gamma (a)}, \text{ for } y > 0 $$

+ +

... which is not Gamma$(\alpha, \beta)$, as originally assumed.

+",2013-10-15 11:03:36.747 +57518,18296.0,1,57599.0,,,Deterministic components in covariates/exogenous variables in time series models,,CC BY-SA 3.0,"

Actually, I have read a pair of books about time series analysis, but I am still not sure how to treat deterministic components, like trend and seasonality, in the exogenous variables of a time series model. Do I have to detrend and deseasonalize the covariates before I use them as explanatory variables in a time series model? I would also be thankful for a reference.

+ +

Thank you in advance!

+",2013-10-15 11:15:55.800 +57526,21884.0,1,,,,Choice of variance estimator,,CC BY-SA 3.0,"

Consider the problem of the choice of estimator of $\sigma^2$ based on a random sample of size $n$ from a $N(\mu,\sigma^2)$ distribution.

+ +

In undergraduate, we were always taught to use the sample variance

+ +

$$\hat{s}^2 = \dfrac{1}{n-1}\sum_{i=1}^{n}\left(X_{i}-\bar{X}\right)^{2}$$

+ +

instead of the maximum likelihood estimator

+ +

$$\hat{\sigma}^2 = \dfrac{1}{n}\sum_{i=1}^{n}\left(X_{i}-\bar{X}\right)^{2}.$$

+ +

This is because we learned that $\hat{s}^2$ is an unbiased estimator and that $\hat{\sigma}^2$ is biased.

+ +

However now I'm studying for a PhD and I've read that we choose estimators based on minimizing mean square error (=bias$^2$ + var).

+ +

It can be shown that $$mse(\hat{\sigma}^2) < mse(\hat{s}^2 ).$$
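
+ +

For reference (these are standard results under normality, stated here so the claim can be checked):

+ +

$$mse(\hat{s}^2) = \frac{2\sigma^4}{n-1}, \qquad mse(\hat{\sigma}^2) = \frac{(2n-1)\sigma^4}{n^2},$$

+ +

and $(2n-1)(n-1) = 2n^2 - 3n + 1 < 2n^2$ gives the inequality for every $n>1$.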

+ +

So, why do most people use $\hat{s}^2$?

+",2013-10-15 13:24:29.923 +58692,11383.0,2,,58688.0,,,,CC BY-SA 3.0,"

The last step simply uses the fact that for each real number $t$, +$$\exp(t)=\sum_{i=0}^\infty\frac{t^i}{i!}.$$ +Here $t=\lambda s$. (the introduction of $\frac{e^{\lambda s}}{e^{\lambda s}}$ does not seem to be of use here)

+",2013-11-02 09:38:27.413 +57519,17328.0,2,,57516.0,,,,CC BY-SA 3.0,"

I assume by the term, sample variance, you are referring to the unbiased estimator of population variance $\mu_2$, i.e. the 2nd h-statistic, namely:

+ +

$$ h_2 = \hat{Var}_{n}(X) = \frac{1}{n-1}\sum _{i=1}^n \left(X_i-\bar{X}\right){}^2$$

+ +

Either way, the expectation you seek is just the MSE of $\hat{Var}_{n}(X)$. Note that the absolute value is irrelevant due to the squaring. That should be enough to do a google search and find an answer somebody has worked out in a journal paper or book.
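
+ +

For reference, the standard result you should find (assuming a finite fourth central moment $\mu_4$) is

+ +

$$E\left[\left(\hat{Var}_{n}(X)-Var(X)\right)^{2}\right] = \frac{\mu_4}{n} - \frac{(n-3)\,\sigma^4}{n(n-1)} = O(n^{-1}).$$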

+ +

More generally, these sorts of calculations are known as moments of moments and can be solved by working with power sum notation $s_r=\sum _{i=1}^n X_i^r$. First, express h2 in terms of power sums:

+ +

+ +

where I am using the HStatistic function in the mathStatica software (of which I am one of the authors). Next: find $E[(h_2-\mu_2)^2]$ ... which is just the 1st RawMoment of $(h_2-\mu_2)^2$, so we can find it with:

+ +

+ +

The ___ToCentral bit expresses the answer in terms of central moments $\mu_i$ of the population.

+ +

All done ... you can now work out what happens as $n$ gets large etc

+",2013-10-15 11:38:31.180 +57520,7155.0,2,,57338.0,,,,CC BY-SA 3.0,"

For regression tasks you should be using a linear neuron in the output. Your logit output likely looks sigmoidal when plotted against the response variable. Plus the loss function you're using, probably makes little sense in this context.

+ +

Input features should always be de-meaned and divided by their standard deviation. That has nothing to do with the unit types, and everything to do with training by backprop. Gradient descent will oscillate around a minimum if you don't normalize properly, because the error surface will be a thin ellipsoid.

+ +

Finally, consider rectified linear units in the hidden layer, because they train much faster than logistic or tanh units.

+",2013-10-15 11:39:25.197 +57521,22709.0,1,,,,Fraction confidence intervals for small sample size,,CC BY-SA 3.0,"

What is the 95% confidence interval for the fraction of 5 successes out of 12 trials?

+ +

Is it possible to compute confidence intervals for a sample of this size?

+",2013-10-15 11:46:03.933 +57522,503.0,2,,57521.0,,,,CC BY-SA 3.0,"

It is certainly possible to do this. There are several methods. The literature is surprisingly large; for a good entry point see Agresti & Coull (1998)

+ +

If you are using R you can use this:

+ +
install.packages(""binom"")
+library(binom)
+binom.confint(5, 12)
+
+",2013-10-15 12:02:04.813 +57523,10278.0,2,,57479.0,,,,CC BY-SA 3.0,"

I think this is best illustrated with an example in R:

+ +
library(GGally)
+data(iris)
+
+ +

Actual labeling according to Species

+ +
ggpairs(iris, columns=c(""Sepal.Length"", ""Sepal.Width"", ""Petal.Length"", ""Petal.Width""), colour='Species', lower=list(continuous='points'), axisLabels='none', upper=list(continuous='blank'))
+
+ +

+ +

Labelling according to kmeans clustering

+ +
set.seed(1234)
+iris$Cluster <- factor(kmeans(iris[,c(""Sepal.Length"", ""Sepal.Width"", ""Petal.Length"", ""Petal.Width"")], centers=length(levels(iris$Species)))$cluster)
+ggpairs(iris, columns=c(""Sepal.Length"", ""Sepal.Width"", ""Petal.Length"", ""Petal.Width""), colour='Cluster', lower=list(continuous='points'), axisLabels='none', upper=list(continuous='blank'))
+
+ +

+ +

From these pair-wise plots you can compare visually what elements kmeans assigns to the same group compared what elements belong to the same species.

+",2013-10-15 12:02:20.597 +57524,12683.0,2,,57497.0,,,,CC BY-SA 3.0,"

Some technicalities to complement @vinux's answer:

+ +

If you have a density $f_X(\cdot)$ for a random variable $X$ where +$$f_X\left(x;\theta,\phi;\vec\xi\right)=\tfrac{1}{\phi}f_X\left(\frac{x-\theta}{\phi};0,1;\vec\xi\right)$$ +then $\theta$, $\phi$, & $\vec\xi$ are location, scale, & shape parameters respectively.

+ +

Location parameters only shift the density, changing its mean (if it has one) & other measures of central tendency, but no higher moments.

+ +

Scale parameters only stretch the density, changing its variance (if it has one) & other measures of dispersion, & the mean when $\newcommand{\ex}{\operatorname{E}}\ex X\neq\theta$, but no higher moments.

+ +

Shape parameters change the shape of the density, perhaps stretching or shifting too, so may change any moments. They tend to get called 'shape' parameters only in contrast to location & scale parameters; e.g. the Weibull distribution has scale & shape parameters but no-one talks about the shape parameter of the Poisson distribution (though they do about the two shape parameters of the beta distribution).

+ +

It's perhaps worth emphasizing @Nick's point that a parameter's being equal to the expectation of the random variable doesn't imply that it's a location parameter: $$\psi=\ex{X} \quad \not\Rightarrow \quad f_X\left(x;\psi\right)=f_X\left(x-\psi;0\right)$$

+",2013-10-15 12:06:38.393 +57525,15563.0,1,,,,How to use R prcomp results for prediction?,,CC BY-SA 4.0,"

I have a data.frame with 800 obs. of 40 variables, and would like to use Principal Component Analysis to improve the results of my prediction (which so far is working best with Support Vector Machine on some 15 hand-picked variables).

+

I understand a prcomp can help me improve my predictions, but I do not know how to use the results of the prcomp function.

+

I obtain the result:

+
> PCAAnalysis <- prcomp(TrainTrainingData, scale.=TRUE)
+> summary(PCAAnalysis)
+Importance of components:
+                          PC1    PC2    PC3    PC4    PC5   PC6    PC7    PC8    PC9   PC10   PC11   PC12   PC13   PC14
+Standard deviation     1.7231 1.5802 1.3358 1.2542 1.1899 1.166 1.1249 1.1082 1.0888 1.0863 1.0805 1.0679 1.0568 1.0520
+Proportion of Variance 0.0742 0.0624 0.0446 0.0393 0.0354 0.034 0.0316 0.0307 0.0296 0.0295 0.0292 0.0285 0.0279 0.0277
+Cumulative Proportion  0.0742 0.1367 0.1813 0.2206 0.2560 0.290 0.3216 0.3523 0.3820 0.4115 0.4407 0.4692 0.4971 0.5248
+                         PC15   PC16   PC17   PC18  PC19   PC20   PC21   PC22   PC23   PC24   PC25   PC26   PC27   PC28
+Standard deviation     1.0419 1.0283 1.0170 1.0071 1.001 0.9923 0.9819 0.9691 0.9635 0.9451 0.9427 0.9238 0.9111 0.9073
+Proportion of Variance 0.0271 0.0264 0.0259 0.0254 0.025 0.0246 0.0241 0.0235 0.0232 0.0223 0.0222 0.0213 0.0208 0.0206
+Cumulative Proportion  0.5519 0.5783 0.6042 0.6296 0.655 0.6792 0.7033 0.7268 0.7500 0.7723 0.7945 0.8159 0.8366 0.8572
+                         PC29   PC30   PC31   PC32   PC33   PC34   PC35   PC36    PC37                 PC38
+Standard deviation     0.8961 0.8825 0.8759 0.8617 0.8325 0.7643 0.7238 0.6704 0.60846 0.000000000000000765
+Proportion of Variance 0.0201 0.0195 0.0192 0.0186 0.0173 0.0146 0.0131 0.0112 0.00926 0.000000000000000000
+Cumulative Proportion  0.8773 0.8967 0.9159 0.9345 0.9518 0.9664 0.9795 0.9907 1.00000 1.000000000000000000
+                                       PC39                 PC40
+Standard deviation     0.000000000000000223 0.000000000000000223
+Proportion of Variance 0.000000000000000000 0.000000000000000000
+Cumulative Proportion  1.000000000000000000 1.000000000000000000
+
+

I thought I would obtain the parameters that are the most important to use, but I just don't find this information. All I see are "standard deviation" etc. on the PCs. But how do I use this for prediction?

+",2013-10-15 12:19:10.007 +57527,22705.0,2,,57518.0,,,,CC BY-SA 3.0,"

It may not be a good idea to consider trend & Seasonality as deterministic components of your dependent variable.

+ +

The Un-observed component model approach is an ideal way to handle such ambiguities. It estimates the trend, seasonality & other exogenous variables as well.

+ +

http://ideas.repec.org/h/eee/ecofch/1-07.html is your starting point.

+",2013-10-15 13:28:09.540 +57529,,2,,57525.0,anon,,,CC BY-SA 3.0,"

The information from the summary() command you have attached to the question allows you to see, e.g., the proportion of the variance each principal component captures (Proportion of Variance). In addition, the cumulative proportion is also given in the output. For example, you need 23 PCs to capture 75% of the variance in your data set.

+ +

This certainly is not the information you typically use as input to further analyses. Rather, what you usually need is the rotated data, which is saved as 'x' in the object created by prcomp.

+ +

Using R code as a short example.

+ +
pr<-prcomp(USArrests, scale = TRUE)
+summary(pr) # two PCs for cumulative proportion of >80% 
+newdat<-pr$x[,1:2]
+
+ +

Then you can use the data in the newdat for further analyses, e.g., as input to SVM or some regression model. Also, see, e.g., https://stackoverflow.com/questions/1805149/how-to-fit-a-linear-regression-model-with-two-principal-components-in-r for more information.

+",2013-10-15 13:46:29.443 +57530,8074.0,2,,57525.0,,,,CC BY-SA 3.0,"

While I'm unsure as to the nature of your problem, I can tell you that I have used PCA as a means of extracting dominant patterns in a group of predictor variables in the later building of a model. In your example, these would be found in the principal components (PCs), PCAAnalysis$x, and they would be based on the weighting of variables found in PCAAnalysis$rotation. One advantage of this process is that PCs are orthogonal, so you remove issues of multicollinearity between the model predictors. The second is that you might be able to identify a smaller subset of PCs that capture the majority of variance in your predictors. This information can be found in summary(PCAAnalysis) or in PCAAnalysis$sdev. Finally, if you are interested in using a subset of the PCs for prediction, then you can set the tol parameter in prcomp to a higher level to remove trailing PCs.

+ +

Now, you can ""project"" new data onto the PCA coordinate basis using the predict.prcomp() function. Since you are calling your data set a ""training"" data set, this might make sense to then project a validation data set onto your PCA basis for the calculation of their respective PC coordinates. Below is an example of fitting a PCA to 4 biometric measurements of different iris species (which are correlated to some degree). Following this, I project biometric values of a new data set of flowers that have similar combinations of these measurements for each of the three species of iris. You will see from the final graph that their projected PCs lie in a similar area of the plot as the original data set.

+ +

An example using the iris data set:

+ +
### pca - calculated for the first 4 columns of the data set that correspond to biometric measurements (""Sepal.Length"" ""Sepal.Width""  ""Petal.Length"" ""Petal.Width"")
+data(iris)
+
+# split data into 2 parts for pca training (75%) and prediction (25%)
+set.seed(1)
+samp <- sample(nrow(iris), nrow(iris)*0.75)
+iris.train <- iris[samp,]
+iris.valid <- iris[-samp,]
+
+# conduct PCA on training dataset
+pca <- prcomp(iris.train[,1:4], retx=TRUE, center=TRUE, scale=TRUE)
+expl.var <- round(pca$sdev^2/sum(pca$sdev^2)*100) # percent explained variance
+
+# prediction of PCs for validation dataset
+pred <- predict(pca, newdata=iris.valid[,1:4])
+
+###Plot result
+COLOR <- c(2:4)
+PCH <- c(1,16)
+
+pc <- c(1,2) # principal components to plot
+
+png(""pca_pred.png"", units=""in"", width=5, height=4, res=200)
+op <- par(mar=c(4,4,1,1), ps=10)
+plot(pca$x[,pc], col=COLOR[iris.train$Species], cex=PCH[1], 
+ xlab=paste0(""PC "", pc[1], "" ("", expl.var[pc[1]], ""%)""), 
+ ylab=paste0(""PC "", pc[2], "" ("", expl.var[pc[2]], ""%)"")
+)
+points(pred[,pc], col=COLOR[iris.valid$Species], pch=PCH[2])
+legend(""topright"", legend=levels(iris$Species), fill = COLOR, border=COLOR)
+legend(""topleft"", legend=c(""training data"", ""validation data""), col=1, pch=PCH)
+par(op)
+dev.off()
+
+ +

+",2013-10-15 13:50:46.937 +57531,20304.0,1,57532.0,,,survival analysis without enough data,,CC BY-SA 3.0,"

I have computed and plotted the survival function for a subscription-based service and the following is the result.

+ +

+ +

The problem is that there does not seem to be enough data to get a full curve. This is because most of the oldest accounts are still active. So my question is: would it still be useful to compare survival curves for different segments of accounts, given that there is clearly not enough data for a full curve?

+",2013-10-15 14:09:15.703 +57532,3999.0,2,,57531.0,,,,CC BY-SA 3.0,"

It is still useful - I wouldn't make really bold claims about what happens past 600 days if I were you, but seeing a clear departure in one category or the other, even if they don't eventually hit zero, is still useful.

+ +

Consider this: the ideal randomized clinical trial of a perfect, flawless drug will have one of the curves not only never reaching zero, but never dropping appreciably below 1.00. The fact that a curve doesn't drop within a meaningful time horizon is, in and of itself, useful information. And since all accounts that have not yet closed are censored, the techniques you're using already account for ""they'll close someday far in the future"".

+",2013-10-15 14:27:38.403 +57533,19265.0,1,,,,Is it discriminant analysis?,,CC BY-SA 3.0,"

I have samplings of one-dimensional data of two classes: $A$ and $B$.

+ +

I have to predict the posterior probability of class $A$.

+ +

$$\tilde{P}(A|x) \approx \frac{N_A\tilde{f_A}(x)}{N_A\tilde{f_A}(x) + N_B\tilde{f_B}(x)}.$$

+ +

$f_A$, $f_B$ are the estimation of densities of classes $A$ and $B$.

+ +

I estimate the densities as normal distributions, whose $\mu$ and $\sigma$ are the sample means and standard deviations of the points in each class ($A$ and $B$ respectively).
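
+ +

In code form, the estimator I have in mind is simply the following (a minimal R sketch; xA and xB are the training points of each class and x a new point):

+ +

post_A <- function(x, xA, xB) {
+  fA <- dnorm(x, mean(xA), sd(xA))
+  fB <- dnorm(x, mean(xB), sd(xB))
+  length(xA) * fA / (length(xA) * fA + length(xB) * fB)
+}
+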

+ +

Would it be correct to say that I'm using discriminant analysis in this case?

+",2013-10-15 14:38:31.047 +57534,232.0,2,,57513.0,,,,CC BY-SA 3.0,"

You cannot use Wald's z-test when the maximum likelihood estimates are infinite (they just look finite in the model fit). However you can still use a likelihood-ratio test as you see with the deviance analysis. You just have to set up model comparisons that test the hypotheses that you are interested in.

+ +

For example, to test $\mu_c = \mu_{treat1}$, you fit a model under this assumption, and compare it to the full model.

+ +
fit2 <- glm(response ~ I(treatment==""treat2""), family = binomial, data = dat)
+anova(fit, fit2, test=""Chisq"")
+
+",2013-10-15 14:38:53.907 +57535,22713.0,1,,,,Should I use Pearson Correlation or Linear Regression to show the linearity of two sets of numbers?,,CC BY-SA 3.0,"

I have two sets of acceleration data as a function of time. In one set the acceleration increases more linearly than in the other. How can I best show the difference in linearity between the two sets of data? Should I use Pearson correlation or linear regression? If I get r = 0.8 and 0.98 from Pearson correlation, how can I interpret the result? Is there a very big difference in linearity between 0.8 and 0.98?

+ +

Thanks.

+",2013-10-15 14:44:13.583 +57536,22671.0,1,57542.0,,,Fit a regression line by using `MATLAB`,,CC BY-SA 3.0,"

I have the following data

+ +
  Individual     Heart rate     Weight    Hours of exercise per week
+       1           72            134             3.2
+       2           81            201             3.5
+       3           60            156             7.1
+       4           82            148             2.4
+       5           75            170             1.2
+
+ +

Now I have to fit a regression line using MATLAB.

+ +

But which one is the independent variable? Hours of exercise per week? And are Heart rate & Weight the response variables?

+ +

Then how can I fit one regression line? If I have two response variables or two independent variables, won't I get two regression lines?

+",2013-10-15 14:44:44.693 +57537,11886.0,1,,,,Warranty claims count prediction,,CC BY-SA 3.0,"

I have weekly data of number of warranty claims from week of manufacturing. I want to use this historical data and predict the number of claims that may arise in next few weeks.

+ +

I have read about count data models like Poisson regression and NB regression, but they seem to expect counts in the range of 0~30 (is this correct?). In my case the claims can range from 0 to several thousand. What would be an ideal way to solve this problem?

+",2013-10-15 14:48:21.490 +57538,8351.0,1,57548.0,,,Derivation of conditional distribution from other two distributions,,CC BY-SA 3.0,"

$$Y|X=x \sim N(x,1)\\X\sim N(\mu,\sigma^2 )$$ +What distribution does $X|Y=y$ follow?

+ +

My initial strategy was to use $f_{Y|X}f_X=f_{X,Y}$ and solve for $f_{X|Y}=f_{X,Y}/f_{Y}$ +. Computing $f_{X,Y}$, I get the following: +$$f_{X,Y}=\frac{1}{\sqrt{2\pi\sigma^2}}\exp\left\{-\frac{(x-\mu)^2}{2\sigma^2}\right\}\frac{1}{\sqrt{2\pi}}\exp\left\{-\frac{(y-x)^2}{2}\right\}$$ +And in trying to compute $f_{Y}$, I was trying to integrate the above w.r.t. $x$, but I got stuck. I am not sure if it is integrable, and whether this is the right approach to solve this question. I am curious if there is some kind of trick/insight I am missing.

+",2013-10-15 14:54:42.960 +57539,22714.0,1,,,,How to do a classification with only one variable?,,CC BY-SA 3.0,"

I would like to classify the individuals in a database by a single quantitative variable. Is hierarchical clustering suitable for this? If it is possible, how does the algorithm work? If using hierarchical clustering is not right, which other procedures or techniques could I use? +My purpose is not to classify the variable, but the individuals. So decision trees are not suitable.

+",2013-10-15 14:55:36.683 +18335,5038.0,1,18345.0,,,Non-nested model selection,,CC BY-SA 3.0,"

Both the likelihood ratio test and the AIC are tools for choosing between two models and both are based on the log-likelihood.

+ +

But why can't the likelihood ratio test be used to choose between two non-nested models, while AIC can?

+",2012-01-01 12:15:03.760 +57542,5203.0,2,,57536.0,,,,CC BY-SA 3.0,"

It sounds like you're mixed up on a few different things.

+ +

First, independent variables are the inputs, causes, or explanatory variables, or predictors to your model, while the dependent variables are the outputs from the model, since they ""depend"" on the values of the independent variables (hopefully!).

+ +

You can definitely have a single regression line with multiple independent variables. For example, you might model something like +$$ \textrm{Heart Rate} = \beta_0 + \beta_1 \textrm{Weight} + \beta_2\textrm{Exercise}$$ +People sometimes make a distinction between multiple linear regression, where the model has two or more explanatory variables, from simple linear regression, which has only one. Neither of these should be confused with multivariate regression, where one predicts multiple variables at once, as in +$$<\textrm{Heart Rate}, \textrm{Weight}> = \beta_0 + \beta_1 \textrm{Exercise} + \ldots$$

+ +

Since the independent and dependent variables depend on your hypothesis, only you can decide which are which. For example, you might suspect that resting heart rate is affected by one's weight and exercise habits. If so, you'd use weight and exercise as the independent variables, while heart rate is the dependent variable. This would give you the first model, show above. On the other hand, you might want to predict weight from someone's resting heart rate and exercise. In this case, your model would look something like: +$$ \textrm{Weight} = \beta_0 + \beta_1 \textrm{Heart Rate} + \beta_2\textrm{Exercise}$$

+ +

There are several ways to do a regression in matlab. The regress function (documentation here) might be a reasonable place to start. You'll need to make an $n \times 1$ vector of responses (call it $y$; it is the dependent variable) and an $n \times p$ matrix of predictors (the matching values of the independent variable(s); call this $x$). Then, you run something like b = regress(y,x); to get the associated coefficients (e.g., b(1) is the coefficient for the values in x(:,1)). Note that if you want a constant term in your model, you need to add a column of all ones to your predictor matrix!

+ +

There are several other methods in matlab for fitting linear regression models. The statistics toolbox has a Linear Model class. The curve fitting app cftool can interactively fit linear (and other models); the toolbox also includes a programmatic fit function. Due to how the math works out, you can also just use matrix division b = X\y. Take a look and see which of these fits your workflow the best. Beware that some of the methods insert a constant/intercept term into your model but others do not. Make sure you try at least a few out and get what you want.

+",2013-10-15 15:23:01.013 +57543,19359.0,1,57575.0,,,"""Bayesglm"", p-values and degrees of freedom?",,CC BY-SA 3.0,"

I am trying to perform some logistic regressions (and I am a neophyte user of R). Initially I used ""glm"" to compute coefficients, AIC and p-values; this worked great until I ran across a data set suffering from complete separation. In [1], Gelman et alia suggest using an (informative) prior to address this problem; the corresponding algorithm is implemented in R as ""bayesglm"" (in the ARM package).

+ +

Here is my problem. Previously, with ""glm"", I would compute p-values as follows:

+ +
mylogit <- bayesglm(a ~ b+c+d+e+f+g+h, data = mydata, family=""binomial"")
+with(mylogit, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
+
+ +

There are 53-48=5 degrees of freedom:

+ +
Null deviance: 71.188  on 53  degrees of freedom
+Residual deviance: 37.862  on 48  degrees of freedom
+
+ +

However, if I use ""bayesglm"" instead of ""glm"", the resulting degrees of freedom are a bit surprising to me:

+ +
Null deviance: 22.279  on 53  degrees of freedom
+Residual deviance: 39.030  on 54  degrees of freedom
+
+ +

If I plug in the preceding formula for a p-value, I have -1 degrees of freedom! Can someone help me get a more sensible answer (or help me interpret this)?

+ +

By the way, the documentation on the ""bayesglm"" command includes the following ominous comment:

+ +
+

We include all the glm() arguments but we haven’t tested that all the options (e.g., offests, contrasts, deviance for the null model) all work.

+
+ +

[1] Gelman, Andrew, et al. ""A weakly informative default prior distribution for logistic and other regression models."" The Annals of Applied Statistics (2008): 1360-1383.

+",2013-10-15 16:06:23.393 +57544,20473.0,2,,57429.0,,,,CC BY-SA 3.0,"

From what I can understand, the real-world phenomenon under study can be described as follows:
+ There are at every period $N_t$ customers which are divided given some criterion into ""existing"" and ""new"" (think why the categorization criterion is not necessarily obvious).

+ +

For each of these two subgroups, we define the ""payback rate = percentage of customers of this subgroup who pay back"". For new customers we denote this payback rate $Y_1$ and for existing customers we denote this payback rate $Y_2$. We also have as possible explanatory variables the credit scores for these customers. I presume that $X_1$ symbolizes the average credit score of new customers, and $X_2$ the average credit scores of existing customers.

+ +

Now, these payback rates should be examined separately, before attempting to build a model for their weighted average.

+ +

We may assume therefore that +$$Y_{1t}= a_1 + b_1X_{1t} + u_{1t} \qquad [1]$$ +and +$$Y_{2t}= a_2 + b_2X_{2t} + u_{2t} \qquad [2]$$

+ +

with $t=1,...,T$ being the length of the time series, and the two error terms assumed white noises, independent of each other, and independent of the regressors.

+ +

What we want is to estimate the weighted average pay back rate. Denoting $p_t$ the existing customers as a percentage of the customer base $N_t$, this weighted average payback rate is exactly defined as

+ +

$$Y_t = (1-p_t)Y_{1t} + p_tY_{2t} \qquad [3]$$

+ +

Relation [3] is a mathematical identity. We turn it into a causal/associative/covariance relationship by inserting into it equations $[1]$ and $[2]$ that reflect theoretical/behavioral assumptions:

+ +

$$\{[1],\,[2],\, [3]\} \Rightarrow Y_t = (1-p_t)\left(a_1 + b_1X_{1t} + u_{1t}\right) + p_t\left(a_2 + b_2X_{2t} + u_{2t}\right)$$

+ +

$$\Rightarrow Y_t = \left[(1-p_t)a_1 + p_ta_2\right] + (1-p_t)b_1X_{1t} + p_tb_2X_{2t} + (1-p_t)u_{1t} + p_tu_{2t} $$

+ +

$$\Rightarrow Y_t = a_1 + (a_2-a_1)p_t + b_1X_{1t}^* + b_2X_{2t}^* + \varepsilon_t \qquad [4]$$

+ +

with

+ +

$X_{1t}^* = (1-p_t)X_{1t}$ and $X_{2t}^*= p_tX_{2t}$ sub-group credit scores weighted by the relative size of each sub-group,

+ +

but most importantly with

+ +

$\varepsilon_t = (1-p_t)u_{1t} + p_tu_{2t} $

+ +

This means that the error term is contemporaneously correlated with all three regressors.

+ +

Denoting by $\mathbf X$ the regressor matrix containing the time series for $\left(p,X_{1}^*,X_{2}^*\right)$ The conditional moments of $\varepsilon_t$ are

+ +

$$E(\varepsilon_t\mid \mathbf X) =E((1-p_t)u_{1t} + p_tu_{2t}\mid \mathbf X) = (1-p_t)E(u_{1t} \mid \mathbf X) + p_tE(u_{2t}\mid \mathbf X) = 0$$

+ +

since the $u$-errors are independent of $\mathbf X$ and white noises. Also
+$$\operatorname {Var}(\varepsilon_t\mid \mathbf X) = E(\varepsilon_t^2\mid \mathbf X) = (1-p_t)^2E\left(u_{1t}^2\mid \mathbf X\right) + p_t^2E\left(u_{2t}^2\mid \mathbf X\right) $$ +$$= (1-p_t)^2Eu_{1t}^2 + p_t^2Eu_{2t}^2=(1-p_t)^2\sigma_1^2 + p_t^2\sigma_2^2$$

+ +

i.e. the error term is conditionally heteroskedastic, with the conditional variance depending on the regressor $p_t$.

+ +

We have arrived at the regression specification $[4]$ by recognizing that the dependent variable is necessarily constructed as a function of the regressors, and so our behavioral/association assumptions should be ""placed"" one step earlier (at sub-group level). From this, the interaction between the regressors emerged naturally. But also, we ended up with endogenous regressors and a heteroskedastic error term, the variance of which changes in each time period and is a function of one of the regressors. +Finally, one should think that, if the model is to be used for prediction, perhaps the $p_t$ should be modeled as an autoregressive scheme, since it is difficult to think that the existing/new customers allocation can exhibit wide variations from one period to the next.

+ +

Concluding, this is your model, or at least, a model that is consistent with the real-world phenomenon under study. As you can see, you have much more serious issues to deal with than just ad hoc ways to represent the interaction between the regressors...

+",2013-10-15 16:06:45.203 +57545,21985.0,1,,,,Sample space of pmf,,CC BY-SA 3.0,"

I have several exercises to solve that deal with sample space of PMF.

+ +

One is:

+ +

Let $X_1,\dots , X_n$ be independent random variables with pmf $p(x;\pi) = (1-\pi)^x \pi$

+ +

What is the sample space of $X_1?$ Try to give a probabilistic interpretation of such a sample space. Hint: for example, a Bernoulli random variable can be used to model a coin with probability of success $p \in \;\rbrack0,1\lbrack$.

+ +

Any hint? I do not know how to ""see"" from the distribution what the sample space is without knowing more about this distribution...

+ +

Possible Solution?

+ +

I have the following idea: +If I plug in all values in the sample space and calculate the sum I have to get $1$ (because it is a probability). But obviously here we can have just one value for the sample space which is $\log({\frac{1}{\pi}})/{\log(1-\pi)}$ (I set the formula for pmf to one and solve the equation for x).

+ +

Is that correct?

+",2013-10-15 16:09:01.513 +57546,9716.0,1,57553.0,,,Design fitness function for polynomial approximation,,CC BY-SA 3.0,"

I'm trying to apply a polynomial approximation for a given function (via Genetic Algorithms), and so far the results are not so good:

+ +
 # or any other GA package
+  require(gaoptim)
+
+  # for polyval
+  require(pracma)
+
+  # polynomial of degree 10
+  ndeg = 10  
+  ndim = ndeg + 1
+
+  # search limits
+  search.low = rep(-1, ndim)
+  search.up = rep(1, ndim)
+
+  # no. of data points
+  m = 101 
+  xi = seq(-1, 1, length = m)
+  yi = 1 / (1 + (5*xi)^2)
+
+  # fitness function
+  pfn = function(p) max(abs(polyval(c(p), xi) - yi))
+
+  # gaoptim perform maximization, so transform the fitness function
+  pfninv = function(p){ 1/(pfn(p) + 1) }
+
+  ## set up the ga
+  ga = GAReal(pfninv, search.low, search.up, popSize = 500)
+  ga$evolve(100)
+  y2 = polyval(ga$bestIndividual(), xi)
+
+  plot(xi, yi, ylim = range(c(yi, y2)), type = 'l', main = 'Runge function')
+  lines(xi, y2, col = 'red')
+
+ +

+ +

Is there any strategy I can apply here, or is this a no-no approach? Maybe a better fitness function, or expanded search limits? Higher values of popSize don't seem to help much.

+ +

Thanks for any insight!

+",2013-10-15 16:14:24.953 +57547,22718.0,1,,,,How to report significance of factor levels shown in summary of glm?,,CC BY-SA 3.0,"

The summary of my GLM shows day 12 of factor Date to be significant, but anova(model, test=""Chisq"") shows Date to be not significant overall. I know how to report the statistics from the Chisq table, but as I have z values in the summary table I am unsure how to report, or if I should report, that day 12 is significant.

+ +

Similarly, when finding that Date is significant, how do I report which specific dates seem to be important?

+ +

Thanks in advance

+ +

Lara

+ +

I have Fertility and Fecundity of female flies measured over 12 days. I want to check if there is a decline (or otherwise) in fertility/fecundity over this time.

+ +

Additionally, at day 13 females are mated (with a male from one of two different groups), and fertility and fecundity are measured until day 20. I want to use Date as a factor to identify significant peaks in fertility/fecundity i.e. after mating, and potential difference in peaks between groups.

+ +
Call:
+glm(formula = dda$Fertility.Absolute ~ sqrt(dda$Fecundity) + 
+    dda$Group + dda$Date + sqrt(dda$Fecundity):dda$Group + sqrt(dda$Fecundity):dda$Date, 
+    family = poisson)
+
+Deviance Residuals: 
+    Min       1Q   Median       3Q      Max  
+-3.1397  -0.6786  -0.4797   0.3596   3.7588  
+
+Coefficients:
+                                   Estimate Std. Error z value Pr(>|z|)    
+    (Intercept)                    -1.91501    0.51539  -3.716 0.000203 ***
+    sqrt(dda$Fecundity)             0.72372    0.12441   5.817 5.99e-09 ***
+    dda$Group2                      0.19540    0.19230   1.016 0.309585    
+    dda$Date4                       0.18117    0.62648   0.289 0.772439    
+    dda$Date6                      -0.28952    0.68983  -0.420 0.674706    
+    dda$Date8                       0.07111    0.60531   0.117 0.906480    
+    dda$Date10                      0.19557    0.62232   0.314 0.753325    
+    dda$Date12                      0.79619    0.60710   1.311 0.189696    
+    dda$Date14                      1.93702    0.53938   3.591 0.000329 ***
+    dda$Date16                      0.75623    0.58296   1.297 0.194554    
+    dda$Date18                     -0.05392    0.67805  -0.080 0.936618    
+    dda$Date20                     -0.26291    0.68841  -0.382 0.702530    
+    sqrt(dda$Fecundity):dda$Group2 -0.07309    0.04822  -1.516 0.129583    
+    sqrt(dda$Fecundity):dda$Date4   0.27388    0.17555   1.560 0.118734    
+    sqrt(dda$Fecundity):dda$Date6   0.37684    0.22832   1.651 0.098836 .  
+    sqrt(dda$Fecundity):dda$Date8   0.13017    0.13861   0.939 0.347674    
+    sqrt(dda$Fecundity):dda$Date10  0.04552    0.15345   0.297 0.766722    
+    sqrt(dda$Fecundity):dda$Date12 -0.16593    0.14861  -1.117 0.264186    
+    sqrt(dda$Fecundity):dda$Date14 -0.24864    0.12754  -1.949 0.051240 .  
+    sqrt(dda$Fecundity):dda$Date16  0.05496    0.14578   0.377 0.706170    
+    sqrt(dda$Fecundity):dda$Date18  0.15439    0.19341   0.798 0.424715    
+    sqrt(dda$Fecundity):dda$Date20 -0.02006    0.16314  -0.123 0.902161    
+---
+Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 
+
+(Dispersion parameter for poisson family taken to be 1)
+
+    Null deviance: 1327.36  on 359  degrees of freedom
+Residual deviance:  298.26  on 338  degrees of freedom
+AIC: 873.65
+
+Number of Fisher Scoring iterations: 5
+
+###
+
+Analysis of Deviance Table    
+Model: poisson, link: log    
+Response: dda$Fertility.Absolute    
+Terms added sequentially (first to last)    
+
+                                  Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
+NULL                                            359    1327.36              
+    sqrt(dda$Fecundity)            1   893.88       358     433.48 < 2.2e-16 ***
+    dda$Group                      1     0.03       357     433.45    0.8699    
+    dda$Date                       9    82.16       348     351.29  6.01e-14 ***
+    sqrt(dda$Fecundity):dda$Group  1     0.07       347     351.22    0.7859    
+    sqrt(dda$Fecundity):dda$Date   9    52.96       338     298.26  2.97e-08 ***
+
+",2013-10-15 16:40:46.753 +57548,20473.0,2,,57538.0,,,,CC BY-SA 3.0,"

Separate the exponents into terms that do not contain $x$ and those that do. You will obtain an integrand that can be written in the form $e^{-ax^2-bx}$ (all other terms come out of the integral, since you are integrating w.r.t. $x$). Then Gradshteyn & Ryzhik (2007), ""Table of Integrals, Series and Products"", 7th ed., p. 336, eq. 3.322(2) gives the formula:

+ +

$$\int_{0}^{\infty}\exp\left\{−\frac {x^2}{4\beta}−\gamma x\right\}dx = \sqrt {\pi\beta} \exp\left\{\beta \gamma^2\right\} \left[1-\operatorname{erf}(\gamma \sqrt \beta)\right]$$

+ +

I presume you can split a $\int_{-\infty}^{\infty}$ integral into two $\int_{0}^{\infty}$ integrals.

+ +

It's going to be a bit long and tedious, and it's easy to make an algebraic mistake. After you're done, remember the connection between the error function and the cdf of the standard normal.
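
+ +

As a check on the algebra (this is the standard normal-normal conjugacy result, stated without derivation), the end product should be

+ +

$$X \mid Y=y \;\sim\; N\left(\frac{\mu + \sigma^2 y}{1+\sigma^2},\; \frac{\sigma^2}{1+\sigma^2}\right).$$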

+",2013-10-15 16:43:38.873 +57549,14436.0,1,59371.0,,,Metric for nearest neighbor method,,CC BY-SA 3.0,"

Is there a requirement that the measure used in Nearest Neighbor methods be a proper metric distance? What will happen if I use an arbitrary function (e.g., one that does not satisfy the triangle inequality)?

+ +

How can you convert an arbitrary function to a valid metric distance?

+",2013-10-15 17:10:39.983 +57550,22719.0,1,,,,Proper Sampling - can I collect a two-group sample this way without issues?,,CC BY-SA 3.0,"

I need to collect a two group sample for a comparison analysis (perhaps using logistic regression).

+ +

The population that I need to extract a sample from is all firms from country A with activities in country B. The firms are classified into two categories: having a subsidiary in country B (S), or not having a subsidiary in country B (NS). I expect the share of S firms to be small relative to NS firms (but I have no way of knowing for sure).

+ +

I already hold the entire population of S firms (because this data was available to me). However, data on NS firms is not readily available and I have to collect it, and I will probably not be able to identify and collect all NS firms.

+ +

So my situation is that I have the entire population of S firms, and need to collect enough NS firms for the subsequent analysis to be significant. Most likely my final sample will consist of all S firms and some share of the population of NS firms. Without much experience in doing these kinds of studies, I can't help thinking that there is some kind of bias/reliability issue when sampling this way (one group: the entire group population; other group: some part of the group population). I have learned that if it so happens that the population of NS firms is indeed much larger than that of S firms (again, there is no way to know without data for the entire population of firms), and I e.g. end up with similar-sized samples of each group, there will be a case of oversampling the minority group. However, I cannot find any remarks anywhere that consider this a problem for a comparison study, as correct sample representation of the entire population is less important in this setting.

+ +

Is my concern justified? Or is it fine to do it that way for e.g. logistic regression? If not, how can I get around the issue?

+",2013-10-15 17:19:39.267 +57551,1805.0,1,,,,Mean Reciprocal Rank with GBM in R,,CC BY-SA 3.0,"

Let's say I'm optimizing MRR with a GBM in R:

+ +
library(gbm)
+generate.data <- function(N) {
+
+  # create query groups, with an average size of 25 items each
+  num.queries <- floor(N/25)
+  query <- sample(1:num.queries, N, replace=TRUE)
+
+  # X1 is a variable determined by query group only
+  query.level <- runif(num.queries)
+  X1 <- query.level[query]
+
+  # X2 varies with each item
+  X2 <- runif(N)
+
+  # X3 is uncorrelated with target
+  X3 <- runif(N)
+
+  # The target
+  Y <- X1 + X2
+
+  # Add some random noise to X2 that is correlated with
+  # queries, but uncorrelated with items
+
+  X2 <- X2 + scale(runif(num.queries))[query]
+
+  # Add some random noise to target
+  SNR <- 5 # signal-to-noise ratio
+  sigma <- sqrt(var(Y)/SNR)
+  Y <- Y + runif(N, 0, sigma)
+  Y <- ifelse(Y>median(Y), 1, 0)
+
+  data.frame(Y, query=query, X1, X2, X3)
+}
+
+set.seed(10)
+data.train <- generate.data(1000)
+gbm.mrr <- gbm(Y~X1+X2+X3,          # formula
+                data=data.train,     # dataset
+                distribution=list(   # loss function:
+                  name='pairwise',   # pairwise
+                  metric=""mrr"",     # ranking metric:
+                  max.rank=1,
+                  group='query'),    # column indicating query groups
+                n.trees=2000,        # number of trees
+                shrinkage=0.005,     # learning rate
+                interaction.depth=3, # number per splits per tree
+                bag.fraction = 0.5,  # subsampling fraction
+                train.fraction = 1,  # fraction of data for training
+                n.minobsinnode = 10, # minimum number of obs for split
+                keep.data=TRUE,      # store copy of input data in model
+                cv.folds=5,          # number of cross validation folds
+                verbose = FALSE,     # don't print progress
+                n.cores = 1)         # use a single core
+
+best.iter.mrr <- gbm.perf(gbm.mrr, method='cv')
+title('Training of pairwise model with mrr metric')
+
+ +

(Code adapted from here)

+ +

This produces the following plot, where the black line is in-sample MRR, and the green line is out-of-sample MRR. Clearly, GBM is minimizing MRR: it chooses the model with the smallest out-of-sample MRR (~0.3): +

+ +

I'm confused as to why gbm MINIMIZES mrr. It's my understanding that a MRR of 1 is perfect (best document is the first result, 1/1=1) and a MRR of close to zero is very bad (e.g. best document is the 1000th result, 1/1000 is close to zero).

+ +

Is my understanding of MRR correct? If so, then why does the gbm package minimize it?

+",2013-10-15 17:41:26.703 +57552,22721.0,1,,,,Include lower level (2-way and 3-way) interactions in 3-level multilevel model?,,CC BY-SA 3.0,"

I have a three level multilevel model (therapists, patients, repeated measures) in which I have included 4-way interactions, for instance:

+ +

therapist self-efficacy * treatment condition * patient on track (yes/no) * time

+ +

This tests the hypothesis that therapists who are in condition A and are higher on self-efficacy have a steeper slope with patients who are not on track.

+ +

Most of the lower-order interactions are non-significant and also not that relevant for interpretation. Should I still include all of them? I feel like I would be putting a lot of parameters in the model, and it would be difficult to interpret the lower-order interactions that are not significant.

+ +

Most fora state that it would be advisable to include lower-order interactions, but there are also some places where it is stated that there is no statistical reason to include them. Any advice or references on this situation?

+",2013-10-15 17:49:56.123 +57553,9483.0,2,,57546.0,,,,CC BY-SA 3.0,"

1) Your target function is yi = 1 /(1 + (5*xi)^2), which is not a polynomial, so it's going to be hard to approximate with a polynomial:

+ +

+ +

2) If you invert the target function, i.e. yi = 1 + (5*xi)^2, and increase your bounds, e.g. search.low = rep(-50, ndim); search.up = rep(50, ndim):

+ +

+ +

3) It's more common to take the root-mean-square error (RMSE) as the fitness function instead of max(abs(polyval(c(p), xi) - yi)).
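
+ +

For instance, with the objects already defined in the question's code, an RMSE-based fitness could look like this (pfn_rmse and pfn_rmse_inv are made-up names):

+ +

# RMSE-based fitness, reusing xi, yi and pracma::polyval from the question
+pfn_rmse <- function(p) sqrt(mean((polyval(c(p), xi) - yi)^2))
+
+# corresponding maximization target for gaoptim
+pfn_rmse_inv <- function(p) 1 / (pfn_rmse(p) + 1)
+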

+",2013-10-15 17:55:17.723 +57554,15363.0,1,,,,Modified ECDF in R,,CC BY-SA 3.0,"

The ecdf (empirical cumulative distribution function) in R, instead of giving $P(X \le x)$ for a random variable $X$, gives the proportion of observations in the data that are $\le x$.

+ +

I tried to look up modified ECDF but couldn't find any. Is there any standard function to do the mathematical ECDF? Or any workaround would be appreciated!

+",2013-10-15 18:13:52.963 +57555,22723.0,1,,,,Randomness and Probability,,CC BY-SA 3.0,"

Suppose the probability of a ""random"" event is very small (call this probability $p$). In real life, true randomness seems impossible. So would the actual true probability of the event be greater than $p$?

+",2013-10-15 18:20:33.817 +57556,22724.0,2,,57461.0,,,,CC BY-SA 3.0,"

Remember that continuous probability distributions can be represented analytically, as a curve in the plane. +Suppose we have two curves represented by f(x) and g(x). One way to define the distance between them is the greatest value, over all x, of the absolute value of f(x)-g(x), i.e. the largest vertical distance between the two curves. +If this value is small, then the functions are close. Otherwise, nothing is guaranteed. This forms the ""statistical distance.""
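
+ +

For instance, a rough numerical version of this distance between two densities evaluated on a grid (illustration only, using two arbitrary normal densities):

+ +

x <- seq(-5, 5, length.out = 1000)
+max(abs(dnorm(x, 0, 1) - dnorm(x, 0.5, 1)))   # largest vertical gap between the two curves
+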

+ +

Often, we want to see if our guess of the true probability distribution is correct. One way of doing this is to estimate, using computational methods, the density of the sample. Then, using the distance above, we can see how close our guess is.

+ +

This is just one reason I can think of.

+",2013-10-15 18:23:29.187 +57557,5045.0,2,,57492.0,,,,CC BY-SA 3.0,"

I think Cameron and Trivedi's Microeconometrics fits the bill. It strikes a nice balance between breadth, intuition, and rigor (if you follow up on the references). The target audience is the applied researcher. Not everything in Greene is covered:

+ +
I: PRELIMINARIES
+1. Overview
+2. Causal and Noncausal Models
+3. Microeconomic Data Structures
+
+II: CORE METHODS
+4. Linear models 
+5. ML and NLS estimation
+6. GMM and Systems Estimation
+7. Hypothesis Tests
+8. Specification Tests and Model Selection
+9. Semiparametric Methods 
+10. Numerical Optimization
+
+III: SIMULATION-BASED METHODS
+11. Bootstrap Methods
+12. Simulation-based Methods
+13. Bayesian Methods
+
+IV:  CROSS-SECTION DATA MODELS
+14. Binary Outcome Models
+15. Multinomial Models
+16. Tobit and Selection Models
+17. Transition Data: Survival Analysis
+18. Mixture Models and Unobserved Heterogeneity
+19. Models of Multiple Hazards
+20. Count Data Models
+
+V:  PANEL DATA MODELS
+21. Linear Panel Models: Basics
+22. Linear Panel Models: Extensions
+23. Nonlinear Panel Models
+
+VI: FURTHER TOPICS
+24. Stratified and Clustered Samples
+25. Treatment Evaluation
+26. Measurement Error Models
+27. Missing Data and Imputation
+
+APPENDICES
+A. Asymptotic Theory
+B. Making Pseudo-Random Draws
+
+ +

Peter Kennedy's Principles of Econometrics book does the same with more emphasis on intuition. The target audience is students of different sorts. All the material is covered at three levels. The first one provides the intuition and the main idea, the second one introduces some very basic notation, and the last one provides references to the more complicated topics. Sadly, the 6th edition is the last one. The TOC is here.

+",2013-10-15 18:28:05.363 +57576,14888.0,1,57577.0,,,Is this a repeated measures design or not?,,CC BY-SA 3.0,"

How do you describe, or what do you call, a test that uses two factors as independent variables but uses a dependent variable that is a difference between two measures taken repeatedly from the same individuals?

+ +

Usually, repeated measures means that multiple independent variables are measured on the same individuals. Here, it is the dependent, outcome variable that is the repeated measure. But, since the difference is taken for each individual, is this simply a two-way ANOVA?

+",2013-10-15 21:57:50.220 +25072,7421.0,1,25087.0,,,"Differences on exploratory factor analysis, confirmatory factor analysis and principal component analysis",,CC BY-SA 3.0,"

Before it is pointed, I am aware that a very similar question was already asked. Still, I am in doubt regarding the concept.

+

More specifically, it is mentioned by the most voted answer that:

+
+

In terms of a simple rule of thumb, I'd suggest that you:

+
    +
  1. Run factor analysis if you assume or wish to test a theoretical model of latent factors causing observed variables.

    +
  2. Run principal components analysis if you want to simply reduce your correlated observed variables to a smaller set of important independent composite variables.

    +
+
+

Question 1:

+

I am having difficulty understanding, based on the results I obtained from R, where exactly I am inputting my theoretical model of latent factors. I am using the functions from statsmethods. For both factanal() and princomp() the inputs were the same: a table where each row represented one data point and the columns consisted of the different attributes I was interested in reducing. This adds to my confusion about where this pre-assumed model plays its role. I noticed that for the factor analysis function I used parallel analysis, as suggested by the site, with the nScree() function to determine the number of factors, and I specified whether I wanted a varimax (orthogonal) or promax (oblique) rotation. Is that what is meant by the model? Being able to choose the number of factors and the type of rotation?

+

The results provided as visual graphs for both PCA and EFA also don't seem to highlight this difference, which adds to my confusion. Where can this distinction be observed in them?

+

+PCA

+

+EFA

+

Question 2: -- Answered

+

I bought a book on this subject by Richard L. Gorsuch. In this book the author draws attention to a difference between PCA (Principal Component Analysis) and EFA (Exploratory Factor Analysis): it is mentioned that PCA is for a population while EFA is for a sample. Is that true? I haven't seen it mentioned in any discussion I have read so far. Is it irrelevant?

+

Question 3:

+

I noticed that all those methods seem to impose a normal distribution constraint. I also read that for larger data sets this constraint can be ignored. Is that true, or are PCA, EFA and CFA sensitive to violations of the distributional constraint?

+

Question 4: Where in the results of PCA and EFA should I note that one is talking about latent factors (EFA) and the other is just clustering the variables into components (factors)? The outputs from R look the same to me. Is it just in the way I am supposed to interpret the factors shown as output? I noted that both show me a table where I can observe which of my variables are expressed most strongly by each factor. What is the difference in how I should interpret which variable belongs to which factor in PCA versus EFA? Is EFA saying that variables with higher loadings are better explained by that latent factor, while PCA is saying that the factor is a composite of those observed variables?

+

Question 5 +Finally the last question is regarding CFA (Confirmatory Factor Analysis).

+

On the same function website the following image is being shown:

+

+

I read that CFA usually follows EFA, for hypothesis testing. In that sense, EFA tells you which latent factors there are (the output factors), and then you use CFA, assuming the factors you observed from EFA, for hypothesis testing?

+

Question 6

+

For EFA one of the rotations available in the literature is direct oblimin. I heard that it can account for both promax and varimax, so 'it takes the best of both worlds'. Is that true? I am also trying to find a function that implements it in R, since the one suggested on the site does not. I would be happy to get any suggestion on this one.

+
+

I hope it is noted that this question is much more specific about the doubts regarding EFA and PCA, and also adds CFA, so that it does not get closed as a duplicate on the subject. If at least one of the questions is answered I will be more than happy, as it will help clarify the confusion in my head.

+

Thank you.

+",2012-05-14 05:55:32.040 +57558,19559.0,1,57564.0,,,Puzzling behavior of glmer(),,CC BY-SA 3.0,"

I'd like your opinion on a very strange behavior that I recently encountered running glmer(). The problem is that when I make the dependent variable into a logical vector, glmer behaves weirdly. My dependent variable is Accuracy, and it is coded in terms of 1 (accurate response) and 0 (wrong response). What puzzles me is that transforming accuracy to a logical vector should work the same way for glmer, as a logical vector is coded in terms of TRUE or FALSE, having also 2 levels. However, glmer gives me different results depending on the transformation of the dependent variable I use. Have you guys encountered this before? Do you know why it happens? Below is sample code so you can replicate the problem yourselves.

+ +
#Create fake data
+Subject   <- c(rep(""S1"",4), rep(""S2"",4), rep(""S3"",4), rep(""S4"",4))
+Item      <- rep(c(""I1"",""I2"",""I3"",""I4""),4)
+Factor1   <- c(c(rep(""e1"",2),rep(""e2"",2)), c(""e1"",""e2"",""e2"",""e1""), 
+           c(rep(""e2"",2),rep(""e1"",2)), c(""e2"",""e1"",""e1"",""e2""))                  
+Accuracy  <- c(1,1,0,0,1,0,1,0,1,0,1,1,1,1,1,1)
+
+#Create data frame and make ""Accuracy"" into a factor with 2 levels
+data          <- data.frame(Subject,Item,Factor1, Accuracy)
+data$Accuracy <- factor(data$Accuracy)  #Accuracy is a factor w/ 2 levels
+#Run glmer
+m1 <- glmer(Accuracy ~ Factor1 + (1+Factor1|Subject) + (1+Factor1|Item), family = ""binomial"", data= data)  
+summary(m1)
+Fixed effects:
+            Estimate Std. Error z value Pr(>|z|)  
+(Intercept)    1.946      1.069   1.820   0.0687 .
+Factor1e2     -1.946      1.282  -1.518   0.1290  
+---
+ Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+
+ +

That is the output of the first model. Now, look at what happens if I transform data$Accuracy into a logical vector when I run the model:

+ +
m2 <- glmer(as.logical(as.numeric(Accuracy)) ~ Factor1 + (1+Factor1|Subject) + (1+Factor1|Item), family = ""binomial"", data= data)  
+summary(m2)
+
+Fixed effects:
+             Estimate Std. Error z value Pr(>|z|)
+(Intercept) 2.557e+01  1.259e+05       0        1
+Factor1e2   2.223e-06  1.781e+05       0        1
+
+ +

As you can see, now the coefficient estimates are very different. As I said, this seems very puzzling to me and I'd like to know if you have some thoughts on why this should be.

+ +

Thanks a lot!

+ +

--Sol

+",2013-10-15 18:35:31.263 +57559,9456.0,1,,,,Confusion relative to derivative of partition function,,CC BY-SA 3.0,"

I have this partition function

+ +

+ +

Now if I take the derivative of log(Z(x)) wrt $\lambda_k$

+ +

the result is

+ +

+ +

I didn't get how this was derived. This is the paper

+",2013-10-15 18:36:50.983 +57560,22724.0,2,,57555.0,,,,CC BY-SA 3.0,"

When you flip a coin, you cannot know if it will fall on heads or tails. +There is a chance it will fall on heads and a chance it will fall on tails, both being one half. This seems like true randomness. The theory of probability has the purpose of assigning a certain probability to events.

+ +

Logically, your question makes no sense. You assume that something has a probability, and then ask if its probability is greater. You assume, for instance, that the odds of flipping a heads is 0.2, then you say there is no randomness, and conclude that the odds of flipping a heads is 0.4...

+",2013-10-15 18:39:42.453 +57561,22725.0,2,,27120.0,,,,CC BY-SA 3.0,"

They are absolutely NOT the same.

+ +

mean SQUARE error: square the quantity => calculate the error => calculate the mean

+ +

mean SQUARED error: calculate the error => square the result => calculate the mean

+",2013-10-15 18:43:28.630 +57562,21958.0,1,,,,Model validation in Bayesian statistics from a model with latent variables,,CC BY-SA 3.0,"

I am working with some two-regime autoregressive models first introduced by Hamilton in 1989. The specific models is of no great concern to my question, but some variables within my autoregressive models are latent binary variables - non-observable binary variables, that is. I have a dataset and use MCMC to find posterior densities of all the parameters of my models, including all the latent binary variables.

+ +

I have a dataset of about 1000 observations, and for every observation I have a latent variable in my models, which can either be 1 or 0. If my data suddenly changes, I can assume that a switch has been turned on and the latent variable has changed. So if the data, at a specific time, is (most likely) best described by the latent variable being 1, my sampler will produce a posterior distribution (for this specific latent variable) with high probability of 1 and low probability of 0.

+ +

So this is all good, I have managed to do this. The question is, how do I validate my model? AIC/BIC is just for comparing models. +What I am thinking is that I have to be able to plot the residuals somehow, like in normal regression where you just say that the residual is the difference between the observed and predicted result. +I understand that for all parameters other than the latent parameters/variables, I can just use the mean of each posterior distribution and treat it like a maximum likelihood estimate. But I can't take the mean of my posterior distributions for the latent variables, because they are either 0 or 1. It does not make any sense to use, for example, 0.8 for one of my latent variables. So this is not an option.

+ +

So how should I go on to validate my model? I am really stuck here. If I have done a bad job explaining my problem, I apologize and will try to explain better.

+ +
+ +

So I have been thinking. Can I use a predictive posterior distribution to validate my model? And if too many observations lie above, say, the 95th percentile of the predictive posterior distribution, do I throw away my model?

+ +

Generally, for any model whose parameters are called $\theta$, observations called $x$, and new observation called $x_{new}$, one calculates the predictive posterior distribution as follows: for $i = 1, \ldots, n$ (samples), we sample (1) $\theta^{(i)}$ from $p(\theta \mid x)$ and (2) $x_{new}^{(i)}$ from $p(x_{new} \mid \theta^{(i)})$. Then $x_{new}^{(1)}, \ldots, x_{new}^{(n)}$ are a sample from the predictive posterior distribution. But this method must surely only work when we have an iid sample. I am working with an autoregressive model. So I am stuck...
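Purely as a sketch of the mechanics described above (my addition, using a toy AR(1) model and made-up posterior draws in place of the Markov-switching model and the real MCMC output):

set.seed(1)
y <- arima.sim(model = list(ar = 0.6), n = 200)                 # toy observed series
theta_draws <- cbind(rnorm(1000, 0.6, 0.05),                    # fake posterior draws of the AR coefficient
                     abs(rnorm(1000, 1, 0.05)))                 # fake posterior draws of the innovation sd

# One replicated series per draw: a sample from the predictive posterior
y_rep <- t(apply(theta_draws, 1, function(th)
  arima.sim(model = list(ar = th[1]), n = length(y), sd = th[2])))

band <- apply(y_rep, 2, quantile, probs = c(0.025, 0.975))      # pointwise 95% band
mean(y < band[1, ] | y > band[2, ])                             # share of observations outside it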

+",2013-10-15 18:51:33.203 +57563,22726.0,1,,,,Sampling and Conditions,,CC BY-SA 3.0,"

Suppose we consider all the people in the planet. We are interested in randomly selecting 10 people who have heart disease. Is it better to repeatedly sample 10 people and then choose the one selection in which all 10 people have heart disease? Or is it better to only look at the people who have heart disease and sample 10 from that population?

+",2013-10-15 19:10:55.417 +57564,2857.0,2,,57558.0,,,,CC BY-SA 3.0,"

More of a programming question. Compare:

+ +
> as.logical(as.numeric(data$Accuracy)) 
+ [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE 
+[16] TRUE 
+> as.logical(as.numeric(Accuracy)) 
+ [1]  TRUE  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE 
+[13]  TRUE  TRUE  TRUE  TRUE 
+
+ +

You're performing the former with your call to glmer since you are using the data = ... argument

+ +
m2 <- glmer(as.logical(as.numeric(Accuracy)) ~ Factor1 + (1+Factor1|Subject) + (1+Factor1|Item), family = ""binomial"", data= data)
+
+ +

As to why this is happening:

+ +
> as.numeric(data$Accuracy) 
+ [1] 2 2 1 1 2 1 2 1 2 1 2 2 2 2 2 2 
+> as.numeric(Accuracy) 
+ [1] 1 1 0 0 1 0 1 0 1 0 1 1 1 1 1 1
+
+ +

Basically, as.numeric returns the underlying integer codes of the levels of a factor variable (here 1 and 2), and then as.logical treats all non-zero values, including negative ones, as TRUE. To get the original values back, you need to use

+ +
> as.numeric(levels(data$Accuracy)[data$Accuracy]) 
+ [1] 1 1 0 0 1 0 1 0 1 0 1 1 1 1 1 1 
+
+ +

Thus...

+ +
> m2 <- glmer(as.logical(as.numeric(levels(Accuracy)[Accuracy])) ~ Factor1 + (1+Factor1|Subject) + (1+Factor1|Item), family = ""binomial"", data= data)   
+> summary(m2) 
+... 
+Fixed effects: 
+            Estimate Std. Error z value Pr(>|z|)   
+(Intercept)    1.946      1.069   1.820   0.0687 . 
+Factor1e2     -1.946      1.282  -1.518   0.1290  
+
+",2013-10-15 19:12:09.373 +57565,22727.0,2,,57563.0,,,,CC BY-SA 3.0,"

So long as it is truly random, and your hypothetical list covers everyone on the planet, then it would be most efficient to only select from the group with heart disease as that is what you are interested in studying.

+ +

There shouldn't be any difference between randomly sampling 10 until you have 10 with heart disease and randomly sampling 10 from those that you know have heart disease. This is assuming your list of those with heart disease includes everyone with heart disease on the planet.

+",2013-10-15 19:21:21.580 +57566,22728.0,1,,,,symmetric r.v. raised to an odd power,,CC BY-SA 3.0,"

My prof claims that raising a symmetric r.v., like N(0,1), to an odd power gives a distribution with expectation 0. What's the best way to see this?
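A short sketch of the standard argument, added here for concreteness (and assuming the moment in question exists): write the expectation as an integral and substitute $u = -x$ on the negative half-line,

$$E\left[X^{2k+1}\right] = \int_{-\infty}^{\infty} x^{2k+1} f(x)\,dx = \int_{0}^{\infty} x^{2k+1} f(x)\,dx - \int_{0}^{\infty} u^{2k+1} f(-u)\,du = 0,$$

because symmetry gives $f(-u) = f(u)$ and the odd exponent makes the two integrals cancel.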

+",2013-10-15 19:27:55.210 +57569,21243.0,2,,57555.0,,,,CC BY-SA 3.0,"

Your question raises the philosophical question of whether or not randomness is possible - while it's an interesting subject, we pretty genuinely don't care.

+ +

We do care, however, about your particular interpretation of probability, which is the important part (though they both agree that the answer to your question is ""No, the probability is $p$"", based on their individual criteria). Briefly consider the two prevailing interpretations:

+ +
    +
  1. A Frequentist would say that the given probability $p$ is the fraction of times that the event occurs in a very large number of trials. In this case, the number $p$ arises as a result of a large number of empirical trials; it is our best estimate based on the experimental evidence at hand.
  2. A Bayesian would say that, based on some sufficient evidence, we have calculated the number $p$ to express how sure we are of the outcome of the event. Note that we need not have seen a large number of trials, or any trials at all; this is a direct epistemological claim about our sureness in the outcome of the event.
+ +

Both views agree that we could be wrong; however, $p$ is still the best bet we have. So in a way it is true that under both interpretations $p$ could be incorrect; however, it is not particularly helpful to say so and undermines the usefulness of using $p$ in the first place.

+",2013-10-15 20:18:20.350 +57570,668.0,2,,57545.0,,,,CC BY-SA 3.0,"

There are two huge problems with this question that make it unanswerable.

+ +

First, a sample space is a set of outcomes of an experiment. A random variable is a function assigning a unique real value to each outcome. A probability measure on the sample space determines the distribution of the random variable. Given only the distribution, we cannot possibly identify the sample space. For instance, the random variable assigning the value $0$ to ""tails"" and $1$ to ""heads"" to describe outcomes of the flip of a fair coin has sample space {""heads"", ""tails""}. It has a Bernoulli$(1/2)$ distribution. The random variable assigning the value $0$ to all points in the Earth's northern hemisphere and $1$ to all points in its southern hemisphere has a sample space consisting of all points on Earth. If all points are considered equally probable, then this variable, too, has a Bernoulli$(1/2)$ distribution--but obviously points on the Earth are not flips of a coin!

+ +

Second, let's re-interpret the question to ask about the set of possible values of a random variable (its range as a function), because maybe there is a chance this could be answered with the information given. Unfortunately, the question is still ambiguous. This can be shown by exhibiting two different distributions with different ranges that nevertheless satisfy the given conditions: namely, there exists some number $\pi$ such that the probability of each possible value $x$ is given by the formula $(1-\pi)^x\pi.$ Such a formula suggests (but does not explicitly indicate) that $x$ is intended to be integral, which helps limit our search for counterexamples.

+ +
    +
  1. Let the range be $\{-1, 0\}$. The total probability is

    + +

    $$1 = (1-\pi)^{-1}\pi + (1-\pi)^0\pi.$$

    + +

    One solution is $\pi = (3 - \sqrt{5})/2 \approx 0.381966.$

  2. Let the range be $\{-1, 0, 1\}$. The total probability is

    + +

    $$1 = (1-\pi)^{-1}\pi + (1-\pi)^0\pi + (1-\pi)^1\pi.$$

    + +

    One solution is a root of $x^3 - 3 x^2 + 4 x - 1$ approximately equal to $0.317672.$ (It is the only root lying between $0$ and $1.$)

+ +

The first distribution assigns probabilities $0.618034$ to $-1$ and $0.381966$ to $0$; the second distribution assigns probabilities $0.465571$ to $-1$, $0.317672$ to $0$, and $0.216757$ to $1$: obviously they are different and have different ranges.
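For the record (my addition, not part of the original answer), the value $\pi = (3-\sqrt{5})/2$ quoted in the first case comes from clearing the denominator:

$$1 = \frac{\pi}{1-\pi} + \pi \;\Longrightarrow\; 1-\pi = \pi + \pi(1-\pi) \;\Longrightarrow\; \pi^2 - 3\pi + 1 = 0 \;\Longrightarrow\; \pi = \frac{3-\sqrt{5}}{2},$$

keeping the root that lies in $(0,1)$; the cubic $x^3 - 3x^2 + 4x - 1$ in the second case is obtained in exactly the same way.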

+",2013-10-15 20:50:42.840 +57571,21840.0,2,,57451.0,,,,CC BY-SA 3.0,"

By drawing the regions and taking integrals we get:

+ +

$P(V^{2} - 4UW \geq 0) = 1 - P(V^{2} < 4UW,\ 0\leq U,V,W \leq 1)$ +$=1 - P(0 \leq V < 2\sqrt{UW},\ 0\leq U,V,W \leq 1)$ +$=1 - P(0 \leq V < \min(1,2\sqrt{UW}),\ 0\leq U,W \leq 1)$ +$=1 - \left[\int_0^\frac{1}{4}\int_0^1\int_0^{2\sqrt{uw}}dv\,dw\,du + \int_\frac{1}{4}^1\int_0^{\frac{1}{4u}}\int_0^{2\sqrt{uw}}dv\,dw\,du + \int_{\frac{1}{4}}^1\int_\frac{1}{4u}^1\int_0^1dv\,dw\,du\right]$

+ +

Solving the integrals we get the same answer, that is, approx 0.2544.
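A quick Monte Carlo check (my addition, assuming the event of interest is $V^2 \ge 4UW$ as written above):

set.seed(123)
n <- 1e6
u <- runif(n); v <- runif(n); w <- runif(n)
mean(v^2 >= 4 * u * w)   # should come out close to 0.2544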

+",2013-10-15 21:03:50.640 +57572,22732.0,1,,,,Prediction based on repeated measure with binary outcome,,CC BY-SA 3.0,"

I'd like to make a prediction about a subject's likelihood that a certain outcome will occur based on a series of measurements taken over time.

+ +

The easiest way to explain the problem is as follows. Imagine that I'm tracking an individual over time to generate a probability of ever getting lung cancer.

+ +

The training data would look something like this with the outcome being ""has cancer""

+ +

+ID           age   cigarettes  asbestos   lung        has cancer
+                     per day   exposure   capacity
+S_1           5       0           N          20             N
+S_1           17      20          N          50             N
+S_1           31      50          N          30             N
+S_1           42      0           N          40             Y
+S_2           15      0           N          55             N
+S_2           37      0           N          45             N
+S_3           5       3           Y          15             N
+S_3           50      0           Y          30             Y
+etc...
+
+ +

The test data would be taken before an individual has gotten cancer and would predict their probability of getting cancer at any time in the future.

+ +

What's the best way to model this?

+ +

I'm currently experimenting with mixed models, treating this like a repeated measure problem. However, the dependent variable (whether someone ever gets cancer) is not being repeatedly measured over time. Instead it is fixed, so it seemed this might not be the best approach. As you can see there are additional problems of the dataset being unbalanced, with measures taken at irregular periods.

+ +

EDIT 1: Would it be better to think of this in terms of survival analysis? I assume the event in this case would be contracting cancer. If so how do I deal with deaths not related to lung cancer?

+ +

Edit 2: I'm thinking now of using a joint model that combines a mixed effect model with a survival analysis model: http://rwiki.sciviews.org/doku.php?id=packages:cran:jm. Any thoughts as to whether this might work for my problem? It seemed that I could use the hazard function to get a prediction for likelihood of ever getting cancer.

+",2013-10-15 21:24:40.827 +57573,132.0,2,,57568.0,,,,CC BY-SA 3.0,"

This was studied by Granger and Morris (1976) who showed that

+ +

AR($p$) + AR($q$) = ARMA($p+q,\max(p,q)$).

+",2013-10-15 21:33:45.000 +57574,668.0,2,,57563.0,,,,CC BY-SA 3.0,"

The question of equivalence is settled by considering every possible outcome of the sampling procedure and determining its chance of being the sample. When two procedures select all outcomes with the same probabilities, they are statistically equivalent.

+ +
+ +

In the first case, we repeatedly sample subsets of $10$ people out of all $N$ on the planet. We do so in a way that selects every one of the $\binom{N}{10}$ distinct subsets with equal probability. We repeat until the one we select is a subset of the $M\le N$ people who have heart disease. (Let's call such a subset ""desirable"" and all other subsets ""undesirable."") That makes the procedure somewhat complicated: it is a process that can take an arbitrarily large number of steps.

+ +

One way to compute the chance of a particular set of diseased patients being selected is to break down the process according to how many steps were taken: that set was either selected on the first attempt, or on the second attempt, or, ..., or on the $K^\text{th}$ attempt, or ... . The chance of being selected on the $K^\text{th}$ attempt is the chance of being selected on that attempt given that on every one of the preceding $K-1$ attempts, one of the $\binom{N}{10}-\binom{M}{10}$ undesirable subsets was selected. Assuming the separate attempts were independent, this chance is computed by multiplying the chances at each attempt, giving

+ +

$$\left(\frac{\binom{N}{10}-\binom{M}{10}}{\binom{N}{10}}\right)^{K-1}\frac{\binom{M}{10}}{\binom{N}{10}}$$

+ +

for the chance of being selected on the $K^\text{th}$ attempt. Summing these values for $K=1, 2, \ldots$ gives the chance of being selected. Fortunately, we do not actually have to calculate this chance: it suffices to observe that it does not depend on the particular set we have under consideration, but only on the numbers $N,$ $M,$ and $10.$ Thus, the first sampling procedure selects all desirable sets with equal probability. Obviously that's what the second procedure does (by design), so the procedures are statistically equivalent, even though they have been conducted differently.
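To make this concrete, here is a small simulation sketch (my addition) with a deliberately scaled-down population so the repeat-until-desirable procedure finishes quickly; both procedures should select every diseased pair with roughly equal frequency:

set.seed(7)
N <- 12; M <- 5; k <- 2                  # scaled-down population, diseased count, sample size
diseased <- 1:M

sample_until  <- function() {            # procedure 1: resample everyone until all k are diseased
  repeat {
    s <- sample(N, k)
    if (all(s %in% diseased)) return(sort(s))
  }
}
sample_direct <- function() sort(sample(diseased, k))   # procedure 2: sample the diseased directly

reps <- 20000
t1 <- table(replicate(reps, paste(sample_until(),  collapse = ""-"")))
t2 <- table(replicate(reps, paste(sample_direct(), collapse = ""-"")))
round(t1 / reps, 3)
round(t2 / reps, 3)                      # both near 1 / choose(5, 2) = 0.1 for every pair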

+ +
+ +

This analysis ought to look like overkill, because it is: it should be obvious that all desirable sets have the same chance of being selected using either procedure, because no individual is favored in the selection nor are the selections of any individuals interdependent (as would happen if, for instance, entire households of people were selected at a time). The reason for presenting this analysis in such detail is to demonstrate, with a simple example, how one might go about evaluating any sampling procedure of a defined population: namely, you determine the chance of every possible sample.

+ +

Although the concept is simple, the results can be very illuminating in complex situations, such as hierarchical sampling or spatial sampling schemes. For instance, a common way to sample soils in a field is to pick a random origin and a random orientation, lay out a regular grid of points starting at that origin and oriented accordingly, and taking a sample at each grid point that falls within the field. By emulating the analytical process exemplified here, you will be led to discover that many possible sets of samples have no chance of being selected at all. For instance, they won't all be in the same half of the field. This provides valuable insight into why the usual statistical procedures are not valid to apply to such samples (except, occasionally, as approximations).

+",2013-10-15 21:39:35.160 +57575,3183.0,2,,57543.0,,,,CC BY-SA 3.0,"

I'm not sure how you got 5 degrees of freedom with seven independent variables for your glm-based model, but I'll assume that's just a typo somewhere or that I'm missing something minor.

+ +

Anyway, counting degrees of freedom with models that are constrained by a prior can be tricky, and there isn't necessarily a ""correct"" way to do it in many cases. Perhaps the authors of arm used -1 degrees of freedom as a way to keep people from blindly misinterpreting the results.

+ +

Although we can't easily calculate the number of degrees of freedom for most regularized models, we can at least put an upper bound on it: the number of degrees of freedom must be less than or equal to the degrees of freedom for the corresponding un-regularized model.

+ +

So (assuming the 5 degrees of freedom you reported above is correct), you can plug in 5 and be confident that the true P-value will be no larger than what your Chi-square test predicts. Thus, if it's significant with 5 degrees of freedom, the true value will also be significant.

+ +

If you want something more exact, you might want to look into using the lasso or ridge regression for regularization instead: statisticians have invested a lot of effort into counting degrees of freedom for these models, and have even developed some significance tests for them. Andrew Gelman talks about one recent advance on his blog here.

+ +

Edited to add: If you do stick with bayesglm but don't trust the null deviance estimates, you can find it yourself by running a model with no predictors except the intercept. The formula syntax for this would be a ~ 1.

+",2013-10-15 21:46:27.687 +58281,3580.0,2,,58279.0,,,,CC BY-SA 3.0,"

Obviously whenever $X_1,X_2$ are independent but I guess that's not the point.

+ +

My go-to for dependent rvs is $U$ uniform on $[0,1]$ and take $X_1 = \sin(2\pi U), X_2=\cos(2\pi U)$. This basically says that if you pick a point uniformly on the unit circle then the coordinate functions are uncorrelated. This fact boils down to showing +$$ +\int_0^{2\pi} \sin(t)\cos(t) \ dt = 0. +$$

+",2013-10-26 22:11:17.093 +57577,503.0,2,,57576.0,,,,CC BY-SA 3.0,"

It could be called a change score design.

+ +

If you have only two repetitions of the measure, then it is one reasonable choice. However, it is very good to have more than two repetitions. If the dependent variable is measured with error (and which ones aren't?) then the change score is partly due to statistical error. e.g. suppose two people have identical true scores at both time 1 and time 2. But, for random reasons, they won't score the same at time 1 or time 2. Indeed, since

+ +

$O = T + E$

+ +

where O is observed score, T is true score and E is error, then, if $T_{11} = T_{21}$ and $T_{12} = T_{22}$ (where $T_{ij}$ is the score for person i at time j) then the change scores are just the changes in the errors and it's all regression to the mean.
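A tiny simulation sketch (my addition) of this point: when true scores do not change at all, the observed change score is pure error and is negatively correlated with the baseline measurement:

set.seed(1)
n  <- 1000
T1 <- rnorm(n)                 # true scores at time 1
T2 <- T1                       # true scores unchanged at time 2
O1 <- T1 + rnorm(n)            # observed = true + error
O2 <- T2 + rnorm(n)
change <- O2 - O1
cor(change, O1)                # strongly negative: regression to the mean in the change score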

+",2013-10-15 22:08:23.610 +57578,16144.0,1,,,,GLM with two related predictors (X and X-squared),,CC BY-SA 3.0,"

I am running a general linear model in which I have two predictors: X and X-squared.

+ +

I entered both these predictors in my analysis because I think X might explain the variance in the outcome measure partially linearly and partially in a quadratic fashion.

+ +

Obviously, there is multicollinearity in this example. However, I was wondering if there are maybe some reasons why it is not a good idea to put both these predictors in my model.
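Not part of the original question, but for concreteness, such a model is typically specified in R along the following lines; poly() gives orthogonal linear and quadratic terms, which removes the mechanical correlation between X and X-squared (centering X first has a similar effect):

set.seed(1)
x <- runif(100, 0, 10)
y <- 2 + 0.5 * x + 0.1 * x^2 + rnorm(100)

fit_raw  <- lm(y ~ x + I(x^2))         # raw linear + quadratic terms
fit_orth <- lm(y ~ poly(x, 2))         # orthogonal polynomial terms

cor(x, x^2)                            # large: the source of the collinearity
cor(poly(x, 2)[, 1], poly(x, 2)[, 2])  # essentially zero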

+",2013-10-15 22:18:53.827 +57579,22736.0,1,57580.0,,,In statistics what does NA stand for?,,CC BY-SA 3.0,"

I understand that NA means data is missing, null or not present. But what do the letters NA stand for? ""Not Available""?

+",2013-10-15 22:50:21.217 +57580,9483.0,2,,57579.0,,,,CC BY-SA 3.0,"

In datasets, NA can mean:

+ +
    +
  • ""Not Available"": e.g. the sensor was down at the time of the measure,
  • ""Not Applicable"": e.g. when asking a bachelor the name of his wife,
  • ""No Answer"": e.g. the respondent to a questionnaire skipped a question.
+",2013-10-15 22:53:52.320 +57581,1741.0,2,,57453.0,,,,CC BY-SA 3.0,"

It depends on which language you are more familiar with.

+ +

randomForest package implements the original Fortran version of Breiman's random forest. You should try to modify the Fortran code then.

+ +

The party package has everything implemented in C. So, you can try to modify the C code.

+ +

WEKA RandomForest is implemented in Java and involves the classes Bagging and RandomTree.

+ +

Honestly, I am more familiar with Java and I would use WEKA then. I actually implemented some ensemble pruning techniques in WEKA and it was pretty simple.

+",2013-10-15 23:13:06.160 +57582,22740.0,1,,,,R correlation between two time series analysis,,CC BY-SA 3.0,"

This is my very first post on CV so comment if I can improve my post.

+ +

I have two websites that sell very similar products and also have a very similar group of customers.

+ +

I was trying to prove that concept that ""Fast Moving Inventory From Web A Will Also Move Fast on Web B""

+ +

I have the daily snapshots of the inventory of both WebA and WebB. For example, for the product A from Web A, data looks like this:

+ +
TimeStamp  Product  Stock  UnitPrice
+Oct 1st    A        100    1.2
+Oct 2nd    A        90     1.2
+Oct 3rd    A        40     1.2
+Oct 4th    A        240    1.2
+..
+
+ +

For those two websites, some of their products are exactly the same. For example, Web A and Web B both have product A. I am wondering whether there is a way to use R to do some time series analysis so I can show that, for those products they have in common, there is a very high correlation (Web A sold product A like crazy last weekend, which also shows up as good sales in Web B's data).

+ +

Are there some R functions to do what I want? Then I can tell Web B that it needs to carry those top sellers of Web A that it doesn't carry now.
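Not an endorsement of any particular model, but a common starting point in R is the cross-correlation of the two (differenced) stock series for a shared product. A sketch with made-up vectors standing in for the daily snapshots:

stock_a <- c(100, 90, 40, 240, 200, 150, 90, 60, 30, 210)   # made-up daily stock, product A on Web A
stock_b <- c(120, 100, 60, 230, 210, 160, 100, 70, 40, 220) # made-up daily stock, product A on Web B

sales_a <- -diff(stock_a)      # daily sales approximated by stock decreases (restocks show up as negatives)
sales_b <- -diff(stock_b)

ccf(sales_a, sales_b)          # cross-correlation at a range of lags
cor(sales_a, sales_b)          # plain correlation at lag 0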

+",2013-10-15 23:42:13.630 +57583,22741.0,1,,,,Estimate the parameters of beta exponential distribution via L-Moments,,CC BY-SA 3.0,"

How can I estimate the 3 parameters of the beta exponential distribution via L-moments in the case of type-1 censored samples?

+",2013-10-16 00:11:38.053 +57584,15539.0,1,57587.0,,,Partial F ratio from ANOVA table,,CC BY-SA 3.0,"

In multiple regression, if you have just an ANOVA table, and nothing else, no specific data, how can you do a partial F test on X1, given X2 is already in the model?

+ +

So, you have the ANOVA table:

+ +
source        df   SS    MS     F
+-----------------------------------
+regression    2    1.44  0.72   9.72
+error         3    0.22  0.07
+total         5    1.66
+-----------------------------------
+
+ +

All values are filled in. With only this information, how can you do the partial F test where:

+ +
    +
  • F = MSR(X1|X2) / MSE(X1, X2)
  • MSR(X1|X2) = SSR(X1, X2) - SS(X2) = 1.44 - ????
  • MSE(X1, X2) = MSE = 0.07
+ +

SSR(X1, X2) can be obtained from the table (SS regression) +MSE(X1, X2) can also be obtained from the table (just MSE) +but I cannot get SS(X2) from the table, as far as I know......

+ +

As far as I know, you need specific X and Y values to do this. Any other way from just the table?

+",2013-10-16 01:00:58.310 +57585,10684.0,2,,57559.0,,,,CC BY-SA 3.0,"

Write $\mathbf{x} = \mathbf{x}^{(i)}$ just to avoid writing superscript $(i)$'s everywhere. +$$\frac{\partial}{\partial \lambda_k} \log Z(\mathbf{x}) = \frac{1}{Z(\mathbf{x})} \frac{\partial}{\partial \lambda_k} Z(\mathbf{x}) = \frac{1}{Z(\mathbf{x})} \sum_\mathbf{y} \frac{\partial}{\partial \lambda_k} \exp\left(\sum_{j=1}^K \lambda_j f_j(y_t, y_{t-1}, \mathbf{x}_t)\right)$$ +which, using the ordinary chain rule for functions of one variable, gives +$$\frac{1}{Z(\mathbf{x})} \sum_\mathbf{y} f_k(y_t, y_{t-1}, \mathbf{x}_t) \exp\left(\sum_{j=1}^K \lambda_j f_j(y_t, y_{t-1}, \mathbf{x}_t)\right) = \sum_\mathbf{y}f_k(y_t, y_{t-1}, \mathbf{x}_t) \frac{\exp\left(\sum_{j=1}^K \lambda_j f_j(y_t, y_{t-1}, \mathbf{x}_t)\right)}{Z(\mathbf{x})} $$ +By the definition of $p(\mathbf{y}|\mathbf{x})$ in Equation 1.16 of the paper, this is +$$\sum_\mathbf{y} f_k(y_t, y_{t-1}, \mathbf{x}_t) p(\mathbf{y}|\mathbf{x}).$$ +Now $\mathbf{y}$ is shorthand for $(y,y')$ and the sum over all possible $\mathbf{y}$ has been rewritten as $\sum_{y, y'}$.

+",2013-10-16 01:03:06.297 +57597,22747.0,1,,,,Which AIC value to use from R's sarima() function for model comparison,,CC BY-SA 3.0,"

I'm using R's 'astsa' package and I get the following output from sarima.

+ +

Which AIC value would I use to compare this model (let's call it A) against others? When trying another model (B), model A's fit$AIC (858.19) is greater than model B's, but model A's AIC (12.38841) is less than model B's, so I'm not sure which model to choose.

+ +

What's the difference between the two AIC's, AICc's, and BIC's? I've checked the sites below, among others, but haven't been able to figure it out. Any help is much appreciated.

+ +

http://stat.ethz.ch/R-manual/R-patched/library/stats/html/arima.html

+ +

http://www.inside-r.org/packages/cran/astsa/docs/sarima

+ +
$fit
+Series: xdata 
+ARIMA(0,1,1)(1,1,1)[12]                    
+
+Coefficients:
+         ma1    sar1     sma1
+     -0.3282  0.5529  -0.8835
+s.e.   0.3290  0.4751   0.8635
+
+sigma^2 estimated as 82513:  log likelihood=-425.1
+AIC=858.19   AICc=858.93   BIC=866.5
+
+$AIC
+[1] 12.38841
+
+$AICc
+[1] 12.42448
+
+$BIC
+[1] 11.48327
+
+",2013-10-16 06:55:06.430 +57586,20473.0,2,,57319.0,,,,CC BY-SA 3.0,"

(This answer uses results from W.H. Greene (2003), Econometric Analysis, 5th ed. ch.21)

+ +

I will answer the following modified version, which I believe accomplishes the goals of the OP's question : ""If we only estimate a logit model with one binary regressor of interest and some (dummy or continuous) control variables, can we tell whether dropping the control variables will result in a change of sign for the (coefficient of) the regressor of interest?""

+ +

Notation: Let $RA\equiv Y$ be the dependent variable, $HHS \equiv X$ the binary regressor of interest and $\mathbf Z$ a matrix of control variables. The size of the sample is $n$. Denote $n_0$ the number of zero-realizations of $X$ and $n_1$ the number of non-zero realizations, $n_0+n_1=n$. Denote $\Lambda()$ the cdf of the logistic distribution.
+Let the model including the control variables (the ""unrestricted"" model) be

+ +

$$M_U : \begin{align} &P(Y=1\mid X,\mathbf Z)=\Lambda(X, \mathbf Z,b,\mathbf c)\\ &P(Y=0\mid X,\mathbf Z)=1-\Lambda(X, \mathbf Z,b,\mathbf c) \end{align}$$

+ +

where $b$ is the coefficient on the regressor of interest.
+Let the model including only the regressor of interest (the ""restricted"" model) be

+ +

$$M_R : \begin{align} &P(Y=1\mid X)=\Lambda(X, \beta)\\ &P(Y=0\mid X)=1-\Lambda(X,\beta) \end{align}$$

+ +

STEP 1

+ +

Consider the unrestricted model. The first-derivative of the log-likelihood w.r.t to $b$ and the condition for a maximum is

+ +

$$\frac {\partial \ln L_U}{\partial b}= \sum_{i=1}^n\left[y_i-\Lambda_i(x_i, \mathbf z_i,b,\mathbf c)\right]x_i=0 \Rightarrow b^*: \sum_{i=1}^ny_ix_i=\sum_{i=1}^n\Lambda_i(x_i, \mathbf z_i,b^*,\mathbf c^*)x_i \;[1]$$

+ +

The analogous relation for the restricted model is +$$\frac {\partial \ln L_R}{\partial \beta}= \sum_{i=1}^n\left[y_i-\Lambda_i(x_i,\beta)\right]x_i=0 \Rightarrow \beta^*: \sum_{i=1}^ny_ix_i=\sum_{i=1}^n\Lambda_i(x_i, \beta^*)x_i \qquad[2]$$

+ +

We have

+ +

$$\Lambda_i(X,\beta^*) = \frac {1}{1+e^{-x_i\beta^*}}$$

+ +

and since $X$ is a zero/one binary variable relation $[2]$ can be written

+ +

$$\beta^*: \sum_{i=1}^ny_ix_i=\frac {n_1}{1+e^{-\beta^*}} \qquad[2a]$$

+ +

Combining $[1]$ and $[2a]$ and using again the fact that $X$ is binary we obtain the following equality relation between the estimated coefficients of the two models:

+ +

$$\frac {n_1}{1+e^{-\beta^*}} = \sum_{i=1}^n\Lambda_i(x_i, \mathbf z_i,b^*,\mathbf c^*)x_i $$

+ +

$$\Rightarrow \frac {1}{1+e^{-\beta^*}} = \frac {1}{n_1}\sum_{x_i=1}\Lambda_i(x_i=1, \mathbf z_i,b^*,\mathbf c^*) \qquad [3]$$

+ +

$$\Rightarrow \hat P_R(Y=1\mid X=1) = \hat {\bar P_U}(Y=1\mid X=1,\mathbf Z) \qquad [3a]$$

+ +

or in words, that the estimated probability from the restricted model will equal the restricted average estimated probability from the model that includes the control variables.

+ +

STEP 2
+For a sole binary regressor in a logistic regression, its marginal effect $m_R(X)$ is

+ +

$$ \hat m_R(X)= \hat P_R(Y=1\mid X=1) - \hat P_R(Y=1\mid X=0)$$

+ +

$$ \Rightarrow \hat m_R(X) = \frac {1}{1+e^{-\beta^*}} - \frac 12$$

+ +

and using $[3]$

+ +

$$ \hat m_R(X) = \frac {1}{n_1}\sum_{x_i=1}\Lambda_i(x_i=1, \mathbf z_i,b^*,\mathbf c^*) - \frac 12 \qquad [4]$$

+ +

For the unrestricted model that includes the control variables we have

+ +

$$ \hat m_U(X)= \hat P_U(Y=1\mid X=1, \bar {\mathbf z}) - \hat P_U(Y=1\mid X=0, \bar {\mathbf z})$$

+ +

$$\Rightarrow \hat m_U(X) = \frac {1}{1+e^{-b^*-\bar {\mathbf z}'\mathbf c^*}} - \frac {1}{1+e^{-\bar {\mathbf z}'\mathbf c^*}} \qquad [5]$$

+ +

where $\bar {\mathbf z}$ contains the sample means of the control variables.

+ +

It is easy to see that the marginal effect of $X$ has the same sign as its estimated coefficient. Since we have expressed the marginal effect of $X$ from both models in terms of the estimated coefficients from the unrestricted model, we can estimated only the latter, and then calculate the above two expressions ($[4]$ and $[5]$) which will tell us whether we will observe a sign reversal for the coefficient of $X$ or not, without the need to estimate the restricted model.

+",2013-10-16 01:22:22.130 +57587,5237.0,2,,57584.0,,,,CC BY-SA 3.0,"

As @PatrickCoulombe hints, you can't conduct a partial F-test with (only) the information in that ANOVA table. Let's assume you want to conduct the partial F-test for X1, where your full model includes both X1 and X2. In that case, you would need the ANOVA table for the full model, and the ANOVA table for the reduced model, which would only include X2. The reason you can't find the number to put in place of the ""????"" is because that number isn't listed in the ANOVA table you have access to--you need the reduced model ANOVA table as well.

+ +

The equations you list in the question aren't quite right. Your equation for the F ratio is right, and MSE is right, but your equation for MSR is actually for SS(X1|X2). Having calculated that, you get the MS(X1|X2) by dividing by the appropriate degrees of freedom, which is the degrees of freedom for those regressors that were dropped / you are testing (in your case, I'm guessing the df for X1=1). You calculate the F by dividing the two MSs, as you list; the realized F can be assessed against the theoretical distribution for F with numerator degrees of freedom equal to the df for dropped predictors, and denominator df equal to the df(residual) in the full model.
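To make the mechanics concrete, here is a minimal R sketch (my addition) of the partial F-test computed from the two fitted models, which supplies exactly the reduced-model information that is missing from the single ANOVA table:

set.seed(1)
x1 <- rnorm(30); x2 <- rnorm(30)
y  <- 1 + 0.5 * x1 + 0.8 * x2 + rnorm(30)

reduced <- lm(y ~ x2)          # X2 only
full    <- lm(y ~ x2 + x1)     # X1 and X2
anova(reduced, full)           # partial F-test for adding X1 given X2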

+ +

For a fuller understanding of this topic, it might help you to read my answers here: Testing for moderation with continuous vs categorical moderators, and possibly here: how to interpret type I (sequential) ANOVA and MANOVA.

+",2013-10-16 01:38:21.057 +57588,15539.0,1,,,,"Best regression model, given coefficient of variation $R^2$ and mean-squared error",,CC BY-SA 3.0,"

If you have 3 separate models in a multiple regression problem (and 3 ANOVA tables), which would be best given that you have the coefficient of determination, $R^2$, and mean-squared error values?

+ +

So you have 1 table, with just $X_1$, another with just $X_2$ and a third with $X_1$, $X_2$ combined. Which is best given specific $R^2$ values and MSE values?

+ +

I assume you're looking for the highest $R^2$ values, but how does MSE play into this?

+",2013-10-16 02:18:25.063 +57589,22743.0,1,,,,Plotting data from several files on one plot,,CC BY-SA 3.0,"

I have more than one file of data from different experiments. For the sake of the argument here, let's say I have three files E1.txt, E2.txt, E3.txt.

+ +

I anticipate that I will have more files in future, say E4.txt, E5.txt, etc.

+ +

Each of these files has two columns, Time(h) and Growth. Plotting data from one file, for instance, E1.txt, is simple. I just read the data from E1.txt using the read.table function in R. I then assign the data from E1.txt to variable E1.

+ +

Then I just use the plot function this way, plot(E1).

+ +

Now, what do I need to write in a script that will plot all the data from the E1.txt, E2.txt, and E3.txt files on the same plot?

+ +

I thought of writing plot(c(E1,E2,E3)), but it doesn't work.

+ +

Thanks for the help.

+",2013-10-16 03:44:01.090 +57590,9483.0,2,,57589.0,,,,CC BY-SA 3.0,"

One way to do it is to use points:

+ +
x <- seq(0, 2*pi, len = 51)
+y1 = sin(x)
+y2 = cos(x)
+plot(x, y1)
+points(x, y2, col = ""red"")
+
+ +

+ +

If your data files share a common axis, you can use matplot:

+ +
a <- matrix(rnorm(20), nrow = 5)   # 5 x 4 matrix: four series of five points
+matplot(a, type = ""b"")
+
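Building on the answer above, a possible sketch for reading several files in a loop and overlaying them; the file names and the assumption of a header row with Time and Growth columns come from the question:

files <- c(""E1.txt"", ""E2.txt"", ""E3.txt"")         # add E4.txt, E5.txt, ... later
dat   <- lapply(files, read.table, header = TRUE)   # assumes a header row with Time and Growth columns

xr <- range(sapply(dat, function(d) range(d[, 1])))
yr <- range(sapply(dat, function(d) range(d[, 2])))
plot(NA, xlim = xr, ylim = yr, xlab = ""Time (h)"", ylab = ""Growth"")
for (i in seq_along(dat)) lines(dat[[i]][, 1], dat[[i]][, 2], col = i)
legend(""topleft"", legend = files, col = seq_along(files), lty = 1)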
+ +

+",2013-10-16 04:11:24.593 +57598,95860.0,1,,,user2885313,How to calculate the p-value for a binomial test using pbinom?,,CC BY-SA 3.0,"

25 patients receive treatment, and they each get a difference score (After-Before). 18 of them get positive scores. I would like to test the null hypothesis that p=0.5.

+ +

I know that 1-pbinom(17,25,0.5) gives me the probability of 18 or more patients getting a positive score. My question is, to calculate the p-value, why do we have to do

+ +
2*(1-pbinom(17,25,0.5))
+
+ +

That is, why do we multiply by 2 here?
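For what it's worth (my addition): the doubling corresponds to a two-sided alternative, and because the null value p = 0.5 makes the binomial distribution symmetric, it agrees with what binom.test() reports:

binom.test(18, 25, p = 0.5)      # two-sided exact binomial test of H0: p = 0.5
2 * (1 - pbinom(17, 25, 0.5))    # one-sided tail probability, doubled by hand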

+",2013-10-16 07:12:18.720 +57599,20463.0,2,,57518.0,,,,CC BY-SA 3.0,"

Unfortunately there are many possibilities.

+ +

1) Are the seasonal factors stochastic or deterministic? +2) Do these seasonal factors affect both the dependent and independent variables? +3) Do you have a system of simultaneous dynamic equations which has to be estimated jointly?

+ +

Following article by K. Wallis is interesting.

+ +

http://www.nber.org/chapters/c3905.pdf

+ +

In most detailed level you have a multivariate input-output system with very complicated transfer function - polynomial matrix operator structure.

+ +

It seems that doing seasonal adjustment for the time series separately leads, at the very least, to inefficient estimates.

+ +

Regards,

+ +

-A

+",2013-10-16 07:15:01.223 +57591,21243.0,2,,53439.0,,,,CC BY-SA 3.0,"

As the user above points out, this is not simply an issue of choosing a machine learning method and letting it go wild on the data. Particularly, the introduction of decision theory (even in a basic form) is vital. While the inclusion of something like ""decision theory"" might seem a bit complex, this really only means in your case that you are not simply estimating a quantity and figuring out if you are right; that is, not every wrong solution is ""equally wrong"", in the sense that you'll need to take into account your profit and your funds left in reserve.

+ +

A general approach is difficult to recommend without intimate knowledge of the data. However, it sounds like you'll need to formalize the aspects of your model. Think about the following questions as you decide on your model:

+ +
    +
  1. What is the quantity I am trying to predict? (In this case, it sounds like you are looking for the particular markup at which you should sell, and/or the price at which to buy so that a profit can be made, which is just as much optimization as it is machine learning).
  2. What examples can I use to train my model, and what are my inputs? What market factors can I train my model with to predict the quantities in (1)?
+ +

As a final point, it is really quite unlikely that reinforcement learning is the approach you'll want to take in this case. Reinforcement learning is quite powerful in certain situations, but is somewhat unpredictable (depending on the particular formulation), and tends to make an awful lot of errors before it gets anything right (something that likely is not an option when there is money on the line). As I said, try to figure out which quantities you want to estimate, then figure out what market factors might affect those quantities.

+",2013-10-16 04:37:57.570 +57592,22744.0,1,,,,How to determine if short strings of text are closely related to a larger text?,,CC BY-SA 3.0,"

I have 1 short string of text (let's say it's a tweet, max 140 characters):
+""A review of my beloved Roku 3 media player""

+ +

I also have a larger body of text (like a blog article, hundreds of words) which I know is related to the tweet:
+""The Roku 3 media player is a great way to watch your favorite ....""

+ +

The tweet and blog article are both about the Roku 3 media player specifically. Same author, and they share many of the same phrases, words, collocations, etc. It's likely that the string ""Roku 3"" appears in the text, along with variations like """"Roku 3 streaming media player"", ""Roku 3 player"" etc

+ +

I then have 10 other tweets, some which are related to the ""Roku 3 media player"", some of which are not (but very similar):

+ +

RELATED ""A good review of the Roku 3 media player""
+UNRELATED ""The Roku 2 media player review""
+RELATED ""Roku 3 is amazing""
+RELATED ""The Roku 3 is better than the Roku 2 by far""
+RELATED ""The Roku version 3 streaming media player, fully reviewed""
+UNRELATED ""A comparison review of the top 3 media player boxes. Roku, Android, Toshiba""
+RELATED ""Roku 3 streaming media player reviewed""

+ +

Those are some examples, and I would have 10 in total. All of the tweets contain ""Roku"", some are about ""Roku 2"" and are unrelated, one uses ""Roku version 3"" and is related etc. Obviously, this is a very small data set.

+ +

What is the best method to classify each of the 10 tweets as relevant or not, in relation to the first tweet and blog article? What sort of features would be useful?

+",2013-10-16 05:39:24.313 +57593,21243.0,2,,57592.0,,,,CC BY-SA 3.0,"

On a data set this size, it can be pretty tough to learn anything at all; as a baseline, it might be worth it to learn the N-Grams shared by the two examples, and run from there. For a training pair consisting of one short string and a longer article, it would be relatively easy to find the common N-Grams between the article and tweet, and then analyze the frequency with which those N-Grams appear in the test data. In the case you presented, the important N-Grams seem to be ""Roku 3"" and ""Roku 3 Media Player"" (a bigram and 4-gram, respectively), both of which are shared by the training data strings.
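Purely as an illustration of the idea (my addition, not from the original answer), a minimal base-R sketch that extracts the bigrams shared by the training tweet and article and counts how often they occur in a new tweet:

bigrams <- function(text) {
  w <- unlist(strsplit(tolower(text), ""[^a-z0-9]+""))
  w <- w[nzchar(w)]
  if (length(w) < 2) return(character(0))
  paste(w[-length(w)], w[-1])                # consecutive word pairs
}

tweet   <- ""A review of my beloved Roku 3 media player""
article <- ""The Roku 3 media player is a great way to watch your favorite shows""
shared  <- intersect(bigrams(tweet), bigrams(article))   # e.g. roku 3, 3 media, media player

new_tweet <- ""A good review of the Roku 3 media player""
sum(bigrams(new_tweet) %in% shared)          # crude relevance score: count of shared bigrams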

+",2013-10-16 06:02:26.797 +57594,22705.0,2,,57588.0,,,,CC BY-SA 3.0,"

Hope you are also looking at adjusted R-squared.

+ +

A high R-squared and a low MAPE could indicate overfitting.

+ +

The p-values of the regressors and their signs in the models could be compared.

+ +

AIC can be compared; the lower the better. +Hope it helps.

+",2013-10-16 06:05:05.923 +57595,22746.0,1,,,,"If $X_{n+1}$ is a martingale subject to $Y_0,\ldots,Y_n$, then is it a martingale with respect to $Y_0^2,\ldots,Y_n^2$?",,CC BY-SA 3.0,"

I don't have a very solid foundation in measure theory, and this always seems a bit confusing to me so I would appreciate any help.

+ +

We are given $E \left( X_{n+1} \mid Y_0,\ldots,Y_n \right) = X_n$. Prove or disprove $E \left( X_{n+1} \mid Y_0^2,\ldots,Y_n^2 \right) = X_n$.

+ +

I am thinking that if $F=\sigma \left(Y_0,\ldots,Y_n \right)$ and +$G=\sigma \left(Y_0^2,\ldots,Y_n^2 \right)$, I need to prove that F = G? Is this correct?

+ +

Then I can do something like this: $E \left( X_{n+1} \mid G \right) = E \left( E \left( X_{n+1} \mid F \right) \mid G \right) = E \left( X_{n+1} \mid F \right) = X_n$.

+ +

Also is $E \left( X_{n+1}^2 | G \right) = X_n^2$ (a martingale) given +$ +E \left( X_{n+1} | Y_0,\ldots,Y_n \right) = X_n. +$

+",2013-10-16 06:19:34.817 +57596,22600.0,1,,,,How to compute expectations from a probability density function?,,CC BY-SA 3.0,"

How to find a tax/subsidy in an income probability density function situation?

+ +

I am asked the following question:

+ +
+

Suppose all families with $Y \lt 20$ are given transfer payments equal to $(20-Y)/2$; + and suppose the cost of these transfers is raised by taxing all families + with $Y \gt 20$ an amount $t(Y-20).$

+ +

Solve for $t.$

+
+ +

I am trying to understand how to develop an equation to solve for $t,$ but none of my answers work.

+ +

The probability density function is triangular:

+ +

+ +
+ +

The values I have solved for are:

+ +
    +
  • Height: 1/30
  • Mean: 26.67
  • Median: 25.36
  • Slope for $Y\lt 20$: 1/600
  • Slope for $20\lt Y\lt 60$: -1/1200
+ +

Any help would be greatly appreciated.

+",2013-10-16 06:21:29.603 +57659,22775.0,1,,,,Why one can fit Poisson GLM on non-integer values in Stata?,,CC BY-SA 3.0,"

I have aggregated death counts and some categorical predictors e.g. era, drug use, and I'm running a Poisson GZLM in SPSS, also a glm in Stata (generalized linear model). There are many, but not excessive, zeros and it has been suggested to me to add 0.5 to the observed values. When I do that the GENLIN procedure in SPSS won't run, because the Poisson dependent should have integer values. However, the glm with family Poisson and log link in Stata runs and just makes a note that the dependent has non-integer values.

+ +

Why does Stata run the Poisson glm if the values of the dependent should be integers only?

+",2013-10-16 21:06:10.343 +57600,651.0,2,,50982.0,,,,CC BY-SA 3.0,"

If there are only two methods, A and B, I would calculate the probability that, for an arbitrary training/test partition, the error (according to some suitable performance metric) for model A is lower than the error for model B. If this probability were greater than 0.5, I'd choose model A and otherwise model B (cf. the Mann-Whitney U test?). However, I strongly suspect that will end up choosing the model with the lower mean unless the distributions of the performance statistic are very non-symmetric.

+ +

For grid search on the other hand, the situation is a bit different as you are not really comparing different methods, but instead tuning the (hyper-) parameters of the same model to fit a finite sample of data (in this case indirectly via cross-validation). I have found that this kind of tuning can be very prone to over-fitting, see my paper

+ +

Gavin C. Cawley, Nicola L. C. Talbot, ""On Over-fitting in Model Selection and Subsequent Selection Bias in Performance Evaluation"", Journal of Machine Learning Research, 11(Jul):2079−2107, 2010. (www)

+ +

I have a paper in review that shows that it is probably best to use a relatively coarse grid for kernel machines (e.g. SVMs) to avoid over-fitting the model selection criterion. Another approach (which I haven't investigated, so caveat lector!) would be to choose the model with the highest error that is not statistically inferior to the best model found in the grid search (although that may be a rather pessimistic approach, especially for small datasets).

+ +

The real solution though is probably not to optimise the parameters using grid-search, but to average over the parameter values, either in a Bayesian approach, or just as an ensemble method. If you don't optimise, it is more difficult to over-fit!

+",2013-10-16 07:36:28.433 +57601,22750.0,1,,,,Calculating joint probabilities from conditional probabilities,,CC BY-SA 3.0,"

I need to calculate the probability of $P(L,D)$, where $L$ and $D$ are not independent. I have estimated $P(L)$ and $P(D)$ with two distinct models and I also know $P(L|D)$ and $P(D|L)$.

+ +

As far as I can see it I have two different estimates of $P(L,D)$:

+ +

$P(L,D) =P(L) * P(D|L)$

+ +

$P(L,D) =P(D) * P(L|D)$

+ +

Am I best off just averaging these two estimates for the joint $P(L,D)$?

+",2013-10-16 08:30:42.900 +57602,11210.0,1,,,,What is the statistical efficiency of L-moments?,,CC BY-SA 3.0,"

In particular I am interested in the scale estimator. Hopefully it is much better than that of IQR.

+",2013-10-16 09:45:32.003 +57603,22751.0,2,,57601.0,,,,CC BY-SA 3.0,"

The answer depends on the answer to this question:

+ +

""Why are (P(L) * P(D|L)) and (P(D) * P(L|D)) different?"".

+ +

If it depends on the fact that the model for P(L|D) is unreliable you should only keep (P(L) * P(D|L)), if you have no idea you can consider taking the average.

+",2013-10-16 09:54:34.153 +57604,22262.0,2,,49879.0,,,,CC BY-SA 3.0,"

It means re-estimation on a rolling basis. However, I do not understand why one would want to do this when there's perfectly good parametric dynamic copulas invented by Patton (2006) and extended by others with various forcing equations, as well as the more recent stochastic autoregressive copula (SCAR). Read here [1][www.wisostat.uni-koeln.de/Institut/.../Manner_Reznikova(2010ER).pdf‎].

+",2013-10-16 10:31:13.157 +57605,22752.0,1,57607.0,,,Covariance matrix of least squares estimator $\hat{\beta}$,,CC BY-SA 3.0,"

I read that the $Cov(\hat{\beta})=\sigma^2(Z'Z)^{-1}$, where $\hat{\beta}=(Z'Z)^{-1}Z'y$. However, I have yet been unable to find a proof of this fact online. Could anyone please provide a proof and/or a reference?

+",2013-10-16 10:39:39.203 +57606,20473.0,2,,57605.0,,,,CC BY-SA 3.0,"

This is the expression for the conditional variance-covariance matrix of the estimator. For the model $$Y=Z\beta + U, \; E(U\mid Z) =0,\; E(UU'\mid Z) = \sigma^2I$$ +we have +$$\operatorname {Cov}(\hat\beta \mid Z)=\operatorname {Cov} \left[(Z'Z)^{-1}Z'y\mid Z\right]$$

+ +

$$=\operatorname {Cov} \left[(Z'Z)^{-1}Z'(Z\beta +U)\mid Z\right] = \operatorname {Cov} \left[\beta +(Z'Z)^{-1}Z'U\mid Z\right] = \operatorname {Cov} \left[(Z'Z)^{-1}Z'U\mid Z\right] $$

+ +

Since $\beta$ is treated as a constant in the frequentist approach. Now

+ +

$$\operatorname {Cov} \left[(Z'Z)^{-1}Z'U\mid Z\right] = E\Big\{\left[(Z'Z)^{-1}Z'U\right]\left[(Z'Z)^{-1}Z'U\right]'\mid Z\Big\} - E\left[(Z'Z)^{-1}Z'U\mid Z\right]E\left[(Z'Z)^{-1}Z'U\mid Z\right]'$$

+ +

Since +$$E\left[(Z'Z)^{-1}Z'U\mid Z\right]' = \left((Z'Z)^{-1}Z'E\left[U\mid Z\right]\right)' = 0$$ +we are left with

+ +

$$\operatorname {Cov} \left[(Z'Z)^{-1}Z'U\mid Z\right] = E\Big\{\left[(Z'Z)^{-1}Z'U\right]\left[(Z'Z)^{-1}Z'U\right]'\mid Z\Big\} $$

+ +

$$= (Z'Z)^{-1}Z'E(UU'\mid Z)Z(Z'Z)^{-1}= (Z'Z)^{-1}Z'\sigma^2IZ(Z'Z)^{-1} $$

+ +

$$=\sigma^2(Z'Z)^{-1} $$
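
+ +

As a quick numerical sanity check of this result (a small simulation sketch of my own, not part of the derivation): fit by OLS and compare the estimated covariance matrix with $\hat\sigma^2(Z'Z)^{-1}$.

+ +

set.seed(1)
+n <- 200
+Z <- cbind(1, rnorm(n))                   # design matrix with an intercept column
+beta <- c(2, -1)
+y <- drop(Z %*% beta) + rnorm(n, sd = 1.5)  # homoskedastic errors, as assumed above
+fit <- lm(y ~ Z - 1)                      # regress on the columns of Z directly
+vcov(fit)                                 # estimated Cov(beta-hat)
+summary(fit)$sigma^2 * solve(t(Z) %*% Z)  # sigma^2-hat * (Z'Z)^{-1}: the same matrix
+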

+",2013-10-16 11:45:37.907 +57607,17573.0,2,,57605.0,,,,CC BY-SA 3.0,"

A good reference is Greene, Econometric Analysis. You should be able to pick up an older version (sixth edition or before) online for relatively cheap. Seventh is not noticeably better than sixth. I am changing your notation $Cov(\hat{\beta})$ to +$V(\hat{\beta}_{\textrm{OLS}})$, but I mean the same thing by it.

+ +

Here is the proof:

+ +

If

+ +
  1. $Y=Z\beta+\epsilon$
  2. $E\left\{\epsilon|Z \right\}=0$
  3. $V\left(\epsilon|Z \right)=\sigma^2I$
  4. The OLS estimator exists and is unique (i.e. $Z'Z$ invertible)
+ +

then the OLS estimator is unbiased for $\beta$ and +$V\left(\hat{\beta}_{\textrm{OLS}}|Z \right)=\sigma^2(Z'Z)^{-1}$.

+ +

Proof: +Using the definition of the OLS estimator and then substituting in using 1: +\begin{align} +\hat{\beta}_{\textrm{OLS}} &= \left( Z'Z\right)^{-1}Z'Y\\ + &= \left( Z'Z\right)^{-1}Z'\left( Z\beta+\epsilon \right)\\ + &= \left( Z'Z\right)^{-1}Z'Z\beta + + \left( Z'Z\right)^{-1}Z'\epsilon\\ + &= \beta + \left( Z'Z\right)^{-1}Z'\epsilon +\end{align} +Taking expectations of both sides conditional on $Z$ gives you that the OLS estimator is unbiased. Taking variances on both sides conditional on $Z$ gives you: +\begin{align} +V\left( \hat{\beta}_{\textrm{OLS}} | Z \right) + &= V\left( \beta + \left( Z'Z\right)^{-1}Z'\epsilon | Z \right)\\ + &= V\left(\left( Z'Z\right)^{-1}Z'\epsilon | Z \right) \\ + &= \left( Z'Z\right)^{-1}Z'V\left(\epsilon | Z \right) Z \left( Z'Z\right)^{-1} \\ + &= \left( Z'Z\right)^{-1}Z'\sigma^2I Z \left( Z'Z\right)^{-1} \\ + &= \sigma^2\left( Z'Z\right)^{-1}Z'Z \left( Z'Z\right)^{-1} \\ + &= \sigma^2\left( Z'Z\right)^{-1} +\end{align} +QED

+ +

This does not quite give you what you asked for, since the variance is conditional on $Z$ rather than unconditional. If you want the variance to be unconditional, you have to additionally assume that $Z$ is fixed, so that the conditional variance becomes just an unconditional variance. On the other hand, this is the right variance conditional on the dataset you used to estimate $\beta$ with OLS, and inference based on this variance gives you (asymptotically, if you don't assume $\epsilon$ normal) correctly-sized hypothesis tests and confidence intervals.

+",2013-10-16 11:53:52.693 +57615,2149.0,2,,57613.0,,,,CC BY-SA 3.0,"

Forecasting weekly data using weekly history is problematic, as what we do in, say, week 4 is probably not what we did in week 4 last year, whereas what we do in month 4 is probably systematic with what we did in month 4 last year, save special effects such as Easter or Thanksgiving. Furthermore, the different number of weeks in a year can throw a monkey-wrench into the analysis. More importantly, the effect of holidays on weekly sums can be quite dependent on when the holiday occurs, thus effectively distorting the pattern. I have seen very few examples where weekly data is consistent/predictable and can be used reliably to obtain weekly forecasts.

+ +

With the development of statistically aggressive daily models taking into account the window of response around each holiday/event, day-of-the-week effects, day-of-the-month effects, month-of-the-year effects, level shifts and/or local time trends, users are now developing daily models to obtain weekly predictions. Additionally, they can compute probabilities of making month-end numbers or of meeting a plan/goal number.

+ +

The other issue, dealing with missing values, is easily handled by intervention detection schemes, which would identify pulses for the missing values and effectively replace each missing value with an imputed value based upon the full model.

+",2013-10-16 14:19:22.403 +57608,22262.0,1,,,,Can logistic regression estimates suffering from subsample abuse be salvaged?,,CC BY-SA 3.0,"

Suppose we have some logistic regression modelling problem; $f(X) = Y$, where $Y$ is binary and $X$ is a vector of normally distributed variables.

+ +

In industry it is sometimes the case that practitioners will delete rows of data where 'nothing happens' (according to an algorithmic criterion), often to reduce the size of the regression problem (data can get quite big) or because of the perception that this 'nothing happening' is not what we want to model - we want the model only to capture when interesting events occur.

+ +

Needless to say, this can bias out-of-sample probability estimates (i.e., given some new $X$, our estimate of $P(Y=1)$ is biased). A quick R demonstration will show this, where the true data generating process has $Y=1$ half the time and $Y=0$ the other half, but our subsampling approach has reduced this ratio to $\frac19$:

+ +
predictions <- rep(NA,1000)
+
+y <- c(rep(1,5000),rep(0,45000))
+x <- rnorm(50000)
+#y <- c(rep(1,25000),rep(0,25000))
+#x <- rnorm(50000)
+
+fit = glm(y~x,family=binomial(logit))
+
+for(i in 1:1000) {
+    newX = data.frame(x=rnorm(1))
+    predictions[i]=predict(fit,newX,type=""response"")
+}
+
+    mean(predictions)
+plot(predictions)
+
+ +

Question

+ +

What I would like to ask is whether our fit, $\hat{f}$, can be salvaged in the following way. For every new $X$ that we observe, we only estimate $\hat{P}(Y=1)$ if and only if it would have passed through our filter had this new $X$ been part of our in-sample dataset. Intuitively I want to say yes, because in-sample we have the data generating process that we artificially created, $f_\text{intrusion}(X) = Y_\text{intrusion}$, and this is the same DGP that we are drawing from out of sample to get our probability estimates.

+",2013-10-16 12:32:09.917 +57609,10060.0,2,,57589.0,,,,CC BY-SA 3.0,"

Use [FileName]$[VariableName] to refer to different variables in different datasets:

+ +
set.seed(276)
+
+# Simulate data:
+
+E1 <- data.frame(time=seq(1:365), growth=rnorm(365))
+E2 <- data.frame(time=seq(1:365), growth=rnorm(365))
+
+# Determine the right y-axis limits:
+
+ymax <- max(c(E1$growth, E2$growth), na.rm=T)
+ymin <- min(c(E1$growth, E2$growth), na.rm=T)
+
+# Plot the first graph:
+
+plot(E1$time, E1$growth, type=""l"", col=""red"", ylim=c(ymin, ymax),
+     xlab=""Time"", ylab=""Growth"")
+
+# Plot the second line onto the first graph:
+
+points(E2$time, E2$growth, type=""l"", col=""blue"")
+
+ +

Result:

+ +

+",2013-10-16 12:42:50.790 +57610,22753.0,2,,57472.0,,,,CC BY-SA 3.0,"

All of the above look like great resources, but I must link to this great example. It presents a very simple explanation for fitting the parameters of two lines to a set of points. The tutorial is by Yair Weiss while at MIT.

+ +

http://www.cs.huji.ac.il/~yweiss/emTutorial.pdf
+http://www.cs.huji.ac.il/~yweiss/tutorials.html

+",2013-10-16 12:43:56.013 +57611,20456.0,1,,,,Propensity score and Cox regression,,CC BY-SA 3.0,"

I have a retrospective dataset of patients treated with a certain drug (treatment, $n=46$) or with placebo (control $n=96$). The stored variables are age, sex, stage of disease. I want to assess the effect of treatment on overall survival with propensity score. Here are the steps I followed:

+ +
  1. I calculated a propensity score with a binary logistic regression model, using treatment as the dependent variable and age, sex, and stage as covariates.
  2. I used fuzzy matching to create a 1:1 matching with 0.05 tolerance.
  3. I deleted the unmatched cases and obtained a dataset of 46*2 cases (46 treated, 46 controls).
  4. I used a Cox proportional hazards regression model with propensity score and treatment as covariates.
+ +

Is my procedure correct? I'm using SPSSv19.

+",2013-10-16 12:59:52.663 +57612,17573.0,2,,57601.0,,,,CC BY-SA 3.0,"

If you have two estimators, call them $\hat{\beta}_1$ and $\hat{\beta}_2$, of the same parameter $\beta$, then you can combine them in a variety of ways. Let's suppose you know that the two estimators are consistent and asymptotically normal --- this is generally true of estimators you get from maximum likelihood methods, method of moments methods, and some other methods as well. Furthermore, suppose you know the (asymptotic) variances of the two estimators, $V(\hat{\beta}_1)$ and $V(\hat{\beta}_2)$ and the covariance of the two estimators $Cov(\hat{\beta}_1,\hat{\beta}_2)$.

+ +

You propose a combined estimator of $\beta$ given by $\frac{1}{2}\hat{\beta}_1+\frac{1}{2}\hat{\beta}_2$. This is consistent and asymptotically normal if $\hat{\beta}_1$ and $\hat{\beta}_2$ are. What is its variance? +\begin{align} +V(\frac{1}{2}\hat{\beta}_1+\frac{1}{2}\hat{\beta}_2) &= \frac{1}{4}V(\hat{\beta}_1) +\frac{1}{4}V(\hat{\beta}_2) +\frac{1}{2}Cov(\hat{\beta}_1,\hat{\beta}_2) +\end{align}

+ +

This estimator might be better (lower variance) or worse than either $\hat{\beta}_1$ or $\hat{\beta}_2$. If, say, $\hat{\beta}_1$ has a crazy-high variance, then the variance of $\frac{1}{2}\hat{\beta}_1+\frac{1}{2}\hat{\beta}_2$ might be higher than the variance of $\hat{\beta}_2$. We would like to avoid this. Also, we would like to find the best (i.e. lowest variance) way of combining the two estimators while preserving consistency. That is, we want to solve: +\begin{align} +&min_{\lambda}V(\lambda\hat{\beta}_1+(1-\lambda)\hat{\beta}_2)\\ +&min_{\lambda}\lambda^2V(\hat{\beta}_1)+(1-\lambda)^2V(\hat{\beta}_2) ++2\lambda(1-\lambda)Cov(\hat{\beta}_1,\hat{\beta}_2) +\end{align} +The first order condition is: +\begin{align} +2\lambda V(\hat{\beta}_1)-2(1-\lambda)V(\hat{\beta}_2) ++2(1-2\lambda)Cov(\hat{\beta}_1,\hat{\beta}_2)&=0 \\ +\frac{V(\hat{\beta}_2)-Cov(\hat{\beta}_1,\hat{\beta}_2)}{V(\hat{\beta}_1)+V(\hat{\beta}_2)-2Cov(\hat{\beta}_1,\hat{\beta}_2)} &= \lambda +\end{align} +The combined estimator is then: +\begin{align} +\hat{\beta}_* = \frac{V(\hat{\beta}_2)-Cov(\hat{\beta}_1,\hat{\beta}_2)}{V(\hat{\beta}_1)+V(\hat{\beta}_2)-2Cov(\hat{\beta}_1,\hat{\beta}_2)} \hat{\beta}_1 + +\frac{V(\hat{\beta}_1)-Cov(\hat{\beta}_1,\hat{\beta}_2)}{V(\hat{\beta}_1)+V(\hat{\beta}_2)-2Cov(\hat{\beta}_1,\hat{\beta}_2)} \hat{\beta}_2 +\end{align} +Because of the way we set up $\lambda$ in the minimization, you are assured that this new estimator is consistent. Notice, if the covariance between the two estimators is zero, then this new estimator is just the weighted sum of the two original estimators where the original estimators are weighted inversely to their variance---the new estimator ""pays attention"" to the old estimators inversely according to their variances.

+ +

Finally, the variance of the new estimator is: +\begin{align} +V(\hat{\beta}_*) = \frac{V(\hat{\beta}_1)V(\hat{\beta}_2)-Cov^2(\hat{\beta}_1,\hat{\beta}_2)}{V(\hat{\beta}_1)+V(\hat{\beta}_2)-2Cov(\hat{\beta}_1,\hat{\beta}_2)} +\end{align}

+ +

This process of optimally combining multiple estimators is called ""minimum distance estimation"" in econometrics. A cite is chapter 13 in Greene, Econometric Analysis, seventh ed.
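
+ +

For what it's worth, here is a small Monte Carlo sketch of the idea (my own toy setup: the sample mean and sample median as two correlated estimators of the same location parameter), just to check the optimal weight and the variance formula numerically.

+ +

set.seed(42)
+R <- 10000; n <- 50
+b1 <- b2 <- numeric(R)
+for (r in 1:R) {
+  x <- rnorm(n, mean = 1)
+  b1[r] <- mean(x)        # estimator 1
+  b2[r] <- median(x)      # estimator 2
+}
+V1 <- var(b1); V2 <- var(b2); C <- cov(b1, b2)
+lambda <- (V2 - C) / (V1 + V2 - 2 * C)
+b_star <- lambda * b1 + (1 - lambda) * b2
+var(b_star)                              # simulated variance of the combination
+(V1 * V2 - C^2) / (V1 + V2 - 2 * C)      # variance predicted by the formula above
+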

+",2013-10-16 13:09:06.063 +57613,22369.0,1,57615.0,,,Building a forecast model based on past year data in R,,CC BY-SA 3.0,"

I am attempting to build a model to forecast attendance in a given week in the current year based on this year's attendance values up until the present, and data from two previous years. My data looks like this:

+ +
   Week 11-12 Cumulative ADA    12-13 Cumulative ADA    13-14 Cumulative ADA
+   1    0.9941                  0.9941                  0.9914
+   2    0.9907                  0.991                   0.989
+   3    0.9888                  0.9888                  0.9879
+   4    0.9877                  0.987                   0.9869
+   5    0.9869                  0.9865                  0.9867
+   6    0.9862                  0.985                   0.9859
+   7    0.9856                  0.9842                  0.9857
+   8    0.9856                  0.984                   NA
+   9    0.9852                  0.9839                  NA
+   10   0.9848                  0.9834                  NA
+
+ +

Any guidance on how to predict the three NAs based on the past two years data and this year's values would be much appreciated.

+ +

Thanks!

+",2013-10-16 13:42:40.313 +57614,19298.0,2,,47447.0,,,,CC BY-SA 3.0,"

Have you looked at the LaplacesDemon package? They have some examples for autoregressive poisson. http://cran.cermin.lipi.go.id/web/packages/LaplacesDemon/vignettes/Examples.pdf

+",2013-10-16 14:05:52.853 +57652,12980.0,1,71920.0,,,Understanding tail dependence coefficients,,CC BY-SA 3.0,"

How can I analyze the $\lambda_U$ and $\lambda_L$ results (estimated by a non-parametric method)? What do higher or lower coefficients mean? Does $\lambda_U = 0.5$ mean there is some kind of linear dependence between $X$ and $Y$?

+ +

This is what I mean by $\lambda_U$ and $\lambda_L$:

+ +

+",2013-10-16 20:27:28.370 +57616,22756.0,1,,,,Determining sample size with a proportion and binomial distribution,,CC BY-SA 3.0,"

I am trying to learn some statistics using the book, Biometry by Sokal and Rohlf (3e). This is an exercise in the 5th chapter which covers probability, the binomial distribution, and Poisson distribution. +

+ +

I realize there is a formula to produce an answer to this question: +$$ +n = \frac 4 {( \sqrt{p} - \sqrt{q} )^2} +$$ +However, this equation is not in this text. I'd like to know how to calculate sample size knowing only the probability, the desired level of confidence, and the binomial distribution. Are there any resources covering this topic that I can be pointed to? I've tried Google, but what I've seen so far requires information I don't have access to in this problem.

+",2013-10-16 14:27:26.077 +57617,22399.0,1,,,,Visual display of multiple comparisons test,,CC BY-SA 3.0,"

Suppose, the data below shows the mean response time on a task for respondents among four different groups:

+ +
A     B     C    D  
+1.2   2.3   4.5  6.7
+
+ +

In order to assess which of the means differ from one another, I run a multiple comparisons test (after the omnibus ANOVA test is passed), and it tells me that the mean for group D is significantly different from those for groups A and B, and that no other pair of means differs significantly.

+ +

What is the best way to present this information visually?

+",2013-10-16 14:34:52.500 +57618,22729.0,2,,57598.0,,,,CC BY-SA 3.0,"

If you do not multiply by 2, you will be evaluating the probability of having scores ranging from 18 to 25 (one-sided test).

+ +

Multiplying by 2, you are evaluating the probability of having scores ranging from 0 to 7 and 18 to 25 (two-sided test). +Your command results in an answer similar to this one:

+ +
binom.test(18, 25, 0.5, alternative=""two.sided"") 
+
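
+ +

For comparison (a small illustration of the doubling, not part of the original command), the one-sided version and the hand-computed two-sided p-value are:

+ +

binom.test(18, 25, 0.5, alternative=""greater"")   # P(X >= 18) only
+2 * pbinom(17, 25, 0.5, lower.tail=FALSE)          # doubling that tail reproduces the two-sided p-value
+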
+",2013-10-16 14:45:41.947 +57619,22757.0,2,,46070.0,,,,CC BY-SA 3.0,"

As I understand it, sigma2 is the constant variance the model assumes: it is the variance of the innovations or random shocks (uncorrelated zero-mean random variables) that the model uses behind the scenes. An ARIMA model is one where the current random variable $X_t$ can be written as the sum of a linear filter of the previous $X_{t-h}$, $h>0$ (the AR part, and also the I part, the order of differencing, if it has unit roots in a certain sense), and a linear filter of the random shocks $Z_{t-j}$, $j\geq 0$, in which the coefficient of $Z_t$ is 1 (the MA part, not counting $Z_t$ itself). These random shocks are the ones that have to have a common variance.

+",2013-10-16 15:00:49.397 +57620,22759.0,2,,1248.0,,,,CC BY-SA 3.0,"

I forget where it's from, but there's the one that when asked how he felt on reaching his 90th birthday, the old statistician replied, ""I'm very happy about it - the numbers show that few people die after their 90th birthday.""

+",2013-10-16 15:03:08.217 +57621,16046.0,1,57632.0,,,Cumulants in chinese restaurant process?,,CC BY-SA 3.0,"

I already posted a similar question on StackOverflow but it was not welcomed there, so I decided to ask the folks here. I have written code in Python for the CRP problem. I think everybody here is familiar with the subject, but nevertheless:

+ +

Short description: Suppose we want to assign people entering a restaurant to a potentially infinite number of tables. If $z_i$ represents the random variable assigned to the $i$'th person entering the restaurant, the following should hold:

+ +

With probability $p(z_i=a|z_1,...,z_{i-1})=\frac{n_a}{i-1+\alpha}$ for $n_a>0$, the $i$'th person will sit at table $a$, and with probability $p(z_i=a|z_1,...,z_{i-1})=\frac{\alpha}{i-1+\alpha}$ the $i$'th person will sit at a new table.

+ +

I am not quite sure if my code is correct because I am surprised at how small the final number of tables is. I would be happy if somebody could give me the cumulants of the distribution associated with this process.

+ +
import numpy as np
+def CRP(alpha,N):
+    """"""Chinese Restaurant Process with alpha as concentration parameter and N 
+    the number of sample""""""
+    #Array which will save for each i, the number of people people sitting
+    #until table i
+    summed=np.ones(1) #first person assigned to the first table
+    for i in range(1,N):
+        #A loop that assigns the people to tables
+
+        #randind represent the random number from the interval [1,i-1+alpha]
+        randind=(float(i)+alpha)*np.random.uniform(low=0.0, high=1.0, size=1)
+        #update is the index for the table that the person should be placed which
+        #if greater than the total number, will be placed in a new table
+        update=np.searchsorted(summed,randind,side='left')
+        if randind>i:
+            summed=np.append(summed,i+1)
+        else:
+            zerovec=np.zeros(update)
+            onevec=np.ones(summed.size-update)
+            summed+=np.append(zerovec,onevec)
+    #This part converts summed array to tables array which indicates the number
+    #of persons assigned to that table
+    tables=np.zeros(summed.size)
+    tables[0]=summed[0]
+    for i in range(1,summed.size):
+        tables[i]=summed[i]-summed[i-1]
+    return tables
+a=CRP(5,1000)
+print a
+
+",2013-10-16 15:05:33.317 +57622,20062.0,2,,57617.0,,,,CC BY-SA 4.0,"

The point is that your dataset is too small (4 groups, 5 values each). The means obtained from such data are not very accurate representative values for each group, and therefore you should not run ANOVA to make inferences about differences among groups.

+
+

Being understandable to the audience is one thing, but being scientifically accurate is more important.

+
+

I suggest addressing this with a Kruskal-Wallis test followed by multiple comparisons.

+

Boxplots (with medians) are probably the most used graphical representation of multiple comparisons of groups. To display differences you can either draw brackets above pairs which are statistically different and add ***-symbols or N.S.; this looks good if you have a small number of groups. Or you can add notches to each boxplot (very helpful with a large number of groups), by which anyone will find the desired comparison by eye.

+
+

You may create boxplots, for example in R:

+
data <- data.frame(value=c(rnorm(60), rnorm(20)+3), 
+    group=rep(c("A", "B", "C", "D"), each=20))
+
+              value group
+    1  -1.206926025     A
+    2  -0.311125313     A
+    3   1.336579675     A
+    ......
+    21  1.543827796     B
+    22 -1.874257866     B
+    ......
+    80  4.383037868     D
+    etc.  
+
+boxplot(data$value ~ data$group, notch=TRUE,
+    col = "red", xlab="group", ylab="value")
+
+

+

Boxplots show median values instead of means. I strongly suggest not displaying ONLY mean values for each group; raw data are the last possibility.

+",2013-10-16 15:28:50.510 +57623,2857.0,2,,57608.0,,,,CC BY-SA 3.0,"

In logistic regression, oversampling events does not affect non-intercept coefficients - they remain unbiased. Only the intercept $\beta_0$ is affected, and this is what drives the incorrect probability predictions. So if you only cared about the coefficients for explanatory variables and not the actual predicted probabilities, you don't need to do anything, but that's not the case here.

+ +

See King and Zeng (2001) [equation 7] for the direct correction to $\beta_0$:

+ +

$$ +\hat{\beta_0} - ln[({1-\tau \over \tau})({\hat{y} \over 1-\hat{y}})] +$$

+ +

$\hat{\beta_0}$ being the incorrectly estimated intercept
+$\tau$ being the proportion in the actual population
+$\hat{y}$ being the proportion in the sample population

+ +

So for your simple example, you should have something like ${\beta_0} = -2.2 - ln[({0.5 \over 0.5})({0.1 \over 0.9})] = -2.2 + 2.2 = 0 $.
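
+ +

As a small R sketch of that correction (the numbers below are just the illustrative values from this example; in practice tau would come from knowledge of the true population rate):

+ +

tau  <- 0.5      # event proportion in the actual population
+ybar <- 0.1      # event proportion in the subsample
+b0_hat <- -2.2   # intercept estimated on the subsampled data
+b0_hat - log(((1 - tau) / tau) * (ybar / (1 - ybar)))   # corrected intercept, approximately 0
+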

+",2013-10-16 15:33:03.820 +57624,7700.0,2,,57617.0,,,,CC BY-SA 3.0,"

Based on your question and follow-up comments, I'd start with a dot-plot. They're quick and easy (even in Excel). Here's a sample with your data:

+ +

+ +

This chart type scales well, handles large numbers of data points well and is very easy to understand-even to a non-tech audience.

+",2013-10-16 15:42:10.740 +57625,10060.0,2,,57617.0,,,,CC BY-SA 4.0," +
iv <- c("A","B","C","D")
+dv <- c(1.2,2.3,4.5,6.7)
+gp <- c(1,1,1,2)
+
+par(mai=c(1,1,0,0))
+plot(dv, gp, axes=F, xlab="Average time", ylab="Grouping based on 
+                   \n mean comparison",
+     ylim=c(0,3), xlim=c(0,7), pch=16)
+text(dv, gp-.2, iv)
+axis(side=2, label=c("i", "ii"), at=c(1,2))
+axis(side=1)
+abline(h=c(1,2),col="blue",lty=3)
+
+

Provide a footnote: Means on the same horizontal reference line are not statistically different from each other. Alpha = 0.05, Bonferroni adjustment

+

+

And I really like this design because you can flexibly accommodate group means with multiple memberships. Like in this case, C is not different from D and also not different from A and B:

+
iv <- c("A","B","C", "C", "D")
+dv <- c(1.2,2.3,4.5, 4.5, 6.7)
+gp <- c(1,1,1,2,2)
+
+par(mai=c(1,1,0,0))
+plot(dv, gp, axes=F, xlab="Average time", ylab="Grouping based on 
+            \n mean comparison",
+     ylim=c(0,3), xlim=c(0,7), pch=16)
+text(dv, gp-.2, iv)
+axis(side=2, label=c("i", "ii"), at=c(1,2))
+axis(side=1)
+abline(h=c(1,2),col="blue",lty=3)
+
+

+",2013-10-16 15:51:05.500 +57626,22762.0,1,217429.0,,,How to estimate vector autoregression & impulse response function with panel data,,CC BY-SA 3.0,"

I am working on vector autoregression (VAR) and impulse response function (IRF) estimation based on panel data with 33 individuals over 77 quarters. How should this type of situation be analyzed? What algorithms exist for this purpose? I would prefer to conduct these analyses in R, so if anyone is familiar with R code or a package designed for this purpose that they could suggest, that would be especially helpful.

+",2013-10-16 15:53:54.087 +57627,19298.0,2,,55609.0,,,,CC BY-SA 3.0,"

I'd suggest taking a look at http://www.amazon.com/Predictive-Analytics-Microsoft-Conrad-Carlberg/dp/0789749416 if you are restricted to Excel. There are also example R codes in the book to help transition from thinking in Excel to thinking in R. The spreadsheet examples from the book can be found here: http://www.quepublishing.com/store/predictive-analytics-microsoft-excel-9780789749413. Chapter 5 is about time series.

+",2013-10-16 15:57:56.437 +57628,18198.0,1,,,,Estimators for linear regression when multicollinearity is present,,CC BY-SA 3.0,"

I have a multicollinearity problem in a linear regression model and ridge regression was suggested as a solution. So I have spent quite some time researching different ridge regressors in the literature (there's at least a dozen), however in the course of my research I have found that Principal Components Regression can be considered a special case of ridge regression and so I am including it too. Clearly Partial Least Squares regression is closely linked to Principal Component Regression so it seems it should be added too (although I haven't researched that yet). Also when deriving Ridge regression from the Bayesian viewpoint it became obvious that a Kalman filter approach could be used to implement the Bayesian approach so I'm including that too. The Bayes approach seems fairly natural in my application as I have several years worth of data some in which the collinearity effects are present some where they are not so I can build up a decent prior for the Beta parameters for the cases where little or no MC is present.

+ +

I have reviewed various comparison papers and they usually compare the estimators on the basis of their Mean Square Error Performance. Generally in the Ridge v Bayes/Kalman there seems to be no clear winner with the performance depending on the orientation of the betas being estimated to the principal components of the design matrix, the level of multicollinearity and the signal to noise ratio. So it would seem that the best estimator can only be defined in the sense of the best for a given problem.

+ +

When should I use lasso vs ridge?

+ +

Now through my research I have stumbled into Lasso (and Least Angle regression) and from the answers to the previous thread (see above) I can add ""Elastic Net"" and ""non-negative garrote"" (and probably more). I just wonder whether it makes sense to add these to my comparison too? If I'm totally honest it seems that I should. The theory behind the Lasso is pretty similar to that used for the Ridge estimator with the optimization being done in the L1 rather than L2 space.

+ +

Ideally I would like to compare all the different classes of shrinkage estimators using the best estimator from each class. Right now I have at least a dozen ridge regressors and no way a priori of knowing which would be best for my problem (given the inconclusive results of the comparison papers I mentioned previously). If I only have to add say the ""Lasso"" and ""Elastic-net"" and even the ""Non-negative Garrotte"" then probably I can do it but if they have as many variations as the Ridge literature has and similar problems identifying which one is likely to be best for a given problem then it seems that the whole thing may become rather unwieldy and I can't cover every approach that's ever been suggested in the literature. So my questions are:

+ +
  1. What are the different classes of estimators for dealing with multicollinearity in linear regression, e.g. Ridge, PCR, PLS, Lasso?
  2. Apart from ridge regression, do these other classes have one implementation that is generally regarded as best? In any paper I have seen that compares Ridge to other classes of estimators, the authors generally use the basic Ridge regression method, which is shown to be one of the poorest-performing ridge methods.
  3. Do you have experience comparing Ridge to these other techniques, and which was found to be best?
  4. Is it realistic to compare these different classes all together?
+ +

Here is a list of the lasso techniques I have found from a review article by Tibshirani (2011):

+ +

Grouped Lasso; +Elastic Net Lasso; +Fused Lasso; +Adaptive Lasso; +Graphical Lasso; +Dantzig selector; +Nearly Isotonic Regression; +Matrix Completion; +Compressive Sensing; +Multivariate Methods

+ +

Now my problem consists of only 4 different X values and around 30 y values. The really high multicollinearity occurs between X variables 3 & 4. Although there can be significant multicollinearity with variable 2 as well. (Variable 1 is a constant).

+ +

Most of these techniques are suited for p>>N but this seems directly at odds with the comment from Gary here:

+ +

When should I use lasso vs ridge?

+ +

That lasso should be used when you have high multicollinearity effects and few variables?

+ +

In any case given my problem I think that the Lasso and Elastic Net are most suitable. +Can anyone shed any light as to whether any of the remaining techniques may be helpful for my problem?

+",2013-10-16 16:02:21.723 +57629,21864.0,1,57635.0,,,How to define the multiplier range for variance test based outliers detection algorithm?,,CC BY-SA 4.0,"

I have a variance-test-based outlier detection algorithm. The algorithm is exposed through a visual application where the user can configure its parameters, namely the multiplier.

+ +

The question is what is the range of values for the algorithm multiplier?

+ +

The algorithm basically calculates the mean and standard deviation (sigma) of the data set and then compares the dataset elements to the upper and lower bounds to flag each element as an outlier or not.

+ +
multiplier <- 2.3; 
+upper_bound <- mean + multiplier * sigma; 
+lower_bound <- mean - multiplier * sigma; 
+
+ +

the evolution of the two functions upper_bound (red) and lower_bound(blue) with the multiplier values is as follows:

+ +

+ +

But this graph doesn't give any clue on what range to define. Do you have any idea how to define this range?

+",2013-10-16 16:17:09.840 +57630,17628.0,2,,13058.0,,,,CC BY-SA 3.0,"

Other answerers assume that you are dealing with a raster image of a graph. But nowadays good practice is to publish graphs in vector form. In this case you can achieve much higher exactness of the recovered data, and even estimate the recovery error, if you work with the code of the vector graph directly, without converting it to a raster image.

+ +

Since the papers are published online as PDF files, I assume that you have a PDF file which contains vector plot with data you wish to recover from it (get in numerical form) and estimate introduced recovery error.

+ +

First of all, PDF is a vector format which is basically textual (can be read by a text editor). The problem is that it can (and almost always) contain compressed data streams which require to be uncompressed in order to read them by a text editor. These compressed data streams usually contain the information we need.

+ +

There are several ways to uncompress data streams in order to convert PDF file to a textual document with readable PDF code. Probably the simplest way is to use free QPDF utility with --stream-data=uncompress option:

+ +
qpdf infile.pdf --stream-data=uncompress -- outfile.pdf
+
+ +

Some other ways are described here and here.

+ +

The generated outfile.pdf can be opened by a text editor. Now you need PDF Reference Manual 1.7 to understand what you see. Do not panic at this moment! You need to know only few operators described in the ""TABLE 4.9 Path construction operators"" on pages 226 - 227. The most important operators are (the first column contains coordinate specification for an operator, the second contains the operator and the third is operator name):

+ +
x y               m   moveto 
+
+x y               l   lineto 
+
+x y width height  re  rectangle
+
+                  h   closepath
+
+ +

In most cases it is sufficient to know these four operators for recovering the data.

+ +

Now you need to import the outfile.pdf file as text into some program where you can manipulate the data. I'll show how to do it with Mathematica.

+ +

Importing the file:

+ +
pdfCode = Import[""outfile.pdf"", ""Text""];
+
+ +

Now I assume the simplest case: the graph contains a line which consists of many two-point segments. In this case each segment of the line is encoded like this:

+ +
268.79999 408.92975 m
+272.39999 408.92975 l
+
+ +

Extracting all such segments from the PDF code:

+ +
lines = StringCases[pdfCode, 
+   StartOfLine ~~ x1 : NumberString ~~ "" "" ~~ y1 : NumberString ~~ "" m\n"" ~~ 
+                  x2 : NumberString ~~ "" "" ~~ y2 : NumberString ~~ "" l\n"" 
+                                        :> ToExpression@{{x1, y1}, {x2, y2}}]; 
+
+ +

Visualizing them:

+ +
Graphics[{Line[lines]}]
+
+ +

You get something like this (the paper I am working with contains four graphs):

+ +

+ +

Each two adjacent segments share one point. So in this case you can turn the sequences of adjacent segments into paths:

+ +
paths = Split[lines, #1[[2]] == #2[[1]] &];
+
+ +

Now you can visualize all the paths separately:

+ +
Graphics[{Line /@ paths}]
+
+ +

From this figure you can select (by double-clicking) the path you are looking for, copy graphics selection and paste as new Graphics. For converting it backward to list of points you take the element {1, 1, 1}. Now we have the points not in the coordinate system of the graph but in the coordinate system of the PDF file. We need to establish relationship between them.

+ +

From the above plot you select ticks by hand (holding Shift for multiple selection), then copy them and paste as new Graphics. Here is how you can extract coordinates of horizontal ticks:

+ +

+ +

Now check the differences between ticks:

+ +
Differences[reHorTicks]
+
+ +

From these differences you can see how precise the positioning of the ticks in the PDF file is. It gives an estimate of the error introduced by converting the original datapoints into the vector graph included in the PDF file. If there are appreciable errors in tick positioning, you can reduce the error by fitting the coordinates of the ticks to a linear model. This linear function can now be used to get the original coordinates of the points of the path (that is, in the coordinate system of the plot).

+",2013-10-16 16:20:30.000 +57631,,1,57637.0,,user30490,Backshift operator applied to a constant,,CC BY-SA 3.0,"

This question is in two parts:

+ +

1) What happens when you apply the backshift operator to a constant? For example, if I have the AR process +$$(1-\phi B)(y_t-\mu)=\epsilon_t$$ +does that equal +$$y_t-\mu-\phi By_t+\phi B\mu = \epsilon_t$$ +which (I believe) reduces to +$$y_t-\mu-\phi y_{t-1}+\phi \mu = \epsilon_t\longrightarrow y_t=\mu+\phi y_{t-1}-\phi \mu+\epsilon_t $$

+ +

So am I correct in assuming that the backshift of a constant (in my example $\mu$) is just the constant?

+ +

2) If I assume that $\epsilon_t\sim N(0,v)$, then what is the likelihood of the above AR process in 1?

+",2013-10-16 16:40:19.333 +57632,3580.0,2,,57621.0,,,,CC BY-SA 3.0,"

I don't know how many tables you were expecting, but the mean and variance of the number of total tables is available in closed form. For $\alpha = 5$ and $N = 1000$ +$$ +E[\mbox{Num Tables}] = \sum_{i=1}^N \frac{\alpha}{\alpha + i - 1} \approx 27, +$$ +and +$$ +\mbox{Var}[\mbox{Num Tables}] = \sum_{i=1}^N \frac{\alpha(i-1)}{(\alpha + i - 1)^2} \approx 21.5. +$$

+ +

I ran your code in Python and it seems consistent with these formulas. Note that the expected number of tables grows logarithmically in $N$.
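
+ +

For reference, a two-line R check of these sums (same $\alpha$ and $N$):

+ +

alpha <- 5; N <- 1000; i <- 1:N
+sum(alpha / (alpha + i - 1))               # expected number of tables, about 27
+sum(alpha * (i - 1) / (alpha + i - 1)^2)   # variance, about 21.5
+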

+",2013-10-16 17:06:27.293 +57633,16746.0,1,,,,Vector space model: cosine similarity vs euclidean distance,,CC BY-SA 3.0,"

I have corpora of classified text. From these I create vectors. Each vector corresponds to one document. Vector components are word weights in that document, computed as TF-IDF values. Next I build a model in which every class is represented by a single vector. The model has as many vectors as there are classes in the corpora. Each component of a model vector is computed as the mean of the corresponding component values taken from the vectors in that class. +For unclassified vectors I determine similarity with a model vector by computing the cosine between these vectors.

+ +

Question: Can I use the Euclidean distance between an unclassified vector and a model vector to compute their similarity? If not, why not?
+Thanks!

+",2013-10-16 17:33:04.630 +57634,22507.0,2,,57237.0,,,,CC BY-SA 3.0,"

It seems that in order to reproduce the joint distribution $\rho(a,v)$, you should select new $a$ not only based on $v$, but based on the old $a$ also:

+ +

$a_{i+1} \sim \rho'(a_{i+1}|a_i, v_i)$

+ +

The question (to which I don't know the answer yet) is how to find $\rho'$ which produces $\rho$.

+ +

UPD: +You are to solve the following integral equation:

+ +

$$\rho(a, v) = \int da' \rho'\left(a|a', v-{a+a'\over 2}\Delta t\right) \rho(a', v-{a+a'\over 2}\Delta t)$$

+ +

Approximating the function $\rho$ with a histogram, you turn this to a system of linear equations:

+ +

$$\cases{ +\rho(a, v) = \sum_{a'} \rho'\left(a|a', v-{a+a'\over 2}\Delta t\right) \rho(a', v-{a+a'\over 2}\Delta t) \\ +\sum_a \rho'\left(a|a', v'\right) = 1}$$

+ +

This system is underdetermined. You may apply a smoothness penalty to obtain a solution.

+",2013-10-16 17:35:22.410 +58282,22637.0,2,,58279.0,,,,CC BY-SA 3.0,"

You need independent RV's since then the covariance is identically equal to zero. An example might be the following distributions

+ +

$$ \begin{align} f_Y (y)= 2y\ \text{for}\ 0<y<1 \\ f_X(x)= 2x\ \text{for}\ 0<x<1 \\ f_{X,Y}(x,y)=4xy \end{align} $$ +Try it!

+",2013-10-26 22:12:25.427 +57635,450.0,2,,57629.0,,,,CC BY-SA 3.0,"

To find the outliers, you cannot use the distance of an observation to a model through a rule such as:

+ +

$$\frac{|\hat{\mu}-x_i|}{\hat{\sigma}},\;i=1,\ldots,n$$

+ +

if your estimates of $(\hat{\mu},\hat{\sigma})$ are the classical ones (the usual mean/standard deviation) because the fitting procedure you use to obtain them is itself liable to being pulled towards the outliers (this is called the masking effect).

+ +

One simple way to reliably detect outliers, however, is to use the general idea you suggested (distance from fit) but replacing the classical estimators by robust ones that are much less susceptible to being swayed by outliers. Below I present a general illustration of the idea. If you give more information about your specific problem I can append my answer to address the particulars of your situation.

+ +

An illustration: consider the following 20 observations +drawn from a $\mathcal{N}(0,1)$ (rounded to the second +digit):

+ +
x<-c(-2.21,-1.84,-.95,-.91,-.36,-.19,-.11,-.1,.18,
+.3,.31,.43,.51,.64,.67,.72,1.22,1.35,8.1,17.6)
+
+ +

(the last two really ought to be .81 and 1.76 but have been accidentally mistyped).

+ +

Using an outlier detection rule based on comparing the statistic

+ +

$$\frac{|x_i-\text{ave}(x_i)|}{\text{sd}(x_i)}$$

+ +

to the quantiles of a normal distribution would never lead you to suspect that 8.1 is an outlier, leading you to estimate the $\text{sd}$ of the 'trimmed' series to be 2 (for comparison, the raw, i.e. untrimmed, estimate of $\text{sd}$ is 4.35).

+ +

Had you used a robust statistic instead:

+ +

$$\frac{|x_i-\text{med}(x_i)|}{\text{mad}(x_i)}$$

+ +

and comparing the resulting robust $z$-scores to the chosen quantiles of a candidate distribution (typically the standard normal, if you can assume the $x_i$'s to be symmetrically distributed), you would have correctly flagged the last two observations as outliers (and correctly estimated the $\text{sd}$ of the trimmed series to be 0.96).
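
+ +

A short R sketch of the two rules on the 20 observations above (mad() uses the usual consistency constant, so it is directly comparable to an sd):

+ +

x <- c(-2.21,-1.84,-.95,-.91,-.36,-.19,-.11,-.1,.18,
+       .3,.31,.43,.51,.64,.67,.72,1.22,1.35,8.1,17.6)
+abs(x - mean(x)) / sd(x)        # classical scores: 8.1 is not flagged at all
+abs(x - median(x)) / mad(x)     # robust scores: both 8.1 and 17.6 stand out clearly
+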

+",2013-10-16 17:48:18.097 +57636,18848.0,1,57694.0,,,How to identify variable (from many variables) which is able to discriminate between groups?,,CC BY-SA 3.0,"

I currently have a data frame with 98 observations and 107 variables. All of the variables are numeric, but one variable is binary (yes or no). My goal is to determine which correlation and/or variable gives the greatest segregation between the yes and no samples. I have been using the pairs() function to do this, but I can only do a few variables at a time. Is there a way to determine which correlation gives the greatest discernment between yes and no?

+ +

To Clarify - My table is 98 observations and 107 variables, but doing a correlation matrix with the pairs function is not able to fit all of the variables.

+ +

I have used this function:

+ +
pairs(x[70:80], pch=21, bg=c(""red"",""green"")[unclass(x$outcome)])
+
+",2013-10-16 17:50:57.100 +57637,20473.0,2,,57631.0,,,,CC BY-SA 3.0,"

The backshift operator operates on a constant just as it does on every other symbol. It shifts the constant one period back, where we find that the constant has the same value as in the current period, since this is the essence of a constant.

+ +

For the likelihood of an AR(1) process, in this answer there is the likelihood for the case without the constant -but from there it is just a small step to here.

+ +

ADDENDUM
+The chain rule will be the same, but the conditional density will be

+ +

$$Y_i | Y_{i-1},\dots,Y_0 \sim \mathcal{N}\left((1-\phi) \mu+\phi Y_{i-1},v\right) $$

+ +

You need to specify what the distribution of $Y_0$ will be (will it contain the unknown parameters $\phi$, $v$?). If not, it doesn't really matter.

+",2013-10-16 17:54:32.043 +57638,5237.0,4,,,,,,CC BY-SA 3.0,The response of an endogenous system to an exogenous shock. This is an important topic in time-series econometrics.,2013-10-16 18:34:20.250 +57639,5237.0,5,,,,,,CC BY-SA 3.0,,2013-10-16 18:34:20.250 +57640,22767.0,1,,,,Summation of a product,,CC BY-SA 3.0,"

I need to calculate the following expression:

+ +

$$\sum_{k=1}^N a_k b_k$$

+ +

${a_k}$ and $b_k$ are real positive numbers. N and k are integers.

+ +

I know the average values of $a_k$ , defined as $\overline {a} = {\sum_{k=1}^N a_k \over N } $ and $b_k$ , defined as $\overline {b} = {\sum_{k=1}^N b_k \over N } $. I also know the standard deviation of $a_k$ and $b_k$, $\sigma_a$ and $\sigma_b$.

+ +

If, knowing only these quantities, I have to make some approximation, I would like to know how much error I am producing with that approximation. $N$ is relatively big.

+ +

Any help is appreciated.

+",2013-10-16 18:47:58.330 +57641,22631.0,1,,,,Find weight of features for feature selection,,CC BY-SA 3.0,"

I have a data set of videos from which I need to recognize the emotion of the speaker. For that reason I have some markers on the face of the speaker. I detect their movement as the speaker speaks and for every frame find the change in the marker coordinates.

+ +

There are a total of 65 markers (blue dots) on the face of a speaker so at the end of one frame I have 130 (x and y coordinates) stored. I have chosen the mean and standard deviation of those points over the video sequence as my features (based on a paper published in 2009)

+ +

My question is: from these 260 features per video (mean and standard deviation of each x and y coordinate), how do I reduce this to something more understandable, or something which produces a meaningful probability output (in my case an emotion)? The paper used the plus-l-take-away-r algorithm based on the Bhattacharyya distance criterion, but I just couldn't understand how to actually say that one feature is better than another based on that criterion.

+",2013-10-16 18:51:55.967 +57642,22765.0,2,,41914.0,,,,CC BY-SA 3.0,"

As far as I know, the Lewandowsky algorithm works like the Holt-Winters algorithm. +You define three parameters:

+ +
  1. α is the data smoothing factor, with 0 < α < 1.
  2. β is the trend smoothing factor, with 0 < β < 1.
  3. γ is the seasonal change smoothing factor, with 0 < γ < 1.
+ +

If you select a value close to 1 for α, it means you rely more on recent past data than on older data.
+If you choose a value close to 1 for β, it means you rely more on the past data's trend and you believe the trend will continue in the future (you increase the weight of trend smoothing).
+If you choose a value close to 1 for γ, it means you rely more on the past data's seasonality and you believe the seasonal factors will remain in the future.
+My suggestion is to start with some values for α, β and γ, then after each period calculate your error and look for the values which reduce it.
+I used this model in health care and it gave me accurate numbers.
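
+ +

If it helps, here is a minimal R sketch using the built-in HoltWinters() function on a sample monthly series (the AirPassengers data shipped with R); by default the three parameters are chosen by minimising the squared one-step prediction error rather than set by hand:

+ +

fit <- HoltWinters(AirPassengers)
+c(fit$alpha, fit$beta, fit$gamma)    # the three smoothing parameters discussed above
+predict(fit, n.ahead = 12)           # forecasts for the next 12 periods
+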

+",2013-10-16 19:01:11.903 +57643,18767.0,1,57675.0,,,Latent Dirichlet Allocation - understanding the posterior,,CC BY-SA 3.0,"

I have a problem understanding the posterior for computing LDA, stated on page 7 of Blei (2007). From my point of view, it is not exactly consistent with Bayes' theorem, as described here. Could anyone give me a simple explanation of how this formula was derived? I don't really understand how $p(\beta, \theta, z, w)$ corresponds to $P(B|A)P(A)$ in Bayes' theorem. I will be extremely grateful for any help.

+",2013-10-16 19:06:39.283 +58283,21762.0,2,,58279.0,,,,CC BY-SA 3.0,"

A simple example of an uncorrelated but dependent pair:

+ +

$X_1=(0,0.1,\dots,1)\cdot \pi$

+ +

and

+ +

$X_2=\sin(X_1)$

+ +

Edit: Since it is not particularly easy to work with trigonometric functions, you might as well work with a triangle:

+ +

$X_1=(-2,-1,0,1,2)$ with mean 0 and

+ +

$X_2=2-|X_1|$

+",2013-10-26 22:12:46.530 +57644,12140.0,1,,,,Explaining why process obeys Central Limit Theorem,,CC BY-SA 3.0,"

I'm trying to explain why some complex process obeys Central Limit Theorem.

+ +

The process is a chip compiler that runs complex place & route algorithms. The input is an integer seed, which initializes the algorithms in a random way. The output is a real number which measures the quality of results; the higher the number, the better. The exact implementation of the place & route algorithms is not known, but their goal is to make the quality of results positive.

+ +

I run 100 compiles with different seeds. When I plot a histogram of the results, it looks like a normal distribution. I tried different designs, tool versions, etc., and always get nicely shaped normal distribution, but with different mean and variance.

+ +

I strongly suspect that Central Limit Theorem plays a role here. But why?

+ +

Why would a complex place & route algorithm obey the CLT if it has nothing to do with any explicit random distribution? Or maybe the interpretation of the results has nothing to do with the CLT.

+ +

Below is a process block diagram and example of the results.

+ +

+",2013-10-16 19:13:11.033 +57645,22769.0,1,,,,Binary Logistic Regression Multicollinearity Tests,,CC BY-SA 3.0,"

I like Peter Flom's answer to an earlier question about multicollinearity in logistic regression, but David Garson's Logistic Binomial Regression states that there is no valid test for multicollinearity for binary-dependent logistic regression, even if the independent variables are ratio scale. Can anyone supply one or more references? My own experience is that OLS correlation matrices and VIF worked for me, as my logistic coefficients went haywire before removing entangled independent variables based on the OLS tests for multicollinearity. But I have to publish my results and methods, and would like a reputable way to cite the practice, if one or more exist.

+",2013-10-16 19:53:44.243 +57646,22729.0,2,,40030.0,,,,CC BY-SA 3.0,"

Cross-validation article in Encyclopedia of Database Systems says:

+ +
+

Stratification is the process of rearranging the data as to ensure + each fold is a good representative of the whole. For example in a + binary classification problem where each class comprises 50% of the + data, it is best to arrange the data such that in every fold, each + class comprises around half the instances.

+
+ +

About the importance of the stratification, Kohavi (A study of cross-validation and bootstrap for accuracy estimation and model selection) concludes that:

+ +
+

stratification is generally a better scheme, both in terms of bias and variance, when compared to regular cross-validation.

+
+",2013-10-16 20:09:48.570 +57647,22555.0,2,,57237.0,,,,CC BY-SA 3.0,"

Doesn't the GPS data contain position $p$? I would have thought that not only is $v_{i+1}$ dependent upon $v_{i}$ and $a_{i}$, but $a_{i+1}$ would also be dependent upon $p_{i}$. Consider: in any road network there are bottlenecks, speed limits, signals, intersections, steep gradients, etc. that are geolocated. So something like an ensemble (distribution) defined by:

+ +

$F_{a} = Pr ( A_{i+1} \le a_{i+1}\ |\ a_{i},v_{i},p_{i} )$
+$v_{i+1} = v_{i} + a_{i}dt$

+ +

For such an ensemble, the difficulty will lay in the nature of the data. It is likely that the true population will be asymmetric, non-linear (piece-wise) and may not have defined moments. These characteristics may not be evident within the sample you have at hand.

+ +

As @whuber has stated, the problem, ie exactly what you are seeking to produce, does not yet seem fully and clearly defined. It is not clear as to whether you are interested in the ensemble or more so the individuals.

+",2013-10-16 20:14:15.467 +57648,4871.0,1,,,,Conditional Logit for recommender systems?,,CC BY-SA 3.0,"

Are conditional multinomial logits used for recommendation engines? Although they are commonly used in econometrics, I've never heard them used or discussed in the context of recommender systems.

+ +

Economists use multinomial conditional logits to model which of several options a person would choose and how much they value each characteristic of the items being chosen. This is often referred to as the hedonic model.

+ +

The classic multinomial logit deals with discrete items (e.g., predict whether a commuter would ""walk,"" ""take the bus,"" or ""take the subway"").

+ +

The conditional multinomial logit uses data on ANY sets of observed choices and does not require that each person choose among the same set of things. It also puts values on various characteristics/variables. For example, you may see people decide which of several houses to buy. Each house is different -- square footage, number of rooms, price, etc. Based on observed choices, the model estimates the importance of various characteristics and you can derive a predicted ""utility"" score that each person has for each house. The model then predicts that the house with the highest score is chosen.

+ +

Here is a description: +http://data.princeton.edu/wws509/notes/c6s3.html

+",2013-10-16 20:16:59.357 +57649,22772.0,2,,18335.0,,,,CC BY-SA 3.0,"

The derivation of AIC as an estimator of Kullback-Leibler information loss makes no assumptions of models being nested.

+",2013-10-16 20:18:28.967 +57650,13051.0,1,57655.0,,,Use of linear regression or logistic regression when testing conditioned group differences,,CC BY-SA 3.0,"

Background

+ +

I have a colleague interested in a particular disease and specifically if the continuous variable $X$ is different between controls and patients. Preliminary results suggest that patients have higher values of $X$ than controls.

+ +

The straightforward approach would be an unpaired t-test to test if there is a difference in mean values between patients and controls.

+ +

However, the literature suggests that age and sex are correlated with $X$ so my colleague wanted to control for this when testing. Hence, she decided to use regression to solve the problem.

+ +

Problem

+ +

My colleague's supervisor reasons that since they do not know if it is the disease that causes raised $X$ or if raised $X$ causes the disease they could use either linear regression or logistic regression to solve the problem. The supervisor also argued that logistic regression was the preferable approach.

+ +

If $d$ is a categorical variable with levels {control, disease} then the first model could be written

+ +

$$X = \beta_0 + \beta_1 d + \beta_2 \text{age} + \beta_3\text{sex} + \epsilon$$

+ +

Where the interpretation was that there is a correlation between disease and $X$ if $\beta_1$ was found significant.

+ +

and the second model

+ +

$$\log(\text{Odds}(d=\text{disease})) = \beta_0 + \beta_1 X + \beta_2\text{age} + \beta_3\text{sex} + \epsilon$$

+ +

where the interpretation was that there is a correlation between disease and $X$ if $\beta_1$ was found significant.

+ +

What is the opinion among the experts on cross validated? Are there other methods?

+ +

comments

+ +

The patient and control groups were not perfectly matched and my colleague wants to make sure that she has controlled for both sex and age to avoid upsetting the reviewers. +I do not know if there were any significant differences between the sex and age distributions between patients and controls.

+ +

There were also indications that the variance in $X$ was different in the two groups and questions were asked if this would influence the regression models.

+ +

My personal opinion is that because my colleague is interested in the conditional expected value $E(X|d,\text{age},\text{sex})$ she should use the ordinary regression model or just a t-test (if there are no age or sex differences between groups).

+",2013-10-16 20:20:48.940 +57651,15183.0,2,,57626.0,,,,CC BY-SA 3.0,"

I would suggest using the {vars} library in R. It has a function for estimating a VAR-model and for estimating an impulse response function from this model and for investigating Granger causality etc.

+ +

I suggest you look into the following functions:

+ +
> VARselect()
+> VAR()
+> irf()
+> causality()
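+
+ +

For instance, a minimal sketch along those lines (here Y stands for one unit's multivariate time series; how to pool the 33 individuals is a separate modelling decision not shown):

+ +

library(vars)
+sel <- VARselect(Y, lag.max = 8)$selection    # information criteria for the lag order
+fit <- VAR(Y, p = sel[[""AIC(n)""]])            # VAR with the AIC-chosen lag, constant included by default
+plot(irf(fit, n.ahead = 10, boot = TRUE))     # impulse responses with bootstrap bands
+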
+
+",2013-10-16 20:26:25.067 +57653,22773.0,1,57660.0,,,Denominator is Zero for Matthews correlation coefficient and F-measure,,CC BY-SA 3.0,"

Recently, I built a classification model based on an imbalanced data set (the positive class is the minority and the negative class is the majority), and the model gave the following result for the test set:

+
+

True Positives = 0

+

True Negatives = 139

+

False Positives = 0

+

False Negatives = 10.

+
+

My question is: for this result, can the Matthews correlation coefficient (MCC) and the F-measure be used for evaluating the classifier?

+

Since the denominators for MCC and the F-measure are zero, it seems meaningless. If so, MCC and the F-measure do not always work for evaluating a classifier, and sensitivity and specificity, as well as the g-mean, should be better. Is that right?

+

Any help is appreciated.

+",2013-10-16 20:30:15.897 +57654,503.0,2,,57645.0,,,,CC BY-SA 3.0,"

I'm glad you like my answer :-)

+ +

It's not that there is no valid method of detecting collinearity in logistic regression: Since collinearity is a relationship among the independent variables, the dependent variable doesn't matter.

+ +

What is problematic is figuring out how much collinearity is too much for logistic regression. David Belsley did extensive work with condition indexes. He found that indexes over 30, with substantial variance accounted for in more than one variable, were indicative of collinearity that would cause severe problems in OLS regression. However, ""severe"" is always a judgment call. Perhaps the easiest way to see the problems of collinearity is to show that small changes in the data make big changes in the results.

+ +

This paper (http://www.medicine.mcgill.ca/epidemiology/joseph/courses/epib-621/logconfound.pdf) offers examples of collinearity in logistic regression. It even shows that R detects exact collinearity, and, in fact, some cases of approximate collinearity will cause the same warning:

+ +
Warning message:
+glm.fit: fitted probabilities numerically 0 or 1 occurred
+
+ +

Nevertheless, we can ignore this warning and run

+ +
set.seed(1234)
+x1 <- rnorm(100)
+x2 <- rnorm(100)
+x3 <- x1 + x2 + rnorm(100, 0, 1)
+
+y <- x1 + 2*x2 + 3*x3 + rnorm(100)
+ylog <- cut(y, 2, c(1,0))
+
+m1<- glm(ylog~x1+x2+x3, family = binomial)
+coef(m1)
+
+ +

which yields -2.55, 1.97, 5.60 and 12.54

+ +

We can then slightly perturb x1 and x2, add them for a new x3 and run again:

+ +
x1a <- x1+rnorm(100,0,.01)
+x2a <- x2+rnorm(100,0, .01)
+x3a <- x1a + x2a + rnorm(100, 0, 1)
+
+ya <- x1a + 2*x2a + 3*x3a + rnorm(100)
+yloga <- cut(ya, 2, c(1,0))
+
+
+m2<- glm(ylog~x1a+x2a+x3a, family = binomial)
+coef(m2)
+
+ +

this yields wildly different coefficients: 0.003, 3.012, 3.51 and -0.41

+ +

and yet, this set of independent variables does not have a high condition index:

+ +
library(perturb)
+colldiag(m1)
+
+ +

says the maximum condition index is 3.54.

+ +

I am unaware if anyone has done any Monte Carlo studies of this; if not, it seems a good area for research

+",2013-10-16 20:42:49.420 +57655,5237.0,2,,57650.0,,,,CC BY-SA 3.0,"

I say you are right. You should use OLS regression here, not logistic regression.

+ +

The question of causality is a red herring. Causality is not required for either linear regression or logistic regression, and it is fine to model a cause as a (e.g., linear) function of an effect. In fact, there are predictive models that do so. As an example, researchers studying the collapse of the Mayan civilization have hypothesized that drought may have initiated its decline. Predictive models have been built that allow researchers to make an educated guess about rainfall levels (i.e., causes) from traces that remain (i.e., effects; e.g., analyses of core samples from lake beds), to clarify this possibility.

+ +

Which variable should be made the response variable, and which the explanatory variable should be decided based on the question you want to answer. It is clear from your setup that you are wondering about possible differences in the level of $X$ given the disease state. Thus, $X$ should be the response variable, and disease state should be the explanatory variable.

+",2013-10-16 20:50:03.253 +57656,16039.0,1,,,,Fitting a gam model with simple Gaussian/Student-t heteroskedasticity,,CC BY-SA 3.0,"

I am fitting a gam model in R (using the gam function in mgcv) to account for some non-linear effects in my data. A stripped down example of what I am doing in R is:

+ +
mod=gam(y~s(x)+s(z),data=df)
+
+ +

However, I want to add a slightly more complicated variance model to my regression of the form

+ +

$$\epsilon \sim N(0,\sigma^2),\ \sigma = f(\hat{\mu})$$

+ +

where $\hat{\mu}$ is the fitted value of the model. (Actually, it would be nice if $\epsilon \sim t_\nu$ for some $\nu$, but I am sticking to this for now.) I have managed to do this in the gls function from nlme using the varFunc(form=fitted(.)) type approach, but can't figure out if there is an option to do the same kind of thing using gam.

+ +

I recognise this is not really the intention of a GLM/GAM model, but I don't want to reinvent the wheel if I am just missing something obvious.

+ +

Edit: In response to the question in the comment below, I am hoping to fit a linear or quadratic function for $f$. I do not know the exact form of $f$ but plan to iteratively estimate it from the residuals if this can't be done automatically.

+ +

Edit2: Typo in R code - first spline is not meant to be a function of y!
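
+ +

For what it's worth, a minimal sketch of that iterative plan, assuming a quadratic form for $f$ and a data frame df with columns y, x and z (the number of iterations is chosen arbitrarily):

+ +
library(mgcv)
+mod <- gam(y ~ s(x) + s(z), data = df)
+for (k in 1:5) {
+  mu <- fitted(mod)
+  vfit <- lm(resid(mod)^2 ~ mu + I(mu^2))   # crude estimate of f(mu)^2 from squared residuals
+  w <- 1 / pmax(fitted(vfit), 1e-8)         # inverse-variance weights
+  mod <- gam(y ~ s(x) + s(z), data = df, weights = w)
+}
+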

+",2013-10-16 20:50:26.200 +57657,15583.0,1,57666.0,,,How to use liblbfgs for fitting?,,CC BY-SA 3.0,"

I am trying to use the BFGS algorithm in order to fit a set of $\{(x,y),f(x,y)\}$ points to a function of the form, let's say, $a\cdot \cos(x)+b \cdot y=f(x,y)$.

+ +

I am trying to understand how to use the BFGS algorithm with liblbfgs, but I don't understand the example, and it is not clear what function the author tried to fit.

+",2013-10-16 20:55:33.907 +57658,449.0,2,,57650.0,,,,CC BY-SA 3.0,"

I agree with your supervisor: logistic regression is the better choice. The continuous predictor allows you to make a more nuanced probabilistic statement about the disease relationship, rather than the blunter categorization statement of the relationship that results from using disease as the predictor.

+ +

You did word your initial question as whether X differs between controls and patients, but it isn't clear whether future use of the information might be better served by being able to make statements about the probability of disease given the value of X. I'm trusting that your supervisor has better knowledge of the more general use of the findings and would therefore offer more appropriate advice than any of us can here.

+ +

It gets a bit tricky looking at your covariates of age and sex when they're primarily supposed to be related to X. My guess is they're related to both X and disease, in which case you really need to treat them not as covariates that are generally ignored, but as fully analyzed predictors. You'll need to know the correlations among your predictors and how they influence your model.

+",2013-10-16 21:04:07.037 +57725,2873.0,2,,57718.0,,,,CC BY-SA 3.0,"

Do you have any other information available? Previous studies of similar cases? The number and types of foods that were not measured? Guesses about the process from biologists?

+ +

If you can find other existing information, then possibly your best bet (other than the simple descriptives) is a Bayesian analysis. But be very careful in how you create your priors and the relationships; without a lot of data it can be easy to over-interpret results that are more a reflection of the prior than of the data.

+",2013-10-17 18:44:32.173 +57660,5203.0,2,,57653.0,,,,CC BY-SA 3.0,"

This is only really a problem if you compute the precision and recall first, then plug them in.

+ +

One can also compute the $F_1$ score as +$$F_1 = \frac{2 \cdot \textrm{True Positive}}{2 \cdot \textrm{True Positive} + \textrm{False Positive} + \textrm{False Negative}}$$

+ +

Plugging in your numbers, you'll arrive at an $F_1$ score of zero, which seems appropriate since your classifier is just guessing the majority class.

+ +

There is an information-theoretic measure called proficiency that might be of interest if you are working on fairly unbalanced data sets. The idea is that you want it to remain sensitive to both classes as either the number of true positives or negatives approaches zero. It's essentially $$ +\frac{I(\textrm{predicted labels}; \textrm{actual labels})}{H(\textrm{actual labels})}$$

+ +

See pages 5--7 of White et al. (2004) for more details about its calculation and interpretation.

+",2013-10-16 21:31:57.827 +57661,22777.0,1,,,,Forecasting an ADL model,,CC BY-SA 3.0,"

I am fairly new to Stata and currently taking an undergraduate time series econometrics class. The economic significance of the regression I am running and attempting to forecast is all but zero; this is simply me learning how to use Stata somewhat efficiently. I have run an Autoregressive Distributed Lag (ADL) regression of Industrial Production (ip) using two lags of ip and one lag of the Federal Funds Rate (ffr).

+ +

My goal: forecast this model 30 periods into the future. I was hoping I could run an AR(2) model of the ffr and forecast it 30 periods ahead (this is straightforward and I have done so) and then use these values to forecast my ADL model 30 periods ahead.

+ +

Does this make sense and could someone inform me on how I could do this or suggest another method?

+",2013-10-16 21:40:34.343 +57662,12544.0,2,,57659.0,,,,CC BY-SA 3.0,"

You can fit a Poisson model to any kind of data. But it doesn't really make sense to fit it to data that contain non-integers, because the Poisson distribution models counts.

+ +

SPSS is worrying about that. Stata isn't.

+ +

Have you tested to see if you do have too many zeroes? (I always do.) If you do, you should run negative binomial regression instead. In Stata, run:

+ +
estat gof
+
+ +

after a Poisson regression to get a test of goodness of fit.

+ +

(Just as an aside, it's the opposite way around with non-integer frequency weights: Stata won't allow them, but SPSS will.)

+",2013-10-16 21:42:30.623 +57663,21586.0,2,,57500.0,,,,CC BY-SA 3.0,"

I fear that there exist some notational differences across different sub-disciplines of statistics. Let me stick to a pragmatic, non-technical notation quite commonly used in Econometrics. Further, in my answer let me add point 5. to the list above, denoting semi-parametric regression models.

+ +

As an illustrative example consider the case of an additive regression model with response $Y$, regression function $g(X)$ and error process $U$, +\begin{equation} +Y=g(X)+U +\end{equation} +Usually we distinguish between +1. linear and +2. non-linear +regression functions, where ""linearity"" refers to linearity-in-parameters. Common examples used in Econometrics are

+ +
  1. $g(X)=\beta_0+\beta_1 X$

  2. $g(X)=\beta_0X^{\beta_1}$
+ +

Now both cases 1. and 2., respectively, can be present in 3. parametric and 5. semi-parametric regression models. A prominent example of 3. is $U \sim N(\mu,\sigma^2)$, while case 5. is present if we do not wish to impose a parametric assumption about the distribution of $U$.

+ +

Finally, the regression function $g(X)$ may not contain parameters. If in addition we do not wish to impose a parametric assumption about the distribution of $U$ we have a 4. non-parametric regression model.

+ +

Remarks. As noted above there are different perceptions on how to define a non- or a semi-parametric model. Further, in case of non-additive regressions notational distinction becomes even more complicated. A now classical text trying to clarify the discussion is ""Econometric Foundations"" by Mittelhammer, Judge and Miller (2000, Cambridge Univ. Press).

+",2013-10-16 21:59:00.847 +57664,633.0,2,,41244.0,,,,CC BY-SA 3.0,"

Because the $N$ (independent) coin flips occur with probability proportional to $p^k(1-p)^{N-k}$, the likelihood induced on the coin's bias is $\textrm{Beta}(k + 1, N-k + 1)$.

+ +

You could have picked any parametrization of the bias. You chose to represent it as a probability $0 \le p \le 1$, but it could have been an ""odds"" $0\le o$, or a log-odds $\ell$. Since this choice is arbitrary, your prior should be independent of this choice. Jeffreys found the only prior that satisfies this ""indifference"" to the choice of parametrization: the Jeffreys prior, $\textrm{Beta}(\frac12, \frac12)$.

+ +

Pointwise product of densities gives the posterior $\textrm{Beta}(k+\frac12, N-k+\frac12)$.
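
+ +

A quick way to look at that posterior in R (k and N below are arbitrary illustration values):

+ +
k <- 7; N <- 20   # e.g. 7 heads in 20 flips
+curve(dbeta(x, k + 0.5, N - k + 0.5), from = 0, to = 1,
+      xlab = ""p"", ylab = ""posterior density"")
+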

+",2013-10-16 22:01:59.733 +57665,19325.0,1,85353.0,,,Why discriminative models are preferred to generative models for sequence labeling tasks?,,CC BY-SA 3.0,"

I understand that discriminative models, such as CRFs (Conditional Random Fields), model conditional probabilities $P(y|x)$, while generative models, such as HMMs (Hidden Markov Models), model joint probabilities $P(y,x)$.

+ +

Take CRFs and HMMs for example. I know that CRFs can use a larger range of possible features. Apart from that, what else makes CRFs (discriminative models) preferable to HMMs (generative models) in sequence labeling tasks such as part-of-speech tagging and NER (Named Entity Recognition)?

+ +

Edit:
+I found out that HMMs will have to model $P(x)$, while CRFs don't. Why would it make a big difference in sequence labeling tasks?

+",2013-10-16 22:29:27.990 +57666,22143.0,2,,57657.0,,,,CC BY-SA 3.0,"

The example is doing a 100-dimensional optimization (see #define N 100 in the code). The author is only printing the first two components of x = (x[0],x[1],...,x[N-1]), as shown below for iteration 1.

+ +
Iteration 1:
+fx = 254.065298, x[0] = -1.069065, x[1] = 1.053443
+xnorm = 10.612828, gnorm = 325.365479, step = 0.000607
+
+ +

Now f(x) is defined in the function evaluate.

+ +
for (i = 0;i < n;i += 2) {
+    lbfgsfloatval_t t1 = 1.0 - x[i];
+    lbfgsfloatval_t t2 = 10.0 * (x[i+1] - x[i] * x[i]);
+    g[i+1] = 20.0 * t2;
+    g[i] = -2.0 * (x[i] * g[i+1] + t1);
+    fx += t1 * t1 + t2 * t2;
+}
+
+ +

fx holds the function value at $x$, and $g(x)$ is an $N\times 1$ (here $100\times 1$) gradient vector.

+ +

\begin{align} +f(x) = \sum_{i=0,2,4,...,N-2}(1-x_i)^2 + \left(10(x_{i+1} - x_i^2)\right)^2 +\end{align} +The gradient at odd components ($i=1,3,5,...$) is +$$200(x_{i+1} - x_i^2) $$ +The gradient component at even coordinates ($i=0,2,4,6,...$) is +$$-2(1-x_i) - 400\,x_i\,(x_{i+1} - x_i^2)$$

+ +

Note:

+ +
  1. Indexing starts from 0.
  2. I believe the gradients in the code are incorrect. When I change the appropriate line g[i+1] = 20.0 * t2; to g[i+1] = 200.0 * t2; I get a different answer. Potentially I may be making a mistake here; nonetheless, hopefully I have answered your question.
+ +

Our fitting problem +In our case, we have a two-dimensional problem. Rename our $f(x,y)$ to $z$. Then we have an $m\times 3$ matrix of values, with each row being a fixed tuple $(x_j,y_j,z_j), j=1,...,m$. We could now minimize the function $h(a,b)$ +\begin{align} +h(a,b) = \sum_{j=1}^{m}(a\cos(x_j) +b y_j - z_j)^2 +\end{align} +with +\begin{align} +\frac{\partial h(a,b)}{\partial a} = 2\sum_{j=1}^{m}\left((a\cos(x_j) +b y_j - z_j)\cos(x_j)\right)\\ +\frac{\partial h(a,b)}{\partial b} = 2\sum_{j=1}^{m}\left((a\cos(x_j) +b y_j - z_j)y_j\right) +\end{align} +as the gradient functions. +All that you need to do is encode these in place of the for loop above, change #define N 100 to 2, and pass some initial values of $a,b$ into the lbfgs function.
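
+ +

As a cross-check of those formulas, here is a small sketch in R using optim's BFGS implementation rather than liblbfgs; the data and starting values below are made up for illustration:

+ +
set.seed(1)
+m <- 200
+x <- runif(m, 0, 2*pi); y <- rnorm(m)
+z <- 2*cos(x) - 0.5*y + rnorm(m, 0, 0.1)     # true a = 2, b = -0.5
+h  <- function(p) sum((p[1]*cos(x) + p[2]*y - z)^2)
+gh <- function(p) { r <- p[1]*cos(x) + p[2]*y - z; c(2*sum(r*cos(x)), 2*sum(r*y)) }
+optim(c(0, 0), h, gh, method = ""BFGS"")$par   # should be close to (2, -0.5)
+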

+",2013-10-16 22:32:05.073 +57667,20752.0,2,,57665.0,,,,CC BY-SA 3.0,"

CRFs and HMMs are not necessarily exclusive model formulations. In the formulation you have above, X in the HMM is usually a state variable that is unobserved, so a generative model is somewhat necessary. In the CRF though, X is some feature vector that is observed and affects Y in the traditional way. But you can have a combination of both: a sequence of states and outputs where the state is unobserved, and a set of observed features that affects the conditional probabilities of the outputs given the states (or transition probabilities between states).

+ +

I believe that ultimately the CRF admits some more flexible models where the conditional probabilities are more dynamic, and could be affected by, for example, the output from several observations ago, or something like that. They can get awfully large and difficult to train when they start including many more free parameters like that though.

+",2013-10-16 22:50:58.400 +57668,9175.0,1,57724.0,,,Prove the following are independent,,CC BY-SA 3.0,"

I have been unable to figure out the following question

+ +

$f_{Y|X}(y|x)=N(x,x^2)$

+ +

$f_X(x)=U(0,1)$

+ +

Prove that $\frac{Y}{X}$ and $X$ are independent

+",2013-10-16 23:16:46.210 +57669,1145.0,1,57671.0,,,Calculate $t$ value for expanded sample using just $t$ and $n$,,CC BY-SA 3.0,"

Imagine a sample of observations $x_1 \dots x_n$ for which a $t$-value (i.e. $\bar x/SEM$) is known. Say an additional observation of zero is added to the sample so that we have $x_1 \dots x_{n+1}$ where $x_{n+1}=0$.

+ +

Can a new $t$-value be calculated from just the known original $t$ and $n$? (I suspect that it can, but my mathematics is inadequate for me to work out exactly how.)

+",2013-10-16 23:18:37.720 +57670,16205.0,1,,,Manas,Comparing statistics of networks of different sizes,,CC BY-SA 3.0,"

Hi, this may be a basic stats question.

+ +

Let's say I have 3 networks of different sizes, where size is in terms of the number of nodes and links. +Networks n1, n2 and n3 have v1, v2 and v3 nodes and l1, l2 and l3 links, respectively. That is all we know about the networks.

+ +

For each of these networks we compute some parameters based on some criteria. For example,

+ +

param1 = no. of nodes with criteria 1

+ +

param2 = no. of nodes with criteria 2

+ +

param3 = no. of nodes with criteria 3

+ +

Result

+ +
   nodes links param1 param2 param3 
+n1   v1    l1     a      b      c   
+n2   v2    l2     d      e      f   
+n3   v3    l3     g      h      i
+
+ +

Question

+ +

If I average, (a+d+g)/3, to get the expected param1 across networks, would that make sense, knowing that the sizes of the networks are different and the parameters are a function of the size of the network? How can I reconcile this with the fact that different networks have different sizes and therefore cannot be compared directly?

+",2013-10-16 23:34:59.187 +57671,594.0,2,,57669.0,,,,CC BY-SA 3.0,"

You have $$t = \frac{\bar x}{s/\sqrt n} = \sqrt n \frac{\bar x}{s}$$

+ +

So $$\frac{\bar x}{s} =t/\sqrt{n}$$

+ +

Using the subscript $n$ to denote ""calculated from the sample of size $n$"":

+ +

$$\bar{x}_{n+1} = \frac{n\bar{x}_n + 0}{n+1} = \frac{n}{n+1} \bar{x}_n$$

+ +

$$s^2_{n+1} = \frac{1}{n}[(n-1)s^2_n + (0-\bar{x}_{n})(0-\bar{x}_{n+1})] = \frac{n-1}{n}s^2_n+\bar{x}_{n}\bar{x}_{n+1}/n$$

+ +

$$= \frac{n-1}{n}s^2_n+\frac{\bar{x}_{n}^2}{n+1}$$

+ +

$$s_{n+1} = s_n\sqrt{\frac{n-1}{n}+\frac{\bar{x}_{n}^2}{(n+1)s^2_n}}= \frac{s_n}{\sqrt{n}}\sqrt{n-1+\frac{t_n^2}{n+1}}$$

+ +

So:

+ +

\begin{eqnarray} +t_{n+1} &=& \frac{\bar{x}_{n+1}}{s_{n+1}/\sqrt{n+1}}\\ +&=& \frac{\frac{n}{n+1} \bar{x}_n}{\frac{s_n}{\sqrt{n}} \sqrt{n-1+\frac{t_n^2}{n+1}}/\sqrt{n+1}}\\ +&=& \frac{\bar{x}_n}{s_n/\sqrt{n}}\frac{\frac{n}{n+1} }{ \sqrt{n-1+\frac{t_n^2}{n+1}}/\sqrt{n+1}}\\ + &=& t_n \frac{n }{ \sqrt{n^2-1+t_n^2}} +\end{eqnarray}

+ +
+ +

Numerical example in R; first computed using the formula above, then the actual expanded sample:

+ +
> x=rnorm(10);(t=mean(x)/(sd(x)/sqrt(10)))
+[1] -0.2219399
+> t*(10/sqrt(99+t^2))
+[1] -0.2230025
+> x1=c(x,0);(t1=mean(x1)/(sd(x1)/sqrt(11)))
+[1] -0.2230025
+
+ +

Looks like it works.

+",2013-10-17 00:00:26.417 +57672,,1,,,user30490,Relationship between inverse gamma and gamma distribution,,CC BY-SA 3.0,"

I have the following posterior distribution for $v$ +$$f(v)\propto v^{-p/2}\exp\left(-\frac{1}{v}\frac{s}{2}\right)$$ +and so clearly +$$v\sim\text{Inverse-Gamma}\left(\frac{p}{2}-1,\frac{s}{2}\right)$$

+ +

Now can I say that +$$v^{-1}\sim\text{Gamma}\left(\frac{p}{2}-1,\frac{s}{2}\right)$$

+",2013-10-17 00:18:07.753 +57673,22779.0,1,57674.0,,,Relationship between test stat and p-value in relation to t-test,,CC BY-SA 3.0,"

I was wondering why it is necessarily true that if a test statistic exceeds the critical value of t, then it will also be true that the p-value will not exceed the level of significance.

+",2013-10-17 00:23:22.743 +57674,5237.0,2,,57673.0,,,,CC BY-SA 3.0,"

I suppose this depends on what is meant by ""exceed"", but generally when people say a test statistic exceeds the critical value, they mean $|t|\boldsymbol{>}t_{\rm crit}$, and when they say the p-value ""exceeds"" the level of significance (in the sense of being more extreme), they mean $p\boldsymbol{<}\alpha$. Thus, when the test statistic exceeds the critical value of t, the p-value also ""exceeds"" the level of significance in that sense.

+ +

As to why that is the case, it is simply because the value of $t_{\rm crit}$ is defined as the point where $p=\alpha$, so any more extreme test statistic must correspond to a smaller p-value.

+",2013-10-17 00:31:51.443 +57675,4537.0,2,,57643.0,,,,CC BY-SA 3.0,"

Do you mean equation (2)? I think he's not using Bayes' theorem at all -- he's just using the definition of conditional probability.

+ +

Recall that if $A$, $B$ are events, then +$$ +P(A | B) = \frac{P(A \cap B)}{P(B)}. +$$ +If you want to know more about conditional probability, I think the Wiki article is pretty good.

+",2013-10-17 01:22:03.280 +57676,13045.0,1,94362.0,,,Inverse transformation sampling for mixture distribution of two normal distributions,,CC BY-SA 3.0,"

I am confused by the particular way the inversion method is required to be used in the following problem.

+ +

Here is the problem:

+ +
+

Consider a mixture distribution of two normal distributions, where the + desired PDF $f(x)$ is given by:

+ +

$f(x) = r\, f_a(x) + (1 − r)\, f_b(x)$,where $f_a$ and $f_b$ are + normal PDFs with means $a$ and $b$, respectively (standard deviation + is 1 for both). Using two uniform random variables $u_1$ and $u_2$, + explain how we can use the inversion method to sample from $f(x)$. + Note, the qnorm command in R may be helpful here.

+
+ +

My confusion comes from ""two uniform random variables $u_1$ and $u_2$"". My thought is that we work out the CDF, $F(x)$ (which can be obtained via pnorm() in R), and then use some numerical method (such as Newton-Raphson) to invert it and generate $x\sim f(x)$; this only needs one uniform random variable and does not need qnorm().

+ +

What's wrong with my method? Does the problem suggest a better method?
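
+ +

For concreteness, a minimal sketch of the single-uniform numerical inversion described above, with uniroot standing in for Newton-Raphson and arbitrary values of r, a and b:

+ +
r <- 0.3; a <- 0; b <- 4
+Fmix <- function(x) r * pnorm(x, a, 1) + (1 - r) * pnorm(x, b, 1)
+u <- runif(1)
+x <- uniroot(function(t) Fmix(t) - u, interval = c(a - 10, b + 10))$root
+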

+",2013-10-17 01:37:34.807 +57677,18865.0,2,,30862.0,,,,CC BY-SA 4.0,"

The point of low-rank approximation is not necessarily just for performing dimension reduction.

+ +

The idea is that based on domain knowledge, the data/entries of the matrix will somehow make the matrix low rank. But that is in the ideal case where the entries are not affected by noise, corruption, missing values etc. The observed matrix typically will have much higher rank.

+ +

Low-rank approximation is thus a way to recover the ""original"" (the ""ideal"" matrix before it was messed up by noise etc.) low-rank matrix i.e., find the matrix that is most consistent (in terms of observed entries) with the current matrix and is low-rank so that it can be used as an approximation to the ideal matrix. Having recovered this matrix, we can use it as a substitute for the noisy version and hopefully get better results.

+",2013-10-17 01:40:25.533 +57678,17448.0,1,57682.0,,,How can the Kolmogorov-Smirnov test be used/interpreted?,,CC BY-SA 3.0,"

The Kolmogorov-Smirnov (K-S) test is a traditional test of normality, although the Shapiro-Wilk (S-W) test is applied more frequently than the K-S (Arango, 2012).

+ +

I am not an expert in statistics, so my questions concern the use of the K-S test.

+ +
  1. Is it possible to use the K-S test for other purposes besides testing normality?
  2. Why should we sometimes use the K-S test instead of the S-W test? Is it related to sample size?
+",2013-10-17 01:46:24.010 +57679,4537.0,2,,57672.0,,,,CC BY-SA 3.0,"

Yes, but I think the first parameter of the Gamma should be $1-p/2$ instead of $1+p/2$. +$$ +v \sim \text{Gamma}(1-p/2, s/2) +$$ +I'm using the shape-rate parametrization, as in here.

+",2013-10-17 01:52:23.197 +57680,2075.0,1,57777.0,,,Linear kernel and non-linear kernel for support vector machine?,,CC BY-SA 3.0,"

When using support vector machine, are there any guidelines on choosing linear kernel vs. nonlinear kernel, like RBF? I once heard that non-linear kernel tends not to perform well once the number of features is large. Are there any references on this issue?

+",2013-10-17 02:21:02.553 +57681,10684.0,2,,57670.0,,,,CC BY-SA 3.0,"

A typical strategy would be to compare the proportion of the nodes in each network satisfying each of the criteria instead of the number of nodes. So for example, if your networks had $100, 200, 400$ nodes respectively and the param1 for each of these networks was $50, 60, 80$, then the corresponding proportions of nodes of this particular type would be $50/100, 60/200$ and $80/400$, or $0.5, 0.3, 0.2$. Then it would make sense to average these numbers; the average proportion of nodes of this type is $(0.5+0.3+0.2)/3 = 0.333$. So if, say, you had a new network with $1000$ nodes, you might guess that it would have about $0.333 \times 1000 = 333$ nodes of the desired type, all other things being equal.

+",2013-10-17 02:27:02.890 +57682,594.0,2,,57678.0,,,,CC BY-SA 3.0,"

The Kolmogorov-Smirnov test is a test of any completely specified continuous distribution against general alternatives.

+ +

The Shapiro-Wilk is a test of normality without specifying the mean or variance.

+ +

That is, the K-S and the S-W apply to different circumstances. To apply the K-S to the situation of the S-W, you'd get the Lilliefors test for normality (which allows for the effect of the parameter estimation, via simulation). Alternatively, to apply the S-W to the situation of the K-S on normal distributions you'd need to add a test for the specified mean and variance and combine the two in some way.

+ +

The Shapiro-Wilk has excellent power against a wide range of alternatives from normality.

+ +

There are other alternatives to the Shapiro-Wilk, such as the Anderson-Darling test. The Anderson-Darling is usually preferred to the K-S on the basis that it generally has better power against interesting alternatives.

+ +

If you adjust the distribution of the A-D for estimated parameters, it's reasonably competitive with the Shapiro-Wilk at the normal, but the S-W would generally be slightly preferred.
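
+ +

To illustrate the distinction in R on simulated data (note that the parameters handed to ks.test are fully specified in advance rather than estimated from the sample):

+ +
set.seed(42)
+x <- rnorm(50, mean = 5, sd = 2)
+ks.test(x, ""pnorm"", mean = 5, sd = 2)   # K-S: completely specified null distribution
+shapiro.test(x)                         # S-W: normality with unspecified mean and variance
+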

+",2013-10-17 02:46:15.200 +57683,594.0,2,,57331.0,,,,CC BY-SA 3.0,"

Your question implies that for independent random variables, $σ_\text{sum} = σ_x + σ_y + σ_z$. This is not the case.

+ +

The squares are additive: $σ^2_\text{sum} = σ^2_x + σ^2_y + σ^2_z$. So $σ_\text{sum} = \sqrt{σ^2_x + σ^2_y + σ^2_z}$.

+ +

However, otherwise you're correct - if you add three independent normal random variables, the distribution of the sum is normal with mean equal to the sum of their means and variance equal to the sum of their variances (indeed that applies to adding any number of terms).

+ +

Variances of correlated random variables are a little more complicated, but still straightforward. For correlated multivariate normals, you also still have normality.
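
+ +

A quick numerical check of the squares-are-additive point, with arbitrary standard deviations:

+ +
set.seed(1)
+x <- rnorm(1e5, 0, 1); y <- rnorm(1e5, 0, 2); z <- rnorm(1e5, 0, 3)
+sd(x + y + z)             # close to sqrt(1 + 4 + 9), about 3.74, not 1 + 2 + 3 = 6
+sqrt(1^2 + 2^2 + 3^2)
+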

+",2013-10-17 02:59:28.050 +57684,22781.0,1,,,,Sobel test with survey data,,CC BY-SA 3.0,"

I would like to ask whether there is any problem or concern with the Sobel test when I use survey data. In particular, I was using the web-based calculation tool here, based on coefficients derived from regression analysis using the survey command. I would also like to know whether there is a good reference article about this.

+",2013-10-17 03:10:06.457 +57685,18268.0,1,,,,Weighing probabilities into a polygon,,CC BY-SA 3.0,"

I have a collection of 4-member probability vectors (essentially proportions over 4 mutually exclusive categories). Is there a method to represent these data as a cloud of points inside a square? If each value of the 4-tuple represents a weight towards one of the 4 edges, can we appropriately place each point inside the 2D space?

+ +

What I am looking for is a method to plot data of the following sort:

+ +
1: 1 0 0 0
+2: 0 1 0 0
+3: 0.5 0.5 0 0
+4: 0.25 0.25 0.25 0.25
+
+ +

In the following fashion (sorry the bottom border got clipped)

+ +

+ +

Thanks.

+",2013-10-17 04:21:25.697 +57686,22783.0,1,,,,Plot of copula (based on data set) - R,,CC BY-SA 3.0,"

I have to do an empirical analysis for a statistics paper. For this I want to show the differences of dependence structure for a specific data set.

+ +

So I selected 2 stock prices, transformed them into returns, and started to measure the dependency with R. So far there is no problem: I have results for Bravais-Pearson, Kendall and Spearman. Additionally, I plotted the regression model for these two series.

+ +

I have read in many papers that, according to Sklar's theorem, it is easy to get the copula function out of the joint distribution function, just by using the inverses of the marginals.

+ +

So my question is whether it is possible in R to plot the copula function (and density) just from this data set (2 return series), or whether I must first estimate the parameters to be able to plot this function.

+ +

And how can I do this in R? I tried to find the answer in the manual of the ""copula"" package, but my search wasn't really helpful.
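
+ +

One nonparametric possibility, sketched here with base R only and with r1 and r2 standing in for the two return series, is to plot the pseudo-observations obtained by rank-transforming each margin:

+ +
n <- length(r1)
+u <- rank(r1) / (n + 1)   # probability-integral transform via ranks
+v <- rank(r2) / (n + 1)
+plot(u, v, xlab = ""u"", ylab = ""v"")       # scatter of the empirical copula
+# contour(MASS::kde2d(u, v, n = 50))      # a rough copula density, if MASS is available
+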

+ +

Thanks in advance for your help!

+",2013-10-17 05:20:18.950 +57687,594.0,2,,57359.0,,,,CC BY-SA 3.0,"

When including polynomials and interactions between them, multicollinearity can be a big problem; one approach is to look at orthogonal polynomials.

+ +

Generally, orthogonal polynomials are a family of polynomials which are orthogonal with +respect to some inner product.

+ +

So for example in the case of polynomials over some region with weight function $w$, the +inner product is $\int_a^bw(x)p_m(x)p_n(x)dx$ - orthogonality makes that inner product $0$ +unless $m=n$.

+ +

The simplest example for continuous polynomials is the Legendre polynomials, which have +constant weight function over a finite real interval (commonly over $[-1,1]$).

+ +

In our case, the space (the observations themselves) is discrete, and our weight function is also constant (usually), so the orthogonal polynomials are a kind of discrete equivalent of Legendre polynomials. With the constant included in our predictors, the inner product is simply $p_m(x)^Tp_n(x) = \sum_i p_m(x_i)p_n(x_i)$.

+ +

For example, consider $x = 1,2,3,4,5$

+ +

Start with the constant column, $p_0(x) = x^0 = 1$. The next polynomial is of the form $ax-b$, but we're not worrying about scale at the moment, so $p_1(x) = x-\bar x = x-3$. The next polynomial would be of the form $ax^2+bx+c$; it turns out that $p_2(x)=(x-3)^2-2 = x^2-6x+7$ is orthogonal to the previous two:

+ +
x         p0  p1  p2   
+1          1  -2   2   
+2          1  -1  -1
+3          1   0  -2
+4          1   1  -1
+5          1   2   2
+
+ +

Frequently the basis is also normalized (producing an orthonormal family) - that is, the sum of squares of each term is set to some constant (say, to $n$, or to $n-1$, so that the standard deviation is 1, or perhaps most frequently, to $1$).

+ +

Ways to orthogonalize a set of polynomial predictors include Gram-Schmidt orthogonalization, and Cholesky decomposition, though there are numerous other approaches.

+ +
+ +

Some of the advantages of orthogonal polynomials:

+ +

1) multicollinearity is a nonissue - these predictors are all orthogonal.

+ +

2) The low-order coefficients don't change as you add terms. If you fit a degree $k$ polynomial via orthogonal polynomials, you know the coefficients of a fit of all the lower order polynomials without re-fitting.

+ +
+ +

Example in R (cars data, stopping distances against speed): +

+ +

Here we consider the possibility that a quadratic model might be suitable:

+ +

R uses the poly function to set up orthogonal polynomial predictors:

+ +
> p <- model.matrix(dist~poly(speed,2),cars)
+> cbind(head(cars),head(p))
+  speed dist (Intercept) poly(speed, 2)1 poly(speed, 2)2
+1     4    2           1      -0.3079956      0.41625480
+2     4   10           1      -0.3079956      0.41625480
+3     7    4           1      -0.2269442      0.16583013
+4     7   22           1      -0.2269442      0.16583013
+5     8   16           1      -0.1999270      0.09974267
+6     9   10           1      -0.1729098      0.04234892
+
+ +

They're orthogonal:

+ +
> round(crossprod(p),9)
+                (Intercept) poly(speed, 2)1 poly(speed, 2)2
+(Intercept)              50               0               0
+poly(speed, 2)1           0               1               0
+poly(speed, 2)2           0               0               1
+
+ +

Here's a plot of the polynomials: +

+ +

Here's the linear model output:

+ +
> carsp <- lm(dist ~ poly(speed, 2), data = cars)
+> summary(carsp)
+
+Call:
+lm(formula = dist ~ poly(speed, 2), data = cars)
+
+Residuals:
+    Min      1Q  Median      3Q     Max 
+-28.720  -9.184  -3.188   4.628  45.152 
+
+Coefficients:
+                Estimate Std. Error t value Pr(>|t|)    
+(Intercept)       42.980      2.146  20.026  < 2e-16 ***
+poly(speed, 2)1  145.552     15.176   9.591 1.21e-12 ***
+poly(speed, 2)2   22.996     15.176   1.515    0.136    
+---
+Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+
+Residual standard error: 15.18 on 47 degrees of freedom
+Multiple R-squared:  0.6673,    Adjusted R-squared:  0.6532 
+F-statistic: 47.14 on 2 and 47 DF,  p-value: 5.852e-12
+
+ +

Here's a plot of the quadratic fit: +

+",2013-10-17 06:04:30.500 +57688,22426.0,1,,,,Meaning of link functions (GLM),,CC BY-SA 3.0,"

I am performing ordinal regression on several datasets; I have 5 ordered response categories and only one explanatory variable X. +For each dataset I run the analysis 3 times, each time using a different link function (1. probit, 2. logit, 3. complementary log-log), and I calculate the AIC to see which function fits my data best.

+ +

It seems that for different datasets, different link functions provide a significantly ""better"" fit; for example, probit is better for dataset 1 and logit is better for dataset 2, etc. +I am trying to find an explanation for this difference.

+ +

So my question is, what is the ""physical"" meaning of each link function? +For example, I understand the probit link function assumes the response scale can be related to a latent continuous, normally distributed variable, but for the other two I have no idea.

+ +

Any insight on this would be great!

+",2013-10-17 06:08:37.423 +57689,12787.0,1,,,,Not specifying the main effect of a term that is part of a tensor product interaction,,CC BY-SA 3.0,"

Say you've got a model +$$ +y = f(X_1,X_2)+\epsilon +$$

+ +

and you're OK with linear (or other parametric) functional forms. Say you think that the effect of $X_1$ on $y$ depends on $X_2$. The standard wisdom is that one should include the main effects alongside the interaction, to capture the effect of $X_1$ on $y$ when $X_2 = 0$ and to render coefficients invariant to location changes.

+ +

I am curious whether this applies to generalized additive models estimated by penalized splines, with interactions represented by tensor products. Intuitively, it seems like it would NOT apply.

+ +

WRT the first issue, the tensor product smooth estimates the ""height"" of a surface defined by two variables, along a grid. Thus you've always got a function for one variable at a fixed value of the other. If you had a univariate smooth along with it, you'd have to simply add the two, at a cost of degrees of freedom (which would get penalized/reduced).

+ +

WRT the second issue, this paper by Simon Wood lays out how the tensor products implemented in gam are scale invariant.

+ +

Is this a reasonable interpretation? If I wanted to estimate the model above nonparametrically, is it unreasonable to leave out smooth functions of $X_1$ and $X_2$, and simply model $X_1 \otimes X_2$?

+ +

If yes, how would you quickly and simply explain why to an audience that isn't used to GAMs?
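
+ +

For reference, the two specifications being contrasted can be written in mgcv roughly as follows; y, x1, x2 and df are placeholders:

+ +
library(mgcv)
+m_sep <- gam(y ~ s(x1) + s(x2) + ti(x1, x2), data = df)  # main-effect smooths plus a pure interaction
+m_te  <- gam(y ~ te(x1, x2), data = df)                  # a single tensor product surface
+AIC(m_sep, m_te)
+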

+",2013-10-17 06:43:51.323 +57690,8719.0,1,57693.0,,,Simulating p-values as a function of sample size,,CC BY-SA 3.0,"

We are trying to prove a very subtle effect occurring in cells after a certain treatment. Let's assume that the measurements are normally distributed. Let's also assume the untreated cells have $\mu = 1$ and $\sigma = 0.1$ and the treated cells have $\mu = 1.1$ and $\sigma = 0.22$. The question is:

+ +

How large must the sample size be in order for the observed effect to be statistically significant ($\alpha = 0.05$)?

+ +

I know that very subtle effects require a larger sample size than more apparent effects, but how large? I'm still learning statistics, so please be patient with me. I tried to perform a little simulation in R. Assuming that you randomly pick $n$ samples from a normal distribution, I tried to calculate the mean p-value as a function of $n$.

+ +

+ +

Is this a correct way to find the right sample size? Or am I completely off track with this approach?

+ +

Code:

+ +
library(ggplot2)
+
+ctrl.mean <- 1
+ctrl.sd <- 0.1
+treated.mean <- 1.1
+treated.sd <- 0.22
+
+# Function that repeats t-test a number of times (rpt) with given sample size, means and sds.
+# Returns a list of p-values from the test
+
+tsim <- function(rpt, n, mean1, sd1, mean2, sd2) {
+  x <- 0
+  ppool <- NULL
+  while (x <= rpt) {
+    ppool <- c(ppool, t.test(rnorm(n,mean1,sd1), y = rnorm(n,mean2,sd2))$p.value)
+    x <- x + 1
+  }
+  return(ppool)
+}
+
+# Iterate through sample sizes and perform the function
+# Returns data frame with list of mean p-values at a given sample size
+
+i <- 2
+num <- 50
+res <- NULL
+
+while (i <= num) {
+  sim <- tsim(1000, i, ctrl.mean, ctrl.sd, treated.mean, treated.sd)
+  res <- rbind(res, cbind(i, mean(sim), sd(sim)))
+  i <- i + 1
+}
+
+# Plot the result
+
+res <- as.data.frame(res)
+
+ggplot(res, aes(x=i, y=-log10(V2))) +
+  geom_line() +
+  geom_ribbon(aes(ymin=-log10(V2)-log10(V3), ymax=-log10(V2)+log10(V3)), alpha = 0.2) +
+  annotate(""segment"", x = 6, xend = num, y = -log10(0.05), yend = -log10(0.05), colour = ""red"", linetype = ""dashed"") +
+  annotate(""text"",  x = 0, y=-log10(0.05), label= ""p = 0.05"", hjust=0, size=3) +
+  annotate(""segment"", x = 6, xend = num, y = -log10(0.01), yend = -log10(0.01), colour = ""red"", linetype = ""dashed"") +
+  annotate(""text"",  x = 0, y=-log10(0.01), label= ""p = 0.01"", hjust=0, size=3) +
+  annotate(""segment"", x = 6, xend = num, y = -log10(0.001), yend = -log10(0.001), colour = ""red"", linetype = ""dashed"") +
+  annotate(""text"",  x = 0, y=-log10(0.001), label= ""p = 0.001"", hjust=0, size=3) +
+  xlab(""Number of replicates"") +
+  ylab(""-log10(p-value)"") +
+  theme_bw()
+
+",2013-10-17 07:48:14.250 +57691,9047.0,1,58379.0,,,Construct confidence interval of the mean for auto-correlated data,,CC BY-SA 3.0,"

I feel like I'm missing something obvious, but here we go. I have auto-correlated data measured in triplicate for two (or more) treatments. Something like this:

+ +
t <- 3:20 #times in my real dataset are possibly not always equidistant
+a <- structure(c(0.652492388457625, 0.905172522010166, 1.23437705454616, 
+                 1.48003667490842, 1.77876898946135, 1.99175317367897, 2.31666502140984, 
+                 2.43520651415548, 2.67903421794922, 2.84115747823017, 2.89693734873647, 
+                 2.91199679761145, 2.85645436179354, 2.99371033437697, 2.99965220711105, 
+                 2.84984814715963, 2.64275376547326, 2.64060469520379, 0.481029734912324, 
+                 0.8466803252367, 1.31126162780809, 1.56745630574946, 1.74865844658142, 
+                 1.80367117155375, 2.06688393210808, 2.24500095501872, 2.52978288460243, 
+                 2.69073206006205, 2.89657418056785, 2.93759772556246, 2.99305951550274, 
+                 2.89146932307489, 2.88890777189028, 2.7974672802907, 2.70933381639295, 
+                 2.66799551352975, 0.624178180970784, 0.867127935268765, 1.09752295578438, 
+                 1.35037796202753, 1.60094288950107, 1.97949255710341, 2.15496378191076, 
+                 2.42556913246041, 2.54331160179646, 2.67440414122285, 2.84249532365163, 
+                 2.95278639560433, 3.06192227561515, 3.03297885461444, 3.04101341059534, 
+                 3.01736966686846, 2.80061410999215, 2.69852643323913), 
+               .Dim = c(18L, 3L), .Dimnames = list(NULL, c(""a1"", ""a2"", ""a3"")))
+b <- structure(c(0.516527990622755, 0.84883434472028, 1.04202664437099, 
+                 1.3100841689546, 1.48050413266838, 1.7824492800856, 1.96557179831706, 
+                 2.17419105778186, 2.2453178060978, 2.35460428313729, 2.49308342865959, 
+                 2.62343038370418, 2.70831189685371, 2.79459971623943, 2.94938536147398, 
+                 3.04822554887815, 3.00287042052314, 2.91673487674283, 0.589490441973075, 
+                 0.751768045201717, 0.917973959434798, 1.17617337222852, 1.39497560590896, 
+                 1.65920945485901, 1.87749014780468, 2.11880355292648, 2.372755207219, 
+                 2.46211141942227, 2.59688733749884, 2.72270421752644, 2.79848710425447, 
+                 2.81134394947587, 2.75390203306788, 2.78499114431362, 2.86001341271914, 
+                 2.95652300178809, 0.558662398944567, 0.834996005844121, 0.988238211915554, 
+                 1.27569591423003, 1.38577342414377, 1.62664982549252, 1.83299700801392, 
+                 2.04943560731628, 2.22950648854987, 2.38533269800646, 2.49845003387994, 
+                 2.60036098089373, 2.61941602504858, 2.71298500309883, 2.78126388719353, 
+                 3.04792375845498, 3.02691814463875, 3.06667590650438), 
+               .Dim = c(18L, 3L), .Dimnames = list(NULL, c(""b1"", ""b2"", ""b3"")))
+
+matplot(t,a,pch=1,xlab="""",ylab="""",col=""blue"")
+matlines(t,a,col=""blue"", lty=2)
+
+matpoints(t,b,pch=16,col=""red"")
+matlines(t,b,col=""red"", lty=2)
+
+ +

+ +

I would like to know in which time periods the treatments differ. I would like to avoid fitting any kind of model. (There are models for my kind of data from science, but they are known to be only an approximation for some ranges of my data and I'm afraid that model error might mask differences.) My idea is to calculate the mean and construct confidence intervals (using an assumption of normality) like this:

+ +
a_means <- apply(a,1,mean)
+a_sds <- apply(a,1,sd)
+a_lwr <- a_means-qt(0.975,3)*a_sds/sqrt(3)
+a_upr <- a_means+qt(0.975,3)*a_sds/sqrt(3)
+
+b_means <- apply(b,1,mean)
+b_sds <- apply(b,1,sd)
+b_lwr <- b_means-qt(0.975,3)*b_sds/sqrt(3)
+b_upr <- b_means+qt(0.975,3)*b_sds/sqrt(3)
+
+DF <- data.frame(treat=factor(rep(1:2, each=length(t))), 
+                 time=rep(t, 2),
+                 mean=c(a_means,b_means),
+                 lwr=c(a_lwr,b_lwr),
+                 upr=c(a_upr,b_upr))
+
+library(ggplot2)
+p <- ggplot(DF, aes(x=time, y=mean, ymin=lwr, ymax=upr)) +
+  geom_ribbon(aes(fill=treat), alpha=0.3) +
+  geom_line(aes(color=treat))
+print(p)
+
+ +

+ +

The way I'm constructing the confidence intervals obviously doesn't consider auto-correlation.

+ +
  • Is there a way to construct some kind of ""auto-correlated confidence interval""?
  • Can I use the ""un-correlated confidence interval""? Can I somehow estimate whether it is too narrow or too wide in comparison to the auto-correlated confidence interval?
  • Is there a better approach to my problem?
+",2013-10-17 07:52:01.927 +57692,22784.0,1,57697.0,,,How to check if there is any “dependence” between columns?,,CC BY-SA 3.0,"

How can I check whether these values are ""dependent""? Take a look at the values in the second column: the difference between them is really huge, so it's hard to choose a scale. Any suggestions for checking ""dependence"" between them are welcome. I was thinking about correlation, but I am new to statistics, so please let me know step by step what I should do to analyse these data.

+ +
diff    abund_mean
+ 0   3444804.79
+ 1   847887.02
+ 2   93654.19
+ 0   721692.76
+ 2   382711.04
+ 1   428656.65
+ 1   120933.91
+ 0   157528.72
+ 1   159650.70
+ 0   124602.80
+ 0   90844.33
+ 2   501825.37
+ 1   270592.56
+
+ +

I am learning R as well, so I can calculate everything in R if you let me know how to do that.

+",2013-10-17 08:07:59.677 +57693,4910.0,2,,57690.0,,,,CC BY-SA 3.0,"

You have almost performed what is usually called a power analysis. I say almost, because what you usually measure in a power calculation is not the mean p-value, but rather the probability that, given the sample size and the hypothesised mean difference, you would get a p-value lower than say 0.05.

+ +

You can make small changes to your calculations in order to get this probability, however. The following script is a modification of your script that calculates the power for sample sizes from 2 to 50:

+ +
ctrl.mean <- 1
+ctrl.sd <- 0.1
+treated.mean <- 1.1
+treated.sd <- 0.22
+
+n_range <- 2:50
+max_samples <- 50
+power <- NULL
+p.theshold <- 0.05
+rpt <- 1000
+
+for(n in n_range) {
+  pvals <- replicate(rpt, {
+    t.test(rnorm(n,ctrl.mean, ctrl.sd), y = rnorm(n, treated.mean, treated.sd))$p.value
+  })
+  power <- rbind(power, mean(pvals < p.theshold) )
+}
+
+plot(n_range, power, type=""l"", ylim=c(0, 1))
+
+ +

+ +

I would read this graph as: ""Given my assumptions about the two groups, the probability that I would find a significant effect at n = 30 is roughly 50%."" Often an 80% chance of finding an actual effect is considered a high level of power. By the way, power analysis is generally considered a good thing. :)
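
+ +

For comparison, base R's power.t.test gives a closed-form approximation if you are willing to assume a common (here, pooled) standard deviation for the two groups:

+ +
power.t.test(delta = 0.1, sd = sqrt((0.1^2 + 0.22^2) / 2), power = 0.8)  # solves for n per group
+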

+",2013-10-17 08:27:11.787 +57743,8926.0,2,,57680.0,,,,CC BY-SA 4.0,"

Andrew Ng gives a nice rule-of-thumb explanation in this video starting at 14:46, though the whole video is worth watching.

+ +

Key Points

+ +
  • Use a linear kernel when the number of features is larger than the number of observations.
  • Use a Gaussian kernel when the number of observations is larger than the number of features.
  • If the number of observations is larger than 50,000, speed could be an issue when using a Gaussian kernel; hence, one might want to use a linear kernel.
+",2013-10-18 02:01:22.970 +57694,20062.0,2,,57636.0,,,,CC BY-SA 3.0,"

When you have multiple variables and you are looking for the variable(s) that best discriminate between groups (""yes"" and ""no"" samples in this case), one tool for this is MANOVA.

+ +
# Suppose we have a data.frame with 7 variables and one group:
+my.data<-data.frame(v1=rnorm(100),v2=rnorm(100),v3=rnorm(100),
+v4=rnorm(100),v5=rnorm(100),v6=rnorm(100), v7=c(rnorm(50),
+rnorm(50)+20),response=rep(c(""yes"",""no""), each=50))
+
+# run MANOVA
+my.mnv<-manova(cbind(v1,v2,v3,v4,v5,v6,v7) ~ response, data=my.data)
+
+# and look on p-values (if p-value < 0.05 then it is able to 
+# significantly discriminate between ""yes"" and ""no"")
+summary.aov(my.mnv)
+
+# plot
+pairs(my.data[c(""v1"",""v2"",""v3"",""v4"",""v5"",""v6"",""v7"")], pch=22,
+bg=c(""red"", ""yellow"")[unclass(my.data$response)])
+
+ +
+ +

It's not good to draw conclusions about statistical significance based only on looking at the plot (although it is still necessary to look at it). In your case of 107 variables, the pairs() plot will be very chaotic.

+",2013-10-17 08:31:52.307 +57695,,1,,,Matteo,How to estimate errors on a sample with very few data points,,CC BY-SA 3.0,"

I have a very simple question to ask, but I can't figure it out on my own. +I have two samples: sample A with only three data points, and sample B with hundreds of points. For each sample I measure the median of a certain quantity. +Now my question is: what is the error associated with the median? +I can consider the quartiles, but what if sample A consisted of only a single point? That would virtually assign no error to the median value. I expect my measurement to be more accurate for sample B.

+",2013-10-17 08:41:56.497 +57696,22787.0,1,57709.0,,,Interpreting 5-way Mixed Model ANOVA,,CC BY-SA 3.0,"

I'm running a 2x2x2x2x2 mixed model ANOVA (in SPSS v21) for my study and found two 4-way interactions, one 5-way interaction and a couple of 3-way interactions. Whilst I understand how to interpret a 3-way ANOVA, I'm having quite a hard time trying to interpret the 4/5-way interactions.

+ +

My variables are:

+ +
  • Within-subject factors: lineup sex (female, male), lineup ethnicity (Asian, Caucasian)
  • Between-subject factors: lineup procedure (sequential, simultaneous), participant ethnicity (Asian, Caucasian), participant sex (female, male)
+ +

Sample size: 552

+ +

I've gotten to the stage where I have done separate ANOVAs (splitting the data) on each factor involved in a 5-way or 4-way interaction obtained from the initial analysis (e.g., AxB at C1, AxB at C2, and so on) and obtained a ton of output. Some of the output had only a significant main effect, while other output had no significant effects. But the graph that SPSS produced with the estimated marginal means clearly indicates an interaction.

+ +

E.g: +Lineup ethnicity x Participant sex at two levels of Lineup Sex.

+ +
+

Male Caucasians Lineup: Lineup ethnicity x Participant Sex

+ +
  • All main effects & the interaction are non-significant.
+ +

Female Caucasian Lineup: Lineup ethnicity x Participant Sex

+ +
  • All main effects & the interaction are non-significant.
+ +

But the graph indicates an interaction effect only for Female + Caucasians.

+
+ +

I've consulted with my supervisor about this, and due to the time constraints he has advised me to just compare and contrast the graphs of each ANOVA. That much I understand, but I am clueless as to what to do next. Yes, I can compare the graphs, but I am unsure how that would help explain the 4/5-way interactions.

+ +

Any help would be greatly appreciated!

+",2013-10-17 09:15:19.207 +57697,20470.0,2,,57692.0,,,,CC BY-SA 3.0,"

You can check linear association between the two columns using correlation. The advantage of correlation over covariance is that it is normalised and not dependent on the scales of the column values you are comparing. It takes a value between $-1$ and $1$.

+ +

If diff and abund_mean are numeric sequences, in R:

+ +
data <- cbind(matrix(diff, ncol=1),matrix(abund_mean, ncol=1)) 
+cor(data, use=""complete.obs"")
+
+ +

produces:

+ +
> cor(data, use=""complete.obs"")
+               [,diff]   [,abund_mean]
+[diff,]        1.0000000 -0.2813283
+[abund_mean,] -0.2813283  1.0000000
+
+ +

This is the correlation matrix. As you can see on the diagonals, the correlation of a column-variable with itself is maximum at $1$. The correlation between diff and abund_mean is negative at $-0.2813283$.

+",2013-10-17 09:20:19.100 +57698,22788.0,1,,,,Single sample versus multiple sample,,CC BY-SA 3.0,"

I have a jar with white and black balls. The total number of balls in the jar is 100000. I want to estimate the proportion of white balls. My constraint is that the sample size for estimation should be small, let's assume 500 balls. I am debating between two approaches.

+ +
  1. Draw a single sample of 500 balls; $\hat{p}$ = number of white balls divided by 500.
  2. Draw 10 samples of 50 balls each. Calculate the proportion of white balls in each sample, i.e., $[r_1, r_2, ..., r_{10}]$. Estimated $\hat{p}$ = average of $[r_1, r_2, ..., r_{10}]$.
+ +

Which method should I use so that I am less susceptible to sampling error?

+",2013-10-17 09:53:49.753 +57699,22790.0,1,,,,Propensity scores and patient comparability,,CC BY-SA 3.0,"

I have two groups of patients who underwent a surgery using method A or method B. The first group are patients who were operated on in the 1980s and 1990s, only with method A. The second group are patients operated on recently, mostly with method B but also in some cases with A. In addition to that, I have various variables about the patients (gender, age, medical indicators, etc.) that capture the pre-operation medical history and the types of symptoms each patient developed.

+ +

The goal of the study is to compare the ""effectiveness"" of methods A and B in terms of patients' survival times after the operation.

+ +

Somewhat different patients (in terms of age, gender, etc.) are operated on with A and B. For example, quite a few more older people were operated on with B than with A. I want to use propensity score matching to balance the data.

+ +

My question is:

+ +

Does it make sense to estimate propensity scores (method ~ age + gender + ...) and use them for creating a matched dataset for further analysis (e.g. Cox regression)?

+ +

In particular, is it a problem that for patients in the first group method B was not yet available, so none of them could potentially have received the alternative treatment?
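
+ +

A rough sketch of the matching-then-Cox workflow being described, with the MatchIt and survival packages as one possible toolset; the data-frame and variable names are hypothetical:

+ +
d$treatB <- as.integer(d$method == ""B"")
+m <- MatchIt::matchit(treatB ~ age + gender, data = d, method = ""nearest"")
+matched <- MatchIt::match.data(m)
+survival::coxph(survival::Surv(time, status) ~ method, data = matched)
+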

+",2013-10-17 10:18:03.660 +57700,22793.0,1,,,,How to transform this dataset to make classes linearly separable?,,CC BY-SA 3.0,"

I have this data set:

+ +

+ +

And I want to transform the data (with a RBF kernel?) in order to be able to do a simple linear ridge-classifier.

+ +

I know I can do more or less the same thing using a kernel SVM with a RBF kernel but I have to use the ridge classifier.

+ +

Does anyone know how (or whether) it can be done? This is kind of homework-ish, so I don't want a full solution, just some input.

+",2013-10-17 10:51:21.157 +57744,1150.0,2,,57644.0,,,,CC BY-SA 3.0,"

Although I don't know anything about circuit design, I'm as confused as everyone else about why you would expect results to cluster around the average. The CLT has to do not with the shape (as many have correctly pointed out) but with the probability of each data point landing in a random subset. If the underlying process had a bi-modal distribution, most of the samples would have skinnier peaks.

+ +

If your random seeds determine the efficiency of their placement and there is some sort of upper and lower bound on performance then your numbers simply reflect the distribution of such locations.

+",2013-10-18 02:05:58.343 +57745,2081.0,2,,57710.0,,,,CC BY-SA 4.0,"

Nominal vs Interval

+ +

The most classic ""correlation"" measure between a nominal and an interval (""numeric"") variable is Eta, also called correlation ratio, and equal to the root R-square of the one-way ANOVA (with p-value = that of the ANOVA). Eta can be seen as a symmetric association measure, like correlation, because Eta of ANOVA (with the nominal as independent, numeric as dependent) is equal to Pillai's trace of multivariate regression (with the numeric as independent, set of dummy variables corresponding to the nominal as dependent).

+ +

A more subtle measure is the intraclass correlation coefficient (ICC). Whereas Eta grasps only the difference between groups (defined by the nominal variable) in respect to the numeric variable, ICC simultaneously also measures the coordination or agreement between numeric values inside groups; in other words, ICC (particularly the original unbiased ""pairing"" ICC version) stays on the level of values while Eta operates on the level of statistics (group means vs group variances).
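
+ +

As a computational note, Eta can be read off an ordinary one-way ANOVA fit; a minimal R sketch, with g nominal and y numeric (names hypothetical):

+ +
fit <- lm(y ~ g, data = d)            # one-way ANOVA of the numeric y on the nominal g
+eta <- sqrt(summary(fit)$r.squared)   # the correlation ratio
+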

+ +

Nominal vs Ordinal

+ +

The question about ""correlation"" measure between a nominal and an ordinal variable is less apparent. The reason of the difficulty is that ordinal scale is, by its nature, more ""mystic"" or ""twisted"" than interval or nominal scales. No wonder that statistical analyses specially for ordinal data are relatively poorly formulated so far.

+ +

One way might be to convert your ordinal data into ranks and then compute Eta as if the ranks were interval data. The p-value of such Eta = that of Kruskal-Wallis analysis. This approach seems warranted due to the same reasoning as why Spearman rho is used to correlate two ordinal variables. That logic is ""when you don't know the interval widths on the scale, cut the Gordian knot by linearizing any possible monotonicity: go rank the data"".

+ +

Another approach (possibly more rigorous and flexible) would be to use ordinal logistic regression with the ordinal variable as the DV and the nominal one as the IV. The square root of Nagelkerke’s pseudo R-square (with the regression's p-value) is another correlation measure for you. Note that you can experiment with various link functions in ordinal regression. This association is, however, not symmetric: the nominal is assumed independent.

+ +

Yet another approach might be to find such a monotonic transformation of ordinal data into interval - instead of ranking of the penultimate paragraph - that would maximize R (i.e. Eta) for you. This is categorical regression (= linear regression with optimal scaling).

+ +

Still another approach is to perform classification tree, such as CHAID, with the ordinal variable as predictor. This procedure will bin together (hence it is the approach opposite to the previous one) adjacent ordered categories which do not distinguish among categories of the nominal predictand. Then you could rely on Chi-square-based association measures (such as Cramer's V) as if you correlate nominal vs nominal variables.

+ +

And @Michael in his comment suggests yet one more way - a special coefficient called Freeman's Theta.

+ +

So, we have arrived so far at these opportunities: (1) Rank, then compute Eta; (2) Use ordinal regression; (3) Use categorical regression (""optimally"" transforming ordinal variable into interval); (4) Use classification tree (""optimally"" reducing the number of ordered categories); (5) Use Freeman's Theta.

+",2013-10-18 02:28:34.133 +57701,8386.0,2,,57698.0,,,,CC BY-SA 3.0,"

If there is no replacement of balls after drawing, then approaches 1 and 2 are equivalent. With approach 2, you can find the average of the proportions of white balls in each of the 10 samples, or find the total number of white balls in the 10 samples combined and express this as a proportion of 500. Both calculations will give the same result (given that, as is the case here, the samples are all of the same size).

+ +

If however for approach 2 the balls are replaced after each sample of 50, then the sampling error will be slightly higher than with approach 1. One way to see this is to consider the extreme case in which rather than 100000 there are only 500 balls in the jar. In that case approach 1, which would then be a 100% sample, would be guaranteed to estimate the true proportion correctly. But approach 2 would still be subject to sampling error because each sample of 50 would be only a 10% sample. With a much larger number of balls this effect is still present, albeit greatly diminished.
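
+ +

A quick simulation of that extreme-case intuition, shrinking the jar so the difference is easy to see (jar size and proportion of white balls are arbitrary):

+ +
set.seed(1)
+jar <- rep(c(1, 0), c(300, 700))                 # a 1000-ball jar with 30% white balls
+one_big   <- replicate(5000, mean(sample(jar, 500)))
+ten_small <- replicate(5000, mean(replicate(10, mean(sample(jar, 50)))))   # jar refilled between sub-samples
+sd(one_big); sd(ten_small)   # the second scheme has the noticeably larger sampling error here
+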

+",2013-10-17 11:07:46.470 +57702,20426.0,1,,,,Simple regression assumptions (homoscedasticity),,CC BY-SA 3.0,"

There is a simple regression model table I was looking at in a textbook, with IQ values grouped into 5 intervals and an N associated with each group. There was also information given about the residuals for each group (mean and variance of the residuals). +E.g., for the < 75 IQ group, N = 23, mean of residuals = -0.407 and variance = 71.288.

+ +

The conclusion from the table was merely stated as ""assumptions for regression have been met"". I am unable to figure out what method was used to suggest whether the homogeneity of variance (homoscedasticity) assumption is reasonable for the model, based on the information in the table. I'd like to know how the book arrived at its conclusion; are there plots of residuals' means/variances that indicate violations clearly? Is it like an ANOVA, where visually one can make the simplistic judgment that if the ratio of variances exceeds a certain number, the assumption has been violated? Given a table like that, how does one proceed to test the assumptions of a regression model? Thanks!

+",2013-10-17 11:30:20.973 +57703,16474.0,2,,57702.0,,,,CC BY-SA 3.0,"

Based on your description I would guess that the authors just looked at the variances of the residuals and concluded that they were similar enough. They have given the variances, so you can make up your own mind if you agree with them.

+",2013-10-17 11:39:58.580 +57704,12358.0,2,,57700.0,,,,CC BY-SA 3.0,"

Have you tried putting the data into $r,\theta$? You could pick the origin as the mean value of all of the data. You'd end up with two slightly overlapping ellipsoidal blobs.

+",2013-10-17 11:57:10.503 +57705,19395.0,1,,,,Dependent is the difference between two Likert scales: Which regression to use?,,CC BY-SA 3.0,"

I asked participants in an experiment the same question once before and once after the experiment, to see the effect of the experiment on the answer. The answers were given on a 5-point scale. To me the change, i.e., the difference between the before and after answers to the question, is what is interesting. So I want to see which of several factors in the experiment caused the difference, and accordingly I want to run a regression on the DIFFERENCE in answers.

+ +

The difference is always an integer, ranging from -4 to 4. Since it's not continuous, I'd like to use something like multinomial logit or ordered logit, but I cannot decide which one would be exactly right. Is there a 'right' regression to use?

+ +

(Relatedly, I have the problem that participants always took part in the experiment in groups of 5. But I guess that can be handled with fixed effects or clustering.)

+ +

edit (clarification): I have 200 observations from 40 experimental groups with 5 participants each. Explanatory variables in the above scenario would be changes in the experiment setup (the same within each group but different across groups), the relation of participants in a group (personal relation, class difference, etc.) and socio-economics of participants.

+",2013-10-17 11:58:57.590 +57706,3922.0,2,,57705.0,,,,CC BY-SA 3.0,"

I would run a bivariate ordinal model with the original responses, rather than their differences, constraining the demographic coefficients between the equations and letting the experimental condition coefficients be free in the second response. In Stata, this can be done with cmp. You won't get much with 5 clusters, and that would be the limit to the # of explanatory variables if you cluster by them. If you have multiple observations per person, so that you have a few dozen data points, you can still do the random effects with cmp though.

+",2013-10-17 12:09:52.073 +57707,21918.0,1,57754.0,,,Hessian matrix and initial guess in logistic regression,,CC BY-SA 3.0,"

The log-likelihood function for logistic regression is $$l(\theta) = \sum_{i=1}^m\left(y^{(i)}\log h(x^{(i)}) + (1-y^{(i)})\log(1 - h(x^{(i)}))\right),$$ where $$h(x^{(i)}) = \frac{1}{1 + e^{-\theta^Tx^{(i)}}}\,.$$

+ +

In order to obtain maximum likelihood estimation, I implemented fitting the logistic regression model using Newton's method. I encountered 2 problems:

+ +
  1. I try to fit the model to my data, but during the iterations, a singular Hessian matrix is encountered, what do I do with this kind of problem?

  2. With different initial guess $\theta$, will the model converge to different results?
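
For concreteness, a minimal R sketch of such a Newton iteration (made-up data; the design matrix X includes an intercept column, and the tiny ridge term is only one ad hoc way of keeping a near-singular Hessian invertible, not necessarily the right fix):

set.seed(1)
n <- 200
X <- cbind(1, rnorm(n))                        # design matrix with an intercept column
y <- rbinom(n, 1, plogis(X %*% c(-0.5, 2)))    # simulated outcomes

theta <- c(0, 0)                               # initial guess
for (it in 1:25) {
  p    <- plogis(X %*% theta)                  # h(x^(i)) for each observation
  grad <- t(X) %*% (y - p)                     # gradient of l(theta)
  H    <- -t(X) %*% (X * as.vector(p * (1 - p)))   # Hessian of l(theta)
  # subtracting a tiny multiple of the identity keeps H invertible if it is near-singular
  theta <- theta - solve(H - 1e-8 * diag(ncol(X)), grad)
}
drop(theta)
coef(glm(y ~ X - 1, family = binomial))        # should agree closely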
+",2013-10-17 12:30:26.707 +57708,2666.0,2,,57359.0,,,,CC BY-SA 3.0,"

I don't feel that centering is worth the trouble, and centering makes the interpretation of parameter estimates more complex. If you use modern matrix algebra software, algebraic collinearity is not a problem. Your original motivation of centering to be able to interpret main effects in the presence of interaction is not a strong one. Main effects when estimated at any automatically chosen value of a continuous interacting factor are somewhat arbitrary, and it's best to think of this as a simple estimation problem by comparing predicted values. In the R rms package contrast.rms function, for example, you can obtain any contrast of interest independent of variable codings. Here is an example of a categorical variable x1 with levels ""a"" ""b"" ""c"" and a continuous variable x2, fitted using a restricted cubic spline with 4 default knots. Different relationships between x2 and y are allowed for different x1. Two of the levels of x1 are compared at x2=10.

+ +
require(rms)
+dd <- datadist(x1, x2); options(datadist='dd')
+f <- ols(y ~ x1 * rcs(x2,4))
+contrast(f, list(x1='b', x2=10), list(x1='c', x2=10))
+# Now get all comparisons with c:
+contrast(f, list(x1=c('a','b'), x2=10), list(x1='c', x2=10))
+# add type ='joint' to get a 2 d.f. test, or conf.type='simultaneous'
+# to get simultaneous individual confidence intervals
+
+ +

With this approach you can also easily estimate contrasts at several values of the interacting factor(s), e.g.

+ +
contrast(f, list(x1='b', x2=10:20), list(x1='c', x2=10:20))
+
+",2013-10-17 12:39:51.070 +57746,22817.0,1,,,,Deriving the optimum value of a function,,CC BY-SA 3.0,"

I have a function $f(t) = \sum_{i=1}^{N} |y_i-t|$.

+ +

What is the optimal value of t that minimizes it, and how can I derive it?

+ +

Similarly what is the optimal value of t which minimizes $f(t) = \sum_{i=1}^{N} |y_i-t|^{\infty}$?

+",2013-10-18 02:40:52.280 +57709,22716.0,2,,57696.0,,,,CC BY-SA 3.0,"

Generally, you should start from the highest order interactions. You are probably aware that it is usually not sensible to interpret a main effect A when that effect is also involved in an interaction A:B. This is because the interaction tells you that the effect of A actually depends on the level of B, rendering any simple main effect interpretation of A impossible. +In the same way, if you have factors A, B, C, then A:B should not be interpreted if A:B:C is significant.

+ +

Thus, when you have a 5-way interaction, none of the lower-order interactions can be sensibly interpreted. Therefore, if I understand you correctly and you have interpreted your lower order interactions, you should probably not continue along those lines.

+ +

Rather, what you can do is to split up your data set and continue to analyze factor levels of your data set separately. Which of the factors you use to split up the dataset is arbitrary, but often it is very useful to split up the data for each variable and assess what you see. In your example, you might start with sex, and calculate an ANOVA for males, and another one for females (each ANOVA contains the 4 remaining factors). Just as well, you could split up the data according to ethnicity (one ANOVA for Asian, one for Caucasian). +You could also split up by one of the within-subject factors.
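
A minimal R sketch of that first split (the data frame dat, the response score, and the factor names other than sex and ethnicity are placeholders; any within-subject error structure is omitted for brevity):

# one follow-up ANOVA per sex, each containing the four remaining factors
fits <- lapply(split(dat, dat$sex), function(d)
  aov(score ~ ethnicity * f3 * f4 * f5, data = d))
lapply(fits, summary)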

+ +

I will assume that you have decided to split the data by sex (just to continue with the example here). +Then, assume that for males, you get a 4-way interaction. You would then go on to split up the male data by one of the remaining variables (say, ethnicity). You would then calculate ANOVAs for male Asians (over the remaining 3 factors), and for male Caucasians.

+ +

Importantly, if you get only a lower-order interaction, then you are only ""allowed"" to analyze these further. This is because the other factors did not show significant differences. Thus, if your males ANOVA gives you only a 2-way interaction, then you would average over the other factors and calculate only an ANOVA over the 2 interacting factors (and, because we are in the male part of the ANOVAs, this would be for the males alone).

+ +

For the females, everything may look different, and so the decision which follow-up ANOVAs to calculate is separate for this group. So, what you did for males should be done for females in the same way ONLY if you got the same interactions.

+ +

Thus, you will potentially have a lot of ANOVAs, and it might not be easy to decide which ones to report. You should report 1 complete line down from the highest interaction to the last effects (possibly t-tests to compare only 1 of your factors at the end). You should not usually report several lines (e.g., one starting the split-up by sex, then another one starting by ethnicity). However, you must report a complete line, and cannot simply choose to report only some of the ANOVAs of that line. So, you report one complete analysis, not more, not less. Which way to go in terms of splitting up / follow-up ANOVA is a subjective decision (unless you have clear hypotheses you can follow), and might depend on which results can be understood best etc.

+",2013-10-17 12:42:10.667 +57710,22795.0,1,57745.0,,,Correlation coefficient between a (non-dichotomous) nominal variable and a numeric (interval) or an ordinal variable,,CC BY-SA 4.0,"

I've already read all the pages on this site trying to find the answer to my problem, but none seems to be the right one for me...

+ +

First, let me explain the kind of data I'm working with...

+ +

Let's say that I have a vector with city names, one for each of 300 users. I also have another vector with each user's score in response to a survey, or a continuous value for each user.

+ +

I would like to know if there exists a correlation coefficient that computes the correlation between these two variables, i.e. between a nominal variable and a numeric/continuous or ordinal variable.

+ +

I've searched on the Internet, and some pages suggest using the contingency coefficient, Cramer's V, the Lambda coefficient, or Eta. For each of these measures they just say that it can be applied to data in which we have a nominal variable and an interval or numerical variable. The thing is that, after much searching and trying to understand each of them, it sometimes seems (either from what is written or from the examples) that they are only reasonable to use with a dichotomous nominal variable, except for Cramer's V; other times no requirement on the type of data is stated. A lot of other pages say it is fine to apply regression instead, which is right, but I would simply like to know if there is a coefficient like Pearson's/Spearman's for this kind of data.

+ +

I also think it is not quite proper to use the Spearman correlation coefficient, since the cities cannot be ordered.

+ +

I have also implemented Cramer's V and Eta myself (I'm working with Matlab), but for Eta nothing is said about a p-value to check whether the coefficient is statistically significant...

+ +

On the MathWorks site there is also a nice toolbox that claims to compute eta^2, but it is not clear what kind of input it needs.

+ +

Has anyone here done a test like mine? If you need more detail to understand the kind of data I'm using, just ask and I'll try to explain better.

+",2013-10-17 13:05:10.583 +57711,22796.0,1,,,,How to apply Bonferroni correction when including an interaction term?,,CC BY-SA 3.0,"

Suppose we have two variables $x_1$ and $x_2$ and an interaction term $x_1 \cdot x_2$. Suppose we set the family-wise error rate to $\alpha = 0.05$. For the Bonferroni correction, would we look at $\alpha/2$ or $\alpha/3$?

+",2013-10-17 14:17:45.533 +57712,22798.0,1,,,,How can I explain these linear regression charts well on my scientific poster?,,CC BY-SA 3.0,"

I know that these laboratory analysis reports each show a linear regression relationship: two have a positive slope and one has a negative slope. I am taking my first statistics class and want to be able to explain these very well on a research poster I am doing for work. Am I missing any important information? The negative slope indicates that as one variable increases the other decreases; the positive slopes mean the variables increase together. The ADL concentration chart does not show a very strong relationship, but there is still a relationship.

+ +

+

+ +

+",2013-10-17 14:57:33.620 +57713,668.0,2,,40104.0,,,,CC BY-SA 3.0,"

Provided not a whole lot of probability is concentrated on any single value in this linear combination, it looks like a Cornish-Fisher expansion may provide good approximations to the (inverse) CDF.

+ +

Recall that this expansion adjusts the inverse CDF of the standard Normal distribution using the first few cumulants of $S_2$. Its skewness $\beta_1$ is

+ +

$$\frac{a_1^3 \lambda_1 + a_2^3 \lambda_2}{\left(\sqrt{a_1^2 \lambda_1 + a_2^2 \lambda_2}\right)^3}$$

+ +

and its kurtosis $\beta_2$ is

+ +

$$\frac{a_1^4 \lambda_1 + 3a_1^4 \lambda_1^2 + a_2^4 \lambda_2 + 6 a_1^2 a_2^2 \lambda_1 \lambda_2 + 3 a_2^4 \lambda_2^2}{\left(a_1^2 \lambda_1 + a_2^2 \lambda_2\right)^2}.$$

+ +

To find the $\alpha$ percentile of the standardized version of $S_2$, compute

+ +

$$w_\alpha = z +\frac{1}{6} \beta _1 \left(z^2-1\right) +\frac{1}{24} \left(\beta _2-3\right) \left(z^2-3\right) z-\frac{1}{36} \beta _1^2 \left(2 z^3-5 z\right)-\frac{1}{24} \left(\beta _2-3\right) \beta _1 \left(z^4-5 z^2+2\right)$$

+ +

where $z$ is the $\alpha$ percentile of the standard Normal distribution. The percentile of $S_2$ thereby is

+ +

$$a_1 \lambda_1 + a_2 \lambda_2 + w_\alpha \sqrt{a_1^2 \lambda_1 + a_2^2 \lambda_2}.$$

+ +

Numerical experiments suggest this is a good approximation once both $\lambda_1$ and $\lambda_2$ exceed $5$ or so. For example, consider the case $\lambda_1 = 5,$ $\lambda_2=5\pi/2,$ $a_1=\pi,$ and $a_2=-2$ (arranged to give a zero mean for convenience):

+ +

+ +

The blue shaded portion is the numerically computed CDF of $S_2$ while the solid red underneath is the Cornish-Fisher approximation. The approximation is essentially a smooth of the actual distribution, showing only small systematic departures.
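
For those who want to experiment, a direct R transcription of the preceding formulas (the kurtosis numerator above simplifies to $a_1^4\lambda_1 + a_2^4\lambda_2 + 3\left(a_1^2\lambda_1 + a_2^2\lambda_2\right)^2$, which is what the code uses):

cf_quantile <- function(alpha, a1, a2, l1, l2) {
  v  <- a1^2 * l1 + a2^2 * l2                    # variance of S_2
  b1 <- (a1^3 * l1 + a2^3 * l2) / v^(3/2)        # skewness
  b2 <- (a1^4 * l1 + a2^4 * l2 + 3 * v^2) / v^2  # kurtosis
  z  <- qnorm(alpha)
  w  <- z + b1 * (z^2 - 1) / 6 + (b2 - 3) * (z^3 - 3 * z) / 24 -
        b1^2 * (2 * z^3 - 5 * z) / 36 - (b2 - 3) * b1 * (z^4 - 5 * z^2 + 2) / 24
  a1 * l1 + a2 * l2 + w * sqrt(v)                # percentile of S_2
}
cf_quantile(c(0.05, 0.5, 0.95), a1 = pi, a2 = -2, l1 = 5, l2 = 5 * pi / 2)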

+",2013-10-17 15:02:12.897 +57714,5448.0,2,,51047.0,,,,CC BY-SA 3.0,"

Here you go - three examples. I've made the code much less efficient than it would be in a real application in order to make the logic clearer (I hope.)

+ +
# We'll assume estimation of a Poisson mean as a function of x
+x <- runif(100)
+y <- rpois(100,5*x)  # beta = 5 where mean(y[i]) = beta*x[i]
+
+# Prior distribution on log(beta): t(5) with mean 2 
+# (Very spread out on original scale; median = 7.4, roughly)
+log_prior <- function(log_beta) dt(log_beta-2, 5, log=TRUE)
+
+# Log likelihood
+log_lik <- function(log_beta, y, x) sum(dpois(y, exp(log_beta)*x, log=TRUE))
+
+# Random Walk Metropolis-Hastings 
+# Proposal is centered at the current value of the parameter
+
+rw_proposal <- function(current) rnorm(1, current, 0.25)
+rw_p_proposal_given_current <- function(proposal, current) dnorm(proposal, current, 0.25, log=TRUE)
+rw_p_current_given_proposal <- function(current, proposal) dnorm(current, proposal, 0.25, log=TRUE)
+
+rw_alpha <- function(proposal, current) {
+   # Due to the structure of the rw proposal distribution, the rw_p_proposal_given_current and
+   # rw_p_current_given_proposal terms cancel out, so we don't need to include them - although
+   # logically they are still there:  p(prop|curr) = p(curr|prop) for all curr, prop
+   exp(log_lik(proposal, y, x) + log_prior(proposal) - log_lik(current, y, x) - log_prior(current))
+}
+
+# Independent Metropolis-Hastings
+# Note: the proposal is independent of the current value (hence the name), but I maintain the
+# parameterization of the functions anyway.  The proposal is not ignorable any more
+# when calculating the acceptance probability, as p(curr|prop) != p(prop|curr) in general.
+
+ind_proposal <- function(current) rnorm(1, 2, 1) 
+ind_p_proposal_given_current <- function(proposal, current) dnorm(proposal, 2, 1, log=TRUE)
+ind_p_current_given_proposal <- function(current, proposal) dnorm(current, 2, 1, log=TRUE)
+
+ind_alpha <- function(proposal, current) {
+   exp(log_lik(proposal, y, x)  + log_prior(proposal) + ind_p_current_given_proposal(current, proposal) 
+       - log_lik(current, y, x) - log_prior(current) - ind_p_proposal_given_current(proposal, current))
+}
+
+# Vanilla Metropolis-Hastings - the independence sampler would do here, but I'll add something
+# else for the proposal distribution; a Normal(current, 0.1+abs(current)/5) - symmetric but with a different
+# scale depending upon location, so can't ignore the proposal distribution when calculating alpha as
+# p(prop|curr) != p(curr|prop) in general
+
+van_proposal <- function(current) rnorm(1, current, 0.1+abs(current)/5)
+van_p_proposal_given_current <- function(proposal, current) dnorm(proposal, current, 0.1+abs(current)/5, log=TRUE)
+van_p_current_given_proposal <- function(current, proposal) dnorm(current, proposal, 0.1+abs(proposal)/5, log=TRUE)
+
+van_alpha <- function(proposal, current) {
+   exp(log_lik(proposal, y, x)  + log_prior(proposal) + van_p_current_given_proposal(current, proposal) 
+       - log_lik(current, y, x) - log_prior(current) - van_p_proposal_given_current(proposal, current))
+}
+
+
+# Generate the chain
+values <- rep(0, 10000) 
+u <- runif(length(values))
+naccept <- 0
+current <- 1  # Initial value
+propfunc <- van_proposal  # Substitute ind_proposal or rw_proposal here
+alphafunc <- van_alpha    # Substitute ind_alpha or rw_alpha here
+for (i in 1:length(values)) {
+   proposal <- propfunc(current)
+   alpha <- alphafunc(proposal, current)
+   if (u[i] < alpha) {
+      values[i] <- exp(proposal)
+      current <- proposal
+      naccept <- naccept + 1
+   } else {
+      values[i] <- exp(current)
+   }
+}
+naccept / length(values)
+summary(values)
+
+ +

For the vanilla sampler, we get:

+ +
> naccept / length(values)
+[1] 0.1737
+> summary(values)
+   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+  2.843   5.153   5.388   5.378   5.594   6.628 
+
+ +

which is a low acceptance probability, but still... tuning the proposal would help here, or adopting a different one. Here's the random walk proposal results:

+ +
> naccept / length(values)
+[1] 0.2902
+> summary(values)
+   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+  2.718   5.147   5.369   5.370   5.584   6.781 
+
+ +

Similar results, as one would hope, and a better acceptance probability (aiming for ~50% with one parameter.)

+ +

And, for completeness, the independence sampler:

+ +
> naccept / length(values)
+[1] 0.0684
+> summary(values)
+   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+  3.990   5.162   5.391   5.380   5.577   8.802 
+
+ +

Because it doesn't ""adapt"" to the shape of the posterior, it tends to have the poorest acceptance probability and is hardest to tune well for this problem.

+ +

Note that generally speaking we'd prefer proposals with fatter tails, but that's a whole other topic.

+",2013-10-17 15:06:27.020 +57722,21182.0,2,,57644.0,,,,CC BY-SA 3.0,"

One important point that many seem to be confused about is the application of the Central Limit Theorem (CLT). The CLT applies to the arithmetic mean of a sample, not to the distribution itself. Given an increasing number of samples, the average of those samples tends to be normally distributed, with the ""mean mean"" equal to the overall mean and with the variance of that mean estimator proportional to the population variance and inversely proportional to the number of samples. The actual distribution itself is not going to be normal. If anything, the distribution of the samples (NOT their average) will flesh out the shape of the distribution, and you will have a greater chance of seeing an extreme value the more samples are generated.
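
A quick R illustration of the distinction, using an exponential distribution purely as an arbitrary skewed example:

set.seed(1)
x    <- rexp(10000)                                # raw samples: stays skewed
xbar <- replicate(10000, mean(rexp(30)))           # means of samples of size 30
par(mfrow = c(1, 2))
hist(x,    main = 'raw samples')                   # exponential shape, long right tail
hist(xbar, main = 'means of n = 30 samples')       # roughly normal, centred near 1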

+ +

My hunch is that your routing score is probably an average of some values, and, as such, is the mean of some distribution. If so, as the mean of a distribution, under many conditions its own distribution will tend to the normal after enough samples are generated.

+",2013-10-17 17:48:45.993 +57723,8414.0,2,,57684.0,,,,CC BY-SA 3.0,"

From a mathematical standpoint, there's nothing wrong with doing a Sobel test with survey data (by the way, and slightly off-topic -- you should consider using a bootstrapping method to test your indirect effects instead of a Sobel test; bootstrapping methods are uniformly more powerful than Sobel tests). The real question is what conclusions you would be able to draw from your Sobel test.
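
(For what it's worth, a bare-bones percentile bootstrap of an indirect effect $a \times b$ might look like the following R sketch; the data frame dat and the variables x, m and y are placeholders for your own survey variables.)

set.seed(1)
ab <- replicate(5000, {
  d <- dat[sample(nrow(dat), replace = TRUE), ]    # resample rows with replacement
  a <- coef(lm(m ~ x, data = d))['x']              # path x -> m
  b <- coef(lm(y ~ x + m, data = d))['m']          # path m -> y, controlling for x
  a * b
})
quantile(ab, c(0.025, 0.975))                      # percentile CI for the indirect effect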

+ +

To get a clear sense of the problem, consider a simple study in which the researcher measures people's scores on a self-report measure of trait empathy and the amount of money these people donated to charitable causes within the last year. Assuming that the researchers observed a relationship between trait empathy and donations, few people would make the mistake of concluding that trait empathy causes donations (i.e., empathy -> donations), since the people in the study were not randomized to their values of trait empathy. Thus, it is possible that people who thought about their levels of donations reported higher levels of trait empathy (i.e., donations -> empathy) or that a third variable caused the observed values of both donations and empathy.

+ +

Let's now consider a study in which the researchers measured trait empathy, charitable donations, and positive emotions. The researcher wishes to show that the experience of positive emotions mediates the link between trait empathy and charitable donations (i.e., that empathy -> positive emotions -> donations). In order to convincingly establish mediation, we must show both that empathy -> positive emotions and that empathy -> donations. However, because people were not randomized to their values of trait empathy, we cannot conclude that empathy caused either positive emotions or donations.

+ +

However, even if we had randomized people to their empathy scores, we would still not necessarily be able to conclude that positive emotions were a mediator for the empathy -> donations effect because, after people's assignment to their empathy scores, people were not randomized to their values of positive emotions. Thus, even if we established a non-zero indirect effect, it is possible that, for example, an unobserved candidate mediator causes both positive emotions and donations, and it is this unobserved candidate mediator that creates the observed empathy -> positive emotions -> donations indirect effect (for more information about this problem, see some of the references added below).

+ +

In short, there is nothing wrong with doing a Sobel test or any other test of mediation with survey data. However, just as when you examine simple bivariate relationships with survey data, such a test probably will not reveal much about causal mechanisms because the assumptions required to draw these conclusions are implausible at best.

+ +

I recommend reading some of the references below for more information about assumptions in mediation models.

+ +

Jo, B. (2008). Causal inference in randomized experiments with mediational processes. Psychological Methods, 13, 314–336.

+ +

Imai, K., Keele, L., & Yamamoto, T. (2010). Identification, inference and sensitivity +analysis for causal mediation effects. Statistical Science, 25, 51-71.

+ +

Imai, K., Keele, L., Tingley, D., & Yamamoto T. (2011). Unpacking the black box of causality: Learning about causal mechanisms from experimental and observational studies. American Political Science Review, 105, 765-789.

+",2013-10-17 18:12:15.970 +57715,13846.0,1,57753.0,,,"Logistic regression with categorical predictors, do log-odds differ from 0?",,CC BY-SA 3.0,"

I have a 3 by 2 design, with a total of 6 conditions. The outcome is binary (0 or 1). Below is a sample dataset generated in R:

+ +
set.seed(2)
+mockdata<-data.frame(outcome=sample(1:0, 48, prob=c(0.5, 0.5), replace=TRUE),
+                     f1=rep(letters[1:2], each=24), 
+                     f2=rep(letters[1:3], each=8))
+
+head(mockdata)
+#  outcome f1 f2
+#1       0  a  a
+#2       1  a  a
+#3       1  a  a
+#4       0  a  a
+#5       1  a  a
+#6       1  a  a
+
+ +

One of the things I would like to look at is whether the log-odds of the outcome for each of the 6 conditions is significantly different from 0. I can create a new condition variable as follows:

+ +
mockdata$f12 <- paste(mockdata$f1, mockdata$f2, sep=""."")
+
+ +

then, I can do logistic regression using the newly created variable (see below for output). The intercept below tells me that for the condition that is treated as the baseline condition, the log-odds is not significantly different from 0.

+ +

My questions are:

+ +

(1). To check the other conditions, should I simply change the baseline condition and, after testing all 6 conditions, adjust the p-values accordingly?

+ +

(2). Are there better ways of testing what I want to test?

+ +
summary(glm(outcome ~f12, family=""binomial"", data=mockdata))
+
+Call:
+glm(formula = outcome ~ f12, family = ""binomial"", data = mockdata)
+
+Deviance Residuals: 
+    Min       1Q   Median       3Q      Max  
+-1.6651  -1.1774  -0.5168   1.0215   2.0393  
+
+Coefficients:
+            Estimate Std. Error z value Pr(>|z|)  
+(Intercept)  -0.1412     0.3281  -0.430   0.6669  
+f121          0.6520     0.6806   0.958   0.3380  
+f122          0.1412     0.6641   0.213   0.8316  
+f123         -0.3696     0.6806  -0.543   0.5871  
+f124         -1.8047     0.9325  -1.935   0.0530 .
+f125          1.2398     0.7430   1.669   0.0952 .
+---
+
+ +

EDIT

+ +

I would also like to check how the log-odds differ amongst the different conditions. For that, I was gonna just run a regular logistic regression with the main effects of f1 and f2, and the interaction of the two, and conduct additional multiple comparisons dependent on the kind of result I get from the omnibus test.

+",2013-10-17 15:15:49.887 +57716,14799.0,2,,57710.0,,,,CC BY-SA 3.0,"

Do a one-way anova on the response, with city as the grouping variable. The $F$ and $p$ it gives should be the same as the $F$ and $p$ from the regression of the response on the dummy-coded cities, and $SS_{between\, cities}/SS_{total}$ should equal the multiple $R^2$ from the regression. The multiple $R$ is the correlation of city with the response.
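
A small R sketch of that equivalence (assuming a data frame d with a factor city and a numeric response score):

fit_aov <- aov(score ~ city, data = d)
fit_lm  <- lm(score ~ city, data = d)      # regression on the dummy-coded cities

summary(fit_aov)                           # F and p for city
summary(fit_lm)$r.squared                  # equals SS_between_cities / SS_total
sqrt(summary(fit_lm)$r.squared)            # the multiple R, i.e. the correlation of city with the response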

+",2013-10-17 15:51:14.580 +57717,22800.0,1,,,,Analyzing reflected and transformed variables II,,CC BY-SA 3.0,"

I've searched past posts and have not seen an answer to this specifically, perhaps because it is assumed to be known so it was not stated in previous related posts.

+ +

I have a very simple single variable model with a negatively skewed dependent variable Y.

+ +

The residuals from Y = b0 + b1X are non-normal, which, by my understanding, will violate regression assumptions and affect my inferences, but should not affect the unbiasedness or consistency of my estimate 'b1' (correct me where I am wrong).

+ +

To make correct inferences it seems the practice is to reflect and log the dependent variable as such Y* = ln(1 + max(Y) -Y) and run the following regression:

+ +

Y* = b0 + b1X.

+ +

If I am correct so far then my primary question is do we interpret b from the model above the same as a traditional log lin model:

+ +

i.e. if ln(Y) = b0 + b1x

+ +

Then b1 is interpreted as (approximately) a b1*100% change in Y due to a 1-unit change in x, or exp(b1) as the multiplicative change in the 'geometric' mean of Y due to a 1-unit change in x.

+ +

If this is correct, does the same interpretation apply in the reflected and transformed case?

+ +

Thank you.

+",2013-10-17 16:11:49.973 +57718,13549.0,1,57725.0,,,Modeling what should be a logistic regression but has no negative responses,,CC BY-SA 3.0,"

I have a data set of reported food-borne illnesses and we're trying to determine what environmental conditions during food cultivation led to high bacterial counts in the food, and thus caused the illnesses. Unfortunately, I only have data of foods that caused confirmed illnesses. I requested that we go back and ""randomly"" sample from food tags that did not cause a reported illness but am not allowed to do so for various reasons. Even that would have had problems (because just because an illness is not reported doesn't mean it didn't occur), but at least this would have given me some negative observations.

+ +

I was originally planning to model these data using a logistic regression but I am stuck at what to do now. Without negative observations, I can only really provide univariate descriptive statistics, right? I'm hoping that someone else has had this problem and perhaps there's some model I haven't heard of before that can handle this. Thank you.

+",2013-10-17 16:18:09.787 +57719,22756.0,2,,57616.0,,,,CC BY-SA 3.0,"

That would be the probability of obtaining a false negative in 5 slides:

+ +

(0.80)^5 = 0.32768

+ +

Ahhh, so in order to decrease the probability of false negatives below 1% you can do:

+ +
> x <- matrix(c(0), nrow=25)
+> for(i in 1:25) x[i] = (0.8)^i
+> x
+             [,1]
+ [1,] 0.800000000
+ [2,] 0.640000000
+ [3,] 0.512000000
+ [4,] 0.409600000
+ [5,] 0.327680000
+ [6,] 0.262144000
+ [7,] 0.209715200
+ [8,] 0.167772160
+ [9,] 0.134217728
+ [10,] 0.107374182
+ [11,] 0.085899346
+ [12,] 0.068719477
+ [13,] 0.054975581
+ [14,] 0.043980465
+ [15,] 0.035184372
+ [16,] 0.028147498
+ [17,] 0.022517998
+ [18,] 0.018014399
+ [19,] 0.014411519
+ [20,] 0.011529215
+ [21,] 0.009223372
+ [22,] 0.007378698
+ [23,] 0.005902958
+ [24,] 0.004722366
+ [25,] 0.003777893
+
+ +

And find that the false negative rate is less than 1% at i = 21.

+ +

Great! Thanks. I can't believe I didn't see that. I was trying all kinds of conditional probabilities and such for some reason. Keep it simple, stupid...

+",2013-10-17 16:36:36.770 +57720,13549.0,2,,57712.0,,,,CC BY-SA 3.0,"

In addition to the regression line that Jeremy suggested, it would be helpful to your audience to have the p-value of the slope and the R-squared. I'm not sure how new you are to this, but the p-value of the slope is really part of a test of whether or not the slope = 0. If the p-value is statistically significant (p < 0.05 usually) then you can be pretty confident that the slope is not zero and that there is likely to be a ""real"" relationship between the independent and dependent variables. The R-squared shows how much of the variation in y is explained by variation in x. For instance, I would suspect that the ADL relationship is probably real (significant p-value) but that the R-squared will be low-ish. This isn't bad, but simply means that there are probably other things affecting ADL sorghum silage aside from ADL fresh sorghum.

+",2013-10-17 16:40:07.893 +57721,436.0,1,,,,Analysis of temporal patterns,,CC BY-SA 3.0,"

I am analysing data on events that I have categorized into groups.

+ +

So, for instance, say I have 3000 events categorized into 5 groups, which we call A to E.

+ +

I will have something like

+ +
    Event  | Group | Time
+   --------+-------+-------
+       1   |   A   |   0
+       2   |   A   |   5
+       3   |   C   |   7
+       4   |   D   |   16
+      ...  |       |
+     3000  |   B   |   6000
+
+ +

Now, I would like to see whether there is some sort of n-event long temporal sequence appearing repeatedly (higher than chance).

+ +

So for instance one 4-event long pattern may be:
+A - 3 seconds - A - 2 seconds - D - 5 seconds - C

+ +

I found this paper which proposes some interesting method (I am not actually working on spike trains, but the issue is similar enough), but before implementing that I would like to see whether anyone knew of other methods/statistics that can be applied to this kind of problems.

+",2013-10-17 17:37:34.770 +57726,22802.0,1,,,,Nonparametric test for trend using Python,,CC BY-SA 4.0,"

I am looking to perform a nonparametric test for trend on a continuous outcome across three groups, preferably in Python. For example height (pretend height is not normal) in 4th, 5th and 6th graders.

+ +

I would like to implement something like the Cuzick method. Scipy has Wilcoxon rank sum and other nonparametric methods but only for two groups. Similarly, Scipy has a Kruskal-Wallis method for three groups but it does not indicate direction or trends. Does anything like this exist for exploring a directional trend across three groups?

+ +
+ +

To clarify, I am trying to determine whether there is a significant shift in a continuous trait measured across three groups. The groups will be of very different sizes: group 1 has thousands of samples and is likely to be normally distributed, group 2 has hundreds of samples, and group 3 has ~10 or fewer. Group 1 serves as the ""control"" group, and my hypothesis is that the mean value of group 2 will be shifted in either direction relative to group 1, and that group 3 will be shifted further in the same direction as group 2. Because group 3 will always be very small compared to the other groups, my instinct was to use nonparametric methods, but I am open to other suggestions.

+ +

Can anyone suggest a method to explore this type of directional trend?

+",2013-10-17 18:51:01.530 +57727,22341.0,1,,,,How to show that the mean is (weakly) consistent,,CC BY-SA 3.0,"

How can I show that the mean is weakly consistent? Is weakly consistent the same as consistent?

+",2013-10-17 19:26:42.837 +57728,22805.0,2,,4705.0,,,,CC BY-SA 3.0,"

Joseph Hilbe (1944-), first president of the International Astrostatistics Association and author of over 10 books on statistical modeling, including popular texts on count models, logistic regression, generalized estimating equations (GEE), generalized linear models, and statistical methodology. Hilbe is an emeritus professor at the University of Hawaii and adjunct professor of statistics at Arizona State University.

+",2013-10-17 19:49:01.727 +57729,14806.0,1,57730.0,,,"ANOVA failed to model data, what is a more appropriate test?",,CC BY-SA 3.0,"

This is a cross-post (https://stackoverflow.com/questions/19432964/anova-error-in-levelsxx) about an error I received in R while trying to run an ANOVA on my data. But error aside, I need help understanding why an ANOVA can't deal with my data and what other statistical models could be applied instead.

+ +

So here's my objective: I have 3 people (speaker) who recorded a bunch of words that I analyzed. The analysis yielded 3 continuous variables: skewness, kurtosis and Center of Gravity (CoG)*. I need to find out what combinations of these 3 variables best model the difference between each speaker. For example, are skewness and CoG together more significant than just CoG in finding the difference between speakers?

+ +

I have a basic knowledge of stats, but erring on the side of assuming I'm an idiot might be better for any complex explanations.

+ +

Thanks in advance!

+ +
  • The skewness is a measure for how much the shape of the spectrum below the center of gravity is different from the shape above the mean frequency.

  • The kurtosis is a measure for how much the shape of the spectrum around the center of gravity is different from a Gaussian shape.

  • The center of gravity is a measure for how high the frequencies in a spectrum are on average, weighted by their energy.
+",2013-10-17 19:53:55.743 +57730,13037.0,2,,57729.0,,,,CC BY-SA 3.0,"

Sounds like you are trying to do Multinomial regression. Perhaps look up information on that.

+ +

Here is a great start:

+ +

http://www.ats.ucla.edu/stat/r/dae/mlogit.htm

+ +

e.g.

+ +
install.packages('nnet')
+library(nnet)
+
+test<-multinom(formula = as.factor(speaker) ~ CoG * skewness * kurtosis, data = total)
+
+
+z <- summary(test)$coefficients/summary(test)$standard.errors
+# 2-tailed z test
+p <- (1 - pnorm(abs(z), 0, 1)) * 2
+
+",2013-10-17 20:05:44.610 +57731,22806.0,1,,,,Is there any way to model this grouping process?,,CC BY-SA 4.0,"

I've been working on this problem for three days now, and it doesn't seem that it can be solved using pure thinking. Maybe there is some distribution that can model this but I couldn't find any solution till now.

+ +

This is the process I'm trying to model: I have $K$ points randomly dispersed in an area of radius $R$ according to a 2D Poisson process. I start by putting the first point in a group. Then, the distance $d$ between the first and the second point is found. If this distance is larger than a certain threshold ($Dth$), then the second point will be put in the same group as the first point. Otherwise, the second point will be put in a group alone. The process continues until the last point $K$.

+ +

A point will be put in a group if it has its distance from all the points in that group larger than $Dth$.

+ +

I've derived $P(d>Dth)$ which is the probability that two points have a distance between them larger than $Dth$, so let's denote it by $p$, and $P(d<Dth)$ by $q$.

+ +

What I want to find is either the average number of groups resulting given $K$ points (don't care about the radius $R$ because it is incorporated in the given probabilities) or the average number of points per group given $K$.
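
If a closed form proves elusive, one reading of the process can at least be simulated to estimate the average number of groups; a rough R sketch (uniform points in a disc of radius R as a stand-in for the 2D Poisson scatter, with first-fit assignment to groups; both are assumptions on my part):

sim_groups <- function(K, R = 1, Dth = 0.3) {
  r  <- R * sqrt(runif(K)); th <- runif(K, 0, 2 * pi)   # K points uniform in the disc
  p  <- cbind(r * cos(th), r * sin(th))
  groups <- list(p[1, , drop = FALSE])
  for (i in 2:K) {
    placed <- FALSE
    for (g in seq_along(groups)) {
      dists <- sqrt(rowSums(sweep(groups[[g]], 2, p[i, ])^2))
      if (all(dists > Dth)) {                            # farther than Dth from every member
        groups[[g]] <- rbind(groups[[g]], p[i, ]); placed <- TRUE; break
      }
    }
    if (!placed) groups <- c(groups, list(p[i, , drop = FALSE]))
  }
  length(groups)
}
mean(replicate(2000, sim_groups(K = 20)))                # estimated average number of groups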

+",2013-10-17 20:22:25.147 +57732,14803.0,1,,,,distribution of sample variance or sample coefficient of variation under gamma parent distribution,,CC BY-SA 3.0,"

I am looking for anything related to the distribution of the sample variance or sample coefficient of variation (or joint with sample mean or conditional on sample mean) when the parent distribution is Gamma. Any information of reference would be greatly appreciated.

+",2013-10-17 21:01:28.510 +57733,22340.0,1,,,,Setting intercept to zero: Will this change both standard deviations and the error term?,,CC BY-SA 3.0,"

After running a single regression with a forced zero intercept, I understand that $\beta$ (slope(s)) will change as $\alpha$ (intercept) will be set to zero. Easy.

+ +

$\rho = \beta(\sigma_x / \sigma_y)$ is left in question...

+ +

by way of.... $\beta = \frac{\mathrm{Cov}(X,Y)}{\mathrm{Var}(X)}$.....or......${\mathrm{Cov}(X,Y)}= \beta(\sigma_x^2)$

+ +

Given the forced change, will the $\sigma_x$ and $\sigma_y$ (std.devs) both remain as their non-forced original values, with $\rho$ (correlation) changing to match the new forced slope? What will happen to the error term given the forced zero regression?

+ +

Note: Calculations confirmed in Excel via using StdevP & VarP functions in small sample group.

+",2013-10-17 21:59:44.060 +57747,,2,,57740.0,mrip,,,CC BY-SA 3.0,"

What you've written is a one-sided test. A two sided test would be

+ +
pvalue <- mean(abs(reps) >= abs(ts))
+
+",2013-10-18 03:35:40.610 +57748,1895.0,2,,56784.0,,,,CC BY-SA 3.0,"

The basic results of chi-square goodness-of-fit testing can be understood hierarchically.

+ +

Level 0. The classical Pearson's chi-square test statistic for testing a multinomial sample against a fixed probability vector $p$ is +$$ +X^2(p) = \sum_{i=1}^k \frac{(X^{(n)}_i - n p_i)^2}{n p_i} \stackrel{d}{\to} \chi_{k-1}^2 \>, +$$ +where $X_i^{(n)}$ denotes the number of outcomes in the $i$th cell out of a sample of size $n$. This can be fruitfully viewed as the squared norm of the vector $\mathbf Y_n = (Y_1^{(n)},\ldots,Y_k^{(n)})$ where $Y_i^{(n)} = (X_i^{(n)} - n p_i)/\sqrt{n p_i}$ which, by the multivariate central limit theorem converges in distribution as +$$ +\mathbf Y_n \stackrel{d}{\to} \mathcal N(0, \mathbf I - \sqrt{p}\sqrt{p}^T) \>. +$$ +From this we see that $X^2 = \|\mathbf Y_n\|^2 \to \chi^2_{k-1}$ since $\mathbf I - \sqrt{p}\sqrt{p}^T$ is idempotent of rank $k-1$.
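
(As a quick numerical sanity check of this Level 0 statement, one can simulate in R:)

set.seed(1)
p <- c(0.2, 0.3, 0.5); n <- 500
X2 <- replicate(5000, {
  x <- as.vector(rmultinom(1, n, p))
  sum((x - n * p)^2 / (n * p))       # Pearson's statistic against the fixed p
})
# compare with the chi-square(k - 1) reference distribution
qqplot(qchisq(ppoints(5000), df = length(p) - 1), X2); abline(0, 1)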

+ +

Level 1. At the next level of the hierarchy, we consider composite hypotheses with multinomial samples. Since the exact $p$ of interest is unknown under the null hypothesis, we have to estimate it. If the null hypothesis is composite and composed of a linear subspace of dimension $m$, then maximum likelihood estimates (or other efficient estimators) of the $p_i$ can be used as ""plug-in"" estimators. Then, the statistic +$$ +X^2_1 = \sum_{i=1}^k \frac{(X^{(n)}_i - n \hat{p}_i)^2}{n \hat{p}_i} \stackrel{d}{\to} \chi_{k-m - 1}^2 \>, +$$ +under the null hypothesis.

+ +

Level 2. Consider the case of goodness of fit testing of a parametric model where the cells are fixed and known in advance: For example, we have a sample from an exponential distribution with rate $\lambda$ and from this we produce a multinomial sample by binning over $k$ cells, then the above result still holds provided that we use efficient estimates (e.g., MLEs) of the bin probabilities themselves using only the observed frequencies.

+ +

If the number of parameters for the distribution is $m$ (e.g., $m = 1$ in the exponential case), then +$$ +X^2_2 = \sum_{i=1}^k \frac{(X^{(n)}_i - n \hat{p}_i)^2}{n \hat{p}_i} \stackrel{d}{\to} \chi_{k-m - 1}^2 \>, +$$ +where here $\hat{p}_i$ can be taken to be the MLEs of the cell probabilities of the fixed, known cells corresponding to the given distribution of interest.

+ +

Level 3. But, wait! If we have a sample $Z_1,\ldots,Z_n \sim F_\lambda$, why shouldn't we estimate $\lambda$ efficiently first, and then use a chi-square statistic with our fixed, known cells? Well, we can, but in general we no longer get a chi-square distribution for the corresponding chi-square statistic. In fact, Chernoff and Lehmann (1954) showed that using MLEs to estimate the parameters and then plugging them back in to get estimates of the cell probabilities results in a non-chi-square distribution, in general. Under suitable regularity conditions, the distribution is (stochastically) between a $\chi_{k-m-1}^2$ and a $\chi_{k-1}^2$ random variable, with the distribution depending on the parameters.

+ +

Intuitively, this means that the limiting distribution of $\mathbf Y_n$ is $\mathcal N(0, \mathbf I - \sqrt{p_\lambda}\sqrt{p_\lambda}^T - \mathbf A(\lambda))$.

+ +

We haven't even talked about random cell boundaries yet, and we're already in a bit of a tight spot! There are two ways out: One is to retreat back to Level 2, or at the very least not use efficient estimators (like MLEs) of the underlying parameters $\lambda$. The second approach is to try to undo the effects of $\mathbf A(\lambda)$ in such a way as to recover a chi-square distribution.

+ +

There are several ways of going the latter route. They basically amount to premultiplying $\mathbf Y_n$ by the ""right"" matrix $\mathbf B(\hat{\lambda})$. Then, the quadratic form +$$ +\mathbf Y_n^T \mathbf B^T \mathbf B \mathbf Y_n \stackrel{d}{\to} \chi_{k-1}^2 \>, +$$ +where $k$ is the number of cells.

+ +

Examples are the Rao–Robson–Nikulin statistic and the Dzhaparidze–Nikulin statistic.

+ +

Level 4. Random cells. In the case of random cells, under certain regularity conditions, we end up in the same situation as in Level 3 if we take the route of modifying the Pearson chi-square statistic. Location-scale families, in particular, behave very nicely. One common approach is to take our $k$ cells each to have probability $1/k$, nominally. So, our random cells are intervals of the form $\hat{I}_j = \hat \mu + \hat\sigma I_{0,j}$ where $I_{0,j} = [F^{-1}((j-1)/k), F^{-1}(j/k))$. This result has been further extended to the case where the number of random cells grows with the sample size.

+ +

References

+ +
  1. A. W. van der Vaart (1998), Asymptotic Statistics, Cambridge University Press. Chapter 17: Chi-Square Tests.

  2. H. Chernoff and E. L. Lehmann (1954), The use of maximum likelihood estimates in $\chi^2$ tests for goodness of fit, Ann. Math. Statist., vol. 25, no. 3, 579–586.

  3. F. C. Drost (1989), Generalized chi-square goodness-of-fit tests for location-scale models when the number of classes tends to infinity, Ann. Stat., vol. 17, no. 3, 1285–1300.

  4. M. S. Nikulin (1973), Chi-square test for continuous distribution with shift and scale parameters, Theory of Probability and its Applications, vol. 19, no. 3, 559–568.

  5. K. O. Dzaparidze and M. S. Nikulin (1973), On a modification of the standard statistics of Pearson, Theory of Probability and its Applications, vol. 19, no. 4, 851–853.

  6. K. C. Rao and D. S. Robson (1974), A chi-square statistic for goodness of fit tests within exponential family, Comm. Statist., vol. 3, no. 12, 1139–1153.

  7. N. Balakrishnan, V. Voinov and M. S. Nikulin (2013), Chi-Squared Goodness of Fit Tests With Applications, Academic Press.
+",2013-10-18 03:36:15.927 +57734,22808.0,1,,,,Confidence interval for multiple regression parameter,,CC BY-SA 3.0,"

I'm given the least squares model:

+ +
Y = B0 + B1x1 + B2x2 + B3x1x2
+Y = 12 -2x1 + 7x2 +5x1x2
+
+n = 20
+
+ +

as well as some RSS's

+ +
> sum( lm( y ~ 1 )$residuals^2 )                                  #$ (to fix display bug)
+[1] 456 
+> sum( lm( y ~ x1 )$residuals^2 )                                 #$
+[1] 320 
+> sum( lm( y ~ x2 )$residuals^2 )                                 #$
+[1] 360 
+> sum( lm( y ~ x1 + x2 )$residuals^2 )                            #$
+[1] 288 
+> sum( lm( y ~ x1 + x2 + I(x1*x2) )$residuals^2 )                 #$
+[1] 240
+
+ +

So, I know the least squares estimate for B3 is 5.

+ +

I did ANOVA on the full model versus the model where B3 = 0. I found the F statistic for B3 = 0 to be 3.2.

+ +

Now I need to find a 95% confidence interval for B3. I'm not sure where to go from here.

+",2013-10-17 22:08:41.807 +57735,594.0,2,,57695.0,,,,CC BY-SA 3.0,"

[In the absence of a clearly identified aim I'll begin with some general comments in the hope that the purpose of your analysis becomes clarified. It would be nice to know if you're after an interval, a hypothesis test, or simply a standard error - but if the last, to what end?]

+ +

If the distribution of the population from which the sample is drawn is known, the distribution of the median may be computed.

+ +

The density of the $r$-th order statistic for a sample of size $n$ for a continuous random variable is

+ +

$$f_{Y_r}=\frac{n!}{(r-1)!(n-r)!}[F(x)]^{r-1}[1-F(x)]^{n-r}f(x)$$

+ +

For $n=3$,

+ +

$$f_{Y_2}= 6F(x)[1-F(x)]f(x)$$

+ +

For even $n$ it's more complex, but sometimes still doable.

+ +

If the sample size is 1, the distribution of the median is trivial - it's just the distribution of a single observation.

+ +

If the density is available, it should be possible to compute the standard deviation of the distribution of the order statistic.
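
For instance, for $n=3$ draws from a standard normal, R can do this numerically (the mean of the sample median is 0 by symmetry, so only the second moment is needed):

dmed3 <- function(x) 6 * pnorm(x) * (1 - pnorm(x)) * dnorm(x)   # density of the middle order statistic
integrate(dmed3, -Inf, Inf)$value                               # should be ~1
sqrt(integrate(function(x) x^2 * dmed3(x), -Inf, Inf)$value)    # sd of the median for n = 3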

+ +

(If you don't know the distribution, it's also possible to get an asymptotic standard error for the median, but it relies on knowing the height of the density at the median, which - while a much weaker requirement - would seem unlikely unless you knew the distribution.)

+ +

Additionally, you can generate nonparametric intervals for a median from the order statistics, but I don't think this gets at your present problem.

+",2013-10-17 22:19:18.220 +57736,22807.0,2,,57676.0,,,,CC BY-SA 3.0,"

Is this the problem from the STA511 class?:)

+ +

pnorm() won't give you the right result, because it's a CDF. What you are looking for is the inverse of the CDF, so you have to use qnorm() to get it.
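
For example:

qnorm(0.975)           # the value whose CDF is 0.975 (about 1.96)
pnorm(qnorm(0.975))    # back to 0.975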

+",2013-10-17 22:41:36.583 +57737,20981.0,2,,57715.0,,,,CC BY-SA 3.0,"

Try running ANOVA on your model, e.g.

+ +
anova(glm(...),test=""Chisq"")
+
+ +

The drop1, add1 and step functions might also be useful.

+ +

http://data.princeton.edu/R/glms.html

+",2013-10-17 23:40:36.663 +57738,22815.0,1,,,,"ANOVA using R - unsure of if analysis is appropriate, and if variables need to be numeric",,CC BY-SA 3.0,"

I'm doing an ANOVA to compare the results of two trials.

+ +

Individuals had to walk towards a bucket blindfolded, with their deviation from the target recorded. I want to see if individuals get better with practice (are trial-2 deviations lower than trial-1?) and whether there is a relationship between the trials regarding gender and handedness (left-handed/right-handed).

+ +

Is an ANOVA appropriate?

+ +

Also, I'm having trouble in R because I've put gender as M and F, and Handedness as L or R. Do I need to make these numerical or not?

+",2013-10-18 00:37:48.237 +57739,22816.0,1,,,,"Basic GARCH (1,1) question",,CC BY-SA 3.0,"

Background to question:

+ +

I was trying to fit a GARCH(1,1) model to the variance of log returns of a series, and ARMA(0,0) for the mean. I was using the fGarch package in R to do this. The aim of the modeling is to generate a predicted volatility number to feed into the Black-Scholes model to generate an option price and therefore option deltas. I plan to backtest the delta from GARCH volatility to hedge my option positions (as opposed to deltas derived from implied vol prices).

+ +

Questions:

+ +

A) I used the predict function in the package to generate an 'n-day ahead' volatility forecast. As I understand GARCH, these numbers are annualized standard deviation numbers. To hedge a 1-month option I want to forecast 30-day volatility. I can simply set 'n-days ahead' = 30 to get the numbers, but how do I combine those 30 numbers to get an annualized vol number?

+ +

B) Could anyone also please explain how to use the nroll argument in the package? Basically I want rolling GARCH estimates of volatility. For example, at day 10, I want to use the past 10 days of data to get a vol prediction for day 11, at day 50 I want to use 50 days of data for vol prediction of day 51 etc.

+",2013-10-18 01:18:25.060 +57740,22844.0,1,,,user2892710,Two sided permutation test,,CC BY-SA 3.0,"

I'm trying to write a two-sided permutation test to test the alternative hypothesis that there is a difference in the medians of 2 independent samples. My question is this: am I calculating the p-value correctly? Thanks SO much!

+ +
sample1 <- groundwater$West[!is.na(groundwater$West)]
+sample2 <- groundwater$East
+ts <- median(sample1) - median(sample2)
+> ts
+[1] 0.105
+R <- 9999
+all <- c(sample1, sample2)
+k <- 1:length(all)
+reps <- numeric(R)
+for (i in 1:R) {
+    m <- sample(k, size=length(sample1), replace=FALSE)
+    permsample1 <- all[m]
+    permsample2 <- all[-m]
+    reps[i] <- median(permsample1) - median(permsample2)
+}
+pvalue <- sum(c(ts, reps) <= ts)/9999
+> pvalue
+[1] 0.9223922
+
+",2013-10-18 01:19:42.453 +57741,11656.0,1,,,,Who is the best writer among statisticians?,,CC BY-SA 3.0,"

I'd like to learn their writing style. Could you recommend a statistician who is a good writer?

+",2013-10-18 01:50:55.030 +57742,594.0,2,,56784.0,,,,CC BY-SA 3.0,"

I've found at least partial answers to my question, below. (I'd still like to give someone that bonus, so any further information appreciated.)

+ +

Moore (1971) said that Roy (1956) and Watson (1957,58,59) showed that when the cell boundaries +for a chi-square statistic are functions of best asymptotic normal estimated parameter values, then under certain conditions, the asymptotic null distribution of the +chi-square statistic is still that of a sum of a $\chi^2_{k-p-1}$ and a weighted +sum of $p$ $\chi^2_1$ variables (for $k$ cells, $p$ parameters) +where the weights are between 0 and 1 (making the cdf of the +distribution between that of a $\chi^2_{k-p}$ and a $\chi^2_{k}$, as alluded to in my question for the distribution when using ML estimation), and the weights on those last $p$ terms are unaffected by that estimation.

+ +

References

+ +

Moore D.S. (1971), A Chi-Square Statistic with Random Cell +Boundaries, Ann. Math. Stat., Vol 42, No 1, 147–156.

+ +

Roy A.R. (1956), On $\chi^2$ statistics with variable intervals, +Technical Report No. 1, Dept of Statistics, Stanford University.

+ +

Watson, G.S. (1957), The $\chi^2$ goodness-of-fit test for normal +distributions, Biometrika, 44, 336–348.

+ +

Watson, G.S. (1958), On $\chi^2$ goodness-of-fit tests for +continuous distributions, J. Royal Statist. Soc. B, 20, 44–61.

+ +

Watson, G.S. (1959), Some recent results in $\chi^2$ goodness-of- +fit tests, Biometrics, 15, 440-468

+",2013-10-18 01:51:22.243 +57749,594.0,2,,57746.0,,,,CC BY-SA 3.0,"

Here are some major hints to get you started:

+ +

For the first one, you might want to consider what $f'$ looks like (as a formula) between the sorted data values (it's discontinuous at the data values). How does it behave as you move through the data? Where does it change from below 0 to above 0?

+ +

Here's $f$ vs $t$ (the grey vertical bars on top of the x-axis are the data values):

+ +

+ +

Can you see how to show what will minimize $f$ now?
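
(If you want to reproduce a picture like this yourself, here's a short R sketch with made-up data:)

y  <- c(1, 2, 4, 7, 8)                    # made-up data
f  <- function(t) sum(abs(y - t))
tt <- seq(0, 9, by = 0.01)
plot(tt, sapply(tt, f), type = 'l', xlab = 't', ylab = 'f(t)')
rug(y)                                    # the kinks sit at the data values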

+ +
+ +

For the second case, consider $f_k(t) = ( \sum_{i=1}^{n} |y_i-t|^{k} )^{1/k}$ (you would need to argue that $f_k$ and $f$ share an argmin for a given $k$ - that is, taking the $1/k$ power doesn't change the location of the minimum). Below (for the same data as above) $f_k$ is plotted for $k = 2,3,5,9,$ and $99$ (the ""$k=2$"" case is faint dotted grey at the top, the ""$k=99$"" case is purple near the bottom):

+ +

+ +

The ""$k=2$"" case corresponds to least squares, while $k=1$ would be the previous example. As $k$ increases, it's getting 'pointier'. Can you figure where the 'point' is headed, and why? Can you work out $f_k'$? What happens to that as $k$ increases?

+",2013-10-18 04:47:18.360 +57750,22678.0,1,,,,"Lasso ||a|| and ""General Lasso"" ||Da||",,CC BY-SA 4.0,"

Ryan Tibshirani once introduced a more general type of lasso, where the regularizer is $$\parallel D \alpha \parallel_1$$ instead of $\parallel \alpha \parallel_1$. See paper

+ +

However, there is hardly any discussion of this form, and I wonder why, since it's a great way to deal with derivative-smoothness regularizers.

+ +
  • Is there an easy way I overlooked to transform a general lasso to the standard lasso form?

  • Which algorithm can be used for the general lasso? Currently I have only tested quadratic programs, but this is quite slow.
+",2013-10-18 07:28:18.987 +57751,5671.0,2,,57685.0,,,,CC BY-SA 3.0,"

You may want to look at Multidimensional Scaling. This will try to find the 2d projection that best preserves your distances. For obvious reasons, there is no lossless 4d to 2d projection.

+",2013-10-18 07:46:48.313 +57752,22601.0,1,60010.0,,,Find k of n items with least pairwise correlations,,CC BY-SA 3.0,"

I have a matrix of pairwise correlations between n items. Now I want to find a subset of k items with the least correlation. Thus there are two questions:

+ +
  1. Which is the appropriate measure for the correlation within that group?

  2. How to find the group with the least correlation?
+ +

This problem appears like a kind of inverse factor analysis to me and I'm pretty sure that there is a straight-forward solution.

+ +

I think this problem is actually equivalent to removing (n-k) nodes from a complete graph so that the remaining nodes are connected by edges with minimum total weight. What do you think?

+ +

Thanks for your suggestions in advance!

+",2013-10-18 07:50:14.530 +57753,16474.0,2,,57715.0,,,,CC BY-SA 3.0,"

One thing you can do is to exclude the constant and the main effects from your model and not leave out the reference categories. That way the coefficients will be the (adjusted) log odds, and the test commonly reported next to the coefficients will be the test you are looking for. I wrote a brief discussion on that trick for Stata here. I don't know enough about R to tell you which commands to type, but I am certain one can also do it in R.
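
In R, with the mock data from the question, a minimal version of this trick would be something along these lines:

# dropping the intercept makes each coefficient the log odds for one condition,
# so the reported z tests are tests of log odds = 0
summary(glm(outcome ~ 0 + f12, family = binomial, data = mockdata))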

+",2013-10-18 07:52:11.500 +57754,16474.0,2,,57707.0,,,,CC BY-SA 3.0,"

One trick that often helps for logistic regression type problems is to realize that:

+ +

$1 - h(x^{(i)}) = h(-x^{(i)})$

+ +

and that $h(-x^{(i)})$ is more numerically stable than $1 - h(x^{(i)})$.
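
A quick R illustration of the difference, using plogis as the logistic CDF $h$:

x <- 40
1 - plogis(x)    # underflows to 0
plogis(-x)       # about 4.2e-18, the correct tiny probability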

+ +

You can find a discussion of that here. This is an article in the Stata Journal so the examples are in Stata/Mata, but the problem has to do with the way computers store numbers and is thus more general. For example, I have been able to reproduce the first anomalous example exactly in R, i.e. not just the general pattern but the exact values.

+",2013-10-18 08:06:21.207 +57755,23201.0,1,58862.0,,Chris K,"Is there a better name than ""average of the integral""?",,CC BY-SA 3.0,"

I'm testing throttle position sensors (TPS) my business sells and I print the plot of voltage response to the throttle shaft's rotation. A TPS is a rotational sensor with $\approx$ 90° of range and the output is like a potentiometer with full open being 5V (or sensor's input value) and initial opening being some value between 0 and 0.5V. I built a test bench with a PIC32 controller to take a voltage measurement every 0.75° and the black line connects these measurements.

+ +

One of my products has a tendency to make localized, low amplitude variations away from (and under) the ideal line. This question is about my algorithm for quantifying these localized ""dips""; what is a good name or description for the process of measuring the dips? (full explanation follows) In the below picture, the dip occurs at the left third of the plot and is a marginal case whether I would pass or fail this part:

+ +

+ +

So I built a dip detector (stackoverflow qa about the algorithm) to quantify my gut feeling. I initially thought I was measuring ""area"". This graph is based on the printout above and my attempt to explain the algorithm graphically. There is a dip lasting for 13 samples between 17 and 31:

+ +

+ +

Test data goes in an array and I make another array for ""rise"" from one data point to the next, which I call $deltas$. I use a library to get the average and standard deviation for $deltas$.

+ +

Analyzing the $deltas$ array is represented in the graph below, where the slope is removed from the above graph. Originally, I thought of this as ""normalizing"" or ""unitizing"" the data as the x axis are equal steps and I'm now solely working with the rise between data points. When researching this question, I recalled this is the derivative, $\frac {dy}{dx}$ of the original data.

+ +

+ +

I walk through $deltas$ to find sequences where there are 5 or more adjacent negative values. The blue bars are a series of data points that are below the average of all $deltas$. The values of the blue bars are:

+ +

$0.7 + 1.2 + 1.3 + 1.4 + 1.8 + 2.5 + 2.9 + 3.0 + 2.5 + 2.0 + 1.5 + 1.0 + 1.2$

+ +

They sum to $23$, which represents the area (or the integral). My first thought is ""I just integrated the derivative"" which should mean I get back the original data, though I'm certain there's a term for this.

+ +

The green line is the average of these ""below average values"" found via dividing the area by the length of the dip:

+ +

$23 \div 13 = 1.77$
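
(In case it makes the steps clearer, the same computation sketched in a few lines of R rather than the PIC32 code, with volts standing in for the raw samples:)

deltas <- diff(volts)                      # the rise between successive samples
below  <- deltas < mean(deltas)            # steps below the average delta
runs   <- rle(below)                       # consecutive runs of below-average steps
ends   <- cumsum(runs$lengths)
starts <- ends - runs$lengths + 1
for (i in which(runs$values & runs$lengths >= 5)) {
  idx <- starts[i]:ends[i]
  cat('dip of', length(idx), 'samples, mean shortfall',
      mean(mean(deltas) - deltas[idx]), '\n')
}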

+ +

During the testing of 100+ parts, I came to decide that dips with my green line average less than $2.6$ are acceptable. Standard deviation calculated across the entire data set wasn't a strict enough test for these dips, as without enough total area, they still fell within the limit I established for good parts. I observationally chose standard deviation of $3.0$ to be the highest I would allow.

+ +

Setting a cutoff for standard deviation strict enough to fail this part would then be so strict as to fail parts which otherwise appear to have a great plot. I do also have a spike detector which fails the part if any $|deltas - avg| > avg+std dev$.

+ +

It's been almost 20 years since Calc 1, so please go easy on me, but this feels a lot like when a professor used calculus and the displacement equation to explain how in racing, a competitor with less acceleration who maintains higher corner speed can beat another competitor having greater acceleration to the next turn: going through the previous turn faster, the higher initial speed means the area under his velocity (displacement) is greater.

+ +

To translate that to my question, I feel like my green line would be like acceleration, the 2nd derivative of the original data.

+ +

I visited Wikipedia to re-read the fundamentals of calculus and the definitions of derivative and integral, and learned that the proper term for adding up the area under a curve via discrete measurements is numerical integration. Much more googling on ""average of the integral"" led me to the topics of nonlinearity and digital signal processing. Averaging the integral seems to be a popular metric for quantifying data.

+ +

Is there a term for the ""average of the integral"" ($1.77$, the green line), or for the process of using it to evaluate data?

+",2013-10-18 08:27:04.943 +57756,22822.0,1,,,,Panel study is a quasi-experimental study? Quasi-experimental is the same as correlational?,,CC BY-SA 3.0,"

I'm working with panel data. This panel dataset includes data from pupils of two kinds of schools:

+ +
  • State schools (G1), where pupils change to a secondary school when they are 12 years old.
  • Private schools (G2), where pupils remain in the same school; they can continue studying in the same school when they are 12 years old.
+ +

I want to compare the change in a dependent variable between these two groups of pupils, controlling for some other independent variables.

+ +

So, is this a quasi-experimental design? Maybe G1 can be seen as a treatment group and G2 as a control, although there isn't randomized assignment? Or is this an observational study? And finally, what exactly is a correlational study? Is it quasi-experimental or observational?

+",2013-10-18 09:14:44.847 +57757,22823.0,1,57844.0,,,Distance or Similarity metric for 2D frequency data maps,,CC BY-SA 3.0,"

I want to compare the distance/similarity of 2D flood frequency data maps. The maps are square with a YxY grid size, and each cell of the map stores its flood frequency. For example, in a 5x5 grid we may have these two flood frequency maps of the same area for the past 10 years, where we observe how many times the corresponding cell/place flooded:

+ +

0 0 0 0 0       0 0 0 0 0
+0 1 2 1 0       0 2 3 1 0
+0 4 6 2 0       9 9 8 7 6
+0 1 2 1 0       0 2 3 1 0
+0 4 6 2 0       9 9 8 7 6

+ +

I can easily transform these maps into probability maps that add up to one. So the question now is: what is the most meaningful way of comparing these kinds of maps with each other to find their (dis)similarity? A distance metric taken from the information theory field like JSD or L1 (and many others), or a similarity metric taken from the image processing field like the area under the ROC (and many others)?
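
+ +

For concreteness, a small R sketch comparing the two example maps with two of the candidate measures, the L1 distance and the Jensen-Shannon divergence, after normalising them to probability maps:

+ +

A <- matrix(c(0,0,0,0,0, 0,1,2,1,0, 0,4,6,2,0, 0,1,2,1,0, 0,4,6,2,0), 5, byrow = TRUE)
B <- matrix(c(0,0,0,0,0, 0,2,3,1,0, 9,9,8,7,6, 0,2,3,1,0, 9,9,8,7,6), 5, byrow = TRUE)
p <- as.vector(A) / sum(A)               # normalise to probability maps
q <- as.vector(B) / sum(B)

l1  <- sum(abs(p - q))                   # L1 distance (total variation = l1 / 2)
m   <- (p + q) / 2
kl  <- function(a, b) sum(ifelse(a > 0, a * log(a / b), 0))
jsd <- 0.5 * kl(p, m) + 0.5 * kl(q, m)   # Jensen-Shannon divergence
c(L1 = l1, JSD = jsd)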

+",2013-10-18 09:16:24.410 +57758,22824.0,1,57760.0,,,Question about $\epsilon' \epsilon$ in the linear regression model,,CC BY-SA 3.0,"

While studying the standard multivariate linear regression model, I came across the following:

+ +

+ +

Could anyone please explain to me why the last equality holds, and why $Z(Z'Z)^{-1}Z'$ cannot simply be simplified to $I$? (Isn't it true that $Z(Z'Z)^{-1}Z'=ZZ^{-1}Z'^{-1}Z'=II=I$?)

+",2013-10-18 09:21:02.257 +57759,14525.0,1,,,,Combining similarity scores,,CC BY-SA 3.0,"

I have a list of m x n similarity score matrices, something like

+ +
           c1         c2         c3         c4         c5  
+      d1  0.2159824  0.3528572  0.2390016  0.3673485  0.2849448
+      d2  0.2849448  0.2669695  0.2441495  0.3829949  0.3511353
+      d3  0.3281100  0.3251407  0.4328260  0.2895179  0.2814589  
+
+ +

these ""similarity scores"" lie in between 0-1. What I am trying to do here is to combine these scores into a single score, also in between 0-1.

+ +

My issue here is that I am not able to figure out a good approach to combine these scores into this single score. So far I have tried taking the average, the max value, and calculating row and column averages and using the max value out of them. The problem is that the matrices I have vary a lot in row and column lengths, and I cannot account for this variation using the average, because at the end of the day I have to sort these matrices based on this similarity score and select the n top-ranking ones. From manually checking these observations, I realized that the max value in a matrix is not a suitable single score for the similarity between these observations. Do you have any suggestions for an approach to combine these scores?

+ +

Also, are there any statistical tests that could be applied to this combined score? I have tried a random sampling approach, but the steps to calculate similarity scores for the observations take a long time to run, and iterating these steps ~1,000 times or more is not feasible now.

+",2013-10-18 09:23:40.167 +57760,21638.0,2,,57758.0,,,,CC BY-SA 3.0,"

Your first question has been answered by Glen_b in his comment. Regarding your second question, $\mathbf{Z}$ is not (generally) square, hence it does not have an inverse. $\mathbf{Z}'\mathbf{Z}$ on the other hand is square and can be inverted.
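
+ +

A quick numerical sketch (with an arbitrary 10 x 3 matrix $\mathbf{Z}$):

+ +

set.seed(1)
Z <- cbind(1, matrix(rnorm(20), 10, 2))        # 10 x 3, so Z is not square
H <- Z %*% solve(t(Z) %*% Z) %*% t(Z)          # the 'hat' (projection) matrix
isTRUE(all.equal(H, diag(10)))                 # FALSE: H is not the identity
isTRUE(all.equal(H %*% H, H))                  # TRUE: but H is idempotent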

+",2013-10-18 09:47:38.870 +57761,503.0,2,,57759.0,,,,CC BY-SA 3.0,"

You actually have two problems, not one.

+ +

The first problem seems to be to average the scores in a matrix. Here the mean, median, trimmed mean and winsorized mean all seem potentially sensible.

+ +

The other is to somehow ""account"" for the size of the matrix. Here the total size (rxc) seems to be the obvious solution.

+ +

If you need more than this, please clarify your question again, but I see no reason why the average (or any of the variations I listed) is a poor choice simply because the matrices are different sizes.
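
+ +

As a small illustration, several of these summaries are one-liners in R (M below is just a stand-in 3 x 5 score matrix):

+ +

set.seed(42)
M <- matrix(runif(3 * 5), nrow = 3)     # stand-in for one similarity matrix
s <- as.vector(M)
c(mean    = mean(s),
  median  = median(s),
  trimmed = mean(s, trim = 0.1),        # 10% trimmed mean
  size    = length(s))                  # r x c, the total size of the matrix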

+",2013-10-18 10:19:26.323 +57762,2081.0,2,,57752.0,,,,CC BY-SA 3.0,"

[Forewarning: this answer appeared before the OP decided to reformulate the question, so it may have lost relevance. Originally the question was about How to rank items according to their pairwise correlations]

+ +

Because a matrix of pairwise correlations isn't a unidimensional array, it is not quite clear what ""ranking"" should look like, especially as long as you haven't worked out your idea in detail, as it seems. But you mentioned PCA as suitable for you, and that immediately made me think of the Cholesky root as a potentially even more suitable alternative.

+ +

The Cholesky root is like the matrix of loadings produced by PCA, only it is triangular. I'll explain both with an example.

+ +
R, correlation matrix
+         V1       V2       V3       V4
+V1   1.0000   -.5255   -.1487   -.2790
+V2   -.5255   1.0000    .2134    .2624
+V3   -.1487    .2134   1.0000    .1254
+V4   -.2790    .2624    .1254   1.0000
+
+A, PCA full loading matrix
+          I       II      III       IV
+V1   -.7933    .2385    .2944    .4767
+V2    .8071   -.0971   -.3198    .4867
+V3    .4413    .8918    .0721   -.0683
+V4    .5916   -.2130    .7771    .0261
+
+B, Cholesky root matrix
+          I       II      III       IV
+V1   1.0000    .0000    .0000    .0000
+V2   -.5255    .8508    .0000    .0000
+V3   -.1487    .1589    .9760    .0000
+V4   -.2790    .1361    .0638    .9485
+
+A*A' or B*B': both restore R
+         V1       V2       V3       V4
+V1   1.0000   -.5255   -.1487   -.2790
+V2   -.5255   1.0000    .2134    .2624
+V3   -.1487    .2134   1.0000    .1254
+V4   -.2790    .2624    .1254   1.0000
+
+ +

PCA's loading matrix A is the matrix of correlations between the variables and the principal components. We may say so because the row sums of squares are all 1 (the diagonal of R), while the total sum of squares of the matrix is the overall variance (the trace of R). The elements of the Cholesky root B are correlations too, because that matrix also has these two properties. Columns of B are not the principal components of A, although they are ""components"", in a sense.

+ +

Both A and B can restore R and thus both can replace R as its representation. B is triangular, which clearly shows that it captures the pairwise correlations of R sequentially, or hierarchically. Cholesky's component I correlates with all the variables and is the linear image of the first of them, V1. Component II shares nothing further with V1 but correlates with the last three... Finally, IV is correlated only with the last variable, V4. Perhaps this sort of ""ranking"" is what you are seeking?
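
+ +

A small R sketch reproducing the two decompositions for the matrix above (the columns of A may differ in sign from the table, which does not affect the reconstruction):

+ +

R <- matrix(c( 1.0000, -.5255, -.1487, -.2790,
               -.5255, 1.0000,  .2134,  .2624,
               -.1487,  .2134, 1.0000,  .1254,
               -.2790,  .2624,  .1254, 1.0000), 4, 4)
e <- eigen(R)
A <- e$vectors %*% diag(sqrt(e$values))   # full PCA loading matrix
B <- t(chol(R))                           # lower-triangular Cholesky root
isTRUE(all.equal(A %*% t(A), R))          # TRUE: A restores R
isTRUE(all.equal(B %*% t(B), R))          # TRUE: B restores R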

+ +

The problem with the Cholesky decomposition, though, is that - unlike PCA - it depends on the order of the items in the matrix R. Well, you might sort the items in descending or ascending order of the sum of squared elements (or, if you like, the sum of absolute elements, or the multiple correlation coefficient - see below). This order reflects how strongly an item is correlated overall.

+ +
R, rearranged
+         V2       V1       V4       V3 
+V2   1.0000   -.5255    .2624    .2134 
+V1   -.5255   1.0000   -.2790   -.1487 
+V4    .2624   -.2790   1.0000    .1254 
+V3    .2134   -.1487    .1254   1.0000 
+
+Column sum of squares (descending)
+     1.3906   1.3761   1.1624   1.0833 
+
+B 
+          I       II      III       IV 
+V2   1.0000    .0000    .0000    .0000 
+V1   -.5255    .8508    .0000    .0000 
+V4    .2624   -.1658    .9506    .0000 
+V3    .2134   -.0430    .0655    .9738
+
+ +

From the last B matrix we see that V2, the most strongly correlated item overall, pawns all its correlations in I. The next most correlated item, V1, pawns all its correlatedness, except that with V2, in II; and so on.

+ +
+ +

Another option could be computing the multiple correlation coefficient for every item and ranking the items based on its magnitude. The multiple correlation between an item and all the other items grows as the item correlates more with them while they correlate less with each other. The squared multiple correlation coefficients form the diagonal of the so-called image covariance matrix, which is $\bf S R^{-1} S - 2S + R$, where $\bf S$ is the diagonal matrix of the reciprocals of the diagonal elements of $\bf R^{-1}$.
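
+ +

Continuing the sketch above, the squared multiple correlations can be read off the inverse of $\bf R$ directly:

+ +

smc <- 1 - 1 / diag(solve(R))   # squared multiple correlation of each item with the rest
round(smc, 4)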

+",2013-10-18 10:21:33.683 +57763,503.0,2,,57756.0,,,,CC BY-SA 3.0,"

Personally, I don't much care for the term ""quasi-experimental"" but it is used a lot. The Wikipedia entry for quasi-experiment seems to be good. Another way to think about it is that in a true experiment there is random selection and random assignment, but in an observational study there is neither. In a quasi-experiment there is one or the other but not both.

+ +

In your particular case, you seem to have no control over either selection or assignment, so I would call it an observational study.

+ +

As for ""correlational"" I've seen this used by many of my doctoral student clients. I think its frequent use comes from some book that seems to get recommended a lot. If terminology is sane, ""correlational"" should just mean ""involving correlations"", but I've seen it used for studies that involved only regressions. This terminological confusion is borne out by a Google search, which yields mostly results on sites such as ""about.com"". I'd avoid use of the term, myself; clearly correlations could be used in experimental designs, observational designs or pretty much any design you could come up with.

+",2013-10-18 10:28:30.710 +57764,22827.0,1,67832.0,,,Why does Naive Bayes outperform Support Vector Machines?,,CC BY-SA 3.0,"

I have a dataset composed of about 36000 attributes and 550 samples; the dataset is generated from text communication between people in some chat rooms.

+ +

The question is: when I try to classify these samples, a Naive Bayes classifier always outperforms a support vector machine, both in speed and accuracy. But in the literature it is always noted that SVM is better at text-mining classification tasks.

+ +

Can anyone please explain in which situations Naive Bayes is better and in which situations SVM?

+ +

For more information about the question:

+ +

I am using the RapidMiner tool with 10-fold cross-validation and stratified sampling. For Naive Bayes, the Laplace correction is applied; for SVM, I use a dot (linear) kernel and leave the other parameters at their defaults. But when I change the parameters and try again, I get the same result: Naive Bayes still outperforms SVM.

+",2013-10-18 10:39:49.607 +57765,22830.0,1,,,,Pearson correlation,,CC BY-SA 3.0,"

Should the sample size n be equal when we are looking for a simple correlation? I mean, is it OK if variable 1 has a few more or fewer observations than variable 2? I am computing the correlation between two variables... the n of one is a little higher than the n of the other!

+ +

I am looking for correlation between 2 scales (psy tests). The n of one is a little higher than the n of another. I mean not ALL the respondents who filled up one form (scale) have filled up the other. There are some (very few though) missing.

+",2013-10-18 11:05:21.687 +57766,22831.0,1,,,,How can I calculate utilities for attribute levels in conjoint analysis in R?,,CC BY-SA 3.0,"

I conducted a stated preference survey in which each respondent had to choose 1 set out of 3 choice sets (A, B and C), which are characterised by 4 attributes (let's say brand, color, size and keyboard yes/no) of either 2 or 3 levels.

+ +

I tried to calculate the utilities/coefficients with the estimation of multinomial logit model using the mlogit package in R. It's working well to calculate the utilities for the attributes (I followed exactly the script of Yves Croissant in his paper on the mlogit package.)

+ +

But I want to calculate the coefficients not for the attributes, but for the levels. I want to know what the utility is for, e.g., the color ""blue"". How can I calculate this? Is it possible at all to calculate it with the mlogit package or R?

+ +
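
A sketch of the usual approach (assuming a hypothetical long-format data frame 'cbc' with a choice indicator column and an alternative-id column 'alt'; check the details against the mlogit documentation): code each attribute as a factor, so that every level except the reference level gets its own part-worth utility.

+ +

library(mlogit)
cbc$color <- factor(cbc$color, levels = c('black', 'blue', 'red'))   # 'black' = reference level
cbc.ml <- mlogit.data(cbc, choice = 'choice', shape = 'long', alt.var = 'alt')
fit <- mlogit(choice ~ brand + color + size + keyboard, data = cbc.ml)
summary(fit)   # e.g. the 'colorblue' coefficient is the utility of blue relative to black

+ +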

I am grateful for any advice!

+",2013-10-18 11:39:06.263 +57780,10147.0,1,,,,Treatment of Ordinal Predicting Variable,,CC BY-SA 3.0,"

I am trying to perform a regression analysis where the response variable is ordinal and 15 out of the 16 predictor variables are also ordinal. Besides treating all these ordinal predictors as factors, is there any other option? What is the best way to treat these ordinal predictors? Thank you.
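
+ +

One option, sketched with hypothetical data 'd': declare the predictors as ordered factors (R then uses polynomial contrasts for them) and fit a proportional-odds model for the ordinal response with MASS::polr.

+ +

library(MASS)                                # polr(): proportional-odds logistic regression
d$y  <- factor(d$y,  ordered = TRUE)         # ordinal response
d$x1 <- factor(d$x1, ordered = TRUE)         # ordinal predictor -> polynomial contrasts
fit <- polr(y ~ x1 + x2, data = d, Hess = TRUE)
summary(fit)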

+",2013-10-18 15:24:35.867 +57781,,1,,,user31656,Structure of data and function call for recurrent event data with time-dependent variables,,CC BY-SA 3.0,"

I'm attempting to estimate the effect of 2 drugs (drug1, drug2) on the likelihood of a patient falling (event). The patients can fall more than once and can be put on or taken off the drugs at any point.

+ +

My question is how the data should be structured with regard to the time period (days), specifically whether there needs to be overlap between the days. There are two reasons why I think my structure is wrong, the first being a seemingly incorrect N. I am also getting some errors where the time period is a single day (i.e. time1=4, time2=4) and am unsure how these should be coded. Should the start time of subsequent entries be the stop time of the previous entry? I've tried it both ways (with and without overlap), and while having overlap gets rid of the warning, the N is still incorrect.

+ +
Warning message:
+In Surv(time = c(0, 2, 7, 15, 20, 0, 18, 27, 32, 35, 39, 46, 53,  :
+  Stop time must be > start time, NA created
+
+ +

Right now I have the data set up where the beginning of the next entry is the next day. Unique patients are identified by their chart numbers.

+ +
Time1    Time2    Drug1    Drug2   Event    ChartNo
+    0        2        1        0       0        123
+    3       10        1        1       1        123
+   11       14        1        1       1        123
+    0       11        0        1       0        345
+    0       19        1        0       1        678
+    0        4        0        1       0        900
+    5       18        1        1       0        900
+
+ +

Patient 123 was on drug1 from the start to day 2, after which point they had drug2 added. They went from day 3 to day 10 on both drugs before falling the first time, then fell a second time on day 14 while still on both drugs. Patient 345 went 11 days on drug2 without falling (and was then censored), etc.

+ +

The actual estimation looks like this:

+ +
library(survival); library(rms)   # Surv() comes from survival, cph() from rms
S <- Surv(time = time1, time2 = time2, event = event)
cox.rms <- cph(S ~ Drug1 + Drug2 + cluster(ChartNo), surv = TRUE)
+
+ +

My main concern is that the n for my analysis is reported to be 2017 (the number of rows in the data), when in actuality I only have 314 unique patients. I am unsure if this is normal or the result of some error I've made along the way.

+ +
> cox.rms$n
+Status
+No Event    Event 
+    1884      133 
+
+ +

The same is true when using coxph() from the survival package.

+ +
 n= 2017, number of events= 133
+
+ +

The number of events is correct however.

+ +

This Post seems to have it set up with the 'overlap' I described, but I am unsure about the N, and they don't seem to be clustering by ID.

+",2013-10-18 15:25:31.520 +57936,22906.0,1,57961.0,,,Generating survival times for a piecewise constant hazard model with two change points,,CC BY-SA 3.0,"

When there are two change points in a piecewise constant hazard model, the density becomes a three-piece exponential-type density. In this situation I can't generate the survival time from the CDF using the probability integral transformation. Can anyone help me generate survival times from this model?
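
+ +

A sketch of one way to do this by inverting the piecewise-linear cumulative hazard (the change points t1 < t2 and hazard rates lam1, lam2, lam3 on the three intervals are placeholders):

+ +

rpwexp2 <- function(n, lam1, lam2, lam3, t1, t2) {
  E  <- rexp(n)                      # E = -log(U) has a unit exponential distribution
  H1 <- lam1 * t1                    # cumulative hazard at t1
  H2 <- H1 + lam2 * (t2 - t1)        # cumulative hazard at t2
  ifelse(E < H1, E / lam1,
  ifelse(E < H2, t1 + (E - H1) / lam2,
                 t2 + (E - H2) / lam3))
}

x <- rpwexp2(1e5, lam1 = 0.2, lam2 = 0.5, lam3 = 0.1, t1 = 1, t2 = 3)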

+",2013-10-21 14:57:14.690 +57767,8629.0,1,,,,Using anomalies to calculate trends of seasonal data,,CC BY-SA 3.0,"

I commonly see people doing trend analysis of (monthly) timeseries data which show a strong inter-annual cycle following this scheme:

+ +
  1. compute climatological means (""mean January"", ""mean February"", ..., ""mean December"")
  2. subtract the climatological means from the actual data, to yield an ""anomaly timeseries""
  3. perform linear regression on this ""anomaly timeseries""
+ +

Climatological in this case means multi-year average of individual months, e.g., an average of the 10 Januaries from 2000 to 2009. As Nick Cox points out in his comment, anomaly just means deviation from a reference level; there is no implication of anything pathological or very unusual.

+ +

user31264's answer makes sense for processes where the seasonal component is truly purely additive. However, in atmospheric science we often have processes where the amplitude of the seasonal variation depends on the base level, i.e., is somewhat multiplicative.

+ +

Even in these scenarios, people often use the approach I outlined above. However, I could nowhere find a rigorous statistical explanation of why this approach is actually valid. Why is linear regression on these anomalies a reasonable substitute for regression on the original timeseries data? Can you give me any justification for why this is reasonable to do? The people I asked mostly say ""everyone's doing it""...
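
+ +

For concreteness, a compact R sketch of steps 1-3 with a made-up monthly series:

+ +

x    <- ts(rnorm(120) + 0.01 * (1:120) + 2 * sin(2 * pi * (1:120) / 12),
           start = c(2000, 1), frequency = 12)
clim <- tapply(x, cycle(x), mean)       # climatological mean for each calendar month
anom <- x - clim[cycle(x)]              # anomaly timeseries
tt   <- as.numeric(time(x))
fit  <- lm(anom ~ tt)                   # linear regression on the anomalies
coef(fit)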

+",2013-10-18 12:07:29.190 +57768,2765.0,2,,52449.0,,,,CC BY-SA 3.0,"

First, the appropriate definition of ""effective sample size"" is IMO linked to a quite specific question. If $X_1, X_2, \ldots$ are identically distributed with mean $\mu$ and variance 1 the empirical mean +$$\hat{\mu} = \frac{1}{n} \sum_{k=1}^n X_k$$ +is an unbiased estimator of $\mu$. But what about its variance? For independent variables the variance is $n^{-1}$. For a weakly stationary time series, the variance of $\hat{\mu}$ is +$$\frac{1}{n^2} \sum_{k, l=1}^n \text{cov}(X_k, X_l) = \frac{1}{n}\left(1 + 2\left(\frac{n-1}{n} \rho_1 + \frac{n-2}{n} \rho_2 + \ldots + \frac{1}{n} \rho_{n-1}\right) \right) \simeq \frac{\tau_a}{n}.$$ +The approximation is valid for large enough $n$. If we define $n_{\text{eff}} = n/\tau_a$, the variance of the empirical mean for a weakly stationary time series is approximately $n_{\text{eff}}^{-1}$, which is the same variance as if we had $n_{\text{eff}}$ independent samples. Thus $n_{\text{eff}} = n/\tau_a$ is an appropriate definition if we ask for the variance of the empirical average. It might be inappropriate for other purposes.

+ +

With a negative correlation between observations it is certainly possible that the variance can become smaller than $n^{-1}$ ($n_{\text{eff}} > n$). This is a well-known variance reduction technique in Monte Carlo integration: if we introduce negative correlation between the variables instead of correlation 0, we can reduce the variance without increasing the sample size.
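
+ +

A rough empirical version of this definition: estimate $\tau_a$ from the sample autocorrelations, truncated at some lag $L$, and divide $n$ by it.

+ +

ess <- function(x, L = 50) {
  n   <- length(x)
  rho <- acf(x, lag.max = L, plot = FALSE)$acf[-1]   # rho_1, ..., rho_L
  tau <- 1 + 2 * sum((1 - seq_len(L) / n) * rho)     # finite-sample version of tau_a
  n / tau
}

x <- arima.sim(model = list(ar = 0.8), n = 1e4)      # positively correlated series
c(n = length(x), n_eff = ess(x))                     # n_eff is much smaller than n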

+",2013-10-18 12:23:37.040 +57769,22507.0,2,,57767.0,,,,CC BY-SA 3.0,"

The underlying model is considered to be $T(t)=f(t)+g(t)+h(t)$, where $T$ is temperature, $t$ is time, $f(t)$ is a function without seasonality, $g(t)$ is a periodic function, and $h(t)$ is non-autocorrelated noise. The underlying assumption is that the movement of the Earth around its orbit, which is periodic and independent of anything else, determines $g(t)$. There is also a small correction due to the increase in the concentration of greenhouse gases (and other things which are not seasonal in nature), which is part of $f(t)$. Note that the concentration of greenhouse gases shows little seasonality (unlike its derivative). In order to estimate $g(t)$, we calculate the mean temperature for each month.

+ +

In order to obtain the trends, the autoregression of $f(t)+h(t)$ is better than the autoregression of the whole $T(t)$, because $g(t)$, being periodic, has its own strong autocorrelation.

+",2013-10-18 12:47:46.180 +57770,22833.0,1,57776.0,,,Clarification of log interpretation,,CC BY-SA 3.0,"

Let us say we have this regression

+ +

$$\ln(y) = a + B_1(age) + B_2\ln(savings) + B_3\ln(income+1)$$

+ +

When carrying out the regression we obtain:

+ +

$$\ln(y) = 0.3445 + 0.5(\text{age}) + 0.4556\ln(\text{savings}) + 0.55566\ln(\text{income}+1)$$

+ +

How would one interpret the coefficients in each case? Of particular concern is the income coefficient: an increase in income of 1% would lead to how much of an increase in $y$?

+ +

This is a hypothetical example to illustrate the problem I have.

+",2013-10-18 13:23:28.197 +57771,21762.0,2,,4705.0,,,,CC BY-SA 3.0,"

Abraham Wald (1902-1950) for introducing the concept of Wald-tests and for his fundamental work on statistical decision theory.

+",2013-10-18 13:48:24.960 +57772,21476.0,1,,,,Distribution of the sum of two independent Beta-Binomial variables,,CC BY-SA 3.0,"

Consider two independent discrete random variables $y_1$ and $y_2$, both following a Beta-Binomial distribution, with different numbers of trials $n_1$ and $n_2$ but the same parameters $a$ and $b$:

+ +

$ p(y_1|n_1,a,b) = {n_1 \choose y_1} \dfrac{B(y_1 + a,n_1 -y_1 +b)}{B(a,b)} $

+ +

$ p(y_2|n_2,a,b) = {n_2 \choose y_2} \dfrac{B(y_2 + a,n_2 -y_2 +b)}{B(a,b)} $

+ +

Consider a discrete variable $Z = y_1 + y_2$. Is $Z$ also distributed as a Beta-Binomial (with parameters $n_1+n_2$, $a'$ and $b'$)?

+ +

I could not prove it in an analytic form so far, but I have been trying it out with some simulations, at least to check whether the assumption is wrong in some cases. Reparametrising $a$ and $b$ as $\mu = \dfrac{a}{a+b}$ and $\rho = \dfrac{1}{a+b+1}$, $y_1$ and $y_2$ have mean $\mu n_1$ and $\mu n_2$ respectively and variance $\mu(1-\mu)n_1(1 + (n_1-1)\rho)$ and $\mu(1-\mu)n_2(1 + (n_2-1)\rho)$ respectively.

+ +

Based on independence, the mean of $Z$ is $\mu(n_1+n_2)$ and the variance of $Z$ is $\mu(1-\mu)(n_1(1 + (n_1-1)\rho) + n_2(1 + (n_2-1)\rho))$. If $Z$ were distributed according to a beta-binomial distribution, then it would have parameters $\mu'$ and $\rho'$, with $\mu'= \mu$ and

+ +

$\rho' = \dfrac{\dfrac{n_1(1 + (n_1-1)\rho) + n_2(1 + (n_2-1)\rho)}{n_1+n_2} - 1}{n_1+n_2-1} = \rho \dfrac{n_1(n_1-1)+n_2(n_2-1)}{(n_1+n_2)(n_1+n_2 -1)} $

+ +

Here is some code to generate $Z$ as a sum of two independent Beta-Binomials (sorry about the code, R is not my main language)

+ +
n1 = 20
+n2 = 50
+mu = .6
+k  = 20
+
+p1  = rbeta(1e6,mu*k,(1-mu)*k)
+y1  = rbinom(1e6,n1,p1)
+
+p2  = rbeta(1e6,mu*k,(1-mu)*k)
+y2  = rbinom(1e6,n2,p2)
+
+z   = y1+y2
+
+rho  = 1/(k+1)
+rho1 = rho*(n1*(n1-1)+n2*(n2-1))/((n1+n2)*(n1+n2-1))
+k1   = 1/ rho1 - 1
+p3  = rbeta(1e6,mu*k1,(1-mu)*k1)
+z1  = rbinom(1e6,n1+n2,p3)
+
+print(c(var(z),var(z1)))
+plot(density(z,width= 3))
+lines(density(z1,width = 3))
+
+ +

I have been trying this code for different values of $n_1$, $n_2$, $\mu$ and $k$, but in all the cases the variances using the sum of two beta-binomials or an appropriately tuned beta-binomial are very similar (the densities look indistinguishable)

+",2013-10-18 13:51:46.780 +57798,22843.0,1,57800.0,,,Why is Pearson's correlation coefficient defined the way it is?,,CC BY-SA 3.0,"

$$ +r = \frac{{\rm Cov}(X,Y)}{ \sigma_{X} \sigma_{Y}} +$$ +I do not understand this equation at all. Where does it come from?

+ +

From my personal understanding, ${\rm Cov}(X,Y)$ comes from the fact that $X$ and $Y$ are dependent random variables, that is, $E[XY]$ is not the same as $E[X]E[Y]$. Is this analogous to saying that $P(A \cap B) = P(A)P(B|A)$ if $A$ and $B$ are not independent? I'm just confused as to why we want the ratio of $E[XY]-E[X]E[Y]$ over the product of the standard deviations of $X$ and $Y$.

+",2013-10-18 20:21:59.413 +57808,22302.0,1,,,,Observational study vs experimental study?,,CC BY-SA 3.0,"

Say that a study conducted included 300 type A blood cancer patients, 300 type B blood cancer patients, 300 type C blood cancer patients and 300 healthy people, all selected randomly from clinics' lists. We are then given the blood pressure of each patient and healthy person to check if there's a difference between the groups.

+ +

Would that be an observational study or experimental study? I'm leaning towards observational because we do not change anything, just ""observe"".

+ +

Also, why are the 300 randomly selected healthy people important for the study?

+",2013-10-18 23:56:00.510 +57773,10060.0,2,,57765.0,,,,CC BY-SA 3.0,"

Yes, kind of...

+ +

Yes, it is ""fine."" See this 10-case example, where case 5 and case 7 both have a missing:

+ +

+ +

Now, look at their correlation outcome, there are only 8 cases participating.

+ +

+ +

The reason is that Pearson's correlation requires the covariance between $x1$ and $x2$ to be calculated. If either one has a missing value, the covariance cannot be computed for that case, and the case is thrown out.

+ +

Now, to further illustrate, let us use select case to filter out the two cases:

+ +

+ +

And rerun the correlation again; the results are identical. This exclusion does not only happen with system-missing values: if you have assigned a user-defined missing value, cases with that user-defined missing value will also be excluded.

+ +

+ +

But wait...!

+ +

I said that it's ""fine"" because it's true that SPSS does screen out incomplete cases for you. But it is in no way solving the missing phenomenon for you. If there is any systematic reason that causes your participants to not answer a certain question, you correlation coefficient can be wrong. However, if you feel that they missed the answer in a random manner, then your correlation shouldn't be heavily affected, though you may lose some sample size and consequently power.

+ +
+ +

Q: But - I ask you - please tell Tania about pairwise and listwise deletion of missings and under what button it is found in SPSS -- ttnphns

+ +

A: Certainly. It would be necessary to illustrate with another example in which we have a new candidate, $x3$:

+ +

+ +

SPSS correlation analysis uses pairwise deletion by default, which means it always maximizes the number of cases in each of the pairwise comparisons. We have learned from above that the correlation between $x1$ and $x2$ has a sample size of 8 pairs. What about $x1$ and $x3$?

+ +

+ +

It turns out to be 9, because at most there are 9 complete pairs of data. Now, this can get inconvenient if you'd like to screen out the whole case and prevent it from being analyzed. In that case, you'll use listwise deletion.

+ +

To call the option up, in the Correlation menu, press Options and then check Exclude cases listwise, then press Continue and OK to submit the test again:

+ +

+ +

Now let's run the correlation matrix again: you'll notice that all sample sizes are unified to 8; only cases that provide data for all three variables are retained. Visit this IBM FAQ if you'd like to learn more about the two types of deletion.

+ +

+",2013-10-18 13:57:49.450 +57774,10772.0,2,,4705.0,,,,CC BY-SA 3.0,"

John Kingman for Coalescent theory and his work on completely random measures

+",2013-10-18 14:11:35.507 +57775,306.0,2,,57770.0,,,,CC BY-SA 3.0,"

Assuming everything else stays constant,

+ +

$$\frac{\Delta y}{y} = B_3 \cdot \frac{\Delta\,\text{income}}{1 + \text{income}}$$

+ +

The LHS is the relative (percentage) change in $y$. Put the values into the RHS: for example, if income is 100, a 1% increase means the change in income is 1, which gives $B_3 \cdot (1/101) \approx 0.0055$ straightaway, i.e. about a 0.55% increase in $y$.

+",2013-10-18 14:36:10.050 +57776,21762.0,2,,57770.0,,,,CC BY-SA 3.0,"

If income is typically much larger than 1, you could ignore the $+1$ for interpretation and use the usual statement for linear log-log models: ""A 1% increase in income is associated with a $100\% \cdot (1.01^{0.55566}-1)=0.5544\%$ increase in the geometric mean of $y$."" Or, a bit less precise but easier to understand: ""A 1% increase in income is associated with about a 0.56% increase in the typical value of $y$.""

+ +

Edit:

+ +
  • If you do not want to ignore the $+1$ for interpretation, just say ""A $1\%$ increase in $1 + \text{income}$ ...""
  • If you prefer to describe the effect on the arithmetic mean of $y$ instead of its geometric mean, try a (Gamma-)GLM with log link.
+",2013-10-18 14:37:15.650 +57777,17740.0,2,,57680.0,,,,CC BY-SA 3.0,"

Usually, the decision is whether to use linear or an RBF (aka Gaussian) kernel. There are two main factors to consider:

+ +
    +
  1. Solving the optimisation problem for a linear kernel is much faster, see e.g. LIBLINEAR.
  2. +
  3. Typically, the best possible predictive performance is better for a nonlinear kernel (or at least as good as the linear one).
  4. +
+ +

It's been shown that the linear kernel is a degenerate version of RBF, hence the linear kernel is never more accurate than a properly tuned RBF kernel. Quoting the abstract from the paper I linked:

+ +
+

The analysis also indicates that if complete model selection using the Gaussian kernel has been conducted, there is no need to consider linear SVM.

+
+ +

A basic rule of thumb is briefly covered in NTU's practical guide to support vector classification (Appendix C).

+ +
+

If the number of features is large, one may not need to map data to a higher dimensional space. That is, the nonlinear mapping does not improve the performance. + Using the linear kernel is good enough, and one only searches for the parameter C.

+
+ +

Your conclusion is more or less right but you have the argument backwards. In practice, the linear kernel tends to perform very well when the number of features is large (e.g. there is no need to map to an even higher dimensional feature space). A typical example of this is document classification, with thousands of dimensions in input space.

+ +

In those cases, nonlinear kernels are not necessarily significantly more accurate than the linear one. This basically means nonlinear kernels lose their appeal: they require way more resources to train with little to no gain in predictive performance, so why bother.

+ +

TL;DR

+ +

Always try linear first since it is way faster to train (AND test). If the accuracy suffices, pat yourself on the back for a job well done and move on to the next problem. If not, try a nonlinear kernel.
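
+ +

A minimal illustration of that workflow with the e1071 interface (any SVM library works similarly; the data and parameter values are placeholders, and for a real comparison you would cross-validate rather than look at in-sample accuracy):

+ +

library(e1071)
d <- droplevels(iris[iris$Species != 'setosa', ])       # a simple two-class problem
m_lin <- svm(Species ~ ., data = d, kernel = 'linear', cost = 1)
m_rbf <- svm(Species ~ ., data = d, kernel = 'radial', cost = 1, gamma = 0.25)
mean(predict(m_lin, d) == d$Species)                    # accuracy, linear kernel
mean(predict(m_rbf, d) == d$Species)                    # accuracy, RBF kernel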

+",2013-10-18 14:45:39.377 +57778,10547.0,1,57901.0,,,"Calculation of an ""unconstrained"" normal distribution (starting from a censored one)",,CC BY-SA 3.0,"

Assume that two r.v. $W$ and $Y|W=w$ with

+ +

(1) $W \sim \text{N}(\mu_w,\sigma_w^2)$ (iid)

+ +

(2) $Y|W=w \sim \text{N}(w,\sigma_y^2)$ (iid)

+ +

Further, we only observe $Y$ if $Y$ is less than $W$, i.e.,

+ +

(3) $Y|Y\le W$

+ +

Goal: Find the pdf of the censored observations, i.e., of $Y|Y\le W$, and from that deduce the uncensored pdf and its first two moments (so i.m.h.o. we have to find $f_Y(y)$). The first two moments of this uncensored pdf are supposed to depend upon $E(Y|Y\le W)$ and $Var(Y|Y\le W)$.

+ +
+ +

By definition of conditional pdf we have that:

+ +

(4) $f_{Y|W}(y|W = w)= \frac{f_{Y,W}(y,w)}{f_W(w)}$

+ +

Next, the definition of a truncated density gives, for an arbitrary value of $W$:

+ +

(5) $ f_{Y|Y\le W}(y|y\le w) = \frac{f_Y(y)}{P(Y\le W)}$

+ +
+ +

I would simply rewrite (4) to

+ +

$f_{Y|W}(y|W = w)f_W(w) = f_{Y,W}(y,w)$

+ +

then integration over $f_{Y,W}(y,w)$ w.r.t $w$ should yield $f_Y(y)$, i.e.,

+ +

(a) $\int_{-\infty}^{\infty} f_{Y,W}(y,w) dw = \int_{-\infty}^{\infty} f_Y(y|W = w)f_W(w) dw = f_Y(y)$

+ +

Plugging $f_Y(y)$ into (5) (where $P(Y\le W)$ will also be given by $f_Y(y)$), I will see how the moments of $f_{Y|Y\le W}(y|y\le w)$ look and how the moments of $f_Y(y)$ depend upon them.

+ +

So (a) will look like

+ +

$f_Y(y) = \int_{-\infty}^{\infty}\frac{1}{\sqrt{2\pi\sigma^2_y}}\text{exp}\big(-\frac{(y-w)^2}{2\sigma_y^2}\big)\frac{1}{\sqrt{2\pi\sigma^2_w}}\text{exp}\big(-\frac{(w-\mu_w)^2}{2\sigma_w^2}\big)dw$

+ +

Except for the $w$ in the first $\text{exp}$, this looks very easy, but because of that $w$ I'm a little bit stuck on how to solve this...
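
+ +

As a quick sanity check of that integral: since $Y$ is just $W$ plus independent normal noise, its marginal should be normal with mean $\mu_w$ and variance $\sigma_w^2+\sigma_y^2$; a short simulation sketch is consistent with this:

+ +

mu_w <- 1; sigma_w <- 2; sigma_y <- 0.5
w <- rnorm(1e6, mu_w, sigma_w)
y <- rnorm(1e6, mean = w, sd = sigma_y)
c(mean(y), var(y), sigma_w^2 + sigma_y^2)   # var(y) is close to sigma_w^2 + sigma_y^2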

+",2013-10-18 15:04:04.967 +57779,,1,85831.0,,user30602,Verification of poisson approximation to hypergeometric distribution,,CC BY-SA 3.0,"

How can I verify that

+ +

$\lim_{N,M,K \to \infty, \frac{M}{N} \to 0, \frac{KM}{N} \to \lambda} \frac{\binom{M}{x}\binom{N-M}{K-x}}{\binom{N}{K}} = \frac{\lambda^x}{x!}e^{-\lambda}$,

+ +

without using Stirling's formula or the Poisson approximation to the Binomial?

+ +

I have been stuck on this problem for a while, because I don't know how to divide up the terms and factorials without using the help of prior results!

+ +

Any help would be appreciated. Thanks in advance.

+",2013-10-18 15:15:30.430 +57782,17670.0,1,57964.0,,,Logistic regression: maximizing true positives - false positives,,CC BY-SA 3.0,"

I have a logistic regression model (fit via glmnet in R with elastic net regularization), and I would like to maximize the difference between true positives and false positives. In order to do this, the following procedure came to mind:

+ +
  1. Fit standard logistic regression model
  2. Using prediction threshold as 0.5, identify all positive predictions
  3. Assign weight 1 for positively predicted observations, 0 for all others
  4. Fit weighted logistic regression model
+ +

What would be the flaws with this approach? What would be the correct way to proceed with this problem?

+ +

The reason for wanting to maximize the difference between the number of true positives and false positives is the design of my application. As part of a class project, I am building an autonomous participant in an online marketplace - if my model predicts it can buy something and sell it later at a higher price, it places a bid. I would like to stick to logistic regression and output binary outcomes (win, lose) based on fixed costs and unit price increments (I gain or lose the same amount on every transaction). A false positive hurts me because it means that I buy something and am unable to sell it for a higher price. However, a false negative doesn't hurt me (only in terms of opportunity cost) because it just means I didn't buy, but if I had, I would have made money. Similarly, a true positive benefits me because I buy and then sell for a higher price, but a true negative doesn't benefit me because I didn't take any action.

+ +

I agree that the 0.5 cut-off is completely arbitrary, and when I optimized the model from step 1 on the prediction threshold which yields the highest difference between true/false positives, it turns out to be closer to 0.4. I think this is due to the skewed nature of my data - the ratio between negatives and positives is about 1:3.

+ +

Right now, I am following the following steps:

+ +
  1. Split data into training/test
  2. Fit the model on the training set, make predictions on the test set and compute the difference between true/false positives
  3. Fit the model on the full data, make predictions on the test set and compute the difference between true/false positives
+ +

The difference between true/false positives is smaller in step #3 than in step #2, despite the training set being a subset of the full set. Since I don't care whether the model in #3 has more true negatives and fewer false negatives, is there anything I can do without altering the likelihood function itself?
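
+ +

A sketch of the threshold sweep described above (x_train/y_train and x_valid/y_valid are hypothetical objects; the gain of +1 per true positive and -1 per false positive reflects the fixed-gain/fixed-loss setup):

+ +

library(glmnet)
fit <- cv.glmnet(x_train, y_train, family = 'binomial', alpha = 0.5)
p   <- as.vector(predict(fit, newx = x_valid, type = 'response', s = 'lambda.min'))

thresholds <- seq(0.05, 0.95, by = 0.01)
gain <- sapply(thresholds, function(t) {
  pred <- p > t
  sum(pred & y_valid == 1) - sum(pred & y_valid == 0)   # TP - FP
})
thresholds[which.max(gain)]   # threshold that maximises TP - FP on the validation set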

+",2013-10-18 17:00:57.543 +57783,2085.0,2,,57721.0,,,,CC BY-SA 3.0,"

If I understand, you are looking for k-mers which are patterns of size k found in sequences.

+ +

There is an R package for analyzing sequence data called TraMineR which includes functions for plotting the sequences, finding the variance of state durations, computing within-sequence entropy, extracting frequent event subsequences, etc.

+ +

You could also compare two sequences to see how they align in time by using Dynamic Time Warping

+",2013-10-18 17:29:17.817 +57784,750.0,2,,57721.0,,,,CC BY-SA 3.0,"

A runs test seems appropriate, and the cited literature at the end develops the test statistic for multiple categories. Unfortunately the paper is paywalled but here is a quick run-down of the test statistic (screen shot of relevant page here).

+ +

For each individual group, we can count;

+ +
  • $n_s = \text{Number of successes}$
  • $r_s = \text{Number of success runs}$
  • $s_{s}^{2} = \text{Sample variance of success run lengths}$
  • $c_s = (r^2-1)(r+2)(r+3)/[2r(n-r-1)(n+1)]$
  • $v_s = cn(n - r)/[r(r + 1)]$
+ +

Then you calculate this for each separate group, and the test statistic is the sum of the $c_s \cdot s_{s}^{2}$ over groups; it is distributed as $\chi^{2}$ with $\sum{v_s}$ degrees of freedom.

+ +

So, let's say we have a table of run lengths for three different groups as follows:

+ +
Data: 221331333121112112212112122
+
+Length Group1  Group2  Group3
+-----------------------------
+     1   5       4       0 
+     2   2       3       1
+     3   1       0       1
+-----------------------------
+    n_s 12      10       5
+    r_s  8       7       2
+    s_s  0.6     0.3     0.5
+    c_s 11.1    14.0     1.3
+    v_s  7.4     7.5     3.1 
+-----------------------------
+x^2 = (0.6*11.1) + (0.3*14) + (0.5*1.3) = 11
+DF  = 7.4 + 7.5 + 3.1 = 18
+
+ +
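
The per-group quantities in the table can be reproduced with a short R sketch (the run-length vectors are transcribed from the table above):

+ +

group_stats <- function(runs) {            # runs: vector of run lengths for one category
  n   <- sum(runs)
  r   <- length(runs)
  s2  <- var(runs)                          # sample variance of run lengths
  c_s <- (r^2 - 1) * (r + 2) * (r + 3) / (2 * r * (n - r - 1) * (n + 1))
  v_s <- c_s * n * (n - r) / (r * (r + 1))
  c(chi_part = c_s * s2, df_part = v_s)
}
g1 <- c(1, 1, 1, 1, 1, 2, 2, 3)             # group 1 run lengths
g2 <- c(1, 1, 1, 1, 2, 2, 2)                # group 2
g3 <- c(2, 3)                               # group 3
colSums(rbind(group_stats(g1), group_stats(g2), group_stats(g3)))   # statistic ~ 11, df ~ 18

+ +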

The area to the right of the test statistic is .9, so in this circumstance we would fail to reject the null hypothesis that the runs are randomly distributed. It is fairly close to the other tail, though, so there is borderline evidence that the data are more dispersed than you would expect by chance (this is one of those circumstances where it makes sense to evaluate the left tail of the chi-square distribution).

+ +
+ +

O'Brien, Peter C. & Peter J. Dyck. 1985. A runs test based on run lengths. Biometrics 41(1):237-244.

+ +

I've posted a code snippet on estimating this in SPSS at this dropbox link. It includes the made up example here, as well as a code example replicating the tables and statistics in the O'Brien & Dyck paper (on a made up set of data that looks like theirs).

+",2013-10-18 18:23:56.543 +57785,22703.0,1,,,,Order Statistics,,CC BY-SA 3.0,"

What is the motivation behind the use of order statistics in parameter estimation? In a very general sense, the first order statistic is considered to be an initial estimate of the location parameter. I wonder how this can be. A layman-friendly explanation would help.

+",2013-10-18 18:36:56.647 +57786,22703.0,2,,57053.0,,,,CC BY-SA 3.0,"

I guess the frequency table summarizing the number of ones/zeros would be a good summary by itself. The mean seems to be the only reasonably interpretable statistic here.

+",2013-10-18 18:40:50.570 +57787,22703.0,1,,,,Three Parameter Gamma Distribution,,CC BY-SA 3.0,"

What is the motivation for the three parameter gamma distribution and the resulting structure of its density?

+ +

What is the meaning of the location, scale and shape parameters here?

+",2013-10-18 18:54:21.117 +57788,22703.0,2,,56859.0,,,,CC BY-SA 3.0,"

In one line: given the data, descriptive statistics try to summarize the content of your data with minimum loss of information (depending on what measure you use). You get to see the geography of the data. (Something like looking at the performance graph of a class and saying who is at the top, the bottom, and so on.)

+ +

In one line: given the data, inferential statistics try to estimate and infer the properties of the hypothetical population from which the data come. (Something like understanding 7th-grade students through a good sample from the class, assuming that the underlying population is large enough that you cannot take it into account in its totality.)

+",2013-10-18 18:59:16.890 +57789,22703.0,2,,56273.0,,,,CC BY-SA 3.0,"

Sturges' formula does not come from a clear-cut theorem to be proved. It is more like a rule of thumb for fixing the number of classes when you are unable to deduce it from the data/study.

+",2013-10-18 19:02:50.657 +57807,22792.0,1,57814.0,,,Why do Bayesian Networks use acyclicity assumption?,,CC BY-SA 4.0,"

Actually, this question is more or less a duplicate of one I asked on math.stackexchange two days ago.

+ +

I did not get any answer there, but I think this is a better place to ask the question, since it is more about ""the philosophy"" than about the calculations involved in the concept, which is what the ""math"" board prefers.

+ +

I am trying to gain a good understanding of Bayesian networks, and the first thing I want to understand exactly is how they are built. I see that these networks are built on conditional independence assumptions in order to simplify joint distributions, and they are commonly built using causal relationships, since these imply conditional independence given the direct causes. What I still don't understand is why these networks assume a directed acyclic graph (DAG) structure.

+ +

There can be systems which contain circular causal relationships. For example, let's think of a hypothetical machine consisting of two parallel plates which rub together. We think of three variables: ""Heat"", ""Plate Area Expansion"" and ""Friction"". Plate Area Expansion is the effect of the cause ""Heat"", and ""Friction"" is in turn just the effect of the plate area expansion, since a larger area means a larger amount of friction. But if we think about it, ""Friction"" also causes an increase in the heat level, so ""Friction"" is a direct cause of ""Heat"" as well. This circular causality ends up with the following diagram:

+ +

+

+ +

This is a directed cyclic graph and violates the acyclicity (DAG) assumption on which the whole Bayesian network idea is founded. So how can such systems with circular causes, feedback and loops be represented with Bayesian networks? It is surely not possible that the DAG theory does not support such systems, because this ""feedback"" mechanism is quite a common and basic thing. But I just cannot see how those kinds of systems are represented with acyclic graphs, and I am sure that I am missing something here. I want to know what I am missing.

+",2013-10-18 23:52:59.917 +57790,22841.0,1,,,,"Logistic regression, SPSS ignores my reference category and assumes another one",,CC BY-SA 3.0,"

I am modelling logistic regressions in SPSS, the same model for different countries (well, with slight differences in the set of independent variables due to collinearity diagnostics and stepwise results). The model seems to work fine for most countries. In two countries, I am having some issues with the same variable. The variable has three categories (hierarchical). In the output for those two beautiful countries, it seems that SPSS takes the reference category out, assumes another one as the reference category, and gives exp(B) only for the last category.

+ +

Is there something wrong going on? What should / can I do about it?

+ +
+ +

Thanks a lot for your answers! The problem cannot be in the syntax, since I am using the same syntax for all countries and it works perfectly.

+ +

I am using ENTER. STEPWISE was used as an exploratory method to identify the strongest predictors. According to the -2LL values, my variable is not a 'strong' one for these two countries (0.12%). It's not the first time I have read that STEPWISE is a bad choice... I will take that into account!

+ +

Nevertheless, why does SPSS do that? Is this a real problem, meaning something that I must solve? Or can I use/report the odds for those two countries without that category, adding a footnote? Should I run the model without that variable (theoretically the variable is not that important)?

+ +

Thanks ;)

+ +
+ +

Hi, all my variables are recoded so that the reference category is 1, and most of my variables have three categories. I am using Enter with the following syntax: LOGISTIC REGRESSION VAR= ""DV"" /METHOD=ENTER ""IV's"" /CONTRAST (""IV"")=Indicator (1) [...] /PRINT=GOODFIT CI(95) /CRITERIA PIN(.05) POUT(.10) ITERATE(20) CUT(.5). Do you have any idea what is going wrong?

+",2013-10-18 19:12:28.563 +57791,5661.0,2,,28.0,,,,CC BY-SA 4.0,"

Bayesian: "Hello, Machine Learner!"

+

Frequentist: "Hello, Machine Learner!"

+

Machine Learning: "I hear you guys are good at stuff. Here's some data."

+

F: "Yes, let's write down a model and then calculate the MLE."

+

B: "Hey, F, that's not what you told me yesterday! I had some univariate data and I wanted to estimate the variance, and I calculated the MLE. Then you pounced on me and told me to divide by $n-1$ instead of by $n$."

+

F: "Ah yes, thanks for reminding me. I often think that I'm supposed to use the MLE for everything, but I'm interested in unbiased estimators and so on."

+

ML: "Eh, what's this philosophizing about? Will it help me?"

+

F: " OK, an estimator is a black box, you put data in and it gives you some numbers out. We frequentists don't care about how the box was constructed, about what principles were used to design it. For example, I don't know how to derive the $\div(n-1)$ rule."

+

ML: " So, what do you care about?"

+

F: "Evaluation."

+

ML: "I like the sound of that."

+

F: "A black box is a black box. If somebody claims a particular estimator is an unbiased estimator for $\theta$, then we try many values of $\theta$ in turn, generate many samples from each based on some assumed model, push them through the estimator, and find the average estimated $\theta$. If we can prove that the expected estimate equals the true value, for all values, then we say it's unbiased."

+

ML: "Sounds great! It sounds like frequentists are pragmatic people. You judge each black box by its results. Evaluation is key."

+

F: "Indeed! I understand you guys take a similar approach. Cross-validation, or something? But that sounds messy to me."

+

ML: "Messy?"

+

F: "The idea of testing your estimator on real data seems dangerous to me. The empirical data you use might have all sorts of problems with it, and might not behave according the model we agreed upon for evaluation."

+

ML: "What? I thought you said you'd proved some results? That your estimator would always be unbiased, for all $\theta$."

+

F: "Yes. While your method might have worked on one dataset (the dataset with train and test data) that you used in your evaluation, I can prove that mine will always work."

+

ML: "For all datasets?"

+

F: "No."

+

ML: "So my method has been cross-validated on one dataset. You haven't test yours on any real dataset?"

+

F: "That's right."

+

ML: "That puts me in the lead then! My method is better than yours. It predicts cancer 90% of the time. Your 'proof' is only valid if the entire dataset behaves according to the model you assumed."

+

F: "Emm, yeah, I suppose."

+

ML: "And that interval has 95% coverage. But I shouldn't be surprised if it only contains the correct value of $\theta$ 20% of the time?"

+

F: "That's right. Unless the data is truly i.i.d Normal (or whatever), my proof is useless."

+

ML: "So my evaluation is more trustworthy and comprehensive? It only works on the datasets I've tried so far, but at least they're real datasets, warts and all. There you were, trying to claim you were more 'conservative' and 'thorough' and that you were interested in model-checking and stuff."

+

B: (interjects) "Hey guys, Sorry to interrupt. I'd love to step in and balance things up, perhaps demonstrating some other issues, but I really love watching my frequentist colleague squirm."

+

F: "Woah!"

+

ML: "OK, children. It was all about evaluation. An estimator is a black box. Data goes in, data comes out. We approve, or disapprove, of an estimator based on how it performs under evaluation. We don't care about the 'recipe' or 'design principles' that are used."

+

F: "Yes. But we have very different ideas about which evaluations are important. ML will do train-and-test on real data. Whereas I will do an evaluation that is more general (because it involves a broadly-applicable proof) and also more limited (because I don't know if your dataset is actually drawn from the modelling assumptions I use while designing my evaluation.)"

+

ML: "What evaluation do you use, B?"

+

F: (interjects) "Hey. Don't make me laugh. He doesn't evaluate anything. He just uses his subjective beliefs and runs with it. Or something."

+

B: "That's the common interpretation. But it's also possible to define Bayesianism by the evaluations preferred. Then we can use the idea that none of us care what's in the black box, we care only about different ways to evaluate."

+

B continues: "Classic example: Medical test. The result of the blood test is either Positive or Negative. A frequentist will be interested in, of the Healthy people, what proportion get a Negative result. And similarly, what proportion of Sick people will get a Positive. The frequentist will calculate these for each blood testing method that's under consideration and then recommend that we use the test that got the best pair of scores."

+

F: "Exactly. What more could you want?"

+

B: "What about those individuals that got a Positive test result? They will want to know 'of those that get a Positive result, how many will get Sick?' and 'of those that get a Negative result, how many are Healthy?' "

+

ML: "Ah yes, that seems like a better pair of questions to ask."

+

F: "HERESY!"

+

B: "Here we go again. He doesn't like where this is going."

+

ML: "This is about 'priors', isn't it?"

+

F: "EVIL".

+

B: "Anyway, yes, you're right ML. In order to calculate the proportion of Positive-result people that are Sick you must do one of two things. One option is to run the tests on lots of people and just observe the relevant proportions. How many of those people go on to die of the disease, for example."

+

ML: "That sounds like what I do. Use train-and-test."

+

B: "But you can calculate these numbers in advance, if you are willing to make an assumption about the rate of Sickness in the population. The frequentist also makes his calcuations in advance, but without using this population-level Sickness rate."

+

F: "MORE UNFOUNDED ASSUMPTIONS."

+

B: "Oh shut up. Earlier, you were found out. ML discovered that you are just as fond of unfounded assumptions as anyone. Your 'proven' coverage probabilities won't stack up in the real world unless all your assumptions stand up. Why is my prior assumption so diffent? You call me crazy, yet you pretend your assumptions are the work of a conservative, solid, assumption-free analysis."

+

B (continues): "Anyway, ML, as I was saying. Bayesians like a different kind of evaluation. We are more interested in conditioning on the observed data, and calculating the accuracy of our estimator accordingly. We cannot perform this evaluation without using a prior. But the interesting thing is that, once we decide on this form of evaluation, and once we choose our prior, we have an automatic 'recipe' to create an appropriate estimator. The frequentist has no such recipe. If he wants an unbiased estimator for a complex model, he doesn't have any automated way to build a suitable estimator."

+

ML: "And you do? You can automatically build an estimator?"

+

B: "Yes. I don't have an automatic way to create an unbiased estimator, because I think bias is a bad way to evaluate an estimator. But given the conditional-on-data estimation that I like, and the prior, I can connect the prior and the likelihood to give me the estimator."

+

ML: "So anyway, let's recap. We all have different ways to evaluate our methods, and we'll probably never agree on which methods are best."

+

B: "Well, that's not fair. We could mix and match them. If any of us have good labelled training data, we should probably test against it. And generally we all should test as many assumptions as we can. And some 'frequentist' proofs might be fun too, predicting the performance under some presumed model of data generation."

+

F: "Yeah guys. Let's be pragmatic about evaluation. And actually, I'll stop obsessing over infinite-sample properties. I've been asking the scientists to give me an infinite sample, but they still haven't done so. It's time for me to focus again on finite samples."

+

ML: "So, we just have one last question. We've argued a lot about how to evaluate our methods, but how do we create our methods."

+

B: "Ah. As I was getting at earlier, we Bayesians have the more powerful general method. It might be complicated, but we can always write some sort of algorithm (maybe a naive form of MCMC) that will sample from our posterior."

+

F(interjects): "But it might have bias."

+

B: "So might your methods. Need I remind you that the MLE is often biased? Sometimes, you have great difficulty finding unbiased estimators, and even when you do you have a stupid estimator (for some really complex model) that will say the variance is negative. And you call that unbiased. Unbiased, yes. But useful, no!"

+

ML: "OK guys. You're ranting again. Let me ask you a question, F. Have you ever compared the bias of your method with the bias of B's method, when you've both worked on the same problem?"

+

F: "Yes. In fact, I hate to admit it, but B's approach sometimes has lower bias and MSE than my estimator!"

+

ML: "The lesson here is that, while we disagree a little on evaluation, none of us has a monopoly on how to create estimator that have properties we want."

+

B: "Yes, we should read each other's work a bit more. We can give each other inspiration for estimators. We might find that other's estimators work great, out-of-the-box, on our own problems."

+

F: "And I should stop obsessing about bias. An unbiased estimator might have ridiculous variance. I suppose all of us have to 'take responsibility' for the choices we make in how we evaluate and the properties we wish to see in our estimators. We can't hind behind a philosophy. Try all the evaluations you can. And I will keep sneaking a look at the Bayesian literature to get new ideas for estimators!"

+

B:"In fact, a lot of people don't really know what their own philosophy is. I'm not even sure myself. If I use a Bayesian recipe, and then proof some nice theoretical result, doesn't that mean I'm a frequentist? A frequentist cares about above proofs about performance, he doesn't care about recipes. And if I do some train-and-test instead (or as well), does that mean I'm a machine-learner?"

+

ML: "It seems we're all pretty similar then."

+",2013-10-18 19:17:41.230 +57792,21884.0,1,,,,MADE and MSE pros and cons,,CC BY-SA 3.0,"

When assessing the performance of an estimator, in which scenarios should one prefer the use of the Mean Absolute Deviation Error (MADE) over the Mean Squared Error (MSE) and vice versa?

+ +

Edit / Clarification:

+ +

Assume that we have i.i.d. data $(X_{1},Y_{1}),\cdots,(X_{n},Y_{n})$. Let $(X,Y)$ denote a generic member of the sample whose conditional mean is denoted by $m(x)=E(Y|X=x)$.

+ +

The performance of an estimator $\hat{m}(x)$ of $m(x)$ is often assessed either by:

+ +

$$MSE(x)=E\left[\left\{ \hat{m}(x)-m(x)\right\} ^{2}|\boldsymbol{X}\right]$$

+ +

or by

+ +

$$MADE(x)=E\left[\left|\hat{m}(x)-m(x)\right| \mid \boldsymbol{X}\right]$$

where $\boldsymbol{X}=(X_{1},\cdots,X_{n}).$

+",2013-10-18 19:19:20.950 +57793,1985.0,1,,,,An equality for expectation of the non-negative random variable,,CC BY-SA 4.0,"

I once read the following inequality

+

+

Is there any specific name for this inequality? And, how to prove it?

+",2013-10-18 19:34:57.900 +57794,22555.0,2,,37981.0,,,,CC BY-SA 3.0,"

With the variance being the second central moment $\mu_{2}$, and the skewness and kurtosis being related to the third and fourth central moments $\mu_{3}$ and $\mu_{4}$, it is possible to describe the properties of a wide range of symmetric and non-symmetric distributions from the data.

+ +

This technique was originally described by Karl Pearson in 1895 for the so-called Pearson Distributions I to VII. This has been extended by Egon S Pearson (date uncertain) as published in Hahn and Shapiro in 1966 to a wide range of symmetric, asymmetric and heavy tailed distributions that include Uniform, Normal, Students-t, Lognormal, Exponential, Gamma, Beta, Beta J and Beta U. From the chart of p. 197 of Hahn and Shapiro, $B_{1}$ and $B_{2}$ can be used to establish descriptors for skewness and kurtosis as:

+ +

$\mu_{3} = \sqrt {B_{1}\ \mu_{2}^{3}}$
+$\mu_{4} = B_{2}\ \mu_{2}^{2}$

+ +

If you just wanted simple relative descriptors then by applying a constant $\mu_{2} = 1$ the skewness is $\sqrt {B_{1}}$ and the kurtosis is $B_{2}$.

+ +

We have attempted to summarize this chart here so that it could be programmed, but it is better to review it in Hahn and Shapiro (pp 42-49,122-132,197). In a sense we are suggesting a little bit of reverse engineering of the Pearson chart, but this could be a way to quantify what you are seeking.
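
As an illustration of the moment relationships above, here is a minimal R sketch (my own, not from Hahn and Shapiro) computing the sample central moments and the corresponding descriptors $\sqrt{B_1}$ and $B_2$:

    x  <- rnorm(500)             # hypothetical data vector
    m2 <- mean((x - mean(x))^2)  # variance, mu_2
    m3 <- mean((x - mean(x))^3)  # third central moment, mu_3
    m4 <- mean((x - mean(x))^4)  # fourth central moment, mu_4

    B1 <- m3^2 / m2^3            # so that mu_3 = sqrt(B1 * mu_2^3)
    B2 <- m4 / m2^2              # so that mu_4 = B2 * mu_2^2

    c(skewness = sign(m3) * sqrt(B1), kurtosis = B2)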

+",2013-10-18 19:36:15.563 +57795,20991.0,1,57796.0,,,Generating random variable from density function,,CC BY-SA 3.0,"

How can I generate a random sample of size n = 2914 if I have the density function?

+ +

So the problem is that I have the density f(x) (a well-defined function):

+ +
#### Parameters: n, a, e, mu, sigma
n     <- 2914
mu    <- -0.42155226
sigma <- 0.60665552
a     <- 0.43218138
e     <- 0.02149706

#### Building blocks of the density
P <- function(a, e) ((1/6)*(1^3)) - ((a/2)*(1^2)) + (((a^2)/2 + e)*1)   # normalising constant
D <- function(u, mu, sigma) dlogis(u, mu, sigma)                        # logistic density
K <- function(u, a, e) (1/2)*u^2 - a*u + (a^2)/2 + e
H <- function(u, mu, sigma) plogis(u, mu, sigma, lower.tail = TRUE)     # logistic CDF

#### The density f(x)
Fprim <- function(u, a, e, mu, sigma) (1/P(a, e)) * D(u, mu, sigma) * K(H(u, mu, sigma), a, e)
df    <- function(u) Fprim(u, a, e, mu, sigma)

Fprim(1, a, e, mu, sigma)   # example evaluation at u = 1
+ +

I think I need to invert the CDF and use Monte Carlo simulation, but I don't know how to do it.

+",2013-10-18 19:59:56.600 +57796,5875.0,2,,57795.0,,,,CC BY-SA 3.0,"

I suppose you mean

+ +
df <- function(u)  Fprim(u,a = 0.43218138, e = 0.02149706, mu = -0.42155226, sigma = 0.60665552)
+
+ +

I propose

+ +
x  <- seq(-20, 20, length = 10001)   # fine grid covering the support
y  <- df(x)                          # density evaluated on the grid
y1 <- cumsum(y) * diff(x)[1]         # numerical integration: approximate CDF

pf <- approxfun(x, y1)               # approximate CDF
qf <- approxfun(y1, x)               # approximate quantile function (inverse CDF)
rf <- function(n) qf(runif(n))       # inverse-transform sampler
+
+ +

The functions qf, pf and rf are the quantile, cdf, and random generator for the density df. So you just end by

+ +
rf(2914)
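
One small caveat (my addition, not part of the original answer): the numerically integrated CDF y1 may stop slightly short of 1, in which case qf() returns NA for uniform draws beyond its range. Renormalising avoids this:

    y1 <- y1 / max(y1)      # force the approximate CDF to end exactly at 1
    qf <- approxfun(y1, x)  # rebuild the quantile function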
+
+",2013-10-18 20:19:48.297 +57797,22555.0,2,,4705.0,,,,CC BY-SA 3.0,"

It's very difficult to add to the constellation of stars that are already listed, but for interest purposes I will throw in the improbable polymath John Maynard Keynes who many would not realize published A Treatise on Probability (1921) that can be downloaded here; and whose work was quoted frequently by Harold Jeffreys (1939).

+ +

Keynes by all accounts helped to bring forward Bayesian statistics and in his treatise considered the most important principle to be the Principle of Indifference.

+ +

According to Wikipedia, The ""Principle of insufficient reason"" was renamed the ""Principle of Indifference"" by the economist John Maynard Keynes (1921), who was careful to note that it applies only when there is no knowledge indicating unequal probabilities.

+",2013-10-18 20:20:29.640 +57799,5237.0,2,,57790.0,,,,CC BY-SA 3.0,"

I suspect your problem is you are using stepwise model selection. Automatic model selection algorithms, such as stepwise selection, don't have any 'knowledge' about what the variables mean or how they ought to relate to each other. For example, it is standard advice that if you include a squared term (e.g., $X_j^2$) in a model, you should include the lower level term ($X_j$) as well, even if it's not 'significant'. However, the selection algorithm doesn't know there is any relationship between the two variables, and so you commonly have situations where it will throw out the lower level term but retain the squared term. Likewise, all levels of your categorical variable should be retained or excluded together, but the stepwise algorithm doesn't necessarily 'know' that. I believe this is your problem.

+ +

Of course, even if you were to address this issue, stepwise selection algorithms are a truly horrible modeling strategy, and still should not be used. For more on that topic, it may help you to read my answer here: algorithms-for-automatic-model-selection.

+ +

As the above suggests, the way to rectify this problem is to stop using stepwise selection. Aside from the problem described in the post, it does not do what people believe it does (i.e., tell them which variables are the 'real' or 'important' ones).

+ +
+ +

Based on your edit, I'm guessing there may be a different issue. It is possible that you don't have any observations (cases) with those levels of the categorical variable for those countries. You should check to see if that is the case. If so, see if one of the other levels of the categorical variable has instances for every country, and use that one as your reference level instead.
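
One quick way to run that check in R (a sketch; mydata, country and category are hypothetical names for your data frame and variables):

    with(mydata, table(country, category))   # zero cells flag country/level combinations with no observations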

+",2013-10-18 20:33:00.640 +57800,19752.0,2,,57798.0,,,,CC BY-SA 3.0,"

One nice thing you get from dividing by the product of standard deviations is that it guarantees that the correlation coefficient will be between -1 and +1.
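
A one-line justification (my addition, not part of the original answer) is the Cauchy-Schwarz inequality:

$$\lvert\operatorname{cov}(X,Y)\rvert = \lvert E[(X-\mu_X)(Y-\mu_Y)]\rvert \le \sqrt{E[(X-\mu_X)^2]\,E[(Y-\mu_Y)^2]} = \sigma_X\,\sigma_Y,$$

so dividing the covariance by $\sigma_X \sigma_Y$ always gives a value in $[-1, 1]$.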

+ +

If you want to determine whether $X$ has a stronger linear relationship with $Y$ or with $Z$, comparing $cov(X,Y)$ with $cov(X,Z)$ directly is not informative, since the scale of each of the covariances depends on the variances of $Y$ and $Z$, which could be very different.

+ +

Dividing by $\sigma_X \sigma_Y$ normalizes the covariance, so you can compare $cor(X,Y)$ with $cor(X,Z)$ in a meaningful way.

+",2013-10-18 20:34:03.190 +57801,22846.0,1,,,,Permutation test: Exactness,,CC BY-SA 3.0,"

I am currently reading about permutation/randomization tests and have some difficulty understanding why they are exact. More precisely, I consider two groups of random variables with means $\mu_1$ and $\mu_2$ and variances $\sigma^2_1$ and $\sigma^2_2$, which are assumed to be equal if $\mu_1 = \mu_2$ holds. To test the one-sided hypothesis $H_0: \mu_1 \leq \mu_2$ versus $H_1: \mu_1 > \mu_2$, I apply a permutation test with the same test statistic as for a two-sample t-test with unequal variances and unequal sample sizes. It was no big deal to prove and to understand that the level of significance of the permutation test is equal to $\alpha$ for $\mu_1 = \mu_2$. However, I don't understand why the level of significance is less than $\alpha$ for $\mu_1 \leq \mu_2$!? Can someone give me a hint?

+",2013-10-18 21:05:16.017 +57802,22848.0,1,57842.0,,,Recalculate log-likelihood from a simple R lm model,,CC BY-SA 3.0,"

I'm simply trying to recalculate with dnorm() the log-likelihood provided by the logLik function from an lm model (in R).

+ +

It works (almost perfectly) for a large number of observations (e.g. n = 1000):

+ +
> n <- 1000
+> x <- 1:n
+> set.seed(1)
+> y <- 10 + 2*x + rnorm(n, 0, 2)
+> mod <- glm(y ~ x, family = gaussian)
+> logLik(mod)
+'log Lik.' -2145.562 (df=3)
+> sigma <- sqrt(summary(mod)$dispersion)
+> sum(log(dnorm(x = y, mean = predict(mod), sd = sigma)))
+[1] -2145.563
+> sum(log(dnorm(x = resid(mod), mean = 0, sd = sigma)))
+[1] -2145.563
+
+ +

but for small datasets there are clear differences:

+ +
> n <- 5
+> x <- 1:n
+> set.seed(1)
+> y <- 10 + 2*x + rnorm(n, 0, 2)
+> 
+> mod <- glm(y ~ x, family = gaussian)
+> logLik(mod)
+'log Lik.' -8.915768 (df=3)
+> sigma <- sqrt(summary(mod)$dispersion)
+> sum(log(dnorm(x = y, mean = predict(mod), sd = sigma)))
+[1] -9.192832
+> sum(log(dnorm(x = resid(mod), mean = 0, sd = sigma)))
+[1] -9.192832
+
+ +

Because of the small-sample effect, I thought it could be due to differences in the residual variance estimates between lm and glm, but using lm provides the same result as glm:

+ +
> modlm <- lm(y ~ x)
+> logLik(modlm)
+'log Lik.' -8.915768 (df=3)
+> 
+> sigma <- summary(modlm)$sigma
+> sum(log(dnorm(x = y, mean = predict(modlm), sd = sigma)))
+[1] -9.192832
+> sum(log(dnorm(x = resid(modlm), mean = 0, sd = sigma)))
+[1] -9.192832
+
+ +

Where am I wrong?

+",2013-10-18 22:27:08.737 +57803,4779.0,2,,57790.0,,,,CC BY-SA 3.0,"

SPSS Statistics provides ridge, lasso, and elastic net in the CATREG command, but if you need to enter blocks of dummies or other groups together, that can be done in REGRESSION by using blocks of terms via multiple METHOD subcommands.

+",2013-10-18 22:31:30.743 +57804,21464.0,1,,,,What is the proper way to estimate the probability (proportion of time) a rare event occurs?,,CC BY-SA 3.0,"

Often, I need to estimate the probability (proportion of time) a rare event occurs. The standard MLE estimate often gives me extreme estimates since the denominator is usually 1, and the numerator is either 0 or 1, giving me either 100% or 0%.

+ +

For example, I am trying to estimate the proportion of web referrals as a result of my email campaign for each of my users. Since the events are rare, most of my users usually have only 1 web referral, and they have either 0 email referral or 1 email referral. In such cases, the MLE estimate is quite unreliable.

+ +

Are there standard tricks to correct this over- or under-estimation? Perhaps something like Laplace smoothing? If so, how should I go about it?

+",2013-10-18 23:11:44.410 +57805,22850.0,2,,54574.0,,,,CC BY-SA 3.0,"

What definition of log-likelihood is that? I've seen $$r(a,b) = \log \frac{P(a|Mod)}{P(b|Mod)} = \log(P(a|Mod)) - \log(P(b|Mod)) ,$$ but here you're subtracting your two probabilities.

+",2013-10-18 23:23:26.417 +57806,17538.0,2,,26070.0,,,,CC BY-SA 3.0,"

The paper On the surprising behavior of distance metrics in high dimensional space discusses the behaviour of distance metrics in high dimensional spaces.

+ +

They examine the $L_k$ norm and propose the Manhattan $L_1$ norm as the most effective in high dimensional spaces for clustering purposes. They also introduce a fractional norm $L_f$ similar to the $L_k$ norm but with $f \in (0,1)$.

+ +

In short, they show that for high dimensional spaces using the Euclidean norm as a default is probably not a good idea; we usually have little intuition in such spaces, and the exponential blowup due to the number of dimensions is hard to take into account with the Euclidean distance.
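
A rough illustration of that point (my own sketch, not from the paper): the relative contrast between the nearest and farthest Euclidean distances shrinks as the dimension grows.

    set.seed(1)
    relative.contrast <- function(d, n = 100) {
      X <- matrix(runif(n * d), n, d)            # n random points in d dimensions
      dists <- as.matrix(dist(X))[1, -1]         # Euclidean distances from point 1
      (max(dists) - min(dists)) / min(dists)     # relative contrast
    }
    sapply(c(2, 10, 100, 1000), relative.contrast)   # shrinks towards 0 as d grows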

+",2013-10-18 23:51:54.773 +57809,10060.0,2,,57808.0,,,,CC BY-SA 3.0,"

Strictly speaking there isn't enough information to decide what study it is because the outcome(s) and exposure(s) have not been explicitly declared. If the different blood cancers are outcomes and blood pressure is exposure, then it's a case-control study design. However, it's silly because the different blood pressures can be a manifestation of the cancers so the outcome/exposure relationship can be mushy. Hence it's a funny design. In a way, it feels just like a cross-sectional survey.

+ +

Anyway, it's observational so far, because the researchers did not allocate exposures. Using ""observing or not observing"" as a criterion is risky because, no matter what, the researchers always observe, whether the study is experimental or observational.

+ +

Random sampling is important because if the sample is not random, then all the statistical inferences based on the analysis will be challenged. In other words, a statistical test's result can be used to infer what happens in the population level because the sample was randomly chosen.

+",2013-10-19 00:36:40.520 +57810,22705.0,2,,57500.0,,,,CC BY-SA 3.0,"

I found all the above answers difficult to comprehend; that may be due to my limitations. But this link helped me understand the difference between parametric and non-parametric regression:
http://cran.r-project.org/doc/contrib/Fox-Companion/appendix-nonparametric-regression.pdf

+ +

In parametric regression or the common $y = mx+c$ form, we specify the form of the relationship as a straight line.

+ +

In a non-parametric regression such as MARS or splining, we allow the technique to determine the form of the relationship. It could be a simple straight line, or a curved one, or a summation of multiple straight lines (through hinge functions etc.) to get a non-linear relationship.
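
A minimal R sketch of the contrast (my own illustration; d is a hypothetical data frame with columns x and y):

    fit.param    <- lm(y ~ x, data = d)       # parametric: a straight line is imposed
    fit.nonparam <- smooth.spline(d$x, d$y)   # nonparametric: the data choose the shape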

+",2013-10-19 01:53:06.130 +57811,22836.0,1,,,,"Bayesian, Fisher method: model very simple data to get discriminants",,CC BY-SA 3.0,"

I've just implemented a naive Bayesian classifier and found out about the Fisher method (Linear discriminant analysis and Bayes rule: classification) while looking for ways to improve it. I'm very new to this field.

+ +

My raw data model is like the following (for sentiment analysis):

+ +
{""I like the movie a lot"", ""positive"", 1000}
+{""I hate the movie a lot"", ""negative"", 100}
+...
+
+ +

As you see, I have only two classes (possibly more later), and the third value is a weight to use when computing the probabilities.

+ +

So when given this kind of data, and to get the PDF like below,

+ +

$$PDF(x|k) = \frac{e^{-d/2}}{(2\pi)^{p/2}\sqrt{|S|}}$$

+ +

I have no idea how to approach this. What should I set as discriminants? Where should I start to matrix-ify my data to get the covariance? Once I model the data, the next step seems to be relatively easier with just calculation.

+ +

In short, what should I do to get the values of the $p$ discriminants from the data {""I like the movie a lot"", ""positive"", 1000}, with the first value as the training text, the second as the class value, and the third as the weight value?

+",2013-10-19 02:19:52.437 +57812,22705.0,1,,,,Estimation of a power function in regression $y = ax^k$,,CC BY-SA 3.0,"

I'm performing a case of polynomial regression. I use a power $k$ for the regressors (e.g. marketing spend), which helps me determine the nature of the response curve.

+ +

I also need to estimate the coefficient for each regressor.

+ +

Consider the simplistic case: $y = ax^k + c$ ; $c$ constant, $a$ a coefficient.

+ +

The values of $k$ and $a$ need to be determined (if polynomial, $k$, $x$ and $a$ would be vectors). I vary $k$ between $-2$ and $2$ and find the value of $k$ for which pow(x,k) correlates best with $y$ using a SAS macro. I take the top three $k$ which help $x$ correlate with $y$.

+ +

I start regressing $y$ on pow(x,k) and vary $k$ between the top values in priority and observe model fit and error structure to decide.

+ +

This is a slightly approximate approach (depending on the intervals of $k$ which I choose to iterate over, 0.01/0.1 etc.), but has worked well in polynomial situations because it is a SAS macro and runs pretty fast.

+ +

Is there a better approach?

+ +

Editing to add some more context as suggested by @Nick-Cox. The dependent is the sales of a product. The regressors (x) are marketing spends.

+ +

There is a strong hypothesis backing interaction effects between the x's.

+ +

Another requirement is that not all marketing spends should be forced to have a diminishing impact on sales.

+",2013-10-19 02:43:28.907 +57813,22851.0,1,,,,Modeling time in multilevel logistic regression,,CC BY-SA 3.0,"

I conducted an experiment in which participants listened to sentences while looking at pictures about the sentences on a computer screen. Whether at a given time point a participant looked at the left half of a picture or the right half of a picture was recorded via an eye-tracker.

+ +

The study had a 2 by 2 within subjects and within items design. If time was not an issue here, I could use the model below (using R code). (Here, I assume varying random intercepts and random slopes for both subject and item.)

+ +
lmer(look ~ iv1 * iv2 + (1 + iv1*iv2 | subject) + (1 + iv1*iv2 | item), 
+     family = ""binomial"", data=data)
+
+ +

However, given that participants listened to sentences each of which lasted a few seconds, where they looked (left vs. right) would vary across time. So one way to model time might be to include time as a covariate (it may even be necessary to use natural polynomials of time):

+ +
lmer(look ~ iv1 * iv2 + time + (1 + iv1*iv2 | subject) + (1 + iv1*iv2 | item), 
+     family = ""binomial"", data=data)
+
+ +

But things get complicated since (1) it is possible that time interacts with iv1 and iv2, and (2) I probably need to somehow model time in the random-effect terms as well. An additional complication is that eye-tracking datasets are fairly large. The particular set of data that I am working with currently has 5 million rows, so running even the simplest multilevel logistic regression can be fairly time consuming.

+ +

So my question is: given my design, what would be a good way to model time?

+",2013-10-19 02:44:35.453 +57814,22705.0,2,,57807.0,,,,CC BY-SA 3.0,"

Just to add a little more clarity: this approach is sometimes called a temporal Bayesian model. I have seen it used in at least one other situation, marketing-mix models, where today's marketing spend influences today's brand and revenue. Today's brand also influences tomorrow's brand and revenue, and so on.

+",2013-10-19 02:47:17.707 +57815,22705.0,2,,54622.0,,,,CC BY-SA 3.0,"

If we define levels differently, mixed models may not be able to do what Bayesian models can. The other alternative is structural equation modeling.

+ +

For example: variables A and B drive C, and variables A, B, C, D drive E.

+ +

You have the equations:

E = oA + pB + qC + rD + z
C = mA + nB + k

where the lower-case letters are parameter estimates and z, k are error terms.

+ +

Mixed models can't be used in this case since the errors z and k could be correlated. Therefore, we have to resort to hierarchical Bayesian models or structural equation models.

+ +

Thoughts?

+",2013-10-19 03:01:58.997 +57816,19559.0,1,57821.0,,,Avoiding p-values and reporting t-values instead. References?,,CC BY-SA 3.0,"

I was wondering if I could get some opinions on an issue. I'm analyzing my data using mixed-effects modeling in R (lme4 package). My model has by-subject and by-item intercepts and slopes, and random correlation parameters between them. Since the current version of lmer() does not have MCMC sampling implemented, I cannot get a p-value for the coefficients in the model. Therefore, I would like to report the t-value instead.

+ +

I have often seen papers in my field (psycholinguistics) just say something like ""In all models presented, |t| > 2 and |z| > 2 correspond to a significant effect at a significance level of .05"". I was wondering whether there is some reference I can cite for this type of statement? I understand that this tends to be the case, but I wonder whether this is something that has been shown (and I should give references) or whether it is OK to just state it and assume everyone will be fine with it.

+ +

Suggestions welcome!

+",2013-10-19 03:15:22.560 +57817,22785.0,1,,,,What's the value of information when we decrease the entropy of a probability distribution?,,CC BY-SA 3.0,"

Suppose you have to choose between actions $A_1,\dots,A_n$. You have a probability distribution over each $U(A_i)$, i.e. over the utility of choosing each action. So you should choose the $A_i$ that maximizes $\mathbf E[U(A_i)]$.

+ +

But now you are given the opportunity to reduce your uncertainty about one of these $U(A_i)$. I think the value of this new evidence should be

+ +

$$\int_{e\in E}P(e)~\mathbf E[U'(A_i)]~\text{d}e$$

+ +

where $E$ contains every possible piece of evidence you could receive, $P(e)$ is how likely it is, and $U'(A_i)$ is the utility of action $A_i$ after you receive the evidence $e$.

+ +

And I know I'm pushing it, but: can you think of some way to roughly estimate the value of reducing entropy, given only the expectation and current entropy of each $U(A_i)$, and the amount by which you will reduce the entropy of one particular $U(A_i)$? For instance, if $U(A_1)$ has expectation 20 and entropy 10 bits, and you're given the same information about every other $A_i$, how much would you pay to reduce the entropy of $U(A_i)$ by 2 bits?

+",2013-10-19 03:50:43.880 +57818,22524.0,1,,,,Anisotropy in kriging for non gridded data,,CC BY-SA 3.0,"

I have to perform a mapping of a DVB-T field (the TV signal). For every location I consider the median over time of the measurements. There are some issues, e.g. the variance seems to be proportional to the intensity of the field because of the measuring instrumentation.

+ +

My first idea was to use kriging.

+ +

Following R's gstat package vignette I have plotted the directional variograms and one of the variables that are measured shows a clear anisotropy.

+ +
  1. How do I deal with it? Is there a favourite framework?

  2. Looking at the literature, a change of variable is suggested, but since my data are not on a grid I do not know how to deal with it.

     I have looked at this post: Problems estimating anisotropy parameters for a spatial model.

  3. How do you know from the variogram cloud whether there are enough points to consider anisotropy?

     In a second, smaller dataset, a low number of samples at distances below the variogram range corresponds to a drop in the variogram. I am not sure whether this is a genuine drop due to the field or whether it is due to the low sampling.

  4. Is there any way to find out?

  5. For this kind of analysis I am using gstat in R. Is geoR a better framework? Or is there a better package overall?

     There are two measurement campaigns, one with 110 locations, the other with 35, in a territory which is approximately 4 km * 4 km, but I have also simulated the field, so from the simulated field I can extract as many samples as I want.

Any pointer to relevant literature is very welcome.

+ +

At the moment I am reading [Webster, Oliver] Geostatistics for Environmental Scientists.

+",2013-10-19 08:04:24.787 +57819,15827.0,2,,57812.0,,,,CC BY-SA 3.0,"

Although you are sensibly keeping a careful eye on what fits, your approach can fairly be described as rather home-grown or ad hoc. Depending on your target audience or readership, the consequence may range from practitioner puzzlement to statistician flak.

+ +

Your general model I take to be a sum of power functions. With some change of notation that could be

+ +

$y = b_0 + \sum_{j=1}^J b_j x_j^{k_j}$

+ +

with additive error. As @Glen_b comments, the usual approach to fitting such a model would be to use nonlinear least squares, which I take to be well supported in SAS, although I can't advise on details.

+ +

In many ways a simpler model is a multiplicative power function

+ +

$y = B_0 \prod_{j=1}^J x_j^{b_j}$, from which

+ +

$\ln y = b_0 + \sum_{j=1}^J b_j \ln x_j$, where $\ln B_0 =: b_0$.

+ +

That model is easy to fit by least squares as it is just multiple regression on the logged variables. Error with this is taken to be multiplicative on the original scale and additive on a logarithmic scale, which is often about right. A virtue of this model, as with power functions taken singly, is that it can be consistent with the limiting behaviour that $y$ tends to 0 as all the $x$s tend to 0, often important economically (physically, biologically). (However, the assumption is that all data are positive. Your own approach appears consistent with occasional zeros but not with negative values.)
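
For concreteness, here is a minimal R sketch of both fits (my own illustration, with hypothetical variable names y, x1, x2 in a data frame dat; the nls starting values are guesses and usually need care):

    # multiplicative power function: ordinary least squares on the log scale
    fit.log <- lm(log(y) ~ log(x1) + log(x2), data = dat)

    # sum of power functions: nonlinear least squares
    fit.nls <- nls(y ~ b0 + b1 * x1^k1 + b2 * x2^k2,
                   data  = dat,
                   start = list(b0 = 0, b1 = 1, k1 = 1, b2 = 1, k2 = 1))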

+ +

More generally, however, we have no sight of your data and only a hint of what the regressors or predictors are, so it is difficult to say much more except to guess that your response variable is probably something zero or positive. If so, it is helpful to ensure that predictions are always positive for all data points. I can't see that is guaranteed by your present approach.

+ +

On a terminology question: I'd advise against calling any of these models a polynomial, even if you spell out very clearly that the powers are in general not integers. Either people don't know what a polynomial is or they will expect the powers to be integers, at least as a mathematical default: there is some obscurity either way, better avoided.

+ +

Edit:

+ +

The model

+ +

$\ln y = b_0 + \sum_{j=1}^J b_j \ln x_j$

+ +

can naturally be complicated according to taste and need, e.g.

+ +

$\ln y = b_0 + \sum_{j=1}^J b_j \ln x_j + \text{extra terms}$,

+ +

where the extra terms could be in the $x_j$, the $\ln x_j$ or both.

+",2013-10-19 08:25:15.977 +57820,15827.0,2,,55617.0,,,,CC BY-SA 3.0,"

In principle, you are right to worry that the response is bounded. In practice, with this kind of data, you are unlikely to get predictions beyond the observed range of the data. This won't be your fault, but just the effect of the high degree of unpredictability with firm-level data.

+ +

Put it this way: The worst you can get is that no predictors really help, in which case the model will predict the average rank for every firm, at least to a good first approximation. In practice, you hope you can do better, but there is little reason to expect that predictions will be outside the observed range. (Or is there?)

+ +

But why predict rank at all? Why not try to predict some performance measure, and then rank the predictions, and then compare with the expert's ranks? That sounds much less problematic.

+",2013-10-19 08:37:45.077 +57840,10135.0,2,,57835.0,,,,CC BY-SA 3.0,"

You should not make any inference when the variance of your residuals is not constant. See for example page 243 of Practical data analysis. This is mainly because the estimated standard errors of your coefficients are not reliable, and that makes your $t$-test (or $F$-ratio) invalid.

+",2013-10-19 18:19:31.100 +57841,1412.0,2,,57832.0,,,,CC BY-SA 3.0,"

(Chain rule for differentiation:) You should post the function $\alpha()$.

+ +

If it were $(x_1+x_2 + ...x_n)^2$ , and $\theta$ were a $\beta$ coefficient in a regression, then $\frac{\partial \alpha}{\partial \theta}$ would be equal to that coefficient. The full derivative would be $2*\beta*(x_1+x_2 + ...x_n)$. The 2 term comes from differentiating $\alpha()$ w.r.t. $(x_1+x_2 + ...x_n)$

+",2013-10-19 18:27:56.793 +57821,20120.0,2,,57816.0,,,,CC BY-SA 3.0,"

A reference can be found in footnote 1 of Baayen, R. H., Davidson, D. J., & Bates, D. M. (2008). Mixed-effects modeling with crossed random effects for subjects and items. Journal of Memory and Language, 59(4), 390–412. I'm quoting the relevant bits here:

+ +
+

For data sets characteristic for studies of memory and language, which typically comprise many hundreds or thousands of observations, the particular value of the number of degrees of freedom is not much of an issue. Whereas the difference between 12 and 15 degrees of freedom may have important consequences for the evaluation of significance associated with a t statistic obtained for a small data set, the difference between 612 and 615 degrees of freedom has no noticeable consequences. For such large numbers of degrees of freedom, the t distribution has converged, for all practical purposes, to the standard normal distribution. For large data sets, significance at the 5% level in a two-tailed test for the fixed effects coefficients can therefore be gauged informally by checking the summary for whether the absolute value of the t-statistic exceeds 2.

+
+ +

As you see, he describes it as ""informal"".
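
A quick numerical check of that convergence (my own, not part of the quoted passage):

    2 * pnorm(-2)          # standard normal: about 0.046
    2 * pt(-2, df = 600)   # t with 600 df: essentially the same
    2 * pt(-2, df = 15)    # t with 15 df: noticeably larger, about 0.064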

+ +

Generally, I assume you will find many people encourage you to report more informative measures than the probability of the data given an effect of exactly zero; for example, confidence intervals/HPD intervals of standardised effect sizes.

+",2013-10-19 08:56:42.117 +57822,22381.0,1,57823.0,,,What is a loss function in decision theory?,,CC BY-SA 3.0,"

My notes define a loss function as the 'cost' incurred when the true value of $\theta$ is estimated by $\hat\theta$. What kind of cost is it talking about? Monetary cost? Or is it something related to errors?

+",2013-10-19 11:19:33.860 +57823,15827.0,2,,57822.0,,,,CC BY-SA 3.0,"

A loss function is a mathematical representation of anything bad or at least undesirable: the point is that it is therefore something you want to minimise.

+ +

Calling a loss function a cost is in general just terminology designed to be simple and evocative. The intention is to appeal to your sense that cost is something you want to avoid and (specifically and crucially) that (other things being equal) you prefer a smaller cost to a larger cost.

+ +

It doesn't necessarily imply a cost in any monetary, financial, economic or business sense.

+ +

Simple examples of a loss function arise when we consider the difference between some true or correct value $\theta$ and an estimate $\hat\theta$, which you would like to be as small as possible. Possible ways of taking that further are to work with $(\theta - \hat\theta)^2$ or $|\theta - \hat\theta|$, which are both loss functions. In either case there is a minimum loss of 0 when $\hat\theta = \theta$.

+",2013-10-19 11:30:55.430 +57824,13895.0,2,,34166.0,,,,CC BY-SA 3.0,"

I just re-tripped across this. I've refined some of my thoughts since that last post, and thought I might find a receptive audience for them here.

+ +

First off, on the philosophy of how to address such a controversy: Say arguments A and B exist. Each has a premise, a sequence of deductions, and a result; and the results differ.

+ +

The best way to prove one argument is incorrect is to invalidate one of its deductions. If that were possible here, there wouldn't be a controversy. Another is to disprove the premise, but you can't do that directly. You can argue for why you don't believe one, but that won't resolve anything unless you can convince others to stop believing it.

+ +

To prove a premise wrong indirectly, you have to form an alternate sequence of deductions from it that leads to an absurdity or to a contradiction of the premise. The fallacious way is to argue that the opposing result violates your premise. That means that one is wrong, but it doesn't indicate which.

+ +

+++++

+ +

The halfer's premise is ""no new information."" Their sequence of deductions is empty - none are needed. Pr(Heads|Awake) = Pr(Heads)=1/2.

+ +

The thirders (specifically, Elga) have two premises - that Pr(H1|Awake and Monday) = Pr(T1|Awake and Monday), and Pr(T1|Awake and Tails) = Pr(T2|Awake and Tails). An incontrovertible sequence of deductions then leads to Pr(Heads|Awake) = 1/3.

+ +

Note that the thirders don't ever assume there is new information - their premises are based on whatever information exists - ""new"" or not - when SB is awake. And I've never seen anyone argue for why a thirder premise is wrong, except that it violates the halfer result. So the halfers have provided none of the valid arguments I've listed. Just the fallacious one.

+ +

But there are other deductions possible from ""no new information,"" with a sequence of deductions that start with Pr(Heads|Awake) = 1/2. One is that Pr(Heads|Awake and Monday) = 2/3 and Pr(Tails|Awake and Monday) = 1/3. This does contradict the thirder premise, but like I said, that doesn’t help the halfer cause since it still could be their premise that is wrong. Ironically, this result does prove something - that the halfer premise contradicts itself. On Sunday, SB says Pr(Heads|Monday) = Pr(Tails|Monday), so adding the information ""Awake"" has allowed her to update these probabilities. It is new information.

+ +

So I have proven the halfer premise can't be right. That doesn't mean the thirders are right, but it does mean that halfers have not provided any contrary evidence.

+ +

+++++

+ +

There is another argument I find more convincing. It isn't completely original, but I'm not sure if the proper viewpoint has been emphasized enough. Consider a variation of the experiment: SB is always wakened on both days; usually it is in a room that is painted blue, but on Tuesday after Heads it is in a room that is painted red. What should she say the probability of Heads is, if she finds herself awake in a blue room?

+ +

I don’t think anybody would seriously argue that it is anything but 1/3. There are three situations that could correspond to her current one, all are equally likely, and only one includes Heads.

+ +

The salient point is that there is no difference between this version, and the original. What she ""knows"" - her ""new information"" - is that it is not H2. It does not matter how, or IF, she would know it could be H2 if it could. Her capability to observe situations that she knows do not apply is irrelevant if she knows they do not apply.

+ +

I can not believe the halfer premise. It is based on a fact - that she can't observe H2 - that cannot matter since she can, and does, observe that it isn't H2.

+ +

So I hope that I have provided a convincing argument for why the halfer premise is invalid. Along the way, I know I have demonstrated that the thirder result must be correct.

+",2013-10-19 12:13:07.783 +57825,2666.0,2,,55617.0,,,,CC BY-SA 3.0,"

Ordinal regression is ideal for this problem in my opinion. There is no problem other than computational burden caused by having as many unique $Y$ as there are observations. The R rms package's orm function solves the computational burden problem using a special sparse matrix representation. For an example see Which model should I use to fit my data ? ordinal and non-ordinal, not normal and not homoscedastic

+",2013-10-19 12:39:35.157 +57826,22853.0,1,,,,Joint distribution of two multivariate normal distributions,,CC BY-SA 3.0,"

If we define 2 independent variables $Y_1$ and $Y_2$ as follows: +\begin{align} +Y_1 &= (Y_{11},Y_{12},Y_{13})^T \sim\mathcal N_3(\mu_1,\Sigma_{11}), \\ +Y_2 &= (Y_{21},Y_{22})^T \sim\mathcal N_2(\mu_2,\Sigma_{22}) +\end{align} +where, +\begin{align} +\mu_1 &= (2, 2, 2)^T &\Sigma_{11} &= +\left[\begin{array}{ccc} 3 &1 &0 \\ 1 &2 &0 \\ 0 &0 &3 \end{array}\right] \\ +\mu_2 &= (3, 4)^T &\Sigma_{22} &= +\left[\begin{array}{cc} 4 &2 \\ 2 &4 \end{array}\right] +\end{align}

+ +

Then how can I find the joint distribution of $Y_{11}-Y_{13}+Y_{22}$ and $Y_{21}-Y_{12}$?

+ +

I know it's a simple question, and I could solve it if it were asked for $Y_1-Y_2$ or something similar. How am I supposed to solve it when it is like this?

+",2013-10-19 13:26:36.807 +57827,17635.0,2,,57287.0,,,,CC BY-SA 3.0,"

The recommended number of bootstrap replications may vary according to ""the test to be run on the data"" (Mooney, C. Z., and R. D. Duval. Bootstrapping: A Nonparametric Approach to Statistical Inference. Newbury Park, CA: Sage, 1993: 11). For standard errors, the same source recommends 50-200 bootstrap replications.

+",2013-10-19 13:39:09.630 +57828,22856.0,2,,57816.0,,,,CC BY-SA 3.0,"

Here is an example of the reporting to which you refer. We took the Baayen article seriously and report only AIC differences and t-values in a relatively large analysis of several variables. We emphasized effect sizes, and no p-values appear!

+",2013-10-19 14:40:30.983 +57829,4854.0,2,,57766.0,,,,CC BY-SA 3.0,"

Utility is an ordinal concept, so the utility of a particular level can only be interpreted relative to something else. In your example, the utility of ""blue"" should be interpreted relative to, say, ""red.""

+ +

Since most of your attributes are categorical rather than continuous, you've probably used dummy or effects coding in your model (and if you haven't, you should). If you've used dummy coding, the utility of each design-coded parameter can be interpreted relative to the excluded reference level. For example, if you used ""no keyboard"" as the reference level for the keyboard attribute, the coefficient on the ""with keyboard"" parameter represents the incremental utility associated with adding a keyboard. The same applies to the colour attribute: the coefficient on the different dummy-coded parameters represents the incremental utility of a particular colour relative to the excluded colour.

+ +

Note that the interpretations of the coefficients on dummy-coded (0,1) and effects-coded (-1,1) parameters are slightly different. The coefficient on an effects-coded parameter represents the deviation of the ‘level mean utility’ from the ‘overall mean utility’, which is not necessarily the same as the difference from the reference level as with a dummy-coded parameter. Dummy-coded parameters are a little easier to interpret, but you run into problems with confounding between the utility of the reference levels and the alternative-specific constant (i.e. the intercept). This is a particular problem if you're using a labelled design, but arguably less critical in a generic design. See this reference for a useful summary... http://methodology.psu.edu/media/techreports/12-120.pdf. There is also a good summary in Hensher, Rose and Greene.
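
A small R illustration of the two coding schemes (my own sketch, not from the references above):

    colour <- factor(c("red", "blue", "green"))
    contrasts(colour)                    # default treatment (dummy) coding: one level is the reference
    contrasts(colour) <- contr.sum(3)    # effects coding: columns sum to zero (-1/0/1)
    contrasts(colour)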

+",2013-10-19 14:51:57.300 +57892,503.0,2,,57887.0,,,,CC BY-SA 3.0,"

I'd say regular ordinary least squares regression should be fine, although technically your dependent variable is a count, so you should use Poisson or negative binomial regression. But with counts this high, I would guess that OLS regression would give similar results.
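
A minimal sketch of the two options in R (d is a hypothetical data frame with a count outcome and a predictor x):

    fit.ols  <- lm(count ~ x, data = d)                     # ordinary least squares
    fit.pois <- glm(count ~ x, family = poisson, data = d)  # Poisson regression
    # MASS::glm.nb(count ~ x, data = d)                     # negative binomial alternative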

+",2013-10-20 19:59:25.313 +57830,22092.0,1,,,,What is the test statistic in Kolmogorov–Smirnov test?,,CC BY-SA 3.0,"

I am reading the Wikipedia page on the Kolmogorov-Smirnov test, specifically the section titled Kolmogorov_distribution.

+ +

Is $x$ equal to $D_\text{max}$ in the CDF $\Pr(K\leq x)$?

+ +

My question is: the number of points matters too, right?

+ +

When $n$ is large, does $x$ approximate $\sqrt{n}\,D_\text{max}$?

+ +

Here is a snapshot of the section from Wikipedia:

+ +

+ +
+ +

+ +

Q1: How do we obtain the distribution of $D_n$ when $n$ is fixed?

+ +

Q2: If I get the value of $D_\text{max}$ and the sample size is $n$, I have to calculate $\Pr(K\leq x)$, right? Is $D_\text{max}$ the $x$ in the formula on the Wikipedia page?

+ +

By formula 14.3.9 of Numerical Recipes, we should calculate a value obtained from the expression in the brackets - should that be the $x$? That value approximates $\sqrt{n}\,D_\text{max}$ when $n$ is large.

+ +

I am totally confused here.

+ +

Q3: We run tests and get a distribution, right? Could you please explain your figure in a ""test"" way? Maybe that is easier to follow.

+ +
+ +

I calculate several values (significance levels) and compare them with the KS table.

+ +

The $x$ in $\Pr(K\leq x)$ is not $D_\text{max}$. If the sample size is large, $x$ is $(\sqrt{n}+0.12+0.11/\sqrt{n})\,D_\text{max}$. That means there is no error in 14.4.9 of NR. The significance we want to get is determined by the sample size and $D_\text{max}$.

+ +

Please let me explain my questions in a ""test"" way.

+ +

Suppose the sample size is 30: we obtain a dataset of 30 points and we can get a $D_\text{max}$ from the comparison between the empirical distribution of the sample and the reference probability distribution. We do it 1000 times and we get 1000 values of $D_\text{max}$. There is a distribution for $D_\text{max}$, right?

+ +

From your figure, there should be 1000*0.01 points with a $D_\text{max}$ larger than 0.29 and 1000*0.05 points with a $D_\text{max}$ larger than 0.24, more or less.

+ +
+ +

Like you said, I am very confused. Please let me make sure the several statements below are right or wrong first.

+ +

When the sample size is large, is my calculation method right? We can take $(\sqrt{n}+0.12+0.11/\sqrt{n})\,D_\text{max}$ as the $x$ to plug into $\Pr(K\leq x)$, and in this way we can get the significance when the sample size is large, right?

+ +

From your figure, there should be 1000*0.01 points with a $D_\text{max}$ larger than 0.29 and 1000*0.05 points with a $D_\text{max}$ larger than 0.24. Is that what your figure tells us?

+",2013-10-19 15:25:20.260 +57831,21884.0,1,,,,Local linear regression: number of grid points?,,CC BY-SA 3.0,"

Local linear regression is a popular tool. How can one choose the number of grid points at which to estimate the unknown function?

+",2013-10-19 15:56:12.137 +57832,9456.0,1,,,,Confusion related to calculation of partial derivative,,CC BY-SA 3.0,"

I have this function $P = f(\alpha)$. $\alpha$ is a function $\alpha = f(\theta, x)=\theta x$. Now I have

+ +

$P = \alpha(x_1+x_2 + ...x_n)$

+ +

Now I need to calculate the partial derivative of P wrt $\theta$. Then

+ +

$\frac{\partial P}{\partial \theta} = \frac{\partial P}{\partial \alpha} * \frac{\partial \alpha}{\partial \theta} = (x_1+x_2+ \cdots +x_n)*\frac{\partial \alpha}{\partial \theta}$

+ +

Now what I am trying to do is use gradient descent to maximize P with respect to $\theta$. So what I was expecting was the partial derivative with respect to $\theta$ to be a single value. However, here $\frac{\partial \alpha}{\partial \theta} = \frac{\partial \theta x}{\partial \theta}$ changes with x. So I am a bit confused about how to get a single value out of it.

+",2013-10-19 16:25:33.063 +57833,22860.0,2,,541.0,,,,CC BY-SA 3.0,"

In ANOVA you are testing whether there are significant differences between the population means; assuming you are comparing more than two population means, you are going to use an F-test.

+ +

In regression analysis you build a model between independent variables and a dependent variable. If you have one independent variable with four levels you can use three dummy variables and run a regression model. The F-test for the regression model which is used to test for the significance of the regression model is the same as the F which you get when testing for the difference between the population means. If you run a stepwise regression then some of the dummy variables might be dropped from the model and your F-value will differ from that when you perform ANOVA test.

+",2013-10-19 16:30:46.430 +57834,15860.0,2,,57756.0,,,,CC BY-SA 3.0,"

The use of terminology differs across disciplines. I'm answering as a political scientist.

+ +

Your study would only be a (quasi-)experimental design if there were some aspect of randomization in assignment to treatment. Based on the information you've provided, you are dealing with an observational study because the units self-assigned to treatment. This means that there are measurable and unmeasurable features of the observations that differ systematically between units in G1 and G2. In a school setting, there are essential features of students who go to state schools and those who go to private schools. Ideally (but probably not practically) you could control for these differences. I am skeptical because there are probably unmeasurable differences between the populations, such as motivation, family support and upbringing. These are things you cannot control for.

+ +

The key feature of any experiment is random assignment to treatment. If units are selected at random to receive treatment and then all units adhere to their assignment, then in expectation all treatment units and control units will be similar in their pre-treatment characteristics. Then the researcher can credibly infer that any post-treatment differences are due to the effect of treatment.

+ +

An ""experiment"" involves an intervention by a researcher that randomly assigns treatment. In my line of work, random selection is only possible in a very limited range of research designs. As a result, I do not consider it a requirement for a ""good"" or ""true"" experiment in my field.

+ +

A ""natural experiment"" involves randomized assignment to treatment through an intervention executed by something (human or not) other than the researcher.

+ +

In my experience, ""quasi-experiment"" refers to a research design where there is some aspect of randomness to assignment to treatment but not to a degree where the researcher believes that treatment and control units are similar in expectation in terms of pre-treatment features. This term, however, is quite vague and subject to different meanings. In my field people advise against its use. I've heard some claim it's too vague to be useful, obscuring more than it reveals about the research design. I've also heard some claim that ""quasi-experiment"" is just a euphemism for ""bad experiment."" I agree with the former sentiment and tend to disagree with the latter.

+ +

An ""observational study"" involves no randomized assignment to treatment. In these settings a researcher must try to establish ""conditional ignorability."" The term means that after controlling for a set of covariates $X$, a unit's potential outcomes are conditionally independent from assignment to treatment. In other words, the treatment assignment and the potential outcomes are conditionally independent given $X$ if and only if, given knowledge that $X=x$, knowledge of whether $T=\{0,1\}$ provides no information on the likelihood of a particular outcome in $Y$.

+ +

That is, conditional on the covariates, the treatment is independent of the potential outcomes.

+ +

\begin{equation} (Y(T=1),\,Y(T=0)) \perp T \mid X \end{equation}

+ +

A ""correlational study"" would probably not be of much value. A ""correlational analysis,"" however, probably means that you present exploratory data analysis that shows the unadjusted relationship between variables. In other words, without any claim of causality, you are showing how variables tend to move together in your data set.

+ +

The formalization above is based on the Rubin potential outcomes framework for causal inference, a foundational work for this conversation. You need to read it if you're serious about moving forward with work like this. +Holland, Paul W. 1986. “Statistics and Causal Inference.” Journal +of the American Statistical Association. 81(396): 945-960.

+ +

Another approach to conditional ignorability is Judea Pearl's back-door criterion.

+",2013-10-19 16:31:02.250 +57835,6384.0,1,,,,Nonconstancy of error variance,,CC BY-SA 3.0,"

If the residuals show that the non constancy of the error variance is clearly present, does it mean that your regression results are completely invalid?

+",2013-10-19 16:35:27.303 +57836,19559.0,1,57862.0,,,Alternatives to pvals.fnc to compute confidence intervals for fixed effects?,,CC BY-SA 3.0,"

Lately I keep encountering the same problem and I'm wondering whether other people have been able to get around it. I'm running a mixed effects model using lmer(). My model has by-subject and by-item intercepts and slopes, and random correlation parameters between them. Since the current version of lmer() does not have MCMC sampling implemented, I cannot use pvals.fnc(). I get this message:

+ +
Error in pvals.fnc(m, withMCMC = T) : 
+MCMC sampling is not implemented in recent versions of lme4
+for models with random correlation parameters
+
+ +

pvals.fnc() is also the function I use to get confidence intervals (HPD95lower and HPD95upper were two columns in the pvals.fnc output). Does anyone know of an alternative way of getting confidence intervals for the fixed effects estimates in the model? Or does using models with random correlations mean that we can no longer get CIs from R?

+ +

Thanks!

+ +

NOTE: I've seen this question asked in other forums in slightly different ways. However, the answers always seem to involve (1) calculating something different as an alternative to the confidence intervals, (2) some complicated solution that is unclear (at least to me) how to implement. I would like to know if there is some alternative way of computing CIs that is both mainstream (so that other researchers can use it) and has a function to do it in R, since I am not a programmer and I feel that trying to create that function myself would be error prone.

+",2013-10-19 16:39:24.940 +57837,20700.0,1,,,,Gibbs sampler for local linear trend model,,CC BY-SA 3.0,"

Question: Consider the local linear trend model given by: +\begin{align*} +y_t = \mu_t + \tau \varepsilon_t \ \cdots \ \text{Observation equation} \\ +\mu_{t+1} = \phi \mu_t + \eta_t \ \cdots \ \text{State equation} +\end{align*} +for $t = 1, 2, \cdots, T$, where $(\varepsilon_t, \eta_t)'$ is independent of $\mu_k$ for $k \le t$ and where: +\begin{align*} +\begin{bmatrix} \varepsilon_t \\ \eta_t \end{bmatrix} \stackrel{i.i.d}{\sim} N\left(\begin{bmatrix} 0 \\ 0 \end{bmatrix}, \begin{bmatrix} 1 & 0 \\ 0 & 1 \end{bmatrix}\right) +\end{align*} +and +\begin{align*} +\mu_1 \sim N\left(0, \frac{1}{1-\phi^2} \right) +\end{align*}

+ +

(Note everything is a scalar quantity in this question).

+ +

Consider a Bayesian analysis of this model under the prior distribution given by: +\begin{align*} +p(\theta) \propto \frac{1}{\tau} \ \text{for} \ -\infty < \phi < \infty \ \text{and} \ 0<\tau<\infty +\end{align*} +where $\theta = (\phi, \tau)$.

+ +

Devise a Gibbs sampler to sample from the joint posterior distribution, $p(\mu_{1:T}, \theta \mid y_{1:T})$ where the notation $\mu_{1:T}$ denotes $(\mu_1, \mu_2, \cdots, \mu_T)$ and similarly, $y_{1:T}$ denotes $(y_1, y_2, \cdots, y_T)$.

+ +
+ +

My Working So Far:

+ +

The joint posterior distribution is given by: +\begin{align*} + p(\mu_{1:T}, \theta \mid y_{1:T}) & \propto p(\mu_{1:T}, \theta, y_{1:T}) \\ + & = \underbrace{p(y_{1:T} \mid \mu_{1:T}, \theta)}_{\text{'likelihood'}}\underbrace{p(\mu_{1:T} \mid \theta)p(\theta)}_{\text{prior}} \\ +& = \left[\prod_{t=1}^{T} p(y_t \mid \mu_t, \theta)\right]\left[\prod_{t=1}^{T-1}p(\mu_{t+1} \mid \mu_t, \theta) \right]p(\mu_1 \mid \theta)p(\theta) \ \ \cdots \ \ (1) +\end{align*} +Since $y_t \mid \mu_t, \theta \sim N(\mu_t, \tau^2)$ for $t=1, 2, \cdots, T$, the pdf is given by: +\begin{gather*} +p(y_t \mid \mu_t, \theta) = \left(2\pi \tau^2\right)^{-\frac{1}{2}}\exp\left[-\frac{1}{2\tau^2}\left(y_t - \mu_t\right)^2 \right] +\end{gather*} +Similarly, $\mu_{t+1} \mid \mu_t, \theta \sim N\left(\phi \mu_t, 1 \right)$, so the pdf is given by: +\begin{gather*} +p(\mu_{t+1} \mid \mu_t, \theta) = \left(2\pi\right)^{-\frac{1}{2}} \exp\left[-\frac{1}{2}\left(\mu_{t+1} - \phi \mu_t \right)^2 \right] +\end{gather*} +We know that $\mu_1 \mid \theta \sim N\left(0, \frac{1}{1-\phi^2} \right)$, so the pdf is given by: +\begin{gather*} +p(\mu_1 \mid \theta) = \left(2\pi\left(\frac{1}{1-\phi^2} \right) \right)^{-\frac{1}{2}} \exp\left[-\frac{\mu_1^2}{2\left(\frac{1}{1-\phi^2}\right)} \right] +\end{gather*} +Finally, we are given that $p\left(\theta\right) \propto \frac{1}{\tau}$.

+ +

Substituting all of the above into Eqn. $(1)$, yields the joint posterior distribution: +\begin{align*} +p(\mu_{1:T}, \theta \mid y_{1:T}) & \propto \left[\prod_{t=1}^{T} \left(2\pi \tau^2\right)^{-\frac{1}{2}}\exp\left[-\frac{1}{2\tau^2}\left(y_t - \mu_t\right)^2 \right]\right]\left[\prod_{t=1}^{T-1}\left(2\pi\right)^{-\frac{1}{2}} \exp\left[-\frac{1}{2}\left(\mu_{t+1} - \phi \mu_t \right)^2 \right] \right] \\ +& \times \left(2\pi\left(\frac{1}{1-\phi^2} \right) \right)^{-\frac{1}{2}} \exp\left[-\frac{\mu_1^2}{2\left(\frac{1}{1-\phi^2}\right)} \right] \left( \frac{1}{\tau}\right) +\end{align*}

+ +

I will implement a ""blocked"" Gibbs sampler sampling $\mu_{1:T}^{(i)}$ together, as follows:

+ +

For $i = 1, 2, \cdots, M$, sample: +\begin{align*} +\mu_{1:T}^{(i)} & \sim \mu_{1:T} \mid \phi^{(i-1)}, \tau^{(i-1)}, y_{1:T} \ \ \cdots \ \ (2)\\ +\phi^{(i)} & \sim \phi \mid \mu_{1:T}^{(i)}, \tau^{(i-1)}, y_{1:T} \ \ \cdots \ \ (3) \\ +\tau^{(i)} & \sim \tau \mid \mu_{1:T}^{(i)}, \phi^{(i)}, y_{1:T} \ \ \cdots \ \ (4) +\end{align*}

+ +

Sampling from $(2)$ is straightforward by using the Forward Filter Backwards Sampling (FFBS)

+ +

My Query: I am stuck on how to sample from $(3)$ and $(4)$, in order to use a Gibbs sampler on $(3)$ and $(4)$, we need to find the full conditional of $\phi \mid \mu_{1:T}, \tau, y_{1:T}$ and $\tau \mid \mu_{1:T}, \phi, y_{1:T}$, but how do you find these full conditionals? I do not see any obvious way by examining the joint posterior distribution. Perhaps a Metropolis Hastings subchain could work? But then what should I pick for my candidate density for the $\phi$ and $\tau$?

+ +

Thanks in advance.

+",2013-10-19 16:55:12.357 +57838,2873.0,2,,57835.0,,,,CC BY-SA 3.0,"

It depends on which results and what you mean by valid/invalid.

+ +

The coefficients are still a measure of a line going through the center of the data, so the line itself is still meaningful.

+ +

The relationship between the mean square error and the variance of the residuals becomes more complicated since there is not a single value that is the variance. But if you can model the variance then you can get a meaningful relationship using weighted regression.

+ +

Standard prediction intervals (based on ordinary least squares, not weighted least squares) will be too narrow in some areas and too wide in other areas, so would probably not be considered valid.

+ +

Tests and confidence intervals based on the standard assumptions are not going to be exact any more so p-values and confidence intervals will be approximate, whether that approximation is close enough to consider them valid, or potentially bad enough to consider them invalid will depend on the amount the variance varies and your personal preferences. Proper use of weighted least squares (or other methodologies) will help here as well.
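
For the weighted least squares route, a minimal R sketch (my own; it uses a crude model for the error standard deviation, and d, x, y are hypothetical):

    fit0  <- lm(y ~ x, data = d)                          # ordinary least squares
    sdhat <- fitted(lm(abs(resid(fit0)) ~ fitted(fit0)))  # rough model for the error sd (check it stays positive)
    fitw  <- lm(y ~ x, data = d, weights = 1 / sdhat^2)   # weighted least squares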

+",2013-10-19 17:08:28.540 +57839,22862.0,1,,,,"Trace(AB)=Trace(BA)? even if A, B are vectors?",,CC BY-SA 3.0,"

I have read this vector manipulation in standard books:

+ +

$$E[XX^T] =E[\mathrm{trace}(XX^T)]$$

+ +

where $X^T$ is the transpose of $X$, $X$ has a normal distribution and has dimension $n\times 1$, $XX^T$ has dimension $n\times n$.

+ +

How can they introduce trace into expectation?

+",2013-10-19 17:44:55.213 +57842,6162.0,2,,57802.0,,,,CC BY-SA 3.0,"

The logLik() function provides the evaluation of the log-likelihood by substituting the ML estimates of the parameters for the values of the unknown parameters. Now, the maximum likelihood estimates of the regression parameters (the $\beta_j$'s in $X{\boldsymbol \beta}$) coincide with the least-squares estimates, but the ML estimate of $\sigma$ is $\sqrt{\frac{\sum \hat\epsilon_i^2}{n}}$, whereas you are using $\hat\sigma = \sqrt{\frac{\sum \hat\epsilon_i^2}{n-2}}$, that is the square root of the unbiased estimate of $\sigma^2$.

+ +
>  n <- 5
+>  x <- 1:n
+>  set.seed(1)
+>  y <- 10 + 2*x + rnorm(n, 0, 2)
+>  modlm <- lm(y ~ x)
+>  sigma <- summary(modlm)$sigma
+> 
+>  # value of the likelihood with the ""classical"" sigma hat
+>  sum(log(dnorm(x = y, mean = predict(modlm), sd = sigma)))
+[1] -9.192832
+> 
+>  # value of the likelihood with the ML sigma hat
+>  sigma.ML <- sigma*sqrt((n-dim(model.matrix(modlm))[2])/n) 
+>  sum(log(dnorm(x = y, mean = predict(modlm), sd = sigma.ML)))
+[1] -8.915768
+>  logLik(modlm)
+'log Lik.' -8.915768 (df=3)
+
+",2013-10-19 18:45:53.120 +57843,22793.0,2,,57839.0,,,,CC BY-SA 3.0,"

A vector is a matrix which has one column or one row.

+ +

The property holds; you can check it yourself.
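
A quick numerical check in R (my own illustration):

    set.seed(1)
    x <- matrix(rnorm(4), ncol = 1)   # a 4 x 1 column vector
    sum(diag(x %*% t(x)))             # trace(X X^T): the sum of squared entries
    t(x) %*% x                        # X^T X = trace(X^T X): the same number
    crossprod(x)                      # idiomatic shorthand for t(x) %*% x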

+",2013-10-19 18:50:33.893 +57844,7155.0,2,,57757.0,,,,CC BY-SA 3.0,"

Let's define a kernel that is sensitive to translation, rotation and scaling of the input grid.

+ +

$$f(x,y) = \frac{(2\mu_x\mu_y + c_1)(2\sigma_{xy}+c_2)}{(\mu_x^2+\mu_y^2+c_1)(\sigma_x^2+\sigma_y^2+c_2)}$$

+ +

Where $c_1$ and $c_2$ are some small constant times the range of the data.

+ +

The guts of this function is the covariance between data, $\sigma_{xy}$. The more absolutely similar our values are, the higher this function goes.

+ +

It's a valid mercer kernel, which means it's a good distance function for many purposes.

+ +

Plotting your data below, we see some vaguely similar pattern over the grid $(1,2)$ to $(2,4)$.

+ +

+ +

With data indexed in a grid pattern, we aren't interested in small translations of the inputs. One way to overcome these translations is to average over many small shifts in the grid. This calculation, with respect to the above distance function, is known as the Structural Similarity Index.

+ +

For observation $A$, here are nine 3x3 square shifts in the grid.

+ +

+ +

We do the same for $B$, then compute the SSIM. One way to look at the impact of different parts of the grid is to look at the local gradient of SSIM wrt inputs.

+ +

Shown below is the gradient of SSIM computed over windows and not.

+ +

+ +

$SSIM(3) = 0.46375$

+ +

$SSIM = 0.30504$

+ +

If you want to discard the information about scaling, we can normalize each grid as ${x\over{\sum_i x_i}}$, yielding the following gradient.

+ +

+ +

$SSIM_{normed}(3) = 0.72070$

+ +

$SSIM_{normed} = 0.74618$

+ +

See: Image quality assessment: From error visibility to structural similarity.

+ +

The calculations were done with skimage.measure.structural_similarity.

+",2013-10-19 19:09:57.850 +57845,20700.0,1,,,,Determining the posterior distribution for an Autoregressive or order 1 model,,CC BY-SA 3.0,"

Question: For this question, note that the notation $y_{1:T} = (y_1, y_2, \cdots, y_T)$, ie, a vector of random variables.

+ +

Consider the following AR(1) model: +\begin{align*} +y_{t+1} = \phi y_t + \sigma \eta_t \ \ \cdots (a) +\end{align*} +for $t = 1, 2, \cdots, T$ where +\begin{align*} +\eta_t \stackrel{iid}{\sim} N(0,1) \ \ \cdots (b) +\end{align*} +with $\eta_1$ independent of $y_k$ for $k \le t$, and where +\begin{align*} +y_1 \sim N\left(0, \frac{\sigma^2}{1-\phi^2}\right) \ \ \cdots (c) +\end{align*} +Define $\theta = (\phi, \sigma)$ and consider a prior distribution given by: +\begin{align*} +p(\theta) \propto \frac{1}{\sigma} \ \text{for} \ \infty < \phi < \infty \ \text{and} \ 0<\sigma<\infty \ \ \cdots (1) +\end{align*} +Also define the conditional likelihood function to be: +\begin{align*} +p(y_{2:T} \mid y_1, \theta) = \left(2\pi \sigma^2 \right)^{-\frac{T-1}{2}}\exp\left[- \frac{1}{2\sigma^2}\sum_{t=2}^T \left(y_t - \phi y_{t-1} \right)^2\right] \ \ \cdots (2) +\end{align*}

+ +

Show that the conditional posterior distribution, corresponding to the conditional likelihood function in $(2)$ and the prior density in $(1)$, may be obtained analytically, whereas the full posterior distribution, corresponding to the model in $(a)-(c)$ and the prior density in $(1)$, requires an alternative computational approach.

+ +
+ +

My Working: So firstly, I worked out the likelihood function corresponding to the AR(1) model described by $(a)-(c)$. My working is as follows:

+ +

We are given $y_1 \sim N\left(0, \frac{\sigma^2}{1-\phi^2}\right)$, so the pdf is given by: +\begin{align*} +p(y_1 \mid \theta) = \left(2\pi\left( \frac{\sigma^2}{1-\phi^2}\right) \right)^{-\frac{1}{2}}\exp\left[ - \frac{y_1^2}{2\left(\frac{\sigma^2}{1-\phi^2}\right)} \right] +\end{align*} +Conditionally, we have $y_2 \mid y_1 \sim N\left( \phi y_1, \sigma^2\right)$, so the pdf is given by: +\begin{align*} +p(y_2 \mid y_1, \theta) = \left(2\pi \sigma^2\right)^{-\frac{1}{2}} \exp\left[-\frac{1}{2\sigma^2} \left(y_2 - \phi y_1\right)^2 \right] +\end{align*} +Similarly, $y_3 \mid y_2, y_1 \equiv y_3 \mid y_2 \sim N\left(\phi y_2, \sigma^2\right)$, so the pdf is given by: +\begin{align*} +p(y_3 \mid y_2, y_1, \theta) = \left(2\pi \sigma^2\right)^{-\frac{1}{2}} \exp\left[-\frac{1}{2\sigma^2} \left(y_3 - \phi y_2\right)^2 \right] +\end{align*} +So in general, $y_t \mid y_{t-1}, y_{t-2}, \cdots, y_1 \equiv y_t \mid y_{t-1} \sim N\left(\phi y_{t-1}, \sigma^2\right)$, with pdf: +\begin{align*} +p(y_t \mid y_{t-1}, y_{t-2}, \cdots, y_1, \theta) = \left(2\pi \sigma^2\right)^{-\frac{1}{2}} \exp\left[-\frac{1}{2\sigma^2} \left(y_t - \phi y_{t-1}\right)^2 \right] +\end{align*} +Using the method of composition, we have: +\begin{align*} +p(y_{1:T} \mid \theta) & = p(y_1 \mid \theta)p(y_2 \mid y_1, \theta)p(y_3 \mid y_1, y_2, \theta) \cdots p(y_T \mid y_1, y_2, \cdots, y_{T-1}, \theta) \\ +& = p(y_1 \mid \theta) \prod_{t=2}^T p(y_t \mid y_{t-1}, \theta) +\end{align*} +Thus the likelihood function computed for a given value of $\theta = \left(\phi, \sigma^2\right)$, is given by: +\begin{align} + L(\theta) & = \left\{ \left(2\pi\left( \frac{\sigma^2}{1-\phi^2}\right) \right)^{-\frac{1}{2}}\exp\left[ - \frac{y_1^2}{2\left(\frac{\sigma^2}{1-\phi^2}\right)} \right]\right\} \prod_{t=2}^T \left(2\pi \sigma^2\right)^{-\frac{1}{2}} \exp\left[-\frac{1}{2\sigma^2} \left(y_t - \phi y_{t-1}\right)^2 \right] \\ +& \propto (1-\phi^2)^{\frac{1}{2}}\sigma^{-T} \exp\left[-\frac{\sum_{t=2}^T\left(y_t - \phi y_{t-1}\right)^2+y_1^2\left(1-\phi^2\right)}{2\sigma^2} \right] \ \ \cdots (W1) +\end{align}

+ +

Deriving the conditional likelihood given in $(2)$ can be done as follows: +\begin{align*} +p(y_{2:T} \mid y_1, \theta) & = \frac{p(y_{1:T} \mid \theta)}{p(y_1 \mid \theta)} \\ +& = \prod_{t=2}^T p(y_t \mid y_{t-1}, \theta) \\ +& = \left(2\pi \sigma^2 \right)^{-\frac{T-1}{2}}\exp\left[- \frac{1}{2\sigma^2}\sum_{t=2}^T \left(y_t - \phi y_{t-1} \right)^2 \right] +\end{align*}

+ +
+ +

My Query: I don't really get what the question is trying to ask me to do? What does it mean by conditional posterior? Full posterior? Any assistance will be appreciated!

+ +
+ +

EDIT 1 PROGRESS: Okay, so I've played around a bit more and made a bit of progress. I interpret 'conditional posterior' as follows:

+ +

Notice that when $T$ is large, then the factor $(1-\phi^2)$ in Eqn. $(W1)$ is small, so we can approximate the full likelihood with the conditional likelihood: +\begin{align*} +p(y_{2:T} \mid y_1, \theta) \propto \sigma^{-(T-1)}\exp\left[- \frac{1}{2\sigma^2}\sum_{t=2}^T \left(y_t - \phi y_{t-1} \right)^2 \right] +\end{align*} +So under the prior $p(\theta) \propto \frac{1}{\sigma}$, we have: +\begin{align*} +p(\theta \mid y_{1:T}) & \propto \sigma^{-T}\exp\left[- \frac{1}{2\sigma^2}\sum_{t=2}^T \left(y_t - \phi y_{t-1} \right)^2 \right] \\ +& = \sigma^{-T}\exp\left[-\frac{1}{2\sigma^2}\sum_{t=2}^T \left(y_t^2 -2y_t \phi y_{t-1} + \phi^2 y_{t-1}^2 \right) \right] \\ +& = \sigma^{-T}\exp\left[-\frac{1}{2\sigma^2}\left(\underbrace{\sum_{t=2}^Ty_t^2}_{C} - 2\phi\underbrace{\sum_{t=2}^Ty_t y_{t-1}}_{B} + \phi^2 \underbrace{\sum_{t=2}^T y_{t-1}^2}_{A} \right) \right] +\end{align*} +First, note that: +\begin{align*} +p(\phi \mid \sigma, y_{1:T}) & \propto \exp\left[-\frac{1}{2\sigma^2}\left(C-2\phi B + \phi^2A \right) \right] \\ +& = \exp\left[-\frac{A}{2\sigma^2} \left(\phi^2 - 2\phi \frac{B}{A} + \frac{C}{A} \right) \right] \\ +& = \exp\left[-\frac{A}{2\sigma^2}\left(\left(\phi - \frac{B}{A} \right)^2-\left(\frac{B}{A}\right)^2 + \frac{C}{A} \right) \right] \\ +& \propto \exp\left[-\frac{A}{2\sigma^2}\left(\phi-\frac{B}{A}\right)^2 \right] \\ +& = \exp\left[-\frac{1}{2\left(\frac{\sigma^2}{A} \right)}\left(\phi-\frac{B}{A}\right)^2 \right] +\end{align*} +So the distribution of $\phi \mid \sigma, y_{1:T}$ is given by: +\begin{gather} + \phi \mid \sigma, y_{1:T} \sim N\left(\frac{B}{A}, \frac{\sigma^2}{A} \right) \\ + \implies \phi \mid \sigma, y_{1:T} \sim N\left(\frac{\sum_{t=2}^T y_t y_{t-1}}{\sum_{t=2}^T y_{t-1}^2}, \frac{\sigma^2}{\sum_{t=2}^T y_{t-1}^2} \right) +\end{gather} +Next, note that: +\begin{align*} +p(\sigma \mid y_{1:T}) & \propto \int_{\phi} \sigma^{-T} \exp\left[-\frac{1}{2\sigma^2} \sum_{t=2}^T \left(y_t - \phi y_{t-1}\right)^2 \right]d\phi \\ +& = \sigma^{-T} \int_{\phi} \exp\left[-\frac{A}{2\sigma^2} \left(\left(\phi - \frac{B}{A} \right)^2 - \left(\frac{B}{A}\right)^2 + \frac{C}{A} \right) \right]d\phi \\ +& = \sigma^{-T} \int_{\phi} \exp\left[-\frac{A}{2\sigma^2} \left(\phi - \frac{B}{A} \right)^2 + \left(\frac{A}{2\sigma^2}\right) \left(\frac{B}{A}\right)^2 - \left(\frac{A}{2\sigma^2}\right)\left(\frac{C}{A}\right) \right]d\phi \\ +& = \sigma^{-T} \exp\left[\frac{B^2/A - C}{2\sigma^2} \right] \int_{\phi} \exp\left[-\frac{A}{2\sigma^2} \left(\phi - \frac{B}{A} \right)^2\right] d\phi \\ +& = \sigma^{-T} \exp\left[\frac{B^2/A - C}{2\sigma^2} \right] \left(2\pi\left(\frac{\sigma^2}{A} \right) \right)^{\frac{1}{2}} \\ +& \propto \frac{1}{\sigma^{(T-2)+1}}\exp\left[\frac{B^2/A - C}{2\sigma^2} \right] +\end{align*} +Now define $b = \frac{B}{A}$, notice: +\begin{align*} +Q(y_{2:T}, b) & = \sum_{t=2}^T \left(y_t - b y_{t-1}\right)^2 \\ +& = \sum_{t=2}^T \left(y_t^2 - 2y_t b y_{t-1} + b^2 y_{t-1}^2 \right) \\ +& = \sum_{t=2}^T y_t^2 - 2b\sum_{t=2}^T y_t y_{t-1} + b^2 \sum_{t=2}^T y_{t-1}^2 \\ +& = C - 2bB+b^2A +\end{align*} +Then clearly, +\begin{align*} +-Q(y_{2:T}, b) = B^2/A-C +\end{align*} +So the distribution of $\sigma \mid y_{1:T}$ is given by: +\begin{gather} +\sigma \mid y_{1:T} \sim IG\left(v = T-2, \widehat{\sigma^2} = \frac{1}{T-2} \left(B^2/A - C \right) \right) \\ + \implies \sigma \mid y_{1:T} \sim IG\left(v = T-2, \widehat{\sigma}^2 = \frac{1}{T-2} \left(\frac{\left(\sum_{t=2}^T y_t y_{t-1} \right)^2}{\sum_{t=2}^T y_{t-1}^2} - \sum_{t=2}^T y_t^2 \right) \right) 
+\end{gather} +Or equivalently: +\begin{gather*} +\sigma \mid y_{1:T} \sim IG\left(v = T-2, \widehat{\sigma^2} = -\frac{1}{T-2}Q(y_{2:T}, b) \right) +\end{gather*} +Thus, we can derive the conditional posterior distribution analytically as: +\begin{align*} +p(\phi, \sigma \mid y_{1:T}) = p(\phi \mid \sigma, y_{1:T})p(\sigma \mid y_{1:T}) +\end{align*} +where $p(\phi \mid \sigma, y_{1:T})$ and $p(\sigma \mid y_{1:T})$ are derived above.

+ +
+ +

However, if we use the full likelihood (which is what the second part of the question is asking), how can we derive the joint posterior? I do not see any obvious ways to find the appropriate integrating constants. I'm assuming I need to use Gibbs/M-H/or some other kind of MCMC sampling scheme?

+",2013-10-19 19:57:12.857 +57846,22863.0,1,,,,Does relative Kullback-Leibler divergence exist?,,CC BY-SA 3.0,"

Suppose I have two multivariate normal distributions. I have computed the KL divergence ($d_{KL}(N_1, N_2)$). Is there a way to measure a relative divergence between these two distributions?

+ +

For instance, in a deterministic setting there are absolute error and relative error. If KL divergence is analogous to absolute error, is there an analogous notion of relative error?

+ +

Any references to literature would be highly appreciated!

+",2013-10-19 20:08:06.560 +57847,22864.0,1,,,,Can I regress an index value with variables used to create the index?,,CC BY-SA 3.0,"

I developed an index value (vulnerability score scale of 0 to 1) using a series of variables. I would like to regress these variables with the index value to determine the relative predictive power of each variable. Can I do this?

+ +

I ran the regression and came up with standardized B coefficients. I then interpreted those as relative contribution of the variable towards predicting the index value (vulnerability score).

+ +

I know one cannot regress a variable against itself, and I am essentially doing that here; primarily, though, I am just looking to determine to what degree each variable predicts the indexed value.

+ +

Any insights would be helpful on whether this is a proper use of regression or whether there is an alternative method to assess this. Thanks!

+",2013-10-19 20:29:44.800 +57848,503.0,2,,57847.0,,,,CC BY-SA 3.0,"

No. You can't. In fact, since you created the index, you already know the contribution of each value to the index. You shouldn't do regression here and you don't need an alternative.

+",2013-10-19 20:35:21.537 +57849,,1,57929.0,,user30490,Stationarity of Moving Average processes,,CC BY-SA 3.0,"

Consider the infinite order MA process defined by +$$y_t=\epsilon_t+a(\epsilon_{t-1}+\epsilon_{t-2}+...),$$ +where $a$ is a constant and the $\epsilon_t$s are i.i.d. $N(0,v)$ random variables.

+ +

What is the best way to show that $y_t$ is nonstationary? I know that I need to look at the characteristic roots of the characteristic polynomial and then judge whether or not they are outside of the unit circle, but what is the best way to approach this problem? Should I try rewriting the infinite order MA process as a finite order AR process, or is it easier to work with the MA process directly?

+",2013-10-19 21:11:46.583 +57850,594.0,2,,57830.0,,,,CC BY-SA 4.0,"

Note that the Kolmogorov-Smirnov test statistic is very clearly defined in the immediately previous section:

+ +

$$D_n=\sup_x|F_n(x)−F(x)|\,.$$

+ +

The reason they discuss $\sqrt{n}D_n$ in the next section is that the standard deviation of the distribution of $D_n$ goes down as $1/\sqrt n$, while $\sqrt{n}D_n$ converges in distribution as $n\to\infty$.

+ +

Yes, the number of points, $n$, matters to the distribution; for small $n$, tables are given for each sample size, and for large $n$ the asymptotic distribution is given for $\sqrt{n}D_n$ $-$ the very same distribution discussed in the section you quote.

+ +

Without some result on asymptotic convergence in distribution, you'd have the problem that you'd have to keep producing tables at larger and larger $n$, but since the distribution of $\sqrt{n}D_n$ pretty rapidly 'stabilizes', only a table with small values of $n$ is required, up to a point where approximating $\sqrt{n}D_n$ by the limiting Kolmogorov distribution is sufficiently good.

+ +

Below is a plot of exact 5% and 1% critical values for $D_n$, and the corresponding asymptotic critical values, $K_\alpha/\sqrt n$.

+ +

+ +

Most tables finish giving the exact critical values for $D_n$ and swap to giving the asymptotic values for $\sqrt n D_n$, $K_\alpha$ (as a single table row) somewhere between $n=20$ and $n=40$, from which the critical values of $D_n$ for any $n$ can readily be obtained.
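
+ +

As a small aside (my own sketch, not part of the original answer), $D_n$ itself is easy to compute directly, and the asymptotic critical value is just $K_\alpha/\sqrt n$; for example, taking $K_{0.05}\approx 1.358$ (check a table for the exact constant):

+ +

set.seed(1)
n  <- 25
x  <- rnorm(n)
xs <- sort(x)
i  <- seq_len(n)
Dn <- max(i / n - pnorm(xs), pnorm(xs) - (i - 1) / n)  # sup |F_n - F| against F = standard normal
Dn
ks.test(x, pnorm)$statistic   # the same value as computed by R
1.358 / sqrt(n)               # approximate 5% critical value for D_n from the asymptotic K_0.05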

+ +
+ +

$\text{Responses to followup questions:}$

+ +

1)

+ +
+

How do we obtain the distribution of $D_n$ when $n$ is fixed?

+
+ +

There are a variety of methods for obtaining the distribution of the test statistic for small $n$; for example, recursive methods build the distribution at some given sample size in terms of the distribution for smaller sample sizes.

+ +

There's discussion of various methods given here, for example.

+ +

2)

+ +
+

If I get the value of $D_\text{max}$ and the sample size is $n$, I have to calculate $Pr(K<=x)$, right?

+
+ +

Your test statistic is your observed sample value of the $D_n$ random variable, which will be some value, $d_n$ (what you're calling $D_\text{max}$, but note the usual convention of upper case for random variables and lower case for observed values). You compare it with the null distribution of $D_n$. Since the rejection rule would be ""reject if the distance is 'too big'."", if it is to have level $\alpha$, that means rejecting when $d_n$ is bigger than the $1-\alpha$ quantile of the null distribution.

+ +

That is, you either take the p-value approach and compute $P(D_n> d_n)=1-P(D_n\leq d_n)$ and reject when that's $\leq\alpha$ or you take the critical value approach and compute a critical value, $d_\alpha$, which cuts off an upper tail area of $\alpha$ on the null distribution of $D_n$, and reject when $d_n \geq d_\alpha$.

+ +
+

By formula 14.3.9 of Numerical Recipes, we should calculate a value got from the expression in the brackets - should that be the x?

+
+ +

14.3.9 looks like it has a typo (one of many in NR). It is trying to give an approximate formula for the p-value of ""observed"" (that is, my ""$d_n$"", your $D_\text{max}$), by adjusting the observed value so you can use the asymptotic distribution for even very small $n$ (in my diagram, that corresponds to changing the $y$-value of the observed test statistic via a function of $n$, equivalent to pushing the circles 'up' to lie very close the dotted lines) but then it (apparently by mistake) puts the random variable (rather than the observed value, as it should) into the RHS of the formula. The actual p-value must be a function of the observed statistic.

+ +

3)

+ +
+

We make tests and get a distribution, right?

+
+ +

I don't know what you mean to say there.

+ +
+

Could you please explain your figure in a ""test"" way?

+
+ +

My figure plots the 5% and 1% critical values of the null distribution of $D_n$ for sample sizes 1 to 40 (the circles) and also the value from the asymptotic approximation $K_\alpha/\sqrt n$ (the lines).

+ +

It looks to me like you have some basic issues with understanding hypothesis tests that's getting in the way of understanding what is happening here. I suggest you work on understanding the mechanics of hypothesis tests first.

+ +
+

That means there is no error in 14.4.9 of NR .

+
+ +

(Presumably you mean 14.3.9, since that's what I was discussing.)

+ +

Yes there is an error. I think you may have misunderstood where the problem is.

+ +

The problem isn't with ""$(\sqrt{n}+0.12+0.11/\sqrt{n})$"". It's with the meaning of the term they multiply it by. They appear to have used the wrong variable from the LHS in the RHS formula, putting the random variable where its observed value should be.

+ +

[When the thing you're reading is confused about that, it's not surprising you have a similar confusion.]

+",2013-10-19 21:31:24.453 +57851,546.0,1,79542.0,,,Approximating the relative quantities of coins in Canada,,CC BY-SA 3.0,"

Would it be possible to accurately approximate the relative quantities of Loonies, Twoonies, quarters, dimes, nickels (and perhaps the discontinued penny) in circulation simply from obtaining a large enough sample of coins through everyday use? By everyday use I refer to the coins you get back in change when you make a purchase in a grocery store, for example.

+ +

I suppose this is a 2 part question:

+ +
  1. Is the method of sampling sufficient, or is there some kind of bias introduced because you are collecting samples through a deterministic process (of collecting change)? What size of sample would you need?
  2. If the sampling is sufficient for an accurate approximation, can you use it to determine the relative quantities of each coin type in circulation? Or, for example, is it that the sample size necessary to accurately approximate the relative quantities would itself change the relative quantities of each coin type in circulation?
+",2013-10-19 21:33:32.747 +57893,22884.0,1,,,,Calculate the quantile for a mixed pdf,,CC BY-SA 3.0,"

I am trying to draw random numbers that follow a two-part pdf

+ +

a) $|x|< x_0: f(x)=\text{constant}\quad \to \quad F(p)=a+(b-a)p$

+ +

b) $|x|> x_0: f(x)=\exp(-|x|)\quad \to \quad F(p)=-\ln(1-p)$

+ +

that is, the density is constant between $(-x_0,x_0)$ and falls off exponentially outside it.

+ +

Knowing the quantile function $F(p)$ ($0\leq p\leq 1$) of each part, how can I construct the quantile function that describes the above two-part pdf?
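
+ +

One common way (a sketch of my own, not part of the question) is to glue the piecewise inverse CDFs at the probabilities where the pieces meet. Assuming the density is symmetric and the flat middle piece carries a known total mass w (so each exponential tail carries (1-w)/2), an R version could look like this:

+ +

q_mixed <- function(p, x0, w) {
  p_lo <- (1 - w) / 2                # P(X < -x0)
  p_hi <- 1 - p_lo                   # P(X <  x0)
  ifelse(p < p_lo, -x0 + log(p / p_lo),                 # left exponential tail
  ifelse(p > p_hi,  x0 - log((1 - p) / p_lo),           # right exponential tail
                   -x0 + 2 * x0 * (p - p_lo) / w))      # flat middle: linear in p
}

u <- runif(1e5)
x <- q_mixed(u, x0 = 1, w = 0.5)     # x0 and w are arbitrary illustrative values
hist(x, breaks = 100)                # flat centre with exponential tails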

+",2013-10-20 21:17:49.097 +57937,22907.0,1,,,,How to prove that a t-distribution can be written as a ratio distribution?,,CC BY-SA 3.0,"

If $X \sim N(0,1)$ and $Y \sim \chi^2(n),$ then it's ""known"" that $Z = X/\sqrt{Y/n}$ is $t$ distributed.

+ +

Is there a proof of this anywhere, so that in the end one can see the $t$ distribution emerge?

+",2013-10-21 14:58:19.417 +57852,22865.0,1,,,,A question about the multistart heuristic and pseudo convergence,,CC BY-SA 3.0,"

I'm teaching myself MCMC methods and I encountered this passage in a book that I am not able to make heads or tails of:

+ +
+

The phenomenon of pseudo-convergence has led many people to the idea of comparing + multiple runs of the sampler started at different points. If the multiple runs appear to + converge to the same distribution, then—according to the multistart heuristic—all is well. + But this assumes that you can arrange to have at least one starting point in each part of the + state space to which the sampler can pseudo-converge. If you cannot do that—and in the + black box situation you never can—then the multistart heuristic is worse than useless: it + can give you confidence that all is well when in fact your results are completely erroneous.

+
+ +

Can anyone explain this a little better?

+",2013-10-19 21:50:57.833 +57853,503.0,2,,57851.0,,,,CC BY-SA 3.0,"

The bigger problem is going to be part 1, not part 2.

+ +

It will be relatively easy to get a big sample of coins. But how do you know those coins are a random sample? Maybe people where you live use more of a particular coin than people in other parts of Canada. You certainly use money in a way that is not the same as everyone else.

+ +

For example, some people will pay for nearly everything with credit or debit cards; some will make even large purchases with cash. If you only buy cheap stuff with cash, you are going to get smaller coins. If you tend to have a lot of small bills and coins in your wallet, you will get smaller coins.

+ +

Probably not possible to get a truly random sample, but I'd try to get samples from different people in different parts of the country (rural/urban; west, center, Atlantic, etc.) and different ages, incomes etc.

+",2013-10-19 21:53:18.657 +57854,12683.0,2,,57785.0,,,,CC BY-SA 3.0,"

The motivations for the use of an order statistic as an estimator are no different from the motivations for the use of any statistic as an estimator. For example, if you want to estimate the mean $\mu$ of a continuous uniform distribution from $0$ to $2\mu$, the sample maximum is the complete sufficient statistic for $\mu$, so both the maximum-likelihood & the uniformly minimum-variance unbiased estimators are based on it. If you want to estimate the mean $\theta$ of a normal distribution, the sample mean is the complete sufficient statistic for $\theta$, so both the maximum-likelihood & the uniformly minimum-variance unbiased estimators are based on it.

+ +

It's not true in general that the first order statistic is a good estimator of a location parameter (in any reasonable sense of 'good'). Recall the mean of a normal distribution is a location parameter— in this case the sample minimum would not even be a consistent estimator. You've perhaps come across location parameters added to distributions like the Weibull, whose two-parameter version has a lower bound at zero. In cases like this the sample minimum is a consistent estimator of the population minimum, & I daresay a decent starting point for fitting algorithms.

+",2013-10-19 22:31:32.813 +57855,22865.0,1,57856.0,,,Why is the Dirichlet Process unsuitable for applications in Bayesian nonparametrics?,,CC BY-SA 3.0,"
+

The discrete nature of the DP makes it unsuitable for general applications in Bayesian nonparametrics, but it is well suited for the problem of placing priors on mixture components in mixture modeling.

+
+ +

This quote is from Hierarchical Dirichlet Processes (Teh, et al, (2006)$^{[1]}$) and I was looking for an explanation about what it means. Bayesian nonparametrics seems to be too vague a term for me to understand what the author is referring to.

+ +

${[1]}$ Teh, Y. W., Jordan, M. I., Beal, M. J., Blei, D. M. (2006): ""Hierarchical Dirichlet Processes"". Journal of the American Statistical Association, 101, pp. 1566–1581.

+",2013-10-19 23:54:06.537 +57856,7007.0,2,,57855.0,,,,CC BY-SA 4.0,"

With probability one, the realizations of a Dirichlet Process are discrete probability measures. A rigorous proof can be found in

+

Blackwell, D. (1973). "Discreteness of Ferguson Selections", The Annals of Statistics, 1(2): 356–358.

+

The stick breaking representation of the Dirichlet Process makes this property transparent.

+
  1. Draw independent $B_i\sim\mathrm{Beta}(1,c)$, for $i\geq 1$.
  2. Define $P_1=B_1$ and $P_i=B_i \prod_{j=1}^{i-1}(1-B_j)$, for $i>1$.
  3. Draw independent $Y_i\sim F$, for $i\geq 1$.
  4. Sethuraman proved that the discrete distribution function $$G(t,\omega)=\sum_{i=1}^\infty P_i(\omega) I_{[Y_i(\omega),\infty)}(t)$$ is a realization of a Dirichlet Process with concentration parameter $c$ and centered at the distribution function $F$.
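
+ +

To see the discreteness concretely, here is a small R sketch of my own (a truncated stick-breaking draw, with an arbitrary concentration parameter and truncation level, centred at $F=N(0,1)$):

+ +

set.seed(1)
c_par <- 2                            # concentration parameter (arbitrary choice)
K     <- 500                          # truncation level for the infinite sum
B <- rbeta(K, 1, c_par)
P <- B * cumprod(c(1, 1 - B[-K]))     # stick-breaking weights P_i
Y <- rnorm(K)                         # atoms drawn from the centring distribution F
G <- stepfun(sort(Y), c(0, cumsum(P[order(Y)])))  # one (truncated) realization: a discrete CDF
plot(G, do.points = FALSE)            # jumps at the atoms, as the discreteness result predicts
curve(pnorm(x), add = TRUE, lty = 2)  # the continuous centring distribution, for comparison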

The expectation of this Dirichlet Process is simply $F$, and this may be the distribution function of a continuous random variable. But, if random variables $X_1,\dots,X_n$ form a random sample from this Dirichlet Process, the posterior expectation is a probability measure that puts positive mass on each sample point.

+

Regarding the original question, you can see that the plain Dirichlet Process may be unsuitable to model some problems of Bayesian nonparametrics, like the problem of Bayesian density estimation, but suitable extensions of the Dirichlet Process are available to handle these cases.

+",2013-10-20 00:31:34.960 +57857,22867.0,1,,,,when is an estimator consistent?,,CC BY-SA 3.0,"

Say there are parameters $\theta$ such that $\theta_i > 0$ and $\sum_i \theta_i = 1$ and a model such as $p(x) = \sum_{i=1}^n \theta_i p_i(x)$ where $p_i(x)$ are fixed and defined over a domain of random variable $X$. Say $X$ is a random variable which is a categorical variable.

+ +

Now, say we partition $x$ into equivalence classes, such that we have a new random variable $Y$ which is a more coarse version of $X$. This means that $Y = f(X)$ and $f$ maps several values of $X$ to the same value of $Y$.

+ +

Under what conditions will a consistent estimator for $\theta$ under the $Y$ model also be a consistent estimator for $\theta$ under the $X$ model? (By the $Y$ model I am referring to $p(y) = \sum_{i=1}^n \theta_i p_i(y)$.) I have a feeling this largely depends on Rao-Blackwell, but I am not sure exactly how to apply it here.

+ +

Clearly there are cases where consistency is hopeless from the $Y$, for example, if $f$ maps all values of $X$ to just a single value. But say it is at the very least partitioning the space into two values? Can Rao-Blackwell help here? If it won't show consistency, what would it show?

+ +

EDIT: What would be a way to turn a consistent estimator under the $Y$ model into a consistent estimator for the $X$ model? Or at least into a better estimator?

+ +

EDIT / refinement: Say we have multiple functions, $f_1$, $f_2$, etc., and for each one of them there is a corresponding $Y_j$ model. What would be the assumptions on the $f_j$, and a way to get a consistent estimator for the $p(X)$ model from the $Y_j$ models?

+",2013-10-20 02:11:50.450 +57858,10135.0,2,,56970.0,,,,CC BY-SA 3.0,"

You should plot your residuals vs. explanatory variables (i.e. the $X_i$'s) and residuals vs. fitted values to see if there is anything wrong with the model. There are other diagnostic plots. In R you can use the function glm.diag.plots in package boot. See the code below and also this post. Here I will also use package MASS to load a dataset.

+ +
library(MASS)
+data(menarche) 
+plot(Menarche/Total ~ Age, data=menarche)
+glm.out = glm(cbind(Menarche, Total-Menarche) ~ Age,family=binomial(logit), data=menarche)
+library(boot)
+glm.diag.plots(glm.out) 
+
+ +

+ +

And to have some more fun with the fitted values:

+ +
plot(Menarche/Total ~ Age, data=menarche)
+lines(menarche$Age, glm.out$fitted, type=""l"", col=""red"")
+
+ +

+",2013-10-20 02:22:36.340 +57859,22866.0,1,,,,Should I pick a constant sample size for regressions on data with different available n?,,CC BY-SA 4.0,"

I am using OLS to estimate the effects of various factors on the sales of different items. The data are monthly, and vary somewhat in the number of monthly observations available (some of the items have only been sold for, say, 18 months while others have data going back for years).

+ +

Should I pick some n (e.g. 18) and stick with it for modeling all the products? This option is appealing because I'm more interested in more recent effects and because unmodeled and difficult-to-know-about factors might be present in the more distant data. Also, I'm getting good fits and high significance for the relevant variables even with the smaller sample sizes, so I don't ""need"" to increase the sample size from that perspective.

+ +

I will need to produce reporting on this periodically, so a desirable property would be that parameter estimates are fairly stable from report to report. This is a possible argument for using the larger sample sizes where available.

+",2013-10-20 02:33:34.697 +57860,5987.0,2,,57852.0,,,,CC BY-SA 3.0,"

It isn't clear to me whether you're asking what pseudo-convergence is, or whether you're asking how multistart can fail to detect that pseudo-convergence is happening, so I'll try to answer both briefly.

+ +

Imagine that you're using MCMC to sample from a distribution where $X$ has a high probability (say 99.9%) of being uniformly distributed between 0 and 1, and a small probability (say 0.1%) of being uniformly distributed between 1,000,000 and 1,000,001. This is a black box problem, so in constructing the proposal distribution for the MCMC sampler you might do something naive like considering jumps that are N(0,1) from the current point. Under these circumstances, if you start at a point in [0,1], then it's virtually impossible that your MCMC sequence will ever reach the isolated interval [1,000,000, 1,000,001]. Your MCMC sampler will then converge very nicely to a U(0,1) distribution. Conventional convergence tests will look great, but you'll have converged to an incorrect distribution.

+ +

You might argue that missing that other 0.1% of the distribution really doesn't matter. Depending on what aspect of the distribution you're interested in, this could be a very serious problem. For example, if you want to estimate $E[X]$, you'll get 0.5 as your estimate, when it should be around 1,000!

+ +

This is an example of false or pseudo convergence.
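
+ +

To make this concrete, here is a toy random-walk Metropolis sketch in R (my own illustration of the example above); started inside [0,1] with an N(0,1) proposal, the chain essentially never finds the far-away interval:

+ +

set.seed(1)
dtarget <- function(x) 0.999 * dunif(x, 0, 1) + 0.001 * dunif(x, 1e6, 1e6 + 1)

n_iter <- 50000
x <- numeric(n_iter)
x[1] <- 0.5                                # start inside [0, 1]
for (it in 2:n_iter) {
  prop   <- x[it - 1] + rnorm(1)           # naive N(0,1) random-walk proposal
  accept <- runif(1) < dtarget(prop) / dtarget(x[it - 1])
  x[it]  <- if (accept) prop else x[it - 1]
}
mean(x)   # about 0.5 -- looks nicely converged, yet the true mean is roughly 1000
max(x)    # the chain never visits [1e6, 1e6 + 1]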

+ +

Now, suppose you pick a bunch of starting points and run the MCMC sampler from each of those starting points. As long as each of these starting points is close to [0,1], each MCMC run will eventually converge to a U(0,1) distribution, and you'll be led to believe that $X$ really does have a U(0,1) distribution. That's the potential danger of relying on the multistart heuristic.

+ +

The example that I've used here is admittedly extreme. However, pseudo convergence is often reported in practice, particularly on problems in which we're sampling in a very high dimensional space and there are lots of isolated regions with reasonably high probability.

+",2013-10-20 02:45:44.317 +57861,22868.0,1,,,,Can the t-distribution be defined as the distribution on the true mean of a sampled normal?,,CC BY-SA 3.0,"

Wikipedia says now, here in the introduction:

+ +

http://en.wikipedia.org/wiki/Student%27s_t-distribution

+ +

""... then the t-distribution (for n-1) can be defined as the distribution of the location of the true mean, relative to the sample mean and divided by the sample standard deviation... In this way the t-distribution can be used to estimate how likely it is that the true mean lies in any given range.""

+ +

Is this right? It seems not right to me. How can we have a distribution on the true mean after obtaining a sample, without some sort of Bayesian prior? I understand we can get a confidence interval for the true mean. But a distribution?

+",2013-10-20 03:31:09.140 +57862,9049.0,2,,57836.0,,,,CC BY-SA 3.0,"

Most probably the packages lmerTest and lsmeans provide readily available routines for what you are looking for. Mind you, neither of them uses MCMC methodology. If you want to use something resampling-based, you can use lme4's native bootMer() function to bootstrap your model and get parametric bootstrap estimates (ver. 1.0-4 or newer).

+",2013-10-20 04:35:19.480 +57863,9456.0,1,,,,Issues with calculating gradient descent operation,,CC BY-SA 3.0,"

I have an issue when using gradient ascent. I have some synthetic data; after my first iteration the objective function decreases, and from the second iteration onwards it keeps on increasing. Is this possible for a convex function? I also have a very low learning rate (0.001). Any suggestions?

+",2013-10-20 05:50:06.650 +57864,6204.0,2,,57861.0,,,,CC BY-SA 3.0,"

You don't have a distribution on the true mean; you have a distribution on the difference between the true mean and the sample mean, and this difference is scaled by the sample standard deviation (which is another, separate random variable). The true mean is fixed.

+ +

Let $X \sim N(\mu,\sigma^2)$ and let $x_1,\dots,x_N$ be an i.i.d. sample of size $N$ from $X$. Let $\bar{X}$ denote the sample mean and $S^2$ denote the sample variance. Then

+ +

$$\frac{\bar{X}-\mu}{\sqrt{S^2/N}} \sim t_{N-1}$$

+ +

The relevant section of the wikipedia article you linked is http://en.wikipedia.org/wiki/Student%27s_t-distribution#Derivation
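
+ +

A quick simulation sketch in R (my own addition) makes the claim easy to check empirically:

+ +

set.seed(1)
N <- 10; mu <- 3; sigma <- 2
t_stat <- replicate(10000, {
  x <- rnorm(N, mu, sigma)
  (mean(x) - mu) / sqrt(var(x) / N)    # the pivotal quantity above
})
qqplot(qt(ppoints(10000), df = N - 1), t_stat)  # compare against a t distribution with N-1 df
abline(0, 1)                                    # points should hug this line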

+",2013-10-20 06:02:07.550 +57865,6204.0,2,,57863.0,,,,CC BY-SA 3.0,"

Sounds like something is probably wrong in your code or you have your step size (learning rate) set too large. If you coded it properly, the objective function at iteration t+1 should always be at least as high as at iteration t. This feature is what makes it an ""ascent"" algorithm. We might be able to give you more insight if you provide your code and, better yet, a reproducible example.

+",2013-10-20 06:21:55.823 +57866,22860.0,2,,50739.0,,,,CC BY-SA 3.0,"

If I understood you correctly, since you have three categories in your data, you need to put two dummy variables in your regression model. If beta1 and beta2 are not significant then you can conclude there is no difference in the means between the two groups. How about the third group?

+ +

It is easier to conduct an ANOVA and then post hoc multiple comparisons.
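
+ +

A minimal, self-contained R sketch of both routes (my own illustration with simulated data):

+ +

set.seed(1)
group <- factor(rep(1:3, each = 20))
y <- rnorm(60, mean = c(0, 0.5, 1)[group])
fit <- lm(y ~ group)
summary(fit)              # two dummy coefficients: groups 2 and 3 each compared with group 1
anova(fit)                # overall F test that the three group means are equal
TukeyHSD(aov(y ~ group))  # post hoc pairwise comparisons between the groups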

+",2013-10-20 08:17:43.387 +57977,22919.0,1,,,,Two independent groups and three dependent variables,,CC BY-SA 3.0,"

How do I test for differences between two groups on three dependent variables? Is that an ANOVA or multiple t-tests with an adjusted alpha level?

+",2013-10-22 00:32:26.990 +57867,1501.0,1,58168.0,,,Bayesian MMSE estimators from a transformation of the observations,,CC BY-SA 3.0,"

Consider a random variable X whose value we want to estimate using a Bayesian MMSE estimator. Let $O_1(X)$ be a set of observations which depend on $X$ in some complex way (captured by $P(O_1|X)$) then the MMSE estimator is the conditional mean $\hat{X}_1=\mathbb{E}[X|O_1]$. Now consider another set of observation which is a possibly complex transformation of the first $O_2(O_1)$ and the corresponding MMSE estimator $\hat{X}_2=\mathbb{E}[X|O_2]$. Now it is obvious that if the mapping $O_2(O_1)$ is deterministic then $MSE[\hat{X}_1]\le MSE[\hat{X}_2]$ from the minimality of the MSE of $\hat{X}_1$.

+ +

My question is whether this is also true when the mapping $O_2(O_1)$ is probabilistic, that is, defined by a conditional distribution $P(O_2|O_1)$. Intuitively, it should be, as any stochasticity in the mapping just seems to introduce additional noise that does not depend on $X$. But I wonder if one can show this explicitly.

+ +

EDITED: as the MMSE estimator is unbiased, indeed $MSE[\hat{X}_i]=V[\hat{X}_i]$

+",2013-10-20 08:46:34.413 +57868,221.0,2,,57648.0,,,,CC BY-SA 3.0,"

Preface

+ +

I work with recommender systems on daily basis and have also never heard of the application of such a model as recommender system. I can only speculate about the reasons though.

+ +

The main overall reason might be that recommender systems are often applied in a domain where the price/cost of an item is too small to force the customer +to invest time into making a nearly-optimal decision, maximizing his utility. This should be kept in mind in the following section. Such domains +include e-commerce or news portals (where articles are recommended) or sites like tastekid.com, where the decision at this step costs only a click, i.e. virtually nothing.

+ +

Reasoning

+ +

The described conditional multinomial model requires (or works best with) ...

+ +
  • characteristics of the customer
  • characteristics of the items
  • assumed rationality when it comes to the decision
+ +

Let's step through every point

+ +

Characteristics of the customers

+ +

Besides some basic demographic information like gender, address and (maybe) age, little is known. The lower the price of an item (see above), the harder it is to justify requesting a survey before the selection process starts. Activity data (bought items, ratings etc.), on the other hand, can be collected without any work from the customer and can be used to describe the customer, following the motto ""you are what you are interested in"". The items the customer is interested in (the preferences) implicitly capture what is important to the customer.

+ +

Characteristics of the items

+ +

Building a model based on the characteristics of an item is already done, either via ""content-based collaborative filtering"" or a model-based approach. These are used, e.g., to solve the cold-start problem, i.e. when a brand-new recommender system does not have (enough) preferences yet.

+ +

The drawback here, however, is that it is hard to automatically collect the properties of an item. Imagine the case of fashion: some are easy (color, brand), some are very hard (how the cloth feels on the skin, how it looks if my hips are broader than average). Sometimes it is completely impossible because it entirely depends on the reception of the product, e.g. in the case of movies. For certain items, such information can be collected by humans or by a very sophisticated system that understands semantics and language. It is not clear that the resulting improvement will outweigh the costs.

+ +

So instead of saying ""Item A is similar to item B due to the properties p1, p2, ..."", it is easier to say ""a lot of people have bought both item A and item B. I don't know why, but they are similar enough for the purpose of a recommender system"". So the preferences implicitly capture how similar two items are.

+ +

Assumed rationality when it comes to the decision

+ +

We are humans and we pretend to be rational all the time. If, e.g., the price or other circumstances force us to think hard about a decision, it might be the case that the rational part of the decision is higher than average. But when it comes to using advertising to sell people stuff (and yes, recommendations can be seen as advertising), marketing will tell you that rationality plays a lesser role.

+ +

Additionally, people often do not know beforehand which properties are most important to them in order to maximize their personal utility function. If this were the case, every buying process could be described via the use of a search engine, where a) all relevant properties are listed, b) the customer selects all properties relevant to him and names the product of interest, and the search engine delivers exactly the right results.

+ +

Instead, people have a basic goal (e.g. buying a suit), but then browse around to see how products appeal to them and/or to get inspired. Making a buying decision is still partially rational (budget, invested time) but often comes down to ""what feels right"". Of course, every domain has its own distribution of rationality and emotionality. The more technical the domain, the more the facts play an important role. But even then the customer might select a brand due to the barrage of advertising, which he would not have named as a primary criterion beforehand.

+ +

So building an economic model here might still work, and it is surely correct, but it might be entirely over the top. Additionally, one might have to build a separate model for each type of item a shop is selling.

+ +

Summary

+ +

Building a recommender system entirely based on preferences is often done because ...

+ +
  • it is simple (=> cheap)
  • it can be done automatically, no extra work from the customer is required (=> cheap)
  • it works (good enough), so that a more complicated model might not outweigh the additional costs.
+ +

But: there are domains where such an economic model will be better. I do not doubt that a good estate agent, and hence a good expert system based on an economic model, will easily outperform a recommender system based on preferences. I have regularly observed that recommendations made by human experts are often better than automatic ones. However, the automatic ones are still good and can be produced en masse without too much cost, so that an expert can focus on more sophisticated tasks.

+",2013-10-20 11:41:33.910 +57869,22262.0,1,60004.0,,,What can I read to give me a meta-view of statistics as a field,,CC BY-SA 3.0,"

What can I read that will give me a meta-view of the diverse field of statistics and data science? With few exceptions, much of what I get my hands on goes straight into formulae and methodologies. Preferably something sufficiently high-level as to bring in diverse areas such as econometrics, psychometrics, machine learning, etc.

+ +

To be more concrete, preferably something that:

+ +
  • Summarises and discusses the various branches of statistics/data science; what problems are encountered in each branch.
  • Talks about differences between the branches and their histories.
  • Contrasts methodological approaches.
+",2013-10-20 12:11:27.733 +57870,22872.0,1,57873.0,,,Interpretation of PCA biplot?,,CC BY-SA 3.0,"

I just ran my first ever PCA, so please excuse any naivety on my part.

+ +

As input, I used five years worth of the following:

+ +
  • S&P/ASX 200 A-REIT
  • S&P/ASX 200 Consumer Discretionary
  • S&P/ASX 200 Consumer Staples
  • S&P/ASX 200 Energy
  • S&P/ASX 200 Financial-x-A-REIT
  • S&P/ASX 200 Health Care
  • S&P/ASX 200 Industrials
  • S&P/ASX 200 Information Technology
  • S&P/ASX 200 Materials
  • S&P/ASX 200 Resources
  • S&P/ASX 200 Telecommunication Services
  • S&P/ASX 200 Utilities
+ +

Using R, I simply ran the following commands:

+ +
+arc.pca1 <- princomp(sp_sector_data, scores=TRUE, cor=TRUE)
+summary(arc.pca1)
+plot(arc.pca1)
+biplot(arc.pca1)
+
+ +

Summary

+ +
+Importance of components:
+                         Comp.1     Comp.2     Comp.3
+Standard deviation     2.603067 1.05203261 0.88394057
+Proportion of Variance 0.564663 0.09223105 0.06511258
+Cumulative Proportion  0.564663 0.65689405 0.72200662
+
+                           Comp.4     Comp.5     Comp.6
+Standard deviation     0.84122312 0.76978259 0.73901015
+Proportion of Variance 0.05897136 0.04938044 0.04551133
+Cumulative Proportion  0.78097798 0.83035842 0.87586975
+
+                           Comp.7     Comp.8     Comp.9
+Standard deviation     0.66409102 0.62338449 0.52003850
+Proportion of Variance 0.03675141 0.03238402 0.02253667
+Cumulative Proportion  0.91262116 0.94500518 0.96754185
+
+                          Comp.10    Comp.11      Comp.12
+Standard deviation     0.45637805 0.42371864 0.0409804189
+Proportion of Variance 0.01735674 0.01496146 0.0001399496
+Cumulative Proportion  0.98489859 0.99986005 1.0000000000
+
+ +

Loadings

+ +
+Loadings:
+        Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
+RE      -0.235         0.520 -0.533 -0.438  0.355 -0.150
+disc    -0.332               -0.125                0.294
+staples -0.295  0.226                      -0.211  0.554
+energy  -0.332 -0.251         0.172  0.176        -0.130
+fin_RE  -0.323               -0.118 -0.130         0.384
+health  -0.224  0.465 -0.124 -0.193  0.603  0.537 -0.112
+ind     -0.337                                          
+IT      -0.224  0.145 -0.757        -0.461        -0.312
+mat     -0.329 -0.351         0.295         0.126 -0.116
+res     -0.335 -0.350         0.297         0.123 -0.133
+telco   -0.161  0.609  0.327  0.609 -0.311        -0.113
+util    -0.270  0.160  0.146 -0.256  0.234 -0.694 -0.509
+
+        Comp.8 Comp.9 Comp.10 Comp.11 Comp.12
+RE      -0.217                               
+disc     0.309  0.567  0.596                 
+staples -0.688        -0.141                 
+energy         -0.215  0.240  -0.783  -0.165 
+fin_RE   0.374 -0.724          0.207         
+health                                       
+ind      0.398  0.311 -0.743  -0.221         
+IT      -0.183                               
+mat     -0.127                 0.461  -0.638 
+res     -0.123                 0.226   0.752 
+telco    0.116                               
+util                           0.115         
+
+               Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
+SS loadings     1.000  1.000  1.000  1.000  1.000  1.000
+Proportion Var  0.083  0.083  0.083  0.083  0.083  0.083
+Cumulative Var  0.083  0.167  0.250  0.333  0.417  0.500
+
+               Comp.7 Comp.8 Comp.9 Comp.10 Comp.11
+SS loadings     1.000  1.000  1.000   1.000   1.000
+Proportion Var  0.083  0.083  0.083   0.083   0.083
+Cumulative Var  0.583  0.667  0.750   0.833   0.917
+
+               Comp.12
+SS loadings      1.000
+Proportion Var   0.083
+
+ +

Scree Plot

+ +

+ +

Biplot

+ +

+ +

Is this useful?

+ +

Am I right in assuming that these indices are correlated with each other?

+ +

Does the biplot show some sort of clustering?

+ +

What if anything, does any of this mean?

+",2013-10-20 13:28:51.057 +57871,12808.0,2,,46384.0,,,,CC BY-SA 3.0,"

Suppose you have a d-dimensional observation vector. You may assume that your emission probabilities come from a single d-dimensional Gaussian density or a mixture of M Gaussians in which case the density is a linear combination of M component Gaussian densities. In the first case you estimate mean vector and cov. matrix for each state, in the second case you do this for all M component densiites together with their mixing weights.

+",2013-10-20 13:34:31.987 +57872,20120.0,2,,57836.0,,,,CC BY-SA 3.0,"

This answer would have deserved comment status at best, but comments are too short and don't really lend themselves to extended mock code. +Also, it seems you already got a more sensible answer by @user11852, but I wanted to give a more general answer (though see the comments below!).

+ +

If all else fails, one may always (?) obtain CIs from bootstrapping. In your specific case, this may be computationally infeasible, since running the model 1000 or so times may take half a century, but it should be fairly foolproof. I don't know R well, so here is some mock code in fake matlab for the 95% CI for the output generated by some parameter estimation function, such as intercepts in lmer. As a special feature, it bootstraps individual subjects and for each subject generates bootstrapped samples of data points for that subject.

+ +
a = data_set
+s = (# of bootstrap iterations, e.g. 1000)
+n = (# of subjects)
+
+% main loop over bootstrap iterations
+for x = 1:s
+
+    % bootstrap over subjects
+
+    % sample with replacement from your subjects (just collect n indices)
+    z = random_sample_with_replacement(1:n)
+
+    % loop over bootstrap selection of subjects
+    % to bootstrap sample individual data points within subject
+    for y = 1:length(z)
+        % for each subject selected for resampling, draw from their data points with replacement
+        for w = 1:length(a(z(y))) 
+            within_subj_boot(w) = random_sample_with_replacement(a(z(y)))
+        end
+        bootsample(y) = within_subj_boot
+    end
+
+    % perform the respective calculation (e.g., lme4) for the bootstrap sample
+    % and store the relevant parameter you want a CI for
+    output(x) = parameter_estimation_function(bootsample)
+end
+
+% check the relevant percentiles of your bootstrapped parameter estimates
+CI = percentile(output,[2.5,97.5])
+
+ +

Most languages will have some wrapper function for something equivalent to this (but a lot more efficient); I think for R it might be the Coin package?
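
+ +

For completeness, a rough base-R translation of the same idea (my sketch, not the author's code); here a is assumed to be a list of per-subject numeric vectors and fit_fun any function returning the parameter of interest:

+ +

boot_ci <- function(a, fit_fun, s = 1000, probs = c(0.025, 0.975)) {
  n   <- length(a)
  out <- numeric(s)
  for (x in seq_len(s)) {
    subj <- sample(n, replace = TRUE)                                      # resample subjects
    boot_sample <- lapply(a[subj], function(v) sample(v, replace = TRUE))  # resample points within each subject
    out[x] <- fit_fun(boot_sample)
  }
  quantile(out, probs)
}

# toy usage: 8 subjects, parameter of interest = grand mean of the subject means
a <- replicate(8, rnorm(20), simplify = FALSE)
boot_ci(a, function(b) mean(sapply(b, mean)))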

+",2013-10-20 13:49:19.927 +57873,12808.0,2,,57870.0,,,,CC BY-SA 3.0,"

PCA projects your data onto a new set of dimensions chosen so that the variance in your data is captured, such that you can classify/cluster the observations visually or by using a hopefully simple algorithm.

+ +

The variance (scree) plot tells you how much variance each of the new dimensions captures, in decreasing order. The biplot is the projection of your data onto the first two principal components (where the captured variance is highest).
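
+ +

As a concrete pointer (my own sketch, using a built-in dataset rather than the asker's sector data), these are the pieces of a princomp fit worth inspecting:

+ +

pca <- princomp(USArrests, cor = TRUE)   # same call pattern as in the question
summary(pca)             # proportion of variance captured by each component
unclass(loadings(pca))   # how strongly each original variable loads on each component
screeplot(pca)           # the variance plot
biplot(pca)              # observations and variable loadings on the first two components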

+",2013-10-20 13:59:09.630 +57874,22752.0,1,57881.0,,,Calculate the error variance in a linear regression model,,CC BY-SA 3.0,"

I am trying to calculate the error variance for the following question but I don't have a clue where to start. Could anyone please help?

+ +

+ +

+",2013-10-20 14:01:35.673 +57875,22611.0,1,,,,Unpaired t-test with only summary means,,CC BY-SA 3.0,"

Is it appropriate to conduct an unpaired t-test with only summary means from both groups?

+ +

For example, let's say you want to compare:

+ +

a. Professor A's scores received on an evaluation by his students from one semester; with
+b. The average evaluation scores among all professors in the university completed by all students from the same semester.

+ +

The teacher evaluation has 15 Likert questions (6-point scale). Constraints:

+ +
  1. You don't know how many students completed the evaluations in either group.
  2. All you have is 15 mean scores for Professor A, and 15 mean scores for the overall university sample.
+ +

It's safe to assume the evaluation is psychometrically sound.

+ +

An unpaired t-test is conducted to compare means from group 1 (15 means from Professor A's students) and group 2 (15 means for all professors in the University). So $N = 30$.

+ +

Is this appropriate?

+",2013-10-20 14:08:17.733 +57876,22874.0,1,,,,Statistical significance in yes/no poll question,,CC BY-SA 3.0,"

I have no statistics background, but am trying to complete my first quantitative research survey write-up for my research in education class and hope someone can direct me.

+ +

I created a poll of 62 teachers and am trying to analyze the following 4 questions

+ +
  1. Do they use social media personally (yes/no)
  2. Do they use SM in the classroom (yes/no)
  3. Do they plan to use it in the classroom in the future (yes/no)
  4. What type of secondary school do you work in (public/private)
+ +

I am running into a challenge with how to complete a ""test of statistical significance for this data""

+ +

The professor suggested using either a t-test or ANOVA and provided links, but the data I have do not seem to fit into the formulas.

+ +

Is anyone able to provide some guidance about how to test this type of survey? Is there a different way to analyze the data?

+",2013-10-20 14:40:07.613 +58097,15658.0,1,58102.0,,,Quantile Regression - Interpretation of a significant quantile,,CC BY-SA 3.0,"

I want to perform a quantile regression on two continuous variables: Y (DV) and X (IV). I want to find out if there is a significant association between Y and X.

+ +

When doing this in R like:

+ +

fit2 <- rq(Y ~ X,tau=c(.05, .25, .5, .75, .95))

+ +

If, say, the coefficient on X at the 75% quantile is significant with a p-value < 0.05 but the rest are not, can I say that X is significant overall? If none of the quantiles are significant, is X not significant overall?

+",2013-10-23 14:51:06.183 +57877,4656.0,2,,57826.0,,,,CC BY-SA 3.0,"

Assuming that the word independent in the opening statement is used in the way that probabilists use the word, and not in the sense of independent versus dependent variable as is common in regression analysis, the joint distribution of the five random variables $Y_{11}, Y_{12}, Y_{13}, Y_{21},Y_{22}$ is the product of the joint distributions of $Y_{11}, Y_{12}, Y_{13}$ and $Y_{21},Y_{22}$, both of which are multivariate normal. This $5$-variate joint distribution is also a multivariate normal distribution in which the mean vector is just the concatenation $(\mu_1, \mu_2)^T$ of the two mean vectors and the covariance matrix is $$\Sigma = \left[\begin{matrix}\Sigma_{11} & 0\\0 & \Sigma_{22}\end{matrix}\right].$$ Thus, the joint distribution of $Y_{11}-Y_{13}+Y_{22}$ and $Y_{21}-Y_{12}$ is a bivariate normal distribution which can be found by the standard methods involving setting up a linear transformation mapping $(Y_{11}, Y_{12}, Y_{13}, Y_{21},Y_{22})$ to $(Y_{11}-Y_{13}+Y_{22},Y_{21}-Y_{12})$ and doing matrix calculations. More simply, the means and variances of $Y_{11}-Y_{13}+Y_{22}$ and $Y_{21}-Y_{12}$, as well as their covariance, can be computed more directly and used in writing down the mean vector and covariance matrix of this bivariate normal distribution.

+",2013-10-20 14:54:33.237 +57878,17635.0,2,,57126.0,,,,CC BY-SA 3.0,"

Setting aside the theoretical issues related to extreme observations, I would not advise a t-test anyway, as you seem to have 3 groups of observations. Hence, you would probably be better off with ANOVA and subsequent multiple comparisons.

+ +

Moreover, as far as your zero-spenders are concerned, I would try to understand how many of them are systematic zeros (that is, people who cannot afford to spend money because they cannot rely upon a disposable income) vs sample zeros (that is, people who are not interested in buying the goods considered in your research).

+",2013-10-20 14:55:11.057 +57879,503.0,2,,57876.0,,,,CC BY-SA 3.0,"

The professor's advice seems odd, if this is all the data you have; also ""t-test of ANOVA"" is not a sensible phrase. Is it a typo of ""t-test or ANOVA""? Even if it is a typo I think it is strange advice. T-tests and ANOVAs are for comparing mean scores. In your question, there don't seem to be any means (or things to take the mean of).

+ +

Your title mentions ""statistical significance"" but that requires some hypothesis that you wish to test. What is your hypothesis?

+ +

What do you want to find out about the four questions that you asked? e.g.

+ +

What percent of teachers said ""yes"" to each?

+ +

(responding to your comment)

+ +

For public vs. private school you will have (for each of the four questions) a 2x2 table of results. Have you studied any statistical method that looks at that?

+ +

Similarly for personal vs. classroom use, you have a 2x2 table.

+ +

Unless you have yet more information....

+ +

How the four questions relate to each other?

+ +

Something else? (if so, what?)

+ +

Do you have any other information about the professors? If so, what do you have and how do you want to use it?

+",2013-10-20 15:11:34.663 +57880,22643.0,1,,,,"Time to Event modeling, fixed but different durations",,CC BY-SA 3.0,"

I am looking at the probability of an event ($E$) for a number of customers. Each customer qualifies for the analysis through a qualifying Action ($A$), and has a finite Duration ($D$) in which to complete the Event. There are a number of interim actions that should have an effect on reaching the Event. Unlike the time-to-event modeling I've done before, the Duration is known ahead of time, but is different for every customer. That is, customer 1 may have a duration of 3 months from their qualifying Action, whereas customer 2 may have a duration of 3 weeks. The duration is always known at the time of the qualifying Action.

+ +

Consider an advance car rental booking. Making the reservation is the qualifying Action. The Event of interest is whether the customer completes a full online profile. The Duration is the time period between contract & pickup. Interim actions might include receiving an email from the agency, calling into a help desk, starting the profile.

+ +

I think that incorporating duration is important because a customer booking 1 day in advance might (should) have a different propensity to complete an online profile than one who books 6 months in advance.

+ +

My instinct is to transform the duration to a proportion of the finite duration. Concretely, qualifying Action is time 0 and everyone has a duration of 100. But I've not seen that actually done in any of my literature review, and it feels like this may lose important information.

+",2013-10-20 16:35:43.337 +57881,10135.0,2,,57874.0,,,,CC BY-SA 3.0,"

$(X'X)^{-1}=\dfrac{1}{150}.\left( + \begin{array}{cc} + 5 & -10 \\ + -10 & 50 \\ + \end{array} +\right)$. $\hat{\beta}=(X'X)^{-1}X'Y=1/150.\left( + \begin{array}{cc} + 5 & -10 \\ + -10 & 50 \\ + \end{array} +\right).\left( + \begin{array}{cc} + 20 \\ + 10 \\ + \end{array} +\right)=\left( + \begin{array}{cc} + 0 \\ + 2 \\ + \end{array} +\right)$
+$\hat{\beta}\sim N(\beta,\sigma^2.(X'X)^{-1})$. You can estimate $\sigma^2$ by $s^2=\dfrac{1}{n-p-1}(y-X\hat{\beta})'(y-X\hat{\beta})$. Now if you want to simultaneously test $H_0: \beta=\beta_0$ vs $H_1:\beta\neq \beta_0$, where $\beta_0$ is a $p$-dimensional constant, then you need to use the $F$ test as follow:

+ +

$F=\dfrac{(\hat{\beta}-\beta_0)'(X'X)^{-1}(\hat{\beta}-\beta_0)}{_ps^2}\sim F_{p,n-p}$. Here $\hat{\beta}-\beta_0=\left( + \begin{array}{c} + -0.25 \\ + 1.75 \\ + \end{array} +\right) +$.
+ And $(\hat{\beta}-\beta_0)'(X'X)^{-1}(\hat{\beta}-\beta_0)=162.1875 +$. Here $p=1$ and $n=N$. If we let $_ps^2=\sigma^2=1$, then $F$ statistics is 162.1875 and we need to compare it with $F_{1,N-1}$. If $P_r(F_{\alpha,1,N-1}\geq 162.1875 )\geq (1-\alpha)$ then $H_0$ cannot be rejected otherwise accept $H_1$. See e.g. page 70 of Linear Regression Analysis: Theory and Computing.

+",2013-10-20 17:00:29.333 +57882,22877.0,1,,,,Clinical statistic problem,,CC BY-SA 3.0,"

You conduct a case-control study of elevated cholesterol and myocardial infarction (MI). Of 20 MI cases, 10 had elevated cholesterol. Of 30 healthy controls, 10 had elevated cholesterol. These results give an odds ratio (OR) of 2.0, with a 95% confidence interval (CI) $= [0.6, 6.4]$.

+ +

Interpret the CI?

+",2013-10-20 17:09:25.137 +57917,14799.0,2,,44772.0,,,,CC BY-SA 3.0,"

If each rater gave 9 ratings -- one for each pair of subjects -- at each time point, if the subjects were re-paired at each time point, and if it is not known who was paired with whom, then I don't think there is any proper way to analyze the data, because of the unknown and unestimable correlations among the ratings from one time to the next.

+ +

If you are willing to treat the data as if there were 18 different subjects at each time point (72 subjects total), then you could do a 1-between, 1-within analysis, where ""between"" and ""within"" are relative to the pairs, which play the role usually played by subjects in such analyses. Pairs are nested within time and crossed with raters. Time is the between-pairs (grouping) factor -- to check for a time difference, you must look at different pairs -- and rater is the within-pair (repeated-measures) factor -- you can check for a rater difference within each pair.
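
+ +

For concreteness, a minimal R sketch of that 1-between, 1-within analysis, assuming the data are reshaped to long format with hypothetical column names (rating, time, rater, pair):

+ +
# d: one row per rating; pair is the unit, time is between-pairs, rater is within-pair
+summary(aov(rating ~ time * rater + Error(pair/rater), data = d))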

+",2013-10-21 07:19:51.677 +57883,22878.0,1,,,,How to test association in contingency tables with very small numbers and proportions?,,CC BY-SA 3.0,"

I have a sample of 197 responses. 8.6% (17) are from Group A, the remainder from Group B. (The groups are mutually exclusive, and not independent.) (If it helps visualize the issue, Group A is under-10s, Group B respondents aged 11+.)

+ +

The responses are then sorted in to groups by type. Type 1 is ""head injuries"", and there are 3 members of this group. 2 are from Group B, 1 is from Group A.

+ +

I would expect the counts for head injuries to be: under 10 (Group A) & head injury: 0.3; under 10 & no head injury: 16.7; over 10 (Group B) & head injury: 2.7; over 10 & no head injury: 179.3.

+ +

I have asked a few friends how to test whether the observed value being so much higher than the expected value is significant. So far the responses seem to favour a Z test, or a chi-squared test with a correction for very small numbers. I'm innumerate in the extreme, but I can manage either well enough -- I'm just not sure which test is appropriate here.
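
+ +

For what it's worth, one way to lay the table out in R and run an exact test, a sketch based on the counts given above:

+ +
tab <- matrix(c(1, 2, 16, 178), nrow = 2,
+              dimnames = list(group  = c(""A (under 10)"", ""B (11+)""),
+                              injury = c(""head injury"", ""no head injury"")))
+chisq.test(tab)$expected   # shows the very small expected counts
+fisher.test(tab)           # exact test, usable despite the small counts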

+",2013-10-20 17:10:06.830 +57884,21168.0,1,,,,Outlier treatment in Vector Autoregression (VAR) Model,,CC BY-SA 4.0,"

Data: Multivariate time series, consisting of two monthly series covering 2010-2013:

+ +
    +
  1. Demand of a product
  2. Rainfall data
  +
+ +

Approach: I am trying to estimate the effect of rainfall on the demand of the product using a VAR (vector autoregression) model. The demand data has some outliers, such as a month of sudden high demand followed by zero values.

+ +

Question: How should I treat these outliers (I am working in R)? I have few data points to begin with, so deleting them is not an option for me.

+",2013-10-20 17:48:42.767 +57885,15183.0,2,,57884.0,,,,CC BY-SA 3.0,"

You can include dummy variables for the outliers if they are caused by special events in the demand for the product. Each dummy takes the value 1 at the outlier and zero otherwise.
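
+ +

For illustration, a minimal sketch with the vars package; the series names, the lag order and the outlier position are placeholders:

+ +
library(vars)
+dummy <- as.numeric(seq_along(demand) == outlier_month)   # 1 in the outlier month, 0 otherwise
+fit   <- VAR(cbind(demand, rainfall), p = 2, type = ""const"", exogen = cbind(dummy = dummy))
+summary(fit)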

+",2013-10-20 18:03:09.000 +57886,22880.0,1,,,,Propensity Score,,CC BY-SA 3.0,"

What are the various methods used for binary classification other than logistic regression?

+ +

What are the advantages of logistic reg. model in developing Propensity score w.r.t. other methods?

+ +

Actually, I have been asked why. I backed it up by saying that it's binary classification, so logit is perfect. Then I was asked why specifically logistic regression, when there are various other binary classification methods.

+",2013-10-20 19:29:08.897 +57887,169.0,1,,,,Is chi square the best approach for looking at number of cases by year,,CC BY-SA 3.0,"

I have the number of people diagnosed with a condition in each quintile of deprivation over five different years. We are interested in whether numbers of diagnoses are going up faster in more deprived quintiles. The data looks like this:

+ +

+ +

It looks to me very much like it rises much faster in the more deprived quintiles, on the left (Q1 and Q2).

+ +

I thought chi square was the best approach but:

+ +
test = structure(list(Q1 = c(98L, 109L, 263L, 323L, 312L),
+                      Q2 = c(90L, 113L, 199L, 237L, 247L),
+                      Q3 = c(70L, 83L, 133L, 166L, 182L), 
+               Q4 = c(20L, 39L, 60L, 87L, 90L),
+                      Q5 = c(38L, 50L, 75L, 101L, 115L)),
+                 .Names = c(""Q1"", ""Q2"", ""Q3"", ""Q4"", ""Q5""),
+                 class = ""data.frame"", row.names =
+                   c(""2008/09"", ""2009/10"", ""2010/11"", ""2011/12"", ""2012/13""))
+
+chisq.test(test)
+
+    Pearson's Chi-squared test
+
+data:  test
+X-squared = 17.285, df = 16, p-value = 0.3674
+
+ +

The thing that I can see is missing is that the years are really ordered -- we are looking at an increase -- but of course the chi-square just treats them as nominal.
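
+ +

One common way to exploit that ordering is a Poisson regression of the counts on a numeric year trend and its interaction with quintile; a hedged sketch using the test data frame above:

+ +
long <- data.frame(count    = unlist(test),
+                   quintile = rep(names(test), each = nrow(test)),
+                   year     = rep(1:nrow(test), times = ncol(test)))
+fit <- glm(count ~ year * quintile, family = poisson, data = long)
+summary(fit)   # the year:quintile terms compare growth rates across quintiles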

+ +

I don't think I can use logistic regression because I'm just looking at a caseload -- I don't have a pool of people who don't have the disorder to compare with, just the increase over time.

+ +

Or is it the right approach and it really just isn't significant?

+ +

Thanks.

+",2013-10-20 19:34:51.553 +57888,22882.0,1,,,,How to interpret results from an experiment where covariate is influenced by the experimental setup?,,CC BY-SA 3.0,"

I am doing a study investigating the effect of a company's heritage on consumer attitude towards the company/brand. For this purpose I have created two texts:

+ +
    +
  • one revealing the history of the company,
  • +
  • and the other one - the control text - revealing some general information about the firm.
  • +
+ +

My results indicated that the two texts differ not only regarding heritage perception, but also regarding information credibility and believability. So the text with history details is also perceived as more credible. And this is also said to influence the attitude.

+ +

My questions are:

+ +
    +
  1. Can I use the perceived text credibility as a covariate in the ANCOVA, although it is measured after the treatment and caused by it?
  2. How can I interpret the ANCOVA output in this case? Can I separate both effects - the one of the manipulation and the one of the covariate?
  3. Would mediation analysis be one possible solution to check whether the effect of the treatment on my dependent variable is fully mediated by text credibility? (See the sketch after this list.)
  +
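
+ +

A minimal R sketch of such a mediation analysis, assuming the mediation package and hypothetical variable names (condition, credibility, attitude in a data frame d):

+ +
library(mediation)
+model.m <- lm(credibility ~ condition, data = d)              # treatment -> proposed mediator
+model.y <- lm(attitude ~ condition + credibility, data = d)   # outcome on treatment + mediator
+med <- mediate(model.m, model.y, treat = ""condition"", mediator = ""credibility"", boot = TRUE)
+summary(med)   # ACME = indirect (mediated) effect, ADE = direct effect of the manipulation
+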
+",2013-10-20 19:37:55.377 +57889,21762.0,2,,57882.0,,,,CC BY-SA 3.0,"

The OR in your sample is 2. You don't know the exact odds ratio in the corresponding population, but at least you can be 95% confident that it is somewhere between 0.6 and 6.4. Since the interval contains the value 1 (no relation between MI and elevated cholesterol), you cannot claim that there truly is such a relation.
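
+ +

For reference, the interval can be reproduced with the usual Woolf (log-odds) approximation:

+ +
tab <- matrix(c(10, 10, 10, 20), nrow = 2,
+              dimnames = list(cholesterol = c(""elevated"", ""normal""),
+                              group = c(""MI"", ""control"")))
+or <- (tab[1, 1] * tab[2, 2]) / (tab[1, 2] * tab[2, 1])   # = 2
+se <- sqrt(sum(1 / tab))                                  # SE of log(OR)
+exp(log(or) + c(-1, 1) * 1.96 * se)                       # approximately 0.63 to 6.4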

+",2013-10-20 19:44:22.163 +57890,22881.0,1,121372.0,,,How to compute the maximum a posteriori probability (MAP) estimate with / without a prior,,CC BY-SA 3.0,"

I am a newbie in this area so I hope someone could explain the following problem to me in plain English.

+ +

Assume I want to use MAP to estimate some parameters on the basis of some observations. I know the method of computing the MAP estimate is $$\hat{\theta}_{\mathrm{MAP}}(x) = \underset{\theta}{\operatorname{argmax}} \ f(x \mid \theta)\, g(\theta)$$

+ +

where $g$ is the prior. However, I cannot find any answers online on how to compute this using a real world example. So here is my proposed question:

+ +

Assume you asked 100 people who they are going to vote for in an election (between 2 candidates, A and B), and assume the end result is that 60% of them say they will vote for A. How do you estimate the result of the election using MAP if:

+ +
    +
  1. candidate A is known to have a popularity of 40% and candidate B 60% (assume this to be the prior distribution)
  2. the popularity is unknown.
  +
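
+ +

To make the argmax concrete, a minimal numerical sketch in R: the Beta(4, 6) prior is just one arbitrary way to encode a prior centred on 40%, and a flat Beta(1, 1) prior corresponds to case 2:

+ +
theta    <- seq(0.001, 0.999, by = 0.001)                       # grid over the parameter
+log_post <- dbinom(60, size = 100, prob = theta, log = TRUE) +  # likelihood of 60/100 answering A
+            dbeta(theta, shape1 = 4, shape2 = 6, log = TRUE)    # assumed Beta(4, 6) prior (mean 0.4)
+theta[which.max(log_post)]                                      # MAP estimate, about 0.58
+# with the flat prior dbeta(theta, 1, 1) the maximum is at 0.6, i.e. the MLE
+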
+ +

I also looked at this answer but I'm still confused: Example of maximum a posteriori estimation

+",2013-10-20 19:52:06.183 +57891,14799.0,2,,55576.0,,,,CC BY-SA 3.0,"

Let $P$ = a 12 x 12 matrix with 1s on the diagonals and the negatives of the partial correlations on the offdiagonals, and let $Q = P^{-1}$. Then the original correlation between variables $i$ and $j$ is $r_{ij} = q_{ij}/\sqrt{q_{ii}q_{jj}}$.
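
+ +

In R, a minimal sketch of this inversion (P is the matrix described above):

+ +
Q <- solve(P)                             # P: 1s on the diagonal, minus the partial correlations elsewhere
+R <- Q / sqrt(outer(diag(Q), diag(Q)))   # r_ij = q_ij / sqrt(q_ii * q_jj)
+# equivalently: R <- cov2cor(Q)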

+",2013-10-20 19:54:22.217 +57894,22885.0,1,,,,PCA scores in a for portfolio replication task: stumble over mean-centering question,,CC BY-SA 3.0,"

I'm trying to implement Principal Component Analysis (PCA) in a portfolio replication procedure. (The replication procedure looks like regression: there is a vector representing the payoffs of an asset under different economic scenarios, and I need to find a linear combination of the payoff vectors of other assets that fits the initial asset as closely as possible.)

+ +

Coefficients I get from regression should tell me how many assets I need to buy and sell to get approximately the same payoff as the asset I'm trying to replicate.

+ +

There is a problem with regression as the candidate asset matrix is ill-conditioned: assets show high correlation. There's a hope that orthogonal principal components could resolve this problem.

+ +

I need to do a PCA decomposition of the candidate asset matrix, take only the first $n$ components, do the optimization and get the component coefficients. Then I need to transform the coefficients back into the original basis.

+ +

Now the problem: PCA usually works with mean-centered data, but if I subtract the means from the original data, I don't know how to interpret the resulting coefficients in my case, nor how to reverse the operation.

+ +

So far I'm doing eigen-decomposition of a covariance matrix, then using eigenvectors to make an orthogonal transformation of the data that is not mean-centered. Then I'm running a regression (actually L1 norm optimization) to get coefficients and transform them back into an original basis. The results are not bad, but I can't stop thinking about the problem with mean-centering, if I'm doing it completely wrong.
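
+ +

A minimal R sketch of the procedure just described (X, y and n are placeholders, and lm stands in for the L1 optimisation):

+ +
V <- eigen(cov(X))$vectors[, 1:n]   # loadings of the first n principal components
+S <- X %*% V                       # component scores from the (not mean-centred) data, as described above
+b <- coef(lm(y ~ S - 1))           # component coefficients; replace with the L1 optimisation if preferred
+w <- V %*% b                       # coefficients expressed in the original asset basis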

+ +

I was hoping to find detailed mathematical reasoning for this problem, but unfortunately failed. I'm very much a noob at this and my math skills are far from good, so I would really appreciate it if you could share some insights on the problem of mean-centering in PCA.

+",2013-10-20 21:37:32.620 +57895,20939.0,1,57912.0,,,How to visualize a fitted multiple regression model?,,CC BY-SA 3.0,"

I am currently writing a paper with several multiple regression analyses. While visualizing univariate linear regression is easy via scatter plots, I was wondering whether there is any good way to visualize multiple linear regressions?

+ +

I am currently just plotting scatter plots like dependent variable vs. 1st independent variable, then vs. 2nd independent variable, etc. I would really appreciate any suggestions.

+",2013-10-20 21:46:37.727 +57896,22843.0,1,57903.0,,,Explain Statistics: Matching formulas for linear regression,,CC BY-SA 3.0,"

I am looking at my statistics book and an online article on linear regression, and was wondering whether anyone can tell me if the two formulations below are actually different. Consider the equation $\hat{y} = ax + b$.

+ +

In my book, a and b are :

+ +

$a = \frac{r \cdot S_{y}}{S_{x}}$

+ +

$b = \bar{y} - a\bar{x}$

+ +

$r = \sum \frac{(x_{i} - \bar{x})(y_{i} -\bar{y})}{S_{x}S_{y}(n-1)}$

+ +

$\displaystyle S_{y} = \sqrt{ \frac{\sum (y_i - \bar{y})^{2}}{(n-1)} }$

+ +

From one online article, a and b are:

+ +

$\displaystyle a = \frac{n \sum x_{i}y_{i} - \sum x_{i} \sum y_{i}}{n \sum x^2_{i} - (\sum x_{i})^2}$

+ +

$b = \bar{y} - a\bar{x}$.

+ +

The a from the online article vaguely looks like covariance in the numerator and the denominator looks like variance but for only one random variable, not two. Can someone explain the discrepancy (if there are any) and construct an argument for my book's choice? I can understand the second formulation mainly because it comes from setting partial derivatives to zero to minimize an objective function and then finding the coefficients a and b.

+",2013-10-20 22:32:47.917 +57897,22887.0,2,,414.0,,,,CC BY-SA 3.0,"

+ +

an 'easy to digest' pie chart example for Rick Astley fans that my students seem to enjoy

+",2013-10-20 23:10:34.503 +57898,19870.0,1,,,,When finding outliers from the Interquartile range why I have to multiply by 1.5?,,CC BY-SA 3.0,"

I was looking at the outlier detection formula which uses the IQR, and I wonder why the IQR should be multiplied by 1.5. Can the constant be increased, e.g. to 3 or 6, to be more ""acid"" (i.e. stringent)? If so, under what criteria?

+",2013-10-20 23:16:57.080 +57899,19750.0,1,57920.0,,,Understanding Feature Hashing,,CC BY-SA 3.0,"

Wikipedia provides the following example when describing feature hashing; but the mapping does not seem consistent with the dictionary defined

+ +

For example, to should be converted to 3 according to the dictionary, but it is encoded as 1 instead.

+ +

Is there an error in the description? How does feature hashing work?

+ +
+

The texts:

+ +
John likes to watch movies. Mary likes too.
+John also likes to watch football games.
+
+ +

can be converted, using the dictionary

+ +
{""John"": 1, ""likes"": 2, ""to"": 3, ""watch"": 4, ""movies"": 5, ""also"": 6, 
+""football"": 7, ""games"": 8, ""Mary"": 9, ""too"": 10}
+
+ +

to the matrix

+ +
[[1 2 1 1 1 0 0 0 1 1]
+ [1 1 1 1 0 1 1 1 0 0]]
+
+
+",2013-10-20 23:27:14.773 +57900,503.0,2,,57898.0,,,,CC BY-SA 3.0,"

Certainly you can change the criterion.

+ +

The 1.5 multiplier is so that a certain proportion of the sample in a normal population will be outside it. But there is nothing sacred about it.

+ +

However, I would caution against any automatic method of selecting outliers.
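
+ +

For illustration, a small R sketch of the fences with an adjustable multiplier (x is an arbitrary numeric sample):

+ +
q      <- quantile(x, c(.25, .75))
+iqr    <- diff(q)
+k      <- 3                                    # 1.5 is the usual choice; 3 flags only far-out points
+fences <- c(q[1] - k * iqr, q[2] + k * iqr)
+x[x < fences[1] | x > fences[2]]               # points outside the fences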

+",2013-10-20 23:38:03.220 +57901,20473.0,2,,57778.0,,,,CC BY-SA 3.0,"

OK. Let's do this, for CV's sake.

+ +

First compact by setting $C=\frac{1}{\sqrt{2\pi\sigma^2_y}}\frac{1}{\sqrt{2\pi\sigma^2_w}} = \frac{1}{2\pi\sigma_y\sigma_w}$, so

+ +

$$f_Y(y) =C \int_{-\infty}^{\infty}\exp\left\{-\frac{(y-w)^2}{2\sigma_y^2}\right\}\exp\left\{-\frac{(w-\mu_w)^2}{2\sigma_w^2}\right\}dw$$

+ +

We have $$\exp\left\{-\frac{(y-w)^2}{2\sigma_y^2}\right\}\exp\left\{-\frac{(w-\mu_w)^2}{2\sigma_w^2}\right\} = \exp\left\{-\frac{y^2-2yw+w^2}{2\sigma_y^2}\right\}\exp\left\{-\frac{w^2-2w\mu_w+\mu_w^2}{2\sigma_w^2}\right\} =\exp\left\{-\frac{y^2}{2\sigma_y^2}-\frac{\mu_w^2}{2\sigma_w^2}\right\} \exp\left\{-\frac{w^2}{2\sigma_y^2}-\frac{w^2}{2\sigma_w^2}\right\}\exp\left\{\frac{2yw}{2\sigma_y^2}+\frac{2w\mu_w}{2\sigma_w^2}\right\}$$

+ +

Setting $s^2\equiv \sigma_y^2+\sigma_w^2$ we arrive at

+ +

$$=\exp\left\{-\frac{y^2}{2\sigma_y^2}-\frac{\mu_w^2}{2\sigma_w^2}\right\} \exp\left\{-\frac{s^2}{2\sigma_y^2\sigma_w^2}w^2\right\}\exp\left\{\frac{\sigma_w^2y+\sigma_y^2\mu_w}{\sigma_y^2\sigma_w^2}w\right\}$$

+ +

Include the first $\exp$ in the constant, $C^*=C \exp\left\{-\frac{y^2}{2\sigma_y^2}-\frac{\mu_w^2}{2\sigma_w^2}\right\}$. Set $$\beta\equiv \frac{s^2}{2\sigma_y^2\sigma_w^2},\qquad \alpha\equiv \frac{\sigma_w^2y+\sigma_y^2\mu_w}{\sigma_y^2\sigma_w^2}$$ to obtain

+ +

$$f_Y(y) =C^* \int_{-\infty}^{\infty}e^{-\beta w^2+\alpha w}dw=C^*\left[ \int_{-\infty}^{0}e^{-\beta w^2+\alpha w}dw + \int_{0}^{\infty}e^{-\beta w^2+\alpha w}dw\right]$$

+ +

$$=C^* \int_{0}^{\infty}e^{-\beta w^2}\left[e^{-\alpha w}+e^{\alpha w}\right]dw =2C^* \int_{0}^{\infty}e^{-\beta w^2}\operatorname{cosh}(\alpha w)dw$$

+ +

where $\operatorname{cosh}$ is the hyperbolic cosine.

+ +

Using a formula provided in Gradshteyn & Ryzhik (2007), ""Table of Integrals, Series and Products"", 7th ed., p. 384, eq. 3.546(2) we have

+ +

$$f_Y(y)=2C^*\frac 12 \sqrt {\frac {\pi}{\beta}} \exp\left\{\frac {\alpha^2}{4\beta}\right\}$$

+ +

Now $$\frac {\alpha^2}{4\beta} = \frac {\left(\frac{\sigma_w^2y+\sigma_y^2\mu_w}{\sigma_y^2\sigma_w^2}\right)^2}{4\frac{s^2}{2\sigma_y^2\sigma_w^2}} = \frac {(\sigma_w^2y+\sigma_y^2\mu_w)^2}{2\sigma_y^2\sigma_w^2s^2}$$

+ +

and bringing back in $C^*$ (and $\beta$) in all its glory we have

+ +

$$f_Y(y)=\frac{1}{2\pi\sigma_y\sigma_w}\exp\left\{-\frac{y^2}{2\sigma_y^2}-\frac{\mu_w^2}{2\sigma_w^2}\right\}\sqrt{\pi} \left(\sqrt {\frac{s^2}{2\sigma_y^2\sigma_w^2}}\right)^{-1} \exp\left\{\frac {(\sigma_w^2y+\sigma_y^2\mu_w)^2}{2\sigma_y^2\sigma_w^2s^2}\right\} $$

+ +

The constant terms simplify to

+ +

$$\frac{1}{2\pi\sigma_y\sigma_w}\sqrt{\pi} \left(\sqrt {\frac{s^2}{2\sigma_y^2\sigma_w^2}}\right)^{-1} = \frac{1}{s\sqrt{2\pi}} $$

+ +

and, the exponentials end up in the normal exponential. So in the end

+ +

$$f_Y(y) = \frac{1}{s\sqrt{2\pi}}\exp\left\{-\frac{(y-\mu_w)^2}{2s^2}\right\}= N(\mu_w, s^2),\qquad s^2\equiv \sigma_y^2+\sigma_w^2$$

+",2013-10-20 23:39:58.167 +57902,22843.0,1,57906.0,,,Expectation values as vectors?,,CC BY-SA 3.0,"

I want to break down this statement:

+ +

$|E[(X - \bar{x})(Y - \bar{y})]|^2 = |<X - \bar{x}, Y - \bar{y}>|^2$

+ +

I am not familiar with expectation values being broken down into vectors. I only know that by definition $\displaystyle E[(X - \bar{x})^2] = \sum_{i=1}^{n} \frac{(x_{i} - \bar{x})^2}{n}$ and I would like to know how expectation values can be viewed as vectors specifically in the context of inner products like $E[(X - \bar{x})^2] = <X-\bar{x}, X-\bar{x}>$ Also whatever happened to the n?

+ +

My other question is how do I view covariance as a vector? I know that covariance is $E[XY] - E[X]E[Y]$ so how do I rewrite that in vector form?

+",2013-10-21 00:25:41.560 +57926,1406.0,2,,57921.0,,,,CC BY-SA 3.0,"

If you look at the package vignette, you will see that the rugarch package estimates the ARFIMA(p,d,q) model with $0<d<1$, so it is not possible to set the integration order higher than one. If you want to keep the value of $d$ fixed, set fixed.pars=list(arfima=d). For that you naturally need to set arfima=TRUE in the mean.model argument.
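
+ +

A sketch of the specification this describes (the value 0.3 is an arbitrary example for the fixed $d$):

+ +
library(rugarch)
+spec <- ugarchspec(
+  variance.model = list(model = ""fGARCH"", garchOrder = c(1, 1), submodel = ""GARCH""),
+  mean.model     = list(armaOrder = c(2, 3), include.mean = TRUE, arfima = TRUE),
+  fixed.pars     = list(arfima = 0.3))   # holds d fixed at 0.3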

+",2013-10-21 11:35:49.830 +57903,10135.0,2,,57896.0,,,,CC BY-SA 3.0,"

They are equivalent. First rewrite $r$ as $r=\dfrac{S_{xy}}{(n-1)S_xS_y}$, where $S_{xy}=\sum(x_i-\bar{x})(y_i-\bar{y})$. Plugging this into $a$ from your book gives $a=\dfrac{r\,S_y}{S_x}=\dfrac{S_{xy}}{(n-1)S_xS_y}\cdot\dfrac{S_y}{S_x}=\dfrac{S_{xy}}{S_x^2(n-1)}$.

+ +

From the definition of $S_y$ in your book we also have $S_x=\sqrt{\dfrac{\sum (x_i-\bar{x})^2}{n-1}}=\sqrt{\dfrac{S_{xx}}{n-1}}$, so squaring both sides gives $S_{xx}=S_x^2(n-1)$. Substituting this into the denominator of $a$ above yields $a=\dfrac{S_{xy}}{S_{xx}}$.

+ +

Now work on $a$ from the online article. Note that $S_{xy}=\sum x_iy_i-\bar{y}\sum x_i-\bar{x}\sum y_i+n\bar{y}\bar{x}=\sum x_iy_i-n\bar{y}\bar{x}$, hence $nS_{xy}=n\sum x_iy_i-n^2\bar{y}\bar{x}=n\sum x_iy_i-\sum x_i \sum y_i$, which is exactly the numerator of the online $a$.

+ +

For the denominator, factor out $n$: $n\sum x_i^2-(\sum x_i)^2=n\Big(\sum x_i^2-\dfrac{(\sum x_i)^2}{n}\Big)=n\Big(\sum x_i^2-\dfrac{n^2\bar{x}^2}{n}\Big)=n\Big(\sum x_i^2-n\bar{x}^2\Big)=nS_{xx}$. Therefore the online $a$ is $a=\dfrac{nS_{xy}}{nS_{xx}}=\dfrac{S_{xy}}{S_{xx}}$, which equals the $a$ from your book. Cheers :)
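
+ +

A quick numerical sanity check of the equivalence on simulated data:

+ +
set.seed(1)
+x <- rnorm(20); y <- 2 * x + rnorm(20)
+a_book   <- cor(x, y) * sd(y) / sd(x)
+a_online <- (length(x) * sum(x * y) - sum(x) * sum(y)) /
+            (length(x) * sum(x^2) - sum(x)^2)
+all.equal(a_book, a_online)   # TRUE: the two expressions give the same slope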

+",2013-10-21 00:35:39.223 +57904,22889.0,1,,,,Probability that items chosen randomly are defective?,,CC BY-SA 3.0,"

6 light bulbs are chosen at random from 17 bulbs of which 6 are defective.

+ +

(a) What is the probability that exactly 2 are defective?

+ +

(b) What is the probability that at most 1 is defective?

+",2013-10-21 00:38:08.623 +57905,3993.0,2,,57896.0,,,,CC BY-SA 3.0,"

I don't know what online article you looked at (maybe you could link it?), but as far as I can tell, the author there assumes that $x$ and $y$ are centered about their means. Under this assumption, the two formulae for $a$ that you posted do agree.

+ +

To see this, first note that your first $a$ formula can written as +$$ +a = \frac{r \cdot S_y}{S_x} = \frac{{\rm cov}(y,x) \cdot S_y}{(S_y \cdot S_x) \cdot S_x} = \frac{{\rm cov}(y,x)}{S_x^2}, +$$ +with ${\rm cov}$ referring to the covariance.

+ +

Now if you take the second $a$ formula and assume the variables are centered, then the simple sum terms drop out (because then both variables separately sum to 0). So it just reduces to +$$ +a = \frac{\sum x_{i}y_{i}}{\sum x^2_{i}} = \frac{\sum x_{i}y_{i} / (n-1)}{\sum x^2_{i} / (n-1)} = \frac{{\rm cov}(y,x)}{S_x^2}, +$$ +which matches the first formula. So they coincide if the variables are centered, but not in general. Maybe they mention this assumption in the online article you referred to.

+ +

Edit: I suggested that they were not equivalent in general, only in the special case where variables are centered. But @Stat has apparently shown that they are equivalent in general. So I will just leave this answer up in case you find considering this special case to be illuminating :)

+",2013-10-21 00:40:01.817 +57906,20473.0,2,,57902.0,,,,CC BY-SA 3.0,"

The ""by definition"" equality you write does not hold.

+ +

$$E[(X - \bar{x})^2] = \int_{S_X}(x - \bar{x})^2 f_X(x)\, dx$$ is the correct definition for continuous r.v.'s, with $S_X$ the support of $X$ and $f_X(x)$ the pdf of $X$. For discrete random variables

+ +

$$E[(X - \bar{x})^2] = \sum_{S_X}(x - \bar{x})^2 p_X(x).$$ Now IF the $x$'s can be viewed as realizations of the same ergodic and stationary stochastic process, THEN $\frac {1}{n}\sum_{i=1}^{n} (x_{i} - \bar{x})^2$ is a consistent estimator of $E[(X - \bar{x})^2]$.

+ +

The expected value operator is applied to each element of any vector-matrix. If

+ +

$$A=\left[\begin{matrix} +a_{11} &...& a_{1n}\\ +... & ...& ... \\ +a_{k1} &...&a_{kn} +\end{matrix}\right]$$

+ +

then +$$E(A) = \left[\begin{matrix} +E(a_{11}) &...& E(a_{1n})\\ +... & ...& ... \\ +E(a_{k1}) &...&E(a_{kn}) +\end{matrix}\right]$$

+ +

If $\mathbf x$ and $\mathbf y$ are two $n\times 1$ column vectors, then (prime denoting the transpose) +$$ \operatorname{Cov}(\mathbf x,\mathbf y) = E(\mathbf x \mathbf y') - E(\mathbf x)\Big[E(\mathbf y)\Big]'$$

+ +

This is the expression for the covariance of two random vectors. If you want the covariance matrix of two samples, look up this answer in math.SE

+",2013-10-21 00:57:28.033 +57907,22890.0,1,57909.0,,,"What does ""I"" represent in this context?",,CC BY-SA 3.0,"

I'm trying to work on a problem which contains a symbol that I don't recall seeing before - I. I assume it has some special significance but I'm having a hard time looking it up. Relevant portion of the problem:

+ +

""Consider a random sample X1, ...Xn from the pdf f(x; θ) = .5(1 + θx) I[−1,1](x)""

+ +

If it helps, the context of the problem is point estimation.

+",2013-10-21 01:01:58.073 +57908,15321.0,1,,,,Does adding a dataset in repeated measures change the type of test?,,CC BY-SA 3.0,"

I am trying to design a test and am confused between the types of experiment that I should be conducting. Here is the description of what I am doing:

+ +

It is a repeated measures test. So, a participant is first given an uncolored node-link graph G1 and asked some questions about the graph. The dependent variable is the time it takes for the participant to answer the questions. Next the same participant is given a colored graph G2 and asked a set of questions. This is counterbalanced by reversing the order of the groups. Next I repeat this procedure except that this time, I provide colored G1 and uncolored G2. In all, I have 4 conditions/groups that perform the test in order as specified below:

+ +
    +
  1. G1-uncolored, G2-colored
  2. G2-colored, G1-uncolored
  3. G1-colored, G2-uncolored
  4. G2-uncolored, G1-colored
  +
+ +

I measure the time for each case. The two graphs are almost equal. The only reason I am using two different graphs is because I cannot use the same graph in both the conditions for a group as if I did that, then the participant would remember his answers from the previous condition.

+ +

Now, I am not sure what I should call this method. Should I consider one independent variable (i.e. presence/absence of color in the graph), or should I consider two different IVs (color + which graph is used, G1/G2)? In other words, is this just a dependent-samples t-test or some kind of two-way ANOVA (I don't know which)?

+",2013-10-21 01:04:11.403 +57909,10135.0,2,,57907.0,,,,CC BY-SA 3.0,"

Probably that is an indicator function. It is defined as $I_{[-1,1]}(x)=1$ if $-1\leq x\leq 1$ and $0$ otherwise. I am not 100% sure since you didn't write your pdf properly.

+",2013-10-21 01:09:19.913 +57910,20222.0,2,,57904.0,,,,CC BY-SA 3.0,"

I believe the hypergeometric distribution can solve this problem.

+ +

My calculation for (a) is 0.399968
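
+ +

For reference, the same numbers can be obtained directly in R:

+ +
dhyper(2, m = 6, n = 11, k = 6)   # (a) exactly 2 defective among the 6 drawn: about 0.3999
+phyper(1, m = 6, n = 11, k = 6)  # (b) at most 1 defective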

+",2013-10-21 01:52:08.647 +57927,1406.0,2,,57925.0,,,,CC BY-SA 3.0,"

The convention to specify vectors or matrices with bold letters is much more frequently upheld than the convention of upper-case letters for random variables. In the articles I usually read (econometrics, time-series regression mostly) the latter convention is not used, i.e. the random variables are usually lower-case.

+ +

Look for the influential papers in your field and try to copy their conventions. Stating the notation somewhere in the beginning is a must usually.

+",2013-10-21 11:48:10.340 +57911,22891.0,1,,,,Logistic Regression with time slices,,CC BY-SA 3.0,"

I'm using logistic regression to predict student retention in an online course.

+ +

I have data on student interactions within the web platform of an online course. The course spans 6 weeks, with new lecture resources and new assignments uploaded at the beginning of each week. The weekly assignments are due at the end of each week. Students can watch lecture videos, view/write forum posts, and do assignments. Students can drop out of the course at any time (i.e. stop interacting with the course platform).

+ +

Each week, I want to predict the likelihood of a student staying within the course in the next week (stay in the next week=1, out in the next week=0). The predictors are the number of times the student watched the lecture videos (video_views), the number of posts the student read (posts_read), the number of posts the student wrote (post_written) and the student's score on this week's assignment (score).

+ +

I'm thinking of building 6 logistic regression models, one for each week. But I also want to make a connection between, say, week 6's model and week 5's model. Can such a connection be shown if I use percentages of cumulative values for each predictor (e.g. cumulative assignment score out of the total assignment score for the whole course) instead of weekly values?

+ +

Should I just build a single model with the course week (course_week) as another predictor? This is my first time using logistic regression, and I'm not sure about putting a time variable in a logistic regression model.
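
+ +

For concreteness, a sketch of that single pooled model; the outcome name and the data frame d are placeholders, and the predictor names follow the question:

+ +
fit <- glm(stay_next_week ~ video_views + posts_read + post_written + score + course_week,
+           family = binomial, data = d)
+summary(fit)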

+",2013-10-21 02:07:39.870 +57912,5237.0,2,,57895.0,,,,CC BY-SA 4.0,"

There is nothing wrong with your current strategy. If you have a multiple regression model with only two explanatory variables then you could try to make a 3D-ish plot that displays the predicted regression plane, but most software don't make this easy to do. Another possibility is to use a coplot (see also: coplot in R or this pdf), which can represent three or even four variables, but many people don't know how to read them. Essentially however, if you don't have any interactions, then the predicted marginal relationship between $x_j$ and $y$ will be the same as predicted conditional relationship (plus or minus some vertical shift) at any specific level of your other $x$ variables. Thus, you can simply set all other $x$ variables at their means and find the predicted line $\hat y = \hat\beta_0 + \cdots + \hat\beta_j x_j + \cdots + \hat\beta_p \bar x_p$ and plot that line on a scatterplot of $(x_j, y)$ pairs. Moreover, you will end up with $p$ such plots, although you might not include some of them if you think they are not important. (For example, it is common to have a multiple regression model with a single variable of interest and some control variables, and only present the first such plot).

+

On the other hand, if you do have interactions, then you should figure out which of the interacting variables you are most interested in and plot the predicted relationship between that variable and the response variable, but with several lines on the same plot. The other interacting variable is set to different levels for each of those lines. Typical values would be the mean and $\pm$ 1 SD of the interacting variable. To make this clearer, imagine you have only two variables, $x_1$ and $x_2$, and you have an interaction between them, and that $x_1$ is the focus of your study, then you might make a single plot with these three lines:
+\begin{align} \hat y &= \hat\beta_0 + \hat\beta_1 x_1 + \hat\beta_2 (\bar x_2 - s_{x_2}) + \hat\beta_3 x_1(\bar x_2 - s_{x_2}) \\ \hat y &= \hat\beta_0 + \hat\beta_1 x_1 + \hat\beta_2 \bar x_2 + \hat\beta_3 x_1\bar x_2 \\ \hat y &= \hat\beta_0 + \hat\beta_1 x_1 + \hat\beta_2 (\bar x_2 + s_{x_2}) + \hat\beta_3 x_1(\bar x_2 + s_{x_2}) \end{align}
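
+

For illustration, a minimal R sketch that draws those three lines with predict(); d, x1, x2 and y are placeholder names:

+
fit <- lm(y ~ x1 * x2, data = d)
+x1g <- seq(min(d$x1), max(d$x1), length.out = 100)
+plot(d$x1, d$y)
+for (x2v in mean(d$x2) + c(-1, 0, 1) * sd(d$x2)) {
+  lines(x1g, predict(fit, newdata = data.frame(x1 = x1g, x2 = x2v)))
+}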

+

An example plot that's similar (albeit with a binary moderator) can be seen in my answer to Plot regression with interaction in R.

+",2013-10-21 02:44:59.367 +57913,449.0,2,,57908.0,,,,CC BY-SA 3.0,"

It sounds like you counterbalanced the pictures and order only to assess colour so colour presence would be your primary variable of interest. However, regardless of how well you controlled for picture similarity, colour could have impacted them differentially. In that case you might want to look at a colour x picture interaction because it could compromise your conclusions, point to new directions for research, or both. A picture main effect wouldn't really mean anything.

+ +

Often times, in these cases one might report the main effect of colour but not fully report the interaction unless there was something meaningful in it because it's only run to assess the design, not as an outcome you want to generalize.

+",2013-10-21 03:29:05.027 +57914,22893.0,1,,,,Probability with fair dice,,CC BY-SA 3.0,"

If you roll 5 standard 6-sided dice, what's the probability that you will get at least three 2s?

+ +

I imagine it would be 1 - P(0 twos) - P(1 two) - P(2 twos), but I don't know how to calculate the probability of these.
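
+ +

For reference, the complement approach you describe amounts to a binomial tail probability, e.g. in R:

+ +
sum(dbinom(3:5, size = 5, prob = 1/6))   # = 1 - pbinom(2, 5, 1/6), roughly 0.035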

+",2013-10-21 03:43:45.820 +57915,14799.0,2,,57778.0,,,,CC BY-SA 3.0,"

$Y = W + Z$, where $Z$ is normal with mean 0 and variance $\sigma_z^2$ and is independent of $W$. (Note that I am using $\sigma_z^2$ where the OP used $\sigma_y^2$, which I reserve for the marginal variance of $Y$.) Then the unconstrained joint distribution of $(W,Y)$ is bivariate normal with $\mu_y = \mu_w$, $\sigma_y^2 = \sigma_w^2 + \sigma_z^2$, and $\sigma_{wy} = \sigma_w^2$.

+ +

Letting $\phi$ denote the standard normal pdf, integrating over the halfplane $Z<0$ gets the following marginal moments of $Y\,|\,(Y<W)$:

+ +

Mean $= \mu_y = 2 \int_{-\infty}^\infty \int_{-\infty}^0 (w \sigma_w + \mu_w + z \sigma_z)\,\phi(z)\mathrm{d}z\,\phi(w)\mathrm{d}w = \mu_w - \sigma_z \sqrt{2/\pi}$.

+ +

Variance $=\sigma_y^2 = 2 \int_{-\infty}^\infty \int_{-\infty}^0 (w \sigma_w + \mu_w + z \sigma_z - \mu_y)^2\,\phi(z)\mathrm{d}z\,\phi(w)\mathrm{d}w = \sigma_w^2 + \sigma_z^2 (1-2/\pi)$.

+ +

Third central moment $=2 \int_{-\infty}^\infty \int_{-\infty}^0 (w \sigma_w + \mu_w + z \sigma_z - \mu_y)^3\,\phi(z)\mathrm{d}z\,\phi(w)\mathrm{d}w = \sqrt{2}(\pi - 4) \sigma_z^3 / \pi^{3/2}$.

+ +

Those can be solved in reverse order to get $\sigma_z^2$, then $\sigma_w^2$, then $\mu_w$, which are necessary and sufficient to specify the unconstrained joint distribution.

+",2013-10-21 04:19:30.463 +57916,18914.0,1,,,,How to deal with collinearity in lme with categorical IV with > 2 levels,,CC BY-SA 3.0,"

I'm analysing data from our experiment. We had participants in 4 groups, and each participant was measured 4 times. We measured cortisol in saliva, which leads us to linear mixed models, because the individual cortisol levels have different slopes. I have fitted the following model:

+ +
lmer1 <- lmer(Cortisol ~ group*measurement + (1|id), data=df)
+
+ +

I used treatment coding for both categorical variables, because we are interested in the differences between the 1st measurement in the first group and all other measurements.

+ +

My problem is that I get strong correlations between factor levels and I'm not sure how to solve it. Contrast coding would be one solution, but it would answer a different question (as I said, we want to compare the 1st group, 1st measurement with all the others).

+ +

This is my correlation matrix for fixed effect from lmer method (lme4 package):

+ +
          (Intr) group2 group3 groupP msrmn2 msrmn3 msrmn4 grp2:2 grp3:2 grpP:2 grp2:3 grp3:3 grpP:3 grp2:4 grp3:4
+group2      -0.770                                                                                    
+group3      -0.650  0.500                                                                             
+groupP      -0.557  0.429  0.362                                                                      
+measuremnt2 -0.602  0.464  0.391  0.335                                                               
+measuremnt3 -0.598  0.460  0.388  0.333  0.521                                                        
+measuremnt4 -0.602  0.464  0.391  0.335  0.524  0.521                                                 
+grp2:msrmn2  0.461 -0.600 -0.299 -0.257 -0.765 -0.398 -0.401                                          
+grp3:msrmn2  0.390 -0.300 -0.589 -0.217 -0.647 -0.337 -0.339  0.495                                   
+grpP:msrmn2  0.329 -0.253 -0.214 -0.578 -0.546 -0.284 -0.287  0.418  0.353                            
+grp2:msrmn3  0.461 -0.599 -0.300 -0.257 -0.402 -0.772 -0.402  0.519  0.260  0.220                     
+grp3:msrmn3  0.383 -0.295 -0.579 -0.213 -0.333 -0.641 -0.333  0.255  0.501  0.182  0.495              
+grpP:msrmn3  0.333 -0.256 -0.216 -0.585 -0.290 -0.557 -0.290  0.222  0.188  0.499  0.430  0.357       
+grp2:msrmn4  0.462 -0.598 -0.300 -0.257 -0.402 -0.399 -0.767  0.518  0.260  0.220  0.518  0.256  0.223  
+grp3:msrmn4  0.390 -0.300 -0.589 -0.217 -0.339 -0.337 -0.647  0.260  0.510  0.185  0.260  0.501  0.188  0.496
+grpP:msrmn4  0.329 -0.253 -0.214 -0.578 -0.287 -0.284 -0.546  0.219  0.185  0.493  0.220  0.182  0.499  0.419  0.353
+
+ +

Do you have suggestions about how to solve this (reduce collinearity/ignore it)?

+",2013-10-21 07:14:43.380 +58159,22959.0,1,58161.0,,,how many years will it take to achieve six-sigma quality?,,CC BY-SA 3.0,"

Suppose a business is operating at the three-sigma quality level. If projects have an average improvement rate of 50% annually, how many years will it take to achieve six-sigma quality?
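
+ +

One common way to make this concrete, under the usual Six Sigma convention with the 1.5-sigma shift (an assumption, not the only possible reading of the question), is to work in defects per million opportunities (DPMO) and count how many halvings are needed:

+ +
# 3-sigma quality is roughly 66807 DPMO and 6-sigma is 3.4 DPMO under the 1.5-sigma-shift convention
+log2(66807 / 3.4)   # about 14.3 halvings, i.e. roughly 14-15 years at 50% improvement per year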

+",2013-10-24 12:05:49.620 +57918,14470.0,1,,,,"Map a normal distribution N(x,s) to an ordinal response variable",,CC BY-SA 3.0,"

I am working on a simulation. I am going to extract a series of normally distributed values from a distribution $N(x,s)$ whose mean $x$ and variance $s$ are known. I want to pretend that this distribution $N(x,s)$ is a latent causal variable that determines a response ordinal variable $O$ with a known range (i.e. $[1,10]$). All other characteristics of $O$ (the distribution of its values) are to be determined on the basis of $N$. What function could I use to map $N$ values to $O$? Since this is just a demonstrative simulation of an algorithm working on ordinal data (this mapping is not the core of the simulation, but it is part of the data-generating process) and I am not oriented toward any particular psychometric theory, I am looking for advice on a simple mapping function, though it would be better if it were based on some psychometric theory.
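
+ +

One simple convention (an assumption, not the only choice) is to cut the latent normal at equally spaced quantiles of its own distribution, giving ten roughly equiprobable ordered categories; unequally spaced thresholds would instead produce a skewed ordinal distribution:

+ +
z      <- rnorm(1000, mean = x, sd = sqrt(s))                           # draws from the latent N(x, s)
+breaks <- qnorm(seq(0, 1, length.out = 11), mean = x, sd = sqrt(s))    # 10 equal-probability bins
+O      <- as.integer(cut(z, breaks = breaks, include.lowest = TRUE))   # ordinal responses in 1..10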

+",2013-10-21 09:05:56.440 +57919,22752.0,1,,,,Derivation of the Bivariate normal distribution using change-of-variable technique,,CC BY-SA 3.0,"

+

I am not familiar with the change-of-variable technique that the question refers to. Does anyone have an idea what is meant and how one should go about doing it?

+",2013-10-21 09:11:38.317 +57920,221.0,2,,57899.0,,,,CC BY-SA 3.0,"

The matrix is constructed in the following way:

+ +
    +
  • rows represent lines
  • +
  • columns represent features
  • +
+ +

and every entry matrix(i,j)=k means:

+ +

In line i, the word with index j appears k times.

+ +

So to is mapped to index 3. It appears exactly one time in line 1. So m(1,3)=1.

+ +

More examples

+ +
    +
  • likes is mapped to index 2. It appears exactly two times in the first line. So m(1,2)=2
  • +
  • also is mapped to index 6. It does not appear in line 1, but one time in line 2. So m(1,6)=0 and m(2,6)=1.
  • +
+",2013-10-21 09:31:36.713 +57921,22677.0,1,57926.0,,,"How does one specify arima (p,d,q) in ugarchspec for ugarchfit in rugarch?",,CC BY-SA 3.0,"

Basically I'm trying to fit a GARCH(1,1) model with the ARMA order taken from auto.arima.

+ +
> assign(paste(""spec.ret.fin."",colnames(base.name[1]),sep=""""),    
++ ugarchspec(variance.model = list(model = ""fGARCH"", garchOrder = c(1, 1), 
++ submodel = ""GARCH"", external.regressors = NULL, variance.targeting = FALSE), 
++ mean.model = list(armaOrder = c(2,3,4), include.mean = TRUE, archm = FALSE, 
++ archpow = 1, arfima = FALSE, external.regressors = NULL, archex = FALSE), 
++ distribution.model = ""norm"", start.pars = list(), fixed.pars = list()))
+
+ +

This gives the following result:

+ +
+

spec.ret.fin.chn

+
+ +
*---------------------------------*
+*       GARCH Model Spec          *
+*---------------------------------*
+
+Conditional Variance Dynamics   
+------------------------------------
+GARCH Model     : fGARCH(1,1)
+fGARCH Sub-Model    : GARCH
+Variance Targeting  : FALSE 
+
+Conditional Mean Dynamics
+------------------------------------
+Mean Model      : ARFIMA(2,0,3)
+Include Mean        : TRUE 
+GARCH-in-Mean       : FALSE 
+
+Conditional Distribution
+------------------------------------
+Distribution    :  norm 
+Includes Skew   :  FALSE 
+Includes Shape  :  FALSE 
+Includes Lambda :  FALSE 
+
+ +

But the same code with arfima=TRUE gives

+ +
+

spec.ret.fin.chn

+
+ +
*---------------------------------*
+*       GARCH Model Spec          *
+*---------------------------------*
+
+Conditional Variance Dynamics   
+------------------------------------
+GARCH Model     : fGARCH(1,1)
+fGARCH Sub-Model    : GARCH
+Variance Targeting  : FALSE 
+
+Conditional Mean Dynamics
+------------------------------------
+Mean Model      : ARFIMA(2,d,3)
+Include Mean        : TRUE 
+GARCH-in-Mean       : FALSE 
+
+Conditional Distribution
+------------------------------------
+Distribution    :  norm 
+Includes Skew   :  FALSE 
+Includes Shape  :  FALSE 
+Includes Lambda :  FALSE 
+
+ +

How does one replace that d with the integration order (d) of the arima?

+",2013-10-21 09:39:42.080 +57922,21638.0,2,,57826.0,,,,CC BY-SA 3.0,"

Given that $Y_{1}$ and $Y_{2}$ are independent, we have that

+ +

$$ +\left[\array{Y_{11} \\ Y_{12} \\ Y_{13} \\ Y_{21} \\ Y_{22}}\right] \sim MVN\left(\left[\array{2\\2\\2\\3\\4}\right],\left[\array{3 & 1 & 0 & 0 & 0\\1 & 2 & 0 & 0 &0\\0&0&3&0&0\\0&0&0&4&2\\0&0&0&2&4}\right]\right) +$$

+ +

Let

+ +

$$ +\begin{array}{rcl}X_1 & = & Y_{11}-Y_{13}+Y_{22}\\ +X_2 & = & Y_{21}-Y_{12}\end{array} +$$

+ +

As $Y_{11},Y_{12},Y_{13},Y_{21},Y_{22}$ are jointly normal, the linear combinations $Y_{11}-Y_{13}+Y_{22}$ and $Y_{21}-Y_{12}$ are normally distributed. It also follows that as any linear combination of $X_{1}$ and $X_{2}$ is a linear combination of $Y_{11},Y_{12},Y_{13},Y_{21},Y_{22}$ so must $X_{1}$ and $X_{2}$ be jointly normal.

+ +

All that remains is to determine the mean and covariance of $X_{1}$ and $X_{2}$. Given the linearity of expectations, the mean is trivial to calculate:

+ +

$$ +\begin{array}{rcl} +E[X_1] &=& E[Y_{11} - Y_{13} + Y_{22}]\\ &=& E[Y_{11}] - E[Y_{13}] + E[Y_{22}]\\ +E[X_2] &=& E[Y_{21} - Y_{12}]\\ &=& E[Y_{21}] - E[Y_{12}] +\end{array} +$$

+ +

The covariance is equally straightforward yet tedious:

+ +

$$ +\begin{array}{rcl} +Cov[X_1,X_1] &=& Cov[Y_{11},Y_{11}] + 2 \times Cov[Y_{11},-Y_{13}+Y_{22}] + Cov[-Y_{13}+Y_{22},-Y_{13}+Y_{22}]\\ +&=& Cov[Y_{11},Y_{11}] - 2 \times Cov[Y_{11},Y_{13}] + 2 \times Cov[Y_{11},Y_{22}] + Cov[Y_{13},Y_{13}] - 2 \times Cov[Y_{13},Y_{22}] + Cov[Y_{22},Y_{22}]\\\\ +Cov[X_2,X_2] &=& Cov[Y_{21},Y_{21}] - 2 \times Cov[Y_{12},Y_{21}] + Cov[Y_{12},Y_{12}]\\\\ +Cov[X_1,X_2] &=& Cov[Y_{11},Y_{21}-Y_{12}] + Cov[-Y_{13}+Y_{22},Y_{21}-Y_{12}]\\ +&=& Cov[Y_{11},Y_{21}] - Cov[Y_{11},Y_{12}] - Cov[Y_{13},Y_{21}] + Cov[Y_{13},Y_{12}] + Cov[Y_{22},Y_{21}] - Cov[Y_{22},Y_{12}] +\end{array} +$$

+ +

Fortunately many of these terms are zero.

+ +

Given the tedious nature of the calculations you can do a simple Monte Carlo simulation to check your answers. Here is some R code for achieving that:

+ +
# Include MASS library for mvrnorm for generating multivariate normally distributed samples
+library(MASS)
+
+generateSamples <- function(N)
+{
+  # Generate N samples from Y1 and Y2 with the given mean vectors and covariance matrices
+  Y1 <- mvrnorm(mu=rep(2,3),Sigma=matrix(c(3,1,0,1,2,0,0,0,3),nrow=3,ncol=3),n=N)
+  Y2 <- mvrnorm(mu=c(3,4),Sigma=matrix(c(4,2,2,4),nrow=2,ncol=2),n=N)
+
+  # Calculate X1 and X2
+  X1 <- Y1[,1] - Y1[,3] + Y2[,2]
+  X2 <- Y2[,1] - Y1[,2]
+
+  cbind(X1,X2)
+}
+
+# Generate 100000 samples from X1 and X2
+mySample <- generateSamples(100000)
+
+# Empirical mean vector
+mu <- colMeans(mySample)
+
+# Empirical covariance matrix
+Sigma <- cov(mySample,mySample)
+
+",2013-10-21 09:52:26.270 +57923,14470.0,2,,57899.0,,,,CC BY-SA 3.0,"

As Steffen pointed out, the example matrix encodes the number of times a word appears in a text. The position of the encoding into the matrix is given by the word (column position on the matrix) and by the text (row position on the matrix).

+ +

Now, the hashing trick works the same way, except that you don't have to define the dictionary containing the column position for each word up front.

+ +

In fact it is the hashing function that will give you the range of possible column positions (the hashing function will give you a minimum and maximum value possible) and the exact position of the word you want to encode into the matrix. So for example, let's imagine that the word ""likes"" is hashed by our hashing function into the number 5674, then the column 5674 will contain the encodings relative to the word ""likes"".

+ +

In such a fashion you won't need to build a dictionary before analyzing the text. If you use a sparse matrix as your text matrix, you won't even have to define the matrix size exactly. Just by scanning the text, you convert words into column positions with the hashing function on the fly, and your text matrix is populated with data (e.g. frequencies) according to the document you are currently analyzing (row position).
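
+ +

A toy R sketch of the idea; the hash used here is deliberately naive and only for illustration:

+ +
hash_bow <- function(text, m = 16) {
+  words <- strsplit(tolower(gsub(""[[:punct:]]"", """", text)), ""\\s+"")[[1]]
+  v <- numeric(m)
+  for (w in words) {
+    j <- sum(utf8ToInt(w)) %% m + 1   # toy hash: character codes summed, modulo m
+    v[j] <- v[j] + 1                  # collisions simply add up, as in the hashing trick
+  }
+  v
+}
+hash_bow(""John likes to watch movies. Mary likes too."")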

+",2013-10-21 11:02:59.820 +57924,22899.0,1,,,,Duration analysis of unemployment,,CC BY-SA 3.0,"

I am trying to run a discrete duration model for analyzing (monthly) unemployment using survey data. I have household-level data, and as such I would like to control for the household effects in my model. I thought to do this by either allowing for cluster effects in the estimation of the standard errors or by random effects (for households) - i.e., I think that fixed effects would not work because there are a lot of households and because of the incidental parameter problem.

+ +

My model will include both individual characteristics (e.g., age, school, occupation, since when the person has been unemployed - as they were asked retrospectively), and some other variables) as well as household characteristics (e.g. size, number of people unemployed).

+ +

Can anyone provide some comments on my proposed methodology? Are there any things I should be mindful of or are there any better ways of doing this?

+ +

Also I would highly appreciate any relevant references.

+",2013-10-21 11:16:10.237 +57925,22900.0,1,57927.0,,,Notation for random vectors,,CC BY-SA 3.0,"

Random variables are usually denoted with upper-case letters. For example, there could be a random variable $X$. Now, because vectors are usually denoted with a bold lower-case letter (e.g. $\mathbf{z} = (z_0, \dots, z_{n})^{\mathsf{T}}$ and matrices with a bold upper-case letter (e.g. $\mathbf{Y}$), how should I denote a vector of random variables? I think $\mathbf{x} = (X_0, \dots, X_n)^\mathsf{T}$ looks a bit odd. On the other hand if I see $\mathbf{X}$ I would first think it is a matrix. What is the usual way to do this? Of course, I think it would be best to state my notation somewhere in the beginning of paper.

+",2013-10-21 11:27:29.240 +57928,22901.0,1,,,,Use matrix feature for machine learning or cluster analysis,,CC BY-SA 3.0,"

I have a bunch of features that I would like to use for classification/machine learning and cluster analysis. Normally I use single point values or transformations of values for features and everything is fine

+ +

Now I would like to use a matrix as a feature. The matrix is probably going to be fairly big (say 50x50) but will only be filled with 1's and 0's. It is pretty much an 'image' matrix, and it is the shape/pattern of the matrix entries that is important.

+ +

Is there any way I can easily use the matrix as a feature for machine learning? I know I could use each matrix entry, say Row1Column1, as a feature and then give it a value, but then I would have 2500 features from my 50x50 matrix, which is what I am trying to get away from.

+ +

Any ideas would be greatly appreciated.

+",2013-10-21 12:00:09.227 +57929,1406.0,2,,57849.0,,,,CC BY-SA 3.0,"

When dealing with infinite series of random variables it helps to know when they actually exist. One simple result dealing with the stationarity is the following. The series $\sum \psi_j X_{t-j}$ converges absolutely, almost surely and in mean if $\sum |\psi_j|<\infty$ and $\sup_tE|X_t|<\infty$. In particular if $X_t$ is stationary so is $\sum \psi_jX_{t-j}$.

+ +

In your case $\sum |\psi_j|=1+a+a+...$, so the series does not converge and the sum is not defined.

+ +

A more general result about existence of the series of the sum is the following. For any collection of random variables $X_t$, if $\sum E|X_t|<\infty$ then $\sum X_t$ converges almost surely, i.e. is defined. Again in your case this condition is violated since $E|X_t|=aE|N(0,1)|$ and the resulting series are not summable.

+",2013-10-21 12:01:39.650 +57930,4499.0,1,,,,Is this a valid test?,,CC BY-SA 3.0,"

I have 5000 cases vs 5000 controls with a positive/negative outcome. chisq.test shows no significance for these data. I get significance when subsetting the cases on VarX, giving 1000 cases vs 5000 controls.

+ +

I suspect that comparing 1000 cases against 5000 controls is what creates the significance. To check this I am randomly picking 1000 controls and running chisq.test 1000 times against the 1000 cases.

+ +

Here is the histogram of pvalues and the red line is Pvalue for 1000 cases vs 5000 controls: +

+ +

From this, can I conclude that there is no significance? If there is a better way of doing this, please advise.
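
+ +

For reference, the resampling described above can be written compactly; the data frame d, its column names and the object p_full are placeholders:

+ +
p_sub <- replicate(1000, {
+  ctrl_idx <- sample(which(d$case == 0), 1000)                 # 1000 of the 5000 controls
+  sub <- d[c(which(d$case == 1 & d$VarX == 1), ctrl_idx), ]    # the 1000 VarX cases + sampled controls
+  chisq.test(table(sub$case, sub$outcome))$p.value
+})
+hist(p_sub); abline(v = p_full, col = ""red"")                  # p_full: p-value using all 5000 controls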

+",2013-10-21 12:10:33.237 +57931,2666.0,2,,57782.0,,,,CC BY-SA 3.0,"

There are several things wrong with that approach, including:

+ +
    +
  • Seeking a cutoff for a continuous probability
  • +
  • Using an arbitrary cutoff of 0.5
  • +
  • Assuming that the cost of a ""false positive"" and a ""false negative"" are the same for all subjects
  • +
  • Using weights that are not fractional
  • +
  • Using weights that are estimated
  • +
  • Overriding maximum likelihood estimation
  • +
  • Not utilizing optimum Bayes decision theory, which dictates that optimum decisions are based on full information (not on whether something exceeds something else) and utility/loss/cost functions
  • +
+",2013-10-21 12:33:40.250 +57932,20498.0,1,57933.0,,,Combining instance-based learning with regression analysis to improve predictions,,CC BY-SA 3.0,"

I have a table that contains items. Each day it's possible that an incident occurs to some items, but equally no incident may happen. I wish to see if it's possible to create a prediction for the next day based on the information below.

+ +
**Date      Item 1    Incident       Location**
+20130701    40          12           ES
+20130702    50           6           ES
+20120701    60          10           ES
+20120702    20           8           ES 
+
+ +

This could be a regression problem (at the moment I'm only interested in the next day's prediction), but I have an idea that taking the same time period from the previous year and combining it with this year's data based on week number (e.g. week 37 this year with week 37 last year) would enrich the prediction, because the weather or the conditions for the incident could be similar. This might make it possible to get a better prediction.

+ +

I'm using RapidMiner for this, but I would like to know if anyone knows how to implement this correctly, or could even point me to some research papers that may have covered it. I can get up to five years of data, so in essence week 37 five times, but for different years.

+",2013-10-21 13:04:41.890 +57933,16043.0,2,,57932.0,,,,CC BY-SA 3.0,"

Attention conservation notice: this is a long comment.

+ +

Sure -- this approach is called instance-based learning and matches current weather patterns to the ""best matches"" among previously-observed weather conditions; an introduction appears in Data Mining: Practical Machine Learning Tools and Techniques.

+ +

Your proposal merges matching and an explicit generalization (regression). Some things to consider: How do you measure similarity to previous instances? I don't know anything about your data, but I suspect there are similar patterns that have resulted in different outcomes at subsequent intervals.

+ +

A second part of building the model will be considering how to combine/weight the previous instances with the regression forecast to make a prediction. This can be done by simple averaging or more complicated methods like weighting the instances according to how similar they are, according to some metric.

+ +

I'm not familiar with rapidminer so I'm afraid my answer is not complete with respect to your tool-specific advice.

+",2013-10-21 13:17:50.113 +57934,12683.0,2,,57869.0,,,,CC BY-SA 3.0,"

There's Barnett (2009), Comparative Statistical Inference, which does a good job of contrasting different methodologies, with only as much maths as necessary. Nothing on Machine Learning though—some of the references here may be useful (& indeed the answers).

+ +

A survey of different areas of application (psychometrics, econometrics, &c.) would be interesting, & I hope someone can suggest one.

+",2013-10-21 13:48:28.970 +57935,10409.0,1,57944.0,,,What type of regression to use with negative values?,,CC BY-SA 3.0,"

If possible, please explain things like I'm 5. I know very little about this subject, but would like to learn more.

+ +

I have a data frame (in R) containing player_id, points, away, opponent_fact_1, opponent_fact_2. points can be negative. away lets us know if the game was at home or away (0 or 1). opponent_fact_1 and opponent_fact_2 gives us a stat about the opponent. opponent_fact_1 is on a scale of about 0.0-5.5. opponent_fact_2 is on a scale of about 70.0-95.0. The issue with the two facts is that there are fewer opponents as you reach the upper and lower bounds, so fewer data points exist at those levels.

+ +

How can I determine how much of an influence away, opponent_fact_1, and opponent_fact_2 has on a player's points?

+ +

I asked someone online how to do this and he said use poisson regression, but didn't go into detail. Why would regression be helpful here? What is it? And I read that you can't use poisson regression with negative values? Also, how do I deal with the fewer data points around the upper and lower bounds?

+ +

I'm using R, so any examples in R would be awesome. Explaining the output would be even better.

+ +

I hope this isn't asking for too much.

+ +

EDIT: Added sample data

+ +
  player_id opponent_team_id away  points opponent_fact_1 opponent_fact_2 
+1       695               22    0     0.0        2.888889           81.58 
+2       695               30    1     1.2        2.750000           81.58 
+3       695                4    1     3.0        3.714286           69.57 
+4       695               20    0    -3.0        3.000000           84.09 
+5       695               14    0     0.0        2.444444           72.97 
+
+",2013-10-21 13:55:49.620 +57938,21756.0,2,,57686.0,,,,CC BY-SA 3.0,"

Sklar's Theorem is very powerful and reads quite easily, but in practice one has to know the (analytical) distributions before any ""inverse"" can be applied. Without knowing the exact joint and marginal distributions, one can still plot the empirical copula of a data set. At times, these plots reveal e.g. asymmetric patterns that cannot be captured with correlation coefficients.

+ +

Typically, the scaled ranks of a data set (scaled to the interval (0,1)) are a good estimate of a non-parametric inverse of the marginal distributions. An empirical copula's density can then be represented as a scatter plot of the data's scaled ranks. As the spread of (many) points is at times hard to grasp, a smoothed scatter plot provides a visual proxy of the empirical copula's density (see e.g. the function ""smoothScatter"" from the (base) graphics package or ""dependencePlot"" in the package spcopula available from r-forge). In case one does know the marginal distributions, one can of course replace the scaled ranks by the marginally ""inverted"" data.
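
+ +

A minimal R sketch of this, where x is an n-by-2 data matrix:

+ +
u <- apply(x, 2, rank) / (nrow(x) + 1)   # pseudo-observations (scaled ranks) in (0,1)
+smoothScatter(u)                        # smoothed scatter as a proxy for the empirical copula density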

+ +

In case one does know the copula's family and corresponding parameter(s) (sometimes a 1-1 relationship with Kendall's tau), 3D-plots of the copula can be obtained using the function ""persp"" with a copula and PDF/CDF function:

+ +
library(copula)
+persp(claytonCopula(2), dCopula) # plotting the PDF
+persp(claytonCopula(2), pCopula) # plotting the CDF
+
+ +

The package copula provides as well the function ""fitCopula"" that helps to estimate a copula's parameter(s) using different estimators.

+",2013-10-21 15:15:26.320 +57939,8888.0,1,92891.0,,,How to correctly apply the Nemenyi post-hoc test after the Friedman test,,CC BY-SA 4.0,"

I'm comparing the performance of multiple algorithms on multiple data sets. Since those performance measurements are not guaranteed to be normally distributed, I chose the Friedman Test with the Nemenyi post-hoc test based on Demšar (2006).

+ +

I then found another paper that, aside from suggesting other methods like the Quade test with a subsequent Shaffer post-hoc test, applies the Nemenyi test differently.

+ +

How do I apply the Nemenyi post-hoc test correctly?

+ +

1. Using the Studentized range statistic?

+ +

In Demšar's paper it says to reject the null hypothesis (no performance difference of two algorithms) if the average rank difference is greater than the critical distance CD with +$$ +CD = q_{\alpha}\sqrt{{k(k+1)}\over{6N}} +$$

+ +
+

""where critical values qα are based on the Studentized range statistic divided by $\sqrt{2}.$""

+
+ +

After some digging I've found that those critical values can be looked up for certain alphas, for example in a table for $\alpha = 0.05$, for infinite degrees of freedom (at the bottom of each table).

+ +

2. or using the normal distribution?

+ +

Just when I thought I knew what to do, I found another paper that confused me again, because they were only using the normal distribution. Demšar is stating a similar thing at page 12:

+ +
+

The test statistics for comparing the i-th and j-th classifier using these methods is + $$ +z = {{(R_i − R_j)}\over{\sqrt{{k(k +1)}\over{6N}}}} +$$ + The z value is used to find the corresponding probability from the table of normal distribution, which is then compared with an appropriate $\alpha$. The tests differ in the way they adjust the value of $\alpha$ to compensate for multiple comparisons.

+
+ +

At this paragraph he was talking about comparing all algorithms to a control algorithm, but the remark ""differ in the way they adjust ... to compensate for multiple comparisons"" suggests that this should also hold for the Nemenyi test.

+ +

So what seems logical to me is to calculate the p-value based on the test statistic $z$, which is normally distributed, and to correct it by dividing by $k(k-1)/2$.
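A sketch of that normal-approximation route for a single pair in R (the average ranks, k and N below are made up; the adjustment is written as multiplying the p-value by the number of comparisons, which is equivalent to dividing alpha by it):

k <- 5; N <- 20; R_i <- 2.1; R_j <- 3.4
z <- (R_i - R_j) / sqrt(k * (k + 1) / (6 * N))
p <- 2 * pnorm(-abs(z))                  # two-sided p-value
m <- k * (k - 1) / 2                     # number of pairwise comparisons
p_adjusted <- min(1, p * m)              # Bonferroni-style adjustment
c(z = z, p = p, p_adjusted = p_adjusted)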

+ +

However, that yields completely different rank differences at which to reject the null hypothesis. And now I'm stuck and don't know which method to apply. I'm strongly leaning towards the one using the normal distribution, because it is simpler and more logical to me. I also don't need to look up values in tables and I'm not bound to certain significance values.

+ +

Then again, I've never worked with the studentized range statistic and I don't understand it.

+",2013-10-21 15:35:51.460 +57940,8414.0,1,,,,Three open philosophical problems in statistics,,CC BY-SA 3.0,"

I recently finished reading The Lady Tasting Tea, a fun book about the history of statistics. At the end of the book, the author, David Salsburg, proposes three open philosophical problems in statistics, the solutions to which he argues would have larger implications for the application of statistical theory to science. I had never heard of these problems before, so I am interested in other people's reactions to them. I am venturing into territory about which I have little knowledge, so I'm just going to describe Salsburg's portrayal of these problems and pose two general questions about these problems below.

+ +

Salsburg's philosophical problems are:

+ +
1. Can statistical models be used to make decisions?
2. What is the meaning of probability when applied to real life?
3. Do people really understand probability?
+ +

Statistics and decision making

+ +

As an illustration of the problem presented in question 1, Salsburg presents the following paradox. Suppose we organize a lottery with 10000 unnumbered tickets. If we use probability to make a decision about whether any given ticket will win the lottery by rejecting this hypothesis for tickets with probabilities below, say, .001, we will reject the hypothesis of a winning ticket for all the tickets in the lottery!

+ +

Salsburg uses this example to argue that logic is inconsistent with probability theory as probability theory is currently understood, and that, therefore, we currently do not have a good means of integrating statistics (which, in its modern form, is based in large part on probability theory) with a logical means of decision-making.

+ +

The meaning of probability

+ +

As a mathematical abstraction, Salsburg argues that probability works well, but when we attempt to apply the results to real life, we run into the problem that probability has no concrete meaning in real life. More specifically, when we say that there is a 95% chance of rain tomorrow, it is unclear to what entities that 95% applies. Does it apply to the set of possible experiments that we could conduct to obtain knowledge about rain? Does it apply to the set of people who might go outside and get wet? Salsburg argues that the lack of a means to interpret probabilities creates problems for any statistical model based on probability (i.e., most of them).

+ +

Do people understand probability?

+ +

Salsburg argues that one attempt to resolve the issues with the lack of a concrete means of interpreting probability is through the concept of ""personal probability"", proposed by Jimmie Savage and Bruno de Finetti, which understands probability as personal beliefs about the likelihood of future events. However, in order for personal probability to provide a coherent basis for probability, people need to have a common understanding of what probability is and a common means of using evidence to draw conclusions about probability. Unfortunately, evidence such as that produced by Kahneman and Tversky suggests that personal beliefs might be a difficult foundation on which to build a coherent account of probability. Salsburg suggests that statistical methods that model probabilities as beliefs (perhaps such as Bayesian methods? I'm stretching my knowledge here) will need to deal with this problem.

+ +

My questions

+ +
1. To what extent are Salsburg's problems really problems for modern statistics?
2. Have we made any progress towards finding resolutions to these problems?
+",2013-10-21 15:38:25.327 +57941,,1,57950.0,,user30490,Reciprocal roots and eigenvalues relationship in time series,,CC BY-SA 3.0,"

I came across a result in a time series textbook the other day and have not been able to understand why it is true (the authors don't give a proof but just state it as true). I want to show that the eigenvalues of the matrix $\mathbf{G}$ given by

+ +

$$G=\begin{pmatrix} \phi_1 & \phi_2 & \phi_3 & \cdots & \phi_{p-1} & \phi_p\\ 1 & 0 & 0 & \cdots & 0 & 0\\ 0 & 1 & 0 & \cdots & 0 & 0\\ \vdots & & & \ddots & 0 & 0\\ 0 & 0 & \cdots & \cdots & 1 & 0 \end{pmatrix}$$

+ +

correspond to the reciprocal roots of the $AR(p)$ characteristic polynomial

+ +

$$\Phi(u)=1-\phi_1u-\phi_2u^2-...-\phi_pu^p$$

+ +

The one thing I was able to deduce is that the eigenvalues of $\mathbf{G}$ must satisfy $$\lambda^p-\phi_1\lambda^{p-1}-\phi_2\lambda^{p-2}-...-\phi_{p-1}\lambda-\phi_p=0$$

+",2013-10-21 15:39:32.187 +57942,22909.0,1,,,,P value calculation,,CC BY-SA 3.0,"

I have three groups of experiments. For each experiment I am looking for the percentage of occurrence of case x.

+ +

In the first group I have 15 experiments. Case x was seen 10.191% of the time across these 15 experiments.

+ +

In the second group I have 6 experiments. The percentage of x is 1.564%.

+ +

In the third group I have 3 experiments. The percentage of x is 0%.

+ +

I want to show that occurrence of case x significantly decreased from group one to two and to three. Thus I want to calculate the p-values. Can anyone tell me how to do it?

+ +

Edit:

+ +

The number of measurements for the first group is 22568 and 10.191% of these measurements are case x.

+ +

The second group has 1854 measurements (1.564% are x) and the third group has 1164 measurements (0% x)
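One simple route, assuming the measurements within each group can be treated as independent, is a test of equal proportions on the counts; a sketch in R (the x counts are rounded back from the percentages given above):

x <- c(round(0.10191 * 22568), round(0.01564 * 1854), 0)   # approx. 2300, 29, 0
n <- c(22568, 1854, 1164)
prop.test(x, n)                        # overall chi-squared test of equal proportions
prop.test(x[1:2], n[1:2])              # pairwise: group 1 vs group 2
prop.test(x[2:3], n[2:3])              # pairwise: group 2 vs group 3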

+",2013-10-21 15:42:32.237 +57943,2081.0,2,,57894.0,,,,CC BY-SA 3.0,"

Unstandardized principal component values (PC scores) are given by $\bf XU$, where $\bf X$ is the data matrix and $\bf U$ is the matrix of eigenvectors. If the covariance matrix was decomposed to obtain $\bf U$, then $\bf X$ must be centered to give proper PC scores. But you said that you used raw data to compute the PC scores even though the eigenvectors came from the covariance matrix. What happens then?

+ +

Example is below. Variables are V1, V2. PC1 and PC2 are the scores computed in the usual way; these are centered PC scores. dcPC1 and dcPC2 are the scores computed by ""your"" way; these are decentered PC scores.

+ +
      V1       V2      PC1      PC2    dcPC1    dcPC2
+
+  6.7662   8.6155   2.8843    .3930  10.9224    .8427
+  5.9534   6.9533   1.0506    .6401   9.0887   1.0897
+  5.1772   4.6352  -1.3083   1.2819   6.7298   1.7315
+  5.3906   3.5785  -2.0685   2.0461   5.9696   2.4958
+  3.0136   6.5524   -.9154  -1.5821   7.1227  -1.1325
+  1.4195   3.8332  -4.0620  -1.3978   3.9761   -.9481
+  4.9248   6.0971   -.2327    .2602   7.8054    .7098
+  4.5031   8.5152   1.5441  -1.4333   9.5822   -.9837
+  6.0504   6.8867   1.0491    .7578   9.0872   1.2074
+  1.7513   2.2287  -5.2121   -.2308   2.8260    .2188
+  4.4432   7.1862    .4056   -.7451   8.4437   -.2955
+  2.8280   5.4160  -1.9635  -1.1054   6.0746   -.6558
+  6.8661   3.4229  -1.3786   3.3597   6.6595   3.8093
+  3.6724   3.9823  -2.6869    .3930   5.3512    .8426
+  5.8395   6.4047    .5311    .8500   8.5692   1.2996
+  6.7118  11.4956   5.2492  -1.2516  13.2873   -.8020
+  4.7179   8.8247   1.9208  -1.4266   9.9589   -.9770
+  1.0230   3.2331  -4.7813  -1.3943   3.2568   -.9446
+  7.2815  10.1138   4.4165   -.0105  12.4546    .4391
+  8.4197  10.7265   5.5581    .5958  13.5962   1.0454
+
+ +

The picture displays the principal components drawn in the space of the variables. In both computational cases, the PCs are the same - as axes. They orthogonally cross each other in the centre of the cloud. This is because the covariance matrix was analyzed (it implies centering). However, the scores (shown as markers on the PC axes) are different. ""Decentered PC1"" = ""Centered PC1"" + 8.04. ""Decentered PC2"" = ""Centered PC2"" + 0.45. It is unclear what use such decentered scores could have, since they do not share their means with each other or with the data cloud.
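A minimal R sketch of the two kinds of scores (simulated data, names mine):

set.seed(1)
X <- matrix(rnorm(200), ncol = 2) %*% matrix(c(2, 1, 0, 1), 2, 2)
V <- eigen(cov(X))$vectors                              # eigenvectors of the covariance matrix
scores_centered   <- scale(X, center = TRUE, scale = FALSE) %*% V
scores_decentered <- X %*% V                            # raw (uncentered) data projected on the same axes
colMeans(scores_centered)                               # approximately 0
colMeans(scores_decentered)                             # shifted by the projected data mean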

+ +

+",2013-10-21 15:55:45.527 +57944,1805.0,2,,57935.0,,,,CC BY-SA 3.0,"

Regular linear regression (e.g. the lm or glm functions in R) handles negative values just fine.

+ +

One model you could try would be:

+ +
model1 <- lm(points ~ away + opponent_fact_1 + opponent_fact_2, data=my_data_frame)
+summary(model1)
+
+ +

If you've got a lot of data (and several rows per player and per opponent), you could also try this model:

+ +
model2 <- lm(points ~ away + factor(player_id) + factor(opponent_team_id), data=my_data_frame)
+summary(model2)
+
+ +

This will give you a model that includes a coefficient for each player, and for each opponent_team_id. These coefficients will represent the average points expected for a player, as well as the average points expected against a given opponent.

+ +

Have you ever run a regression model before? What's the goal of this analysis?

+",2013-10-21 16:16:55.103 +57945,651.0,2,,57940.0,,,,CC BY-SA 3.0,"

Can we use statistics/probability to make decisions? Of course we can; the way to go about it is to choose the course of action that minimises our expected loss. In this case, all lottery numbers are equally likely to come up; if all provide the same prize, then the expected loss is the same for any number, so it doesn't matter which we choose. If we also have the option not to play the lottery, that would probably be the course of action we should take, as it will minimise our expected loss assuming that the lottery makes a profit for somebody (or at least covers the cost of running the lottery). Of course this is just common sense and is consistent with logic, and could be expressed in purely probabilistic terms.

+ +

It seems to me that the question arises from a rather limited view of how statistics can be used to make decisions, it doesn't have to be done with quasi-Fisherian hypothesis tests.

+ +

I would suggest that Jaynes' book on probability theory goes a fair way to addressing points (2) and (3): probabilities can represent objective measures of plausibility without them being ""personal probabilities"", but I expect @probabilityislogic can explain that better than I can.

+",2013-10-21 16:18:15.443 +57946,10469.0,2,,57940.0,,,,CC BY-SA 3.0,"

I don't think these really are questions which can be answered conclusively (IOW, they are, indeed, philosophical). That said...

+ +

Statistics and decision making

+ +

Yes, we can use statistics in decision making.

+ +

However, there are limits to its applicability; IOW, one has to understand what one is doing.

+ +

This is fully applicable to any theory.

+ +

The meaning of probability

+ +

95% probability of rain tomorrow means that if your cost of preparing for a rain (e.g., taking the umbrella) is A and your cost of being caught in the rain unprepared (e.g., wet suit) is B, then you should take the umbrella with you iff A < 0.95 * B.

+ +

Do people understand probability?

+ +

No, people do not understand much, least of all probability.

+ +

Kahneman and Tversky have shown that human intuition is flawed on many levels, but intuition and understanding are not identical, and I would argue that people understand even less than they intuit.

+ +

To what extent are Salsburg's problems really problems for modern statistics?

+ +

Nil. I don't think anyone cares about these issues except for philosophers and those in a philosophical mood.

+ +

Have we made any progress towards finding resolutions to these problems?

+ +

Everyone who cares has a resolution. My personal resolution is above.

+",2013-10-21 16:55:13.357 +57947,15209.0,1,,,,Maximizing incomplete likelihood,,CC BY-SA 3.0,"

Suppose we are given the conditional distribution $p(x|y)$ and the prior of the hidden variables $p(y|\theta)$ with unknown hyper-parameter $\theta$, and we have observed i.i.d. samples of $x$.

+ +

Besides the Bayes and empirical Bayes approach, is it possible to estimate both the hidden variables $y$ and the hyper-parameter $\theta$ directly by maximizing the incomplete likelihood $p(x,y|\theta)$?

+ +

From the Bayesian perspective, it is equivalent to finding the joint mode of $p(y,\theta|x)$ given an uninformative flat prior of $\theta$.

+",2013-10-21 16:58:59.500 +57948,22910.0,1,,,,Comparison of cell counts with a right censoring,,CC BY-SA 3.0,"

I have cell counts related to the action of different microorganisms and I want to compare their distributions. They are supposed to follow a normal distribution after a log transformation, but I can't register all the data due to the limit of the measuring instrument, so I obtain a curve that looks very much like a normal one but is cut off at the point beyond which I can't register data; if I run a K-S test, it doesn't support the normality hypothesis (but n is large).

+ +

My questions:

+ +

I imagine this is a common problem, so how would you demonstrate normality in these cases? Or is it impossible?

+ +

I think a Kruskal-Wallis test could work in my case, but I don't know if I have to take anything else into account. Someone has suggested that I use an ANOVA because my n is very high (n>2000) for each microorganism, despite the lack of normality; is this right?

+ +

After reading the comments I think I have right censoring. I have seen that a Kaplan-Meier estimator is used in survival analysis, but this case is different. Would it be better to just remove the censored data and run a non-parametric test?

+",2013-10-21 17:02:30.750 +57949,22752.0,1,57974.0,,,$Cov(\hat{\epsilon})$ in a linear regression model,,CC BY-SA 3.0,"

+ +

I proved the result myself too, but my proof was a lot lengthier; apparently it can be done in just these two lines. However, I don't really get the first step here. Could anyone please explain why the first equality holds?

+",2013-10-21 17:24:54.920 +57950,668.0,2,,57941.0,,,,CC BY-SA 3.0,"

An eigenvalue of any matrix $\mathbb{G}$ must be a root of its characteristic polynomial $p_G(\lambda) = \det(\lambda - \mathbb{G}).$ By row-reducing the latter we readily find that

+ +

$$p_G(\lambda) = \lambda^p-\phi_1\lambda^{p-1}-\phi_2\lambda^{p-2}-...-\phi_{p-1}\lambda-\phi_p.$$

+ +

If $u = 1/\lambda$ is the reciprocal of an eigenvalue, then $1/u = \lambda,$ whence

+ +

$$0 = p_G(\lambda) = p_G\left(\frac{1}{u}\right) = \left(\frac{1}{u}\right)^p-\phi_1\left(\frac{1}{u}\right)^{p-1}-\phi_2\left(\frac{1}{u}\right)^{p-2}-...-\phi_{p-1}\left(\frac{1}{u}\right)-\phi_p \\ = u^{-p}\left(1-\phi_1u^{1}-\phi_2u^{2}-...-\phi_{p-1}u^{p-1}-\phi_pu^{p}\right) = u^{-p}\Phi(u).$$

+ +

Since $u$ must be nonzero (it's the reciprocal of a number), multiplying both sides by $u^p$ does not change the roots: $u$ must therefore be a root of $\Phi$.
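A quick numerical check of this correspondence in R (the AR(3) coefficients below are made up for illustration):

phi <- c(0.5, -0.3, 0.2)
p <- length(phi)
G <- rbind(phi, cbind(diag(p - 1), 0))        # companion matrix as in the question
ev <- eigen(G)$values
roots <- polyroot(c(1, -phi))                 # roots of Phi(u) = 1 - phi1*u - ... - phip*u^p
sort(Mod(ev))                                 # moduli of the eigenvalues
sort(Mod(1 / roots))                          # moduli of the reciprocal roots; these match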

+",2013-10-21 17:35:58.853 +57951,22911.0,1,,,,Finding the most relevant factors that lead to an event,,CC BY-SA 3.0,"

I have been tasked with looking at a set of data with about 6000 records that each have 60 or so qualities associated with them (Let these be X1, X2, ... ) and determining what are the top 8 factors that determine whether a record will have a certain designation (there are three, Let's say, A, B, and C). Most of these X's only have two possible values so they are easy to deal with. However there is an X that has 6 values and an X that has 8 values which I have determined to be fairly important. For now, I am just interested in finding the factors amongst the data that lead a record to be event A. My approach so far has been to calculate all of the probabilities of each individual factor (A, B, C and all X's), calculate the probabilities of all of the X's given factor A, and to then use Bayes' theorem to calculate the probability of event A given the X. This gives me a feel for which factors seem to be contributing to event A but I am not sure how I should continue my analysis.

+ +

I also delved into looking at intersections of events but I feel like that is a big time sink that yields little to no insight because that requires counting all of the intersections and when you want to look at 5+ factors, the amount of counting you have to do is ridiculous (For example, if you want to look at 5 factors, you need to find 2^5 different events because of the possible ways the events can line up. This causes problems especially when I am looking at the factors that have 6 or 8 different possibilities). I wrote some simple java classes to read in a tab delimited text file of the records and to do all of the counting and calculating for me for individual events and some intersections and I also have an excel spreadsheet that I set up first but that seemed to be fairly slow for performing the calculations. Besides adding specific code (or specific formulas in excel) to pick up each individual event, I am not sure how I can efficiently calculate these probabilities. And I'm not sure if calculating these conditional probabilities of the intersections will help me answer the overall question of what are the top 8 factors that contribute to event A. Any thoughts would be greatly appreciated.

+",2013-10-21 17:36:12.943 +57952,10964.0,1,,,,Standard Error vs. Standard Deviation of Sample Mean,,CC BY-SA 3.0,"

I'm having difficulty figuring out the difference between:

+ +

A. The Standard Error (typically referred to as S.E.)

+ +

And

+ +

B. The Standard Deviation of the Sample Mean (typically referred to as s)

+ +

Are they the same thing?

+ +

OR

+ +
1. S.E is the standard deviation of the mean of the sampling distribution -- similar formula to the standard deviation except you use n-1 instead of n in the denominator
2. s is the standard deviation of a single random sample -- same formula as the standard deviation
+",2013-10-21 17:37:59.103 +57953,22912.0,2,,57941.0,,,,CC BY-SA 3.0,"

For any polynomial $p(x)$, we can define a reciprocal polynomial of the form $x^n p(1/x)$ where the roots of this reciprocal function are the reciprocal roots of the original polynomial.

+ +

In the case of $\phi(u)$, the reciprocal polynomial would look like:
\begin{align} u^p p(1/u) &= u^p - \phi_1 u^p/u - \phi_2 u^p/u^2 - \ldots - \phi_p u^p/u^p \\ &= u^p - \phi_1 u^{p-1} - \phi_2 u^{p-2} - \ldots - \phi_p. \end{align}
+A reordering of this equation reveals the exact characteristic eigenvalue equation that you have found above, only now the $u$ values have become constant eigenvalues. Thus, solutions to the eigenvalue equation will be reciprocal roots of the ${\rm AR}(p)$ characteristic equation.

+",2013-10-21 17:45:28.917 +57954,20473.0,2,,57952.0,,,,CC BY-SA 3.0,"

The official term for the dispersion measure (of a distribution, of a sample etc) is ""standard deviation"" - the square root of the variance.

+ +

The tern ""standard error"" is more often used in the context of a regression model, and you can find it as ""the standard error of regression"". It is the square root of the sum of squared residuals from the regression - divided sometimes by sample size $n$ (and then it is the maximum likelihood estimator of the standard deviation of the error term), or by $n-k$ ($k$ being the number of regressors), and then it is the ordinary least squares (OLS) estimator of the standard deviation of the error term.

+ +

So you see that they are closely related, but not the same thing.

+",2013-10-21 17:56:34.387 +57955,,2,,57952.0,user31668,,,CC BY-SA 3.0,"

For normally distributed data, the SE = s, as the mean is an explicit parameter of the normal distribution. In general, the standard error arises in likelihood theory, where you are forming inferences from a likelihood function as opposed to the true sampling distribution. For example, if you are modeling some data as iid Exponential then you would form the likelihood function for your data $L(X|\lambda)= \prod L_{exp}(x_i|\lambda)$, with unknown $\lambda$, and then optimize $L(X|\lambda)$ for $\lambda^*$ (i.e., the maximum likelihood estimator). The standard error is defined via the curvature of the quadratic approximation to $\log L(X|\lambda^*)$ at the MLE, which will equal the standard deviation for normal data. The only difficulty is that for non-normal data, you will need to do a second step to transform the actual parameters of your distribution (e.g., $\lambda$) into an estimate of the sample mean. Here, you would map $\frac{1}{\lambda} \rightarrow\mu$, so the likelihood of the latter equals that of the former, then take the log of that likelihood and get a standard error from that transformed likelihood function. Sorry for the long answer, but it's not super clear cut in all cases. Sometimes it's even used loosely, so you need to read the documentation to really know.

+",2013-10-21 17:56:53.347 +57956,16644.0,2,,57952.0,,,,CC BY-SA 3.0,"

The standard deviation of the mean is usually unknown. We would write it as $$ \sigma_{\bar x } ={\sigma \over \sqrt n} $$

+ +

The standard error of the mean is an estimate of the standard deviation of the mean. $$ \hat \sigma_{\bar x} = {s \over \sqrt n}. $$

+",2013-10-21 17:57:03.800 +57957,22555.0,1,,,,Sequential Inference And Evidence (Jaynes 2003): Is it valid? Is it used?,,CC BY-SA 3.0,"

Exploring the work of ET Jaynes, Probability Theory (11th Printing 2013) has led to consideration of the technique he identifies as Sequential Inference (p. 96); where the evidence, in decibels, accumulates until the investigator either (1) stops with acceptance, (2) stops with rejection or (3) continues with another test. Jaynes' technique appears to be a special case of, and perhaps a more Bayesian approach than, sequential analysis as proposed by Abraham Wald. But I am certainly not an expert here.

+ +

This looks to be an incredibly powerful technique that can be used to investigate a claim of compliance/non-compliance in a cost-effective and time-efficient manner. Yet when searching the internet and also checking other Bayesian texts (such as Sivia and Skilling) there is little depth on it, if it is mentioned at all.

+ +

So these questions are posed:

+ +

(1) Is it used in practice?

+ +

(2) If so, in what way, if not, why not?

+ +

(3) Are there critical issues/pitfalls with its use in practice?

+ +

(4) Is (are) there any in-depth reference(s) with case studies of application?

+ +

Addendum

+ +

Since posting this earlier, we have found an excellent collection of information here.

+ +

Coming from an engineering background, by nature we tend to be data driven, and this would put engineers solidly in the Frequentist camp. However, time and again we have seen that these analyses should start from a position of logic, and it is clear that this is what a proper Bayesian approach will accomplish. So there is no doubt in our mind that the Bayesian method trumps the Frequentist one.

+ +

However, what we are specifically interested in is the concept of evidence in decibels (it seems that bels or maybe the Jaynes Scale (0,10] analogous to the Richter Scale might have been a more interesting way to consider it).
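A minimal sketch of accumulating evidence in decibels in R (the two Bernoulli hypotheses, the simulated data and the accept/reject thresholds are all illustrative, not prescribed by Jaynes):

set.seed(1)
p0 <- 0.5; p1 <- 0.7
x <- rbinom(30, 1, p1)                               # observed pass/fail data
db_step <- 10 * log10(dbinom(x, 1, p1) / dbinom(x, 1, p0))
evidence <- cumsum(db_step)                          # running evidence for H1 over H0, in dB
plot(evidence, type = ""s"", xlab = ""observation"", ylab = ""evidence (dB)"")
abline(h = c(-20, 20), lty = 2)                      # illustrative stopping thresholds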

+ +

But is this approach used for more routine issues found in practice, and are there any case studies of such?

+",2013-10-21 18:09:49.567 +57958,22.0,2,,57952.0,,,,CC BY-SA 3.0,"

A standard error can be computed for almost any parameter you compute from data, not just the mean. The phrase ""the standard error"" is therefore ambiguous. I assume you are asking about the standard error of the mean.

+ +

Here are the key differences between the standard deviation (SD) and the standard error of the mean (SEM)

+ +
  • The SD quantifies scatter — how much the values vary from one another.
  • The SEM quantifies how precisely you have determined the true mean of the population. It takes into account both the value of the SD and the sample size.
  • Both SD and SEM are in the same units -- the units of the data.
  • The SEM, by definition, is always smaller than the SD.
  • The SEM gets smaller as your samples get larger. This makes sense, because the mean of a large sample is likely to be closer to the true population mean than is the mean of a small sample. With a huge sample, you'll know the value of the mean with a lot of precision even if the data are very scattered.
  • The SD does not change predictably as you acquire more data. The SD you compute from a sample is the best possible estimate of the SD of the overall population. As you collect more data, you'll assess the SD of the population with more precision. But you can't predict whether the SD from a larger sample will be bigger or smaller than the SD from a small sample. (This is not strictly true. It is the variance -- the SD squared -- that doesn't change predictably, but the change in SD is trivial and much much smaller than the change in the SEM.)
  • The SEM is hard to define conceptually. The only real ""purpose"" of an SEM is as an ""ingredient"" to compute the confidence interval of the mean.
  • The SEM is computed from the SD and sample size (n) as $$SEM = {SD \over \sqrt n}.$$
+ +

(From the GraphPad statistics guide that I wrote.)

+",2013-10-21 18:27:55.197 +57959,21985.0,1,,,,Ratio of lengths of two confidence intervalls,,CC BY-SA 3.0,"

I have two random variables:

+ +

(1) With standard normal distribution. Confidence interval $I_1$, which is centered and has probability of $(1-\alpha)$

+ +

(2) With a t-distribution. Confidence interval $I_2$, also centered, with probability $(1-\alpha)$.

+ +

First I had to compute the length of both: I got $\mathcal{L}(I_1) = 2 \cdot z_{1-\alpha/2} \frac{\sigma}{\sqrt{n}}$ and $\mathcal{L}(I_2) = 2 \cdot t_{1-\alpha/2;n-1} \frac{S}{\sqrt{n}}$. I hope this is correct.

+ +

Now I have to calculate the asymptotic distribution of the ratio: $\Lambda = \frac{\mathcal{L}(I_2)^2}{\mathcal{L}(I_1)^2}$. As a hint I got $(n-1)\frac{S^2}{\sigma^2} \sim \chi_{n-1}^2$.

+ +

So far I got: $\Lambda = \frac{z_{1-\alpha/2}^2}{t_{1-\alpha/2;n-1}^2}\cdot\frac{S^2}{\sigma^2}$.

+ +

How do I express the ratio of the two distributions? I know that the t-distribution converges to a standard normal as n goes to infinity, but does that help? And do I get the $(n-1)$ from the hint somehow, or is this not really relevant?

+",2013-10-21 18:31:53.557 +57960,5821.0,2,,57782.0,,,,CC BY-SA 3.0,"

The best approach to achieving what you're trying to describe is probably to directly optimize the logistic regression parameters with an AUC loss function. The textbook ""Statistical Methods in Diagnostic Medicine"" by Zhou describes this method.

+ +

The AUC (area under the receiver operating characteristic curve-- or ROC) is roughly interpreted as the probability that a randomly sampled ""case"" has a higher marker value than a ""control"". This is a measure of model discrimination, or its ability to correctly classify the outcome. The ROC is a curve in the unit plane which shows the sensitivity versus 1 - specificity for all possible marker values (fitted outcomes) in a regression model.

+ +

By using the traditional formulation of the logistic regression model,

+ +

$$ \mbox{logit Pr}(Y = 1 | X) = \alpha + \beta X$$

+ +

with log odds ratios for model parameters, you can roughly define an AUC based loss function to obtain optimal parameters. Unlike likelihood based logistic regression, AUC regression is not regular and can converge to local maxima in the parameter space.

+",2013-10-21 18:41:31.170 +57961,232.0,2,,57936.0,,,,CC BY-SA 3.0,"

There are two basic approaches to generating data with piecewise constant hazard: inversion of the cumulative hazard and the composition method.

+ +
1. Inversion of the cumulative hazard - essentially the inverse CDF method. Since $F(t) = 1-\exp(-H(t))$, if $U \sim Unif(0,1)$ then $F(X) = U$ is equivalent to $1-\exp(-H(X)) = U$, so $X=H^{-1}(-\log(1-U))$. You can also note that $-\log(1-U) \sim Exp(1)$, so you can apply the inverse cumulative hazard to an exponential random variable. The cumulative hazard is piecewise linear in your case, and should be easy to invert.
+ +

Edit (more detail): with two change-points, the hazard is: $$h(t) = \left\{ \begin{matrix} f_1 , & 0\leq t\leq t_1\\ f_2, & t_1 < t \leq t_2\\ f_3, & t > t_2 \end{matrix}\right.$$ The cumulative hazard is: $$H(t) = \left\{ \begin{matrix} f_1 t , & 0\leq t\leq t_1\\ f_1 t_1 + f_2(t-t_1), & t_1 < t \leq t_2\\ f_1t_1 + f_2(t_2-t_1) + f_3(t-t_2), & t > t_2 \end{matrix}\right.$$ The inverse of the cumulative hazard is: $$H^{-1}(x) = \left\{ \begin{matrix} x/f_1 , & 0\leq x\leq f_1t_1\\ t_1 + (x-f_1t_1)/f_2, & f_1t_1 < x \leq f_1t_1 + f_2(t_2-t_1)\\ t_2 + (x-f_1t_1-f_2(t_2-t_1))/f_3, & x > f_1t_1 + f_2(t_2-t_1)\end{matrix}\right.$$

+ +

Now generate an exponentially distributed random variable and plug it into $H^{-1}$. End edit.
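A sketch of this inversion in R (the function name and the hazard values below are just for illustration):

rpch <- function(n, f1, f2, f3, t1, t2) {
  x  <- rexp(n)                                # Exp(1) draws, plugged into H^{-1}
  b1 <- f1 * t1                                # H(t1)
  b2 <- b1 + f2 * (t2 - t1)                    # H(t2)
  ifelse(x <= b1, x / f1,
    ifelse(x <= b2, t1 + (x - b1) / f2,
           t2 + (x - b2) / f3))
}
times <- rpch(10000, f1 = 0.2, f2 = 0.5, f3 = 0.1, t1 = 1, t2 = 3)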

+ +
2. The Composition method uses the fact that if $X_1$ has hazard $h_1$, and $X_2$ has hazard $h_2$, then $X=\min(X_1,X_2)$ has hazard $h=h_1+h_2$. You can represent your piecewise constant hazard as a sum of hazards that are constant on an interval and 0 outside. Generate a value $X_i$ for each interval (it could be $\infty$, since the resulting distributions are not necessarily proper), and take their minimum.
+ +

Edit (more detail): with the above notation, the composition hazards are $$h_1(t) = \left\{ \begin{matrix} f_1 , & 0\leq t\leq t_1\\ 0, & t > t_1 \end{matrix}\right.$$ $$h_2(t) = \left\{ \begin{matrix} f_2 , & t_1 < t\leq t_2\\ 0, & \text{otherwise} \end{matrix}\right.$$ $$h_3(t) = \left\{ \begin{matrix} f_3 , & t_2 < t\\ 0, & \text{otherwise} \end{matrix}\right.$$ You can easily calculate the CDF or the cumulative hazard for each of these hazards.

+ +

One resource with R-Code

+",2013-10-21 18:48:02.733 +57962,10594.0,1,,,,What's the likelihood function in this case?,,CC BY-SA 3.0,"

I have a data set with a binary response variable indicating non-zero surviving cells (y=0) or zero surviving cells (y=1), and I would like to build a model with explanatory variable X. I know that the most straightforward method is to use logistic regression. However, in my case, it is more biologically sound to use a Poisson model. Thus, the number of surviving cells follows a Poisson distribution $f(k,\lambda)$, and the response (probability of having zero surviving cells) is the Poisson distribution at $k=0$.

+ +

Therefore, my model can be written as $y=f(0,\lambda)=e^{-\lambda}$, where $\lambda=g(\beta,x)$. $\lambda$ is a function of $x$ with parameter $\beta$. I am trying to estimate $\beta$ using MLE.

+ +

Now I am a bit confused about which likelihood function I should use. Shall I assume that $y$ follows a binomial distribution, $y\sim B(1,e^{-\lambda})$, so that I could conduct MLE on the likelihood function from this binomial distribution? What about the Poisson distribution? Is it not used in the estimation? Suppose $\beta^{*}$ is the estimated parameter, which gives $\lambda^{*}$. Does this mean that the probability of having zero surviving cells follows a $B(1,e^{-\lambda^{*}})$ distribution and the number of surviving cells follows a Poisson $f(k,\lambda^{*})$ distribution?
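A sketch of the Bernoulli likelihood implied by $P(y=1)=e^{-\lambda}$ in R; the functional form $\lambda = \exp(\beta_0 + \beta_1 x)$ is only an assumption here to keep $\lambda$ positive (the choice of $g$ is yours), and the data are simulated:

set.seed(1)
x <- runif(200, 0, 3)
lambda_true <- exp(1 - 0.8 * x)
y <- as.numeric(rpois(200, lambda_true) == 0)    # 1 if zero surviving cells
negloglik <- function(b, x, y) {
  lambda <- exp(b[1] + b[2] * x)
  p1 <- exp(-lambda)                             # P(zero surviving cells)
  -sum(y * log(p1) + (1 - y) * log(1 - p1))
}
fit <- optim(c(0, 0), negloglik, x = x, y = y)
fit$par                                          # roughly (1, -0.8) for this simulation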

+",2013-10-21 18:55:31.027 +57963,20222.0,2,,57951.0,,,,CC BY-SA 3.0,"

If you believe there is little if any interaction between the independent variables, a simple approach is the linear-additive model.

+ +

$$Y = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \cdots + \beta_{60} X_{60} + e$$

+ +

Then by examining the coefficients you can identify the top 8 factors.

+ +

This paper may be of some use. If the above model is too simple, there are many other approaches such as neural networks, symbolic regression, splines and others.

+",2013-10-21 19:12:41.107 +57964,17573.0,2,,57782.0,,,,CC BY-SA 3.0,"

You don't seem to want logistic regression at all. What you say is ""I would like to maximize the difference between true positives and false positives."" That is a fine objective function, but it is not logistic regression. Let's see what it is.

+ +

First, some notation. The dependent variable is going to be $Y_i$:
\begin{align} Y_i &= \left\{ \begin{array}{l} 1 \qquad \textrm{Purchase $i$ was profitable}\\ 0 \qquad \textrm{Purchase $i$ was un-profitable} \end{array} \right. \end{align}

+ +

The independent variables (the stuff you use to try to predict whether you should buy) are going to be $X_i$ (a vector). The parameter you are trying to estimate is going to be $\beta$ (a vector). You will predict buy when $X_i\beta>0$. For observation $i$, you predict buy when $X_i\beta>0$ or when the indicator function $\mathbf{1}_{X_i\beta>0}=1$.

+ +

A true positive happens on observation $i$ when both $Y_i=1$ and $\mathbf{1}_{X_i\beta>0}=1$. A false positive on observation $i$ happens when $Y_i=0$ and $\mathbf{1}_{X_i\beta>0}=1$. You wish to find the $\beta$ which maximizes true positives minus false positives, or: \begin{equation} \max_\beta \; \sum_{i=1}^N Y_i\cdot\mathbf{1}_{X_i\beta>0} - \sum_{i=1}^N (1-Y_i)\cdot\mathbf{1}_{X_i\beta>0} \end{equation}

+ +

This is not an especially familiar objective function for estimating a discrete response model, but bear with me while I do a little algebra on the objective function: \begin{align} &\sum_{i=1}^N Y_i\cdot\mathbf{1}_{X_i\beta>0} - \sum_{i=1}^N (1-Y_i)\cdot\mathbf{1}_{X_i\beta>0}\\ = &\sum_{i=1}^N Y_i\cdot\mathbf{1}_{X_i\beta>0} - \sum_{i=1}^N \mathbf{1}_{X_i\beta>0} + \sum_{i=1}^N Y_i\cdot\mathbf{1}_{X_i\beta>0}\\ = &\sum_{i=1}^N Y_i\cdot\mathbf{1}_{X_i\beta>0} - \sum_{i=1}^N \mathbf{1}_{X_i\beta>0} + \sum_{i=1}^N Y_i\cdot\mathbf{1}_{X_i\beta>0} \\ & \qquad + \sum_{i=1}^N 1 - \sum_{i=1}^N 1 + \sum_{i=1}^N Y_i - \sum_{i=1}^N Y_i\\ = &\sum_{i=1}^N Y_i\cdot\mathbf{1}_{X_i\beta>0} + \sum_{i=1}^N (1-Y_i)(1-\mathbf{1}_{X_i\beta>0}) - \sum_{i=1}^N 1 + \sum_{i=1}^N Y_i \end{align}

+ +

OK, now notice that the last two terms in that sum are not functions of $\beta$, so we can ignore them in the maximization. Finally, we have just shown that the problem you want to solve, ""maximize the difference between true positives and false positives"", is the same as this problem: \begin{equation} \max_\beta \; \sum_{i=1}^N Y_i\cdot\mathbf{1}_{X_i\beta>0} + \sum_{i=1}^N (1-Y_i)(1-\mathbf{1}_{X_i\beta>0}) \end{equation}

+ +

Now, that estimator has a name! It is named the maximum score estimator. It is a very intuitive way to estimate the parameter of a discrete response model. The parameter is chosen so as to maximize the number of correct predictions. The first term is the number of true positives, and the second term is the number of true negatives.
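A minimal sketch of this objective in R, with one regressor and the scale normalised so that its coefficient is 1, maximised by a crude grid search over the intercept (the data are simulated and the grid is arbitrary; this is only meant to show the counting objective, not a production implementation):

set.seed(1)
n <- 500
x <- rnorm(n)
y <- as.numeric(x + 0.5 + rlogis(n) > 0)                            # true intercept 0.5
score <- function(a) sum(y * (x + a > 0) + (1 - y) * (x + a <= 0))  # correct predictions
grid <- seq(-3, 3, by = 0.01)
a_hat <- grid[which.max(sapply(grid, score))]
a_hat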

+ +

This is a pretty good way to estimate a (binary) discrete response model. The estimator is consistent, for example. (Manski, 1985, J of Econometrics) There are some oddities to this estimator, though. First, it is not unique in small samples. Once you have found one $\beta$ which solves the maximization, then any other $\beta$ which makes the exact same predictions in your dataset will solve the maximization---so, infinitely many $\beta$s close to the one you found. Also, the estimator is not asymptotically normal, and it converges slower than typical maximum likelihood estimators---cube root $N$ instead of root $N$ convergence. (Kim and Pollard, 1990, Ann of Stat) Finally, you can't use bootstrapping to do inference on it. (Abrevaya & Huang, 2005, Econometrica) There are some papers using this estimator though---there is a fun one about predicting results in the NCAA basketball tournament by Caudill, International Journal of Forecasting, April 2003, v. 19, iss. 2, pp. 313-17.

+ +

An estimator that overcomes most of these problems is Horowitz's smoothed maximum score estimator (Horowitz, 1992, Econometrica and Horowitz, 2002, J of Econometrics). It gives a root-$N$ consistent, asymptotically normal, unique estimator which is amenable to bootstrapping. Horowitz provides example code to implement his estimator on his webpage.

+",2013-10-21 19:18:00.883 +57965,22914.0,1,58047.0,,,Drawing with repetition,,CC BY-SA 3.0,"

I thought I was fairly good at combinatorics, but this puzzle is giving me some trouble!

+ +

I have a bag with $N$ balls. I pick one at random, mark it and put it back inside the bag. I repeat this operation $D$ times. What is the probability that I will end up with exactly $C$ clean balls?

+",2013-10-21 19:35:50.170 +57966,22885.0,2,,57894.0,,,,CC BY-SA 3.0,"

@ttnphns Thank you for your answer. You're right that in the common case, without mean-centering, PCA would lose a lot of sense.

+ +

But I think, if I use it to regularize regression, it could work.

+ +

Here I wrote a bit of algebra for myself. I hope it's correct, but I would appreciate some comments, cause I'm not completely sure.

+ +
1. Initial problem (doesn't work because $A'A$ is nearly collinear and numerically uninvertible): $$y = Ax$$
2. Decompose the covariance matrix of the (standardized) data to get the correct (from the PCA perspective) eigen-everything: $$cov(A) = VDV'$$
3. Transform the raw data (without mean centering) into an orthogonal basis (just a linear transformation): $$AV$$
4. Coefficients $x^*$ for the transformed problem can be found because the matrix $AV$ is well-conditioned: $$y = AVx^*$$
5. As (1) and (4) are approximately equal: $$Ax = AVx^*$$
6. so... $$x = (A'A)^{-1}A'AVx^*$$
7. $A'A$ seems to have full rank, so theoretically it is invertible analytically: $$x = Vx^*$$
+",2013-10-21 20:16:20.140 +57967,20604.0,1,,,,When is it appropriate to model count data as continuous?,,CC BY-SA 4.0,"

I have time series of several variables with 60 or so rows of count data. I want to fit a regression model y ~ x. I've chosen to use Quasi-Poisson and Negative Binomial GLMs as there's overdispersion etc.

+ +
x
+Min.   : 24000  
+1st Qu.: 72000  
+Median :117095  
+Mean   :197607  
+3rd Qu.:291388  
+Max.   :607492  
+
+y
+Min.   : 136345
+1st Qu.: 405239
+Median : 468296
+Mean   : 515937
+3rd Qu.: 633089
+Max.   :1218937
+
+ +

The counts themselves are very large, and so it may not be necessary to model them as count data (this is what I'm trying to investigate: at what point can count data be modelled as continuous?). It seems to be very common practice; what I want to know is the motivation for this.

+ +

Are there any texts that actually show the problem of modelling high count data with Poisson distribution? Perhaps something that shows the factorial in the distribution makes things difficult.

+",2013-10-21 20:30:43.837 +57968,22916.0,1,,,,Joint pdf of a continuous and a discrete rv,,CC BY-SA 3.0,"

Let us consider a manufacturing system. It involves 2 independent components. If one of these components fails then the entire system fails. Let $Y_j$ be distributed $\exp(Q_j)$ where $j=1, 2$.

+ +

If component 1 fails first, then $Y_1$ is observed but $Y_2$ is not ($Y_2$ is censored). If component 2 fails first, then $Y_2$ is observed but $Y_1$ is not ($Y_1$ is censored). Therefore, if the system fails, we can only observe $u = \min(Y_1, Y_2)$ and the binary random variable $V$, which is $1$ if $Y_1 < Y_2$ and $0$ otherwise.

+ +

How can I derive the joint pdf of a continuous variable $u = \min(Y_1, Y_2)$ and a discrete variable $V = 1$ if $Y_1 < Y_2$ and $0$ otherwise?

+",2013-10-21 20:53:00.177 +58160,22200.0,1,58174.0,,,Interpretation of the final cluster centers (cluster analysis),,CC BY-SA 3.0,"

I have a question concerning the interpretation of the final cluster centers. I performed a cluster analysis based on a PCA (the variables are based on a five-point Likert scale). I got the following result for one factor:

+ +
        cluster_1        cluster_2       cluster_3      cluster_4       cluster_5
+          0,31            0,39               -0,82          0,63            0,35
+
+ +

Is this factor also interesting for the description of clusters 1, 2 and 5? Or should I only mention its influence for clusters 3 and 4?

+ +

Thanks a lot!

+",2013-10-24 12:10:37.597 +57969,8869.0,1,,,,A distributional test based on entropy and self-information,,CC BY-SA 3.0,"

Say that I have a real-valued discrete distribution $p(x)$ and $N$ samples, $x_1, \ldots, x_N$, and I want to test whether the samples came from the distribution without making any further assumptions whatsoever. Note that there are very few samples; in the application that motivated me to make this post, we have $N = 5$. Thus Kolmogorov-Smirnov and Chi-squared tests are not expected to have much power.

+ +

I had a simple idea for doing this under the assumption that one can efficiently sample from $p(x)$. Being a bit statistically naive, I'm having difficulty figuring out whether it exists in the literature or not, and hoping someone can point me to the right resource.

+ +

The idea in a nutshell is to compare the self-information of the sample, $\hat{I}_N$, to the distribution of the self-information $I_N$ of $N$ random samples from $p(x)$. Formally, recall that the self-information of a random variable $X$ having distribution $p$ is given by

+ +

$$ I = - \log p(X), $$

+ +

and the self-information of $N$ iid random variables $X_1,\ldots X_N$ each having distribution $p(x)$ is given by

+ +

$$ I_N = -\sum_{k=1}^N \log p(X_k). $$

+ +

The self-information of the sample data we have, $x_1,\ldots,x_N$ is denoted

+ +

$$ \hat{I}_N = -\sum_{k=1}^N \log p(x_k). $$

+ +

As a test statistic one might consider $\hat{C} = C(\hat{I}_N)$, where $C(s) = \mathbb{P}(I_N < s)$ is the cumulative distribution function. If $\hat{C}$ takes a value very close to 0 or 1, then it has an extreme value compared to the distribution of $I_N$, and is unlikely to come from $p(x)$. For concreteness, we could use the standard thresholds for extreme value tests, such as 0.95 and 0.05 for the high and low ends respectively.

+ +

As an example application of this test, say that $p(x)$ is some strange multimodal distribution with multiple humps, and the samples $x_1,\ldots,x_N$ lie in the valleys between the humps. It is not clear that there is any test suitable from the literature for such a problem, but intuitively we can see that the samples are unlikely to have come from $p(x)$ because the values of $p(x_i)$ are so small, or equivalently, the self-information $-\log p(x_i)$ is much larger than typical. In terms of the above discussion, $\hat{C}$ will be very close to 1, and the hypothesis will be rejected.
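A minimal Monte Carlo sketch of this scenario in R (the pmf, the observed samples and the number of simulations are all illustrative):

set.seed(1)
vals <- 1:6
p    <- c(0.05, 0.25, 0.05, 0.25, 0.05, 0.35)       # an illustrative multimodal pmf
N    <- 5
x_obs <- c(1, 3, 5, 1, 3)                           # samples sitting in the low-probability values
I_hat <- -sum(log(p[x_obs]))                        # self-information of the observed sample
I_sim <- replicate(1e5, -sum(log(p[sample(vals, N, replace = TRUE, prob = p)])))
C_hat <- mean(I_sim < I_hat)                        # Monte Carlo estimate of C(I_hat)
C_hat                                               # close to 1 here, so reject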

+ +

The main problem I see with this test is that the distribution of $I_N$ may be difficult to compute. That may be, but in many cases I would imagine that a few million (or billion) Monte Carlo samples would suffice to get a good approximation of the distribution. Analytical/asymptotic approximations could be used to speed things up, get theoretical results, etc. For example, the first moment of $I_N$ is $N$ times the Shannon entropy, and higher moments could be computed without great difficulty.

+",2013-10-21 21:03:05.447 +57970,21119.0,1,,,,normalising constant on exponential of exponential,,CC BY-SA 3.0,"

I have a distribution of the form $\exp(-\exp(-x^2))$. Is this a known family of distributions? Otherwise, how would you find or approximate the normalising constant? The domain is $x \in (-\infty,\infty)$.

+ +

Note: I want to do this without sampling.

+",2013-10-21 21:23:25.110 +57971,3580.0,1,,,,"Approximately sampling $(X, Y)$ when sampling $X$ is easy",,CC BY-SA 3.0,"

Suppose I am interested in sampling many pairs $(\mathbf X, Y)$ from some distribution $f(\mathbf x, y)$ where $\mathbf x \in \mathbb R^p$, $p$ large ; I am interested in both exact and approximate simulations. $f(\mathbf x)$ is easy to sample from, but $f(y \mid \mathbf x)$ is not.

+ +

For motivation, I could do Gibbs sampling if the distributions $f(\mathbf x\mid y)$ and $f(y \mid \mathbf x)$ were both easy to simulate from by initializing $(\mathbf X_0, Y_0)$ and drawing $\mathbf X_t \sim f(x \mid Y_{t-1})$ and $Y_t \sim f(y \mid \mathbf X_t)$. If $f(\mathbf x \mid y)$ is easy to sample from, then I am in really good shape because, worst case scenario, I can replace $f(y \mid \mathbf x)$ with any update that leaves this distribution invariant.

+ +

In my situation, $f(\mathbf x \mid y)$ is difficult to sample from, and substantial work would have to go into constructing a suitable transition kernel given the dimension of $\mathbf x$. However, I can draw from the marginal $f(\mathbf x)$ exactly! It seems like this should buy me something like it does with the Gibbs sampler, however if I do something like the following:

+ +
    +
  1. Draw $\mathbf X_t \sim f(\mathbf x)$;
  2. +
  3. Draw $Y_t \sim K(y \mid \mathbf X_t, Y_{t-1})$ where $K(\cdot \mid \mathbf x, y)$ leaves $f(y \mid \mathbf x)$ invariant,
  4. +
+ +

I believe I will not get the correct stationary distribution.

+ +

If it helps, I might be willing to evaluate the density $f(y \mid \mathbf x)$. I know $f(y \mid \mathbf x)$ up-to a normalizing constant, but I can afford to do one numerical integration to get the constant (I would really prefer not to, though). However, $f(y \mid \mathbf x)$ is expensive to compute.

+ +

One thought I've had is to generate $f(\mathbf x)$ and then do a small number of MH random walk steps, but I would very much prefer getting an answer that will at least converge to the correct thing as $t \to \infty$ (this one will always have some error if I do a small number of steps for each $Y_t$).

+ +

EDIT: Maybe this all seems obvious - just use generic sampling techniques for each $f(y \mid \mathbf x)$ and, in general, I shouldn't be able to do better. I guess what I have in mind is that I should somehow be able to get the MCMC to work across $t$, and not have to do my approximate sampling within $t$. That is, I want my approximation to get better and better as $t \to \infty$; I don't want my approximation to have the same amount of error within $t$.

+ +

In general, though, $f(y \mid \mathbf x)$ might be very different for different values of $\mathbf x$ so I shouldn't hope for a solution that will always work. I just want something that has a good chance of working and addresses the above issues. It is strange to me that $f(\mathbf x)$ would be less useful than $f(\mathbf x \mid y)$.

+",2013-10-21 21:23:55.850 +57972,22587.0,1,58018.0,,,Hoops in a Field,,CC BY-SA 3.0,"

I have a simple question, but I haven't been able to find the answer I'm looking for anywhere. I'm sure there is some simple theorem or distribution that tackles this problem, but it evades me. Perhaps I haven't googled the right terms or looked in the right textbooks!

+ +

My problem is as follows. Suppose a proportion ($p$) of a field is covered in water. The water is randomly dispersed in the field. The field has an area $A$. Now we fly over the field in a helicopter and randomly drop a hoop of size $a$ (where $aN = A$) into the field. The hoop lands at a random spot within the field borders. What is the probability that there will be water inside the hoop?

+ +

My intuition about the problem is that for $a > A(1-p)$, the probability is 1 (because the area excluded by the hoop is smaller than the total area of the water), and as $a \rightarrow 0$, the probability approaches $p$. But what about for values of $a$ in the middle? Can the question be answered?

+ +

I have a sneaking suspicion that the answer is embarrassingly simple.

+ +

Thanks for your help!

+",2013-10-21 21:49:46.393 +57973,14715.0,1,,,,"In R, how do I fit a student-t distribution to a set of empirical data?",,CC BY-SA 3.0,"

In R, I can fit a Probability Density Function to some empirical data using the following code:

+ +
energy <- rnorm(30) * 20
+dens <- density(energy)
+sum(dens$y)*diff(dens$x[1:2])
+hist(energy,probability=TRUE)
+lines(density(energy),col=""red"")
+
+ +

This produces the following graph of the Probability Density Function (PDF):

+ +

+ +

Howevever, I would like to fit a student-t distribution to this data instead. I'm wondering if its possible to do this and if its possible to plot the result like in the diagram above?

+",2013-10-21 22:48:43.223 +57974,21638.0,2,,57949.0,,,,CC BY-SA 3.0,"

Let

+ +

$$ +\hat{\epsilon} = y - Z\hat{\beta} +$$

+ +

the usual estimator of $\beta$ is

+ +

$$ +\hat{\beta} = (Z^{T}Z)^{-1}Z^{T}y +$$

+ +

which gives

+ +

$$ +\begin{array}{rcl}\hat{\epsilon} &=& y - Z(Z^{T}Z)^{-1}Z^{T}y\\&=&\left[I - Z(Z^{T}Z)^{-1}Z^{T}\right]y\end{array} +$$

+ +

and so

+ +

$$ +Cov[\hat{\epsilon}] = Cov\left[\left[I - Z(Z^{T}Z)^{-1}Z^{T}\right]y\right] +$$

+ +

then simply use the property

+ +

$$ +Cov[Ax] = ACov[x]A^{T} +$$

+",2013-10-21 22:52:55.497 +57975,10135.0,2,,57973.0,,,,CC BY-SA 3.0,"

Have a look at function fit.st in package QRM.

+ +
library(QRM)
+fit.st(energy )
+
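An alternative sketch using MASS, assuming its location-scale parameterisation of the t (location m, scale s, df), and overlaying the fitted density; note the df estimate can be quite unstable for small samples:

library(MASS)
fit <- fitdistr(energy, ""t"")                 # estimates m (location), s (scale), df
est <- fit$estimate
hist(energy, probability = TRUE)
curve(dt((x - est[""m""]) / est[""s""], df = est[""df""]) / est[""s""],
      add = TRUE, col = ""blue"")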
+",2013-10-21 23:20:11.733 +57976,21119.0,1,,,,Normalising Constant for exponentiated function,,CC BY-SA 3.0,"

What would the normalising constant be of the following, or at least an approximation? I would like to avoid sampling.

+ +

$$f(\theta)=\exp(-k_1e^{-k_2\theta^2}-\theta^2)\qquad\theta\in(-\infty,\infty), \quad k_1,k_2>0$$

+ +

In the event that its not possible, what would be a valid polynomial lower bound on $\log(f(\theta))$, that would take into account $k_1$

+ +

For anyone who is interested, I end up with this expression because I place a Gaussian prior over (squared exponential) kernel parameters and am trying to use variational Bayes (hence, a lower bound).
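For a numerical value without sampling, plain quadrature works since the integrand is dominated by $\exp(-\theta^2)$; a sketch in R (the values of k1 and k2 are made up):

k1 <- 2; k2 <- 1
f <- function(theta) exp(-k1 * exp(-k2 * theta^2) - theta^2)
Z <- integrate(f, -Inf, Inf)$value   # normalising constant for these k1, k2
Z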

+",2013-10-21 23:52:46.893 +57978,5237.0,2,,57977.0,,,,CC BY-SA 3.0,"

You need to look into the Multivariate-ANOVA (called MANOVA). I'm not sure about good internet-based resources; the Wikipedia page is here, and the entry in StatSoft's online textbook is here, but neither seem great. For a 'real' textbook that is more or less math-free, Tabachnick & Fidell is the most popular. If you use SPSS, SAS, or Stata, the excellent UCLA statistics help site has some quick tutorials; they can be found here.

+",2013-10-22 01:36:51.040 +57979,20179.0,1,,,,Is p-value involved in Neyman-Pearson Testing procedure?,,CC BY-SA 3.0,"

It seems to me that, after fixing a predetermined level of significance, the Neyman-Pearson approach only requires checking whether the observed x lies in the rejection region or not. In this case, how does this relate to the p-value? Or is using p-values an alternative way to judge whether x is rejected or not? After reading some references about NP and Fisher's treatment, I am confused.

+ +

Thank you for any suggestion.

+",2013-10-22 01:44:10.797 +57980,22328.0,1,,,,Growing Self-Organizing Map for mixed-type data,,CC BY-SA 3.0,"

I am trying to write code to build a growing SOM for mixed-type data. I came across a paper Growing Self-Organizing Map with cross insert for mixed-type data (http://www.sciencedirect.com/science/article/pii/S1568494612001731). It was very interesting and handled both categorical and numeric data in a unified way. However, my dataset has variables/attributes that can have multiple values (for example: attribute ""interests"" can have more than one value - movies, sports, and so on...). I got stuck at handling such attributes. Any inputs how to handle attributes with sets of values in a mixed-type dataset? References to material that talk about this problem would be greatly appreciated.

+",2013-10-22 02:20:10.527 +57981,20981.0,2,,57942.0,,,,CC BY-SA 4.0,"

You can use a Wald approximation for confidence intervals for the first set of experiments, and probably the second. For the third, you can't calculate confidence intervals at all, but you can test whether any of the other experiments were significantly different from zero.

+

Wikipedia has a formula for the Wald and other approximations and this paper: Interval Estimation for a Binomial Proportion, describes the various approximations in more detail.

+

You should probably test for differences between the samples in each experiment group (which is what I assume Aniko is implying). If there are big differences between the 15 experiments in the first group, for instance, it would call into question these simple confidence intervals, and you might want to consider an effects model of some sort.

+",2013-10-22 02:24:09.137 +57982,4656.0,2,,57968.0,,,,CC BY-SA 3.0,"

In simplistic terms, there is no such thing as a joint density of a continuous random variable and a discrete random variable because all the probability mass lies +on two straight lines ($v=0$ and $v=1$) and on these lines, the joint +density, being the probability mass per unit area, is infinite. On the other +hand, the line density of the mass on the two lines +is a (univariate) exponential density (measured in probability mass per unit +length). More specifically, the line density on the line $v=0$ is the density +of $U_2$ and the line density on the line $v=1$ is the density of $U_1$.

+",2013-10-22 03:17:53.623 +57983,22923.0,2,,51577.0,,,,CC BY-SA 3.0,"

Standard cross validation (when you randomly split data into k blocks) is the wrong thing to do for time series because in time series you often have serial dependence; the data are not iid.

+ +

For example, draw a random walk path, then cut off one point in the middle, fit a regression on the rest and predict the value for the cut-off point - your prediction error will be small because the cut-off point's value is very similar to its neighbouring values; it's not independent of them, so cross-validation in this case will give you a very optimistic estimate of the prediction error.

+ +

A better way to test a time-series regression is to run a moving 1-step-ahead prediction, refitting the regression at every step.

+ +

But even then, just calculating mean squared error is not good enough if your series aren't stationary - it's not very indicative. You may need to compare the 1-step-ahead prediction error of your model with that of some simple reference model like ""the next step's predicted value is the same as the previous step's realized value"". This will indicate how good your model is out-of-sample compared to a reference.
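A sketch of such a rolling one-step-ahead evaluation in R, assuming a data frame d with columns y and x ordered in time (the function name, the starting window of 30 and the naive reference are all illustrative):

one_step <- function(d, start = 30) {
  n <- nrow(d)
  err_model <- err_naive <- numeric(0)
  for (i in start:(n - 1)) {
    fit  <- lm(y ~ x, data = d[1:i, ])                   # fit only on data up to time i
    pred <- predict(fit, newdata = d[i + 1, , drop = FALSE])
    err_model <- c(err_model, d$y[i + 1] - pred)
    err_naive <- c(err_naive, d$y[i + 1] - d$y[i])       # ""no change"" reference forecast
  }
  c(rmse_model = sqrt(mean(err_model^2)), rmse_naive = sqrt(mean(err_naive^2)))
}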

+",2013-10-22 04:14:44.413 +57984,18268.0,1,,,,Approach to a Poisson question,,CC BY-SA 3.0,"

This is a question from an old test, taken completely out of context:

+ +

""The customers arrive at a shop according to a Poisson distribution with mean $\lambda$ per hour. +Each customer takes $1/k$ hours of service. What is the expected value and the variance of the service time for customers arriving within an hour? What is the probability that the service time exceeds $t$ hours?

+ +

I am not entirely certain how to approach this question. This is supposed to be a simple Poisson-distribution question, with no Poisson processes or queuing.

+ +

Is there a simpler way to look at this question? What is the general approach here?

+ +

Apologies if I am completely misunderstanding this.

+",2013-10-22 05:17:37.467 +57985,6204.0,2,,57968.0,,,,CC BY-SA 3.0,"

What you have here is a mixture model, specifically a mixture of exponentials. If I understand your problem setup correctly, I believe what you're looking for looks something like this:

+ +

$$u \sim f(x) = \begin{cases} f_{Y_1}(x), & V=1 \\ f_{Y_2}(x), & V=0 \end{cases}$$

+ +

or alternatively

+ +

$$u \sim f(x) = \theta f_{Y_1}(x) + (1-\theta)f_{Y_2}(x)$$

+ +

Where $\theta$ is the expected proportion of samples generated by $Y_1$ (or using your formulation, $\theta = E[V]$).

+ +

You can confirm this experimentally. Here's a mixture model with arbitrarily selected parameters r1, r2 and theta:

+ +
n=1e5
+theta=.2
+v=rbinom(n,1,theta)
+r1=5; r2=1
+sample=v*rexp(n,r1) + (1-v)*rexp(n,r2)
+
+f=function(x){theta*dexp(x,r1) + (1-theta)*dexp(x,r2)}
+
+plot(density(sample), xlim=c(0,6))
+xv=seq(from=0,to=6, length.out=1e4)
+lines(xv,f(xv), col='red')
+
+ +

+",2013-10-22 05:27:22.733 +57986,5045.0,1,,,,Constructing the OLS standard error by hand to avoid regression,,CC BY-SA 3.0,"

I am having trouble deriving the standard error of a simple regression estimator by hand. Stata code and output for a toy example using the cars dataset is below.

+ +

The basic idea is that I have a binary treatment that interacts with a binary covariate. All observations are independent. I'm probably overlooking something simple.

+ +

My outcome is a weighted average of the effects in the two groups: \begin{equation} TE= \frac{N_1 \cdot \delta_1 + N_2 \cdot \delta_2}{N_1+N_2}=\frac{N_1 \cdot (\bar X_{T_1}-\bar X_{C_1}) + N_2 \cdot (\bar X_{T_2}-\bar X_{C_2})}{N_1+N_2}, \end{equation} where $N_i=T_i+C_i$ is the number of people in group $i$ and $\delta_i$ is the effect of treatment in group $i$. I will estimate the $\delta$s with the group-specific difference in sample means. My manual calculation agrees with the regression output below.

+ +

Using the standard variance formula for a weighted sum of independent random variables, I get +\begin{equation} +Var[TE]= \left(\frac{N_1}{N_1+N_2}\right)^2 \cdot \left[ \frac{\sigma^2_{T_1}}{N_{T_1}}+\frac{\sigma^2_{C_1}}{N_{C_1}}\right] ++\left(\frac{N_2}{N_1+N_2}\right)^2 \cdot\left[ \frac{\sigma^2_{T_2}}{N_{T_2}}+\frac{\sigma^2_{C_2}}{N_{C_2}}\right], +\end{equation}

+ +

This is the quantity that I am having trouble scaling to get the standard error. My intuition is that the delta method used by Stata is using the covariance terms, whereas I am dropping them by assumption. Is there a way to get the OLS standard error without using OLS? I am trying to avoid running a regression.

+ +

Here's my code and output:

+ +
. /* Make Fake Data */
+. sysuse auto, clear
+(1978 Automobile Data)
+
+. rename foreign treat
+
+. label define treat 0 ""C"" 1 ""T""
+
+. lab val treat treat
+
+. gen group=cond(weight>3000,1,0)
+
+. sum group, meanonly
+
+. local mean = r(mean)
+
+. /* Get Summary Stats */
+. table treat group, c(mean price sd price freq) format(%9.3fc) // use 3 SDs to match the margins output
+
+--------------------------------
+          |        group        
+ Car type |         0          1
+----------+---------------------
+        C | 4,183.800  6,838.081
+          |   743.072  3,359.359
+          |    15.000     37.000
+          | 
+        T | 5,773.900  12492.500
+          | 1,803.450    703.571
+          |    20.000      2.000
+--------------------------------
+
+. /* Regression Treatment Effects */
+. reg price ib(0).treat##ib(0).group
+
+      Source |       SS       df       MS              Number of obs =      74
+-------------+------------------------------           F(  3,    70) =    7.78
+       Model |   158773405     3  52924468.2           Prob > F      =  0.0001
+    Residual |   476291991    70  6804171.31           R-squared     =  0.2500
+-------------+------------------------------           Adj R-squared =  0.2179
+       Total |   635065396    73  8699525.97           Root MSE      =  2608.5
+
+------------------------------------------------------------------------------
+       price |      Coef.   Std. Err.      t    P>|t|     [95% Conf. Interval]
+-------------+----------------------------------------------------------------
+       treat |
+          T  |     1590.1   890.9658     1.78   0.079    -186.8752    3367.075
+     1.group |   2654.281   798.4409     3.32   0.001     1061.841    4246.721
+             |
+ treat#group |
+        T#1  |   4064.319   2092.798     1.94   0.056    -109.6345    8238.272
+             |
+       _cons |     4183.8   673.5068     6.21   0.000     2840.533    5527.067
+------------------------------------------------------------------------------
+
+. lincom 1.treat + 1.treat#1.group*`mean'
+
+ ( 1)  1.treat + .527027*1.treat#1.group = 0
+
+------------------------------------------------------------------------------
+       price |      Coef.   Std. Err.      t    P>|t|     [95% Conf. Interval]
+-------------+----------------------------------------------------------------
+         (1) |   3732.106   1083.335     3.45   0.001     1571.463    5892.748
+------------------------------------------------------------------------------
+
+. margins, dydx(treat)
+
+Average marginal effects                          Number of obs   =         74
+Model VCE    : OLS
+
+Expression   : Linear prediction, predict()
+dy/dx w.r.t. : 1.treat
+
+------------------------------------------------------------------------------
+             |            Delta-method
+             |      dy/dx   Std. Err.      t    P>|t|     [95% Conf. Interval]
+-------------+----------------------------------------------------------------
+       treat |
+          T  |   3732.106   1083.335     3.45   0.001     1571.463    5892.748
+------------------------------------------------------------------------------
+Note: dy/dx for factor levels is the discrete change from the base level.
+
+. /* By Hand */
+. di ""E[TE] =  ""((15+20)*(5773.9 - 4183.8)  + (37+2)*(12492.5 - 6838.081))/(15 + 37 + 20 + 2)
+E[TE] =  3732.106
+
+. di ""SE[TE] = ""sqrt( (15+20)^2*((743.072^2)/15 + (1803.45^2)/20 )/(15+37+20+2)^2 + (37+2)^2*((3359.359^2)/37+(703.571^2)/2)/(15+37+20+2)^2)/sqrt(4) 
+SE[TE] = 222.53007
+
+ +

Here's just the code in case someone wants to cut n' paste:

+ +
/* Make Fake Data */
+sysuse auto, clear
+rename foreign treat
+label define treat 0 ""C"" 1 ""T""
+lab val treat treat
+
+gen group=cond(weight>3000,1,0)
+sum group, meanonly
+local mean = r(mean)
+
+/* Get Summary Stats */
+table treat group, c(mean price sd price freq) format(%9.3fc) // use 3 SDs to match the margins output
+
+/* Regression Treatment Effects */
+reg price ib(0).treat##ib(0).group
+lincom 1.treat + 1.treat#1.group*`mean'
+margins, dydx(treat)
+
+/* By Hand */
+di ""E[TE] =  ""((15+20)*(5773.9 - 4183.8)  + (37+2)*(12492.5 - 6838.081))/(15 + 37 + 20 + 2)
+di ""SE[TE] = ""sqrt( (15+20)^2*((743.072^2)/15 + (1803.45^2)/20 )/(15+37+20+2)^2 + (37+2)^2*((3359.359^2)/37+(703.571^2)/2)/(15+37+20+2)^2)/sqrt(4) 
+
+",2013-10-22 05:30:39.013 +57987,1717.0,1,,,,Forecasting in ARCH(1) models,,CC BY-SA 3.0,"

I read this definition of an ARCH(1) model:

+ +

$$r_{t}=\sigma_{t|t-1}\epsilon_{t}$$ +$$\sigma^{2}_{t|t-1} = \omega + \alpha r_{t-1}^{2}$$

+ +

However, when it comes to forecasting the h-step-ahead variance, I don't understand why it is defined in this way. This is the h-step-ahead conditional variance:

+ +

$$\sigma^{2}_{t+h|t}=E(r^{2}_{t+h}|r_{t},r_{t-1},...)$$

+ +

How can I see that the equation above is correct?

+ +

Another question. I'd like to know what is going on in the first steps of the derivation of the recursive conditional variance:

+ +

$$\sigma^{2}_{t+h|t} = E(r^{2}_{t+h}|r_{t},r_{t-1},...) = E[E(\sigma^{2}_{t+h|t+h-1}\epsilon^{2}_{t+h}|r_{t+h-1},r_{t+h-2},...)|r_{t},r_{t-1},...] = \omega + \alpha \sigma^{2}_{t+h-1|t}$$

+ +

I skipped the final steps because they are easier to follow. I guess this is an instance of $E[E(X|Y)]=E(X)$ but I don't know why those conditional variables are chosen in this case.

+ +

UPDATE

+ +

A couple of additional questions.

+ +
    +
  1. Based on the expression for $E(r^{2}_{t+h}|r_{t}, r_{t-1},...)$, does it mean that, for instance, $E(r^{2}_{t+h+1}|r_{t},r_{t-1}) = \sigma^{2}_{t+h+1|t}$, but since $\sigma^{2}_{t+h+1|t} = \omega + \alpha r_{t}$, any other expectation for additional steps $t+h+2$, $t+h+3$, etc. is going to produce the same result?

  2. When we take the expectation $E(r^{2}_{t+h}|r_{t}, r_{t-1},...)$, why don't we modify the conditional subindex from $t$ to $t+h-1$ in $\sigma^{2}_{t+h|t}$, yet we do change it when we write:

+ +

$$E(r^{2}_{t+h}|r_{t},r_{t-1},...) = E[E(\sigma^{2}_{t+h|t+h-1}\epsilon^{2}_{t+h}|r_{t+h-1},r_{t+h-2},...)|r_{t},r_{t-1},...]$$

+",2013-10-22 06:12:42.927 +57988,21119.0,1,,,,Quadratic lower bound on Gaussian,,CC BY-SA 3.0,"

Suppose I have a multivariate Gaussian such that $p(y)=\mathcal{N}(\mathbf{0},\Sigma)$. What would be a quadratic lower bound $f(y)$ on $p(y)$?

+ +

i.e. for what values of $k$ and $\Omega$ will $f(y)=-y^T\Omega y+k\le p(y)$, s.t. $p(0)=f(0)=\frac{1}{|2\pi\Sigma|^{1/2}}$?

+ +

As pointed out below $k=\frac{1}{|2\pi\Sigma|^{1/2}}$

+",2013-10-22 06:28:52.470 +57989,22423.0,2,,57972.0,,,,CC BY-SA 3.0,"

A simple answer to your question would be the following (though not as beautiful as you might have hoped):

+ +

If it is the case of randomly scattered, infinitely small water droplets in a continuous 2-D field, then the probability of water in the hoop would be $p_w \approx 1$ for all hoop sizes.

+ +

Why? We can imagine the hoop as consisting of $N$ equally sized 'pockets'. The probability of water being found in the hoop is $P(\text{water found in any of the } N \text{ pockets}) = 1-(P(\text{water not found in a single pocket}))^N$

+ +

As long as the size of a pocket is $\ne 0$, $P(\text{water not found in a single pocket}) =1-p \ne 1$, and thus $(P(\text{water not found in a single pocket}))^N \approx 0$ as $N \to \infty$.

+ +

**

+ +

If, though, we have a more specific and realistic definition of water:

+ +

This can be an example:

+ +
    +
  1. The body of water can be made up of a finite number of water patches.
  2. Each patch of water has to be closed and bounded.
+ +

Even with this definition, the answer relies very much on the form of the water patches, as noted by others here. Zig-zag lines of water would mean $p_w =1$. Big circular pools of water would yield a different probability (and still no easy answer). Because of the infinite number of possible arrangements of the water, and the complexity of the solution for each, this question is probably impossible to answer.

+ +

Unless we know the exact water arrangement already, I believe we wouldn't be able to quantify the chance of having water in the hoop.

+",2013-10-22 06:34:22.423 +57990,21918.0,1,58014.0,,,Comparing maximum likelihood estimation (MLE) and Bayes' Theorem,,CC BY-SA 3.0,"

In Bayes' theorem, $$p(y|x) = \frac{p(x|y)p(y)}{p(x)},$$ and from the book I'm reading, $p(x|y)$ is called the likelihood, but I assume it's just the conditional probability of $x$ given $y$, right?

+ +

The maximum likelihood estimation tries to maximize $p(x|y)$, right? If so, I'm badly confused, because $x,y$ are both random variables, right? To maximize $p(x|y)$ is just to find out the $\hat y$? One more problem, if these 2 random variables are independent, then $p(x|y)$ is just $p(x)$, right? Then maximizing $p(x|y)$ is to maximize $p(x)$.

+ +

Or maybe, $p(x|y)$ is a function of some parameters $\theta$, that is $p(x|y; \theta)$, and MLE tries to find the $\theta$ which can maximize $p(x|y)$? Or even that $y$ is actually the parameters of the model, not random variable, maximizing the likelihood is to find the $\hat y$?

+ +

UPDATE

+ +

I'm a novice in machine learning, and this problem is a confusion from the stuff I read from a machine learning tutorial. Here it is, given an observed dataset $\{x_1,x_2,...,x_n\}$, the target values are $\{y_1,y_2,...,y_n\}$, and I try to fit a model over this dataset, so I assume that, given $x$, $y$ has a form of distribution named $W$ parameterized by $\theta$, that is $p(y|x; \theta)$, and I assume this is the posterior probability, right?

+ +

Now to estimate the value of $\theta$, I use MLE. OK, here comes my problem, I think the likelihood is $p(x|y;\theta)$, right? Maximizing the likelihood means I should pick the right $\theta$ and $y$?

+ +

If my understanding of likelihood is wrong, please show me the right way.

+",2013-10-22 06:38:49.637 +57991,6204.0,2,,57990.0,,,,CC BY-SA 3.0,"

Normally $p(x|y)$ is a function of the parameter $y$. Consider the following reformulation of Bayes' theorem:

+ +

$$p(\theta|x) = \frac{p(x|\theta)p(\theta)}{p(x)}$$

+ +

Or even more explicitly (with regards to the notion of the likelihood):

+ +

$$p(\theta|x) = \frac{L(\theta;x)p(\theta)}{p(x)}$$

+ +

For a concrete example, consider the model

+ +

$$ +X|\theta \sim Binomial(\theta) \\ +\theta \sim Beta(\alpha,\beta) +$$
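
+ +

To make the distinction concrete, here is a small R sketch (added for illustration, not part of the original answer) that plots the likelihood $L(\theta;x)$ for a single binomial observation together with a Beta prior and the resulting conjugate posterior; the data values and prior parameters are arbitrary.

+ +

n <- 20; x <- 6                      # hypothetical binomial data: 6 successes in 20 trials
+a <- 2; b <- 2                       # hypothetical Beta prior parameters
+
+theta <- seq(0, 1, length.out = 500)
+lik   <- dbinom(x, n, theta)                   # L(theta; x), viewed as a function of theta
+prior <- dbeta(theta, a, b)                    # p(theta)
+post  <- dbeta(theta, a + x, b + n - x)        # conjugate posterior p(theta | x)
+
+plot(theta, post, type = 'l', col = 'blue', ylab = 'density / scaled likelihood')
+lines(theta, prior, col = 'grey')
+lines(theta, lik / max(lik) * max(post), col = 'red', lty = 2)  # likelihood rescaled for display
+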

+",2013-10-22 06:46:00.550 +57992,22925.0,1,,,,How to generate random data that conforms to a given mean and upper / lower endpoints?,,CC BY-SA 3.0,"

I'm in need of generating a set of random numbers (between 100 and 300 would suffice), which conform to a given mean, and also fall within certain upper and lower endpoints. Using the statistics software that I have (Minitab), I can find options to

+ +
    +
  1. generate random data with a given mean but no endpoints, and
  2. generate random data with endpoints and a given mode (not mean).
+ +

I think the problem I'm having is that it doesn't seem to be a normal distribution; for instance, one of the data sets should give a mean of 2.90, but range between 0 and 30.66.

+ +

It is for a paper I'm writing, wherein the concern is with being able to run statistical analyses on the data in question; in other words, once I generate the data set, I have to do ANOVA and other tests on it. (I'm fine with doing all that; I just can't figure out how to generate the data.)

+",2013-10-22 06:51:41.413 +58202,22480.0,1,58233.0,,,How can I get the prior of a random variable that's a function of a random variable in Bayesian data analysis?,,CC BY-SA 3.0,"

I have a model which includes the following priors:

+ +

$\lambda_C \rightarrow \dfrac{1}{\sigma_C^2}$

+ +

and

+ +

$\sigma \sim \text{uniform}(0,500)$

+ +

Where $\sigma$ is the standard deviation and $\lambda_C$ is the precision of a normal distribution.

+ +

Being a beginner, I have problems conceptualizing the prior of $\lambda_C$. How would I even calculate the single values for the prior of $\lambda_C$?

+",2013-10-25 08:44:45.667 +57994,5045.0,5,,,,,,CC BY-SA 3.0,"

This tag is ambiguous. Please do not use it as is, and use the specific tags generalized-moments for the generalized method of moments (econometrics), gaussian-mixture for Gaussian mixture models (machine learning), or growth-mixture-model for growth mixture models (psychometrics).

+ +

One entry is retained with gmm so that the tag itself does not disappear: When should one consider using GMM?. Please do not remove the tag from that entry.

+",2013-10-22 06:57:19.783 +57993,5045.0,4,,,,,,CC BY-SA 3.0,"This tag is ambiguous. Please do not use it as is, and use the specific tags `generalized-moments` for econometric GMM, `gaussian-mixture` for machine learning GMM, and `growth-mixture-model` for psychometric GMM.",2013-10-22 06:57:19.783 +57995,594.0,2,,57992.0,,,,CC BY-SA 3.0,"

If you want the distribution on the range min to max and with a given population mean:

+ +

One common solution when trying to generate a distribution with specified mean and endpoints is to use a location-scale family beta distribution.

+ +

The usual beta is on the range 0-1 and has two parameters, $\alpha$ and $\beta$. The mean of that distribution is $\frac{\alpha}{\alpha+\beta}$. If you multiply by $\text{max}-\text{min}$ and add $\text{min}$, you have something between $\text{max}$ and $\text{min}$ with mean $\text{min}+\frac{\alpha}{\alpha+\beta}(\text{max}-\text{min})$.

+ +

This suggests you should take

+ +

$\beta/\alpha = \frac{\text{max} - \text{mean}}{\text{mean}-\text{min}}$

+ +

Or

+ +

$\alpha/\beta = \frac{\text{mean} - \text{min}}{\text{max}-\text{mean}}$

+ +

This leaves you with a free parameter (you can choose $\alpha$ or $\beta$ freely and the other is determined). You could choose the smaller of them to be ""1"". Or you could choose it to satisfy some other condition, if you have one (such as a specified standard deviation). Larger $\alpha$ and $\beta$ will look more 'bell-shaped'.

+ +

In Minitab, Calc $\to$ Random Data $\to$ Beta.
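
+ +

In R, the same recipe looks something like the sketch below (added here for illustration); it uses the question's targets of a mean of 2.90 on the range 0 to 30.66, and an arbitrarily chosen $\alpha$.

+ +

min_v <- 0; max_v <- 30.66; mean_v <- 2.90   # targets from the question
+alpha <- 1                                   # free parameter; larger values look more bell-shaped
+beta  <- alpha * (max_v - mean_v) / (mean_v - min_v)
+
+x <- min_v + (max_v - min_v) * rbeta(200, alpha, beta)   # 200 draws on [min, max]
+mean(x)                                                  # close to 2.90 on average
+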

+ +

--

+ +

Alternatively, you could generate from a triangular distribution rather than a beta distribution. (Or any number of other choices!)

+ +

The triangular distribution is usually defined in terms of its min, max and mode, and its mean is the average of the min, max and mode. The triangular distribution is reasonably easy to generate from even if you don't have specialized routines for it. To get the mode from a given mean, use mode = 3$\,\times\,$mean - min - max. However, the mean is restricted to lie in the middle third of the range (which is easy to see from the fact that the mean is the average of the mode and the two endpoints).

+ +

Below is a plot of the density functions for a beta (specifically, $\text{beta}(2,3)$) and a triangular distribution, both with mean 40% of the way between the min and the max:

+ +

+ +
+ +

One the other hand, if you want the sample to have a smallest value of min and a largest value of max and a given sample mean, that's quite a different exercise. There are easy ways to do that, though some of them may look a bit odd.

+ +

One simple method is as follows. Let $p=\frac{\text{mean} - \text{min}}{\text{max}-\text{min}}$. Place $b=\lfloor p(n-1)\rfloor$ points at 1, and $n-1-b$ points at 0, giving an average of $b/(n-1)$ and a sum of $b$. To get the right average, we need the sum to be $np$, so we place the remaining point at $np-b$, and then multiply all the observations by ${\text{max}-\text{min}}$ and add $\text{min}$.

+ +

e.g. consider $n$ = 12, min = 10, max = 60, mean = 30, so $p$ = 0.4, and $b$ = 4. With seven (12-1-4) points at 0 and four at 1, the sum is 4. If we place the remaining point at 12$\,\times\,$0.4$\,$-$\,$4 = 0.8, the average is 0.4 ($p$). We then multiply all the values by ${\text{max}-\text{min}}$ (50) and add $\text{min}$ (10) giving a mean of 30. Then randomly sample the whole set of $n$ without replacement, (or equivalently, just randomly order them). You now have a random sample with the required mean and extremes, albeit one from a discrete distribution.
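
+ +

A short R sketch of this construction (added for illustration), using the same numbers as the example above:

+ +

n <- 12; min_v <- 10; max_v <- 60; mean_v <- 30
+
+p <- (mean_v - min_v) / (max_v - min_v)             # 0.4
+b <- floor(p * (n - 1))                             # 4 points at 1
+x01 <- c(rep(1, b), rep(0, n - 1 - b), n * p - b)   # remaining point at n*p - b = 0.8
+x <- sample(min_v + (max_v - min_v) * x01)          # rescale and randomly order
+
+range(x); mean(x)                                   # smallest 10, largest 60, mean exactly 30
+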

+",2013-10-22 07:44:24.763 +57996,22899.0,1,,,,Average partial effects,,CC BY-SA 3.0,"

I need to explain what average partial effects (APEs) are to a very general non-statistical audience (i.e. the APEs derived from a probit model). I have tried to define APEs using layman's terms but I find it hard, can somebody please help me with this?

+",2013-10-22 08:45:29.923 +57997,22928.0,1,58003.0,,,Large sample size and partial F-test for multiple regression makes adding a variable always significant,,CC BY-SA 3.0,"

I am developing a two-variable multiple regression model, +i.e. +$$ Y = b0 + b1 * X1 + b2 * X2 $$

+ +

I am using the following formula for the partial F-test from http://luna.cas.usf.edu/~mbrannic/files/regression/Reg2IV.html under the section Testing Incremental R2. The F-statistic calculated is supposed to tell me whether adding the second variable is significant (more details in that link).

+ +

$$ F= {\frac{(R_L^2 - R_S^2)/(k_L-k_s)}{(1-R_L^2)/(N-k_L-1)}}$$

+ +

My first variable has a strong correlation: +regression_coeff_string: b1 = 0.664, b0 = 0.035 +R2_val: 0.564

+ +

My second variable has a weak correlation: +regression_coeff_string: b1 = -25.026, b0 = 0.469, +R2_val: 0.027

+ +

Adding my seond variable only marginally improves the R2 value +regression_coeff_string: b0 = 0.0559, b1 = 0.6633, b2 = -5.2222, +R2_val: 0.565

+ +

However, I have a sample size of 2949. +With $$ R_L^2 = 0.565, R_S^2 = 0.564,$$ +$k_L$ (the number of predictors in the full set) being 2, and +$k_S$ (the number of predictors in the subset) being 1, +$$ F= {\frac{(0.565 - 0.564)/(2-1)}{(1-0.565)/(2949-2-1)}} = 6.77$$

+ +

With the critical value of F(1, 2946) at the 0.05 level being 4.182, the result is significant. But it seems that this is only because the sample size is large. If I sort the second variable X2 in ascending order in Excel and leave the order of the Y and X1 variables unchanged, I would still get a significant F score.
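
+ +

For reference, the same partial F-test can be computed directly in R from the quantities above (a sketch added here for illustration); the equivalent nested-model comparison via anova() on two lm fits gives the same statistic.

+ +

R2_L <- 0.565; R2_S <- 0.564          # R^2 of the larger and smaller models
+kL <- 2; kS <- 1; N <- 2949
+
+Fstat <- ((R2_L - R2_S) / (kL - kS)) / ((1 - R2_L) / (N - kL - 1))
+pval  <- pf(Fstat, kL - kS, N - kL - 1, lower.tail = FALSE)
+c(F = Fstat, p = pval)                # F is about 6.77, p is about 0.009
+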

+ +

Question: How can I do a fair incremental R2 test for the addition of a new variable in multiple regression when the sample size becomes large?

+ +

Simply looking at the R2 of each X variable individually does not take into account that that they may be cross-correlated, that is why I turned to the incremental R2 test to see how the overall R2 improves relative to adding a new variable.

+ +

EDIT1:

+ +

The context of my example is predicting solar radiation. The first variable is a solar radiation variable from NWP (numerical weather prediction) software (ie. high correlation). The other variables are other NWP output variables and we are trying to improve our prediction.

+",2013-10-22 08:58:05.767 +57998,12282.0,2,,57990.0,,,,CC BY-SA 3.0,"
    +
  • ""...$p(x|y)$ is called the likelihood...""
+ +

$p(x|y)$ is the likelihood of y given x. Saying what it is the likelihood of is important. And yes, it is just the conditional probability of $x$ given $y$.

+ +
    +
  • ""...if these 2 random variables are independent, then $p(x|y)$ is just $p(x)$, right? Then maximizing $p(x|y)$ is to maximize $p(x)$...""
+ +

If they are independent, i.e. $p(x|y) = p(x)$, then $p(x)$ is constant with respect to $y$. Be careful here, as you don't specify what you are maximising with respect to - from what you wrote earlier, I would assume you are maximising with respect to $y$.

+ +
    +
  • ...Or maybe, $p(x|y)$ is a function of some parameters $\theta$, that is $p(x|y;\theta)$, and MLE tries to find the $\theta$ which can maximize $p(x|y)$? Or even that y is actually the parameters of the model, not random variable, maximizing the likelihood is to find the $\hat{y}$?...
+ +

Introducing $\theta$ makes this an entirely new problem. In general, the answer to most of this question here seems to be 'it depends'. We could denote parameters as $y$ if we wanted, and maximise with respect to them. Equally, we could have a situation where we maximise $p(x|y;\theta)$ with respect to parameters $\theta$ if that was a sensible way of approaching the problem at hand.

+",2013-10-22 09:14:44.040 +57999,22930.0,1,,,,"How to interpret Hidden Markov Model parameters (transition matrix, emission matrix, and pi values)?",,CC BY-SA 3.0,"

I am working on channel modeling for cognitive radio using HMMs. I've written a MATLAB program implementing the forward, backward and Baum-Welch algorithms for multiple sequences. After giving it some random input and running the program for 1000 to 4000 iterations I get some results, but I don't understand how to interpret them.

+ +

I would be glad if anyone could talk about the input matrix / log-likelihood / transition matrix / emission matrix.

+",2013-10-22 09:58:26.890 +58000,22899.0,1,,,,Asymptotic Theory in Economics,,CC BY-SA 3.0,"

I am interested in deepening my understanding of asymptotic theory. My current knowledge is that of a typical PhD student (from a decent university), say at the level of Greene's textbook. Are there any good books that you would recommend?

+ +

Much appreciated.

+",2013-10-22 10:29:40.320 +58001,20470.0,2,,57999.0,,,,CC BY-SA 3.0,"

For MatLab, I would recommend using the HMM toolbox. It allows you to do pretty much all you would need from an HMM model.

+ +

If you feel strongly about using your own code, before running on a real dataset, you should probably validate your Baum Welch implementation by checking whether it actually returns sensible results. You can use an experimental setup similar to below. Please note that I am using the HMM toolbox functions, but it is more the order of steps that I am trying to draw your attention to.

+ +
M = 3; % number of observation levels
+N  = 2; % number of states
+
+% A - ""true"" parameters (of your validation model)
+prior0 = normalise(rand(N ,1));
+transmat0 = mk_stochastic(rand(N ,N ));
+obsmat0 = mk_stochastic(rand(N ,M));
+
+% B- using the real parameters in step A, simulate a sequence of states and corresponding observations
+n_seq = 5;  % you want to generate 5 multiple sequences
+seq_len= 100; % you want each sequence to be of length 100
+[obs_seq, state_seq] = dhmm_sample(prior0, transmat0, obsmat0, n_seq, seq_len);  % square brackets needed for multiple outputs
+
+% C- like you say you do, generate some initial guesses of the real parameters (from step A) that you want to learn
+prior1 = normalise(rand(N ,1));
+transmat1 = mk_stochastic(rand(N ,N ));
+obsmat1 = mk_stochastic(rand(N ,M));
+
+% D - train based on your guesstimates using EM (Baum-Welch)
+[LL, prior2, transmat2, obsmat2] = dhmm_em(obs_seq, prior1, transmat1, obsmat1, 'max_iter', 5);  % train on the simulated observations
+
+% E- Finally, compare whether your trained values in step D are actually similar to the real values (that generated your data) from Step A. 
+% The simplest way to do that is to print them side by side or look at the absolute differences...
+obsmat0
+obsmat2
+
+transmat0
+transmat2
+
+ +

The interpretation of the transition matrix, observation (emission) matrix and log-likelihoods is a broad topic. I assume you already have a fair understanding since you could implement your own Baum-Welch. The easiest and best read on HMMs (in my opinion) is Rabiner's paper. I would recommend having a look at it if you haven't yet.

+",2013-10-22 10:33:21.310 +58002,10135.0,2,,57987.0,,,,CC BY-SA 3.0,"

$E(r_t^2|r_{t-j},j=1,2,...)=E(\sigma^2_{t|t-1}\epsilon_t^2|r_{t-j},j=1,2,...)$. Now given $r_{t-j},j=1,2,...$, you know the values of $\sigma_{t|t-1}$ because $\sigma^2_{t|t-1}=\omega+\alpha r_{t-1}^2$. So you can take it out of expectation to have: $E(r_t^2|r_{t-j},j=1,2,...)=\sigma^2_{t|t-1}E(\epsilon_t^2|r_{t-j},j=1,2,...)$. Also $\epsilon_t$ is independent of $r_{t-j},j=1,2,...$, so (roughly speaking) any function of that is also independent of $r_{t-j},j=1,2,...$. In particular, $f(x)=x^2$. So, $\epsilon_t^2$ is independent of $r_{t-j},j=1,2,...$. Hence, $E(r_t^2|r_{t-j},j=1,2,...)=\sigma^2_{t|t-1}E(\epsilon_t^2)$. Finally note that $\epsilon_t\sim N(0,1)$. So $Var(\epsilon_t)=E(\epsilon_t^2)=1$. Therefore, $E(r_t^2|r_{t-j},j=1,2,...)=\sigma^2_{t|t-1}.$ Now we can change $t$ to $t+h$ to get $h$-step ahead forecasts, i.e. $E(r_{t+h}^2|r_{t-j},j=1,2,...)=\sigma^2_{t+h|t-1}.$ I think you can now answer your 2nd question. Hint: you condition on previous returns $r_t$ that makes your expectation easy to compute.
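
+ +

For concreteness, a minimal R sketch (added here, with arbitrary parameter values) of the resulting recursion $\sigma^{2}_{t+h|t} = \omega + \alpha \sigma^{2}_{t+h-1|t}$, starting from the one-step-ahead variance $\omega + \alpha r_t^2$:

+ +

omega <- 0.1; alpha <- 0.4        # arbitrary ARCH(1) parameters
+r_t <- 1.5                        # last observed return
+H <- 10
+
+sigma2 <- numeric(H)
+sigma2[1] <- omega + alpha * r_t^2              # sigma^2_{t+1|t}
+for (h in 2:H) {
+  sigma2[h] <- omega + alpha * sigma2[h - 1]    # sigma^2_{t+h|t} = omega + alpha * sigma^2_{t+h-1|t}
+}
+sigma2                                          # approaches omega / (1 - alpha) as h grows
+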

+",2013-10-22 11:06:12.883 +58003,503.0,2,,57997.0,,,,CC BY-SA 3.0,"

The test you are doing is ""fair"", it's just that p-values don't answer the question you want to ask (they often don't). The way to proceed is to figure out what change in effect size is substantively meaningful and base decisions on that.

+ +

This is entirely dependent on your field and, indeed, on your question. To illustrate: If 1 in 1000 children misunderstand a question on a test, that is a very small proportion, and won't affect the validity of the test much. But if 1 in 1000 airplane trips end in a crash, that is a very large proportion and would end aviation.

+ +

Is there any context in which a change of $R^2$ from 0.564 to 0.565 is important? I can't think of one, offhand, but I haven't had all my coffee :-). Perhaps some variation on the plane crash scenario.

+",2013-10-22 11:15:23.507 +58004,1406.0,2,,58000.0,,,,CC BY-SA 3.0,"

Since you mention Greene's book, I assume you are interested in more in-depth understanding of asymptotic statistics. Then I can recommend A. van der Vaart's ""Asymptotic statistics"" and H. White's ""Asymptotic theory for econometricians"". Also J. Wooldridge's ""Econometric Analysis of Cross Section and Panel Data"" has nice chapters on asymptotic theory.

+",2013-10-22 11:16:17.970 +58005,11262.0,1,,,,Questions about posterior linear regression,,CC BY-SA 3.0,"

Just 2 simple questions I'm struggling with. Hope you can help.

+ +

Suppose that the model $y=X\beta +\epsilon$ with $\epsilon \sim \text{Normal}(0,\sigma^2I_n)$ has the prior $\beta \sim \text{Normal}( \beta_0, k(X^TX)^{-1})$.

+ +

I want two things:

+ +

1.) I want to show that for the density of $\beta$ that

+ +

$$p(\beta) \propto \exp \{-\frac{1}{2} k^{-1} (\beta^TX^TX\beta-2\beta^TX^TX\beta_0) \} $$

+ +

2.) I want to show that +$$p(\beta | y,X,\sigma^2) \propto p(y|X,\beta, \sigma^2)\cdot p(\beta) $$

+ +

It would be very nice if somebody could say anything about this. I would also like to know why $$p(\beta|y,X,\sigma^2) \propto \exp\{-\frac{1}{2}(k^{-1}+\sigma^{-2})\beta^TX^TX\beta -2\beta^TX^TX\beta_0 \}$$

+",2013-10-22 11:23:40.697 +58006,9095.0,1,58009.0,,,Finding mean and SD of 2 parts of a whole,,CC BY-SA 3.0,"

I am putting together a review/meta-analysis of body composition in children. The data I will analyze consists of measures of fat-mass (FM) and fat-free-mass (FFM), which when summed equal total mass. While most papers report the means and SDs for FM and FFM, which is what I want, every so often I get a paper that reports the mean and SD of total mass, and a mean and SD of %FM (i.e. FM/(FM+FFM)).

+ +

My question: Can I calculate the mean and SD of FM and FFM, given that I have the mean and SD of total mass, and the mean and SD of FM/(FM+FFM)?

+",2013-10-22 11:36:50.420 +58007,10135.0,2,,58000.0,,,,CC BY-SA 3.0,"

""Asymptotic theory for econometricians"" by Halbert White. ""Asymptotic Theory of Statistics and Probability"", by Anirban DasGupta.

+",2013-10-22 11:38:09.667 +58008,22821.0,1,58016.0,,,What does the inverse of covariance matrix say about data? (Intuitively),,CC BY-SA 3.0,"

I'm curious about the nature of $\Sigma^{-1}$. Can anybody tell something intuitive about ""What does $\Sigma^{-1}$ say about data?""

+ +

Edit:

+ +

Thanks for replies

+ +

After taking some great courses, I'd like to add some points:

+ +
    +
  1. It is a measure of information, i.e., $x^T\Sigma^{-1}x$ is the amount of information along the direction $x$.
  2. Duality: Since $\Sigma$ is positive definite, so is $\Sigma^{-1}$, so they both induce dot-product norms; more precisely, they are dual norms of each other, so we can derive the Fenchel dual of the regularized least squares problem and do maximization w.r.t. the dual problem. We can choose either of them, depending on their conditioning.
  3. Hilbert space: The columns (and rows) of $\Sigma^{-1}$ and $\Sigma$ span the same space. So there is no advantage (other than when one of these matrices is ill-conditioned) to a representation with $\Sigma^{-1}$ rather than $\Sigma$.
  4. Bayesian statistics: The norm of $\Sigma^{-1}$ plays an important role in Bayesian statistics, i.e. it determines how much information we have in the prior; e.g., when the covariance of the prior density is such that $\|\Sigma^{-1}\|\rightarrow 0 $ we have a non-informative (or possibly Jeffreys) prior.
  5. Frequentist statistics: It is closely related to the Fisher information, via the Cramér–Rao bound. In fact, the Fisher information matrix (the outer product of the gradient of the log-likelihood with itself) bounds it, i.e. $\Sigma^{-1}\preceq \mathcal{F}$ (w.r.t. the positive semi-definite cone, i.e. w.r.t. concentration ellipsoids). So when $\Sigma^{-1}=\mathcal{F}$ the maximum likelihood estimator is efficient, i.e. maximum information exists in the data, so the frequentist regime is optimal. In simpler words, for some likelihood functions (note that the functional form of the likelihood depends purely on the probabilistic model which supposedly generated the data, aka the generative model), maximum likelihood is an efficient and consistent estimator, and rules like a boss. (Sorry for overkilling it.)
+",2013-10-22 12:00:53.823 +58009,7949.0,2,,58006.0,,,,CC BY-SA 3.0,"

Just a partial answer, addressing one of your questions.

+ +

Generally, for probability distributions you have:

+ +

$$ +E(X+Y) = E(X)+E(Y) +$$ +and +$$ +V(X+Y) = V(X)+V(Y) + 2\text{Cov}(X,Y) +$$ +with E and V being the expectation and variance respectively ($SD = \sqrt{V}$).

+ +

For you, this means the mean of the sum is just the sum of the means (so you can get it). For the SD of the sum you also need the covariance or correlation between FM and FFM. One might use sensible guesses to get realistic lower and upper bounds. Your question regarding FM/(FM+FFM) is much more complicated.

+ +

I am not even sure that it is possible without knowing the exact distributions. There is an approximate solution using the delta method: +http://en.wikipedia.org/wiki/Taylor_expansions_for_the_moments_of_functions_of_random_variables. But you also require the covariance for that.

+",2013-10-22 12:07:08.897 +58035,6204.0,2,,58029.0,,,,CC BY-SA 3.0,"

Further to Eupraxis1981's discussion of informative priors, you can think of the ""information"" in a prior as inversely proportional to its variance. Consider a prior with near zero variance: you're basically saying ""before looking at the data, I'm almost positive I already know the location of the true value of the statistic."" Conversely, if you set a really wide variance, you're saying ""without looking at the data, I have really no assumptions about the true value of the parameter. It could be pretty much anywhere, and I won't be that surprised. I've got a hunch it's probably near the mode of my prior, but if it turns out to be far from the mode I won't actually be surprised.""

+ +

Uninformative priors are attempts to bring no prior assumptions into your analysis (how successful they are is open to debate). But it's entirely possible and sometimes useful for a prior to be only ""weakly"" informative.

+",2013-10-22 16:59:02.657 +58010,3922.0,2,,58000.0,,,,CC BY-SA 3.0,"

Ferguson's A Course in Large Sample Theory is the best concise introduction to the topic, and it is written in a nice didactic way of having an equivalent of a week's lecture course material in a chapter followed by a strong set of exercises. (Ferguson introduced GMM in 1968 under the name of the minimum $\chi^2$, and it is tucked in as one of the exercises in that book). Van der Vaart's Asymptotic Statistics, recommended by others, is great, too, but it's going off in weird directions (for an economist). Another relatively easy introduction to the first-order asymptotics is Lehmann's Elements of Large Sample Theory. I would argue though that you would get a better mileage out of a book like Smith & Young's Essentials of Statistical Inference, as it will teach you about how statisticians think (sufficiency, UMPT, Cramer-Rao bound, etc.).

+ +

Of course you won't find the odd econometric asymptotics such as unit roots or weak instruments. Few statisticians have heard of them, and these are wa-a-ay too exotic for them. However, you would definitely want to revisit these unusual papers to shake off the wrong belief that everything asymptotic is asymptotically normal at $\sqrt{n}$ rate (you can find disturbing counterexamples here and there, too).

+",2013-10-22 12:09:32.127 +58011,10409.0,1,,,,"Simple way to categorize: terrible, poor, average, good, excellent",,CC BY-SA 3.0,"

I have a data frame with the following:

+ +
> summary(d5$points)
+   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+ -4.200   0.000   1.000   2.579   5.000  23.400 
+
+> sd(d5$points)
+[1] 3.736616
+
+ +

What's a simple but statistically sound way to categorize this data into terrible, poor, average, good, excellent?

+ +

I'm using R.

+ +

Edit:

+ +

Higher points are better, and negative points are terrible. A good game would be a player scoring 6+ points, but that's just from my observations.

+ +

As requested, here are the histograms.

+ +

All Players

+ +

+ +

Top 100 Players (based on their avg points)

+ +

+",2013-10-22 12:24:58.413 +58012,17573.0,2,,57924.0,,,,CC BY-SA 3.0,"

First, the incidental parameter problem is pretty easy to solve in a discrete duration model. As long as you are willing to assume a logistic form for your model, you can eliminate the incidental parameters via a clever conditioning argument. The usual cite in economics is Chamberlain (1980, Rev Econ Stud). If you prefer a textbook, there is Greene's Econometric Analysis (any of the recent editions) --- look up ""fixed effects model binary choice"" or ""Chamberlain"" in the index. In the seventh edition, the discussion runs from pg 721 through 725. The resulting estimator is usually called ""fixed-effects logit"" or ""Chamberlain's estimator.""

+ +

To be clear, you DO NOT just run a logistic regression with a bunch of dummy variables for households. If you are a Stata user, the xtlogit command with the fe option runs Chamberlain's fixed effects logit model. In R, I don't know how to do it. There are a couple of questions on this here at cross validated (one, two), and one over at stack overflow. The answers in those threads seem mostly to misunderstand what the Chamberlain estimator is, and I think the right conclusion from them is that Chamberlain's estimator is not currently implemented in R. (I would love to be corrected if I am wrong)

+ +

Looking over your question again, I wonder whether you really want a fixed effects estimator. As with any fixed effects estimator, you are not going to be able to directly estimate the effects of any household characteristic which does not change over time. Generally, schooling, occupation, household size are fixed or almost fixed in a short panel. If you include time dummies (and why wouldn't you?), then any characteristic which changes regularly with time within each household cannot be included. Age, for example, since, once you control for time, it is just birth date, a fixed characteristic of household head. Similarly, even the effect of the duration of the unemployment spell cannot be measured once you have time dummies and household dummies, unless some households have multiple unemployment spells.

+",2013-10-22 12:56:29.680 +58013,22705.0,1,,,,Handling stationarity issues in proc ucm/state space time series models,,CC BY-SA 3.0,"

Hope I'm able to find someone who can answer this question. The previous one didn't get answered!

+ +

Proc ucm is the SAS implementation (using state space concepts) to isolate the unobserved trend, seasonality & estimate the coefficients of regressors simultaneously.

+ +

Documentation/research on proc ucm is sparse. There are two questions I'm trying to find answers for: +a. Does the dependent variable have to be treated for weak stationarity (differenced) before running proc UCM? +Ans. http://www.iasri.res.in/sscnars/socialsci/17-Structural%20Time%20Series%20Models%20for%20Describing%20Trend%20in%20All%20India%20Sunflower%20Yield%20Using%20SAS.pdf

+ +

The attached paper seems to suggest that I don't need to worry about stationarity while using proc ucm.

+ +

However, I have found a contrary situation in my data. Consider my dependent is y. And, t & s are the smoothed trend & season components isolated by proc ucm. I'd have expected the series (y-t-s) to be stationary. But, it was not.

+ +

Thereby, I conclude that proc ucm is not capable of handling non-stationary time series unless I explicitly difference/stationarize the dependent variable.

+ +

Is this right?

+ +

b. I also have regressors which exhibit co-integrated relationships with y. From Granger's research paper, it is evident that spurious regression results when co-integrated series exists. But, I hear that's only if we use OLS-based proc reg. Proc UCM is based on maximum likelihood estimation. Is co-integration not a problem with maximum likelihood estimation based regression?

+ +

I could be ambiguous in my problem statement above and can clarify if need be.

+",2013-10-22 13:12:30.567 +58026,8958.0,2,,57156.0,,,,CC BY-SA 3.0,"

I doubt that unsupervised will work better but it could be a cool exercise to try out. Unsupervised learning with random forest is done by constructing a joint distribution based on your independent variables that roughly describes your data. Then simulate a certain number of observations using this distribution. For example if you have 1000 observations you could simulate 1000 more. Then you label them, e.g. 1:= real observation, 0:= simulated observation. After this, you run a usual random forest classifier trying to distinguish the real observations from the simulated ones. Note that you must have the 'calculate proximity' option turned on. The really useful output is exactly this: a description of the proximity between your observations, based on what Random Forest does when trying to assign these labels. You now have a description of how ""close"" or ""similar"" your observations are to each other, and you could even cluster them based on many techniques. A straightforward one would be to select thresholds for these ""distances"", i.e. stick together observations that are closer than a certain threshold. Another easy option is to do hierarchical clustering, but using this particular distance matrix. If you can work with R, most hierarchical clustering packages allow you to feed the functions custom distance matrices. You then select a cutoff point; you may visualize it as a dendrogram, and so on and so forth.
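
+ +

Here is a rough R sketch of that recipe (added here for illustration, not code from the tutorial mentioned below), using the randomForest package; the synthetic observations are built by independently permuting each column, which is one common way to construct the ""simulated"" class, and the iris columns are just placeholder data.

+ +

library(randomForest)
+
+x <- iris[, 1:4]                                   # placeholder data: 150 real observations
+x_synth <- as.data.frame(lapply(x, sample))        # permute each column to break dependence
+dat <- rbind(x, x_synth)
+label <- factor(c(rep('real', nrow(x)), rep('synthetic', nrow(x_synth))))
+
+rf <- randomForest(dat, label, proximity = TRUE)   # classify real vs simulated, keep proximities
+
+prox <- rf$proximity[1:nrow(x), 1:nrow(x)]         # proximities among the real observations only
+hc <- hclust(as.dist(1 - prox))                    # hierarchical clustering on the RF 'distance'
+clusters <- cutree(hc, k = 3)
+table(clusters)
+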

+ +

This used to be a very good tutorial on Random Forest clustering and they shared some useful R functions which they wrote for this purpose but the link seems to be dead now. Maybe it will come back up later. They also wrote a very neat random glm R package (which is analogous to random forest but based on duh...glms) if you want to check that out. You could always write to the authors and ask for the material for Random Forest classification which used to be available on the dead link. I have the R code but it's too large to paste here, I can send it to you if you send me a private message.

+",2013-10-22 15:43:34.490 +58014,16043.0,2,,57990.0,,,,CC BY-SA 3.0,"

I think the core misunderstanding stems from questions you ask in the first half of your question. I approach this answer as contrasting MLE and Bayesian inferential paradigms. A very approachable discussion of MLE can be found in chapter 1 of Gary King, Unifying Political Methodology. Gelman's Bayesian Data Analysis can provide details on the Bayesian side.

+ +
+

In Bayes' theorem, $$p(y|x)=\frac{p(x|y)p(y)}{p(x)}$$ + and from the book I'm reading, $p(x|y)$ is called the likelihood, but I assume it's just the conditional probability of $x$ given $y$, right?

+
+ +

The likelihood is a conditional probability. To a Bayesian, this formula describes the distribution of the parameter $y$ given data $x$ and prior $p(y)$. But since this notation doesn't reflect your intention, henceforth I will use ($\theta$,$y$) for parameters and $x$ for your data.

+ +

But your update indicates that $x$ are observed from some distribution $p(x|\theta,y)$. If we place our data and parameters in the appropriate places in Bayes' rule, we find that these additional parameters pose no problems for Bayesians: +$$p(\theta|x,y)=\frac{p(x,y|\theta)p(\theta)}{p(x,y)}$$

+ +

I believe this expression is what you are after in your update.

+ +
+

The maximum likelihood estimation tries to maximize $p(x,y|\theta)$, right?

+
+ +

Yes. MLE posits that $$p(x,y|\theta) \propto p(\theta|x,y)$$ +That is, it treats the term $\frac{p(\theta,y)}{p(x)}$ as an unknown (and unknowable) constant. By contrast, Bayesian inference treats $p(x)$ as a normalizing constant (so that probabilities sum/integrate to unity) and $p(\theta,y)$ as a key piece of information: the prior. We can think of $p(\theta,y)$ as a way of incurring a penalty on the optimization procedure for ""wandering too far away"" from the region we think is most plausible.

+ +
+

If so, I'm badly confused, because $x,y,\theta$ are random variables, right? To maximize $p(x,y|\theta)$ is just to find out the $\hat{\theta}$?

+
+ +

In MLE, $\hat{\theta}$ is assumed to be a fixed quantity that is unknown but able to be inferred, not a random variable. Bayesian inference treats $\theta$ as a random variable. Bayesian inference puts probability density functions in and gets probability density functions out, rather than point summaries of the model, as in MLE. That is, Bayesian inference looks at the full range of parameter values and the probability of each. MLE posits that $\hat{\theta}$ is an adequate summary of the data given the model.

+",2013-10-22 13:33:40.650 +58015,22933.0,2,,16209.0,,,,CC BY-SA 3.0,"
as.numeric(factor(c(""d"", ""a"", ""b"", ""b"", ""c"")))
+
+[1] 4 1 2 2 3
+
+",2013-10-22 13:36:16.093 +58016,22399.0,2,,58008.0,,,,CC BY-SA 3.0,"

It is a measure of precision just as $\Sigma$ is a measure of dispersion.

+ +

More elaborately, $\Sigma$ is a measure of how the variables are dispersed around the mean (the diagonal elements) and how they co-vary with the other variables (the off-diagonal elements). The greater the dispersion, the farther apart they are from the mean; and the more they co-vary (in absolute value) with the other variables, the stronger the tendency for them to 'move together' (in the same or opposite direction depending on the sign of the covariance).

+ +

Similarly, $\Sigma^{-1}$ is a measure of how tightly clustered the variables are around the mean (the diagonal elements) and the extent to which they do not co-vary with the other variables (the off-diagonal elements). Thus, the higher the diagonal element, the tighter the variable is clustered around the mean. The interpretation of the off-diagonal elements is more subtle and I refer you to the other answers for that interpretation.
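
+ +

A tiny two-variable R illustration of this 'tightness' reading (added here): the reciprocal of a diagonal element of $\Sigma^{-1}$ is the conditional variance of that variable given the others, which is smaller the more strongly the variables co-vary.

+ +

Sigma <- matrix(c(1.0, 0.8,
+                  0.8, 1.0), 2, 2)   # a strongly co-varying pair with unit marginal variances
+Omega <- solve(Sigma)                # the precision matrix
+
+Omega
+1 / diag(Omega)                      # conditional variance of each variable given the other
+1 - 0.8^2                            # = 0.36, i.e. much tighter than the marginal variance of 1
+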

+",2013-10-22 13:42:11.060 +58017,306.0,2,,58011.0,,,,CC BY-SA 3.0,"

Apply a Box-Cox transformation. Then use the following :

+ +
    +
  1. x < mean - 2 * sigma : terrible
  2. mean - 2 * sigma <= x < mean - sigma : poor
  3. mean - sigma <= x < mean + sigma : average
  4. mean + sigma <= x < mean + 2 * sigma : good
  5. mean + 2 * sigma <= x : excellent.
+ +

In the absence of any other information, I would go this way. However, at least a histogram could have been given.

+",2013-10-22 13:45:33.197 +58018,668.0,2,,57972.0,,,,CC BY-SA 3.0,"

This question comes up in analyses of spatial relationships, especially in studying edge effects in spatial statistics. The answer depends intimately on the shape of the field, the shape of the water within the field, and the size of the hoop.

+ +

A hoop of radius $r$ ""contains water"" if and only if its center lies within distance $r$ of water, which we may view as a point set $W$ in the plane. To solve this, construct the $r$ buffer of the water: this is the locus of all points within distance $r$ of water. Let's call it $B_r(W).$

+ +

Assume now that the field $F$ is also modeled as a set of points in the plane, and suppose it is bounded and measurable. Regardless of the shape of $W$ (it might not be measurable), this implies (via standard compactness arguments) that $B_r(W) \cap F$ is (Borel) measurable for all positive buffer distances $r$. Accordingly, it is meaningful to talk about the probability that a uniformly distributed hoop center lies within $B_r(W) \cap F$. By the very definition of a uniform distribution, that probability is precisely the buffer area divided by the area of the field.

+ +

Examples

+ +

+ +
    +
  1. (A single puddle, at left.) The field is a circle of radius $R$ and the water is represented by a single interior point of distance at least $r$ from the boundary of the field. The buffer $B_r(W)$ then is a circle of radius $r$ and its area relative to that of the field is $(r/R)^2$.

  2. (Multiple puddles, center.) As before, but now the water is a finite set of $n$ points all separated by at least $2r$ from each other and all lying at least $r$ from the field's boundary. The probability is now $n(r/R)^2.$

  3. (Circular pond, at right.) Suppose the water is a circular pond of radius $w$ and, as before, lies no closer than $r$ to the field's boundary. Its buffer is a circle of radius $w+r,$ whence the probability is $\left((w+r)/R\right)^2.$

  4. (Canal system.) Let the field be a rectangle and suppose the water forms a gridded irrigation system covering the field. The spacing between the edge of one irrigation canal and the edge of the next is no greater than $2r$. The buffer $B_r(W)$ now covers the entire field (by design!) and the probability is $1$.

    + +

    + +

    This figure (taken from an answer at https://gis.stackexchange.com/a/17390) shows a small-radius buffer of an irregular set of line segments (the irrigation ditches, shown in black) within a rectangular field. As the radius grows, eventually the buffer--the gray area--will merge into the entirety of the field. (The pink areas are not relevant to the present discussion.)

+ +

Note that in many of these cases the water may have practically no area--it would be represented on a map as a collection of points or line segments--while the probability could end up being anything between $0$ and $1$. When the water forms a connected, simply-connected domain that is not too tortuous and the buffer distance $r$ is very small (smaller than the width of the water at its narrowest point), then the buffer area is approximately

+ +

$$|W| + rP + \pi r^2$$

+ +

where $|W|$ is the area of the water and $P$ is the length of the perimeter of the water (assuming it is finite). (This formula is exact for piecewise linear convex domains: that is, convex polygons.) To first order in $r,$ then, the probability is approximately

+ +

$$\frac{|W| + rP}{|F|}$$

+ +

where $|F|$ is the area of the field. This shows that initially, for very small $r$, the probability grows linearly with $r$ and the slope (constant of proportionality) is the ratio of the perimeter of $W$ to the area of the field, $P/|F|.$
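
+ +

As a quick numerical check of these formulas (an R snippet added here for illustration), for the circular pond of example 3 the buffer-area expression $(|W| + rP + \pi r^2)/|F|$ reproduces the exact probability $\left((w+r)/R\right)^2$:

+ +

R <- 100; w <- 5; r <- 2                 # field radius, pond radius, hoop radius (arbitrary)
+
+exact <- ((w + r) / R)^2                 # example 3
+buffer <- (pi * w^2 + r * 2 * pi * w + pi * r^2) / (pi * R^2)   # (|W| + r P + pi r^2) / |F|
+
+c(exact = exact, buffer = buffer)        # identical for this convex case
+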

+",2013-10-22 13:51:13.770 +58019,10147.0,1,,,,Ordinal Regression,,CC BY-SA 3.0,"

In my data, the response variable is ordinal and there are 16 predictor variables which are also ordinal. I plan to perform an ordinal regression, but before doing that I want to graphically show the relationship between the response and predictor variables. In regular regression, where the variables are numerical, we can produce a scatterplot matrix for that purpose. I am wondering whether, for ordinal regression with ordinal predictors, there is any graph that can be used to visually display the relationship before the formal analysis. Another possibility is to calculate the association between the response and the ordinal predictors before the analysis. Any recommendation on that part? +Also, I plan to fit a proportional odds model using R. Can anyone recommend an R package or functions that can be used to test the proportional odds assumption and also to carry out the analysis?

+",2013-10-22 14:56:25.547 +58020,4735.0,1,58034.0,,,"kurtosis, positive skewed and negative skewed for probability distribution",,CC BY-SA 3.0,"

When discussing probability distributions, I always read terms such as excess kurtosis, positive kurtosis, positively skewed and negatively skewed. What exactly do these concepts indicate? In practical applications, such as market returns or something else, what can these characteristics tell us?

+",2013-10-22 15:06:37.997 +58021,12358.0,1,58022.0,,,What is the name for this distribution defined on a circle,,CC BY-SA 3.0,"

One can define the probability distribution: +$$ +p(\theta; \alpha, \theta_0) = \frac{ e^{ \alpha \cos( \theta-\theta_0) }}{ 2 \pi I_0(\alpha)} +$$ +over an angular variable $\theta \in [0,2 \pi]$. By what name(s) is this distribution called?

+ +

($I_0$ is the modified Bessel function and serves to normalize the distribution)

+",2013-10-22 15:15:20.203 +58022,3922.0,2,,58021.0,,,,CC BY-SA 3.0,"

It's von Mises distribution, aka Tikhonov distribution, and plays the role similar to the normal distribution in 1D statistics.

+ +

For reference, $I_0(z)$ is the modified Bessel function of order 0.
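
+ +

For concreteness, a short R sketch (added here) of the density exactly as written in the question, using base R's besselI for the normalizing constant; the parameter values are arbitrary.

+ +

dvonmises <- function(theta, alpha, theta0) {
+  exp(alpha * cos(theta - theta0)) / (2 * pi * besselI(alpha, nu = 0))
+}
+
+theta <- seq(0, 2 * pi, length.out = 400)
+plot(theta, dvonmises(theta, alpha = 2, theta0 = pi), type = 'l')
+integrate(dvonmises, 0, 2 * pi, alpha = 2, theta0 = pi)$value   # ~ 1, as a sanity check
+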

+",2013-10-22 15:17:50.823 +58024,3922.0,4,,,,,,CC BY-SA 3.0,"Directional statistics deals with distributions of angles (on a circle, on a sphere, or another kind of hypersphere appropriate for dimension at hand)",2013-10-22 15:20:37.460 +58023,0.0,5,,,,,,CC BY-SA 3.0,,2013-10-22 15:20:37.460 +58025,22935.0,1,58030.0,,,Are MRFs with edges to all observed data possible?,,CC BY-SA 3.0,"

I have been discussing the following issue with a colleague of mine and I can't seem to wrap my head around it. I have a computer vision background, so I'm mostly familiar with 2D MRFs/CRFs for image restoration and segmentation.

+ +

What is clear to me is that MRFs typically have a simple 2D grid structure for p(x) and each underlying variable x_i has an edge to the corresponding pixel y_i in the graph for the posterior p(y|x). In CRFs you have an edge to each pixel y in the image. I can fully understand that CRFs are trained in a discriminative way and thus they are tractable and that it wouldn't be possible to really learn a generative model based on such a CRF graph. However in all the literature it seems that by definition the MRF model cannot have edges from a x_i to all pixels y, as it then would no longer be a MRF. For example the following states:

+ +
+

If the conditional independence assumption is not used, the posterior will usually not be a MRF making the inference difficult.

+
+ +

Now that it would not be tractable to actually learn such a model I can understand, but why would it no longer be an MRF and also what would it be then? Because I don't see why it would by definition become a CRF, as we do not NEED to condition it on the data.

+ +

I really hope someone can tell me what I am missing here, or where my mistake is!

+",2013-10-22 15:42:00.327 +58027,22923.0,1,58028.0,,,Non-stationary series keep close to each other but correlation between growth rates is ~0 - how is this possible?,,CC BY-SA 3.0,"

I have 2 (monthly) time-series that look like this:

+ +

+ +

Economic intuition suggests that they are positively related, and I can see this in the plot, but if I compute the correlation between their log-returns $\ln x_t/x_{t-1}$ and $\ln y_t/y_{t-1}$, this correlation is -0.04, which is basically zero and not statistically significant for my data size (~60 points).

+ +

How can it be?

+ +

One may say that series are cointegrated $y_t = a x_t +\varepsilon_t$, but then returns should follow $\Delta y_t = a \Delta x_t +\Delta \varepsilon_t$ and correlation between returns would also be significant. So if I see zero correlation between returns, there is no cointegration between levels as well - right?

+ +

So does this zero correlation mean that there is no relation between the series? +If yes - why do they follow each other so closely? +If no - how can I quantify this relation if the correlation between the differenced series is ~0 and cointegration tests for the original series are inconclusive?

+ +

EDITS:

+ +

-- added cointegration -> correlation link to address @AlecosPapadopoulos question.

+",2013-10-22 15:50:53.610 +58028,12358.0,2,,58027.0,,,,CC BY-SA 3.0,"

Your measure is a short-time-scale measurement; note that you are only looking at the (log) differences between successive time stamps. There is enough short-time-scale noise that it masks the longer-term, $O(1\ \text{year})$, timescale correlations in the data.

+",2013-10-22 16:04:04.133 +58029,6630.0,1,58033.0,,,What exactly is weakly informative prior?,,CC BY-SA 3.0,"

Is there a precise definition of weakly informative prior?

+ +

How is it different from a subjective prior with broad support?

+",2013-10-22 16:22:35.387 +58030,3183.0,2,,58025.0,,,,CC BY-SA 3.0,"

Restricted Boltzmann Machines are Markov random fields that usually have complete connectivity between the hidden and visible variables. If we drop the ""restricted"" part and look at Boltzmann Machines more generally, there can also be complete connectivity among the visible variables (pixels).

+ +

So yes, they're possible.

+ +

I haven't read the whole paper you linked to, but here is my assessment of what they mean.

+ +
    +
  • If $p(x)$ is an MRF, and all the elements of $y$ are conditionally independent of one another given $x$, then $p(x, y)$ is also an MRF.

  • If the $y$ variables are not conditionally independent of one another, then this will not necessarily be true (although it still could be). For instance, if there were directed connections among the $y$ variables, then you wouldn't have an MRF anymore.

+ +

However, the case you seem to be thinking about, where there are undirected connections between the variables (like the Boltzmann Machines I linked to) are still considered MRFs because all their pieces are MRFs.

+",2013-10-22 16:34:44.577 +58031,22938.0,1,,,,Coding for Regression Analysis,,CC BY-SA 3.0,"

Q: What is the mathematical theory behind coding categorical variables for regression analysis?

+ +

The situation is this:

+ +

For numeric variables, we can code observations via the transformation

+ +

$\textrm{coded value} = \frac{\textrm{uncoded value} - \frac{(high+low)}{2}}{\frac{(high-low)}{2}}$.

+ +

For categorical variables, the above clearly will not work. I have been given the following codings without justification. For two variables, such as material type A and type B, we code them as

+ +

$ +Type A = -1 \\ +Type B = 1. +$

+ +

If there are three levels, we code them as

+ +

$ +Type A = \{1,0\} \\ +Type B = \{0,1\} \\ +Type C = \{-1,-1\}. +$

+ +

Once coded, we can then enter the data into SAS and run PROC REG to determine a regression model.

+ +

What is the mathematical basis for this coding? My textbook, Montgomery Design and Analysis of Experiments, 8th Edition, provides little more than a page on coding in general. My instructors cannot provide me with anything more than that it has something to do with orthogonality of vectors.

+ +

For the three-level case, we can see that any two of the coding vectors are linearly independent, but the three together are dependent. As such, if we arrange them in a matrix, the matrix must be singular.

+ +

Just as there is a justification for the numeric case, I want to understand the categorical. I'm not afraid of matrices or linear algebra, although it seems like there could be a simple geometric explanation. If it is too lengthy to explain, I would be happy with a textbook or online reference.

+",2013-10-22 16:42:07.447 +58032,40516.0,1,,,user2551700,Comparison of two curves,,CC BY-SA 3.0,"

I have two measurements of something. You can think of them both as their own curve with known error on each point. They look quite similar, and if I calculate the $R^2$ value it comes out to >0.9. But what I want to be able to calculate is a P value comparing these two curves (i.e., what is the probability that the difference I'm looking at is just due to noise?). Now I could easily do a Student t-test at each point and come up with a P-value at each point, sure. But is there some way to come up with an overall P value that uses all the points and not just one? Thanks very much for any help.

+",2013-10-22 16:47:18.103 +58033,,2,,58029.0,user31668,,,CC BY-SA 3.0,"

The above comment is accurate. For a quantitative discussion, there are a number of ""uninformative"" priors in the literature. See for example Jeffreys' prior; see earlier post What is an "uninformative prior"? Can we ever have one with truly no information?

+ +

They are defined in different ways, but the key is that they do not place too much probability in any particular interval (and hence favor those values) with the uniform distribution being a canonical example. The idea is to let the data determine where the mode is.

+",2013-10-22 16:48:53.887 +58034,,2,,58020.0,user31668,,,CC BY-SA 3.0,"

Excess kurtosis $=$ kurtosis $-$ 3, since the normal distribution has kurtosis $=$ 3 (that is what the ""excess"" refers to). Also, kurtosis itself is always positive, so any reference to a sign presumably refers to excess kurtosis; a positive sign means the distribution has more kurtosis than the normal. Skew indicates how asymmetrical the distribution is, with more skew indicating that one of the tails ""stretches"" out from the mode farther than the other does.

+ +

Practically: High kurtosis indicates a high propensity of a distribution to give you ""outliers"", in the sense that you will tend to get a lot of rather closely spaced outcomes, followed by a few, rare, way-out-there values. In the markets, this type of distribution can lull you into a sense of complacency, with well localized values most of the time, only to ruin your day with a crazy loss. For skewed distributions, a right skew in a financial product's returns indicates, as a simple example, that its positive returns tend to be larger than its losses, which, all other things being equal, is good.
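
+ +

To see these quantities numerically, here is a minimal R sketch using moment-based estimators computed by hand (so no extra package is needed); a t-distribution with 10 degrees of freedom is used purely as an illustration of a fat-tailed case:

+ +
set.seed(1)
+x <- rt(1e5, df = 10)                  # Student t with 10 df: excess kurtosis should be near 6/(10 - 4) = 1
+m <- mean(x); s <- sd(x)
+skew   <- mean((x - m)^3) / s^3        # sample skewness (near 0 for a symmetric distribution)
+exkurt <- mean((x - m)^4) / s^4 - 3    # sample excess kurtosis (0 for the normal)
+c(skewness = skew, excess.kurtosis = exkurt)
+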

+",2013-10-22 16:57:16.083 +58036,22399.0,2,,58031.0,,,,CC BY-SA 3.0,"

Consider your two-type situation (say gender, to be concrete). Suppose that you decide to code as follows:

+ +

If respondent is male: Set M = 1 or 0 otherwise

+ +

If respondent is female: Set F = 1 or 0 otherwise

+ +

Then the data set will look like so:

+ +
M F Y
+1 0 y1
+0 1 y2
+1 0 y3
+1 0 y4
+0 1 y5
+0 1 y6
+
+ +

Obviously, including both these variables as part of your $x$ matrix would lead to a singular design matrix which cannot be inverted. In other words, you cannot estimate the effect of both 'male' and 'female' respondents on your dependent variable independently of each other. Thus, you decide to drop one of these variables (say 'F'). Then assuming a model that looks like so:

+ +

$y=a + b M $

+ +

the OLS estimate of the intercept $a$ will give us the mean of $y$ for 'female' respondents (for whom $M = 0$), and $a+b$ the mean for 'male' respondents. In other words, $b$ gives us the difference between male and female respondents on $y$.

+ +

What happens if we code M as $1, -1$?

+ +

Then the equations for male and female respondents are:

+ +

$y_m = a + b$ and

+ +

$y_f = a - b$

+ +

Thus, $b$ is now $\frac{y_m-y_f}{2}$, which is a bit harder to interpret. So a simpler way to obtain interpretability is to choose a coding scheme such as $0.5, -0.5$ instead of $1, -1$, which then results in:

+ +

$y_m = a + 0.5 b$ and

+ +

$y_f = a - 0.5 b$

+ +

Thus, $b$ is $y_m-y_f$ which is identical to the interpretation we obtained when we chose to code them as $1, 0$.

+ +

The above logic can be extended to the context of more than two categories. If you want to know more you should probably look up 'Contrast Coding', as the different ways to code categorical variables are different ways to estimate contrasts between the categories (e.g., do we want to estimate the impact of male - female, in which case we pick the 'M' column, or do we want to estimate the impact of female - male, in which case we pick the 'F' column).
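
+ +

To see the effect of the coding scheme concretely, here is a minimal R sketch with simulated data (the true group means of 5 and 3 are made up for illustration):

+ +
set.seed(1)
+n <- 100
+gender <- rep(c(""male"", ""female""), each = n / 2)
+y <- ifelse(gender == ""male"", 5, 3) + rnorm(n)   # true means: 5 for males, 3 for females
+m01   <- ifelse(gender == ""male"", 1, 0)          # dummy (1/0) coding
+m11   <- ifelse(gender == ""male"", 1, -1)         # 1/-1 coding
+mhalf <- ifelse(gender == ""male"", 0.5, -0.5)     # 0.5/-0.5 coding
+coef(lm(y ~ m01))     # slope is about y_m - y_f (roughly 2)
+coef(lm(y ~ m11))     # slope is about (y_m - y_f)/2 (roughly 1)
+coef(lm(y ~ mhalf))   # slope is about y_m - y_f again (roughly 2)
+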

+",2013-10-22 17:27:14.333 +58037,22637.0,1,58060.0,,,Value that increases the Standard Deviation,,CC BY-SA 3.0,"

I am puzzled by the following statement:

+ +

"" In order to increase the standard deviation of a set of numbers, you must add a value that is more than one standard deviation away from the mean""

+ +

What is the proof of that? I know of course how we define the standard deviation but that part I seem to miss somehow. Any comments?

+",2013-10-22 17:40:55.053 +58038,14799.0,2,,58008.0,,,,CC BY-SA 3.0,"

Using superscripts to denote the elements of the inverse, $1/\sigma^{ii}$ is the variance of the component of variable $i$ that is uncorrelated with the $p-1$ other variables, and $-\sigma^{ij}/\sqrt{\sigma^{ii}\sigma^{jj}}$ is the partial correlation of variables $i$ and $j$, controlling for the $p-2$ other variables.
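
+ +

A quick numerical check of the partial-correlation statement (a minimal R sketch with simulated data; the choice of 4 variables and 200 observations is arbitrary):

+ +
set.seed(1)
+X <- matrix(rnorm(200 * 4), 200, 4) %*% matrix(rnorm(16), 4, 4)   # correlated data
+P <- solve(cov(X))                                                # inverse of the covariance matrix
+-P[1, 2] / sqrt(P[1, 1] * P[2, 2])                                # partial correlation of variables 1 and 2
+cor(resid(lm(X[, 1] ~ X[, 3] + X[, 4])),                          # same quantity via residuals,
+    resid(lm(X[, 2] ~ X[, 3] + X[, 4])))                          # controlling for variables 3 and 4
+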

+",2013-10-22 17:57:30.763 +58039,503.0,2,,58037.0,,,,CC BY-SA 3.0,"

Leaving aside the algebra (which also works), think about it this way: The standard deviation is the square root of the variance. The variance is the average of the squared distances from the mean. If we add a value that is closer to the mean than this, the variance will shrink. If we add a value that is farther from the mean than this, it will grow.

+ +

This is true of any average of values that are non-negative. If you add a value that is higher than the mean, the mean increases. If you add a value that is less, it decreases.

+",2013-10-22 17:57:56.327 +58040,7927.0,2,,58037.0,,,,CC BY-SA 3.0,"

I'll get you started on the algebra, but won't take it quite all of the way. First, standardize your data by subtracting the mean and dividing by the standard deviation: $$ Z = \frac{x-\mu}{\sigma} .$$ Note that if $x$ is within one standard deviation of the mean, $Z$ is between -1 and 1. Z would be 1 if $x$ were exactly one sd away from the mean. Then look at your equation for standard deviation: $$\sigma = \sqrt{\frac{\sum_{i=1}^{N}Z_i^2}{N-1}}$$ What happens to $\sigma$ if $Z_N$ is between -1 and 1?

+",2013-10-22 18:02:10.477 +58041,9030.0,1,58054.0,,,conditional mutual information and how to deal with zero probabilities,,CC BY-SA 3.0,"

The conditional mutual information between three sets of mutually exclusive variables, X, Y, and Z, is defined as follows.

+ +

$I(X,Y|Z) = \sum_{xyz} P(x,y,z) \log \frac{P(z)P(x,y,z)}{P(x,z)P(y,z)}$

+ +

My questions concern the $\log$ of the ratio of the probability products.

+ +
    +
  1. if $P(z)$ or $P(x,y,z)$ is 0, then $\log(0)$ is undefined.
  2. +
  3. if $P(x,z)$ or $P(y,z)$ is 0, then $\log(\infty)$ is undefined.
  4. +
+ +

How do I deal with these 2 situations? The approach can be very flexible. For example, I thought about ignoring the inner sums where such conditions occur, but is this correct or reasonable?

+ +

any help is appreciated.

+",2013-10-22 18:08:36.503 +58042,22941.0,2,,57997.0,,,,CC BY-SA 3.0,"

You may want to consider other metrics such as adjusted $R^2$ or Mallows' $C_p$.

+",2013-10-22 18:10:17.457 +58043,22942.0,1,99851.0,,,Problem with factorial design in Minitab,,CC BY-SA 3.0,"

I'm having a problem with a factorial design in Minitab, as it is not as clear how to proceed as all problems I've had thus far.

+ +

The problem is to create a factorial design with two factors. The first factor, Environment, has two levels - H2O and Salt H2O - whose values overlap (i.e. they cannot be differentiated by simply stating one value for 'low' that is only met by H2O and one value for 'high' that is only met by Salt H2O).

+ +

The second factor is Frequency and it is 10 for four measurements and 1 for four others.

+ +

The data is structured as a table such as:

+ +
                         Environment
+                   H2O                  Salt H2O
+
+            10    value                  value 
+Frequency 
+
+ +

The problem is that I don't know how to create this in Minitab. I'm aware that it will be a 2$\times$2 factorial design with 4 replicates, but as mentioned earlier I don't know how to define the Environment factor. I also don't understand how to put in the response variable.

+ +

I guess it will be the values in the table, but what do these entries in the table correspond to in Minitab?

+",2013-10-22 18:40:24.197 +58044,11359.0,2,,54637.0,,,,CC BY-SA 3.0,"

Yes, it is possible and, yes, there are R functions that do it. Instead of computing the p-values of the repeated analyses by hand, you can use the package Zelig, which is also referred to in the vignette of the Amelia-package (for a more informative method see my update below). I'll use an example from the Amelia-vignette to demonstrate this:

+ +
library(""Amelia"")
+data(freetrade)
+amelia.out <- amelia(freetrade, m = 15, ts = ""year"", cs = ""country"")
+
+library(""Zelig"")
+zelig.fit <- zelig(tariff ~ pop + gdp.pc + year + polity, data = amelia.out$imputations, model = ""ls"", cite = FALSE)
+summary(zelig.fit)
+
+ +

This is the corresponding output including $p$-values:

+ +
  Model: ls
+  Number of multiply imputed data sets: 15 
+
+Combined results:
+
+Call:
+lm(formula = formula, weights = weights, model = F, data = data)
+
+Coefficients:
+                Value Std. Error t-stat  p-value
+(Intercept)  3.18e+03   7.22e+02   4.41 6.20e-05
+pop          3.13e-08   5.59e-09   5.59 4.21e-08
+gdp.pc      -2.11e-03   5.53e-04  -3.81 1.64e-04
+year        -1.58e+00   3.63e-01  -4.37 7.11e-05
+polity       5.52e-01   3.16e-01   1.75 8.41e-02
+
+For combined results from datasets i to j, use summary(x, subset = i:j).
+For separate results, use print(summary(x), subset = i:j).
+
+ +

zelig can fit a host of models other than least squares.

+ +

To get confidence intervals and degrees of freedom for your estimates you can use mitools:

+ +
library(""mitools"")
+imp.data <- imputationList(amelia.out$imputations)
+mitools.fit <- MIcombine(with(imp.data, lm(tariff ~ polity + pop + gdp.pc + year)))
+mitools.res <- summary(mitools.fit)
+mitools.res <- cbind(mitools.res, df = mitools.fit$df)
+mitools.res
+
+ +

This will give you confidence intervals and proportion of the total variance that is attributable to the missing data:

+ +
              results       se    (lower    upper) missInfo    df
+(Intercept)  3.18e+03 7.22e+02  1.73e+03  4.63e+03     57 %  45.9
+pop          3.13e-08 5.59e-09  2.03e-08  4.23e-08     19 % 392.1
+gdp.pc      -2.11e-03 5.53e-04 -3.20e-03 -1.02e-03     21 % 329.4
+year        -1.58e+00 3.63e-01 -2.31e+00 -8.54e-01     57 %  45.9
+polity       5.52e-01 3.16e-01 -7.58e-02  1.18e+00     41 %  90.8
+
+ +

Of course you can just combine the interesting results into one object:

+ +
combined.results <- merge(mitools.res, zelig.res$coefficients[, c(""t-stat"", ""p-value"")], by = ""row.names"", all.x = TRUE)
+
+ +

Update

+ +

After some playing around, I have found a more flexible way to get all necessary information using the mice-package. For this to work, you'll need to modify the package's as.mids()-function. Use Gerko's version posted in my follow-up question:

+ +
as.mids2 <- function(data2, .imp=1, .id=2){
+  ini <- mice(data2[data2[, .imp] == 0, -c(.imp, .id)], m = max(as.numeric(data2[, .imp])), maxit=0)
+  names  <- names(ini$imp)
+  if (!is.null(.id)){
+    rownames(ini$data) <- data2[data2[, .imp] == 0, .id]
+  }
+  for (i in 1:length(names)){
+    for(m in 1:(max(as.numeric(data2[, .imp])))){
+      if(!is.null(ini$imp[[i]])){
+        indic <- data2[, .imp] == m & is.na(data2[data2[, .imp]==0, names[i]])
+        ini$imp[[names[i]]][m] <- data2[indic, names[i]]
+      }
+    } 
+  }
+  return(ini)
+}
+
+ +

With this defined, you can go on to analyze the imputed data sets:

+ +
library(""mice"")
+imp.data <- do.call(""rbind"", amelia.out$imputations)
+imp.data <- rbind(freetrade, imp.data)
+imp.data$.imp <- as.numeric(rep(c(0:15), each = nrow(freetrade)))
+mice.data <- as.mids2(imp.data, .imp = ncol(imp.data), .id = NULL)
+
+mice.fit <- with(mice.data, lm(tariff ~ polity + pop + gdp.pc + year))
+mice.res <- summary(pool(mice.fit, method = ""rubin1987""))
+
+ +

This will give you all results you get using Zelig and mitools and more:

+ +
                  est       se     t    df Pr(>|t|)     lo 95     hi 95 nmis   fmi lambda
+(Intercept)  3.18e+03 7.22e+02  4.41  45.9 6.20e-05  1.73e+03  4.63e+03   NA 0.571  0.552
+pop          3.13e-08 5.59e-09  5.59 392.1 4.21e-08  2.03e-08  4.23e-08    0 0.193  0.189
+gdp.pc      -2.11e-03 5.53e-04 -3.81 329.4 1.64e-04 -3.20e-03 -1.02e-03    0 0.211  0.206
+year        -1.58e+00 3.63e-01 -4.37  45.9 7.11e-05 -2.31e+00 -8.54e-01    0 0.570  0.552
+polity       5.52e-01 3.16e-01  1.75  90.8 8.41e-02 -7.58e-02  1.18e+00    2 0.406  0.393
+
+ +

Note, using pool() you can also calculate $p$-values with $df$ adjusted for small samples by omitting the method-parameter. What is even better, you can now also calculate $R^2$ and compare nested models:

+ +
pool.r.squared(mice.fit)
+
+mice.fit2 <- with(mice.data, lm(tariff ~ polity + pop + gdp.pc))
+pool.compare(mice.fit, mice.fit2, method = ""Wald"")$pvalue
+
+",2013-10-22 18:47:52.037 +58045,14799.0,2,,58037.0,,,,CC BY-SA 3.0,"

The puzzling statement gives a necessary but insufficient condition for the standard deviation to increase. If the old sample size is $n$, the old mean is $m$, the old standard deviation is $s$, and a new point $x$ is added to the data, then the new standard deviation will be less than, equal to, or greater than $s$ according as $|x-m|$ is less than, equal to, or greater than $s\sqrt{1+1/n}$.

+",2013-10-22 18:49:23.243 +58046,22944.0,1,,,,Comparing two survey items with a third variable,,CC BY-SA 3.0,"

I want to compare the results of two items by age category and do not really know what test to use. I tried a Cochran–Mantel–Haenszel test at first, but I don't think it is testing what I really want to know.

+ +

We have two items (both dichotomous yes/no). We are interested in the proportion who said yes to item 2 if they said yes to item 1 and if this differs by age category.

+ +

Anyone have any ideas? I thought about doing a logistic regression with the ""events"" as those who chose item 2 and the ""trials"" as the number who chose item 1, with the age category as the explanatory variable. Does that make sense?

+",2013-10-22 19:18:45.377 +58086,2666.0,2,,58085.0,,,,CC BY-SA 3.0,"

The ""significance"" of the effect has no effect at all on the interpretation, given that you pay attention to confidence intervals. Even better might be to compute simultaneous confidence intervals as made easy by the R rms package Predict, plot.Predict, and lrm functions using the R multcomp package.

+",2013-10-23 13:07:07.897 +58047,22914.0,2,,57965.0,,,,CC BY-SA 3.0,"

I found this beautiful web page with precisely the information that I was looking for: Birthday Problem at the University of Alabama in Huntsville.

+ +

Following the derivation in Eq (13) therein, the number of possible $D$-samples from a population of $N$ balls that exclude exactly $C$ of them is $$\#\{\#\text{excluded}=C\} = {N\choose C}\sum_{k=0}^{N-C}(-1)^k{N-C\choose k}(N-C-k)^D.$$

+ +

Below, we evaluate the sum analytically. So the probability of ending up with exactly $C$ clean (unmarked) balls is the ratio of $\#\{\# excluded=C\}$ to the total number of possible samples:

+ +

$$P_C = \frac{\#\{\#\text{excluded}=C\}}{N^D}=N^{-D} (N-C)! \binom{N}{C}\mathcal{S}_D^{(N-C)},$$ where $\mathcal{S}_D^{(N-C)}$ is the Stirling number of the second kind.
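
+ +

The inclusion-exclusion sum is also easy to evaluate numerically; a minimal R sketch (the values N = 10, D = 5 are only illustrative):

+ +
p_clean <- function(N, D, C) {
+  k <- 0:(N - C)
+  excl <- choose(N, C) * sum((-1)^k * choose(N - C, k) * (N - C - k)^D)   # samples excluding exactly C balls
+  excl / N^D
+}
+p_clean(N = 10, D = 5, C = 6)                        # P(exactly 6 of 10 balls stay clean after 5 draws)
+sum(sapply(0:10, function(C) p_clean(10, 5, C)))     # sanity check: the probabilities sum to 1
+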

+",2013-10-22 19:35:19.693 +58048,21762.0,2,,58046.0,,,,CC BY-SA 3.0,"

Edited after clarification

+ +

Your logistic regression looks just fine. Maybe easier to sell, but similar, would be to use a chi squared test of independence between age and item 2 for people with positive item 1. An alternative to this test would be a trend test (e.g. test for non-zero Spearman rank correlation) that would consider the ordinal nature of age categories (also only for persons with positive answer to item 1).

+",2013-10-22 19:35:57.857 +58049,19089.0,2,,55209.0,,,,CC BY-SA 3.0,"

Your reasoning here is not incorrect, but I understand why it is shaky. In essence, the K-S test looks for sufficient evidence that the true distribution, $F$, is different from your assumed distribution $F_0$ by looking at the maximum absolute deviation $K = \sup_x |\hat F(x) - F_0(x)|$. But first we must note two things:

+ +
    +
  1. Your ""reliability"" functions here are also called survival functions, often denoted $\bar F$, and defined as $$\bar F(x) = 1 - F(x)$$ where $F$ is the cdf. So it is easily seen that $$K = \sup_x \hat F(x) - F_0(x) = \sup_x \hat {\bar F}(x) - \bar F_0(x).$$

  2. +
  3. The statistic $\sqrt{n}K$ has a Kolmogorov distribution asymptotically, but it does not at $n=20$. The military handbook you cite above is taking a shortcut around this requirement to provide you with your critical value of 0.264.

  4. +
+ +

As for interpretation of your plot, I think you need to recognize the following: the power of the K-S test is very low at this sample size. Power of course is the probability of rejecting the null when the null is not true. You'll notice that, for example, at $t=4$, you would have to have over 30% failure in order to reject your distribution assumption, a far cry from the 5% the client wanted.

+ +

How to remedy this? That's the tough question for professional statisticians. My thought is that the reason the K-S test has such low power is that it is so general. Do you really need the Weibull fit? How about setting up tests for both the 4-year and 8-year marks, and adjusting your level according to the fact that you have two tests. In fact, since these would be binary results (failed vs. didn't fail), you can possibly use Fisher's Exact Test to obtain both the level and the power of your test. Then you can be sure about what you're saying in response to the requirement of ""90% confidence"".

+",2013-10-22 19:37:42.740 +58050,8374.0,1,,,,Generate data from a t-distribution with specified mean and standard deviation,,CC BY-SA 3.0,"

How does one randomly sample from a t-distribution in R? From what I've found, the function rt in R doesn't let you specify the mean and standard deviation. For a normal distribution it is simply rnorm(x, mu, sd).

+ +

EDIT:

+ +

For the t-distribution there is a central and a non-central version. I want to know the difference between the two. In addition, if I want to specify the mean and standard deviation, does that automatically mean I am dealing with the non-central version of the t-distribution?

+ +

I chose to use the t-distribution because the data I am using are rather fat-tailed and a t-distribution with a low number of degrees of freedom seems like a good idea. What other distributions are there for handling fat-tailed data? Also, it would be great if you could specify the function in R too.

+",2013-10-22 20:10:35.257 +58051,22880.0,1,,,,Missing value treatment,,CC BY-SA 3.0,"

I have a data set with 18% of the AGE variable missing, which is an important variable for the analysis.

+ +
    +
  1. Should I try regression imputation or should I drop those observations?

  2. +
  3. Does even regression imputation make sense in this case (for age)??

  4. +
  5. I also have an income variable, but the correlation between age and income is negative and its strength is 0.1

  6. +
+ +

What should I do?

+",2013-10-22 20:14:21.513 +58052,594.0,2,,57463.0,,,,CC BY-SA 3.0,"

While it's possible to do it recursively for fixed degrees of freedom (write the cdf for a given d.f. in terms of the cdf for lower degrees of freedom, and the integrals for the two lowest-integer df may be done directly), I've never seen anyone try to implement it that way.

+ +

Some algorithms for the cdf of the $t$ are based on the incomplete beta function (which is a commonly used function in various parts of mathematics or physics).
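
+ +

As a concrete illustration of the incomplete-beta route (a minimal R sketch; the identity below is for $t \ge 0$, and the values of nu and t are arbitrary):

+ +
nu <- 7; tval <- 1.3
+# P(T <= t) = 1 - 0.5 * I_x(nu/2, 1/2) with x = nu/(nu + t^2),
+# where I_x is the regularized incomplete beta function (pbeta in R)
+1 - 0.5 * pbeta(nu / (nu + tval^2), nu / 2, 0.5)
+pt(tval, df = nu)   # should agree
+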

+ +

There are some for the inverse cdf (quantile function) based on ratios of polynomials.

+ +

Plain googling on algorithm cdf|""distribution function"" student t turns up plenty of references within the pages linked (e.g. here), such as Abramowitz and Stegun's Handbook of Mathematical Functions (which gives some small-d.f.-exact and approximate calculations), and various other books and papers.

+ +

If you want the noncentral t (e.g. for power calculations) a standard reference is Lenth, R. V. 1989. ""Algorithm AS 243: Cumulative distribution function of the noncentral t distribution"". Applied Statistics, 38, 185-189.

+ +

However, if you're doing many of these, hypothesis tests may not suit your purposes. Something more like a measure of effect size might be better.

+",2013-10-22 20:23:11.970 +58053,13037.0,1,58058.0,,,Simulating new x's in regression simulation study,,CC BY-SA 3.0,"

One of my homework problems is a simulation that compares three estimators (least squares, ridge regression with known parameters, and ridge regression with estimated parameters) for the following model $$Y_i = \beta X_i + \epsilon_i,\quad \epsilon_i\sim N(0,\sigma^2)$$

+ +

I am supposed to do 1000 replications with $X_i\sim N(0,2)$. Initially I generated my $X$ vector of data and used the same $X$ vector for each of the 1000 repetitions (so the only thing different between repetitions is what random error gets added on).

+ +

Then I thought that might be wrong and that I should generate new $X$ data between each repetition.

+ +

What is the correct thing to do?

+ +

I can provide code if need be, but not really necessary to answer my question.

+",2013-10-22 20:28:41.463 +58054,4320.0,2,,58041.0,,,,CC BY-SA 3.0,"

Ignoring the terms where this happens is the correct thing to do. You can justify this by noting that in each case you've outlined, no matter what happens inside the $\log$, you will have $P(x,y,z) = 0$, so the term contributes nothing to the sum (using the standard convention $0 \log 0 = 0$, justified by taking the limit). You can see this by applying the Frechet inequalities, namely that $P(A,B) \le \min\{P(A), P(B)\}$.
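
+ +

A minimal R sketch of this convention, skipping cells with $P(x,y,z)=0$ (the 3-dimensional array layout is just one convenient way to store the joint distribution):

+ +
cond_mi <- function(p) {
+  # p: 3-dimensional array of joint probabilities P(x, y, z) summing to 1
+  pz  <- apply(p, 3, sum)
+  pxz <- apply(p, c(1, 3), sum)
+  pyz <- apply(p, c(2, 3), sum)
+  total <- 0
+  for (x in seq_len(dim(p)[1])) for (y in seq_len(dim(p)[2])) for (z in seq_len(dim(p)[3])) {
+    pxyz <- p[x, y, z]
+    if (pxyz > 0)   # zero cells contribute nothing to the sum
+      total <- total + pxyz * log(pz[z] * pxyz / (pxz[x, z] * pyz[y, z]))
+  }
+  total
+}
+cond_mi(array(1/8, dim = c(2, 2, 2)))   # fully independent uniform case: 0
+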

+",2013-10-22 20:35:35.647 +58055,22507.0,2,,57965.0,,,,CC BY-SA 3.0,"

You may use the recurrence formula:

+ +

$p(D+1,C) = p(D,C){N-C \over N} + p(D,C+1){C+1 \over N}$

+ +

$p(0,C) = \begin{cases}1, & \text{if } C=N\\0, & \text{otherwise}\end{cases}$
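
+ +

In the recurrence, the first term corresponds to a draw that hits one of the $N-C$ already-drawn balls, and the second to a draw that hits one of the $C+1$ still-untouched balls. A minimal R sketch of the recursion (the values N = 10, D = 5 are only illustrative):

+ +
p_untouched <- function(N, D) {
+  p <- c(rep(0, N), 1)                  # p[C + 1] = P(C balls never drawn); before any draw C = N
+  for (d in seq_len(D)) {
+    p_new <- numeric(N + 1)
+    for (C in 0:N) {
+      p_new[C + 1] <- p[C + 1] * (N - C) / N +
+        (if (C < N) p[C + 2] * (C + 1) / N else 0)
+    }
+    p <- p_new
+  }
+  p
+}
+round(p_untouched(N = 10, D = 5), 4)    # distribution of the number of never-drawn balls after 5 draws
+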

+",2013-10-22 20:45:17.047 +58087,2666.0,2,,58067.0,,,,CC BY-SA 3.0,"

Simpler than BIR are the logarithmic and quadratic (Brier) scoring rules. These are proper scores that, unlike the proportion classified correctly, will not give rise to a bogus model upon optimization.
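
+ +

For reference, both scores are essentially one-liners; a minimal R sketch with made-up outcomes y and predicted probabilities p:

+ +
y <- c(1, 0, 1, 1, 0); p <- c(0.9, 0.2, 0.6, 0.8, 0.4)
+mean((p - y)^2)                           # quadratic (Brier) score: lower is better
+mean(y * log(p) + (1 - y) * log(1 - p))   # logarithmic score: closer to 0 is better
+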

+",2013-10-23 13:09:47.363 +58056,16464.0,1,,,,How to think about iid observations,,CC BY-SA 3.0,"

I'm trying to make sure my understanding of iid observations is rock solid and so I have two write-ups in which I've attempted to accurately explain the concept. Can anyone comment on the accuracy of each of these write-ups?

+ +
(1) One way to represent a population is through a random variable X, which would have
+a pdf/pmf that modeled the relative frequencies of values with the population. If you
+were to sample from the population such that each subject is equally likely to be
+selected and so that selecting a particular subject has no bearing on the
+likelihood of any other subject being selected, then each observation can be thought of
+as a realization of the random variable X. As such, we denote the observations x_i.
+Under these conditions the observations can be thought of as independent draws from the
+random variable X, and so the observations are said to be iid (independently and
+identically distributed) – i.e. the observations can be thought of as n realizations of
+the random variable X.
+
+ +

Or

+ +
(2) One way to represent a population is through a random variable X, which would have
+a pdf/pmf that modeled the relative frequencies of values within the population. If you
+were to sample from the population such that each subject is equally likely to be
+selected and so that selecting a particular subject has no bearing on the
+likelihood of any other subject being selected, then each observation can be thought of
+as a realization of the random variable X. As such, we denote the observations x_i.
+Under these conditions the observations can be thought of as independent draws from
+identically distributed random variables, X_1,…,X_n, where x_i is the realization of
+the random variable X_i for all i. Hence, the observations are said to be iid
+(independently and identically distributed).
+
+ +

There are a couple of portions that I want to get answers about, specifically.

+ +
    +
  • Could the statement ""such that each subject is equally likely to be selected and so that"" in the second sentence of each write up be removed without affecting the accuracy of the statement?
  • +
  • Are both of these ways of thinking about iid draws correct?
  • +
  • If both statements are correct, is it conventional to interpret the draws as being realizations from one RV, or each observation as a realization from separate but identical RV's?
  • +
+",2013-10-22 20:49:49.513 +58057,503.0,2,,58051.0,,,,CC BY-SA 3.0,"

In any missing data situation the first thing to ask is why the data are missing. There are three types;

+ +

Missing completely at random (MCAR) - this means that the missing data are a totally random set of the data. This rarely happens unless there is some sort of mechanical glitch

+ +

Missing at random (MAR) - this means that the missing data could be a non-random subset of the data, but that the non-randomness can be completely explained by variables that are in the data.

+ +

Not missing at random aka nonignorable non-response (NMAR) - neither of the first two.

+ +

If it's MCAR, then the only thing lost by deleting the data is statistical power. If MAR, then the usual approach is multiple imputation, which (unlike single regression imputation) accounts for the uncertainty in the imputed values.
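
+ +

In R this could look roughly like the following (a minimal sketch; dat, Y, AGE and INCOME are placeholder names for your data frame and variables):

+ +
library(""mice"")
+imp <- mice(dat, m = 5, seed = 1)          # multiple imputation (assumes MAR)
+fit <- with(imp, lm(Y ~ AGE + INCOME))     # fit the analysis model on each completed data set
+summary(pool(fit))                         # combine the results with Rubin's rules
+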

+ +

If the data are NMAR then, technically, nothing will really work. However, multiple imputation may still be a good choice. Joe Schafer said (informally; he gave a talk at my old workplace and this was in the Q and A) that MI works well unless the data are ""really NMAR"".

+",2013-10-22 21:15:00.103 +58058,7189.0,2,,58053.0,,,,CC BY-SA 3.0,"

I think it makes sense to change X between iterations. But within an iteration you should have the same X to compare the three estimators.

+ +

You want to compare the three estimators in general. If you keep X constant, it could happen that an estimator that is not better in general performs better, because that particular X you chose turned out to have some special property.
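
+ +

A minimal R sketch of that design, comparing just ordinary least squares with a fixed-penalty ridge estimator (beta, the penalty and the sample size are made up, and the 2 in N(0,2) is taken to be the variance):

+ +
set.seed(42)
+n <- 50; beta <- 2; lambda <- 0.5; reps <- 1000
+sqerr <- matrix(NA, reps, 2, dimnames = list(NULL, c(""ols"", ""ridge"")))
+for (r in seq_len(reps)) {
+  x <- rnorm(n, mean = 0, sd = sqrt(2))    # fresh X in every replication
+  y <- beta * x + rnorm(n)                 # the same X is used for all estimators within a replication
+  sqerr[r, ""ols""]   <- (sum(x * y) / sum(x^2) - beta)^2
+  sqerr[r, ""ridge""] <- (sum(x * y) / (sum(x^2) + lambda) - beta)^2
+}
+colMeans(sqerr)                            # mean squared error of each estimator
+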

+",2013-10-22 21:33:39.603 +58059,8671.0,1,68114.0,,,boosting with linear svm,,CC BY-SA 3.0,"

I am working on a boosting classifier. I am planning to use a linear SVM as the weak classifier, and I am using liblinear for it.

+ +

My question is: how can I weight each training instance in liblinear based on the boosting weights?

+",2013-10-22 22:07:51.183 +58060,4656.0,2,,58037.0,,,,CC BY-SA 3.0,"

For any $N$ numbers $y_1,y_2, \ldots, y_N$ with mean +$\displaystyle \bar{y} = \frac{1}{N}\sum_{i=1}^N y_i$, the variance is given by +$$\begin{align} +\sigma^2 &= \frac{1}{N-1}\sum_{i=1}^N (y_i-\bar{y})^2\\ +&= \frac{1}{N-1}\sum_{i=1}^N \left(y_i^2 - 2y_i\bar{y} + \bar{y}^2\right)\\ +&= \frac{1}{N-1}\left[\left(\sum_{i=1}^Ny_i^2\right) - 2N(\bar{y})^2 ++ N(\bar{y})^2 \right] \\ +\sigma^2 &=\frac{1}{N-1}\sum_{i=1}^N \left(y_i^2 - (\bar{y})^2\right) \tag{1} +\end{align}$$ +Applying $(1)$ to the given set of $n$ numbers $x_1, x_2, \ldots x_n$ +which we take for convenience in exposition to have mean $\bar{x} = 0$, +we have that +$$\sigma^2 = \frac{1}{n-1}\sum_{i=1}^n \left(x_i^2-(\bar{x})^2\right) += \frac{1}{n-1}\sum_{i=1}^n x_i^2$$ +If we now add in a new observation $x_{n+1}$ to this data set, then the new mean of +the data set is +$$\frac{1}{n+1}\sum_{i=1}^{n+1}x_i += \frac{n\bar{x} + x_{n+1}}{n+1} = \frac{x_{n+1}}{n+1}$$ +while the new variance is +$$\begin{align} +\hat{\sigma}^2 &= \frac{1}{n}\sum_{i=1}^{n+1} \left(x_i^2-\frac{x_{n+1}^2}{(n+1)^2}\right)\\ +&= \frac{1}{n}\left[\left((n-1)\sigma^2 + x_{n+1}^2\right) +- \frac{x_{n+1}^2}{n+1}\right]\\ +&= \left.\left.\frac{1}{n}\right[(n-1)\sigma^2 + \frac{n}{n+1}x_{n+1}^2\right]\\ +&> \sigma^2 ~ \text{only if}~ x_{n+1}^2 > \frac{n+1}{n}\sigma^2. +\end{align}$$ +So $|x_{n+1}|$ needs to be larger than $\displaystyle\sigma\sqrt{1+\frac{1}{n}}$ +or, more generally, $x_{n+1}$ needs to +differ from the mean $\bar{x}$ of the original data +set by more than $\displaystyle\sigma\sqrt{1+\frac{1}{n}}$, in order for +the augmented data set to have larger variance than the original data set. +See also Ray Koopman's answer which points out that the new variance is larger +than, equal to, or smaller than, the original variance according as $x_{n+1}$ +differs from the mean by more than, exactly, or less than $\displaystyle\sigma\sqrt{1+\frac{1}{n}}$.
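
+ +

A quick numerical check of the threshold (a minimal R sketch):

+ +
set.seed(1)
+x <- rnorm(20)
+s <- sd(x)
+threshold <- mean(x) + s * sqrt(1 + 1 / length(x))
+c(original     = s,
+  at.threshold = sd(c(x, threshold)),        # unchanged (up to rounding)
+  beyond       = sd(c(x, threshold + 0.5)),  # larger
+  at.mean      = sd(c(x, mean(x))))          # smaller
+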

+",2013-10-22 22:16:54.387 +58061,22949.0,1,,,,Difference between Gaussian Distribution and Cauchy Distribution,,CC BY-SA 3.0,"

I have searched for the above topic but did not find an answer. Can someone please tell me the detailed difference between the multivariate Gaussian distribution and the multivariate Cauchy distribution? How do they differ when sampling in an Estimation of Distribution Algorithm for optimisation?

+",2013-10-22 22:37:54.810 +58062,22372.0,1,,,,How can find the slope of a GLM logit model?,,CC BY-SA 3.0,"

I've created a logit model using GLM and I'd like to get the slope of the independent variables.

+ +

I've read that using coef() in R it's possible to do it, but the only thing I get back is the coefficients that I already have using summary().

+ +

I'm trying to get the slope at the mean (the marginal effect at the mean), like the one that is given when we use Gretl.

+",2013-10-22 22:41:03.607 +58063,23157.0,1,58473.0,,Eric,Gradient and Hessian of a likelihood function where y is defined implicitly,,CC BY-SA 3.0,"

$\newcommand{\implicit}{\mathrm{implicit}}$

+ +

I'm attempting to optimize model parameters $\theta$ by maximizing the likelihood function +$$ +f(y) = \ln \Bigl(\frac {n!}{k!(n-k)!}y^k(1-y)^{n-k}\Bigl) $$ +where $y$ must be calculated iteratively, as it is defined implicitly by $g(\theta)=0$, as follows: +$$ +g(\theta) = -1 + \frac{a\cdot \theta_2}{\ln \bigl(\frac{-y}{y - 1} \bigl) - \theta_1} + \frac{b\cdot \theta_4}{\ln \bigl(\frac{-y}{y - 1} \bigl) - \theta_3} +$$

+ +

($n$, $k$, $a$, and $b$ are constants).

+ +

As far as I can tell, using partial implicit differentiation, the gradient (first partial derivative with respect to $\theta_i$) would be defined by $$ +\frac{\partial \operatorname{f}}{\partial \operatorname{\theta_i}} = \frac{\frac{\partial \operatorname{f}}{\partial \operatorname{y}}\cdot \frac{-\partial \operatorname{g}}{\partial \operatorname{\theta_i}}}{\frac{\partial \operatorname{g}}{\partial \operatorname{y}}} +$$ +This seems to match numerical approximations computed by R, but when I try to calculate the Hessian by taking the partial derivative of the gradient with respect to $\theta_i$, I get strange results that are nowhere close to numerical approximations.

+ +

This is the formula I came up with as my attempt at calculating the Hessian (second-order mixed partial derivative of $f$ with respect to $\theta_i$ and $\theta_j$):

+ +

$$ +\frac{\partial}{\partial \operatorname{\theta_i}} \Bigl(\frac{\partial \operatorname{f}}{\partial \operatorname{\theta_j}}\Bigl) = \frac{\frac{\partial}{\partial \operatorname{\theta_i}} \Bigl(\frac{\partial \operatorname{f}}{\partial \operatorname{y}}\Bigl) \cdot \frac{-\partial \operatorname{g}}{\partial \operatorname{\theta_i}}}{\frac{\partial \operatorname{g}}{\partial \operatorname{y}}}+ +\frac{\frac{\partial \operatorname{f}}{\partial \operatorname{y}} \cdot \frac{\partial}{\partial \operatorname{\theta_i}} \Bigl(\frac{-\partial \operatorname{g}}{\partial \operatorname{\theta_i}}\Bigl)}{\frac{\partial \operatorname{g}}{\partial \operatorname{y}}}+ +\frac{\frac{\partial \operatorname{f}}{\partial \operatorname{y}} \cdot \frac{-\partial \operatorname{g}}{\partial \operatorname{\theta_i}}\cdot\frac{\partial}{\partial \operatorname{\theta_i}}\Bigl(\frac{\partial \operatorname{g}}{\partial \operatorname{y}}\Bigl)}{\Bigl(\frac{-\partial \operatorname{g}}{\partial \operatorname{y}}\Bigl)^2} +$$

+ +

Considering that the values I get from these calculations don't match (not even close) the automatically generated numerical approximations returned by R, I suspect that I've done something wrong here. Can anyone spot an error with the Hessian formula? Is there a property of higher-order mixed partial implicit differentiation that requires a different approach?

+ +

Thanks!

+ +

UPDATE: Alecos Papadopoulos posted a solution that eliminates the need for iteration by solving directly for $g(\theta)$, thus providing an exact value for $y$. This works perfectly for this problem, as the calculation of the gradient and Hessian does not require implicit differentiation in this case!

+ +

For proof of concept (and in case I come across any $g(\theta)$ functions that can't be solved directly), I'm still interested in figuring out what went wrong with my attempt at the Hessian. If anyone has any insight into a general solution for the Hessian using implicit differentiation, it is certainly welcome.

+",2013-10-23 00:00:39.590 +58064,20667.0,1,58066.0,,,Why do zero differences not enter computation in the Wilcoxon signed ranked test?,,CC BY-SA 3.0,"

The Wilcoxon signed rank test tells us whether the median difference between paired data could be zero. The test is executed by computing a statistic, then a z-score, and comparing it to a critical value.

+ +

The thing that I find shocking is that we

+ +

discard all the pairs with the same values from the process of computing the statistic.

+ +

From Wikipedia we have in step2:

+ +
+

Exclude pairs with $|x_{2,i} - x_{1,i}| = 0$. Let $N_r$ be the reduced + sample size.

+
+ +

And only $N_r$ is used in the rest of the computation.

+ +

One of the sources cited says:

+ +
+

In most applications of the Wilcoxon procedure, the cases in which + there is zero difference between $X_A$ and $X_B$ are at this point + eliminated from consideration, since they provide no useful + information, and the remaining absolute differences are then ranked + from lowest to highest, with tied ranks included where appropriate.

+
+ +

The author then proceeds to compute in the same manner as in the Wikipedia article.

+ +

I tried to look at Wilcoxon's original article, but he does not seem to mention same-value pairs.

+ +

The reason why I think this is madness is:

+ +

OK, same-value pairs do not change the value of the statistic, but they change the z-score. Imagine having a sample of $10^{1000}$ pairs in which, for 10 pairs, the second value is higher, and in all the remaining pairs the values are the same. According to the above mentioned articles, we should discard these $10^{1000}-10$ pairs since they ""provide no useful information"" and consider only the remaining $10$ pairs. But those $10^{1000} - 10$ pairs do provide useful information. They scream in favor of the null hypothesis.

+ +

Please, could you explain how to do the test right?

+",2013-10-23 00:35:45.353 +58065,3993.0,2,,51644.0,,,,CC BY-SA 3.0,"

""I have always been taught that random effects only influence the variance (error), and that fixed effects only influence the mean.""

+ +

As you have discovered, this is only true for balanced, complete (i.e., no missing data) datasets with no continuous predictors. In other words, for the kinds of data/models discussed in classical ANOVA texts. Under these ideal circumstances, the fixed effects and random effects can be estimated independent of one another.

+ +

When these conditions do not hold (as they very very often do not in the ""real world""), the fixed and random effects are not independent. As an interesting aside, this is why ""modern"" mixed models are estimated using iterative optimization methods, rather than being exactly solved with a bit of matrix algebra as in the classical mixed ANOVA case: in order to estimate the fixed effects, we have to know the random effects, but in order to estimate the random effects, we have to know the fixed effects! More relevant to the present question, this also means that when data are unbalanced/incomplete and/or there are continuous predictors in the model, then adjusting the random-effects structure of the mixed model can alter the estimates of the fixed part of the model, and vice versa.

+ +

Edit 2016-07-05. From the comments: ""Could you elaborate or provide a citation on why continuous predictors will influence the estimates of the fixed part of the model?""

+ +

The estimates for the fixed part of the model will depend on the estimates for the random part of the model -- that is, the estimated variance components -- if (but not only if) the variance of the predictors differs across clusters. Which will almost certainly be true if any of the predictors are continuous (at least in ""real world"" data -- in theory it would be possible for this to not be true, e.g. in a constructed dataset).

+",2013-10-23 00:51:08.400 +58066,594.0,2,,58064.0,,,,CC BY-SA 3.0,"

It has to do with the assumptions of the test for which the distribution of the test statistic under the null is derived.

+ +

The variables are assumed to be continuous.

+ +

The probability of a tie is therefore 0 ... and this makes it possible to compute the permutation distribution of the test statistic under the null for given sample size.

+ +

Without that assumption being true, you could still do a test, but if you're going to get the null distribution of the test statistic, you'll have to try to compute it conditional on the pattern of tied values (or more easily, simulate).

+ +

The easier alternative is to only consider untied values.

+ +

Note further that observing ties is not 'evidence in favor of the null'; it only reflects a lack of evidence against it. With discrete distributions, a range of non-null alternatives are likely to produce ties, not just the null itself.

+ +

The 'correct' thing to do is not use a test that assumes continuous distributions on data that don't satisfy the assumptions. If you don't have that, you have to do something to deal with that failure.

+ +

I believe that conditioning on the untied data preserves the required properties for the significance level in a way that including ties in some way would not. We might check by simulation.
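
+ +

R's wilcox.test follows the zero-dropping procedure discussed above (with a warning and a normal approximation when there are zeros or ties), so one rough simulation check of the attained level under a discrete null might look like this minimal sketch:

+ +
set.seed(1)
+pvals <- replicate(5000, {
+  d <- round(rnorm(30))                       # null differences, discretised: many zeros and ties
+  suppressWarnings(wilcox.test(d)$p.value)
+})
+mean(pvals < 0.05)                            # attained rejection rate, to compare with the nominal 5%
+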

+",2013-10-23 01:06:41.770 +58067,22950.0,1,58077.0,,,"Choosing a classification performance metric for model selection, feature selection, and publication",,CC BY-SA 3.0,"

I have a small, unbalanced data set (70 positive, 30 negative), and I have been playing around with model selection for SVM parameters using BAC (balanced accuracy) and AUC (area under the curve). I used different class-weights for the C parameter in libSVM to offset the unbalanced data following the advice here (Training a decision tree against unbalanced data).

+ +
    +
  1. It seems that k-fold cross-validation error is very sensitive to the type of performance measure. It also has an error in itself because the training and validation sets are chosen randomly. For example, if I repeat BAC twice with different random seeds, I will get different errors, and subsequently different optimal parameter values. If I average repeated BAC scores, averaging 1000 times will give me different optimal parameter values than averaging 10000 times. Moreover, changing the number of folds gives me different optimal parameter values.

  2. +
  3. Accuracy metrics for cross validation may be overly optimistic. Usually anything over a 2-fold cross-validation gives me 100% accuracy. Also, the error rate is discretized due to small sample size. Model selection will often give me the same error rate across all or most parameter values.

  4. +
  5. When writing a report, how would I know that a classification is 'good' or 'acceptable'? In the field, it seems like we don't have something like a goodness of fit or p-value threshold that is commonly accepted. Since I am adding to the data iteratively, I would like to know when to stop- what is a good N where the model does not significantly improve?

  6. +
+ +

Given the issues described above, it seems like accuracy can't be easily compared between publications while AUC has been described as a poor indicator for performance (see here, or here, for example).

+ +

Any advice on how to tackle any of these 3 problems?

+",2013-10-23 02:27:10.847 +58068,22953.0,1,58079.0,,,Interpreting a negative confidence interval,,CC BY-SA 3.0,"

How do I interpret a negative confidence interval when comparing two population means?

+ +

For example, a confidence interval is $(-23.11, -1.02)$, what is the significance of these values being negative? Is it strictly signifying that $\bar{x}_1 < \bar{x}_2$ ?

+",2013-10-23 02:48:42.997 +58069,19750.0,1,69580.0,,,What is the relationship between the mean squared error and the residual sum of squares function?,,CC BY-SA 4.0,"

Looking at the Wikipedia definitions of mean squared error (MSE) and residual sum of squares (RSS):

+ + + +

It looks to me that

+ +

$$\text{MSE} = \frac{1}{N} \text{RSS} = \frac{1}{N} \sum (f_i -y_i)^2$$

+ +

where $N$ is the number of samples and $f_i$ is our estimate of $y_i$.

+ +

However, none of the Wikipedia articles mention this relationship. Why? Am I missing something?

+",2013-10-23 02:55:28.270 +58070,22955.0,1,58666.0,,,Multiple Linear Regression Models Comparison Based on R-squared And Residual Errors,,CC BY-SA 3.0,"

I am currently working on a problem where I have to calibrate weather parameters at a ground location using satellite data available (1979-2012) over rectangular grid points and surface observatory data for that location (1980-2000). I have built two models - one for January months over the 1980-2000 period and the other taking data from November to April over 1980-2000.

+ +

Now the problem arises when I compare the adjusted R-squared for both models. The Nov-Apr model gives me a higher R-squared (0.7 vs. 0.2), whereas the residual errors tell the opposite story, giving poorer results in the case of the Nov-Apr model.

+ +

One option would be to calculate different components of R-squared and see if they can explain the differences. So how do I calculate and explain these components in R? Or is there a better, more efficient way?

+ +

Thanks in advance.

+",2013-10-23 04:37:43.580 +58071,15766.0,1,58074.0,,,Basic binomial question,,CC BY-SA 3.0,"

The following appeared on an assignment of mine (already turned in). I contend that not enough information is given to provide an answer... it seems pretty clear-cut to me. However, the instructor insisted it's solvable in Minitab. Can you help me figure out what I'm not understanding?

+ +

How do you solve this without a model of the distribution of weekly demand, or at least an average value to use as a constant approximation? I must be missing something simple.

+ +

The problem:

+ +

Consider a service company.

+ +

10% of the weekly demand is for a service category named ""X"" [Assume service categories are mutually exclusive].

+ +

The company must revise their resource plan iff there are too few customer orders(less than one/week) or too many customer orders (more than five/week) of service category ""X"".

+ +

For the next 12 weeks, what is the probability that the company will not need to revise the resource plan?

+ +

Thanks

+",2013-10-23 05:34:47.950 +58072,22956.0,1,,,,Probability Calculation,,CC BY-SA 3.0,"

Assume that a typical computer manufactured by a company lasts 10 months and that the standard deviation is 50 days. Computer life follows a normal distribution. What is the probability that a computer made by this company will last at most 1 year?

+ +

Assumption is that one month has 30 days.

+ +

Can you please explain how this is calculated ?

+ +

Thanks

+",2013-10-23 05:54:28.690 +58073,15766.0,2,,58072.0,,,,CC BY-SA 3.0,"

The normal distribution has 2 parameters: the mean and the variance (or the standard deviation, which is the square root of the variance).

+ +

Mean = 10 months * 30 days/month = 300 days

+ +

Therefore, lifetime, T, is distributed as Normal(mean=300 days, std. dev=50 days).

+ +

You want to find P(T< 365 days)
+Calculate the z-score for T=365--> z = (365-300)/50=1.3
+And use a table or software to find the appropriate cumulative (""lower tail"") probability corresponding to z=1.3

+",2013-10-23 06:12:59.573 +58074,7949.0,2,,58071.0,,,,CC BY-SA 3.0,"

This depends on the total number of customer orders; consider the situation if you have just one order per week. Then you will almost certainly have less than one order of ""X"" in any given week. On the other hand, if you have 1000 customers, you will have about 100 ordering ""X"" each week, which is too many.

+ +

It's also not clear if the 10% is an average or a fixed number; the same is the case for the missing number of orders per week. The most likely way to interpret this question is to assume that each customer order has a 10% chance of belonging to category ""X"" - but then we still need the number of customer orders. If the number of orders per week is fixed, then X, the number of orders of ""X"" per week, would be binomially distributed and the question would be solvable.
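
+ +

For instance, if the weekly order volume were known - say 30 orders per week, a made-up number purely for illustration - the calculation would be a minimal R sketch like:

+ +
n_orders <- 30                                                  # hypothetical weekly order volume
+p_ok <- pbinom(5, n_orders, 0.1) - pbinom(0, n_orders, 0.1)     # P(1 <= X <= 5) in a single week
+p_ok^12                                                         # probability of no revision in all 12 weeks
+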

+ +

I think it is really questionable to claim that it is ""solvable in Minitab"" and give no theoretical background. There may be a button in Minitab that takes these numbers and gives an answer, but is it the answer to the question as it is stated here?

+ +

Short version, I agree with you.

+",2013-10-23 06:26:43.607 +58075,20470.0,2,,58067.0,,,,CC BY-SA 3.0,"

As you point out, predictive accuracy and AUC are limited in certain aspects. I would give the Bayesian Information Reward (BIR) a go, which should give a more sensitive assessment of how well or badly your classifier is doing and how that changes as you tweak your parameters (number of validation folds, etc.).

+ +

The intuition of BIR is as follows: a bettor is rewarded not just for identifying the ultimate winners and losers (0's and 1's), but more importantly for identifying the appropriate odds. Furthermore, it goes a step ahead and compares all predictions with the prior probabilities.

+ +

Let's say you have a list of 10 Arsenal (football team in England) games with possible outcomes: $Win$ or $Lose$. The formula for binary classification rewarding per game is:

+ +

+ +

where $p$ is your model's prediction for a particular Arsenal game, and $p'$ is the prior probability of Arsenal winning a game. The catch is: if I know beforehand that $p'=0.6$, and my predictor model produced $p = 0.6$, then even if its prediction was correct it is rewarded 0, since it is not conveying any new information. As a note, you treat the correct and incorrect classifications differently, as shown in the equations. As a result, based on whether the prediction is correct or incorrect, the BIR for a single prediction can take a value in $(-\infty, 1]$.

+ +

BIR is not limited to binary classifications but is generalised for multinomial classification problems as well.

+",2013-10-23 07:47:35.447 +58076,22960.0,2,,51895.0,,,,CC BY-SA 3.0,"

I have encountered a similar problem, and I solved it by converting the class values (""status"" in your case) to the factor type. After using data$status=factor(data$status), newData prints as follows:

+ +
     looking risk every status
+7          0    0     0      1
+2          0    0     0      1
+7.1        0    0     0      1
+12         0    0     0      1
+4          0    0     0      1
+12.1       0    0     0      1
+11         0    0     0      3
+8         NA   NA    NA      3
+9         NA   NA    NA      3
+10        NA   NA    NA      3
+111       NA   NA    NA      3
+121       NA   NA    NA      3
+13        NA   NA    NA      3
+
+ +

No errors!

+",2013-10-23 08:25:37.997 +58077,2958.0,2,,58067.0,,,,CC BY-SA 3.0,"
+

It seems that k-fold cross-validation error is very sensitive to the type of performance measure. It also has an error in itself because the training and validation sets are chosen randomly.

+
+ +

I think you've discovered the high variance of performance measures that are proportions of case counts such as $\frac{\text{# correct predictions}}{\text{# test cases}}$. You try to estimate e.g. the probability that your classifier returns a correct answer. From a statistics point of view, that is described as a Bernoulli trial, leading to a binomial distribution. You can calculate confidence intervals for binomial distributions and will find that they are very wide. This of course limits your ability to do model comparison.
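
+ +

A minimal R sketch of how wide these intervals are (exact Clopper-Pearson intervals; the counts are made up):

+ +
binom.test(80, 100)$conf.int   # 80 of 100 test cases correct: 95% CI roughly 0.71 to 0.87
+binom.test(8, 10)$conf.int     # with only 10 test cases: roughly 0.44 to 0.97
+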

+ +

With resampling validation schemes such as cross validation you have an additional source of variation: the instability of your models (as you build $k$ surrogate models during each CV run)

+ +
+

Moreover, changing the number of folds gives me different optimal parameter values.

+
+ +

That is to be expected due to the variance. You may have an additional effect here: libSVM splits the data only once if you use their built-in cross validation for tuning. Due to the nature of SVMs, if you build the SVM with identical training data and slowly vary the parameters, you'll find that the support vectors (and consequently the accuracy) jump: as long as the SVM parameters are not too different, it will still choose the same support vectors. Only when the parameters are changed enough will suddenly different support vectors result. So evaluating the SVM parameter grid with exactly the same cross validation splits may hide variability, which you see between different runs.

+ +

IMHO the basic problem is that you do a grid search, which is an optimization that relies on a reasonably smooth behaviour of your target functional (accuracy or whatever else you use). Due to the high variance of your performance measurements, this assumption is violated. The ""jumpy"" dependence of the SVM model also violates this assumption.

+ +
+

Accuracy metrics for cross validation may be overly optimistic. Usually anything over a 2-fold cross-validation gives me 100% accuracy. Also, the error rate is discretized due to small sample size. Model selection will often give me the same error rate across all or most parameter values.

+
+ +

That is to be expected given the general problems of the approach.

+ +

However, usually it is possible to choose really extreme parameter values where the classifier breaks down. IMHO the parameter ranges where the SVMs work well are important information.

+ +

In any case you absolutely need an external (double/nested) validation of the performance of the model you choose as 'best'.

+ +

I'd probably do a number of runs/repetitions/iterations of an outer cross validation or an outer out-of-bootstrap validation and give the distribution of

+ +
    +
  • hyperparameters for the ""best"" model
  • +
  • reported performance of the tuning
  • +
  • observed performance of outer validation
  • +
+ +

The difference between the last two is an indicator of overfitting (e.g. due to ""skimming"" the variance).

+ +
+

When writing a report, how would I know that a classification is 'good' or 'acceptable'? In the field, it seems like we don't have something like a goodness of fit or p-value threshold that is commonly accepted. Since I am adding to the data iteratively, I would like to know when to stop- what is a good N where the model does not significantly improve?

+
+ +

(What are you adding? Cases or variates/features?)

+ +

First of all, if you do iterative modeling, you need to report that, due to your fitting procedure, your performance estimate is subject to an optimistic bias and should not be taken at face value. The better alternative is to do a validation of the final model. However, the test data for that must be independent of all data that ever went into training or into your decision process for the modeling (so you may not have any such data left).

+",2013-10-23 08:35:20.010 +58096,803.0,2,,58091.0,,,,CC BY-SA 3.0,"

There are a few things here.

+ +
    +
  1. For binomial data, the variance is directly determined by the mean, and isn't an additional parameter, so there's no need to do a t-test... a normal z-test is slightly more efficient.
  2. +
  3. For binomial data, the Normal approximation (i.e. a Wald test) often fails. See Agresti and Coull, 1998, for some more detailed discussion and simulation studies. http://www.stat.ufl.edu/~aa/articles/agresti_coull_1998.pdf
  4. +
+ +

They give some recommendations about when it's okay to use or not use the normality assumption (as do others)... generally, the closer you get to p=.5 and the larger your data set, the better it is; the further away from .5 you get (towards p=0 or p=1), or the smaller the data set, the worse it is.
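
+ +

For a small-sample illustration of the difference, here is a minimal R sketch comparing the Wald (normal-approximation) interval with the Agresti-Coull interval (x and n are made up):

+ +
x <- 8; n <- 10; z <- qnorm(0.975)
+p_hat <- x / n
+p_hat + c(-1, 1) * z * sqrt(p_hat * (1 - p_hat) / n)    # Wald interval
+n_ac <- n + z^2
+p_ac <- (x + z^2 / 2) / n_ac
+p_ac + c(-1, 1) * z * sqrt(p_ac * (1 - p_ac) / n_ac)    # Agresti-Coull interval
+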

+ +

But the Wilcoxon signed rank test is popular for this kind of data.

+",2013-10-23 14:40:40.563 +58078,22961.0,1,,,,"Quantitative ,longitudinal, observational study:Population, sample and instrument",,CC BY-SA 4.0,"

I am studying US public company financial performance for the last 2 decades. I am searching for evidence of ""sustainable superior performance"", thus to isolate those companies that display such characteristics. I will use the Compustat database.

+ +

Questions:

+ +
    +
  1. Is the total population the universe of US public company with +their past, present and future performance or the population is +the subset of those displaying superior performance (subject of the +study)?
  2. +
  3. Because all the companies will be observed to spot those +with certain characteristics (i.e. superior performance), what is +the sample? Can I consider the company present in the Compustat +database (independently if they have the traits I am seeking or not) +to be the sample?
  4. +
  5. What is the Instrument I need to ensure validity and reliability? +-The Compustat database with its screening capabilities? +-The criteria I will use to assess ""sustained, superior performance""? (like sales growth above industry average and so on)
  6. +
  7. The statistical technique used to find such outlier? (like Cluster Analysis for instance)
  8. +
+",2013-10-23 08:39:29.317 +58079,22959.0,2,,58068.0,,,,CC BY-SA 3.0,"

You should write your hypothesis first :)

+ +

But I guess your hyp was x_1 >= x_2 ? Then we can say that we are (1-alfa)% confident that the difference between the true mean of x_1 and x_2 is between (−23.11,−1.02)

+",2013-10-23 08:57:00.940 +58080,20470.0,1,,,,What is an intuitive explanation of the Bayesian score?,,CC BY-SA 3.0,"

I am familiar with the Bayesian score, which is used to compare competing structures of a BN. However, I have difficulty in understanding how the Bayesian score formula below is derived.

+ +

1) What is an intuitive explanation of the Bayesian score? (e.g. how do the fractions within the product operators equate to $P(D|B_S)$)

+ +

2) What type of structures does it favour? (e.g. more/less connected, higher/lower number of parents, etc.)

+ +

Background: The Bayesian score of a BN structure, $B_S$, for a data set, $D$, is used to calculate the joint probability of $P(B_S,D)$. For a discrete network (after some assumptions*), it is given as:

+ +

$$P(B_S,D) = P(B_S)\prod_{i=0}^n\prod_{j=1}^{qi} \frac{\Gamma(N'_{ij})}{\Gamma(N'_{ij} + N_{ij})} \prod_{k=1}^{ri} \frac{\Gamma(N'_{ijk} + N_{ijk})}{\Gamma(N'_{ijk})} $$

+ +
  • $\Gamma(\cdot)$ is the gamma function. Above, since all counts are positive integers, $\Gamma(m) = (m-1)!$
  • $P(B_S)$ is the prior on the network structure (taken to be uniform/uninformative by many versions of the Bayesian score)
  • $x_i$ is a node in the network with $n$ nodes;
  • $r_i$ $(1 ≤ i ≤ n)$ is the cardinality of $x_i$;
  • $q_i$ denotes the cardinality of the parent set of $x_i$, that is, the number of different values to which the parents of $x_i$, $pa(x_i)$, can be instantiated.
  • $N_{ij}$ $(1 ≤ i ≤ n, 1 ≤ j ≤ q_i)$ denotes the number of records in $D$ for which $pa(x_i)$ takes its $j$th value.
  • $N_{ijk}$ $(1 ≤ i ≤ n, 1 ≤ j ≤ q_i, 1 ≤ k ≤ r_i)$ denotes the number of records in $D$ for which $pa(x_i)$ takes its $j$th value and for which $x_i$ takes its $k$th value. So, $N_{ij} = \sum_{k=1}^{r_i} N_{ijk}$.
  • $N'_{ij}$ and $N'_{ijk}$ represent choices of priors on counts, restricted by $N'_{ij} = \sum_{k=1}^{r_i} N'_{ijk}$.
+ +

*List of assumptions: 1 - Multinomial Sample; 2 - Parameter Independence; 3 - Parameter Modularity; 4 - Dirichlet; 5 - Complete Data
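
For concreteness, here is a rough R sketch (toy counts, chosen only for illustration) of the term contributed by a single node under a uniform Dirichlet prior, i.e. $N'_{ijk}=1$ and $N'_{ij}=r_i$, computed on the log scale with lgamma():

    # toy node with r_i = 2 states and q_i = 2 parent configurations
    Nijk  <- matrix(c(3, 7,     # counts of x_i = 1, 2 when pa(x_i) takes value j = 1
                      6, 4),    # counts of x_i = 1, 2 when pa(x_i) takes value j = 2
                    nrow = 2, byrow = TRUE)
    Nprk  <- matrix(1, nrow = 2, ncol = 2)           # N'_ijk (uniform prior pseudo-counts)
    Nij   <- rowSums(Nijk);  Nprij <- rowSums(Nprk)  # N_ij and N'_ij
    log_node_term <- sum(lgamma(Nprij) - lgamma(Nprij + Nij)) +
                     sum(lgamma(Nprk + Nijk) - lgamma(Nprk))
    exp(log_node_term)   # this node's factor in P(B_S, D) / P(B_S)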

+",2013-10-23 10:42:29.047 +58081,11359.0,1,58115.0,,,Analyzing multiply imupted data from Amelia in R: Why do results from zelig and mice differ?,,CC BY-SA 3.0,"

Motivated by my answer to this question, I played around with analyzing multiply imputed data from the Amelia package in R. As I have explained in my answer, the multiply imputed datasets can be analyzed using the combined Zelig and mitools packages or using a combination of Zelig and mice.

+ +

Now, to me it seemed rather inconvenient to fit the linear model using zelig() as mice, too, provides the with.mids()-function to fit linear models to multiply imputed datasets. However, I found that the results differ depending on the function used for fitting. For the analysis using with.mids, I first had to circumvent a bug in the current mice-package by defining the following function, as has been explained in another question:

+ +
as.mids2 <- function(data2, .imp=1, .id=2){
+  ini <- mice(data2[data2[, .imp] == 0, -c(.imp, .id)], maxit=0)
+  names  <- names(ini$imp)
+  if (!is.null(.id)){
+    rownames(ini$data) <- data2[data2[, .imp] == 0, .id]
+  }
+  for (i in 1:length(names)){
+    for(m in 1:(max(as.numeric(data2[, .imp])) - 1)){
+      if(!is.null(ini$imp[[i]])){
+        indic <- data2[, .imp] == m & is.na(data2[data2[, .imp]==0, names[i]])
+        ini$imp[[names[i]]][m] <- data2[indic, names[i]]
+      }
+    }
+  }
+  return(ini)
+}
+
+ +

Once I had done this, I used Zelig:

+ +
library(""Amelia"")
+data(freetrade)
+amelia.out <- amelia(freetrade, m = 15, ts = ""year"", cs = ""country"")
+
+library(""Zelig"")
+zelig.fit <- zelig(tariff ~ pop + gdp.pc + year + polity, data = amelia.out$imputations, model = ""ls"", cite = FALSE)
+zelig.results <- lapply(zelig.fit, function(x) x$result)
+
+library(""mice"")
+zelig4mice <- as.mira(zelig.results)
+zelig.mice.res <- summary(pool(zelig4mice, method = ""rubin1987""))
+
+ +

Then I tried the same thing using only mice:

+ +
imp.data <- do.call(""rbind"", amelia.out$imputations)
+imp.data <- rbind(freetrade, imp.data)
+imp.data$.imp <- as.numeric(rep(c(0:15), each = nrow(freetrade)))
+mice.data <- as.mids2(imp.data, .imp = ncol(imp.data), .id = NULL)
+
+mice.fit <- with(mice.data, lm(tariff ~ polity + pop + gdp.pc + year))
+mice.res <- summary(pool(mice.fit, method = ""rubin1987""))
+
+ +

These are the results:

+ +
> zelig.mice.res
+                  est       se     t    df Pr(>|t|)     lo 95     hi 95 nmis   fmi lambda
+(Intercept)  3.18e+03 7.22e+02  4.41  45.9 6.20e-05  1.73e+03  4.63e+03   NA 0.571  0.552
+pop          3.13e-08 5.59e-09  5.59 392.1 4.21e-08  2.03e-08  4.23e-08   NA 0.193  0.189
+gdp.pc      -2.11e-03 5.53e-04 -3.81 329.4 1.64e-04 -3.20e-03 -1.02e-03   NA 0.211  0.206
+year        -1.58e+00 3.63e-01 -4.37  45.9 7.11e-05 -2.31e+00 -8.54e-01   NA 0.570  0.552
+polity       5.52e-01 3.16e-01  1.75  90.8 8.41e-02 -7.58e-02  1.18e+00   NA 0.406  0.393
+
+> mice.res
+                  est       se     t     df Pr(>|t|)     lo 95     hi 95 nmis    fmi lambda
+(Intercept)  3.42e+03 8.87e+02  3.86   8.01 4.80e-03  1.38e+03  5.47e+03   NA 0.7599 0.7066
+pop          3.20e-08 5.25e-09  6.10 504.30 2.10e-09  2.17e-08  4.24e-08    0 0.0927 0.0891
+gdp.pc      -2.09e-03 5.31e-04 -3.93 189.23 1.19e-04 -3.13e-03 -1.04e-03    0 0.1543 0.1454
+year        -1.70e+00 4.46e-01 -3.83   8.02 5.02e-03 -2.73e+00 -6.78e-01    0 0.7594 0.7061
+polity       5.74e-01 3.60e-01  1.59  13.93 1.34e-01 -2.00e-01  1.35e+00    2 0.5907 0.5358
+
+ +

From these data it is apparent, that the linear models fit by the two methods differ and so do the determined degrees of freedom.

+ +

Why do these results differ? What is the correct analysis procedure?

+",2013-10-23 10:48:16.873 +58082,22967.0,1,327191.0,,,Extreme value theory for count data,,CC BY-SA 3.0,"

I am aware of extreme value theory for continuous distributions. I need to fit an extreme value distribution to the maximum observation of number of events on a day, per month. This seems to be the block maxima problem, which is approximated by the GEV family of distributions for continuous distributions. How do I do this for count data?

+ +

As a secondary question, let's assume the basic count process is ~ Poisson. Then does this lead to a different answer to the original question?

+",2013-10-23 11:35:49.187 +58083,17573.0,2,,57996.0,,,,CC BY-SA 3.0,"

It helps to make things concrete, so I will assume we are analyzing the effect of credit score on loan default, controlling for other stuff, like income. You will have to search and replace to put in your example. I recommend against trying to explain it in general. Laymen are pretty much never interested in that. How about (maybe too long?):

+ +

The partial effect of credit score on default probability is the amount that default probability goes up (or down) when credit score rises by one point and all other factors stay the same. Think about two people, one with a credit score of 650 and one with a credit score of 651. In all other respects (income, time on job, loan-to-value ratio, etc), they are identical. The one with the higher credit score will have a lower probability of default. The probability may only be a tiny, tiny bit lower since this is such a small difference in credit scores, but, because everything else is the same, it will be lower. This difference in default probabilities between a person with a credit score of 650 and a credit score of 651 but with everything else identical is the partial effect of credit score on default probability.

+ +

There are two complications, though. First, the difference in default probability between a person with a 650 score and a 651 credit score will not be the same as the difference in default probability between someone with a 750 and 751 credit score. Second, the 650 vs 651 difference in default probability will depend on their other characteristics. Two low income people, one with a 650 and one with a 651, may have a larger difference in default probability than two high income people, one with a 650 score and one with a 651.

+ +

To deal with these complications, we first calculate a personalized partial effect, the difference in default probabilities due to a one point increase in credit score, for each person in the sample. Then, we average over these personalized partial effects to give the average partial effect. This is called the ""average partial effect"" of credit scores on default probabilities.

+",2013-10-23 12:18:00.803 +58084,22968.0,1,58093.0,,,Comparing data files,,CC BY-SA 3.0,"

We have several weather files for a year's data sampled hourly. In each we have several variables (up to ten), temperature, wind speed, solar intensity etc.

+ +

I would like to try and develop a system that could determine if these files are statistically different. If I was using a single variable, I could use a KS-test, but with a collection of variables how should I approach this problem?

+",2013-10-23 12:34:44.323 +58085,22970.0,1,58086.0,,,Prediction with not significant covariate in logistic regression,,CC BY-SA 3.0,"

I have a logistic regression model with several variables and one of those variables (called x3 in my example below) is not significant. However, x3 should remain in the model because it is scientifically important.

+ +

Now, x3 is continuous and I want to create a plot of the predicted probability vs x3. Even though x3 is not statistically significant, it has an effect on my outcome and therefore it has an effect on the predicted probability. This means that I can see from the graph, that the probability changes with increasing x3. However, how should I interpret the graph and the change in the predicted probability, given that x3 is indeed not statistically significant?

+ +

Below is a simulated data set in R to illustrate my question. The graph also contains a 95% confidence interval for the predicted probability (dashed lines):

+ +
> set.seed(314)
+> n <- 300
+> x1 <- rbinom(n,1,0.5)
+> x2 <- rbinom(n,1,0.5)
+> x3 <- rexp(n)
+> logit <- 0.5+0.9*x1-0.5*x2
+> prob <- exp(logit)/(1+exp(logit))
+> y <- rbinom(n,1,prob)
+> 
+> model <- glm(y~x1+x2+x3, family=""binomial"")
+> summary(model)
+
+Call:
+glm(formula = y ~ x1 + x2 + x3, family = ""binomial"")
+
+Deviance Residuals: 
+    Min       1Q   Median       3Q      Max  
+-2.0394  -1.1254   0.5604   0.8554   1.4457  
+
+Coefficients:
+            Estimate Std. Error z value Pr(    >|z|)    
+(Intercept)   1.1402     0.2638   4.323 1.54e-05 ***
+x1            0.8256     0.2653   3.112  0.00186 ** 
+x2           -1.1338     0.2658  -4.266 1.99e-05 ***
+x3           -0.1478     0.1249  -1.183  0.23681    
+---
+Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+
+(Dispersion parameter for binomial family taken to be 1)
+
+Null deviance: 373.05  on 299  degrees of freedom
+Residual deviance: 341.21  on 296  degrees of freedom
+AIC: 349.21
+
+Number of Fisher Scoring iterations: 3
+
+> 
+> dat <- data.frame(x1=1, x2=1, x3=seq(0,5,0.1))
+> preds <- predict(model, dat,type = ""link"", se.fit = TRUE )
+> critval <- 1.96
+> upr <- preds$fit + (critval * preds$se.fit)
+> lwr <- preds$fit - (critval * preds$se.fit)
+> fit <- preds$fit
+> 
+> fit2 <- model$family$linkinv(fit)
+> upr2 <- model$family$linkinv(upr)
+> lwr2 <- model$family$linkinv(lwr)
+> 
+> plot(dat$x3, fit2, lwd=2, type=""l"", main=""Predicted Probability"", ylab=""Probability"", xlab=""x3"", ylim=c(0,1.00))
+> lines(dat$x3, upr2, lty=2)
+> lines(dat$x3, lwr2, lty=2)
+
+ +

+ +

Thanks!

+ +

Emilia

+",2013-10-23 12:50:26.157 +58088,11359.0,2,,25072.0,,,,CC BY-SA 3.0,"

I think this article will provide answers for your questions 1 and 4:

+

Fabrigar, L. R., Wegener, D. T., MacCallum, R. C., & Strahan, E. J. (1999). Evaluating the use of exploratory factor analysis in psychological research. Psychological Methods, 4, 272–299. doi:10.1037//1082-989X.4.3.272 (PDF)

+

From page 275:

+
+

[...] it is important to recognize that the goal of identifying latent constructs (i.e., understanding the structure of correlations among measured variables) is different from that of data reduction. Data reduction involves taking scores on a large set of measured variables and reducing them to scores on a smaller set of composite variables that retain as much information from the original variables as possible. Data reduction does not attempt to model the structure of correlations among the original variables.

+

[...]

+

If the goal is to arrive at a parsimonious representation of the associations among measured variables, EFA can be an appropriate form of analysis. If the goal is data reduction, principal components analysis (PCA) is more appropriate.

+
+

The author then continues to elaborate on the conceptual differences, which will probably help to clarify things.

+",2013-10-23 13:11:02.900 +58089,13427.0,2,,54637.0,,,,CC BY-SA 3.0,"

Normally you would take the p-value by applying Rubin's rules on conventional statistical parameters like regression weights. Thus, there is often no need to pool p-values directly. Also, the likelihood ratio statistic can be pooled to compare models. Pooling procedures for other statistics can be found in my book Flexible Imputation of Missing Data, chapter 6.

+ +

In cases where there is no known distribution or method, there is an unpublished procedure by Licht and Rubin for one-sided tests. I used this procedure to pool p-values from the wilcoxon() procedure, but it is general and straightforward to adapt to other uses.

+ +

Use the procedure below ONLY if all else fails, as, for now, we know little about its statistical properties.

+ +
lichtrubin <- function(fit){
+    ## pools the p-values of a one-sided test according to the Licht-Rubin method
+    ## this method pools p-values in the z-score scale, and then transforms back 
+    ## the result to the 0-1 scale
+    ## Licht C, Rubin DB (2011) unpublished
+    if (!is.mira(fit)) stop(""Argument 'fit' is not an object of class 'mira'."")
+    fitlist <- fit$analyses
+        if (!inherits(fitlist[[1]], ""htest"")) stop(""Object fit$analyses[[1]] is not an object of class 'htest'."")
+    m <- length(fitlist)
+    p <- rep(NA, length = m)
+    for (i in 1:m) p[i] <- fitlist[[i]]$p.value
+    z <- qnorm(p)  # transform to z-scale
+    num <- mean(z)
+    den <- sqrt(1 + var(z))
+    pnorm( num / den) # average and transform back
+}
+
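
A hypothetical usage sketch (imp, outcome and group are placeholder names for a mids object and its variables), assuming the one-sided test returns an htest object in each imputed dataset:

    # run the one-sided test per completed dataset, then pool on the z-scale
    fit <- with(imp, wilcox.test(outcome ~ group, alternative = ""less""))
    lichtrubin(fit)   # pooled one-sided p-value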
+",2013-10-23 13:51:33.063 +58090,1809.0,1,,,,Comparing category distributions,,CC BY-SA 3.0,"

I have categorical count data (for categories C1 to C3, but potentially several more categories) for two datasets:

+ +
        | --- Dataset 1 --- | --- Dataset 2 --- |
+        |  C1    C2    C3   |  C1    C2    C3   |
+Item 1  |  0     200   300  |  0     2      3   |
+Item 2  |  0     200   300  |  5     0      0   |
+
+ +

The total number of data-points in each dataset is different (500 and 5 in this example).

+ +

What statistical test should I use to determine if the distribution of counts for each item across the categories is the same between the two datasets?

+ +

For example, the distribution of Item 1 is the same across the two datasets, but the distribution of Item 2 is not. I will test each item separately.

+",2013-10-23 14:05:15.060 +58091,22972.0,1,,,,Paired t-test for binary data,,CC BY-SA 3.0,"

I have one sample with n=170 and two binary variables (A,B) that can take as a value 1 or 0, where 1 counts as a success and 0 counts as a failure. What I want to know is whether the means of these two variables are equal.

+ +

To find this out I generate a new variable that takes the difference between these two variables called C, so C = B-A. I then compute the p-value for the hypothesis that C is normally distributed with the Shapiro-Wilk test and I find a p-value of .96, so I choose not to reject this hypothesis. Apart from that the difference is normally distributed, I am not worried about the other assumptions required for a paired t-test.

+ +

Question: Can I use the paired t-test in this circumstance or is it a mistake to use the Shapiro-Wilk test for binary data to check for normality and should I use the Wilcoxon sign rank test instead?

+ +

I would much prefer to use the t-test, because I believe it has a higher power than the Wilcoxon sign rank test, but that higher power pretty much does not matter if the test used is the wrong one.

+ +

Cheers,

+ +

Martin

+",2013-10-23 14:10:58.907 +58092,22729.0,1,,,,Test incorrect functional form when residuals have non-normal distribution,,CC BY-SA 3.0,"

J. B. Ramsey (in ""Tests for specification errors in classical linear least-squares regression analysis."" Journal of the Royal Statistical Society. 1969) says that the RESET test assumes that the residuals are normally distributed.

+ +

If one wants to test for incorrect functional form of a model when the residuals have a non-normal distribution, how can this be done?

+ +

Ramsey also says that ""the cases where mis-specification leads to a non-normal distribution of û [residuals] are to be discussed in a later paper"". Does anyone know which paper this is?

+",2013-10-23 14:11:37.900 +58093,10469.0,2,,58084.0,,,,CC BY-SA 3.0,"

Use Kolmogorov-Smirnov for each variable - this will tell you the p-value for each one being different.

+ +

Now you have 10 binary values (telling you whether the original variable failed the KS), and you need to test the hypothesis that the ""yes"" values occurred at random with probability at most p.
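
A rough sketch of this in R (weather1 and weather2 are hypothetical data frames with one column per variable):

    p_vals <- sapply(names(weather1),
                     function(v) ks.test(weather1[[v]], weather2[[v]])$p.value)
    # did more variables fail the KS test than expected at random with probability 0.05?
    binom.test(sum(p_vals < 0.05), length(p_vals), p = 0.05, alternative = ""greater"")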

+",2013-10-23 14:16:01.433 +58094,503.0,2,,58090.0,,,,CC BY-SA 3.0,"

If you reshape your data so that it is dataset × C, then you can use chi-square or similar, e.g.

+ +
            C1       C2     C3
+1            0       200    300
+2            0         2     3
+
+ +

Here you would need an exact test.
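
For instance, with hypothetical counts arranged as a dataset-by-category table:

    tab <- matrix(c(5, 200, 300,
                    0,   2,   3),
                  nrow = 2, byrow = TRUE,
                  dimnames = list(dataset = c(""1"", ""2""), category = c(""C1"", ""C2"", ""C3"")))
    fisher.test(tab)   # exact test; chisq.test(tab) would complain about small expected counts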

+",2013-10-23 14:31:50.700 +58095,22399.0,2,,58091.0,,,,CC BY-SA 3.0,"

You are using the term 'mean' but actually you are comparing 'proportions' as your variables are categorical. I would ignore any issues with normality as the sampling distribution of the proportions will be normal (ignoring some pathological situations such as low sample size which is not an issue here or proportions close to $0$ or $100$).

+ +

I recommend looking at the two-proportion z-test at the wiki: Common Statistical Tests. Search for ""two-proportion z-test"" in the table for the relevant test and conditions under which it is valid.
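
In R this can be sketched with prop.test(), using hypothetical success counts for A and B out of n = 170 each; note that it treats the two samples as independent:

    prop.test(x = c(95, 110), n = c(170, 170), correct = FALSE)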

+",2013-10-23 14:33:18.997 +58098,9095.0,2,,45279.0,,,,CC BY-SA 3.0,"

So far, the best options I've found, thanks to your suggestions, are these:

+ +
  library (igraph)
+  library (ggparallel)
+  library (plyr)        # provides count(), used below
+
+# Generate random data
+
+  x1 <- sample(1:1, 1000, replace=T)
+  x2 <- sample(2:3, 1000, replace=T)
+  x3 <- sample(4:6, 1000, replace=T)
+  x4 <- sample(7:10, 1000, replace=T)
+  x5 <- sample(11:15, 1000, replace=T)
+  results <- cbind (x1, x2, x3, x4, x5)
+  results <-as.data.frame(results)
+
+# Make a data frame for the edges and counts
+
+  g1           <- count (results, c(""x1"", ""x2""))
+
+  g2           <- count (results, c(""x2"", ""x3""))
+  colnames(g2) <- c     (""x1"", ""x2"", ""freq"")
+
+  g3           <- count (results, c(""x3"", ""x4""))
+  colnames(g3) <- c     (""x1"", ""x2"", ""freq"")
+
+  g4           <- count (results, c(""x4"", ""x5""))
+  colnames(g4) <- c     (""x1"", ""x2"", ""freq"")
+
+  edges        <- rbind (g1, g2, g3, g4)
+
+# Make a data frame for the class sizes
+
+  h1            <- count (results, c(""x1""))
+
+  h2            <- count (results, c(""x2""))
+  colnames (h2) <- c     (""x1"", ""freq"")
+
+  h3            <- count (results, c(""x3""))
+  colnames (h3) <- c     (""x1"", ""freq"")
+
+  h4            <- count (results, c(""x4""))
+  colnames (h4) <- c     (""x1"", ""freq"")
+
+  h5            <- count (results, c(""x5""))
+  colnames (h5) <- c     (""x1"", ""freq"")
+
+  cSizes        <- rbind (h1, h2, h3, h4, h5)
+
+# Graph with igraph
+
+  gph    <- graph.data.frame (edges, directed=TRUE)
+
+  layout <- layout.reingold.tilford (gph, root = 1)
+  plot (gph,
+        layout           = layout,
+        edge.label       = edges$freq, 
+        edge.curved      = FALSE,
+        edge.label.cex   = .8,
+        edge.label.color = ""black"",
+        edge.color       = ""grey"",
+        edge.arrow.mode  = 0,
+        vertex.label     = cSizes$x1 , 
+        vertex.shape     = ""square"",
+        vertex.size      = cSizes$freq/20)
+
+# The same idea, using ggparallel
+
+  a <- c(""x1"", ""x2"", ""x3"", ""x4"", ""x5"")
+
+  ggparallel (list (a), 
+              data        = results, 
+              method      = ""hammock"", 
+              asp         = .7, 
+              alpha       = .5, 
+              width       = .5, 
+              text.angle = 0)
+
+ +

Done with igraph

+ +

+ +

Done with ggparallel

+ +

+ +

Still too rough to share in a journal, but I've certainly found having a quick look at these very useful.

+ +

There is also a possible option from this question on stack overflow, but I haven't had a chance to implement it yet; and another possibility here.

+",2013-10-23 14:51:25.430 +58099,8926.0,2,,4187.0,,,,CC BY-SA 3.0,"

Temptation to use advanced statistical methods without understanding them, just because they sound impressive or because they happen to better support researcher's initial hypothesis.

+ +

When one uses an advanced method he or she should have solid reasons as to why the method is appropriate.

+",2013-10-23 15:02:20.477 +58100,22974.0,1,,,,Netflix Challenge - some help with SVD/SoftImpute,,CC BY-SA 3.0,"

I'm currently working on the Netflix Challenge with the original huge dataset and have run into some problems. I don't have access to any servers or computing clusters so I've been running everything (slowly) on my personal machine.

+ +

I'm trying to implement the softImpute function in R and the algorithm converges in a reasonable amount of time. However, I can't feasibly do cross validation (CV) to optimize the ""rank.max"" and ""lambda"" values in order to get the best result.

+ +

I'm wondering if there is some sort of rule of thumb for these collaborative filtering problems in terms of what rank to choose based on the dimensions of your data matrix and perhaps an estimate on what lambda to then select. I understand this will never be as accurate as CV and parameter optimising but perhaps some basic rule of thumb can help me get a closer answer.

+",2013-10-23 15:38:02.280 +58101,22.0,2,,58091.0,,,,CC BY-SA 3.0,"

If I understand the context correctly, then McNemar's test is exactly what you want. It compares two binomial variables measured in each subject, sort of a paired chi-square test. The key point is that your data are paired -- you've measured two different binomial outcomes in each subject, so need a test that accounts for that.
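
A minimal sketch in R, using a hypothetical 2 x 2 cross-classification of the 170 paired outcomes:

    tab <- matrix(c(60, 25,
                    15, 70),
                  nrow = 2, byrow = TRUE,
                  dimnames = list(A = c(""1"", ""0""), B = c(""1"", ""0"")))
    mcnemar.test(tab)   # compares the discordant counts (25 vs 15)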

+",2013-10-23 15:48:34.750 +58102,21762.0,2,,58097.0,,,,CC BY-SA 3.0,"

As long as you correct for multiplicity (e.g. Bonferroni-Holm), this is one of many possible ways to test for association. Of course it can capture only linear aspects of the association. A non-significant result can thus be due to low power, lack of linear aspects in the true association or due to conservativeness of the correction for multiplicity.
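
For example, in R the Holm correction can be applied to the collected p-values (hypothetical values shown):

    p_raw <- c(0.004, 0.02, 0.03, 0.18)
    p.adjust(p_raw, method = ""holm"")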

+",2013-10-23 16:02:04.757 +58103,13526.0,1,58108.0,,,Fuzzy regression discontinuity design in Stata,,CC BY-SA 3.0,"

I am currently running computations through a ""fuzzy"" regression discontinuity design. Suppose my data are in the following form:

+ +
    +
  • $Z$: assignment variable; if $Z > Z_0$ then the person is assigned to the treatment with a certain probability $p_D$ (since we are in the ""fuzzy"" RDD framework, $p_D<1$).
  • +
  • $D$: treatment status; $D=1$ if the person is treated, 0 otherwise.
  • +
  • $X$: set of exogenous variables.
  • +
  • $Y$: Binary outcome variable.
  • +
+ +

To my knowledge - see e.g. [1] - running a fuzzy RDD is equivalent to applying instrumental variables using $Z$ as the instrument (hence at the first stage we should have $D$ regressed on $Z$ and $X$).

+ +

In order to estimate the model through Stata I used the following code:

+ +
biprobit (Y = X D) (D = X Z)
+
+ +

According to some research I have done - see Nichols' pdf at [2] - the -biprobit- command should be required because of the binary nature of the endogenous variable ($D$).

+ +

Do you find the above code correct? Is it also possible to use a simple linear probability model, like this?

+ +
ivregress 2sls Y X (D=Z)
+
+ +

Thanks for any help,

+ +

Stefano

+ +
+

[1] Angrist, J. D., Pischke, J. (2008). Mostly Harmless Econometrics: An Empiricist's Companion. Princeton University Press.

+ +

[2]: http://www.stata.com/meeting/chicago11/materials/chi11_nichols.pdf

+
+",2013-10-23 16:05:58.830 +58104,22976.0,1,58113.0,,,Learning probability bad reasoning. Conditional and unconditional,,CC BY-SA 3.0,"

I have a problem. I'm learning probability at the moment (I'm a programmer), and starting out I have this:

+
+

(Source: Minka.) My neighbor has two children. Assuming that the gender of a child is like a coin flip, it is most likely, a priori, that my neighbor has one boy and one girl, with probability 1/2. The other possibilities—two boys or two girls—have probabilities 1/4 and 1/4.

+
  • a. Suppose I ask him whether he has any boys, and he says yes. What is the probability that one child is a girl?

  • b. Suppose instead that I happen to see one of his children run by, and it is a boy. What is the probability that the other child is a girl?
+
+

Now my reasoning is:

+
BB = 1/4 = 0.25
+BG = 1/4 = 0.25
+GB = 1/4 = 0.25
+GG = 1/4 = 0.25
+
+

So for a., I get the probability of G just by summing p(B,G) + p(G,B) = 0.5

+

And for b., p(G|B) = p(G,B)/p(B) = 0.5/0.5 = 1, which is wrong, but I'm not seeing why.

+",2013-10-23 16:31:13.440 +58105,17628.0,1,58118.0,,,Parameter confidence intervals which include errors in data,,CC BY-SA 3.0,"

My question seems to be a very basic one, but my search has not turned up any similar question. I have a small dataset of 8 $(x,y)$ values with uncertainties for $y$ (the dependent variable), and the theory predicts quadratic dependence $y=a x^2 + b x + c$. I wish to fit this dataset to the quadratic equation and to calculate confidence intervals for the parameters $a$, $b$, $c$ based both on the residuals (they are very small, i.e. the quadratic model describes the experimental data with very small residuals) and on the declared uncertainty for every measurement (the uncertainties are much bigger than the residuals). What is the most correct way to do this?

+",2013-10-23 16:37:54.700 +58218,10060.0,2,,58210.0,,,,CC BY-SA 3.0,"

Sorry, can't wrap my head around it...

+ +

From the plot above (the red line is when x = y)... I can see your first condition where when x increases, y can only increase. But I don't understand given the first condition, how can ""X values are anything for all Y values.""

+ +

Please post your scatter plot, while I'll go prepare some popcorn.

+",2013-10-25 13:24:51.600 +58106,21576.0,2,,56875.0,,,,CC BY-SA 3.0,"

An interesting and topical issue in risk modelling. In my experience, risk models for credit which are developed on shorter periods of data tend to produce unstable coefficients - cross validation and/or out-of-time testing for model performance have usually shown this to be the case.

+ +

For stress testing of credit risk models, we are concerned with estimating probability of default (PD, probability of customer not repaying a loan) and loss given default (LGD, proportion of loan lost in event of default) during downturn macroeconomic conditions.

+ +

In terms of regulatory perspectives for stress testing, the Basel Committee on Banking Supervision (BCBS), which can be considered the central bank of central banks, indicates a minimum of 5 to 7 years of data (dependent on model and portfolio type) for model development, unless strong evidence is shown that more recent data is more predictive. Banking regulators typically adhere to these time periods as part of Basel II and III standards for calculating regulatory capital.

+ +

Additionally, the time period of 2008 to 2009 would seem to be somewhat short as the financial crisis has persisted for a longer period in some countries, e.g. UK where macro economic conditions worsened in 2007. It could be argued that the crisis is still ongoing in some parts of the world.

+",2013-10-23 16:46:43.820 +58107,22978.0,1,,,,What is the best convergence measure of MCMC,,CC BY-SA 3.0,"

I use the measures of Gelman and Rubin or Geweke. However, they are not applicable to sampling from a multi-modal distribution, say p(x), because a chain can get stuck in a local mode. In such cases the chain seems to be stationary, and the two convergence measures will wrongly indicate that it is safe to terminate sampling when actually it is not. When I handled toy problems, I assumed the first or second moment of the target distribution was known. The distance between that value and its approximation from the corresponding chain was used as a convergence measure, but this is not applicable in practice.

+ +

Is there any MCMC convergence measure that makes use of target distributions?

+",2013-10-23 16:53:02.503 +58108,5045.0,2,,58103.0,,,,CC BY-SA 3.0,"

This is a partial answer. I think you should probably use both the biprobit and the ivreg/ivreg2 commands to check how robust your effects are. I like the biprobit approach given your data, but it does make some strong assumptions (no heteroskedasticity, no heterogeneous effects, normality of errors).* However, there's also a dedicated RD command in Stata called rdrobust. It can handle the fuzzy design and may be installed with:

+ +
net install rdrobust, from(http://www-personal.umich.edu/~cattaneo/rdrobust) replace
+
+ +

You can find an intro to the command in Cattaneo, Calonico, and Titiunik's Stata Journal paper Robust Data-Driven Inference in the Regression-Discontinuity Design.

+ +
+ +

*Austin Nichols' simulation results indicate that the marginal effects may be less sensitive than the latent index function parameters to violations of the biprobit assumptions. The LPM is also not always the model of steel that A&P make it out to be.

+",2013-10-23 16:54:44.293 +58109,22399.0,1,,,,Interpretation of regression coefficients in the presence of modest correlations,,CC BY-SA 3.0,"

I have a multiple regression model where I have nearly 20 independent variables. These variables are modestly correlated with each other (e.g., the maximum VIF is around 4 with most of them in the 2s).

+ +

One of the coefficients is statistically significant and is negative when I expected that it would be positive. I know that 'wrong signs' can be because of several reasons such as multi-collinearity, missing data, omitted variables etc but I am wondering if there is a simpler explanation for the 'wrong sign'.

+ +

The usual interpretation of the coefficients is that it represents the impact on the dependent variable when we change the independent variable by 1 unit holding everything else constant.

+ +

However, the above interpretation is accurate only if the independent variables are completely uncorrelated with one another. In the presence of modest correlations among the independent variables, when we increase one of them by 1 unit the others are also bound to go up/down by a modest amount (depending on the sign of the correlation) and hence the only way to predict the impact of a unit change of an independent variable is to evaluate its impact on the other independent variables and then assess the overall impact on the dependent variable. When we do such an analysis we may well discover that the 'wrong sign' is a non-issue as increasing that variable by 1 unit may result in an increase in the dependent variable via the changes in the other independent variables in the model.

+ +

Does the above explanation make sense or am I missing something?

+",2013-10-23 17:09:32.380 +58110,22977.0,1,,,,Parameter space exploration,,CC BY-SA 3.0,"

I do realise this question is quite specific and practical, but I am seeking some general guidance to help me progress further in my analysis.

+ +

Let $y(\boldsymbol{x})\in\mathbb{R}$ be the function I'd like to regress, whereas $\boldsymbol{x}\in\mathbb{R}^6$. My design matrix $\boldsymbol{X}\in\mathbb{R}^{11\times6}$ (therefore, at the moment, $y(\boldsymbol{X})\in\mathbb{R}^{11}$, but I can run further experiments, if needed).
+My aim is maximising $y(\boldsymbol{x})$ over $\boldsymbol{x}$, $x_{2k+1}=\{5,7,9\}$, $x_{2k}=\{2,4\}$.

+ +

If I were to run a dense experimental analysis, I would have to test $3^3\times2^3=216$ cases, and each of those is very time consuming.

+ +

The collected data are the following (yes, I know, they do not respect the constraint given above, which would allow me to run a reasonably lower total number of experiments if the dense experimental analysis is considered the only feasible solution):

+ +
X = [
+     4     2     3     2     3     2
+     7     2     5     2     5     4
+    10     2     7     4     5     2
+    10     2     9     2     8     4
+    10     4     6     2     4     2
+     7     2     5     4     5     2
+     7     4     5     2     5     2
+     7     2     5     4     5     4
+     7     4     5     2     5     4
+     7     4     5     4     5     2
+     7     4     5     4     5     4
+     ];
+
+y = [
+     64.7
+     68.0
+     69.4
+     68.5
+     68.5
+     71.9
+     71.6
+     71.7
+     71.0
+     71.6
+     68.6
+     ];
+
+ +

I did scatter plot $(x_i, y(x_i)),i=\{1,2,3,4,5,6\}$, but the output is not that informative.

+ +
name = {
+     'ks1'
+     'ss1'
+     'ks2'
+     'ss2'
+     'ks3'
+     'ss3'
+     };
+
+[m, n] = size(X); % # of experiments, # of parameters = 3 conv kernel size + 3 subsamp
+
+for j = 1:2:n
+    subplot(2,3,(j+1)/2)
+    h = plot(X(:,j),y,'x','MarkerSize',10,'LineWidth',2);
+    xlabel(name{j})
+    ylabel('Testing accuracy [%]')
+    axis([2 10 60 75])
+    grid on
+end
+
+for j = 2:2:n
+    subplot(2,3,3+j/2)
+    h = plot(X(:,j),y,'x','MarkerSize',10,'LineWidth',2);
+    xlabel(name{j})
+    ylabel('Testing accuracy [%]')
+    axis([1 5 60 75])
+    grid on
+end
+
+ +

+ +

So, my question is, how can I understand (and/or visualise) what is going on in this multivariate problem?
+To my mind a PCA keeps coming up as possible useful tool, even though I know it is used for visualising main orientations rather than helping me finding the maxima of my scalar fields..

+ +

Update
+I am currently running the 216 cases batch experiment, limited to 30 epochs training. Still, I would like to visualise my results. Actually, I do have a testing (and training) accuracy function per case, whereas above I just picked its maximum.

+ +

This is an overview of the testing accuracy functions +

+ +
+ +

What I am trying to accomplish is determining the best configuration for a convolutional artificial neural network. The parameters I am considering here are kernel size (ks) and subsamplig (ss) factor for multiple layers (1, 2 and 3), whereas $y(\cdot)$ is the testing (or cross-validation) accuracy.

+",2013-10-23 17:19:28.557 +58111,7927.0,1,58132.0,,,Why use bayesglm?,,CC BY-SA 3.0,"

My overall question is: why use bayesglm instead of other classification methods?

+ +

Note:

+ +
    +
  1. I'm only interested in prediction.
  2. +
  3. I have a decent amount of data (~ 100,000 obs).
  4. +
+ +

I feel like the sample size is large enough that the parameters of a regular logistic regression are going to be normally distributed (CLT). What would I gain by specifying priors? My hunch is that priors will only matter for a small dataset, but I don't have any theoretical or applied evidence.

+",2013-10-23 17:25:23.237 +58112,14172.0,1,,,,Ratio data with categorical events,,CC BY-SA 3.0,"

I have a question about treating qualitative (categorical) events within otherwise quantitative (ratio scale) data. Without going into too much detail, the experiment is on throwing. I measure the distance between the target and the result of the throw for each attempt. Some attempts are failed because the participant hits an obstacle on the way to the target. In the end the dependent variable may look something like this: [.2 .1 .3 .2 -1 .2 .2 .8 -1], where the values may range from 0 to anything, but -1 represents an obstacle hit. What are some ways to deal with such categorical data within a quantitative variable? I cannot just throw them out because they are meaningful from the performance perspective. Substituting some high error distance seems arbitrary. Is there anything else? In the end I'm interested in how people get better at throwing as a result of practice.

+",2013-10-23 17:39:16.837 +58241,23046.0,1,,,,Slope of a Regression,,CC BY-SA 3.0,"

+ +

Given this chart, how do you determine the slope of the regression Y on F?

+",2013-10-25 17:58:52.327 +58113,19331.0,2,,58104.0,,,,CC BY-SA 3.0,"

For (a), a simple way to look at it is that you've reduced your probability space to only the combinations that have at least one boy:

+ +
BB = 1/3
+BG = 1/3
+GB = 1/3
+
+ +

GG is no longer a possibility based on the fact that your neighbor said he had at least one boy. Of the possibilities remaining, you're left with a 2/3 probability that he has a girl. The information he gave you reduced the probability of him having a girl from 3/4 to 2/3. Formally, this can be shown as follows: $$P(At\ least\ one\ girl|At\ least\ one\ boy) = \frac{P(At\ least\ one\ girl\ \cap At\ least\ one\ boy)}{P(At\ least\ one\ boy)} $$ +From your original box, we can see the probability of having at least one boy and at least one girl is BG + GB = 0.25 + 0.25 = 0.5, but we need to divide by the probability of at least one boy, which is BB + BG + GB = 0.25 + 0.25 + 0.25 = 0.75, so we get $\frac{\tfrac{1}{2}}{\tfrac{3}{4}} = \frac{2}{3}$.

+ +

For (b), now that we've seen a boy, the only uncertainty remaining is the gender of the other child, and given no other information, the probability of the other child being female is 1/2, which is the answer.
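
A quick simulation sketch in R (coding 1 = boy, 0 = girl) that reproduces both answers:

    set.seed(1)
    kids <- matrix(rbinom(2e5, 1, 0.5), ncol = 2)   # 100,000 two-child families
    any_boy <- rowSums(kids) >= 1
    mean(rowSums(kids[any_boy, ]) == 1)             # (a): ~ 2/3 also have a girl
    first_is_boy <- kids[, 1] == 1                  # a particular observed child is a boy
    mean(kids[first_is_boy, 2] == 0)                # (b): ~ 1/2 for the other child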

+",2013-10-23 17:41:54.690 +58114,5045.0,2,,58109.0,,,,CC BY-SA 4.0,"

This is not an answer, but it is too long for a comment.

+ +

I would say the interpretation is accurate even with multicollinearity, but the ceteris paribus coefficient is not the quantity you care about. If you believe that the multicollinearity arises from an approximate linear relationship among some of the regressors, that relationship could be formalized either through some constraint on the parameters (such as dropping a variable or something more) or with a simultaneous equation approach. Without more details about the nature of your problem, it's hard to be more specific. There are some examples (28, 29 and 5) in Peter Kennedy's paper Oh No! I Got the Wrong Sign! What Should I Do?.

+",2013-10-23 17:44:29.413 +58115,22355.0,2,,58081.0,,,,CC BY-SA 3.0,"

Stef is right. An initial mice object is created in as.mids2() with m = 5 as a default. Even though the mids object is completely filled in, the with() function only uses the first five. This is because the number of imputations - corresponding to the largest value in .imp - is not changed in the returned object. The following code yields the correct mids object

+ +
as.mids2 <- function(data2, .imp=1, .id=2){
+  ini <- mice(data2[data2[, .imp] == 0, -c(.imp, .id)], m = max(as.numeric(data2[, .imp])), maxit=0)
+  names  <- names(ini$imp)
+  if (!is.null(.id)){
+    rownames(ini$data) <- data2[data2[, .imp] == 0, .id]
+  }
+  for (i in 1:length(names)){
+    for(m in 1:(max(as.numeric(data2[, .imp])))){
+      if(!is.null(ini$imp[[i]])){
+        indic <- data2[, .imp] == m & is.na(data2[data2[, .imp]==0, names[i]])
+        ini$imp[[names[i]]][m] <- data2[indic, names[i]]
+      }
+    } 
+  }
+  return(ini)
+}
+
+ +

and yields the same results as Zelig given the active random seed.

+ +
> zelig.mice.res
+                      est           se         t        df     Pr(>|t|)         lo 95
+(Intercept)  3.118344e+03 7.031673e+02  4.434711  55.20608 4.437533e-05  1.709283e+03
+pop          3.074658e-08 5.993667e-09  5.129844 211.31179 6.550041e-07  1.893154e-08
+gdp.pc      -2.185839e-03 5.968324e-04 -3.662400 174.91116 3.308584e-04 -3.363759e-03
+year        -1.551702e+00 3.535625e-01 -4.388762  55.26609 5.183456e-05 -2.260180e+00
+polity       4.649357e-01 3.102389e-01  1.498638 125.11682 1.364865e-01 -1.490600e-01
+                    hi 95 nmis       fmi    lambda
+(Intercept)  4.527404e+03   NA 0.5206397 0.5035824
+pop          4.256162e-08   NA 0.2643263 0.2573962
+gdp.pc      -1.007919e-03   NA 0.2909757 0.2829145
+year        -8.432231e-01   NA 0.5203580 0.5033089
+polity       1.078931e+00   NA 0.3448966 0.3345077
+
+> mice.res
+                      est           se         t        df     Pr(>|t|)         lo 95
+(Intercept)  3.118344e+03 7.031673e+02  4.434711  55.20608 4.437533e-05  1.709283e+03
+pop          3.074658e-08 5.993667e-09  5.129844 211.31179 6.550041e-07  1.893154e-08
+gdp.pc      -2.185839e-03 5.968324e-04 -3.662400 174.91116 3.308584e-04 -3.363759e-03
+year        -1.551702e+00 3.535625e-01 -4.388762  55.26609 5.183456e-05 -2.260180e+00
+polity       4.649357e-01 3.102389e-01  1.498638 125.11682 1.364865e-01 -1.490600e-01
+                    hi 95 nmis       fmi    lambda
+(Intercept)  4.527404e+03   NA 0.5206397 0.5035824
+pop          4.256162e-08    0 0.2643263 0.2573962
+gdp.pc      -1.007919e-03    0 0.2909757 0.2829145
+year        -8.432231e-01    0 0.5203580 0.5033089
+polity       1.078931e+00    2 0.3448966 0.3345077 
+
+",2013-10-23 17:48:43.877 +58116,,2,,58112.0,user31668,,,CC BY-SA 3.0,"

In statistical terminology, the -1 data represent censored data. Two general approaches for handling censoring are multiple imputation and censored likelihood

+ +

One of those two should be helpful.

+",2013-10-23 17:54:04.683 +58117,22982.0,1,,,,Is Cronbach's alpha interchangeable with $\rho$?,,CC BY-SA 3.0,"

I am trying to calculate the reliability of my change scores using the following formula. Is $r$ ($\rho$) in this instance interchangeable with Cronbach's alpha?
$$
\frac{.5(r_{xx}+ r_{yy}) - r_{xy}}{1 - r_{xy}}
$$
Here the reliability of difference scores is a function of the score reliability of each measure ($r_{xx}, r_{yy}$) and the correlation between the measures ($r_{xy}$).

+",2013-10-23 17:56:19.437 +58118,,2,,58105.0,user31668,,,CC BY-SA 3.0,"

Based on the OP's response to my questions, I'd suggest bootstrapping, which will take into account the possibility that your small sample size results in overfitting the model. I would also look into the body of theory called errors-in-variables regression. See link and link.
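
A minimal case-resampling sketch under those assumptions (x, y and sig are placeholder names for the 8 measurements and their stated uncertainties; the weighted quadratic fit is refitted in each replicate):

    boot_coef <- replicate(2000, {
      i <- sample(seq_along(x), replace = TRUE)
      coef(lm(y[i] ~ x[i] + I(x[i]^2), weights = 1 / sig[i]^2))
    })
    # percentile intervals for the intercept c, linear term b and quadratic term a
    apply(boot_coef, 1, quantile, probs = c(0.025, 0.975), na.rm = TRUE)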

+",2013-10-23 17:57:03.863 +58119,20622.0,1,58496.0,,,Does the presence of an outlier increase the probability that another outlier will also be present on the same observation?,,CC BY-SA 3.0,"

**Edit: (10/26/13) More clear (hopefully) mini-rewrites added at the bottom**

+

I'm asking this from a theoretical/general standpoint - not one that applies to a specific use case.

+

I was thinking about this today:

+

Assuming the data does not contain any measurement errors, if you're looking at a specific observation in your data and one of the measurements you recorded contains what could be considered an outlier, does this increase the probability (above that of the rest of the observations that do not contained measured outliers) that the same observation will contain another outlier in another measurement?

+

For my answer I'm looking for some sort of theorem, principle, etc. that states what I'm trying to communicate here much more elegantly. For clearer explanations see Gino and Behacad's answers.

+

Example:

+

Let's say you're measuring the height and circumference of a certain type of plant. Each observation corresponds to 1 plant (you're only doing this once).

+

For height, you measure:

+
+Obs 1  |   10 cm
+Obs 2  |   9 cm
+Obs 3  |   11 cm
+Obs 4  |   22 cm
+Obs 5  |   10 cm
+Obs 6  |   9 cm
+Obs 7  |   11 cm
+Obs 8  |   10 cm
+Obs 9  |   11 cm
+Obs 10  |   9 cm
+Obs 11  |   11 cm
+Obs 12  |   10 cm
+Obs 13  |   9 cm
+Obs 14  |   10 cm
+
+

Since observation 4 contains what could be considered an outlier from the rest of the data, would the probability increase that measured circumference also contains an outlier for observation #4?

+

I understand my example may be too idealistic but I think it gets the point across...just change the measurements to anything.

+

Edited in attempts to make more clear:

+

(10/26/13)

+

Version 1 Attempt of Abbreviated Question:

+
+

In nature and in general, is there a tendency (even a weak one) that the probability is greater that the "degree of variance from the mean in any attribute(s) of an observation" will be similar to the "degree of variance from the mean in any other* specific attribute of that same observation" in comparison to the probability that it will INSTEAD be more similar to the "degree of variance from the mean in that same* specific attribute of any other observation."

+
+

* next to a word means I was pairing what they reference.
+"Quotes" used above mean nothing and are used simply to help section parts together/off for clarity.

+

Version 2 Attempt of Abbreviated Question:

+
+

In nature and in general, is variance from the mean across observations for one attribute¹ correlated (even with extremely loose correlation) to the variance from the mean across observations for all attributes¹?

+
+

¹Attribute meaning measurement, quality, presence-of-either of these, and/or nearly anything else that the word "attribute" could even slightly represent as a word. Include all synonyms of the word "attribute" as well.

+",2013-10-23 18:17:15.587 +58161,9074.0,2,,58159.0,,,,CC BY-SA 3.0,"

What do you mean by ""average improvement rate""? If you mean that the number of faulty products is halved every year, then it is simply an exponential model that you are looking for.

+ +

$3\sigma=66,807$ faults per $1,000,000$

+ +

$6\sigma = 3.4 $ faults per $1,000,000$

+ +

An exponential model of faults is then given by $f(t)=f(0)(1+r)^t$, with $r = -0.5$, which can be solved as follows:
$$3.4=66,807 \times 0.5^t \leftrightarrow {3.4 \over 66,807} = 0.5^t \leftrightarrow \log\left({3.4 \over 66,807}\right)=t\log(0.5) \leftrightarrow {\log\left({3.4 \over 66,807}\right) \over \log(0.5)} = t$$

+ +

$t=-9.8859/-0.693 = 14.265$ years. Or 14 years and 97 days.
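
The same calculation in R:

    log(3.4 / 66807) / log(0.5)   # ~ 14.26 years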

+",2013-10-24 12:29:36.073 +58120,,2,,58082.0,user31668,,,CC BY-SA 3.0,"

I don't know a definitive answer to your primary question, although I found the following two references:

+ +

Anderson, C. W., “Extreme value theory for a class of discrete distributions with applications to some stochastic processes”, Journal of Applied Probability, vol 7, 1970, pp. 99–113.

+ +

Anderson, C. W., “Local limit theorems for the maxima of discrete random variables”, +Mathematical Proceedings of the Cambridge Philosophical Society, vol 88, 1980, pp. 161– +165.

+ +

For your secondary question, the CDF of the Poisson is $\frac{\Gamma(\lfloor k+1\rfloor,\lambda)}{\lfloor k\rfloor!}$ so $P(\max\limits_N X_n \leq M) = (\frac{\Gamma(\lfloor k+1\rfloor,\lambda)}{\lfloor k\rfloor!})^N$. Apply the difference operator (lag1) and you get the PMF of the max.
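
A short R sketch of that calculation (lambda and N are arbitrary illustrative values):

    lambda <- 4; N <- 30; k <- 0:25
    cdf_max <- ppois(k, lambda)^N     # P(max of N iid Poisson counts <= k)
    pmf_max <- diff(c(0, cdf_max))    # P(max = k), the lag-1 difference
    sum(pmf_max)                      # ~ 1 once k runs far enough into the tail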

+",2013-10-23 18:20:32.227 +58121,2069.0,2,,58119.0,,,,CC BY-SA 3.0,"

I'm going to say yes. Of course something being an outlier on one measure does not necessarily mean it will be an outlier on other measures, but I think it certainly increases the odds. In essence, I think anything being unusual in one way is more likely to be unusual in other ways. This is perhaps the result of different variables being correlated in all kinds of ways. So if one is ""extreme"", this will also correlate with other factors that might also be in the extreme range.

+ +

I hope someone can provide a better answer with some empirical evidence. Perhaps we can look at this from the multivariate outlier perspective. In my experience, individuals who are outliers on one variable are often outliers on many other variables as well, contributing to multivariate outlierness.

+ +

Take this answer with a grain of salt since it depends on how we define an outlier and such, but I think in general it makes sense.

+",2013-10-23 18:36:40.280 +58122,22985.0,1,58151.0,,,Correlation of parts of two variables that miminize correlation with another variable,,CC BY-SA 3.0,"

I have three data vectors $A$, $B$ and $C$ that are all more or less correlated.

+ +

What I want is a meaningful definition for a measure of the degree of correlation between those parts of $A$ and $B$ that are not correlated with $C$.

+ +

I do apologize if this is not fully understandable. I struggle to come to an easier formulation of the problem. However I can give an example:

+ +

Example: Say $A$, $B$, $C$ could all be binary, i.e. only contain the values 0 and 1. I then would want to know the number of positions where $A$ and $B$ are 1 and $C$ is 0.

+ +

As a side note: How does one ""uncorrelate"" variables, i.e. given two variables $A$ and $B$ how to split $A$ into a part that is maximally correlated with B and another that is maximally uncorrelated?

+",2013-10-23 18:50:22.557 +58123,22987.0,1,,,,Standard error from correlation coefficient,,CC BY-SA 3.0,"

Many studies only report the relationship between two variables (e.g. linear or logistic equation), $n$, and $r^2$. I want to use these reported statistics to reproduce this relationship with its variation. Most statistical software will generate a parameter distribution from a mean and standard error. Assuming a normal distribution, can the standard error of the parameter estimates be calculated with just these three statistics? Essentially, can I get a standard error from $r^2$?

+ +

Or will I need to do some kind of bootstrapping procedure to generate a distribution that has the same $r^2$ and then calculate the standard error? if so are there better ones for linear vs. nonlinear equations?

+",2013-10-23 18:53:42.190 +58124,21762.0,2,,57968.0,,,,CC BY-SA 3.0,"

Sheldon, Sheldon. How comes that you have to ask a question about math to people like us?

+ +

In survival analysis, your setting is called ""competing risk"". The joint distribution of the earliest failure time and the type of failure is fully described by the so called ""cumulative incidence function"" (it even allows for censoring, i.e. no failure until end of time horizon). I am quite sure that you will find relevant information in the literature stated in +Assumptions and pitfalls in competing risks model

+",2013-10-23 19:04:02.220 +58125,211.0,1,58128.0,,,"""Kernel density estimation"" is a convolution of what?",,CC BY-SA 3.0,"

I am trying to get a better understanding of kernel density estimation.

+ +

Using the definition from Wikipedia: https://en.wikipedia.org/wiki/Kernel_density_estimation#Definition

+ +

$$\hat{f}_h(x) = \frac{1}{n}\sum_{i=1}^n K_h (x - x_i) = \frac{1}{nh} \sum_{i=1}^n K\Big(\frac{x-x_i}{h}\Big)$$

+ +

Let's take $K()$ to be a rectangular function which gives $1$ if $x$ is between $-0.5$ and $0.5$ and $0$ otherwise, and $h$ (window size) to be 1.

+ +

I understand that the density is a convolution of two functions, but I am not sure I know how to define these two functions. One of them should (probably) be a function of the data which, for every point in R, tells us how many data points we have in that location (mostly $0$). And the other function should probably be some modification of the kernel function, combined with the window size. But I am not sure how to define it.

+ +

Any suggestions?

+ +

Below is example R code which (I suspect) replicates the settings I defined above (with a mixture of two Gaussians and $n=100$), on which I hope to see a ""proof"" that the functions to be convolved are as we suspect.

+ +
# example code:
+set.seed(2346639)
+x <- c(rnorm(50), rnorm(50,2))
+plot(density(x, kernel='rectangular', width=1, n = 10**4))
+rug(x)
+
+ +

+",2013-10-23 19:36:10.130 +58126,19750.0,1,,,,Mean absolute deviation,,CC BY-SA 3.0,"

Wikipedia states:

+ +
+

The mean absolute error (MAE) is a common measure of forecast error in time + series analysis, where the terms ""mean absolute deviation"" is + sometimes used in confusion with the more standard definition of mean + absolute deviation. The same confusion exists more generally.

+
+ +

What does that mean? What exactly is the confusion?

+ +

Also, why is MAE is used in time series analysis specifically? (as opposed to more general measures of error such as MSE)?

+",2013-10-23 19:47:38.237 +58127,22961.0,2,,58119.0,,,,CC BY-SA 3.0,"

I will try to reply using empirical evidence. Let's assume you are measuring the heights in a men sample. In this case, the outlier will be represented by a very tall men (a giant). It is very likely this men will represent an outlier also for other variables like for instance shoe size or arms lengths and so on. +Other case, you are measuring financial performance of US Public company. An outlier will be a very successful company with a sales growth twice the industry average. Very likely the same company will be an outlier in respect of any measure of profitability or stock price appreciation. In a nut shell, I am incline to think something behaving exceptionally out of the norm will tend to conserve this property across different manifestations. Is there a theorem that disprove this theory?

+",2013-10-23 19:56:26.190 +58128,668.0,2,,58125.0,,,,CC BY-SA 4.0,"

Corresponding to any batch of data $X = (x_1, x_2, \ldots, x_n)$ is its ""empirical density function""

+ +

$$f_X(x) = \frac{1}{n}\sum_{i=1}^{n} \delta(x-x_i).$$

+ +

Here, $\delta$ is a ""generalized function."" Despite that name, it isn't a function at all: it's a new mathematical object that can be used only within integrals. Its defining property is that for any function $g$ of compact support that is continuous in a neighborhood of $0$,

+ +

$$\int_{\mathbb{R}}\delta(x) g(x) dx = g(0).$$

+ +

(Names for $\delta$ include ""atomic"" or ""point"" measure and ""Dirac delta function."" In the following calculation this concept is extended to include functions $g$ which are continuous from one side only.)

+ +

Justifying this characterization of $f_X$ is the observation that

+ +

$$\eqalign{ +\int_{-\infty}^{x} f_X(y) dy +&= \int_{-\infty}^{x} \frac{1}{n}\sum_{i=1}^{n} \delta(y-x_i)dy \\ +&= \frac{1}{n}\sum_{i=1}^{n} \int_{-\infty}^{x} \delta(y-x_i)dy \\ +&= \frac{1}{n}\sum_{i=1}^{n} \int_{\mathbb{R}} I(y\le x) \delta(y-x_i)dy \\ +&= \frac{1}{n}\sum_{i=1}^{n} I(x_i \le x) \\ +&= F_X(x) +}$$

+ +

where $F_X$ is the usual empirical CDF and $I$ is the usual characteristic function (equal to $1$ where its argument is true and $0$ otherwise). (I skip an elementary limiting argument needed to move from functions of compact support to functions defined over $\mathbb{R}$; because $I$ only needs to be defined for values within the range of $X$, which is compact, this is no problem.)

+ +

The convolution of $f_X(x)$ with any other function $k$ is given, by definition, as

+ +

$$\eqalign{ +(f_X * k)(x) &= \int_{\mathbb{R}} f_X(x - y) k(y) dy \\ + &=\int_{\mathbb{R}} \frac{1}{n}\sum_{i=1}^{n} \delta(x-y-x_i) k(y) dy \\ + &= \frac{1}{n}\sum_{i=1}^{n}\int_{\mathbb{R}} \delta(x-y-x_i) k(y) dy \\ + &=\frac{1}{n}\sum_{i=1}^{n} k(x_i-x). +}$$

+ +

Letting $k(x) = K_h(-x)$ (which is the same as $K_h(x)$ for symmetric kernels--and most kernels are symmetric) we obtain the claimed result: the Wikipedia formula is a convolution.
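
As a numerical illustration, here is a small sketch that reuses the question's box kernel and $h=1$ and evaluates the sum $\frac{1}{nh}\sum_i K\big(\frac{x-x_i}{h}\big)$ directly on a grid, rather than relying on density():

    set.seed(2346639)
    x  <- c(rnorm(50), rnorm(50, 2))
    h  <- 1
    K  <- function(u) as.numeric(abs(u) <= 0.5)              # box kernel on [-1/2, 1/2]
    xs <- seq(-4, 6, length.out = 201)
    fhat <- sapply(xs, function(x0) mean(K((x0 - x) / h)) / h)
    plot(xs, fhat, type = ""s"", xlab = ""x"", ylab = ""density estimate""); rug(x)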

+",2013-10-23 19:57:19.317 +58129,4582.0,2,,31575.0,,,,CC BY-SA 3.0,"

Here is my implementation in R

+ +
x <- c(1,2,1,1,3,4,4,1,2,4,1,4,3,4,4,4,3,1,3,2,3,3,3,4,2,2,3)
+xChar<-as.character(x)
+library(markovchain)
+mcX<-markovchainFit(xChar)$estimate
+mcX
+
+",2013-10-23 20:06:30.957 +58130,5237.0,2,,58123.0,,,,CC BY-SA 3.0,"

If you look at the Wikipedia page for the Pearson product-moment correlation, you will find sections that describe how confidence intervals can be calculated. Typically, people will use Fisher's $z$-transformation (arctan) to turn the $r$ into a variable that is approximately normally distributed:
+$$ +z_r = \frac 1 2 \ln \frac{1 + r}{1 - r} +$$ +Having applied this transformation, the standard error will be approximately $^1/_{\sqrt{(N-3)}}$. With this you can form whatever length confidence interval you like. Once you've found the confidence limits you want, you can back-transform them to the original $r$ scale (i.e., $[-1, 1]$) like so:
+$$ +\text{CI limit}_r = \frac{\exp(2z) - 1}{\exp(2z) + 1} +$$ +In other words, you can form a confidence interval for $r$ without the original data, so long as you have the original $N$.

+ +

Notes: This approach is an approximation, there are exact formulae listed on the Wikipedia page, but they are harder to use. Although it doesn't say on the Wikipedia page, there are several conditions you want to meet in order for this approximation to be reasonable. The $N$ should be at least $30$ (IIRC), and the marginal distributions (i.e., the univariate distributions of the two variables being correlated) should be normal. For example, I'm not sure that this will be accurate if the correlation were composed of two vectors of $1$s and $0$s. However, higher $N$ should allow you to compensate for minor non-normality.

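A small R sketch of the whole recipe, with hypothetical reported values of r and N:

    r <- 0.45; N <- 120
    z  <- 0.5 * log((1 + r) / (1 - r))           # Fisher z-transformation
    se <- 1 / sqrt(N - 3)
    ci_z <- z + c(-1, 1) * qnorm(0.975) * se
    (exp(2 * ci_z) - 1) / (exp(2 * ci_z) + 1)    # back-transformed limits on the r scale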
+",2013-10-23 20:15:29.683 +58131,16046.0,1,,,,How to sample using MCMC from a posterior distribution in general?,,CC BY-SA 3.0,"

Assume one has the posterior distribution of a parameter, $p(\theta|y)$, and what I mean by having it is that for each point $\theta$, one can use the Monte Carlo method + MCMC to calculate $p(\theta|y)$. Now my question is: if I want to sample from $p(\theta|y)$, then basically I have to run one Gibbs sampler (for example) to sample from the distribution, and at every point I have to run the Monte Carlo method on that point to calculate the value of $p(\theta|y)$, right? I.e. it needs two loops, one inside the other. Is this correct?

+ +

As I got an answer to this question and I thought maybe my question is vague I will try to clarify it a bit more:

+ +

From what I know by reading for a week the whole time about Monte Carlo method and MCMC, I understood(correct me if I am wrong) that: +$$p(\theta|y)=\frac{p(y|\theta)p(\theta)}{\int_{\Theta}{p(y|\theta)p(\theta)}\text{d}\theta}.$$

+ +

Now consider that we only have a sampling algorithm for $\theta$ and can only calculate $p(y|\theta)$ explicitly (and not the other functions!); then to get values from $p(\theta|y)$ one needs to numerically integrate the denominator. And for each value of this posterior one needs to apply a sampling scheme like Gibbs sampling to generate a sample of $p(\theta|y)$; each new transition in the parameter space should then sample from the distribution, which is $p(\theta|y)$ here, and to calculate that, the above ratio should be computed.

+",2013-10-23 20:15:39.310 +58132,15766.0,2,,58111.0,,,,CC BY-SA 3.0,"

In engineering, as well as supply chain risk management, ""engineering knowledge"" (e.g., an educated person's best guess) may be the best data you have. For example, the likelihood of a tsunami occurring and disrupting the supply chain, without additional data, can be estimated by an expert in the subject (there are better methods for constructing priors). As time passes, tsunamis occur and, as a result, we gain more data and can update our priors (engineering knowledge) with posteriors (priors adjusted for new data). At some point, there will be so much data that the initial prior is irrelevant, and no matter who made the prediction, you will have equal predictions of likelihood.

+ +

It is my belief that if you have that much data, a ""traditional"" Frequentist approach is (typically) preferable to the Bayesian approach (of course others will disagree, especially with choosing between statistical philosophies rather than sticking to one and selecting an appropriate method). Note that it is entirely possible (and occurs often) that the Frequentist approach yields similar/identical results to the Bayesian.

+ +

That said, when the difference in methods is a line of code, why not implement multiple methods and compare the results yourself?

+",2013-10-23 20:34:04.577 +58133,2666.0,2,,45280.0,,,,CC BY-SA 3.0,"

If you had a larger sample size, Hoeffding's $D$ test would be the most general choice, and it is nonparametric. It will even detect relationships that are not one-to-one (e.g., a circle). This test is implemented in the hoeffd function of the R Hmisc package.
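
A minimal sketch of how this might look in R (my own addition, not from the original answer; the circular data are just an illustration of a non-monotone dependence):

library(Hmisc)
set.seed(1)
t <- runif(200, 0, 2 * pi)
x <- cos(t) + rnorm(200, sd = 0.1)
y <- sin(t) + rnorm(200, sd = 0.1)   # dependence is a noisy circle
cor(x, y)                            # Pearson r is near zero
hoeffd(x, y)$D                       # Hoeffding D statistic; $P gives p-values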

+",2013-10-23 21:40:29.270 +58134,16644.0,2,,58126.0,,,,CC BY-SA 3.0,"

One of the reasons MAE is used in time series or forecasting is that non-scientists find it easy to understand. So if you tell your client the MAE is 1.5 units, for example, he/she can interpret that as the average amount that the forecast is in error (in absolute units). But if you tell them the MSE you may well get a blank look because it has no such interpretation.

+ +

I'm not sure what causes the confusion between MAE and mean absolute deviation, but I'd attribute it to a lack of clear definitions or explanations in the specific context where it is used.

+",2013-10-23 22:08:16.910 +58135,15025.0,1,,,,Is there a notion of the product of confidence bands?,,CC BY-SA 3.0,"

Given two confidence bands, it seems natural to consider the confidence band of the product of these two distributions. Does this have a name? Are there any reference that might be helpful regarding this notion?

+ +

Edit: Assume that the errors of the first, say, regression curve are not correlated with the errors of the second.

+",2013-10-23 22:17:47.940 +58136,594.0,2,,57893.0,,,,CC BY-SA 3.0,"

Here's the actual pdf required:

+ +

$$f(x)=\begin{cases}
c\quad &|x|\leq x_0\\
c_1 \exp(-|x|)\quad &|x|> x_0
\end{cases}$$
that is, the density is constant on $(-x_0,x_0)$ and falls exponentially outside it, where $c_1$ and $c$ are such that the pdf is continuous and integrates to 1.

+ +

That is, we have the conditions that $c_1\exp(-x_0) = c$; i.e. $c_1 = c\exp(x_0)$ and that the $\int_{-\infty}^\infty f(x) dx = 1$.

+ +

\begin{eqnarray}
\int_{-\infty}^\infty f(x) dx
&=& 2c\left( \int_{0}^{x_0} 1 dx + \exp(x_0)\int_{x_0}^\infty \exp(-x) dx \right)\\
&=& 2c\left( x|_{0}^{x_0} - \exp(x_0)\exp(-x)|_{x_0}^\infty \right)\\
&=& 2c\left( x_0 - \exp(x_0)[0-\exp(-x_0)] \right)\\
&=& 2c( x_0 +1 )
\end{eqnarray}

+ +

Implying $c=\frac{1}{2( x_0 +1 )}$. Here's $f$ for $x_0=1$ (black) and $x_0=2$ (green):

+ +

+ +

As a quick check on our algebra, via approximate numerical integration the area under both curves seems to be 1 to about the right number of figures.

+ +

Now we have the pdf right, we can write the cdf:

+ +

$$F(x)=
\begin{cases}
c\exp(x+x_0) & x\leq -x_0\\
1/2+cx & -x_0<x<x_0\\
1/2+cx_0+c(1-\exp(x_0-x)) & x\geq x_0
\end{cases}$$

+ +

(though there are perhaps better ways to write that last term; I'm not going to investigate that here)

+ +

Here are the corresponding cdfs for the above two cases:

+ +

+ +

Here's the quantile function, $Q(p) = F^{-1}(p)$:

+ +

$$Q(p) =
\begin{cases}
\ln(p/c)-x_0 & 0<p\leq 1/2 -cx_0\\
(p-0.5)/c & 1/2-cx_0<p<1/2+cx_0\\
x_0-\ln(1-[(p - 0.5)/c - x_0]) & 1/2+cx_0\leq p<1
\end{cases}$$

+ +

and a drawing of the inverse cdf for the same cases as above:

+ +

+ +

I wouldn't have got these right without drawing diagrams along the way, especially of $f$ and $F$.
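
If it helps, here is a hedged R sketch (my own addition, not part of the original answer) that draws from the distribution by inverse-transform sampling with the quantile function above, for $x_0=1$:

x0 <- 1
cc <- 1 / (2 * (x0 + 1))                               # the constant c derived above
Q <- function(p) ifelse(p <= 0.5 - cc * x0, log(p / cc) - x0,
                 ifelse(p <  0.5 + cc * x0, (p - 0.5) / cc,
                        x0 - log(1 - ((p - 0.5) / cc - x0))))
set.seed(1)
samp <- Q(runif(1e5))                                  # inverse-transform samples
f <- function(x) ifelse(abs(x) <= x0, cc, cc * exp(x0) * exp(-abs(x)))
hist(samp, breaks = 100, freq = FALSE)
curve(f, from = -8, to = 8, add = TRUE, col = "red")   # density overlay as a check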

+",2013-10-23 22:22:06.500 +58137,19374.0,1,58184.0,,,Diagnosing collinearity in a Cox proportional hazards model,,CC BY-SA 3.0,"

I am building a Cox proportional hazards model to predict the survival outcome of seabirds faced with predation pressure. I have 6 factor variables with two or three levels each that I have predicted to affect survival. Three of these are management relevant (they can be manipulated by wildlife managers to increase or decrease survival if significant). The ultimate goal of the model is prediction, but I would like to include the management-relevant variables as well even if not significant. How can I check for multicollinearity among my variables? I am using R for the analysis.

+",2013-10-23 22:39:16.600 +58138,1506.0,1,58244.0,,,Matching X'X with Wishart Samples in R,,CC BY-SA 3.0,"

$X'X \sim Wishart(\Sigma,n)$, however I'm having a tough time producing this in R.

+ +

Example:

+ +
data=cbind(rnorm(100,10,5),rnorm(100,5,2),rnorm(100,-4,3))
+
+X=cbind(rnorm(1,10,5),rnorm(1,5,2),rnorm(1,-4,3))
+t(X)%*%X
+rWishart(10,99,cov(data))
+
+ +

The data generated from rWishart are not close to $X'X$. What am I doing wrong? The help documentation mentions that $\Sigma$ should be a scale matrix; however, I'm unsure what this is.

+",2013-10-23 22:55:44.577 +58139,8374.0,1,,,,R: Multivariate T distribution Sampling,,CC BY-SA 3.0,"

In the question https://stackoverflow.com/questions/18153450/generating-random-variables-from-the-multivariate-t-distribution, I am confused by why the answer requires we modify the Sigma in such a way that we need to multiply it by (D-2)/D. Here sigma is the covariance matrix for me. The answer also mentions that the correlation matrix is defined when df > 2; shouldn't it be df >= 2? This is because the correlation coefficient can't be calculated when the data is continue itself one, there must be more than 1 series. Am I interpreting this correctly?

+",2013-10-23 23:15:32.967 +58140,22449.0,1,,,,How do you derive the Success-Run Theorem from the traditional form of Bayes Theorem?,,CC BY-SA 3.0,"

In my industry it is common to test a sample of 20-30 and then use that data to draw conclusions about the reliability of the product with a certain confidence. We have tables for such things but it appears that for the case of 0 failures in the sample, the ""Success Run Theorem"" is used. In my references this appears as:
$$R_c = (1-C)^{\frac{1}{(n+1)}}$$
where $C$ = confidence level, $R$ = reliability at confidence level $C$, and $n$ = sample size.
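
For example (my own numerical illustration, not from my references), with a sample of $n=30$ successes and no failures at $C=0.90$:

C <- 0.90; n <- 30
(1 - C)^(1 / (n + 1))   # demonstrated reliability, roughly 0.928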

+ +

However, I cannot find an explanation of how to get to the above equation from Bayes' theorem:

+ +

+ +

Every attempt to talk myself through Bayes theorem to arrive at the Success Run Theorem gets me confused. Even more confusing is when I try to extend my understanding to cases where some failures are observed in the sampling. Then I know to use this formula:

+ +

+

+ +

But again I don't understand where it comes from (binomial?) or how it relates to the above two other formulas, if at all.

+ +

My specific question would be how you go from Bayes Theorem (written as probabilities) to the Success Run Theorem (written as confidence, reliability, sample size)?

+ +

Thank you for helping a poor engineer lost in the world of stats.

+",2013-10-23 23:21:54.180 +58141,10469.0,1,58146.0,,,How do I test that two continuous variables are independent?,,CC BY-SA 3.0,"

Suppose I have a sample $(X_n,Y_n), n=1..N$ from the joint distribution of $X$ and $Y$. How do I test the hypothesis that $X$ and $Y$ are independent?

+ +

No assumption is made on the joint or marginal distribution laws of $X$ and $Y$ (least of all joint normality, since in that case independence is identical to correlation being $0$).

+ +

No assumption is made on the nature of a possible relationship between $X$ and $Y$; it may be non-linear, so the variables are uncorrelated ($r=0$) but highly co-dependent ($I=H$).

+ +

I can see two approaches:

+ +
  1. Bin both variables and use Fisher's exact test or G-test.
     • Pro: use well-established statistical tests
     • Con: depends on binning
  2. Estimate the dependency of $X$ and $Y$: $\frac{I(X;Y)}{H(X,Y)}$ (this is $0$ for independent $X$ and $Y$ and $1$ when they completely determine each other).
     • Pro: produces a number with a clear theoretical meaning
     • Con: depends on the approximate entropy computation (i.e., binning again)
+ +

Do these approaches make sense?

+ +

What other methods do people use?

+",2013-10-23 23:54:16.613 +58142,14748.0,1,,,,Machine-learning input data distribution,,CC BY-SA 3.0,"

I'm trying to build a binary 1/0 ML classification algorithm, and was thinking about how to set up the input dataset. If the events I want to predict (the 1's) occur less frequently in the data than the 0's, does it make sense to pare down the dataset in such a way as to get a more equal distribution of 1's and 0's? Would that be falsely representing the data to the algorithm? What are the disadvantages to doing so?

+",2013-10-24 00:15:29.007 +58162,15658.0,1,58164.0,,,When to use residual plots?,,CC BY-SA 3.0,"

I have performed a simple regression analysis between one dependent variable (DV) and one explanatory variable (IV).

+ +

If the p-value from the regression analysis for the IV is not significant, should I still use residual plots to verify that the regression model used was correct (and the statement of non-significant IV is correct)?

+ +

Or should one use residual plots only for models that include significant IVs?

+",2013-10-24 13:13:57.760 +58143,5448.0,2,,58131.0,,,,CC BY-SA 3.0,"

We don't use MCMC to calculate $p(\theta | y)$ for each value (or many values) of $\theta$. What MCMC (or the special case of Gibbs sampling) does is generate a (large) random sample from $p(\theta | y)$. Note that $p(\theta | y)$ is not being calculated; you have to do something with that vector (or matrix) of random numbers to estimate $p(\theta|y)$. Since you're not calculating $p(\theta|y)$ for lots of values of $\theta$, you don't need a Gibbs (or MCMC) loop inside a $\theta$ loop - just one (long) Gibbs (or MCMC) loop.

+ +

EDIT in response to an update to the question: We do not need to integrate the distribution to get the constant of integration (CoI)! The whole value of MCMC is found in situations where we can't calculate the CoI. Using MCMC, we can still generate random numbers from the distribution. If we could calculate the CoI, we could just calculate the probabilities directly, without the need to resort to simulation.

+ +

Once again, we are NOT calculating $p(\theta|y)$ using MCMC, we are generating random numbers from $p(\theta|y)$ using MCMC. A very different thing.

+ +

Here's an example from a simple case: the posterior distribution for the scale parameter from an Exponential distribution with a uniform prior. The data is in x, and we generate N <- 10000 samples from the posterior distribution. Observe that we are only calculating $p(x|\theta)$ in the program.

+ +
x <- rexp(100)
+
+N <- 10000
+theta <- rep(0,N)
+theta[1] <- cur_theta <- 1  # Starting value
+for (i in 1:N) {
+   prop_theta <- runif(1,0,5)  # ""Independence"" sampler
+   alpha <- exp(sum(dexp(x,prop_theta,log=TRUE)) - sum(dexp(x,cur_theta,log=TRUE)))
+   if (runif(1) < alpha) cur_theta <- prop_theta
+   theta[i] <- cur_theta
+}
+
+hist(theta)
+
+ +

And the histogram:

+ +

+ +

Note that the logic is simplified by our choice of sampler (the prop_theta line), as a couple of other terms in the next line (alpha <- ...) cancel out, so don't need to be calculated at all. It's also simplified by our choice of a uniform prior. Obviously we can improve this code a lot, but this is for expository rather than functional purposes.

+ +

Here's a link to a question with several answers giving sources for learning more about MCMC.

+",2013-10-24 00:41:29.560 +58144,22997.0,2,,32317.0,,,,CC BY-SA 3.0,"

SCaVis data analysis framework has a built-in clustering program with a nice GUI to show clusters and their centers. There is a number of cluster algorithms available (k-means, fuzzy etc.)

+",2013-10-24 01:25:36.453 +58145,449.0,2,,58141.0,,,,CC BY-SA 3.0,"

Rarely (never?) in statistics can you demonstrate that your sample statistic = a point value. You can test against point values and either exclude them or not exclude them. But the nature of statistics is that it is about examining variable data. Because there is always variance, there will necessarily be no way to know that something is exactly unrelated, normal, Gaussian, etc. You can only know a range of values for it. You could know if a value is excluded from the range of plausible values. For example, it's easy to exclude no relationship and give a range of values for how big the relationship is.

+ +

Therefore, trying to demonstrate no relationship, essentially the point value relationship = 0, is not going to meet with success. If you have a range of measures of relationship that are acceptable as approximately 0, then it would be possible to devise a test.

+ +

Assuming that you can accept that limitation it would be helpful to people trying to assist you to provide a scatterplot with a lowess curve. Since you're looking for R solutions try:

+ +
scatter.smooth(x, y)
+
+ +

Based on the limited information you've given so far I think a generalized additive model might be the best thing for testing non-independence. If you plot that with CI's around the predicted values you may be able to make statements about a belief of independence. Check out gam in the mgcv package. The help is quite good and there is assistance here regarding the CI.
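
A minimal sketch of that suggestion (my own toy data, not the asker's):

library(mgcv)
set.seed(1)
x <- runif(200)
y <- sin(2 * pi * x) + rnorm(200, sd = 0.3)   # a clearly nonlinear relationship
fit <- gam(y ~ s(x))
summary(fit)               # approximate significance of the smooth term s(x)
plot(fit, shade = TRUE)    # fitted smooth with a confidence band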

+",2013-10-24 01:53:45.937 +58146,7483.0,2,,58141.0,,,,CC BY-SA 4.0,"

(Answer partially updated in 2023.)

+

This is a very hard problem in general, though your variables are apparently only 1d so that helps. Of course, the first step (when possible) should be to plot the data and see if anything pops out at you; you're in 2d so this should be easy.

+

Here are a few approaches that work in $\mathbb{R}^d$ or even more general settings, to match the general title of the question.

+

One general category is, related to the suggestion here, to estimate the mutual information:

+
    +
  • Estimate mutual information via entropies, as mentioned. In low dimensions with sufficient samples, histograms / KDE / nearest-neighbour estimators should work okay, but expect them to behave very poorly as the dimension increases. In particular, the following simple estimator has finite-sample bounds (compared to most approaches' asymptotic-only properties):
  • +
+
+

Sricharan, Raich, and Hero. Empirical estimation of entropy functionals with confidence. arXiv:1012.4188 [math.ST]

+
+
    +
  • Similar direct estimators of mutual information, e.g. the following based on nearest neighbours:
  • +
+
+

Pál, Póczos, and Szepesvári. Estimation of Rényi Entropy and Mutual Information Based on Generalized Nearest-Neighbor Graphs, NeurIPS 2010.

+
+
    +
  • Variational estimators of mutual information, based on optimizing some function parameterized typically as a neural network; this is probably the "default" modern approach in high dimensions. The following paper gives a nice overview of the relationship between various estimators. Be aware, however, that these approaches are highly dependent on the neural network class and optimization scheme, and can have particularly surprising behaviour in their bias/variance tradeoffs.
  • +
+
+

Poole, Ozair, van den Oord, Alemi, and Tucker. On Variational Bounds of Mutual Information, ICML 2019.

+
+

There are also other approaches, based on measures other than the mutual information.

+
    +
  • The Schweizer-Wolff approach is a classic one based on copula transformations, and so is invariant to monotone increasing transformations. I'm not very familiar with this one, but I think it's computationally simpler but also maybe less powerful than most of the other approaches here. (I vaguely expect it can be framed as a special case of some of the other approaches but haven't really thought about it.)
  • +
+
+

Schweizer and Wolff, On Nonparametric Measures of Dependence for Random Variables, Annals of Statistics 1981.

+
+
    +
  • The Hilbert-Schmidt independence criterion (HSIC): a kernel (in the sense of RKHS, not KDE)-based approach, based on measuring the norm of $\operatorname{Cov}(\phi(X), \psi(Y))$ for kernel features $\phi$ and $\psi$. In fact, the HSIC with kernels defined by a deep network is related to one of the more common variational estimators, InfoNCE; see discussion here.
  • +
+
+

Gretton, Bousquet, Smola, and Schölkopf, Measuring Statistical Independence with Hilbert-Schmidt Norms, Algorithmic Learning Theory 2005.

+
+
    +
  • Statisticians are probably more familiar with the distance covariance/correlation as mentioned here previously; this is in fact a special case of the HSIC with a particular choice of kernel, but that choice is maybe often a better kernel choice than the default Gaussian kernel typically used for HSIC.
  • +
+
+

Székely, Rizzo, and Bakirov, Measuring and testing dependence by correlation of distances, Annals of Statistics 2007.
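
A small R sketch of the distance-covariance route (my own addition, not part of the original answer; assumes the energy package is installed):

library(energy)
set.seed(1)
x <- runif(300, -1, 1)
y <- x^2 + rnorm(300, sd = 0.05)   # dependent but essentially uncorrelated
cor.test(x, y)                     # Pearson test finds nothing
dcov.test(x, y, R = 499)           # permutation test of independence via distance covariance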

+
+",2013-10-24 02:16:41.853 +58147,22998.0,1,,,,Detrending Discrete Data,,CC BY-SA 3.0,"

I am trying to detrend some discrete data and I am having difficulty finding a model to describe the trend. There is a number of discrete data points and there is a linear error being introduced with time. I created some representative data in excel:

+ +

Unbiased:

+ +

+ +

Biased data:

+ +

+ +

The discrete data here are 1, 2, 3 with a positive linear bias over time. Traditional least squares does a poor job, and orthogonal least squares does no better. I tried a few other models with no success. I have been able to visually inspect the trend, select some of the ~2s, perform a linear regression, and then subtract the bias. I would like to do this programmatically across many data sets, so a generic way to do this reliably is essential. Any ideas on how to detrend the data and remove the bias?

+ +

Actual data: data

+",2013-10-24 02:45:42.290 +58148,7155.0,2,,58142.0,,,,CC BY-SA 3.0,"

Define an accuracy metric that reasonably models how you want your algorithm to perform.

+ +

Once you have a metric in hand you can cross-validate this question and see whether it improves performance.

+ +

Some common accuracy metrics that model the problem differently: Normalized mutual information, Gini on the labels argsorted by the probabilities, Precision, Recall, AUC.

+ +

If the classes are extremely unbalanced and FN are crucial, you'll see an improvement.

+",2013-10-24 04:00:00.463 +58149,22564.0,1,58150.0,,,Possible Paradox: Calculating a confidence interval with within-experiment error,,CC BY-SA 3.0,"

This is a spinoff of

+ +

How to calculate the confidence interval of the mean of means?

+ +

and related to

+ +

When making inferences about group means, are credible Intervals sensitive to within-subject variance while confidence intervals are not?

+ +

Dataset 1 here is taken from the first link above. Dataset 2 has approximately the same experimental means but different within-experiment variance. My first question is:

+ +

1) How do I calculate a confidence interval for the overall mean for each of these data sets?

+ +

If I understand @Stéphane Laurent's answer in the two linked questions, they should be the same. If that is true, this goes strongly against all my scientific intuition and also appears to be a paradox.

+ +

2) How can it be that the confidence interval is apparently both sensitive to and not sensitive to within experiment error?

+ +

+ +

dataset 1:

+ +
  Experiment Value
+          1    34
+          1    41
+          1    39
+          2    45
+          2    51
+          2    52
+          3    29
+          3    31
+          3    35
+
+structure(list(Experiment = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 
+3L, 3L, 3L), .Label = c(""1"", ""2"", ""3""), class = ""factor""), Value = c(34, 
+41, 39, 45, 51, 52, 29, 31, 35)), .Names = c(""Experiment"", ""Value""
+), row.names = c(NA, -9L), class = ""data.frame"")
+
+ +

dataset2:

+ +
  Experiment    Value
+          1 38.20744
+          1 37.99410
+          1 37.96299
+          2 49.27085
+          2 49.40519
+          2 49.24894
+          3 31.81259
+          3 31.73708
+          3 31.73834
+
+structure(list(Experiment = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 
+3L, 3L, 3L), .Label = c(""1"", ""2"", ""3""), class = ""factor""), Value = c(38.2074373061779, 
+37.9941025108851, 37.9629896019425, 49.2708491636015, 49.4051867974062, 
+49.2489418702291, 31.8125943239769, 31.7370826901692, 31.7383364604132
+)), .Names = c(""Experiment"", ""Value""), row.names = c(NA, -9L), class = ""data.frame"")
+
+",2013-10-24 04:23:26.413 +58150,449.0,2,,58149.0,,,,CC BY-SA 3.0,"

I think you're confusing things with a manufactured example. Yes, you could have a specific case where the two CI's matched and you could have ones where the Dataset 1 CI was lower but on average the Dataset 1 CI will be higher. Furthermore, if these really are supposed to be multiple experiments tackling the same problem (within each Dataset) then there's something seriously wrong with Dataset 2. Lower within experiment variability should be leading to lower between experiment variability.

+",2013-10-24 04:46:24.827 +58163,23014.0,2,,33598.0,,,,CC BY-SA 3.0,"

H0: the parameters are structurally stable. When your p-value is less than 5%, as in your case (0.0016), you reject H0 (i.e., the null hypothesis). It means there is a structural break in your data. The best thing you can do is check each variable one by one using the Chow test.

+",2013-10-24 13:43:49.420 +58151,668.0,2,,58122.0,,,,CC BY-SA 3.0,"

Correlation, according to its usual definition, is the cosine of the angle between vectors. Least squares regression decomposes a vector into a component within the linear span of a given set of vectors and an orthogonal component (the ""residuals"" or ""errors""). Those two components, being at a right angle to each other, have zero correlation. Therefore,

+ +
    +
  • A meaningful measure of the ""degree of correlation between those parts of $A$ and $B$ that are not correlated with $C$"" is the correlation of the residuals of $A$ and $B$ with each other when regressed separately against $C$.

  • +
  • The part of $A$ that is maximally correlated with $B$ is the projection of $A$ onto $B$ (the least-squares fit) and the part that is ""maximally uncorrelated"" is the residual of the regression of $A$ against $B$.

  • +
+ +
+ +

The example in the question suggests that ""correlation"" may be used there in a less conventional but undefined sense. Note that the residuals of the regressions of $A$ and $B$ on $C$ when all are binary will usually not be binary.
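
A small numerical illustration of the first bullet point above (my own simulated data):

set.seed(1)
n <- 500
C <- rnorm(n)
A <- 0.8 * C + rnorm(n)
B <- 0.5 * C + rnorm(n)
cor(A, B)                                  # inflated by the shared dependence on C
cor(resid(lm(A ~ C)), resid(lm(B ~ C)))    # correlation of the parts orthogonal to C; ~0 here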

+",2013-10-24 05:29:43.127 +58152,22507.0,2,,58105.0,,,,CC BY-SA 3.0,"

Let $z_i$ be the true value of $y_i$. If $z_i \sim N(y_i, \Delta y_i)$, and the theory predicts $z_i=a x_i^2$, then you should minimize $$\sum_i \left( {a x_i^2 + b x_i + c - y_i \over \Delta y_i} \right)^2$$

+ +

This is a weighted linear regression problem.
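
In R this is just lm() with weights $1/\Delta y_i^2$; a hedged sketch with made-up data (not part of the original answer):

set.seed(1)
x  <- seq(0, 10, length.out = 40)
dy <- runif(40, 0.5, 2)                       # known measurement errors (Delta y_i)
y  <- 2 * x^2 + rnorm(40, sd = dy)            # true curve: a*x^2 with a = 2, b = c = 0
fit <- lm(y ~ I(x^2) + x, weights = 1 / dy^2) # minimises the weighted sum of squares above
summary(fit)                                  # a near 2, b and c near 0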

+",2013-10-24 07:24:14.877 +58153,22957.0,1,58155.0,,,How to specify/restrict the sign of coefficients in a GLM or similar model in R,,CC BY-SA 3.0,"

The situation: I'm struggling with a predictive analysis of food sales prices using a generalized linear model. My dataset contains different kinds of food (cheeses, vegetables, meats, spices etc.) and hence I am splitting the dataset completely by these kinds when doing the analysis, because they are very different by nature.

+ +

The current model: The dataset/model contains both factors such as ""country of production"" and numeric variables such as ""transport distance"", which are all used in the gamma-based GLM in R.

+ +

The problem: Now in general my model fits pretty well; however, in rare cases some of the metric variables get the opposite sign (+/-) from what you would expect, because the model somehow catches other influences.

+ +

An example: An example would be spices. All spices have a relatively long ""transport distance"" and a relatively long shelf life and hence a pretty small impact on the sales price compared to e.g. meat. So in this case the model might by accident end up giving the ""transport distance"" variable a small but negative value - which is of course wrong because it would mean that the longer the distance the food was transported, the lower the price would be.

+ +

My question: What kind of model should I use in R if I want something similar to a GLM model but I want to be able to specify restrictions on some of the variables/coefficients? E.g. if I want to say that an increased ""transport distance"" should ALWAYS have a positive impact on the sales price?

+ +

Ideas: I have heard about both ""Bayesian GLM"" models and using a so-called ""prior distribution"", but I have no idea which one, if any, would be best to use.

+ +

UPDATE: The answer below by @ACD is not exactly what I'm looking for. I don't need an explanation of WHY this occurs, I need a solution for restricting the coefficient signs :-)

+",2013-10-24 07:41:50.433 +58154,22339.0,1,,,,Graphical Multiple Linear Regression in Stata,,CC BY-SA 3.0,"

Consider this standard linear regression model:

+ +

$Y = \beta_0+\beta_1X_1+\cdots+\beta_pX_p+\epsilon$

+ +

I've loaded such a dataset into Stata 12.0, so I have some variables $y,x_1,x_2,\dots,x_p$. How do I produce a plot, like I would with . scatter y x for a simple linear regression model?

+",2013-10-24 08:04:22.503 +58155,12787.0,2,,58153.0,,,,CC BY-SA 3.0,"

The negative estimated coefficient on something that you KNOW is positive comes from omitted variable bias and/or collinearity between your regressors.

+ +

For prediction, this isn't so problematic, so long as the new data whose outcome (price?) you predict are sampled from the same population as your sample. The negative coefficient comes because the variable is highly correlated with something else, making the coefficient estimate highly variable, OR because it is correlated with something important that is omitted from your model, and the negative sign is picking up the effect of that omitted factor.

+ +

But it sounds like you are also trying to do inference -- how much does an exogenous change in $X$ change $Y$. Causal inferential statistics uses different methods and has different priorities than predictive statistics. It is particularly well developed in econometrics. Basically you need to find strategies such that you can convince yourself that $E(\hat\beta|X,whatever)=\beta$, which generally involves making sure that the regressor of interest is not correlated with the error term, which is generally accomplished by controlling for observables (or unobservables in certain cases). Even if you get to that point however, collinearity will still give you highly variable coefficients, but negative signs on something that you KNOW is positive will generally come with huge standard errors (assuming no omitted variable bias).

+ +

Edit: if your model is

+ +

$$price = g^{-1}\left(\alpha + country'\beta + \gamma distance + whatever + \epsilon\right)$$

+ +

then country will be correlated with distance. Hence, if you are in Tajikistan and you are getting a spice from Vanuatu, then the coefficient on Vanuatu will be really high. After controlling for all of these country effects, the additional effect of distance may well not be positive. In this case, if you want to do inference and not prediction (and think that you can specify and estimate a model that gives a causal interpretation), then you may wish to take out the country variables.

+",2013-10-24 08:10:26.823 +58156,,1,,,user31790,How to compute whether the mean of a non-random sub-sample is statistically different from the mean of the sample?,,CC BY-SA 3.0,"

I have a variable called ""obs"" and from this variable I generated a new variable called ""obs_sub"" by excluding all observations for which a dummy variable is equal to one. Now what I want to know is if I remove these observations whether the mean of the sub-sample is equal to the mean of the sample. This is my null hypothesis.

+ +

I could run a regression of $E[obs|d] = \alpha + \beta \cdot d$, but the regression with the dummy $(d)$ will only tell me whether the coefficient on the dummy is statistically significant from zero. It will not tell me whether the data are unlikely given $E[obs|d=1] = E[obs]$. Or is the null hypothesis $\beta=0$ the same as the null hypothesis $E[obs|d=1] = E[obs]$?

+ +

Cheers,

+ +

Martin

+",2013-10-24 08:43:06.080 +58157,12683.0,2,,58119.0,,,,CC BY-SA 3.0,"

It might be true for your example (or it might not—plant No. 4 could be an etiolated seedling elongating its stem in a hunt for light without expanding its girth), but it's a matter of prior botanical knowledge rather than any general statistical law. It's not hard to construct counter-examples: someone who's an outlier on hours per week spent exercising is less likely to be an outlier on blood pressure. Any type of dependence is possible in principle—Chaos Theory doesn't say the butterfly that flaps its wings hardest causes the biggest hurricane (I don't know what Zen Buddhism says). A general theory that ranges over all measurements anyone might want to make on all kinds of things is going to be elusive.

+ +

If you consider a world with a well-defined set of objects, $a_1, a_2, \ldots$, & a well-defined set of measurements we can make on each object, $x_{11}, x_{12}, \ldots, x_{21}, x_{22}\ldots$, then your question could perhaps be framed in an answerable way (""Pick $n$ objects at random from the $a$s, pick two measurements at random from the $x$s, ...""); for our world I don't see how it can be.

+",2013-10-24 10:04:05.427 +58158,503.0,2,,58156.0,,,,CC BY-SA 3.0,"

A better null hypothesis (and substantively equivalent) is to see whether the subsample's mean is equal to the mean of all the other cases. That is, divide the sample into two mutually exclusive subsamples. Then you can run a t-test between them (or some nonparametric variation, if need be).

+ +

If this variation of the null isn't workable, could you say why not?

+ +

And, if it isn't, you could do a permutation test
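
A minimal sketch of such a permutation test (my own toy data, not the asker's):

set.seed(1)
obs <- rnorm(100)
d   <- rbinom(100, 1, 0.3)                        # d = 1 marks the excluded observations
observed <- mean(obs[d == 0]) - mean(obs[d == 1])
perm <- replicate(9999, {
  ds <- sample(d)                                 # shuffle group labels
  mean(obs[ds == 0]) - mean(obs[ds == 1])
})
mean(abs(perm) >= abs(observed))                  # approximate two-sided p-value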

+",2013-10-24 10:54:18.750 +58164,750.0,2,,58162.0,,,,CC BY-SA 3.0,"

They are still useful in assessing whether the relationship between the explanatory variables and the dependent variable is linear (or modeled properly given the equation). For an extreme example, I generated some data with a quadratic relationship and fit a linear regression of the form $Y = \alpha + \beta(X) + e$. (Because the parabola is approximately centered on zero, $\beta$ is insignificant in the equation.)

+ +

+ +

If you plot $X$ versus the residuals though the quadratic relationship is still very clear. (Imagine just detilting the first plot.)

+ +

+ +

I'm sure you can dream up other scenarios in which regression coefficients are insignificant but examining the residuals will show how the model is inadequate.
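
Something along these lines reproduces the idea (my own simulated data, not the exact data behind the figures):

set.seed(1)
x <- seq(-3, 3, length.out = 100)
y <- x^2 + rnorm(100)
fit <- lm(y ~ x)
summary(fit)$coefficients["x", ]              # slope is tiny and insignificant
plot(x, resid(fit)); abline(h = 0, lty = 2)   # the quadratic pattern is obvious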

+",2013-10-24 13:44:19.557 +58165,22479.0,1,,,,Combining posterior probabilities from multiple classifiers,,CC BY-SA 3.0,"

I am new to machine learning and can't get my head around this problem. I have two patient datasets, the first ($D_1$) contains $Y,Z,X$ that convey blood-sample information and the second ($D_2$) contains $W,T,X$ that convey x-ray information. In both datasets, $X$ is the common diagnostic output.

+ +

Since there are two distinct datasets, I modelled a solution with two Naive Bayes models as below, where $X$ is the common dependent variable. ( I am aware I can use other techniques than Naive Bayes but that is not the aim of my question.)

+ +
    +
  • $P(X|Y,Z,D_1) \propto P(Y|X,D_1) \cdot P(Z|X,D_1) \cdot P(X,D_1)$
  • +
  • $P(X|W,T,D_2) \propto P(W|X,D_2) \cdot P(T|X,D_2) \cdot P(X,D_2)$
  • +
+ +

I want to combine the posterior probabilities of X in these models: $P(X|Y,Z)$ and $P(X|W,T)$ to determine an overall outcome probability $P(X|Y,Z,W,T)$ (diagnostic outcome) of a new patient.

+ +

How can I combine these probabilities?

+ +

Can it be done as below?

+ +

$P(X|Y,Z,W,T) \propto \frac{P(X|Y,Z) * P(X|W,T)}{P(X,D_{combined})}$

+ +

If so, what is the relationship between the priors $P(X,D_1)$, $P(X,D_2)$ and $P(X,D_{combined})$?

+",2013-10-24 14:06:01.187 +58166,15827.0,2,,58162.0,,,,CC BY-SA 3.0,"

Assume for simplicity that you have fitted some line $\hat y = b_0 + b_1 x$ given a dependent or response variable $y$ and a predictor or independent variable $x$. This specific assumption can be relaxed, which we will get to in good time.

+ +

With one variable on each side, a residual plot (meaning, a plot of residual $y - \hat y =: e$ versus fitted or predicted $\hat y$) in principle shows just the same information as a scatter plot with regression line superimposed. On the latter, the residuals are just the vertical differences between the data points and the line and the fitted are the corresponding values on the line, i.e. for the same value of $x$.

+ +

In practice, a residual plot can make structure in the residuals more evident:

+ +
    +
  • The regression line is rotated to the horizontal. Seeing structure in anything is easiest when the reference indicating no structure is a horizontal straight line, here the line $e = 0$.

  • +
  • There is better use of space.

  • +
+ +

In this easy example, some structure in the residuals is discernible in the scatter plot

+ +

+ +

but even easier to see in the residual plot:

+ +

+ +

The recipe here was simple. The data were fabricated as a quadratic plus Gaussian noise, but the quadratic is only roughly captured by the naive linear fit.

+ +

But it is still generally true that structure is easier to see on a residual plot. Some caution is needed in not over-interpreting residual plots, especially with very small sample sizes. As usual, what you spot should make scientific or practical sense too.

+ +

What if the fitted is more complicated than $b_0 + b_1 x$? There are two cases:

+ +
    +
  • Everything can still be shown on a scatter plot, e.g. the right-hand side is a polynomial or something in trigonometric functions of $x$. Here, if anything, the residual plot is even more valuable in mapping everything so that zero residual is a reference.

  • +
  • The model uses two or more predictors. Here also the residual plot can be invaluable as a kind of health check showing how well you did and what you missed.

  • +
+ +

The health check analogy is a fair one more generally: Residual plots can help you spot if something is wrong. If nothing is evidently wrong, no news is good news, but there is no absolute guarantee: something important may have been missed.

+ +

On whether the predictor had a significant effect, I know of no rule whatever for drawing or not drawing a residual plot. In the concocted example here, significance levels and figures of merit such as $R^2$ are extremely good, but the straight line model still misses a key part of the real structure. Conversely, a residual plot often illuminates why a model failed to work: either the pattern really is all noise, so far as can be seen, or your model misses something really important, such as some nonlinearity.

+ +

Footnote: for many statistical people IV means instrumental variable, not independent variable.

+",2013-10-24 14:12:26.123 +58167,1406.0,2,,58109.0,,,,CC BY-SA 3.0,"

If you hold everything else constant, you assume that it is constant. So it does not matter that the independent variables are correlated and they might change when you change your variable of interest. The assumption was that everything else is constant. It is perfectly ok to question this assumption, but the interpretation holds nevertheless. Whether you should care about it is another matter.

+",2013-10-24 14:23:10.210 +58168,5556.0,2,,57867.0,,,,CC BY-SA 3.0,"

This fact is called the data processing inequality. If the variables $X$, $O_1$ and $O_2$ form a Markov chain ($X \rightarrow O_1\rightarrow O_2$), then $Var [X|O_1] \leq Var[X|O_2] $, with equality iff $E[X|O_1] = E[X|O_2] $.

+ +

You can prove it by using the Markov property and the fact that conditioning decreases variance (which is itself the consequence of the law of total variance).

+ +

See Theorem 11 in ""Functional Properties of Minimum Mean-Square Error and Mutual Information"", +Y. Wu, S. Verdú, IEEE Trans. Info. Theory, March 2012

+ +

http://www.princeton.edu/~verdu/reprints/WuVerIT2012f.pdf

+",2013-10-24 15:26:10.970 +58169,20304.0,1,,,,Machine Learning problem - identifying fake fraudulent names,,CC BY-SA 3.0,"

I have a dataset of fraudulent orders from some business. Each order has a bunch of features such as order_amount, address, state, city, phone_number, and name. Obviously a criminal would not be using his/her real name when making a fraudulent order. So I was wondering if there was any sort of machine learning strategy to identify fake names. I assume there must be some sort of underlying structure to how fake names are selected - so understanding this structure could allow me to identify them. Unless the fake names are completely randomly selected. Any thoughts on how to do this?

+",2013-10-24 15:55:46.143 +58170,23017.0,2,,20667.0,,,,CC BY-SA 3.0,"

This Matlab script generates 2D data for clustering. It accepts several parameters so that the generated data is within user requirements.

+",2013-10-24 15:59:38.697 +58171,20304.0,2,,58131.0,,,,CC BY-SA 3.0,"

MCMC is a family of sampling methods (Gibbs, MH, etc.). The point of MCMC is that you cannot sample directly from the posterior distribution that you mentioned. The way MCMC works is a Markov Chain (the first MC in MCMC) is identified whose stationary distribution is the posterior that you are interested in. You can sample from this Markov Chain and when it converges to its equilibrium distribution, you are essentially sampling from the posterior distribution that you are interested in.

+",2013-10-24 16:02:48.610 +58172,23019.0,1,,,,ANOVA versus nonlinear fit,,CC BY-SA 3.0,"

I have a data set which looks something like this (not real data):

+ +
conc   Resp
+0      5
+0.1    18
+0.2    20
+0.3    23
+0.4    24
+0.5    24.5
+0      5
+0.1    17
+..     ..
+
+ +

which happens to fit perfectly to the Michaelis-Menten equation:

+ +
+

Resp = max_value * conc / (conc_value_at_half_max + conc)

+
+ +

Even though it is something else entirely, the important point is that the response increases quickly with ""conc"" and then reaches a ceiling or max value of sorts.

+ +

Anyway, I want to know how low I can go in ""conc"" before the value of ""Resp"" is not significantly lower than the max value.

+ +

Using a simple ANOVA accomplishes this nicely, but I was thinking: ""should I not be exploiting the fact that the structure of the data is so nicely explained by a known equation?"" Is there such a way?

+ +

I am using Minitab for this because it is easier, but I could work in R as well.
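
One possibility (my own addition, not part of the original question) is to fit the Michaelis-Menten form directly as a nonlinear regression, e.g. with nls() in R; a sketch on simulated data (the real data would replace conc and Resp, and the starting values are guesses):

set.seed(1)
conc <- rep(c(0.05, 0.1, 0.2, 0.4, 0.8, 1.6), each = 3)
Resp <- 25 * conc / (0.1 + conc) + rnorm(length(conc), sd = 0.5)
fit  <- nls(Resp ~ Vmax * conc / (Km + conc), start = list(Vmax = 20, Km = 0.2))
summary(fit)   # estimates and standard errors for Vmax (the ceiling) and Km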

+",2013-10-24 17:03:46.683 +58173,1575.0,1,,,,Overlap in time series training sets,,CC BY-SA 3.0,"

I have a time series prediction problem where the aim is to forecast the average value of $y_t$ over the next $T$ periods, given all the information available up to point $t$. For example, I want to forecast

+ +

$$\bar{y}_t = \frac{1}{T}\sum_{k=1}^T y_{t+k}$$

+ +

as a function of a bunch of other variables $x_t$ which are available at time $t$.

+ +

When building a training set from the data, I could ensure that I have no overlapping responses by considering

+ +

$$t = 0, T, 2T, 3T, \dots$$

+ +

However, I feel that this may not be making best use of the available data, and result in models with a lot of variance. An alternative is to use overlapping responses, for example

+ +

$$t=0, \tfrac{1}{2}T, T, \tfrac{3}{2}T, \dots$$

+ +

but I worry that this may create a lot of bias in the trained model.

+ +

Are there known results about how using overlapping data affects the bias/variance tradeoff? Is there a ""best"" level of overlap to use?

+",2013-10-24 17:08:18.863 +58174,21804.0,2,,58160.0,,,,CC BY-SA 3.0,"

Each cluster center is a point in a space with as many dimensions as there are extracted factors from your PCA. The row you report is the value of one dimension for each of the cluster centers. If you want to report on the position of the cluster centers, then you need the full coordinates. It is misleading, though, to say that a factor ""influences"" a cluster, as the clusters are built based on factor values; in other words there would be no clusters if you did not start out with those factors.

+ +

Whether 0 is a special position for a cluster center depends on the nature of the input variables, but even if you assume that 0 represents the mean and median for the values on this dimension, it is informative. To give an intuitive demonstration of why a value of 0 is informative: imagine a cluster analysis based on a one-dimensional variable and you learn that the cluster centers are located at -3, 0, and 5. Then you know something about the relative position of the cluster centers. If you ignored the factor you could not even describe the middle cluster.

+ +

I think the question might be motivated by the practice of dropping small factor loadings when reporting the results of PCAs, which is offered by some statistical packages (and of dubious value in many practical applications).

+",2013-10-24 17:14:04.797 +58175,19089.0,2,,57670.0,,,,CC BY-SA 3.0,"

As far as I can tell, the fact that these are networks or graphs has no bearing on the problem you had in mind. Instead, it sounds like you have three binomial trials, with $n$ values $n_1, n_2, n_3$, respectively, and estimated proportions $\hat p_1 = \frac{a}{n_1}$, $\hat p_2 = \frac{d}{n_2}$, $\hat p_3 = \frac{g}{n_3}$, respectively, for the first criterion you're interested in. These you should be able to analyze with the usual $t$-test.

+ +

Beyond this, if you want to test all three graphs at once, recognize that you basically have a two-way anova. The blocks are which graph you're in, and the parameters are the treatment. The responses are of course 0 and 1, which you might consider remapping to -1 and 1. I recommend researching two-level factorial designs, as in Chapter 29 of Applied Linear Statistical Models

+",2013-10-24 18:07:45.343 +58176,12756.0,1,58178.0,,,Calculation of probability of selling a Poisson number of products,,CC BY-SA 3.0,"

You sell 10 products on average every day. The model is based on a Poisson distribution. To be able to cover the indirect costs, the manager needs to sell 8 products per day.

+ +

Two questions:

+ +
+
    +
  1. What is the probability to sell between 0 - 5 products in a day?

  2. +
  3. What is the probability to have at least 6 products in a single day?

  4. +
+
+",2013-10-24 19:04:53.360 +58177,23018.0,1,,,,Sample size analysis for a study,,CC BY-SA 3.0,"

I am a clinical researcher and orthopedic surgeon, and one of my PhD students is starting on the last part of her thesis. Her work is centered around the potential benefits of interprofessional teaching at an orthopedic ward. The study is a retrospective cohort study on register data for hip fracture patients. Our hypothesis is that an interprofessional ward (with medical students and nurse students guided by real doctors and nurses) does not put patients at higher risk for adverse events than a control ward. As the primary outcome variable we have a proxy variable for adverse events: the readmission rate at 3 months. The number of patients treated at the interprofessional ward is about 1:4 compared to the control ward.

+ +

My question: Assuming a 3-month readmission rate of about 20% (about normal for this patient group) in the control ward, how large a sample size do I need when I aim at detecting a 5% difference in readmission rates? This is for 80% power and $p \le 0.05$.

+ +

When performing this calculation as a superiority calculation for proportions in, for instance, SamplePower 3, I wind up at approximately 500 and 2000 in the interprofessional and control ward, respectively. Is this calculation correct, or should I do a non-inferiority analysis for proportions instead, where the minimum clinical difference I am interested in is 5%? I can't find any such non-inferiority calculators where the numbers of subjects in the two groups differ, as in this study.

+",2013-10-24 19:08:11.140 +58178,21182.0,2,,58176.0,,,,CC BY-SA 3.0,"

For a random variable $X \sim \textrm{Poisson}(\lambda)$, the probability of seeing a value of $k$ is:
$$P_k = \frac{e^{-\lambda} \lambda^k}{k!}$$
So the probability of selling less than or equal to $5$ products in a given day, where the long-term average is $\lambda = 8$, is:
$$P_{0\ldots5}=\sum_{i=0}^5\frac{e^{-8}\, 8^i}{i!}$$

+ +

The probability of at least $6$ is everything other than $0$ to $5$ or $(1-P_{0\ldots5})$ for which you were asked in the first part.
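
The same quantities can be checked directly in R (my own addition; this uses $\lambda = 8$ as above, substitute 10 if you read the question's long-run average as the rate):

ppois(5, lambda = 8)       # P(X <= 5)
1 - ppois(5, lambda = 8)   # P(X >= 6)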

+",2013-10-24 19:18:01.523 +58179,22564.0,1,,,,What is the best way to analyze data if your experimental design changes while running the experiment?,,CC BY-SA 3.0,"

I imagine this is a somewhat common situation in practice. I am thinking mainly in terms of preclinical drug trials.

+ +

1) During the course of the study a new technique is learned or some time/money is freed up to be able to use a new technique as a secondary outcome.

+ +

2) Some key piece of equipment breaks, the lab member with the skill to perform a technique leaves, or a planned technique is looking like it is yielding inconsistent results for unknown reasons so that outcome is dropped for financial reasons.

+ +

3) When time comes to analyze data it comes to the knowledge of the researcher that the planned methods are inadequate/inappropriate and they wish to use a different approach.

+ +

4) Experiment 1 (e.g. test drug A) does not look like it will pan out, so a financial decision is made to stop that experiment and instead test drug B. Alternatively some other part of the experiment may be modified such as cell culture or animal strain.

+ +

5) Study is stopped prematurely or additional funds are freed up to increase sample size.

+ +

I can think of more, but you get the idea. Are p values of any use under these circumstances? Should corrections for multiple comparisons be made in the case of #4? At what point does deviation from the plan invalidate the hypothesis testing procedure?

+ +

The best way to me would be to simply plot the data and describe it in the hopes it may be useful for someone. However this behaviour is discouraged in favor of performing significance/hypothesis tests and the data may not be published if these are not included.

+ +

EDIT: +I was thinking about it and the actual best thing to do is to take the data and come up with a model to explain it that makes a precise prediction that can then be tested. It seems like this should always be the best thing to do, though.

+",2013-10-24 19:26:11.043 +58180,1809.0,1,58182.0,,,Chi Square Test for Independence in R and Python,,CC BY-SA 3.0,"

Consider the following R code and output:

+ +
row1 = c(0,23,0,0)
+row2 = c(0,1797,0,0)
+data.table = rbind(row1, row2)
+chisq.test(data.table)
+
+    Pearson's Chi-squared test
+
+data:  data.table
+X-squared = NaN, df = 3, p-value = NA
+
+ +

Now consider the same in Python:

+ +
import scipy.stats
+scipy.stats.chi2_contingency([[0,23,0,0], [0,1797,0,0]])
+
+Traceback (most recent call last):
+  File ""<stdin>"", line 1, in <module>
+  File ""/usr/lib/python2.7/dist-packages/scipy/stats/contingency.py"", line 236, in
+     chi2_contingency
+    ""frequencies has a zero element at %s."" % zeropos)
+ValueError: The internally computed table of expected frequencies has a zero element at [0, 0, 0, 1, 1, 1].
+
+ +

Is this expected behaviour? Should I just trap for the error in Python. A search for the message ""The internally computed table of expected frequencies has a zero element at"" did not reveal anything useful.

+",2013-10-24 19:42:29.797 +58181,22.0,2,,58172.0,,,,CC BY-SA 3.0,"

I think you should rethink the question.

+ +

Why would it possibly matter to find the highest concentration where the response is far enough below the maximum plateau to give a P value less than some arbitrary threshold?

+ +

If you added more concentrations, or collected replicate values, or obtained cleaner (smaller experimental error) data, then you'd reach a different conclusion about the largest concentration that gives a response ""significantly"" less than the plateau. So the answer to that question is partly determined by details of the experimental design that you can change. The answer tells you nothing fundamental about the system.

+ +

As with most questions in statistics, I urge you to set aside the word ""significant"" and try to articulate the question you want answered in scientific terms.

+",2013-10-24 20:15:11.560 +58182,449.0,2,,58180.0,,,,CC BY-SA 3.0,"

They're both errors but in R it just reported NaN.

+ +

The reason they are errors likely has to do with divide by 0 issues. You must have some kind of count in each cell, typically at least 4-7 is preferred. See any online article on the assumptions and requirements of a chi-square test. It tests independence but it can't do so with no data in either cell in a 2 by X design.

+ +

If the problem is just that python will exit then, by all means, trap the error.

+",2013-10-24 20:17:03.853 +58183,23021.0,1,58185.0,,,Maximum Likelihood for shifted Geometric Distribution,,CC BY-SA 3.0,"

Really struggling with this please help.

+ +

Find MLE for p and c

+ +

\begin{equation} +\ {f}(x,p,c) = (1-p)^{x-c}p +\end{equation}

+ +

x=c,c+1,c+2,.....

+ +

p is between 0 and 1

+ +

c is an element of the integers

+ +

I am more interested in the mle for c

+ +

For the MLE of p, is it fine to just take the derivative of the log-likelihood function?

+",2013-10-24 20:21:40.683 +58184,21762.0,2,,58137.0,,,,CC BY-SA 3.0,"

(Almost) perfect multicollinearity (MC) will lead to large standard errors of the estimates and/or non-convergence of the optimization routines. Any other MC is no issue since your goal is prediction (and not interpretation of effects).
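
One way to check in R, sketched on the survival package's lung data rather than the seabird data (my own addition; VIFs depend only on the covariates, so an auxiliary linear model among them is enough, and for factor covariates a generalized VIF would be the analogue):

library(survival)
d <- na.omit(lung[, c("age", "sex", "ph.karno", "pat.karno")])
vif_manual <- sapply(names(d), function(v) {
  r2 <- summary(lm(reformulate(setdiff(names(d), v), v), data = d))$r.squared
  1 / (1 - r2)                     # VIF_j = 1 / (1 - R^2_j)
})
vif_manual                         # values well above ~5-10 suggest problematic collinearity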

+",2013-10-24 20:26:39.817 +58185,22399.0,2,,58183.0,,,,CC BY-SA 3.0,"

This should get you started:

+ +

$L = \prod_i (1-p)^{x_i-c} p$

+ +

To maximize the above we work with the log-likelihood:

+ +

$LL = \sum_i \Big( (x_i-c) \log(1-p) + \log(p) \Big)$
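
A quick numerical sketch of where the hint leads (my own addition, not part of the original answer): since $\log(1-p) < 0$, the log-likelihood increases in $c$ subject to $c \le \min_i x_i$, so $\hat c = \min_i x_i$; setting $\partial LL/\partial p = 0$ then gives $\hat p = 1/(\bar x - \hat c + 1)$.

set.seed(1)
x <- 3 + rgeom(10000, prob = 0.3)    # shifted geometric with c = 3, p = 0.3
c_hat <- min(x)
p_hat <- 1 / (mean(x) - c_hat + 1)
c(c_hat = c_hat, p_hat = p_hat)      # close to (3, 0.3)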

+",2013-10-24 20:34:35.450 +58186,19388.0,1,,,,Regression estimate of a non-negative variable,,CC BY-SA 3.0,"

I have to estimate linear weights $\beta$ for the regression $Y \sim \mathbf{X}$, where $Y$ are non-negative samples. If I perform vanilla regression (let's assume ridge regression) it will find $\beta$ such that most of the estimated $\hat{Y}$ are non-negative. But some of them will be negative, and I need them to be invariably non-negative. In a Bayesian framework, this implies I need a prior on $\beta$ that is non-negative (ridge assumes normal priors). Is there a simple way to modify ridge regression to obtain this? Or is putting non-negative priors and estimating via MCMC the only solution?

+ +

To further describe the problem (why I need the estimated $\hat{Y}$ to be positive): my full model is a hierarchical model where there is a variable $Z$ that is a linear model of the latent variable $Y$, i.e. $Z \sim N(\Theta^T \mathbf{Y}, \sigma^2)$. The observable variables are $Z$ and $X$. I need estimates of $\beta$ and $\Theta$. I am currently estimating $\beta$ given $Y$; given $\hat{Y}$, I am estimating $\Theta$. I am iterating between these 2 steps. If $\hat{Y}$ is negative the problem becomes unidentifiable.

+",2013-10-24 21:16:19.347 +58187,23021.0,1,,,,Challenging Likelihood Ratio Test,,CC BY-SA 3.0,"

Derive Likelihood Ratio Test of size $\alpha$. H$_0$: $\theta=\theta_0$ H$_1$:$\theta \neq \theta_0$

+ +

\begin{equation} +{f}(x,\theta , c) = \theta(x-c)^{\theta-1} \ {c<x<c+1} +\end{equation}

+ +

I am having trouble with the MLE for $c$. We have to use order statistics and I am not sure which is the correct one to use.

+",2013-10-24 21:26:54.730 +58188,1930.0,1,58195.0,,,regarding conditional independence and its graphical representation,,CC BY-SA 3.0,"

When studying covariance selection, I once read the following example. With respect to the following model:

+ +

+ +

Its covariance matrix and inverse covariance matrix are given as follows,

+ +

+ +

I do not understand why the independence of $x$ and $y$ is determined by the inverse covariance here.

+ +

What is the mathematical logic underlying this relationship?

+ +

Also, the left graph in the following figure is claimed to capture the independence relationship between $x$ and $y$; why?

+ +

+",2013-10-24 21:32:11.820 +58189,5213.0,1,,,,error of the mean in presence of background,,CC BY-SA 3.0,"

Suppose I have a normal distribution, $N[\mu,\sigma]$, and I have a sample of size $n$. It is well known that the error (std deviation) of the mean is $\sigma/\sqrt{n}$.

+ +

Now suppose that my distribution is a mixture of signal (normal distributed) and a flat background +$$\text{pdf}[x|\mu, \sigma, s, b] = \frac{sN[x|\mu,\sigma] + b U}{s+b} $$

+ +

suppose that the uniform distribution is much wider than the normal but in the signal window the background is important. Question: what is the error of $\mu$ now?

+ +

($s$ and $b$ are random variables, they can be assumed to be Poisson distributed)

+",2013-10-24 21:35:19.270 +58190,23022.0,1,58206.0,,,1-d extension of 2-d variance?,,CC BY-SA 3.0,"

Let X = {X1, ..., Xm} be a random variable and Var(X) its m x m variance-covariance matrix. Is there an accepted 1-d statistic, like variance, that may be extracted from the sample variance of data representing a 2-d random variable?

+ +

(explanation of the 2d equivalent of a sample variance) +How to find variance between multidimensional points?

+",2013-10-24 21:38:34.937 +58191,5671.0,2,,58160.0,,,,CC BY-SA 3.0,"

Be really careful when using PCA on discrete values such as likert scales. PCA is designed for continuous variables, not for discrete values.

+ +

There is a high chance that you will discover artifacts from the discrete scale.

+ +

In fact, the vector looks suspiciously like the frequencies of the 5 answers, or something like that...

+ +

If you would share more of what you've been doing, it would be easier to help you.

+",2013-10-24 21:40:44.713 +58192,22936.0,1,,,,Entropy of Inverse-Wishart distribution,,CC BY-SA 3.0,"

What is the entropy of the Inverse-Wishart distribution? I need just a reference, but derivation (e.g. using inverse property) would be interesting too.

+",2013-10-24 22:01:38.600 +58193,503.0,2,,58180.0,,,,CC BY-SA 3.0,"

The problem is not that there are 0 cells, the problem is that only one column has any data. E.g

+ +
row1 = c(100,23,0,100)
+row2 = c(0,1797,100,0)
+data.table = rbind(row1, row2)
+chisq.test(data.table)
+
+ +

works fine

+ +

and

+ +
row1 = c(10,0,0,100)
+row2 = c(0,1797,100,0)
+data.table = rbind(row1, row2)
+chisq.test(data.table)
+
+ +

gives only a notice that the approximation may be incorrect - here an exact test should be used.

+ +

Even

+ +
row1 = c(23,0,0,0)
+row2 = c(0,1797,0,0)
+data.table = rbind(row1, row2)
+chisq.test(data.table)
+
+ +

gives only that same warning.

+",2013-10-24 22:49:10.423 +58194,2352.0,1,,,,How should I approach modeling these subjective probability estimates?,,CC BY-SA 3.0,"

In my data, about 1000 people have made estimates of the probability of 100 unique events. On average people forecast on about 50 events, but some forecast on all of the events and some on only a few. About 500 of these people are working in teams of 15. They see each other's forecasts and communicate with each other, but each individual still offers their own estimates. One question I'd like to answer is how much of the variance in these forecasts is explained by which team people were (randomly) assigned to. But I also suspect that the share of the variance explained by team will vary by event. So I'd like to examine that variation as well. I have a bit of experience with mixed models with lmer, but it's not immediately obvious to me how to approach this.

+",2013-10-24 22:51:47.620 +58195,1889.0,2,,58188.0,,,,CC BY-SA 3.0,"

The inverse covariance matrix can be used to work out conditional variances and covariances for multivariate Gaussian distributions. An earlier question gives some references

+ +

For example to find the conditional covariance of $Y$ and $Z$ given the value $X=x$, you would take the bottom right corner of the inverse covariance matrix

+ +

$$\left( \begin{array}{rr} 1 & -1 \\ -1 & 3 \end{array} \right) \text{ and re-invert it to }\left( \begin{array}{rr} \tfrac32 & \tfrac12 \\ \tfrac12 & \tfrac12 \end{array} \right)$$

+ +

which does indeed give the covariance matrix of $Y$ and $Z$ conditioned on the value $X=x$.

+ +

So similarly to find the conditional covariance matrix of $X$ and $Y$ given the value for $Z=z$, you would take the top left corner of the inverse covariance matrix

+ +

$$\left( \begin{array}{cc} 1 & 0 \\ 0 & 1 \end{array} \right) \text{ and re-invert it to }\left( \begin{array}{cc} 1 & 0 \\ 0 & 1 \end{array} \right)$$

+ +

telling you that the conditional covariance between $X$ and $Y$ given $Z=z$ is $0$ (and that each of their conditional variances is $1$).

+ +

To conclude that this zero conditional covariance implies conditional independence, you also have to use the fact this is a multivariate Gaussian (as in general zero covariance does not necessarily imply independence). You know this from the construction.

+ +

Arguably you also know about the conditional independence from the construction, since you are told that $\epsilon_1$ and $\epsilon_2$ are iid, so conditioned on a particular value for $Z=z$, $X=z+\epsilon_1$ and $Y=z+\epsilon_2$ are also iid. If you know $Z=z$, there is no additional information from $X$ that helps you say anything about possible values of $Y$.
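A minimal R sketch of this calculation. The matrices from the original question are not shown above, so the covariance matrix below is an assumed reconstruction from the construction just described ($X = Z + \epsilon_1$, $Y = Z + \epsilon_2$ with $Z, \epsilon_1, \epsilon_2$ iid standard normal, variables ordered $X, Y, Z$), which reproduces the numbers used above:

# Assumed covariance matrix for (X, Y, Z) under the construction above
Sigma <- matrix(c(2, 1, 1,
                  1, 2, 1,
                  1, 1, 1), nrow = 3, byrow = TRUE)
K <- solve(Sigma)        # inverse covariance (precision) matrix
round(K, 10)             # the (X, Y) entry is 0
solve(K[1:2, 1:2])       # conditional covariance of (X, Y) given Z: the identity
solve(K[2:3, 2:3])       # conditional covariance of (Y, Z) given X: the matrix above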

+",2013-10-24 23:16:29.650 +58196,2806.0,1,58197.0,,,What is the intuition behind (M)ANCOVA and when/why should one use it?,,CC BY-SA 3.0,"

As per my understanding here's the what/why/when of the following hypotheses tests in a crude sense:

+ +
  • t-test: Used when comparing means between two samples
  • ANOVA (one way): Used when you have one dependent variable and one independent (i.e., categorical) variable and you wish to analyze the 'means' (i.e., effects) across multiple groups. Simply stated, multi-way t-tests in essence.
  • ANOVA (two way): Similar to one-way except you have two independent (i.e., categorical) variables
  • MANOVA: ANOVA with multiple dependent variables
  • ANCOVA: ??
  • MANCOVA: ??
+ +

Intuitively, the concepts behind (M)ANOVA make sense: I understand when/how to apply it and why it is necessary (I've just overly simplified my understanding of them above). However, I lack a similar intuition behind (M)ANCOVA.

+",2013-10-25 00:54:47.617 +58197,5237.0,2,,58196.0,,,,CC BY-SA 3.0,"

To complete your scheme:

+ +
  • ANCOVA: ANOVA conducted to compare multiple (possibly only two) conditions on at least one independent variable while controlling for a set of continuous nuisance variables (possibly only one).
  • MANCOVA: MANOVA conducted to compare multiple (possibly only two) conditions on at least one independent variable while controlling for a set of continuous nuisance variables (possibly only one).
+",2013-10-25 01:23:54.390 +58198,82.0,2,,58172.0,,,,CC BY-SA 3.0,"

You could model it using nonlinear least squares regression, then use the modeled values and SE of the fit to determine the conc level that is ""different"" than the max. Once you fit the model, you could brute force search progressively lower thresholds until you reach a conc where Resp differs by a pre-specified alpha value from the max value and the estimated error around that.

+ +

So, fit a nls, then write a for loop to calculate the p values for, say, a t-test comparison, and search that for the critical conc value you're looking for.

+",2013-10-25 01:54:05.503 +58199,23027.0,1,58200.0,,,Translating R lme comand to mathematical equation,,CC BY-SA 3.0,"

I would appreciate if someone could help me in translating the following R command into a mathematical equation:

+ +
lme(score ~ factor(timeslot), random=~1|subjectid, data=a)
+
+ +

For each subject there are observations at 6 different times (timeslots) during a day.

+",2013-10-25 06:49:54.090 +58200,21638.0,2,,58199.0,,,,CC BY-SA 3.0,"

$$y = X\beta + Z\gamma + \epsilon$$

+ +

$y$ is an $N$ length vector of all observations across all subjects

+ +

$X$ is an $N \times 6$ design matrix. The first column is all $1$s for the intercept term which is implicit in the call to lme - i.e. your call is equivalent to

+ +

lme(score ~ 1+factor(timeslot),random=~1|subjectid,data=a)

+ +

The remaining $5$ columns of $X$ map the observations to the timeslots at which they were observed. Note that there are $5$ rather than $6$ of these columns as the effects of each timeslot are taken relative to the first one.

+ +

$\beta$ is a $6$ length vector of fixed-effects where the first element is the intercept term and the remaining $5$ elements correspond to the relative effect of the last $5$ timeslots compared to the first.

+ +

$Z$ is an $N \times n$ design matrix mapping the observations to the subject on which they were measured. $n$ is the total number of subjects in your data set (which you have not specified).

+ +

$\gamma$ is an $n$ length vector of random intercept terms, one for each subject.

+ +

$\epsilon$ is an $N$ length vector of error terms.
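As a sketch (with made-up data: 6 timeslots and 4 subjects), the fixed-effects and random-effects design matrices can be inspected directly in R:

# Hypothetical data just to show the shapes of X and Z
a <- expand.grid(timeslot = 1:6, subjectid = 1:4)
a$score <- rnorm(nrow(a))
X <- model.matrix(~ factor(timeslot), data = a)        # N x 6: intercept + 5 timeslot contrasts
Z <- model.matrix(~ factor(subjectid) - 1, data = a)   # N x n: one column per subject
dim(X); dim(Z)
# library(nlme); lme(score ~ factor(timeslot), random = ~ 1 | subjectid, data = a)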

+",2013-10-25 07:40:15.233 +58201,22637.0,2,,58176.0,,,,CC BY-SA 3.0,"

You can calculate the probabilities manually or you can do it with R. The code for the cumulative distr. of Poisson is ppois(k,m), k being the limit $P(X\leq k) $ and m the parameter of the distribution.
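For example, in R with $m = 2$ and $k = 3$:

ppois(3, lambda = 2)         # P(X <= 3)
1 - ppois(3, lambda = 2)     # P(X > 3)
sum(dpois(0:3, lambda = 2))  # same as ppois(3, 2), summing the pmf directly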

+",2013-10-25 08:11:17.770 +58203,17321.0,1,66211.0,,,"When doing systematic sampling, what should be done if the sampling interval (i.e. the skip) is not an integer?",,CC BY-SA 3.0,"

Let: population size $=N$; sample size $=n$; sampling interval $=\frac{N}{n} = k$, which can be non-integer; and $r=$ random starting point, which can be non-integer, $0 < r < k$.

+ +

http://en.wikipedia.org/wiki/Systematic_sampling says we round up $r + mk$ (where $m$ is an integer between $0$ and $n-1$, both inclusive) although the values given (11, 26, 41, 56, 71, 86, 101, and 116) show some rounded-down values.

+ +

ocw.jhsph.edu/courses/statmethodsforsamplesurveys/PDFs/Lecture2.pdf says:

+ +

1) ""if $k=5$ is considered, stop the selection of samples when $n=175$ achieved.""

+ +

But this means the last few members would not have any chance of being picked.

+ +

2) ""if $k=6$ is considered, treat the sampling frame as a circular list and continue the selection of samples from the beginning of the list after exhausting the list during the first cycle.""

+ +

This doesn't give equal chance to each member of being picked, does it?

+ +

3) ""An alternative procedure is to keep $k$ non-integer and continue the sample selection as follows: +Let us consider, $k=5.71$, and $r=4$. +So, the first sample is 4th in the list. The second $=(4+5.71) =9.71$ ~ 9th in the list, the third $=(4+2\times5.71) =15.42$ ~ 15th in the list, and so on. (The last sample is: $4+5.71\times(175-1) = 997.54$ ~ 997th in the list).""

+ +

This uses rounding down of $r + mk$ (different from the rounding up suggested by the Wikipedia page mentioned above).

+ +

Shouldn't we be rounding off instead to give equal chance to each member of being picked?

+ +

Would an even better way be to let the random starting point be $R$, randomly selected from the integers 1 to $N$, both inclusive, and to use $R + mk$, rounded off and taken modulo $N$?
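For what it is worth, a small R sketch of procedure (3), using the numbers quoted above ($N$ is assumed to be 999 so that $k \approx 5.71$):

N <- 999; n <- 175
k <- N / n                           # about 5.71, non-integer
r <- 4                               # the starting point used in the quoted example
idx <- floor(r + (0:(n - 1)) * k)    # 4, 9, 15, ... (rounded down, as in the quote)
head(idx)
tail(idx, 1)                         # 997: the last selection stays inside the frame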

+",2013-10-25 08:50:00.713 +58204,9554.0,1,,,,Leave-out data approach to get intervals for predictive performance of a regression model,,CC BY-SA 3.0,"

When measuring predictive performance of a regression model, I am thinking about using repeated data splitting (or leave-out data at random) instead of using bootstrapping.

+ +

By ""repeated data splitting"" (not sure if that's even term) I understand:

+ +
FOR N = 10000
    Sample 70% of the data points and fit the model
    Predict for the remaining 30%
    Store performance
END FOR
Calculate quantiles from stored performance data
+
+ +

I am currently using a GLM in order to rank my data according to a ratio scaled response variable Y (which my model is supposed to predict). Later I am evaluating the quality of the ranking using a specific metric.

+ +

I would like to calculate the prediction (or at least) confidence intervals for this metric -- that's the thing I care about (and not, say, the variance of the $\beta_3$ estimate)

+ +

Would the repeated data-splitting provide useful intervals for the performance of my ranking? i.e. Can I consider them as prediction intervals?

+ +

I would argue that this approach makes more sense than non-parametric bootstrap, as long as the data set is large enough, since we are using only actual data (that really occurred at one point in history, as opposed to a bootstrapped point the quality of which completely depends on the quality of the empirical approximation of the underlying distribution).

+ +

Since my data set is fairly large ($>20,000$) but also extremely skewed and long tailed, I would prefer using the data-splitting but I am not sure if this is a plain heuristic, or whether the intervals are actual prediction intervals.

+",2013-10-25 09:08:56.807 +58205,23028.0,1,,,,How to calculate spatial correlation between two variables?,,CC BY-SA 3.0,"

I have a dataset of point coordinates of individuals and different variables of these individuals. I want to calculate whether the spatial distribution of a certain variable is correlated with the spatial distribution of a different variable. For example: do patches of large plants have higher predation rates, even for small plants within a patch of large plants? Is it possible to calculate a correlation or a level of significance for these two variables, and what software should I use?

+",2013-10-25 09:28:08.600 +58206,21029.0,2,,58190.0,,,,CC BY-SA 3.0,"

I don't believe there is an answer to your question. You can't scale down a variance-covariance matrix. If you did (say by taking the determinant) you would lose all the useful information -- it would no longer be explaining the variance.

+ +

As suggested in the comments, you can run a Principal Component Analysis to reduce the dimension of your data to one. Then the inertia (not the variance) of the first component can provide a variance-like summary of the full data.

+ +

However, this is different from the variance because you will be calculating the variation with respect to a new calculated principal component and not the original variables. It is therefore no longer the variance, but the inertia.
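A small R sketch of that idea with simulated 2-D data; the first component's variance is the 'inertia'-style one-number summary, while the total variance equals the trace of the covariance matrix:

set.seed(1)
x1 <- rnorm(200)
X  <- cbind(x1, x1 + 0.5 * rnorm(200))
pc <- prcomp(X)
pc$sdev^2              # variance along each principal axis
pc$sdev[1]^2           # the 1-d, inertia-like summary from the first component
sum(pc$sdev^2)         # equals the total variance, sum(diag(cov(X)))
sum(diag(cov(X)))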

+",2013-10-25 09:56:19.230 +58207,9554.0,2,,58202.0,,,,CC BY-SA 3.0,"

I am not sure if I really get what is troubling you, but perhaps a couple of hints will help you:

+ +
    +
  • How can I conceptualize the prior of a deterministic variable in Bayesian data analysis?
  • +
+ +

Formally, deterministic variables don't have a probability distribution (yes you guessed it, since they are not random variables!). No random variable => no prior.

+ +
    +
  • Now, as far as I understand the first is a deterministic variable and +the second is stochastic.
  • +
+ +

If by first you mean $\lambda_C$, since it is a function of the r.v. $\sigma$, IT IS a random variable. (Just a side note: usually you reserve the term ""stochastic"" for random quantities involving time, such as a stochastic process.)

+ +
    +
  • However, I have problems conceptualizing the prior of the deterministic variable.
  • +
+ +

I think I answered this in the first comment, but let me elaborate. In Bayesian statistics there are multiple ways you can treat a parameter, depending on your previous knowledge. In general you assume the parameter to have an uncertain value, and consider it random. Depending on your beliefs and prior knowledge about its values you can either use a uniform distribution, or a distribution that favors a certain value; e.g. if you know the precision is most likely centered around a certain value, you can use a Gamma($\alpha$,$\beta$) distributed prior with appropriate values $\alpha$ and $\beta$. Now here is the point that might be tripping you up. Priors are there since sometimes you have a strong belief, but usually you will not be 100% certain about the value of a certain parameter. Technically, the fully certain case can be considered a random variable with a point mass, but that is beside the point: priors help model uncertainty of parameters. Deterministic parameters in the model are something completely different. They are unknowns that you decide to model as single values without any uncertainty.

+ +
    +
  • How would you, for example, plot the graph of the prior?
  • +
+ +

By plotting its probability distribution.

+ +
    +
  • How would I even calculate the single values for the prior of $\lambda_C$?
  • +
+ +

Not sure what you mean here. I hope by now you can see this question makes no sense.

+ +

HTH

+",2013-10-25 10:03:45.003 +58216,,2,,58210.0,anon,,,CC BY-SA 3.0,"

The question is rather vague. From your description, the only case where I can imagine the scatterplot to look like a triangle (and fit the description), is a case where the variance of the Y increases with the X values. The scatter plot would appear something like:

+ +
|           .
+|         ...  
+|       ..... 
+|    ........
+|  ......     
+|.... 
++----------------
+
+ +

What is the aim of your analysis? If you're trying to predict the Y from the X, you might want to take the increasing variance into account during the analysis using something like generalized least squares or other such methods. If you want to find other such cases from a larger data set that contains several X variables, as a crude solution you might consider calculating, e.g., a range or variance of the Y variable along the X variable. Generating all the possible plots and eyeballing them for interesting patterns might also be a viable option.
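A rough R sketch of both points, assuming the nlme package is available (the variable names and numbers are made up):

set.seed(1)
x <- runif(200, 1, 10)
y <- 2 + 0.5 * x + rnorm(200, sd = 0.3 * x)   # spread of Y grows with X
plot(x, y)                                    # triangle-shaped cloud, wide end at large X
library(nlme)
d   <- data.frame(x, y)
fit <- gls(y ~ x, data = d, weights = varPower(form = ~ x))
summary(fit)                                  # reports the estimated variance power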

+",2013-10-25 12:59:48.207 +58208,23030.0,1,,,,Relation between autocorrelation function and periodogram in time series analysis,,CC BY-SA 3.0,"

I was wondering if anyone could give me some insight on the relation between the ACF and the periodogram of a time series.

+ +

I have a bunch of timeseries and their ACF's and periodograms are typically much like the examples below.

+ +

For my analysis, I'm mostly interested in periodicity at lag 8 and 16 (for theoretical reasons)

+ +

The frequencies 'B' and 'HB' correspond to lag 16 and lag 8 respectively. The time series actually concerns interresponse intervals in musical performance of a piece that consists solely of eighth notes (16 of them in a 4:4 bar so 'B' stand for bar and 'HB' for half bar).

+ +

The thing I actually wanted to ask: in my periodograms, I consistently get very large peaks at frequency 0.25 (which corresponds to lag 4). However, the ACF peak at lag 4 is much smaller than those at lag 8 or 16. I was wondering how to interpret this finding. A lot of time series variance can be explained at this frequency even though the lag 4 autocorrelation is quite low?

+ +

I hope I was sufficiently clear in my question. If not, don't hesitate to ask me.

+ +

+ +

+",2013-10-25 10:10:59.607 +58209,23031.0,2,,54836.0,,,,CC BY-SA 3.0,"

I would suggest you look at page 8 of ""Probabilistic Topic Models"" by Mark Steyvers and Tom Griffiths. I found their explanation of the Gibbs algorithm quite clear and easy to implement.

+ +

To answer your questions:

+ +
  • i seems to range over (indexes for) all the words in all the documents, and d indeed seems to refer to the document of the word under consideration.
  • There are some algorithms for estimating alpha and beta but I haven't really understood any of them myself. Just tuning the parameters manually should be ok in most cases.
  • W seems to be the size of the vocabulary, i.e. the number of unique words.
  • I'm confused about the beta with / without subscript as well. If they use a different beta per word, then maybe in the denominator it should say the sum of all beta_w, instead of beta * W?
+",2013-10-25 10:17:58.963 +58210,23032.0,1,,,,Are there statistical techniques that investigate such relationships ...?,,CC BY-SA 3.0,"

If we have data set, X and Y variable. Say, we do correlation analysis and get some correlation coefficient. Besides, we find an important fact after observing their relationship: That is, the scatter plot of X and Y has a triangular shape. Which means that (for example) when X values are increasing Y values are increasing for all X values, whereas vice versa is not true; when the Y value is increasing X values are anything for all Y values.

+ +

What kind of analysis should I do to investigate this?

+ +
+ +

(Update in response to @Penguin_Knight)

+ +

Your graph is exactly what I explained. Just take away the negative values with an imaginary y=0 line. As you can see there are many data points the X of which is either 0 or very small, and Y has pretty high value. However there is no data point that have y=0 and x is quite high value. And as you can see it makes the data scatter shape a right angle triangle. Thus we could say that the high X values necessitates high Y values but high Y values does not always have high X values. I find it very interesting. In practice for example I measure Complexity and Size of given entities. And my analysis show that all complex entities have big size but not all big-size entities are complex. Then I conclude that the certain amount of complexity requires defined amount of size. You cannot put more complexity in a given size. This is a bit abstract but you see my point? It is very interesting and I would like to get some help on how this kind of relationships are discussed or described in statistics formally.

+",2013-10-25 10:55:34.690 +58211,18514.0,2,,58208.0,,,,CC BY-SA 3.0,"

The relation between the autocovariance (or autocorrelation) and the spectral density (for which the periodogram is an estimator) is given by the Fourier transform. The two form a so-called Fourier-transform pair, meaning the two are time(or space)-domain vs. frequency-domain representations of the same thing. Specifically, if time series $\{X_t\}$ has autocovariance function $\gamma(\tau)$ at time lag $\tau$, then the spectral density is defined by \begin{equation} f(\nu)=\sum\limits_{\tau=-\infty}^{\infty} \gamma(\tau)e^{-2i\pi\nu\tau}. \end{equation} In words, the spectral density partitions the autocovariance as energy-per-hertz of a signal. For example, if you have a deterministic signal with period $t=12$, then the series lagged with itself (ACF) at lag 12 will be perfectly correlated (autocorrelation=1). Subsequently, all power in the spectral density will be concentrated at frequency $1/t$.
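A quick R illustration of that Fourier pair, using a noisy period-12 signal:

set.seed(1)
x <- sin(2 * pi * (1:240) / 12) + rnorm(240, sd = 0.3)
acf(x, lag.max = 36)                      # autocorrelation peaks at lags 12, 24, 36
spec.pgram(x, taper = 0, log = ""no"")    # periodogram power concentrated near frequency 1/12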

+",2013-10-25 11:08:27.573 +58212,9554.0,2,,58190.0,,,,CC BY-SA 3.0,"

I guess it really depends on what you are doing. If you are investigating the relationship between the two variables, the answer would be the covariance (the off-diagonal element in your 2x2 matrix).

+ +

In case you are trying to track the uncertainty of estimates from 2 different systems using their covariance matrices, you could use:

+ +

$det(\Sigma)$, or $tr(\Sigma)$, where $\Sigma$ is the covariance matrix.

+ +

i.e. the determinant of the covariance matrix is a 1D measure to track uncertainty, and there is some theory about how and why it might make sense. Intuitively, the determinant tells you by how much the covariance matrix would scale a space if applied as a linear transformation.

+",2013-10-25 11:26:16.530 +58213,9554.0,2,,58210.0,,,,CC BY-SA 3.0,"

Typically, to say a scatterplot has triangular shape, it would have to look like this: /\. What you are describing sounds like this: /.

+ +

The part about Y~X being positively correlated in the plot, and X~Y not, is IMPOSSIBLE. Please check your code, or the description.

+ +

In general you can use a regression model to model a relationship between a ratio scaled X and Y. Look up the assumptions for different regression models in a book or on the Wiki.

+",2013-10-25 11:45:34.933 +58214,20388.0,1,,,,Sample from Wishart distribution with inverse Scale matrix,,CC BY-SA 4.0,"

I tried to model a precision matrix in a hierarchical Bayesian setup with a Wishart prior given d.f. and inverse scale matrix, and a matrix normal likelihood; since it's a conjugate prior, my posterior on the precision matrix $K$ ends up in the form: $$K \sim \text{Wishart}(\text{df}, \Lambda^{-1})$$ Since the dimension of $\Lambda$ is quite big, I do not wish to take the inverse of the matrix and use the built-in sampler. I looked at the source code for Matlab's wishrnd; they used Bartlett decomposition for large dimensions (81+n), which should also work for smaller dimensions, albeit inefficiently.

+ +

snips of the code:

+ +
[n,m] = size(Lambda);
+[d,p] = cholcov(Lambda,1);
+
+% Otherwise use the Smith & Hocking procedure
+d = eye(size(d)) / d; 
+% Load diagonal elements with square root of chi-square variates
+a = diag(sqrt(chi2rnd(df-(0:n-1))));
+
+% Load upper triangle with independent normal (0, 1) variates
+a(itriu(n)) = randn(n*(n-1)/2,1);
+% Desired matrix is D'(A'A)D
+x = a(:,1:size(d,1))*d;
+
+a = x' * x;
+
+% --------- get indices of upper triangle of p-by-p matrix
+function d=itriu(p)
+
+d=ones(p*(p-1)/2,1);
+d(1+cumsum(0:p-2))=p+1:-1:3;
+d = cumsum(d);
+
+ +

In order to accommodate my needs, i.e. inputting the degrees of freedom $df$ and the inverse scale matrix $\Lambda$, I added

+ +
d = d \ eye(size(d))
+
+ +

after the second line, so that I would have the inverse of the Cholesky decomposition of the inverse scale matrix, in other words, the Cholesky decomposition of the scale matrix. Then everything should be okay from there (I hoped).

+ +

I tested this by firstly generating a 3 by 3 positive definite matrix:

+ +
>> test = rand(190, 3); Lambda = test'*test;
+>> Lambda
+
+Lambda =
+
+  62.7642   46.4970   45.6662
+  46.4970   61.9178   45.4114
+  45.6662   45.4114   59.1070b
+
+ +

Setting $df = 6$, 100,000 randoms samples were generated and the empirical mean is compare with the first moment of the distribution

+ +
>> df = 6;
+>> sam = 0; for loop = 1:100000 sam = sam + mywishrnd(Lambda,df); end
+>> sam/100000
+ans =
+
+ 0.0956   -0.1069   -0.0689
+-0.1069    0.3380   -0.0286
+-0.0689   -0.0286    0.3872
+
+>> inv(Lambda)*df
+
+ans =
+
+ 0.2647   -0.1118   -0.1187
+-0.1118    0.2692   -0.1205
+-0.1187   -0.1205    0.2857
+
+ +

There are quite big differences in some entries of the results, but I don't see anything wrong with this theoretically. Is it possibly due to numerical error? Could someone enlighten me on this problem?

+",2013-10-25 12:07:31.487 +58215,3868.0,1,,,,"need name, reference, and/or study for the following variable reduction procedure in regression",,CC BY-SA 3.0,"

I have seen the following commonly used:

  1. fit a model with all variables,
  2. in a single reduction step, remove from the model all variables at once that do not fit some criterion (p-value, whatever),
  3. calibrate the reduced model, in my case to a new data set,
  4. check model results and hopefully everything went well and you can stop.

+ +

I'm betting this will perform better than stepwise, especially when used with data splitting, but would appreciate a name for this procedure if it exists and perhaps a reference related to it so that I can learn more. Maybe it is so simple/bad/obvious that no one has bothered? -WVG

+",2013-10-25 12:15:34.753 +58217,10547.0,1,,,,Confusion about the kind of offset-value for this non homogenous poisson-process,,CC BY-SA 3.0,"

Suppose that I observe $M$-times this kind of table:

+ +
     t X
+ 1:  1 0
+ 2:  2 0
+ 3:  3 0
+ 4:  4 0
+ 5:  5 0
+ 6:  6 1
+ 7:  7 0
+ 8:  8 0
+ 9:  9 0
+10: 10 0
+11: 11 0
+12: 12 0
+13: 13 0
+14: 14 0
+15: 15 0
+16: 16 5
+17: 17 1
+18: 18 0
+19: 19 0
+20: 20 0
+.   . .
+.   . .
+T-1:T-1 0
+T:  T 0
+
+ +

So, what I get is like: $x_1 = 0, x_2 = 0, x_3 = 0, x_4 = 0, x_5 = 0, x_6 = 1....$, i.e., $X_t \in \{0,1,2,3,4,5,...\}$ where $t = 0,1,2,...,T-1,T$. As we can see, I observe jumps greater than one for particular $t$'s (like $t = 16 \rightarrow x_{16} = 5$).

+ +

I want to investigate if more counts occur if $t$ moves towards $T$ and what the impacts of different covariates, captured in $\boldsymbol{X}$, are.

+ +

To do that, I could assume that $N(t) = \sum_{i=1}^t X_t$ is a poisson-process, i.e.,

+ +

(1) $P(N(t) = k) = \frac{\lambda(t)t^{k}}{k!}\text{exp}\{-\lambda(t)t\}$

+ +

The regression model will be:

+ +

(2) $\text{log}\{E(N(t)|t)\} = \boldsymbol{X\beta + \epsilon}$

+ +

$\Leftrightarrow \text{log}\{\lambda(t)t\} = \boldsymbol{X\beta + \epsilon}$

+ +

$\Leftrightarrow \text{log}\{\lambda(t)\} = -\text{log}\{t\} + \boldsymbol{X\beta + \epsilon}$

+ +

In this case, I would model $-\text{log}\{t\}$ as the offset value and take $N(t)$ as the dependent variable - which is just the cumsum of $X_i$, where $i$ goes from $1$ to $T$.

+ +

Question 1: I think here I'll get a problem because, as far as I remember, for particular $t$'s, the process is not supposed to jump by more than one?

+ +

Another way I could think of is to see my data as a big cross-table. If I assume that each cell count is a realization of an independent Poisson process and the total count $n$ is fixed, then I'm able to say that I observe a set of counts $n_1,...,n_T$ which are each Poisson with rate $\lambda_t$.

+ +

If these assumptions hold, it can be shown that the cell counts are Multinomial$(\frac{\lambda_t}{n_1},...,\frac{\lambda_t}{n_T},n)$.

+ +

Hence, if I see the observations as a multinomial response, for the cell counts $X_t$ I'll get the model:

+ +

(3) $\text{log}\{E(X_t)\}) = \boldsymbol{X\beta + \epsilon}$

+ +

$\Leftrightarrow \text{log}\{\lambda_t\} = \boldsymbol{X\beta + \epsilon}$

+ +

$\Leftrightarrow \text{log}\{np_t\} = \boldsymbol{X\beta + \epsilon}$

+ +

$\Leftrightarrow \text{log}\{p_t\} = -\text{log}\{n\} + \boldsymbol{X\beta + \epsilon}$

+ +

Question 2: Now, I would choose $-\text{log}\{n\}$ as my offset value. If the sequence of $\lambda_t, \ \ t = 1,...,T$ is increasing in $t$, this would answer my question. Since my $n$ is actually fixed, and with this approach I'm able to deal with the multiple jumps at some $t$, it seems that model (3) would be the way to go?

+ +

Question 3: If I use a glm(), what would my dependent variable be? I thought about defining a new r.v. $Y_t$ which is 1 if $X_t>0$ and zero otherwise. If $X_t>1$ then I would duplicate the corresponding row $x_t$ times. If I do that, in my opinion, things are not supposed to change since I look at a ratio as a dependent variable anyway?

+ +

Question 4: Another issue I have is that, unlike model (2), I'm not sure how (3) captures that the cumsum of $X_t$ increases steadily as $t$ moves towards $T$, since (3) looks at each cell separately.

+",2013-10-25 13:17:25.907 +58219,750.0,1,58900.0,,,How to test for randomness in bins with small N?,,CC BY-SA 3.0,"

I observe a series of crime incidents linked by modus operandi or some other peculiar characteristic of the crime (e.g. cutting catalytic converters from underneath vehicles). I would like to know if the observed days of the week that the crimes occur on (ignoring the uncertainty that sometimes occur in crime incidents - e.g. it happens overnight) are random. I typically have very few linked crime events, say between 5-15.

+ +

So, question 1: there is a lot of advice about using Pearson's $\chi^2$ on small-$n$ contingency tables; can I use all of that same advice that comes with $2\times 2$ tables for just 7 day-of-week bins? (In particular, can I use the $N - 1$ correction given expected cell frequencies are at least 1 and still expect similar coverage rates - which would mean I need at least 7 observed events?)

+ +

Or alternatively question 2, are there any other obvious approaches I can take to test the hypothesis of the events being random with respect to the day of the week? (Permutation approaches perhaps given the limited nature of the potential permutations?)

+",2013-10-25 13:26:16.757 +58220,16174.0,1,58223.0,,,Does stand-alone dummy variables in linear regression models make sense?,,CC BY-SA 3.0,"

Dummy (or binary) variables ($X_2$) can be used in linear regression models to help explain a possible group effect that a continuous predictor variable ($X_1$) might present in explaining the response variable ($Y$).

+ +

Now, I am wondering if it makes sense to have only dummy variables as predictors in a linear regression model with a continuous response variable. Does it?

+ +

Example:

+ +

% of population with instruction = influenced by politics A + influenced by politics B.

+ +

Both politics A and B can assume values 1 or 0.

+",2013-10-25 13:42:12.593 +58221,23037.0,2,,57939.0,,,,CC BY-SA 3.0,"

As far as I know, when comparing only 2 algorithms, Demšar suggests the Wilcoxon signed rank test rather than Friedman + post hoc. I am, sadly, just as confused as you when it comes to deciphering what Demšar's dividing by k-1 is supposed to mean.

+",2013-10-25 13:42:22.577 +58222,22775.0,1,,,,overall effects of categorical variables,,CC BY-SA 3.0,"

I'm doing a Poisson regression in Stata, so the dependent variable is a count variable and I have some categorical predictors. If A is a categorical variable with, for example, 4 levels, in the parameter estimates table I get results for the 3 levels of the variable compared to the level I have set as the reference category. Similarly for an interaction term. However, I would also like to have an estimate of the overall effect of variable A and also an estimate of the overall effect of the interaction. How do I do this in Stata?

+ +

I.e., I need the test of the effect of A in the model on the dependent variable D, which tests the joint hypothesis that every one of the four levels of A has the same effect on D, and therefore A does not explain any of the variation in D, and so it is not a significant predictor of D in the model. I do not need a test which tests the hypothesis that the effects on D of e.g. A2, A3, or A4, respectively, are all equal to each other in this model, but not necessarily equal to the effect of A1.

+ +

Thank you

+ +

UPDATE: Well, prompted by the comments from all of you, and after hours of trying out different things, I found that what I was looking for was ""contrast A"", described in http://www.stata.com/manuals13/u25.pdf. However, I am now wondering why ""contrast A"" does not give me the same results as ""test A4=A3=A2=A1"". I'm told I shouldn't ask Stata-specific questions; however, I'm trying to see how Stata corresponds to other packages and this is what prompted my question. I also think that if someone can answer this question for me, i.e. why contrast is not the same as a joint test on whether all levels have the same effect, then this would be of general interest, as it obviously translates to a general statistical question.

+",2013-10-25 13:50:09.057 +58223,10060.0,2,,58220.0,,,,CC BY-SA 3.0,"

Because linear regression does not assume any distribution of predictors, as long as

+ +
  1. they are not perfectly collinear, and
  2. none of them is a constant,

it should be fine.
+ +

Your example is just like using regression as an ANOVA sans interaction (aka, not a full-factorial design). If an additional effect due to the co-influence of A & B is of interest, compute an interaction term (by multiplying your two dummy variables) and include it as a predictor as well.
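A minimal sketch in R of a dummy-only model along the lines of the example in the question (simulated data, made-up effect sizes):

set.seed(1)
A <- rbinom(100, 1, 0.5)            # influenced by politics A (0/1)
B <- rbinom(100, 1, 0.5)            # influenced by politics B (0/1)
y <- 10 + 2 * A + 3 * B + rnorm(100)
summary(lm(y ~ A + B))              # dummy-only regression (two-way ANOVA without interaction)
summary(lm(y ~ A * B))              # adds the A:B interaction term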

+",2013-10-25 13:57:50.543 +58224,23039.0,2,,58141.0,,,,CC BY-SA 3.0,"

How about this paper:

+ +

http://arxiv.org/pdf/0803.4101.pdf

+ +

""Measuring and testing dependence by correlation of distances"". Székely and Bakirov always have interesting stuff.

+ +

There is matlab code for the implementation:

+ +

http://www.mathworks.com/matlabcentral/fileexchange/39905-distance-correlation

+ +

If you find any other (simple to implement) test for independence let us know.

+",2013-10-25 14:22:34.343 +58225,16474.0,2,,58222.0,,,,CC BY-SA 3.0,"

Given your comments I will assume that you do not want an estimate of the size of the effect but instead a statistical test whether the expected (possibly adjusted) count for each of the categories are the same. This may or may not be wise depending on your circumstances, but this is an example of how you do it in Stata:

+ +
webuse dollhill3
poisson deaths smokes i.agecat, exposure(pyears)
testparm i.agecat
+
+ +

If you want something like a single effect size you could look into sheaf coefficients. In case of interaction terms this generalizes to a model with parametrically weighted covariates. A brief discussion on how to do those in Stata can be found here.

+",2013-10-25 14:30:54.353 +58226,23038.0,1,58238.0,,,Similarity of new element x with the training set X,,CC BY-SA 3.0,"

Suppose we have trained a model (function, algorithm) $M$ which gives prediction to a new sample $x$ not observed in the training set, $M(x)$. It is natural to assume that the quality of prediction $M(x)$ depends on similarity of $x$ with the training set $X$. I wonder if there are any indices/methods which would evaluate similarity of $x$ with training set $X$, or any indices which would evaluate the quality of prediction $M(x)$. To be more specific, I don't want an overall performance of the model (which could be evaluated by RMSE, R-squared or deviance function), but I would like to estimate the quality of the prediction $M(x)$ for this particular $x$. It would be great if anyone could point me to the literature on this problem.

+",2013-10-25 14:36:12.433 +58242,23046.0,1,,,,Directed Graph to Regression Help,,CC BY-SA 3.0,"

+ +

So I have this directed graph (above)

+ +

Each arrow represents a causal link. Is it possible to calculate the affect of X on Z, where all variables are observed except U and W. If so, what would the regression equation(s) look like. I believe it has to do with 2 step least squares, but I haven't been able to figure it out. Any help/tips would be much appreciated. Thanks.

+",2013-10-25 18:39:17.683 +58227,23041.0,1,,,,Does it matter if I use correlations or regression coefficients to suggest areas to focus on to improve overall customer satisfaction?,,CC BY-SA 3.0,"

I am working with customer satisfaction data where the dependent variable is ""Overall satisfaction"" and the independent variables are satisfaction with various areas such as customer support, delivery etc.

+ +

I want to suggest areas where the company should focus on in order to improve overall satisfaction.

+ +

Option 1: I could look at correlations between the 'Overall satisfaction"" and the independent variables and suggest that the company focus on the top 3 positive correlations as areas for improvement.

+ +

Option 2: I can use a linear regression and suggest that the company should focus on the areas associated with the 3 highest regression coefficients.

+ +

Are the two options equivalent? If not, which one is the better approach?

+",2013-10-25 15:15:10.080 +58228,23042.0,1,,,,Sample size and correct choice of test in g*power,,CC BY-SA 3.0,"

I am new to g*power and have a question about which test I have to choose and how to interpret the given sample size.

+ +

I have 2 measurements (pre / post), one control-group and one intervention-group. In a reference study, I was able to find an effect size of .7 for the most important parameter. In g*power I chose F-test -> ANOVA RM within-between interaction, a power of 0.8 and alpha .05.

+ +

Here are my questions:

+ +
  1. Did I choose the correct test?
  2. Is the resulting sample size referring to each group or to the total?
  3. Regarding the effect size of .7 I found in a reference study, am I allowed to enter it in g*power -> effect size f, or should I leave it at the default of .25?
+",2013-10-25 15:56:13.647 +58229,22637.0,1,58235.0,,,Cumulative Distribution Function Inequality (Discrete Distributions),,CC BY-SA 3.0,"

Let a discrete Random Variable $T$ have CDF $F_T(T)$. Could you please help me understand why $$ P \left[ F_T (T) \leq a_1 \right] \leq a_1 $$

+ +

I know that the result holds with equality for the continuous case, it is known as Probability Integral Transform, but I am having trouble understanding it for the Discrete one since the inverse is not defined. Thank you.

+",2013-10-25 16:11:47.767 +58230,306.0,2,,55722.0,,,,CC BY-SA 3.0,"

If you want to read probability as a story, read the best book ever, by Feller. I am also guessing that you do not want to go to the level of the measure-theoretic definition of probability, which has specialized books. Another beginner-level book is the one by Ross. Other specialized applications have specialized books, so more information will attract better suggestions.

+",2013-10-25 16:14:42.940 +58231,20473.0,2,,58227.0,,,,CC BY-SA 3.0,"

If I understand correctly, customers rate the company in various aspects of the transaction, and then, customers again give an overall score. This is the real-world structure. Making an assumption that customers are reasonably rational (i.e. consistent in their opinions), it means that somehow, they, in their minds, construct some sort of ""weighted average"" in order to go from the partial scores to the overall score.

+ +

Then you should use the regression approach, which reflects the above situation. Using partial correlation coefficients does not capture how one reasonably believes that the customers thought and acted when scoring the company.

+ +

This regression is in the spirit of ""hedonic index regression"", if we view ""overall satisfaction"" as the ""price"" of the ""product"" named ""transacting with company"", and the regressors as ""features"" of the product (that are provided in different levels for each customer, and hence their variability).

+ +

If the rankings are consistently coded (say, a higher number means a higher level of satisfaction for the partial scores and for the overall score), then a higher estimated regression coefficient on a partial score will indicate that this aspect of the transaction ""bears more heavily"" (has a higher marginal effect) on ""overall satisfaction"", and so indeed, focusing and improving on the areas with the higher regression coefficients, should yield larger benefits in overall satisfaction.

+ +

But also, in order to finally decide on the prioritization, one should also look how the various areas compare in average score. Say ""customer support"" has a higher regression coefficient than ""delivery"", but also, ""customer support"" is on average rated already very high by customers, compared to ""delivery"". Then the efforts to further improve ""customer support"" may be more costly and difficult, compared to improving ""delivery"". So while one unit of increase in customer support satisfaction may yield higher overall satisfaction increase compared to one unit increase in ""delivery"", this one unit increase may be more costly to achieve in customer satisfaction than in delivery, offsetting partially, or fully, the economic gains from the increase in ""overall satisfaction"".

+ +

Of course this last issue is not a statistical question, but I mentioned it so that any prioritization suggestion based on statistical analysis, at least mentions this aspect that must be taken into account for the final decisions.

+",2013-10-25 16:25:37.833 +58232,,2,,58194.0,user31668,,,CC BY-SA 3.0,"

Unless the forecasts (or your metric) for each event are commensurate, you will need to make 100 separate evaluations, one per event. For each event $i$ and associated forecaster $j$, you could try a linear mixed model:

+ +

$y_{ij} = \mu_i + \sum\limits_{k=0}^{\#Teams}a_{ik}T_{ijk}+\epsilon_j$

+ +

Where $\mu_i$ is the mean forecast across all members,$T_{ijk}$ is a binary variable indicating whether person j was assigned to team k for question i, with $a_{ik}$ being the team effect for question i. $\epsilon_j$ is the individual effect.

+ +

Now, if you make ""completed"" datasets via imputation (and the forecasts/metrics are commensurate) then you can do an analysis across all questions at once, so each person will have 100 y's associated with them.

+ +

I'm not familiar with how to work directly with unbalanced designs, so someone else may have a more sophisticated approach, but hopefully this gives you ideas or gets you started.
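As a simplified sketch of the 'share of variance explained by team' part for a single event, using lme4 rather than spelling out the design matrices above (names and numbers are invented):

library(lme4)
set.seed(1)
team_id  <- rep(1:30, each = 15)                  # roughly 500 forecasters in teams of 15
team_eff <- rnorm(30, sd = 0.10)
d <- data.frame(team     = factor(team_id),
                forecast = 0.5 + team_eff[team_id] + rnorm(length(team_id), sd = 0.20))
fit <- lmer(forecast ~ 1 + (1 | team), data = d)
VarCorr(fit)    # team-level vs. residual standard deviations for this event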

+",2013-10-25 16:29:40.270 +58233,12358.0,2,,58202.0,,,,CC BY-SA 3.0,"

You can think of the relationship between $\lambda$ and $\sigma$ as just a change of variables, a.k.a. a reparameterization, of the probability distribution that was initially specified in terms of $\sigma$.

+ +

We have the random variable $\sigma$ distributed as $p(\sigma)=1/500$ for $0<\sigma<500$. We want the distribution for $\lambda=\sigma^{-2}$:

+ +

\begin{equation} \begin{aligned} p(\sigma)\, d\sigma &= p(\lambda)\, d\lambda \\ \frac{\lambda^{-3/2}}{1000}\, d\lambda &= p(\lambda)\, d\lambda \\ \frac{1}{500^2} &< \lambda < \infty \end{aligned} \end{equation}

+ +

(Deriving this involves equating the CDFs, $\Phi( \sigma) =\Phi(\lambda)$, reversing the sense of $\sigma$, i.e. remapping $\sigma \rightarrow 500-\sigma$, and then taking the derivative of the resulting CDF.)
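A quick sanity check by simulation: the CDF implied by the density above is $P(\lambda \le q) = 1 - q^{-1/2}/500$ for $q > 1/500^2$.

set.seed(1)
sigma  <- runif(1e6, 0, 500)
lambda <- sigma^(-2)
q <- 1e-4
mean(lambda <= q)       # empirical: about 0.80
1 - q^(-1/2) / 500      # analytic:  0.80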

+",2013-10-25 16:32:57.587 +58234,6204.0,1,58237.0,,,"Why is standard error sometimes used for ""error bands"" in plots?",,CC BY-SA 3.0,"

It seems that often what someone really wants to plot is a confidence interval of some kind, but using SE for this purpose I think only ends up comprising something like a 68% confidence band. Therefore, plotting SE for error bars instead of a wider band more representative of the significance level of your analysis visually suggests significance in your data that may not actually be there.

+ +

Consider the following concrete example:

+ +
set.seed(123)
+X <- rnorm(100, 0, 1)
+Y <- rnorm(100,1.7,5)
+df = data.frame(X,Y)
+
+boxplot(df)
+
+se.x = sd(X)/sqrt(length(X))
+se.y = sd(Y)/sqrt(length(Y))
+
+X.err.CI = 1.96*se.x
+Y.err.CI = 1.96*se.y
+
+
+plot(1:2, colMeans(df), ylim=c(-1,3), xlim = c(0.5,4.5), col=""dark green""
+     , main=""Comparison of SE bars vs 95% CI"")
+lines(c(1,1), c(mean(X) + X.err.CI, mean(X) - X.err.CI), col=""dark green"")
+lines(c(2,2), c(mean(Y) + Y.err.CI, mean(Y) - Y.err.CI), col=""dark green"")
+text(1:2 + .2, colMeans(df), c(""X"",""Y""))
+
+points(3:4, colMeans(df), col=""blue"")
+lines(c(3,3), c(mean(X) + se.x, mean(X) - se.x), col=""blue"")
+lines(c(4,4), c(mean(Y) + se.y, mean(Y) - se.y), col=""blue"")
+text(3:4 + .2, colMeans(df), c(""X"",""Y""))
+
+abline(v=2.5, lty=2)
+
+legend(""topright""
+       ,c(""95% CI"", ""+/- SE"")
+       ,lty=c(1,1)
+       ,pch=c(1,1)
+       ,col=c(""dark green"", ""blue"")
+       )
+
+ +

+ +

If we just base our analysis on SE (the image on the right), visually it appears that there is significance between the means of X and Y because we don't have overlap in our error bars. But if we're testing at a 5% significance level, plotting the 95% confidence bands shows that this is clearly not the case.

+ +

Since we can expect that a test at the 32% level will never be appropriate, why even show the SE bars since they will probably be interpreted as though they represent a confidence interval? Do people use SE bars instead of more meaningful CIs because it's moderately easier to calculate (e.g. using a built-in function in Excel)? It seems that we're paying a pretty high cost in terms of the interpretability of our graphic in exchange for a few minutes' less work. Is there some value/utility in SE bars that I'm missing?

+ +

For context, I was prompted to write this after skimming this article. I was frustrated by the lack of confidence intervals in the plots provided by the authors, and then when they did finally provide them, it turned out they were just SE bars.

+",2013-10-25 16:33:44.443 +58243,22555.0,2,,58234.0,,,,CC BY-SA 3.0,"

Whether from convention or otherwise, it is honest in the sense that it is easy for the reader to develop their own idea of significance, ie mentally the reader can consider a multiple of, say 2 or 3 times larger to get their own idea of significance. In a sense you are letting the data speak for itself rather than speaking for the data.

+ +

From that perspective it is logical to provide SE as the basis for banding. In my view, however, the caption of the chart should clearly state that the basis of banding is, in fact, one SE. Similarly these should not be identified in any way as confidence intervals but simply as properties of the data set.

+",2013-10-25 18:49:00.690 +58342,1406.0,2,,58027.0,,,,CC BY-SA 3.0,"

This is a textbook example of spurious time series regression. The levels are highly correlated, but the differences are not. This happens when we have two independent random walk processes. To make sure that this is really the case, check that the residuals from the level regression have unit-root and that the residuals from the difference regression do not have it.

+",2013-10-28 08:16:41.137 +58235,10135.0,2,,58229.0,,,,CC BY-SA 3.0,"

Try to draw CDF of a Discrete random variable like the (upper) one you have here. Now draw a horizontal line to indicate the level of $a_1$. All you need to do is to find the values of $T$ such that your CDF i.e. $F_T$ satisfies $F_T(T)\leq a_1$. You can move your $a_1$ vertically. Now depending on the level of $a_1$ sometimes you get $P[F_T(T)\leq a_1]<a_1$ and sometimes $P[F_T(T)\leq a_1]= a_1$. The equality happens when your $a_1$ is equal to one of those horizontal (red) lines in the plot of your CDF. OK, see $a_1$ in the graph below.
+For what values of $T$ you have $F_T(T)\leq a_1$? Obviously for $t<t_1$. For $t<t_1$, $P_T(t)=P(T< t_1)=0$. And as the plot shows you have $0<a_1$. So the condition you want to prove is correct in this case. Now look at $a_1$ below.
+Again for what values of $T$ you have $F_T(T)\leq a_1$? Obviously for all $t< t_2$. Now if $t_1\leq t< t_2$ we have $P_T(t)=P(T\leq t)=a_1$. In this case you will end up with equality i.e. $P[F_T(T)\leq a_1]= a_1$. And if $t<t_1$ then $P_T(t)=P(T\leq t)=0$. Here again as the graph shows the condition holds i.e. $0\leq a_1$. You can do exactly the same argument if you move $a_1$ vertically.

+",2013-10-25 16:58:25.353 +58236,23045.0,1,58240.0,,,Pearson's correlation for time series requires normally distributed data?,,CC BY-SA 3.0,"

In order to use Pearson's correlation to measure the similarity of two time series, is normal distribution of both time series a necessary condition?

+",2013-10-25 16:59:53.883 +58237,12358.0,2,,58234.0,,,,CC BY-SA 3.0,"

Mostly its that ""it's been done that way in the past"", but in some domains it is precisely because the authors are not drawing statistical inferences directly from the reported standard errors (even though, for the example paper +it might be reasonable to do so).

+ +

As an example, physics research papers often depict the standard errors related to (estimated) statistical errors in the data collection. These are usually estimated from +running (as much a possible) the same experimental multiple times using the same setup and estimating the variance. However, these statistical errors +are only very rarely used in a direct confidence interval/degree of significance +type of assessment. This is due to the fact that in most experiments systematic +errors of various type are likely to be larger than the statistical errors, and these types of errors are not amenable to statistical analysis. Thus, representing the 95% confidence interval based on just the statistical errors could be deceiving. Experimental particle physicists in particular go to great pains to identify statistical uncertainties, systematic uncertainties and then combine them (in physics community approved ways) into confidence intervals (the preprints on the discovery of the Higgs boson are probably easily found examples of this).

+",2013-10-25 17:03:36.480 +58238,7155.0,2,,58226.0,,,,CC BY-SA 3.0,"

To evaluate the probability of observing some $x \in R^n$ in a data set defined by $D \in R^{i \times n}$, where $i$ is observations and $n$ is the cardinality of your features you can use density estimation/one-class classification.

+ +

While it stands to reason that there exists some relationship between the $p(x \in D$) and the quality of your prediction, you haven't observed this relationship.

+ +

What I'd recommend instead is exploring the literature on nonparametric confidence intervals. In particular, Gaussian Processes. They produce standard error estimates of your prediction and confer all of the advantages of other kernel machines at adapting to many different kinds of features.

+ +

The drawback is that they don't scale well with data size, without some hacks I've yet to master, and they are only good for regression and structured learning problems.

+ +

Gradient boosting algorithms can be adapted to produce confidence intervals, are good at classification and scale well to large datasets.

+",2013-10-25 17:21:53.337 +58239,668.0,2,,58229.0,,,,CC BY-SA 3.0,"

Consider a box $\Omega$ filled with tickets. On each ticket $\omega$ is written a number called $X(\omega)$. For any number $x$, whether or not it appears among the tickets, $F_X(x)$ is (defined to be) the proportion of tickets for which $X \le x.$

+ +

Let's add some new information to each ticket $\omega$: next to the value of $X$ written on it, we will also write the value of $F_X(X(\omega))$: it is the proportion of all tickets with values of $X$ less than or equal to this value, $X(\omega).$ (It's the same concept as a percentile or quantile: the tickets with the smallest values of $X$ get the smallest proportions and the tickets with the largest values of $X$ get the largest proportions.) These new values, being proportions, lie between $0$ and $1$ inclusive. But, when $X$ is discrete, they will not include all possible numbers, but only the proportions that actually occur in the box.

+ +

Consider drawing a single ticket from this box at random. Fixing a number $a$ in advance, what is the chance that the new value (the ""quantile"") written on the ticket will not exceed $a$? Of course it's the proportion of tickets with values of $a$ or lower. But all such tickets, by construction, have values of $X$ that lie within the lower $100a\%$ of all the values. Therefore this chance cannot exceed $a$.

+ +

The chance might be strictly less than $a$ when $a$ is not one of the actual proportions in the box. Because it cannot be greater than $a$ and now cannot be equal to $a$ it has to be less than $a$!

+ +

A simple example is afforded by a box with two tickets: on one of these $X$ equals $0$ and on the other it equals $1$. When we write the proportions on the tickets, then, we will write $1/2$ (or $50\%$) on the first ticket (because half the tickets have values of $0$ or less) and $1$ (or $100\%$) on the second ticket (because all the tickets have values of $1$ or less).

+ +

What is the chance that this new value on a randomly drawn ticket will be less than or equal to $a=3/4$ (or $75\%$)? Because the new values are only $50\%$ and $100\%$, and half of them are less than $75\%$, the answer obviously is $1/2$. This is strictly less than $a$ because there are no proportions in the box between $50\%$ and $75\%$. The issue is just that trivial and simple.

+ +
+ +

The preceding used a tickets in a box metaphor for reasoning about random variables. If we replace $\Omega$ by a probability space, insist that $X$ be a measurable function, and understand ""proportion"" as the value of the probability measure, then we will have a rigorous proof. And it's still just as trivial.
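The two-ticket example can also be checked by simulation in R:

set.seed(1)
X  <- sample(c(0, 1), 1e5, replace = TRUE)   # the two tickets, drawn with equal chance
FX <- ifelse(X == 0, 0.5, 1.0)               # the new value F_X(X) written on each ticket
mean(FX <= 0.75)                             # about 0.5, strictly less than a = 0.75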

+",2013-10-25 17:37:09.560 +58240,15827.0,2,,58236.0,,,,CC BY-SA 3.0,"

Several questions are bundled together here:

+ +
  • Pearson correlation measures linearity of relationship, not similarity of values. $y$ and $a + by$ have correlation $+1$ for $b > 0$: make $a$ as different from zero or $b$ as large as you like, and the similarity is at best one of shape, not size. (Concordance correlation does measure agreement in the sense that $y = x$ is required for perfect positive correlation.)
  • Normality of distribution is not a requirement to measure correlation; correlation is perfectly well defined as a descriptive statistic (or even as a non-statistical property; it's just a cosine from one point of view) so long as both variables are genuinely variable. Marginal distribution is of concern if you wish to test for significance, e.g. produce a P-value.
  • But you have time series and can expect dependence in time and (quite likely) other kinds of structure. The standard machinery for Pearson correlation is for independent data, and no inference for Pearson correlation for time series can be taken seriously without adjustment for dependence structure.
+ +

What is the real problem? For assessing similarity of two time series, I would always start with plotting the series and examining (as appropriate) the difference or ratio between them. The next step is harder and entails modelling the series to see if they have the same structure; others will predictably make positive suggestions here. Also, here as elsewhere, a single summary measure will rarely do justice to the fine structure of interesting data.
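To illustrate the first bullet above with a two-line example:

set.seed(1)
y <- rnorm(50)
cor(y, 100 + 3 * y)   # exactly 1, even though the two series are nowhere near each other in value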

+",2013-10-25 17:37:25.393 +58278,15473.0,4,,,,,,CC BY-SA 3.0,"Refers to Toeplitz matrix or diagonal-constant matrix, which has constant entries on the diagonals, i.e. $A_{ij}=a_{i-j}$.",2013-10-26 19:31:28.250 +58244,20388.0,2,,58138.0,,,,CC BY-SA 3.0,"

a 'scale' matrix can be any matrix that is positive definite. Wishart distribution is often used in Bayesian hierarchical model to capture characteristics of a inverse covariance matrix.

+ +

Back to your problem, if you read the help page of rWishart carefully, it says:

+ +
+

If X1,...,Xm, Xi in R^p is a sample of m independent multivariate Gaussians with mean (vector) 0, and covariance matrix Σ, the distribution of M = X'X is W_p(Σ, m).

+
+ +

However in your toy example, you chose to sample $X_i$'s with different means and different variance, and the degree of freedom $p$ is predetermined by the size of your sample $X_i$, not randomly chosen.

+ +

A better example can be constructed as such:

+ +
require(MASS)
data = cbind(rnorm(100,0,5), rnorm(100,0,2), rnorm(100,0,3))
Sigma = cov(data)   # this is a 3 by 3 matrix
eigen(Sigma)        # check positive definite (all eigenvalues > 0)

# construct X (mvrnorm from MASS)
X = mvrnorm(100, rep(0,3), Sigma)

# define d.f.
df = dim(X)[1]

# generate random Wishart samples with df and Sigma
rWishart(10, df, Sigma)

# compute X'X
t(X) %*% X
+
+ +

You'll find that these random samples are roughly in the range of X'X; a more rigorous check can be done by looping said X'X a few times (~100,000) and taking the empirical mean. In theory this should agree with the first moment of the Wishart distribution, m*Σ, by the law of large numbers.

+ +

You can certainly generate non-central Wishart distribution, a good reference on this topic (or in fact any matrix variate distribution) is to look at Matrix Variate Distributions by Gupta and Nagar.

+ +

Hope this helps :)

+",2013-10-25 18:58:47.143 +58245,21905.0,1,,,,"Expected value of $Ye^X$ where $X \sim U(0,1)$ and $Y \sim U(0,1)$",,CC BY-SA 3.0,"

I am trying to find the expected value of $Z$ where $Z = Y\cdot e^X$ where $Y \sim U(0,1)$ and $X \sim U(0,1)$.

+ +

My attempt so far:

+ +

$$F_Z(z) = P(Ye^X \le z) = \int \int_{Ye^X \le z} f(x,y)\, dxdy$$

+ +

Where $f_{X,Y}(x,y) = f_y\cdot f_{e^x}$

+ +

$$f_Y(y) = \frac{1}{1-0}\,, \quad y \in (0,1)$$

+ +

I am stuck trying to find $f_{e^X}$ but I cannot remember how to find that pdf.

+",2013-10-25 19:06:11.117 +58246,23048.0,1,,,,Finding the expected value of two normal random variables,,CC BY-SA 3.0,"

Suppose $a_1 = b + c_1$ and $a_2 = 2b + c_2$ where $b, c_1, c_2$ are all $N(0,1)$

+ +

Find $E[b|a_1,a_2]$

+ +

My attempt: +As $E[b] = 0$, I assume $E[b|a_1, a_2] = 0$. Is this a logical assumption?

+",2013-10-25 19:19:09.190 +58247,23046.0,2,,58241.0,,,,CC BY-SA 3.0,"

@AsymLabs In R, you can just use the command lm(c(32,40,46) ~ c(1,2,3), weights=1/c(6,8,40)). To get slope = 7.4359 and intercept = 24.7179. That gives the fit whuber described. It's a regression weighted according to the inverse of the variance.

+",2013-10-25 20:13:45.023 +58248,12900.0,1,60256.0,,,Machine learning applications in number theory,,CC BY-SA 4.0,"

Is there any research into or applications of machine learning in number theory?

+

I am also looking for (leading examples of) statistical/empirical analysis of number theory questions. Also wondering if genetic algorithms in particular have ever been used in these areas.

+ +",2013-10-25 20:58:02.690 +58249,13385.0,2,,58245.0,,,,CC BY-SA 3.0,"

There is definitely an easier approach to this problem (hints were given in the comments), but since you asked about a specific step, I'll go from there.

+ +

You want to compute the pdf $f_{e^X}(x)$. Let's start with:

+ +

$$ F_{e^X}(x) = P(e^X < x) + = P(X < \log x) + = (F_X \circ \log)(x) +$$

+ +

Recall that we can compute $f_{e^X}$ by differentiating $F_{e^X}$. In this case, you can use the chain rule.

+ +

This kind of transformation generalizes to the multivariate case.
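As a quick sanity check of the identity $F_{e^X}(x) = F_X(\log x)$, with $X \sim U(0,1)$ as in the question and an arbitrary evaluation point, one can simulate in R:

set.seed(1)
x  <- 1.5                 # any point in (1, e)
Xs <- runif(1e5)
mean(exp(Xs) <= x)        # empirical F_{e^X}(x)
punif(log(x))             # F_X(log x), should be very close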

+",2013-10-25 23:49:46.850 +58250,23052.0,1,,,,Correlation coefficient between two arrays of 2D points?,,CC BY-SA 3.0,"

I have two arrays of 2D points and I need to estimate their correlation. What formula should I use?

+ +

Example of arrays:

+ +

$$X: ((1,5),(2,5),(1,7),(4,1)),$$

+ +

$$Y: ((3,4),(1,6),(4,6),(4,3)).$$

+",2013-10-26 00:01:06.250 +58251,23053.0,1,,,,Are $\mathbb{F}_2$-linear combinations of random variables in an i.i.d. Bernoulli process again an i.i.d. Bernoulli process?,,CC BY-SA 3.0,"

I'm having trouble understanding how certain combinations of random variables can correlate. The problem is as follows:

+ +

I have a binary $m \times n$ matrix $A$ of full rank (over the finite field $\mathbb{F}_2$ with two elements), where each row has exactly $w$ $1$'s (i.e., their weights are all $w$).

+ +

I randomly pick an $n$-dimensional binary vector $\boldsymbol{v} = (v_0,\dots,v_{n-1}) \in \mathbb{F}_2^n$, where each $v_i$ is $1$ with probability $p$ and $0$ with probability $1-p$. The probability $p$ is assumed to be strictly smaller than $\frac{1}{2}$. Here, $v_i$ are all independent, so that picking $\boldsymbol{v}$ is seen as an i.i.d. Bernoulli process of probability $p$.

+ +

I am going to hash this vector $\boldsymbol{v}$ into an $m$-dimensional vector $\boldsymbol{v}' = (v'_0,\dots,v'_{m-1})$ by taking $\boldsymbol{v}' = A\boldsymbol{v}^T$ over $\mathbb{F}_2$. So, if I write the $i$th row of $A$ as $\boldsymbol{r}_i$, each $v'_i$ is the product $v'_i = \boldsymbol{r}_i\cdot\boldsymbol{v}^T$ between the $i$th row of $A$ and $\boldsymbol{v}$. In other words, I'm just taking the mod $2$ sum of some bits of $\boldsymbol{v}$.

+ +

Because the weight of $\boldsymbol{r}_i$ is assumed to be $w$, the probability $P(v'_i = 1)$ that the $i$th bit $v_i'$ of the hashed vector $\boldsymbol{v}'$ is $1$ is

+ +

$$P(v'_i = 1) = \sum_{x: \text{odd}}^{w}\binom{w}{x}p^x(1-p)^{w-x},$$

+ +

which is a constant if we fix $p$ and $w$, and is also uniform across all $v_i'$.

+ +

Now, if $A$ had linearly dependent rows, the hashed bits $v_i'$ are clearly not independent. My question is:

+ +

What if all rows of $A$ are linearly independent? Is $\boldsymbol{v}'$ an i.i.d. Bernoulli process with probability $P(v'_i = 1)$? If not, can I approximate it as one if $\boldsymbol{r}_i$ and $\boldsymbol{r}_j$ have only few $1$'s at the same columns for all $i \not=j$?

+ +

I remember I read in some research paper in electrical engineering that $\boldsymbol{v}'$ is i.i.d. Bernoulli if rows of $A$ are linearly independent, although I can't seem to remember where it was. (I found one recent paper that says this is the case: V. Toto-Zarasoa, A. Roumy, C. Guillemot, Maximum likelihood BSC parameter estimation for the Slepian-Wolf problem, IEEE Comm. Lett. 15 (2011) 232–234. It's Lemma 1 on the second page.) But now I think about it, this is counterintuitive (to me) because if two rows of $A$ have $1$ at the same column, that means that I took at least one same $v_i$ when hashing $\boldsymbol{v}$ for those two rows, so the corresponding $v_i'$ look correlated.

+ +

If my reasoning and Alecos Papadopoulos's answer are correct, the last part of the above question in boldface becomes essential. For instance, are there standard methods for evaluating how close or similar a given set of random variables like $\boldsymbol{v}'$ is to i.i.d. Bernoulli?

+",2013-10-26 00:26:12.477 +58252,3185.0,1,58392.0,,,Sign of the unnormalized log likelihood in Ising model,,CC BY-SA 3.0,"

Here is a section of Machine Learning: a Probabilistic Perspective by Kevin Patrick Murphy

+ +

+ +

I don't understand in (19.18) why there is a negative sign. For me, $\log \tilde{p}(\mathbf{y})=\sum_{s\sim t}\log\psi_{st}(y_s,y_t)$ holds. When $y_s$ and $y_t$ agree, $\log\psi_{st}(y_s,y_t)=w_{st}$, otherwise $\log\psi_{st}(y_s,y_t)=-w_{st}$. So shouldn't it be $\log \tilde{p}(\mathbf{y})=\sum_{s\sim t} y_sw_{st}y_t$? Also, when all entries of $\mathbf{y}$ agree, $\sum_{s\sim t} y_sw_{st}y_t$ is maximized because all summands are positive. So where is the problem? Do I miss something? Thank you.

+",2013-10-26 02:02:38.987 +58253,7007.0,2,,58246.0,,,,CC BY-SA 3.0,"

Let $B,C_1,C_2$ be independent $\mathrm{N}(0,1)$ random variables. Define $A_1=B+C_1$ and $A_2=2B+C_2$. Since we are conditioning on the same information, and $C_1$ and $C_2$ have the same distribution, by symmetry we have +$$ + \mathrm{E}[C_1\mid A_1,A_2] = \mathrm{E}[C_2\mid A_1,A_2] +$$ +almost surely (we haven't used the independence assumption yet). Hence, +$$ + \mathrm{E}[B\mid A_1,A_2] = \mathrm{E}[B\mid A_1,A_2] + \mathrm{E}[C_2\mid A_1,A_2] - \mathrm{E}[C_1\mid A_1,A_2] +$$ +$$ + = \mathrm{E}[B+C_2-C_1\mid A_1,A_2] = \mathrm{E}[A_2-A_1\mid A_1,A_2] +$$ +$$ + = \mathrm{E}[A_2\mid A_1,A_2] - \mathrm{E}[A_1\mid A_1,A_2] = A_2 - A_1 = B+C_2-C_1 +$$ +almost surely. Therefore (Why? Remember the independence assumption and use this. What is the distribution of $-C_1$?), +$$ + \mathrm{E}[B\mid A_1,A_2]\sim \mathrm{N}(0,3) \, . +$$

+ +

(If you have any doubts about $\mathrm{E}[B\mid A_1,A_2]$ being a random variable, check this answer.)

+",2013-10-26 02:32:12.797 +58254,13385.0,2,,55722.0,,,,CC BY-SA 3.0,"

Finding a single, comprehensive book will be very difficult. If you're asking because you want to do some self-study, get a couple of used texts instead of a single new one. You can get classics for $3-10 dollars if you look around online.

+ +

Feller's ""Introduction to Probability"" is great for its completeness and expository style, but I don't like the exercises much. And the exposition would not make it so good for a reference. He tends to have a lot of long examples, which is great for fostering understanding, and not so great for looking things up.

+ +

I enjoyed Allan Gut's ""An Intermediate Course in Probability"". There is some overlap with Feller, but it goes into greater depth on those topics. He covers the various transformations, order statistics (which, if I recall, Feller only does by example).

+ +

Ross' Introduction to Probability Models is pretty comprehensive, but it is very example oriented. Again, that is not my favorite style (I'd rather they saved those examples for exercises with hints, and kept them out of the main flow), but if it works for you, I can recommend it.

+ +

You might as well consider Cacoullos' ""Exercises in Probability"" and Mosteller's ""50 Challenging Exercises in Probability"".

+",2013-10-26 02:37:43.937 +58255,22728.0,1,,,,Best subset selection,,CC BY-SA 3.0,"

My statistical learning text claims that for best subset selection, 2^p total models must be fit through regression if for p covariates, we fit p choose k models at each k, k = 1,...,p. I interpret this mathematically as 2^p = p choose 1 + ... + p choose k. Why 2^p models?

+",2013-10-26 02:45:12.413 +58256,594.0,2,,58255.0,,,,CC BY-SA 3.0,"

1) For each of the p covariates, think of an indicator showing whether the covariate is included as a predictor (1) or not (0). There are two possibilities for the indicator of each of the $p$ predictors.

+ +

That is, there are 2 x 2 x 2 ... x 2 models (where there are p terms)

+ +

2) You're almost correct, in fact it's:

+ +

$$\sum_{i=0}^p {p\choose i} = 2^p$$

+ +

e.g. ${2\choose 0} + {2\choose 1} + {2\choose 2} = 1 + 2 + 1 = 2^2$

+ +

As for why that's true, think of the binomial expansion of $(1+1)^p$.
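A one-line numerical confirmation in R, taking p = 10 as an arbitrary example:

p <- 10
sum(choose(p, 0:p))   # 1024
2^p                   # 1024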

+",2013-10-26 03:05:53.883 +58257,17660.0,2,,55150.0,,,,CC BY-SA 3.0,"

A Counterexample

+ +

The problem doesn't seem to be that mean independence (the condition where $E[Y|X] = E[Y]$) implies that $Y$ and $X$ are uncorrelated. If $X$ and $Y$ are not correlated, it is not generally true that they are mean independent. So this doesn't seem problematic so far.

+ +

However, suppose you had a relationship (we can call it causal) defined as $Y = WX$, where $X$ is distributed with a standard normal distribution and $W$ is distributed with a Rademacher distribution so that $W = 1$ or $-1$, each with probability $1/2$ (see this Wikipedia article). Then notice that $E[Y|X] = E[Y]$. Under your definition, this relationship would not be causal even though $Y$ clearly depends on $X$.

+ +

An Example of a Formal Way of Thinking About Causality

+ +

To give you maybe a clearer and more mathematical way to look at causality, take the following example. (I borrow this example from the book ""Mostly Harmless Econometrics."") Suppose you want to analyze the effect of hospitalization on health. Define $Y_i$ as some health measure of individual $i$ and $D_i \in \{0,1\}$ to indicate whether or not that individual was hospitalized. In our first attempt, suppose we look at the average difference in health of the two kinds of individuals: +$$ +E[Y_i | D_i=1] - E[Y_i|D_i=0]. +$$ +On first look at the data, you might notice, counter intuitively, that individuals that have been hospitalized actually have worse health than those that have not. However, going to the hospital certainly does not make people sicker. Rather, there is a selection bias. People who go to the hospital are those people that are in worse health. So this first measure does not work. Why? Because we are not interested in just the observed differences, but rather in the potential differences (we want to know what would happen in the counter-factual world).

+ +

Define the potential outcome of any individual as follows: +$$ +\text{Potential Outcome} = \left \{ +\begin{array}{ll} +Y_{1,i} & \text{if } D_i = 1 \\ +Y_{0,i} & \text{if } D_i = 0. +\end{array} +\right . +$$ +$Y_{0,i}$ is the health of individual $i$ if he had not gone to the hospital, regardless of whether he actually went or not (we want to think about counterfactuals) and in the same way, $Y_{1,i}$ is the health of the individual is he did go. Now, write the actual observed outcome in terms of the potentials, +$$ +Y_i = \left \{ +\begin{array}{ll} +Y_{1,i} & \text{if } D_i = 1 \\ +Y_{0,i} & \text{if } D_i = 0. +\end{array} +\right. +$$ +Thus, $Y_i = Y_{0,i} + (Y_{1,i} - Y_{0,i}) D_i$. Now, we can define the causal effect as $Y_{1,i} - Y_{0,i}$. This works because it is in terms of potentials. Now, suppose we again look at the observed differences in average health: +\begin{align*} +E[Y_i | D_i=1] - E[Y_i|D_i=0] &= E[Y_{1,i}|D_i = 1] - E[Y_{0,i}|D_i = 1] \\ + & \qquad + E[Y_{0,i}|D_i=1] - E[Y_{0,i}|D_i=0]. +\end{align*} +Notice that the term $E[Y_{1,i}|D_i = 1] - E[Y_{0,i}|D_i = 1]$ can be interpreted as the average treatment effect on the treated and $E[Y_{0,i}|D_i=1] - E[Y_{0,i}|D_i=0]$ as the bias in selection. Now, if the treatment $D_i$ is assigned randomly, then we have +\begin{align*} +E[Y_i | D_i=1] - E[Y_i|D_i=0] &= E[Y_{1,i}|D_i] - E[Y_{0,i}|D_i=0] \\ + &= E[Y_{1,i}|D_i] - E[Y_{0,i}|D_i=1] \\ + &= E[Y_{1,i} - Y_{0,i}|D_i=1] \\ + &= E[Y_{1,i} - Y_{0,i}], +\end{align*} +where we see that $E[Y_{1,i} - Y_{0,i}]$ is the average causal effect that we are interested in. This is a basic way of thinking about causality.
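A toy simulation of the hospital story above may help fix ideas; the constant causal effect of +1 and the selection threshold of -0.5 are arbitrary choices for illustration only:

set.seed(5)
n  <- 1e5
Y0 <- rnorm(n)                 # health without hospitalization
Y1 <- Y0 + 1                   # true causal effect of +1 for everyone
D_sel  <- as.numeric(Y0 < -0.5)        # sicker people select into hospital
D_rand <- rbinom(n, 1, mean(D_sel))    # randomized treatment, same share treated
Y_sel  <- ifelse(D_sel == 1, Y1, Y0)
Y_rand <- ifelse(D_rand == 1, Y1, Y0)
mean(Y_sel[D_sel == 1])   - mean(Y_sel[D_sel == 0])    # biased by selection, even negative
mean(Y_rand[D_rand == 1]) - mean(Y_rand[D_rand == 0])  # close to the true effect of 1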

+",2013-10-26 07:05:51.210 +58258,16737.0,1,,,,Overlapping time series: is there any better way to visualize them?,,CC BY-SA 3.0,"

I have this time series dataset:

+ +

+ +

The graph shows trend lines for 7 stock prices. They are very close and overlapping, but you will be able to get an idea that trend lines are layered (i.e. brown on the top and red/orange at the bottom, though far from conspicuous).

+ +

Is there any way to better visualize this data? like transforming $y$-axis to another scaling, mapping the whole thing onto cylinder/cone etc.? I tried with moving average, but the improvement is not so good.

+ +

NOTE: This is not an ML/DM problem. I am looking for a better/alternative/suitable visualization technique, that's all.

+",2013-10-26 07:20:29.463 +58259,15806.0,1,58274.0,,,Reporting operative effect in paired t test,,CC BY-SA 3.0,"

I just want to make sure I have something clear in my head. When I calculate the effect size for a paired samples t-test after obtaining a significant result, I simply take the mean of the differences divided by the standard deviation of the differences to get an effect size d. Do I then need to take d and divide it by the square root of 1-r, where r is the correlation between pairs and r is estimated from the sample pairs? I am confused because dividing by the square root of 1-r supposedly gives me an ""operative effect size"" and I'm not really sure if the operative effect size is what I should be reporting in my analyses. For example, in this report I am working on, I need to know if there was an effect size of 2 SD. So when I calculate my effect size, should I be dividing by square root 1-r? I don't think so, I think I need to report the actual detected effect size and not the operative effect, but I would love a second opinion. Thanks!

+",2013-10-26 07:42:44.340 +58260,15827.0,2,,58258.0,,,,CC BY-SA 3.0,"

Graphical comparison of time series is in principle straightforward: plot two or more series against time and look at the graph. Your example is one of many showing that it may not be so easy in practice.

+ +

This is pitched fairly generally. For stock prices, some of the strategies may not be especially relevant or successful, but they may have value for other kinds of series.

+ +

Some solutions, direct or indirect, include

+ +
    +
  • Graphical multiples, as already suggested by @Glen_b. Each series could be plotted separately. An extension to the idea of showing a reference series is this: For each series, plot the other series as a backdrop in a subdued colour (e.g. a light gray) and then plot the series of interest on top in a more prominent colour (a minimal sketch of this appears after the list).

  • +
  • Smoothing the series first. Even if you are also interested in fine structure, smoothing can help establish general patterns of change and thus aid understanding.

  • +
  • Looking at differences or ratios. One series of interest, or an average or other reference series, can be used to look at differences, or as appropriate ratios, of series rather than the series themselves. So, for example, plot (this series $-$ IBM) or (this series / IBM). If using ratios, then consider logarithmic scale too. (Ratios depend on all values being positive, or at least having the same sign, to work.)

  • +
  • Changing the aspect ratio. Erratic series with numerous changes of direction are often best plotted with an aspect ratio yielding short, long graphs, which you may need to split into different sections. The ideal is that typical segments are at about $45^\circ$. (That is a counsel of perfection for very long series.)

  • +
  • Sampling. Do you need every value? Would plotting every $k$th value be as informative visually? In some cases, sampling should include local maxima and minima to show important details. The principle here is that short-term changes are often noise and lacking in interest or intelligibility.

  • +
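A minimal R sketch of the backdrop idea in the first point, with simulated series standing in for the real data:

set.seed(1)
Y  <- sapply(1:7, function(i) cumsum(rnorm(250, sd = 0.5)) + i)  # 7 fake series
op <- par(mfrow = c(2, 4), mar = c(2, 2, 2, 1))
for (j in 1:7) {
  matplot(Y, type = "l", lty = 1, col = "grey80",
          xlab = "", ylab = "", main = paste("series", j))
  lines(Y[, j], col = "blue", lwd = 2)
}
par(op)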
+",2013-10-26 09:40:20.397 +58261,20473.0,2,,58251.0,,,,CC BY-SA 3.0,"

1) Meta-issue:
+I believe the OP should include in the title of the question a signal that something special is going on here - for example instead of ""Are linear combinations of..."" the title should read ""Are $\mathbb{F}_2-$linear combinations of..."", so that the reader gets the message that concepts may have special meanings in this specific question.

+ +

2) Bernoulli or not Bernoulli?
+By $\mathbb{F}_2-$arithmetic, the sum of i.i.d Bernoulli rv's is not a Binomial, since all probabilities are condensed on the values $0$ and $1$. We first derive the binomial distribution and then add the probabilities of all even values of the support to the probability of $0$, and the probabilities of all odd values of the support to the probability of $1$ -and thus we obtain again a Bernoulli r.v. This is just to validate the probability mass function included in the OP's question. It is by construction a Bernoulli random variable, irrespective of how $P(v'_i = 1)$ is derived. Moreover if the number of $1$'s is the same in each row of the matrix $A$, then each $v'_i$ has an identical Bernoulli marginal distribution.

+ +

3) $\mathbb{F}_2-$ linear independence.
3) $\mathbb{F}_2-$linear independence. +$\mathbb{F}_2-$linear independence does not look much like the ""usual"" concept of linear independence. To be able to obtain a square matrix of full row/column rank under $\mathbb{F}_2-$arithmetic, I conjecture that $w$, the number of $1$'s in each row, should be an odd number. Consider a square $n\times n$ matrix $A$, and the first element of one of its rows, say $a_{i1}$. I reason as follows:
+a) Assume $w$ is an even number.
+a1) Assume $a_{i1} = 0$. Then in the remaining $n-1$ elements of the row, there exists an even number of $1$'s, which by $\mathbb{F}_2-$arithmetic will give us $0$. The rest of the elements of the row are all zero, so overall the sum of the $n-1$ elements will give $0$, i.e. equal to the value of the first element.
+a2) Assume $a_{i1} = 1$. Then in the remaining $n-1$ elements of the row, there exists an odd number of $1$'s, which by $\mathbb{F}_2-$arithmetic will give us $1$. The rest of the elements of the row are all zero, so overall the sum of the $n-1$ elements will give $1$, i.e. again equal to the value of the first element.

+ +

So if $w=even$, the $\mathbb{F}_2-$sum of the $n-1$ columns will always equal the $n$-th column, depriving us of full rank.

+ +

b) Assume $w$ is an odd number.
+b1) Assume $a_{i1} = 0$. Then in the remaining $n-1$ elements of the row, there exists an odd number of $1$'s, which by $\mathbb{F}_2-$arithmetic will give us $1$. The rest of the elements of the row are all zero, so overall the sum of the $n-1$ elements will give $1$, i.e. different from the value of the first element.
+b2) Assume $a_{i1} = 1$. Then in the remaining $n-1$ elements of the row, there exists an even number of $1$'s, which by $\mathbb{F}_2-$arithmetic will give us $0$. The rest of the elements of the row are all zero, so overall the sum of the $n-1$ elements will give $0$, i.e. again different from the value of the first element.

+ +

So it appears that $w= odd$, is at least a necessary condition to have a matrix $A$ of full column/row rank.

+ +

4) Stochastic (in)dependence in the $\mathbb{F}_2-$world.
+Do the characteristics of the $\mathbb{F}_2-$field affect the concept of stochastic (in)dependence? No. Two r.v.'s are independent if and only if their joint distribution is the product of their marginals; that is, the conditional distributions must equal the unconditional ones. Maybe the way operations work in the $\mathbb{F}_2-$field produces some unexpected results? +Let's see: Assume that we have a square matrix $A$ that has $\mathbb{F}_2-$linearly independent rows. The column vector process $\boldsymbol{v}'$, say of dimension $5\times 1$, is written

+ +

$$\boldsymbol{v}' = A\boldsymbol{v} = \left[\begin{matrix} +v_1'(v_0,...)\\ +v_2'(v_0,...)\\ +v_3'(v_0,...)\\ +v_4'(v_0,...)\\ +v_5'(v_0,...)\\ +\end{matrix}\right]$$

+ +

Now assume that $w=3$ and, say, that the $1$'s in $A$ are dispersed such that we have

+ +

$$ v_2' = v_0+v_1+v_5,\qquad v_4' = v_2+v_3+v_5$$

+ +

Consider the conditional probability (under $\mathbb{F}_2-$ arithmetic)

+ +

$$P_{\mathbb{F}_2}(v_2' =1\mid v_4'=0) = P_{\mathbb{F}_2}(v_0+v_1+v_5 =1\mid v_2+v_3+v_5=0)$$

+ +

If the conditioning statement is to affect the probabilities of $v_2'$, it will do so through $v_5$: the fact that $v_2+v_3+v_5=0$ must affect the probabilities related to $v_5$. Under the ""usual"" arithmetic this would be obvious: it would mean that $v_5$ should equal zero. What happens under $\mathbb{F}_2-$ arithmetic? +We can examine +$$P_{\mathbb{F}_2}(v_5 =1\mid v_2+v_3+v_5=0) = \frac{P_{\mathbb{F}_2}(\{v_5 =1\}\land \{v_2+v_3+v_5=0\})}{P_{\mathbb{F}_2}(v_2+v_3+v_5=0)}$$

+ +

The possible values of $v_2+v_3+v_5$ under $\mathbb{F}_2-$ arithmetic are +

+ +

from which we get the following contingency table

+ +

+ +

Therefore +$$P_{\mathbb{F}_2}(v_5 =1\mid v_2+v_3+v_5=0) = \frac{2p^2(1-p)}{(1-p)^3+3p^2(1-p)}=\frac{2p^2}{1-2p+4p^2} \neq p$$ +except when $p=1/2$ - but the set up explicitly specifies that $p<1/2$ . So we conclude that +$$P_{\mathbb{F}_2}(v_5 =1\mid v_2+v_3+v_5=0) \neq P_{\mathbb{F}_2}(v_5 =1)$$

+ +

and so +$$P_{\mathbb{F}_2}(v_2' =1\mid v_4'=0) \neq P_{\mathbb{F}_2}(v_2' =1)$$
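A small simulation makes this concrete (p = 0.3 is an arbitrary choice below 1/2):

set.seed(1)
p <- 0.3
v <- matrix(rbinom(3e6, 1, p), ncol = 3)   # columns play the roles of v2, v3, v5
s <- rowSums(v) %% 2                       # v2 + v3 + v5 over F_2
mean(v[s == 0, 3])                         # estimate of P(v5 = 1 | v2 + v3 + v5 = 0)
2 * p^2 / (1 - 2 * p + 4 * p^2)            # the formula above, clearly != p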

+ +

I have shown that, in general, the elements of the vector process $\boldsymbol{v}'$ are not independent (although they have identical marginals), even under $\mathbb{F}_2$-arithmetic. I have found and read the lemma in the paper the OP mentions. There is no actual proof, just a verbal assertion that they are independent, due to the $\mathbb{F}_2$-linear independence of the rows of $A$. Hopefully, I am mistaken, but for the moment it appears the assertion is not valid.

+ +

5) Now what?
+Not much. The joint distribution of the random vector will depend on how the $1$'s are allocated in matrix $A$. Without a specific form, the various measures of distance between distributions become vacuous. Intuitively, if $w$ is small relative to the length of the rows of $A$, then one can expect/hope, that the dependence will be relatively weak, and so pretending that the vector is i.i.d. won't hurt much... but without knowing the joint distribution, one cannot really tell... Copulas for discrete r.v.'s suffer from identification issues... I am still thinking about it, but I am not optimistic.

+",2013-10-26 10:08:41.883 +58262,23058.0,1,,,,Simple OLS with two samples,,CC BY-SA 3.0,"

I want to obtain an unbiased estimator for $b_1$ in a simple regression like this: $Y_i = B_0 + B_1X_i + u_i$, when I have two samples (within each sample, Y and X have the same size), one of sample size l and one of sample size m. The respective sample means $\bar{Y_l},\bar{X_l}$ and $\bar{Y_m},\bar{X_m}$ are given. Now I wonder how I can start to get an unbiased estimator?

+ +

My idea was to use the 'normal/one-sample' formula and just put weights (correcting for different sample size between the two independent sets of data) in front.

+ +

An estimator for $b_1$ would be: (X'X)$^{-1}$X'Y without matrices: $\frac{\sum X_iY_i - N \bar{Y}\bar{X}}{\sum X_i^2 -N \bar{X}^2}$

+ +

which I wanted to modify to $\frac{l}{m+l} \frac{\sum X_iY_i - L \bar{Y_l}\bar{X_l}}{\sum X_i^2 - L \bar{X_l}^2} + \frac{m}{m+l} \frac{\sum X_iY_i - M \bar{Y_m}\bar{X_m}}{\sum X_i^2 - M \bar{X_m}^2}$

+ +

The capital M and L denoting the respective sample size.

+ +

Now I am not sure if my result is right, as I cannot show if it is unbiased, to be honest.

+ +

Is it unbiased in probabilistic terms? Or is it just a wrong estimator?

+",2013-10-26 10:25:30.220 +58279,22843.0,1,58284.0,,,What is a numerical example of $Var(X_1 + X_2) = Var(X_1) + Var(X_2)$,,CC BY-SA 3.0,"

I need a numerical example to illustrate cases where $Cov(X_1, X_2) = 0$. Can you think of examples involving functions or matrices?

+",2013-10-26 21:55:27.983 +58263,10450.0,2,,58258.0,,,,CC BY-SA 3.0,"

I am not certain what exactly you are trying to capture, but as these are financial time series I've assembled some possible alternative methods for visualizing the information.

+ +
    +
  • As they are stock time series, and I assume returns or price differences, I would recommend integrating (cumsum or cumprod) the series. The cumulative price series would be a better way to visually discern differences between the series (see the sketch at the end of this answer).
  • +
+ +

+ +
    +
  • If you are trying to visually get a feel for the differences between the series in their current form, I would consider breaking up the series into smaller time ranges (using something like panels or trellis plots), as the data look too compressed to discern much. Here, one can see some correlation of the daily series on monthly time intervals.
  • +
+ +

+ +
    +
  • You could also run overlapping density plots of each of the individual series to quickly ascertain differences in sample statistics (mean, variance, higher moments). In your case, I would expect to see some separation between the distributions, indicating differences in mean (drift), as well as differences in variance (volatility) between series.
  • +
+ +

+ +

The plots were generated via R.
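A rough sketch of the first and third points, with simulated daily returns standing in for the actual series (the drifts and volatilities below are made up):

set.seed(2)
R <- sapply(1:3, function(i) rnorm(500, mean = 0.0003 * i, sd = 0.01 * i))
# integrated (cumulative) series are much easier to tell apart by eye
matplot(apply(R, 2, cumsum), type = "l", lty = 1,
        xlab = "day", ylab = "cumulative return")
# overlapping density plots expose differences in drift and volatility
plot(density(R[, 1]), xlim = range(R), main = "return densities")
for (j in 2:3) lines(density(R[, j]), col = j)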

+",2013-10-26 10:40:43.203 +58264,22381.0,1,58271.0,,,How to interpret negative ACF (autocorrelation function)?,,CC BY-SA 3.0,"

+ +

So I plotted the ACF/PACF of oil returns and was expecting to see some positive autocorrelation but to my surprise I only get negative significant autocorrelation. How should I interpret the above graph? They seem to indicate that there is a tendency for oil returns to increase when it decreased previously and vice-versa, thus the oscillating behaviour. Please correct me if I'm wrong.

+",2013-10-26 11:45:24.910 +58265,1809.0,1,58266.0,,,Chi Squared Results in R and Python,,CC BY-SA 3.0,"

Based on this answer, Python requires expected values in a chi square test to be absolute frequencies. Consider the following in Python:

+ +
import numpy
+import scipy.stats
+# chisquare function requires (observed, expected)
+scipy.stats.chisquare(numpy.array([0,0,23,0]), numpy.array([1,1,1794,1]))
+(1751.2948717948718, 0.0)
+
+ +

results in a p-value of 0 (whatever that means).

+ +

The same calculation in R, which requires that the expected values be proprotions:

+ +
chisq.test(c(0, 0, 23, 0), p=c(1/1797,1/1797,1794/1797, 1/1797))
+
+        Chi-squared test for given probabilities
+
+data:  c(0, 0, 23, 0)
+X-squared = 0.0385, df = 3, p-value = 0.998
+
+ +

resulting in a p-value of 0.998.

+ +

Which is correct?

+",2013-10-26 12:31:31.527 +58266,503.0,2,,58265.0,,,,CC BY-SA 3.0,"

These two seem to be testing different things. The Python code looks like it is a two way chi square test (but a p value of 0 makes no sense here), while the R code is one way. I am not sure which you want.

+ +

To do the two way test in R use

+ +
x1 <- c(0, 0, 23, 0)
+x2 <- c(1, 1, 1794, 1)
+chisq.test(x1, x2)
+
+ +

Which gives a p value of 0.5.

+ +

However, since a lot of the expected values are less than 5, R correctly gives a warning. You can simulate using

+ +
chisq.test(x1, x2, simulate = TRUE)
+
+ +

which gives a p of 0.25

+ +

Your code also gives a warning, but this

+ +
chisq.test(c(0, 0, 23, 0),
+           p=c(1/1797,1/1797,1794/1797, 1/1797),
+           simulate = TRUE)
+
+ +

gives a p of 1.

+ +

This certainly makes sense.

+ +

I don't have Python so I can't say for sure what is going on there.

+ +

A two way chi square tests whether two categorical variables are associated with each other; a one way tests whether one categorical variable is distributed equal to a certain set of proportions.

+",2013-10-26 12:47:35.687 +58267,20473.0,2,,58262.0,,,,CC BY-SA 3.0,"

The unbiasedness property of the OLS estimator in the linear regression model is a finite-sample property, and it is based on a specific assumption of the model being correct, namely that the regressors are ""strictly exogenous to the error term"": $E(u_i|\mathbf X)=0$.

+ +

So if you accept that this assumption holds, as you indicate in a comment, and so the OLS estimator for each sample has the unbiasedness property, then a combination of the two will be unbiased if it is a linear combination with weights adding up to unity (but not necessarily a convex combination). Namely, let $\hat B_{1l}$ and $\hat B_{1m}$ be the two single sample estimators. Consider an estimator that it is some function of the two:

+ +

$$\hat B^* = h\left(\hat B_{1l},\hat B_{1m}\right) $$ +Its expected value is

+ +

$$E\left[\hat B^*\right] = E\left[h\left(\hat B_{1l},\hat B_{1m}\right)\right] $$

+ +

If $h()$ is not an affine function, then by Jensen's inequality +$$E\left[h\left(\hat B_{1l},\hat B_{1m}\right)\right] \neq h\left(E\hat B_{1l},E\hat B_{1m}\right)$$ +and in general $\hat B^*$ won't be unbiased.

+ +

Assume now that $h()$ is affine namely

+ +

$$\hat B^* = a_0 +a_1\hat B_{1l}+a_2\hat B_{1m} $$

+ +

with $a$'s being constants. Then

+ +

$$E\left[\hat B^*\right] = a_0 +a_1E\hat B_{1l}+a_2E\hat B_{1m} =a_0 + (a_1+a_2)B_{1}$$

+ +

For +$$E\left[\hat B^*\right] = B_{1} \Rightarrow a_0 = (1-a_1-a_2)B_{1} $$

+ +

This condition depends on the unknown coefficient $B_1$ except if we set $a_0=0,\; a_1=1-a_2$, in which case it will hold always. In principle, these conditions do not exclude the possibility that $a_2 >1, a_1<0$, in which case we have no longer a convex combination. But interpreting negative weights is difficult (although in forecasting literature negative weights have been found to increase efficiency occasionally), so usually we take the convex combination, i.e. $0<a_1<1,\; 0<a_2<1, \; a_1+a_2=1$.
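A small simulation illustrates the convex-combination case ($a_0=0$, $a_1+a_2=1$); the true coefficients, sample sizes and weights below are arbitrary:

set.seed(3)
B0 <- 1; B1 <- 2; l <- 30; m <- 70
a1 <- l / (l + m)                      # one possible choice of weights
est <- replicate(5000, {
  xl <- rnorm(l); yl <- B0 + B1 * xl + rnorm(l)
  xm <- rnorm(m); ym <- B0 + B1 * xm + rnorm(m)
  a1 * coef(lm(yl ~ xl))[2] + (1 - a1) * coef(lm(ym ~ xm))[2]
})
mean(est)                              # close to B1 = 2, as expected under unbiasedness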

+",2013-10-26 13:05:55.553 +58268,1809.0,1,,,,Test to compare large and small datasets,,CC BY-SA 3.0,"

I send Alice and Bob out to record people's eye colour (blue, brown, green, other). Alice does a great job and writes down the eye colour of 2000 people. Bob only records the eye colour of 20 people. Alice found all 2000 people had brown eyes. Bob found 19 people with brown eyes and one with green eyes. How important is the one green result? How can I compare the results? Is the green result significant?

+ +

Update +I now send Alice and Bob out to a different area on another day. Alice is thorough and finds 2000 people again, while Bob is still lazy and only finds 20 people. Their results would look something like this:

+ +
| Day | Dataset | Blue | Brown | Green | Other |
+|  1  | Alice   | 0    | 2000  | 0     | 0     |
+|  1  | Bob     | 0    | 19    | 1     | 0     |
+|  2  | Alice   | 10   | 1900  | 45    | 45    |
+|  2  | Bob     | 2    | 10    | 6     | 6     |
+
+ +

I repeat this process, sending them out to different areas on different days. Alice always finds 2000 people and Bob always finds 20.

+ +

Considering each day separately, how can I determine if Alice or Bob's results are more representative of the population they sampled on a given day? How can I compare their results for a given day?

+",2013-10-26 13:23:28.290 +58280,6204.0,2,,58279.0,,,,CC BY-SA 3.0,"

This will be (approximately) true of any two independent variables. If you're ok with cov(x,y) being nearly but not actually 0, generating an example should be trivial:

+ +
set.seed(123)
+N=1000
+x = rnorm(N)
+y = rnorm(N)
+cov(x,y)
+
+0.0865909
+
+ +

As N approaches infinity, your covariance will approach zero.

+",2013-10-26 22:05:46.537 +58269,21985.0,1,58299.0,,,Asymptotic normal distribution via the central limit theorem,,CC BY-SA 3.0,"

I have a sample $n = 100$ with two ""successes"" (Two kids having a disease among 100). So we obviously have a binomial distribution.

+ +

First I had to compute the maximum likelihood (ML) estimator $\hat{p}$. I got $\hat{p} = \frac{k}{n}$.

+ +

Now, I have to derive asymptotic normal distribution for $\hat{p}$ via the central limit theorem (CLT).

+ +

I know that the expected value of $\hat{p}$ is not infinite and also variance is not infinite, so I know it will be normally distributed.

+ +

I have to know expected value and variance of $\hat{p}$ to get the asymptotic normal distribution, right?

+ +

I know that expected value is $\frac{k}{n}$. But what is variance?

+",2013-10-26 13:27:28.340 +58270,22410.0,1,58286.0,,,What is the meaning of this formula in R?,,CC BY-SA 3.0,"

I have rows of data with columns age, sex, education and income.

+ +

I am doing homework that asks me to predict income with naive Bayes in R with the formula
+income ~ age + sex + educ

+ +

I know formulas tell R the shape of the model to apply in the method.

+ +

I also know that formulas don't refer to any specific variables in a dataset -- they only give the shape of the data.

+ +

So with all that said, what does the formula income ~ age + sex + educ mean? I am guessing that it means

+ +
p(income|age, sex, educ) = P(y|x1,x2,x3) = P(x1|y) * P(x2|y) * P(x3|y) * P(y)
+
+",2013-10-26 14:37:56.087 +58271,10135.0,2,,58264.0,,,,CC BY-SA 3.0,"

Negative ACF means that a positive oil return for one observation increases the probability of having a negative oil return for another observation (depending on the lag) and vice-versa. Or you can say (for a stationary time series) if one observation is above the average the other one (depending on the lag) is below average and vice-versa. Have a look at ""Interpreting a negative autocorrelation"".
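For comparison, here is a simulated series whose lag-1 autocorrelation is genuinely negative (an MA(1) with a negative coefficient, chosen arbitrarily):

set.seed(4)
x <- arima.sim(model = list(ma = -0.7), n = 500)
acf(x)   # the lag-1 spike is negative: values above the mean tend to be followed by values below it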

+",2013-10-26 14:48:01.100 +58272,34640.0,1,,,Lingxiang Cheng,Use hierarchical clustering in R to cluster items into fixed size clusters,,CC BY-SA 3.0,"

I am trying to use R to do Kmeans clustering and as most people I ran into the challenge of determining when to finish. I have 10,000 items and potentially 10 times of that down the road. My goal is to create a series of clusters with minimal size (e.g. 50 items per cluster) OR reasonably similar items. In other words, I don't want any of my output clusters to be too small (even if the items are quite different from each other), but I also don't mind if the clusters are too big as long as the items are similar enough.

+ +

I imagine I can use some kind of divisive hierarchical approach. I can start by building a small number of clusters and examine each cluster to determine if it needs to be split into more clusters. I can keep doing this till all clusters meet my stopping criteria.

+ +

I wonder if anyone knows good information on how other people do this?

+",2013-10-26 16:19:39.950 +58273,20603.0,2,,58272.0,lejlot,,,CC BY-SA 3.0,"

There is a whole family of hierarchical clustering which should suit your needs, as it creates a tree, where each level represents the bigger (more general) clusters. Analysis of this structure and some custom cutting will bring you to described solution.

+ +

In R you can check out this source http://cran.r-project.org/web/views/Cluster.html , where you will find some hierarchical clustering implementations.

+ +

The easiest approach would be to:

+ +
    +
  • run hierarchical clustering (any) and analyze the tree and select clusters generality which fits your constraints
  • +
  • cluster with any existing method, and then prune the small clusters (remove them iteratively and assign each point to the nearest of the remaining clusters); a rough sketch of this is given after the list.
  • +
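A rough sketch of the second option, on toy data (the minimum size, the number of initial clusters and the linkage are all arbitrary choices):

set.seed(1)
X <- matrix(rnorm(1000 * 5), ncol = 5)     # 1000 toy items, 5 features
min_size <- 50
cl <- cutree(hclust(dist(X), method = "ward.D2"), k = 15)

repeat {
  sizes <- table(cl)
  small <- names(sizes)[sizes < min_size]
  if (length(small) == 0) break
  keep <- setdiff(names(sizes), small)     # clusters that are already big enough
  centroids <- t(sapply(keep, function(g) colMeans(X[cl == g, , drop = FALSE])))
  idx <- which(cl %in% small)
  # distance from each orphaned point to each surviving centroid
  d <- as.matrix(dist(rbind(centroids, X[idx, , drop = FALSE])))
  d <- d[-(1:nrow(centroids)), 1:nrow(centroids), drop = FALSE]
  cl[idx] <- as.integer(keep[max.col(-d)])
}
table(cl)   # every remaining cluster now has at least min_size items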
+",2013-10-26 16:24:26.117 +58274,3993.0,2,,58259.0,,,,CC BY-SA 3.0,"

If you are constructing $d$ as the mean difference divided by the standard deviation of the difference scores -- rather than by the pooled standard deviation of scores in each group, as $d$ is conventionally defined! -- then that is already what I referred to (following Cohen, 1988) as the ""operative effect size."" So further dividing this operative effect size by $\sqrt{1-r}$ would not make sense, because that correction is already ""built in"" to that instantiation of $d$.

+ +

I think @John has a good brief discussion of different ways of computing $d$ at the bottom of his answer HERE. John mentions that some people firmly believe that $d$ should always be computed using the classical, independent-groups specificaton. I am one of these people. (Cohen was also one of these people. That's why he used the separate term ""operative effect size"" to talk about other ways of computing $d$.) I think it is a very bad idea to give $d$ different definitions in different contexts. Aside from the important problem this creates of killing any possible comparison of $d$ sizes between experimental paradigms that tend to use different designs, this inconsistent definition of $d$ also fosters confusion about what any given person means when they speak of $d$, unless they explicitly say which $d$ they mean! I believe this latter confusion is exactly what we have experienced here.

+ +

The ""operative effect size"" language convention is an attempt to allow us to talk sensibly and unambiguously about these nonstandard, but still useful, definitions of $d$ (nonstandard in the sense that they deviate from Cohen's definition). In case you are wondering, I believe Cohen calls these ""operative"" effect sizes because they are the effect sizes that are relevant for conducting a power analysis, which is what makes them useful. But let's keep in mind that this is only one of the uses of an effect size measure.

+ +
    +
  • Cohen, J. (1988). Statistical power analysis for the behavioral sciences (2nd edition). Routledge.
  • +
+",2013-10-26 18:22:10.630 +58275,3993.0,2,,57916.0,,,,CC BY-SA 3.0,"

This is not a ""problem"" and does not need to be ""solved."" As you already noted yourself, this apparent multicollinearity is a natural consequence of using dummy codes. If you use non-orthogonal codes, you get non-orthogonal parameter estimates. My advice: ignore it.

+",2013-10-26 18:27:01.227 +58276,16046.0,1,,,,Numerical sampling in hierarchical Bayesian models (HBM),,CC BY-SA 3.0,"

I am reading chapter 5 of $\textit{Bayesian Data Analysis}$ by Gelman $\textit{et al.}$. There it explains the few steps of data analysis for hierarchical models and if I quote from the book it will be:

+ +

""We first perform the following three steps analytically.

+ +
    +
  1. Write the joint posterior density, $p( \theta, \phi|y)$, in unnormalized +form as a product of the hyperprior distribution $p(\phi)$, the +population distribution $p(\theta|\phi)$, and the likelihood $p(y|\theta)$.

  2. +
  3. Determine analytically the conditional posterior density of $\theta$ given +the hyperparameters $\phi$; for fixed observed $y$, this is a function +of $\phi$, $p(\theta|\phi, y)$.

  4. +
  5. Estimate $\phi$ using the Bayesian paradigm; that is, obtain its +marginal posterior distribution, $p(\phi|y)$.

  6. +
+ +

The first step is immediate, and the second step is easy for conjugate models +because, conditional on $\phi$, the population distribution for $\theta$ is just the iid +model (5.1), so that the conditional posterior density is a product of conjugate +posterior densities for the components $\theta_j$. +The third step can be performed by brute force by integrating the joint +posterior distribution over $\theta$: +$$p(\phi|y) = \int{p(\phi,\theta|y)d\theta} \ \ \ \ \ \ \ \ \ \ \ \text{(5.4)}""$$

+ +

Now my question is whether we can (in a numerical approach) sample from $p(\phi,\theta|y)$ and ignore the $\theta$s to generate a sample of $p(\phi|y)$?

+",2013-10-26 18:52:30.817 +58277,0.0,5,,,,,,CC BY-SA 3.0,,2013-10-26 19:31:28.250 +58284,7007.0,2,,58279.0,,,,CC BY-SA 3.0,"

As a very simple example (maybe too simple?), consider $X,Y\in\{0,1\}$ with joint distribution defined by the table

+ +
  Y \ X   0    1    
+  0     1/4  1/4 1/2
+  1     1/4  1/4 1/2
+        1/2  1/2   1
+
+ +

This table also displays the marginal distributions of $X$ and $Y$. First, check that $X$ and $Y$ are independent. For example, +$$ + \mathrm{Pr}(X=0,Y=0) = 1/4 = 1/2 \times 1/2 = \mathrm{Pr}(X=0)\,\mathrm{Pr}(Y=0) \, , +$$ +and so on. Now, compute the distribution of $Z=X+Y\in\{0,1,2\}$. For example, +$$ +\mathrm{Pr}(Z=1) = \mathrm{Pr}(X=1,Y=0) + \mathrm{Pr}(X=0,Y=1) = 1/2 \, . +$$ +Using these distributions, compute $\mathrm{Var}(X),\mathrm{Var}(Y)$, and $\mathrm{Var}(Z)$.
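For completeness, a quick numerical check in R (the values below simply encode the table):

v_discrete <- function(vals, probs) sum(probs * vals^2) - sum(probs * vals)^2
v_discrete(0:1, c(1/2, 1/2))          # Var(X) = Var(Y) = 0.25
v_discrete(0:2, c(1/4, 1/2, 1/4))     # Var(Z) = 0.5 = Var(X) + Var(Y)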

+",2013-10-26 22:18:23.570 +58285,8671.0,1,,,,approximation to maximum and minimum function : soft-min and soft-max,,CC BY-SA 3.0,"

The approximation to the function $\max(x)$ can be written as a NOISY-OR as given below: $$ \max_k(x_k) \approx 1-\prod_k(1-x_k) $$

+ +

Is there any way to approximate $\min(x)$?

+",2013-10-26 23:02:44.790 +58286,594.0,2,,58270.0,,,,CC BY-SA 3.0,"

Unless you're using terms differently than what I understand you to mean, you're mistaken when you assert that ""formulas don't refer to any specific variables in a dataset"". They certainly do refer to specific variables, explicitly by name.

+ +

See this stackoverflow answer for some background information and where R formulas originate.

+ +

Formulas are used for many purposes in R, and a specific component of a formula (such as a variable name or an operator) may have a somewhat different meaning in a different context.

+ +

The meaning of the formula in plot(y ~ x1 + x2, data=mydata) and in lm(y ~ x1 + x2, data=mydata) and in glm(y ~ x1 + x2, family=binomial, data=mydata) are all somewhat different ... and as you go further afield, meanings can change even more, even between packages doing very similar things.

+ +

So what that formula might mean in R is very context dependent -- and we don't have sufficient context.

+ +

(You don't even mention whether you're using a package in R or building something yourself.)

+ +

Given this is a naive Bayes classifier, your interpretation certainly makes sense (think in terms of logs, for example), and likely that's what I'd have anticipated it to mean, but that's not really anything to do with R unless you're using some particular package... whose specific interpretation of formulas we might then be able to explain.
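For instance, if the homework happens to use the naiveBayes() function from the e1071 package (an assumption; other packages interpret the formula similarly), the formula just names the response and the predictors, and the conditional independence assumption you wrote down is applied internally:

library(e1071)
set.seed(1)
d <- data.frame(
  age    = sample(18:70, 200, replace = TRUE),
  sex    = factor(sample(c("F", "M"), 200, replace = TRUE)),
  educ   = factor(sample(c("HS", "BA", "MA"), 200, replace = TRUE)),
  income = factor(sample(c("low", "high"), 200, replace = TRUE))  # response must be categorical
)
fit <- naiveBayes(income ~ age + sex + educ, data = d)
predict(fit, newdata = d[1:5, ])   # class with the largest P(x1|y) P(x2|y) P(x3|y) P(y)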

+",2013-10-26 23:04:56.650 +58287,6728.0,1,,,,Optimal orthogonal polynomial chaos basis functions for log-normally distributed random variables,,CC BY-SA 4.0,"

I hope this is the appropriate venue for this type of question. If not, please feel free to migrate! :)

+ +

I'm trying to solve a stochastic partial differential equation of the form $$\alpha(\omega)\nabla^2u=f$$ +where $\alpha(\omega)$ represents a random field that is log-normally distributed, i.e. it has a probability density function $$f(x)=\frac{1}{x\sqrt{2\pi\sigma^2}}e^{-\frac{(\log(x)-\mu)^2}{2\sigma^2}}.$$

+ +

I want to represent the solution of this problem as a polynomial chaos expansion $$u=\sum_{i=0}^p u_i(x)\Psi_i(\xi)$$ where $u_i(x)$ is a deterministic coefficient and $\Psi_i(\xi)$ are orthogonal polynomials in terms of a random variable $\xi$ with the same log-normal probability density function.

+ +

According to Xiu & Karniadakis (2002), certain orthogonal polynomial bases give optimal (exponential) convergence of finite expansions to the true solution $u$. For instance, Hermite polynomials are optimal for Gaussian distributions, Legendre polynomials for uniform distributions, Laguerre for gamma distributions, etc. (see the above paper, bottom of page 8).

+ +

What is the corresponding optimal polynomial basis for log-normal distributions?

+",2013-10-26 23:41:14.957 +58288,,1,58750.0,,user31766,What's the difference between observable fixed effect and control variable?,,CC BY-SA 4.0,"

I am confused about the exact definitions here.

+ +

Assuming I have a cross-sectional regression, let's say, Wage on Education and I additionally control for observable characteristics with a set of dummies or variables like intelligence level, age, parent's education level, urbanization area, gender, race, work experience etc.

+ +
    +
  1. Does this mean I used dummies/variables to ""control for observable fixed effects"" that I obtained through my data collection? (Is, for example, parent's education level thus an observable fixed effect?)

  2. +
  3. Do unobservable fixed effects like ability (as often quoted in the literature) are then said to be ""controlled"" by proxies through my dummies like intelligence level, experience etc.?

  4. +
+ +

In a way I'd like to know the exact difference between controlling for a variable, observable and unobservable characteristics, observable and unobservable fixed effects. Thanks.

+",2013-10-27 00:01:04.910 +58289,22843.0,1,,,,Explain Statistics: Matching formulas for chi square tests,,CC BY-SA 3.0,"

The chi square formula given in my book is:

+ +

$\chi^2 = \frac{(n-1)s^2}{\sigma^2} $

+ +

At first I admitted to feeling some kind of strangeness to this formula, but after a few hours I realized that $\chi^2$ was $\displaystyle \sum_{i=1}^n \left(\frac{X_i - \mu}{\sigma}\right)^2 - \left( \frac{\bar{x} - \mu}{\frac{\sigma}{\sqrt{n}}} \right)^2$ in disguise. However, there's a formula in my book that says that

+ +

$\chi^2 = \displaystyle \sum_{i=1}^n \frac{(O_i - E_i)^2}{E_i}$ where $O_i$ is the observed value and $E_i$ is the expected value. I want to show that this expression is equivalent to the one above: $$\displaystyle \sum_{i=1}^n \left(\frac{X_i - \mu}{\sigma}\right)^2 - \left( \frac{\bar{x} - \mu}{\frac{\sigma}{\sqrt{n}}} \right)^2$$

+ +

This equation comes from the fact that $(n-1)s^2 = \displaystyle \sum_{i=1}^n ((X_i - \mu)+(\mu - \bar{x}))^2$

+",2013-10-27 01:28:50.803 +58297,23064.0,1,58298.0,,,Combining discrete and continuous variables,,CC BY-SA 3.0,"

I need to find the pdf of a random variable which is a mixture of discrete and continuous random variables. I have seen on this website that it does not exist in the general case, but maybe in this one it does.

+ +

In any case, I have $X \sim Bern(p)$ where $p$ is known, and I have $Y = XW+(1-X)Z$ where $W,Z$ are both continuous with pdf also known. For the moment, I've tried to +\begin{align*} +\text{cdf}_Y (y) & = P( Y \leq y) = P( XW+(1-X)Z \leq y) \\ +& = P( ... \leq y \mid X=0 ) + P( ... \leq y \mid X =1) \\ +& = \text{cdf}_W(y) + \text{cdf}_Z(y) +\end{align*} +I am just not sure I am allowed to go from the first line to the second...is this correct ? Does anyone have any suggestion on this problem ?

+ +

Thank you very much !

+",2013-10-27 13:06:52.080 +58298,1889.0,2,,58297.0,,,,CC BY-SA 3.0,"

$P( Y \leq y) $

+ +

$= P( Y \leq y|X=1)P(X=1) + P( Y \leq y|X=0)P(X=0)$

+ +

$=P( Y \leq y|X=1)p + P( Y \leq y|X=0)(1-p)$

+ +

$=pP( W\leq y) + (1-p)P( Z \leq y)$

+ +

So $F_Y(y)=pF_W(y)+(1-p)F_Z(y)$ and thus $f_Y(y)=pf_W(y)+(1-p)f_Z(y)$
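A quick Monte Carlo check of this mixture formula, taking $W \sim N(0,1)$ and $Z \sim \mathrm{Exp}(1)$ purely as illustrative choices:

set.seed(42)
p <- 0.3; n <- 1e5
X <- rbinom(n, 1, p)
Y <- X * rnorm(n) + (1 - X) * rexp(n)
y0 <- 0.7
mean(Y <= y0)                        # empirical cdf of Y at y0
p * pnorm(y0) + (1 - p) * pexp(y0)   # p F_W(y0) + (1-p) F_Z(y0)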

+",2013-10-27 13:19:42.593 +58290,10957.0,1,58316.0,,,ROC curves and AUC in simulations to compare models,,CC BY-SA 3.0,"

I am using ROC curves to compare different methods but not sure if I need to re-simulate datasets using different seeds in R in order to reduce the ""by-chance"" issue for a particular output. Here is a brief outline of my simulation:

+ +
    +
  1. The function generate.data is used to simulate data of some distribution, and by simulation, I know which data are true positives. The random number generator is controlled by fixing the seed in R.

  2. +
  3. The function check.models is used to test a total of 5 methods, and return the quantities used to draw a ROC curve for each method. Also for each curve (method), the AUC is reported.

  4. +
  5. The function plot.roc is used for plotting.

  6. +
+ +

In step #1, there are some other factors to change so that the data are under different ""alternatives"". When I run steps #1 and #2 above using seed=123 and pick the method with the highest AUC, I get one set of results. However, when I re-run using a different seed (say seed=456), I get another set of results not identical to the first run. Therefore, I think rigorously I should run my simulation across different seeds in R to generate data in step #1, so that the ""by-chance"" issue of using a particular dataset is reduced.

+ +

Am I correct? If so, then I should report the average of the AUC's for each method across (say, 1000) simulations, and pick up the highest among the methods compared? Thanks!

+",2013-10-27 02:05:28.200 +58291,7275.0,1,,,,Joint PMF for two Geometric distribution variables,,CC BY-SA 4.0,"

I am interested to know how to calculate the joint probability mass function for two independent geometric random variables.

+

Suppose two variables X1 and X2 are independent, such that Xi∼Geometric(theta), how to find the joint pmf distribution of X1 and X2. I am not sure but I think it should be the product of pmf of both mass function.

+

Also, how should I calculate the probability of the event that the kth trial is the first success/failure for both variables, or that it is the k1th trial for X1 and the k2th trial for X2?

+",2013-10-27 02:37:25.640 +58292,23066.0,1,59638.0,,,"How do I use the “survival” package and ""Surv"" function in R with left-truncated data?",,CC BY-SA 3.0,"

I am trying to run survival analysis using the Surv and survfit functions from the survival package in R. Most of my data is left truncated, and I'm not sure if I'm entering it into the Surv function correctly. My response variable is time (measured in years) beginning from when a bridge is classified as deficient, and ending when it collapses. I can track each bridge's deficiency status from 2012 back to 1992, but no further. The censoring occurs because many bridges were classified as deficient from the time of their collapse back to 1992, and thus I don't know exactly when they became deficient, and therefore I don't know their true ""lifetime"" (number of years from deficient classification to collapse). Say for example a bridge collapsed in 1995, and was classified as being deficient in 1995, 1994, 1993, and 1992. It is possible that it was first classified as being deficient in 1992, it is also possible that it has been classified as deficient since 1984. Thus I believe my censoring is considered to be left truncated.

+ +

Some example data:

+ +
Year0 = c(1992, 1992, 1999, 1992, 1993, 2007, 2005, 1992) # The years when each bridge     was first observed as being deficient.
+Year1 = c(1993, 1994, 2002, 1996, 2004, 2012, 2011, 2000) # The years in which each bridge collapsed
Defyears = Year1 - Year0 + 1 # The number of years for which I can observe each bridge being deficient
+time1 = Year0 - 1992 # Since I want the time scale to be from 0 to 21 instead of 1992 - 2012, I subtract 1992 from each time observation.
+                     # This now becomes the beginning point for the lifetime of each bridge.
+time2 = Defyears + time1 # This is the ending point of the lifetime of each bridge.
+n = length(time2)
+
+ +

Notice that four out of the eight bridges are left truncated, bridge 1, 2, 4, and 8. I cannot observe exactly when they were first classified as being deficient. For bridges 3, 5, 6, and 7 I know their exact lifetimes since they became deficient after 1992, hence these observations are not censored.

+ +

I then fit the model:

+ +
bridges = survfit(Surv(time = time1, time2 = time2, event = rep(1,n)) ~ 1) # I do ""event = rep(1,n)"" because each bridge collapsed.
+
+ +

I'm just not sure that this model is correct. For one thing, in the documentation it says that time is for right censored data or the starting time for interval censored data. For another, I don't see how this model accounts for the observations that aren't censored. Can anyone tell me if this is right, and if not, what I need to change and why. Any help is greatly appreciated. Thanks so much!

+",2013-10-27 04:18:19.037 +58293,22419.0,1,58294.0,,,Intuition for consequences of multicollinearity,,CC BY-SA 3.0,"

So we have a regression equation with one explained variable and 10 explanatory variables.

+ +

What I have read so far:

+ +
    +
  1. Multicollinearity doesn't affect the regression of the model as a whole.

  2. +
  3. But if we start looking at the effect of individual predictor variable Xs on the explained variable, then we are going to have inaccurate estimates.

  4. +
+ +

I have tried to start thinking intuitively about it like follows:

+ +
    +
  • High Multicollinearity means that in a matrix, two or more rows/columns are linearly dependent on each other. In other words, in a 3-dimensional space there are 2 vectors which have (almost) the same direction only different magnitudes (is this right?)
  • +
+ +

I'd appreciate it if someone could explain how this translates into ""multicollinearity not affecting the regression as a whole but only individual variable's coefficient estimates"".

+ +

Also, could someone explain the statement in bold? I cant make sense out of it:

+ +
+

One other thing to keep in mind is that the tests on the individual + coefficients each assume that all of the other predictors are in the + model. In other words each predictor is not significant as long as all + of the other predictors are in the model. There must be some + interaction or interdependence between two or more of your predictors.

+
+ +

which was an answer to this question: How can a regression be significant but all predictors insignificant?

+",2013-10-27 06:02:56.510 +58294,21762.0,2,,58293.0,,,,CC BY-SA 3.0,"

Let us first distinguish between perfect multi-collinearity (model matrix not of full rank, so that usual matrix inversions fail. Usually due to misspecification of the predictors) and non-perfect multi-collinearity (some of the predictors are correlated without leading to computational problems). This answer is about the second type, which occurs in almost any multivariable linear model since the predictors have no reason to be uncorrelated.

+ +

A simple example with strong multi-collinearity is a quadratic regression. So the only predictors are $X_1 = X$ and $X_2=X^2$:

+ +
set.seed(60)
+
+X1 <- abs(rnorm(60))
+X2 <- X1^2
+cor(X1,X2)   # Result: 0.967
+
+ +

This example illustrates your questions/claims:

+ +

1. Multicollinearity doesn't affect the regression of the model as a whole.

+ +

Let's have a look at an example model:

+ +
Y <- 0.5*X1 + X2 + rnorm(60)
+fit <- lm(Y~X1+X2)
+summary(fit)
+
+#Result
+[...]
+
+Coefficients:
+            Estimate Std. Error t value Pr(>|t|)
+(Intercept)  -0.3439     0.3214  -1.070    0.289
+X1            1.3235     0.8323   1.590    0.117
+X2            0.5861     0.3931   1.491    0.141
+
+Residual standard error: 1.014 on 57 degrees of freedom
+Multiple R-squared:  0.7147,    Adjusted R-squared:  0.7047 
+F-statistic: 71.39 on 2 and 57 DF,  p-value: 2.996e-16
+
+ +

Global statements about the model are just fine:

+ +
    +
  • R-Squared: $X$ explains about 71% of the variability of $Y$
  • +
  • Global F-test: At the 5% level, there is really an association between $X$ and $Y$
  • +
  • Predictions: For persons with $X$-value 2, a best guess for his $Y$-value is +$$ +-0.3439 + 1.3235\cdot 2 + 0.5861 \cdot 2^2 = 4.6475 +$$
  • +
+ +

2. But if we start looking at the effect of individual variable Xs on the explained variable, then we are going to have inaccurate estimates.

+ +

The estimates are accurate; this is not the problem. The problem with the standard interpretation of isolated effects is that we hold all other predictors fixed, which is strange if there are strong correlations to those other predictors. In our example it is even wrong to say ""the average $Y$ value increases by 1.3235 if we increase $X_1$ by 1 and hold $X_2$ fixed"", because $X_2 = X_1^2$. Since we cannot interpret isolated effects descriptively, inductive statements about them are not useful either: Look at the t-tests in the output. Both are above the 5% level, although the global test of association gives us a p-value below 5%. The null hypothesis of such a t-test is ""the effect of the predictor is zero"" or, in other words, ""the inclusion of this predictor does not increase the true R-squared in the population"". Because $X_1$ and $X_2$ are almost perfectly correlated, the model has almost the same R-squared if we drop one of the two variables:

+ +
summary(lm(Y~X1))
+
+# Gives
+
+Coefficients:
+            Estimate Std. Error t value Pr(>|t|)    
+(Intercept)  -0.7033     0.2148  -3.274  0.00179 ** 
+X1            2.5232     0.2151  11.733  < 2e-16 ***
+
+Residual standard error: 1.025 on 58 degrees of freedom
+Multiple R-squared:  0.7036,    Adjusted R-squared:  0.6985 
+F-statistic: 137.7 on 1 and 58 DF,  p-value: < 2.2e-16
+
+ +

This already illustrates the first part of the statement:

+ +

One other thing to keep in mind is that the tests on the individual coefficients each assume that all of the other predictors are in the model. In other words each predictor is not significant as long as all of the other predictors are in the model. There must be some interaction or interdependence between two or more of your predictors.

+ +

The last statement here is plainly wrong.

+",2013-10-27 07:54:54.227 +58295,503.0,2,,58293.0,,,,CC BY-SA 3.0,"

Another problem, in addition to those @Michael gave, is that when there is strong near-colinearity, small changes in the input data can lead to large changes in the output.

+ +

I made up some data (taking wild guesses at the average lengths of legs and torso (in inches) and weight (in pounds) for adult humans).

+ +
set.seed(1230101)
+lengthleg <- rnorm(100, 30, 5)
+lengthtorso <- lengthleg + rnorm(100, 0, 1)
+weight <- 1.2*lengthleg + 1.8*lengthtorso + rnorm(100, 0, 10)
+
+m1 <- lm(weight~lengthleg + lengthtorso)
+coef(m1)
+
+ +

The first time through, I got coefficients of -5.93, 0.43 and 2.73. Rerunning everything except the set.seed call gave me -9.91, 1.12 and 2.18.
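To see how unstable the individual coefficients are, here is a small follow-up sketch (my own addition, not part of the original answer) that repeats the simulation many times without fixing the seed:

# repeat the data generation and fit 200 times, collect the coefficients
coefs <- t(replicate(200, {
  lengthleg   <- rnorm(100, 30, 5)
  lengthtorso <- lengthleg + rnorm(100, 0, 1)
  weight      <- 1.2 * lengthleg + 1.8 * lengthtorso + rnorm(100, 0, 10)
  coef(lm(weight ~ lengthleg + lengthtorso))
}))
apply(coefs, 2, sd)   # large spread of the two slope estimates across reruns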

+",2013-10-27 11:57:25.867 +58296,2081.0,1,58301.0,,,Suppression effect in regression: definition and visual explanation/depiction,,CC BY-SA 3.0,"

What is a suppressor variable in multiple regression and what might be the ways to display the suppression effect visually (its mechanics or its evidence in results)? I'd like to invite everybody who has a thought to share.

+",2013-10-27 12:08:33.367 +58343,1406.0,2,,37819.0,,,,CC BY-SA 3.0,"

This is an expected result. The matrix $\Pi$ has full rank when the process is stationary. Of course, the Johansen procedure usually requires that the time series be checked for unit roots first. The null hypothesis is that the time series have unit roots and are cointegrated. If your variables are $I(0)$ then the first step should eliminate the need to use Johansen's test. I would hesitate to use Johansen's test for testing whether the processes are $I(0)$, since it was not designed to be used as such.

+",2013-10-28 08:29:21.737 +58299,20473.0,2,,58269.0,,,,CC BY-SA 3.0,"

Each child can be modeled as a Bernoulli r.v. $X_i$ with probability of having the disease equal to $p_i$, $X_i \sim B(p_i)$, $i=1,\dots ,n$. If you assume that a) $p_1 =p_2=\dots=p_n=p$ and b) that these are independent rv's then their joint density is

+ +

$$f(X_1,\dots,X_n) = \prod_{i=1}^{n}p^{x_i}(1-p)^{1-x_i}$$ and their log-likelihood function, viewed as a function of $p$, is

+ +

$$\ln L =\sum_{i=1}^{n}\left\{x_i\ln p+(1-x_i)\ln (1-p)\right\}$$

+ +

which leads to the MLE for $p$, $$\hat p =\frac 1n\sum_{i=1}^{n}x_i,$$ which is unbiased since $$E\hat p =\frac 1n\sum_{i=1}^{n}Ex_i = \frac 1n np =p$$

+ +

Consider now the variable $$U_i = X_i - E(X_i) = X_i -p \Rightarrow X_i = U_i + p$$ We have $$EU_i = 0,\qquad Var(U_i) = Var(X_i) = p(1-p)$$ so it is covariance-stationary.

+ +

Substitute for the $x$'s in the estimator

+ +

$$\hat p =\frac 1n\sum_{i=1}^{n}(u_i+p) = \frac 1n\sum_{i=1}^{n}u_i +p$$ and consider the quantity $$\sqrt n (\hat p-p) =\sqrt n\frac 1n\sum_{i=1}^{n}u_i= \frac {1}{\sqrt n}\sum_{i=1}^{n}u_i$$

+ +

Since the $U$'s are covariance stationary (and evidently i.i.d.), the CLT certainly applies and so

+ +

$$\sqrt n (\hat p-p) \rightarrow_d N\left (0, p(1-p)\right) $$

+ +

For approximate statistical inference, we manipulate this expression through $$ \sqrt n (\hat p-p) = Z \Rightarrow \hat p = \frac {1} {\sqrt n}Z +p$$

+ +

and write that, for ""large samples""

+ +

$$\hat p \sim_{approx} N\left (p, \frac {p(1-p)}{n}\right)$$

+ +

(but not when $n$ truly goes to infinity, since then $\hat p$ does not have a distribution but collapses to a constant, the true value $p$, because $\hat p$ is a consistent estimator).
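A small simulation sketch (my own addition, assuming R) of the approximation above, with $p = 0.3$ and $n = 200$:

set.seed(123)
p <- 0.3; n <- 200
phat <- replicate(10000, mean(rbinom(n, 1, p)))          # many realizations of the MLE
c(mean = mean(phat), var = var(phat), approx_var = p * (1 - p) / n)
hist(sqrt(n) * (phat - p), breaks = 50, freq = FALSE)    # close to N(0, p(1-p))
curve(dnorm(x, 0, sqrt(p * (1 - p))), add = TRUE)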

+",2013-10-27 14:34:09.020 +58300,23073.0,1,,,,Confidence Intervals Intuition,,CC BY-SA 3.0,"

I am new to statistics and have run into some trouble understanding computing confidence intervals and am seeking some help. I will outline the motivating example in my textbook and hopefully someone can offer some guidance.

+ +

Example

+ +

There is a population of mean values and your goal is to figure out the true mean (as best you can). In order to accomplish this, a number of samples are taken, each of which has a mean value.

+ +

Next, because we know by the central limit theorem that as the number of samples increase, the sampling distribution will be normally distributed, we use the equation $z = \frac{X - \bar{X}}{s}$ (noting that in this case s = standard error) to compute a lower and upper bound taking each sample mean as the mean for the z-score equation and z-scores of -1.96 and +1.96, for example, to compute a 95% confidence interval.

+ +

I’ve included a graph from my textbook in an attempt to add clarity.

+ +

+ +

So I do not understand how it is that you can use each sample mean as the mean value in the z equation to compute intervals. We know that the sampling distribution is normally distributed, so isn’t it the case that only the mean of all the samples can be used? How can we compute an interval around each mean value that contributes to the sampling distribution?

+ +

Any help with this would be much appreciated

+ +

Note: I'm reading ""Discovering Statistics Using IBM SPSS Statistics 3rd Edition"" by Andy Field and this example is from pg 43-45

+",2013-10-27 14:38:22.093 +58301,2081.0,2,,58296.0,,,,CC BY-SA 4.0,"

There exist a number of frequently mentioned regressional effects which conceptually are different but share much in common when seen purely statistically (see e.g. this paper "Equivalence of the Mediation, Confounding and Suppression Effect" by David MacKinnon et al., or Wikipedia articles):

+
  • Mediator: IV which conveys the effect (totally or partly) of another IV to the DV.
  • Confounder: IV which constitutes or precludes, totally or partly, the effect of another IV on the DV.
  • Moderator: IV which, varying, manages the strength of the effect of another IV on the DV. Statistically, it is known as interaction between the two IVs.
  • Suppressor: IV (a mediator or a moderator conceptually) whose inclusion strengthens the effect of another IV on the DV.
+

I'm not going to discuss to what extent some or all of them are technically similar (for that, read the paper linked above). My aim is to try to show graphically what a suppressor is. The above definition, that "a suppressor is a variable whose inclusion strengthens the effect of another IV on the DV", seems to me potentially broad because it does not tell anything about the mechanisms of such enhancement. Below I'm discussing one mechanism - the only one I consider to be suppression. If there are other mechanisms as well (as of right now, I haven't tried to think of any other), then either the above "broad" definition should be considered imprecise or my definition of suppression should be considered too narrow.

+

Definition (in my understanding)

+

Suppressor is the independent variable which, when added to the model, raises the observed R-square mostly due to its accounting for the residuals left by the model without it, and not due to its own association with the DV (which is comparatively weak). We know that the increase in R-square in response to adding an IV is the squared part correlation of that IV in that new model. This way, if the part correlation of the IV with the DV is greater (by absolute value) than the zero-order $r$ between them, that IV is a suppressor.
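A minimal numeric sketch of this definition in R (my own illustration, not the example data used later in this answer): build an $X_2$ that is tied to the prediction-irrelevant part of $X_1$ and compare its zero-order and part correlations.

set.seed(1)
n      <- 500
signal <- rnorm(n)
junk   <- rnorm(n)                  # the part of X1 that is irrelevant to Y
x1     <- signal + junk
x2     <- junk + 0.3 * rnorm(n)     # candidate suppressor: tied to junk, not to Y
y      <- signal + rnorm(n)

r_zero  <- cor(y, x2)                                 # close to 0
r2_full <- summary(lm(y ~ x1 + x2))$r.squared
r2_red  <- summary(lm(y ~ x1))$r.squared
r_part  <- sqrt(r2_full - r2_red)                     # |part correlation| of X2
c(zero_order = r_zero, part = r_part)                 # part exceeds zero-order: suppression
coef(lm(y ~ x1))["x1"]; coef(lm(y ~ x1 + x2))["x1"]   # and X1's coefficient grows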

+

So, a suppressor mostly "suppresses" the error of the reduced model, being weak as a predictor itself. The error term is the complement to the prediction. The prediction is "projected on" or "shared between" the IVs (regression coefficients), and so is the error term ("complements" to the coefficients). The suppressor suppresses such error components unevenly: greater for some IVs, lesser for other IVs. For those IVs "whose" such components it suppresses greatly it lends considerable facilitating aid by actually raising their regression coefficients.

+

Suppression effects that are not strong occur often and widely (an example on this site). Strong suppression is typically introduced consciously. A researcher seeks a characteristic which correlates with the DV as weakly as possible and at the same time correlates with something in the IV of interest which is considered irrelevant, prediction-void, with respect to the DV. He enters it into the model and gets a considerable increase in that IV's predictive power. The suppressor's coefficient is typically not interpreted.

+

I could summarize my definition as follows [following up on @Jake's answer and @gung's comments]:

+
  • Formal (statistical) definition: a suppressor is an IV with part correlation larger than its zero-order correlation (with the dependent).
  • Conceptual (practical) definition: the above formal definition plus the condition that the zero-order correlation is small, so that the suppressor is not a sound predictor itself.
+

"Suppessor" is a role of a IV in a specific model only, not the characteristic of the separate variable. When other IVs are added or removed, the suppressor can suddenly stop suppressing or resume suppressing or change the focus of its suppressing activity.

+

Normal regression situation

+

The first picture below shows a typical regression with two predictors (we'll speak of linear regression). The picture is copied from here where it is explained in more detail. In short, moderately correlated (= having an acute angle between them) predictors $X_1$ and $X_2$ span the 2-dimensional space "plane X". The dependent variable $Y$ is projected onto it orthogonally, leaving the predicted variable $Y'$ and the residuals with st. deviation equal to the length of $e$. The R-square of the regression is the angle between $Y$ and $Y'$, and the two regression coefficients are directly related to the skew coordinates $b_1$ and $b_2$, respectively. This situation I've called normal or typical because both $X_1$ and $X_2$ correlate with $Y$ (an oblique angle exists between each of the independents and the dependent) and the predictors compete for the prediction because they are correlated.

+

+

Suppression situation

+

It is shown in the next picture. This one is like the previous; however, the $Y$ vector now points somewhat away from the viewer and $X_2$ has changed its direction considerably. $X_2$ acts as a suppressor. Note first of all that it hardly correlates with $Y$. Hence it cannot be a valuable predictor itself. Second, imagine $X_2$ is absent and you predict only by $X_1$; the prediction of this one-variable regression is depicted as the red vector $Y^*$, the error as the vector $e^*$, and the coefficient is given by the coordinate $b^*$ (which is the endpoint of $Y^*$).

+

+

Now bring yourself back to the full model and notice that $X_2$ is fairly correlated with $e^*$. Thus, $X_2$, when introduced in the model, can explain a considerable portion of that error of the reduced model, cutting down $e^*$ to $e$. This constellation, (1) $X_2$ is not a rival to $X_1$ as a predictor, and (2) $X_2$ is a dustman picking up the unpredictedness left by $X_1$, makes $X_2$ a suppressor. As a result of its effect, the predictive strength of $X_1$ has grown to some extent: $b_1$ is larger than $b^*$.

+

Well, why is $X_2$ called a suppressor to $X_1$ and how can it reinforce it when "suppressing" it? Look at the next picture.

+

+

It is exactly the same as the previous. Think again of the model with the single predictor $X_1$. This predictor could of course be decomposed into two parts or components (shown in grey): the part which is "responsible" for the prediction of $Y$ (and thus coinciding with that vector) and the part which is "responsible" for the unpredictedness (and thus parallel to $e^*$). It is this second part of $X_1$ - the part irrelevant to $Y$ - that is suppressed by $X_2$ when that suppressor is added to the model. The irrelevant part is suppressed and thus, given that the suppressor doesn't itself predict $Y$ to any great extent, the relevant part looks stronger. A suppressor is not a predictor but rather a facilitator for another predictor or other predictors, because it competes with what impedes them from predicting.

+

Sign of the suppressor's regression coefficient

+

It is the sign of the correlation between the suppressor and the error variable $e^*$ left by the reduced (without-the-suppressor) model. In the depiction above, it is positive. In other settings (for example, reverse the direction of $X_2$) it could be negative.

+

Suppression example

+

Example data:

+
         y         x1         x2
+
+1.64454000  .35118800 1.06384500
+1.78520400  .20000000 -1.2031500
+-1.3635700 -.96106900 -.46651400
+ .31454900  .80000000 1.17505400
+ .31795500  .85859700 -.10061200
+ .97009700 1.00000000 1.43890400
+ .66438800  .29267000 1.20404800
+-.87025200 -1.8901800 -.99385700
+1.96219200 -.27535200 -.58754000
+1.03638100 -.24644800 -.11083400
+ .00741500 1.44742200 -.06923400
+1.63435300  .46709500  .96537000
+ .21981300  .34809500  .55326800
+-.28577400  .16670800  .35862100
+1.49875800 -1.1375700 -2.8797100
+1.67153800  .39603400 -.81070800
+1.46203600 1.40152200 -.05767700
+-.56326600 -.74452200  .90471600
+ .29787400 -.92970900  .56189800
+-1.5489800 -.83829500 -1.2610800
+
+

Linear regression results:

+

+

Observe that $X_2$ served as a suppressor. Its zero-order correlation with $Y$ is practically zero but its part correlation is much larger by magnitude, $-.224$. It strengthened to some extent the predictive force of $X_1$ (from r $.419$, a would-be beta in simple regression with it, to beta $.538$ in the multiple regression).

+

According to the formal definition, $X_1$ appeared to be a suppressor too, because its part correlation is greater than its zero-order correlation. But that is because we have only two IVs in this simple example. Conceptually, $X_1$ isn't a suppressor because its $r$ with $Y$ is not about $0$.

+

By the way, the sum of squared part correlations exceeded R-square: $.4750^2+(-.2241)^2 = .2758 > .2256$, which would not occur in a normal regressional situation (see the Venn diagram below).

+

Suppression and coefficient's sign change

+

Adding a variable that will serve as a suppressor may or may not change the sign of some other variables' coefficients. "Suppression" and "change of sign" effects are not the same thing. Moreover, I believe that a suppressor can never change the sign of those predictors for which it serves as a suppressor. (It would be a shocking discovery to add the suppressor on purpose to facilitate a variable and then to find it having become indeed stronger, but in the opposite direction! I'd be thankful if somebody could show me it is possible.)

+

Suppression and coefficient strengthening

+

To cite an earlier passage: "For those IVs "whose" such components [error components] it suppresses greatly the suppressor lends considerable facilitating aid by actually raising their regression coefficients". Indeed, in our example above, $X_2$, the suppressor, raised the coefficient for $X_1$. Such enhancement of the unique predictive power of another regressor is often the aim of adding a suppressor to a model, but it is not the definition of a suppressor or of the suppression effect. For, the aforementioned enhancement of another predictor's capacity via adding more regressors can easily occur in a normal regressional situation without those regressors being suppressors. Here is an example.

+
   y       x1       x2       x3
+
+   1        1        1        1
+   3        2        2        6
+   2        3        3        5
+   3        2        4        2
+   4        3        5        9
+   3        4        4        2
+   2        5        3        3
+   3        6        4        4
+   4        7        5        5
+   5        6        6        6
+   4        5        7        5
+   3        4        5        5
+   4        5        3        5
+   5        6        4        6
+   6        7        5        4
+   5        8        6        6
+   4        2        7        7
+   5        3        8        8
+   6        4        9        4
+   5        5        3        3
+   4        6        4        2
+   3        2        1        1
+   4        3        5        4
+   5        4        6        5
+   6        9        5        4
+   5        8        3        3
+   3        5        5        2
+   2        6        6        1
+   3        7        7        5
+   5        8        8        8
+
+

Regressions results without and with $X_3$:

+

+

Inclusion of $X_3$ in the model raised the beta of $X_1$ from $.381$ to $.399$ (and its corresponding partial correlation with $Y$ from $.420$ to $.451$). Still, we find no suppressor in the model. $X_3$'s part correlation ($.229$) is not greater than its zero-order correlation ($.427$). The same is true for the other regressors. The "facilitation" effect was there, but not due to a "suppression" effect. The definition of a suppressor is different from mere strengthening/facilitation; it is about picking up mostly errors, due to which the part correlation exceeds the zero-order one.

+

Suppression and Venn diagram

+

Normal regressional situation is often explained with the help of Venn diagram.

+

+

A+B+C+D = 1, all of the $Y$ variability. The B+C+D area is the variability accounted for by the two IVs ($X_1$ and $X_2$), the R-square; the remaining area A is the error variability. B+C = $r_{YX_1}^2$; D+C = $r_{YX_2}^2$, the Pearson zero-order correlations. B and D are the squared part (semipartial) correlations: B = $r_{Y(X_1.X_2)}^2$; D = $r_{Y(X_2.X_1)}^2$. B/(A+B) = $r_{YX_1.X_2}^2$ and D/(A+D) = $r_{YX_2.X_1}^2$ are the squared partial correlations, which have the same basic meaning as the standardized regression coefficients, the betas.

+

According to the above definition (which I stick to), a suppressor is the IV with part correlation greater than its zero-order correlation, so $X_2$ is the suppressor if area D > area D+C. That cannot be displayed on a Venn diagram. (It would imply that C from the view of $X_2$ is not "here" and is not the same entity as C from the view of $X_1$. One would perhaps have to invent something like a multilayered Venn diagram to show it.)

+
+

P.S. Upon finishing my answer I found this answer (by @gung) with a nice simple (schematic) diagram, which seems to be in agreement with what I showed above by vectors.

+",2013-10-27 16:31:46.460 +58302,23075.0,1,,,,Would the group means of PC scores differ from the PC scores of group means?,,CC BY-SA 3.0,"

I have $2$ $n\times p$ matrices, where $n$ are the rows (samples), and $p$ the columns (measurements). Each matrix has samples and measurements from different groups. I call these the ""raw"" data. I've conducted a principal components analysis of the complete raw data, and computed the mean of each PC score by group. The latter I call the means of the PC scores by group.

+ +

My question is whether the means of the PC scores by group (raw-data $\rightarrow$ PCA $\rightarrow$ mean PCs by group) would differ from the PC scores derived from a PCA conducted on the ""raw"" group means (raw data $\rightarrow$ mean by group $\rightarrow$ PCA)?

+ +
+ +

Example analysis of simulated data

+ +
set.seed(123) 
+a <- matrix(rnorm(900),ncol=3,byrow=F) 
+a[1:100,] <- 4 + a[1:100,] 
+a[101:200,] <- -4 + a[101:200,]
+# compute PCA and extract PC scores
+pc <- prcomp(a)$x 
+plot(pc[,1:2],col=rep(c(""red"",""blue"",""green""),each=100))
+# compute PC means by group and plot
+m <- rbind(colMeans(pc[1:100,1:2]),colMeans(pc[101:200,1:2]),colMeans(pc[201:300,1:2]))
+points(m,col=""black"", pch=19,cex=1)
+# compute means of raw data by group
+b <- rbind(colMeans(a[1:100,]),colMeans(a[101:200,]),colMeans(a[201:300,]))
+# conduct PCA on ""raw means"" and plot 
+pc2 <- prcomp(b)$x
+points(pc2[,1:2],col=""black"", pch=17,cex=1)
+",2013-10-27 17:00:57.470 +58303,,1,,,user31966,Probability questions for statistics,,CC BY-SA 3.0,"

A set of final examination grades in a course is normally distributed with a mean of 73 and a standard deviation of 8.

+ +
  1. What is the probability of getting a grade below 91 on the exam?
  2. What is the probability that a student scored between 65 and 89?
  3. If the professor grades on a curve (gives A’s to the top 10% of the class, regardless of the score), are you better off with a grade of 81 on this exam or a grade of 68 on a different exam, where the mean is 62 and the standard deviation is 3? Explain why.
+",2013-10-27 18:05:36.960 +58304,22637.0,2,,58303.0,,,,CC BY-SA 3.0,"

What you need to do is standardize those grades so you can use the standard Normal Distribution which is extensively tabulated. Try that first and should you have any problems let us know.
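For example, a minimal sketch in R of that standardization for part 1 (my own addition):

z <- (91 - 73) / 8    # standardized score
pnorm(z)              # same as pnorm(91, mean = 73, sd = 8), about 0.988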

+",2013-10-27 18:16:09.723 +58305,19750.0,1,236322.0,,,When is the differential entropy negative?,,CC BY-SA 4.0,"

The definition of entropy for a continuous signal is:

+

$$h[f] = \operatorname{E}[-\ln (f(X))] = -\int\limits_{-\infty}^{\infty} f(x) \ln (f(x))\, dx$$

+

According to Wikipedia, it can be negative. When would that happen? As far as I understand, $f(x)$ is always $\in[0,1]$, so $f(x)\cdot \ln(f(x))$ can only be negative. What am I missing?

+",2013-10-27 18:53:20.880 +58306,2806.0,1,,,,What is the most appropriate test for a multi-year different group/yr experiment?,,CC BY-SA 3.0,"

I've been working on a research project for close to five years now. For my thesis I have to show how ""well"" my approach improved things.

+ +

Setup: Every year we use a tool A to brainstorm and negotiate software requirements. The tool was wiki-based and had very low participation from technical and non-technical (client) stakeholders. (Yes we have real living and breathing clients for our class :). I looked at the state of affairs and saw that perhaps social networking based/influenced technologies could help increase participation. So I created tool B to replace tool A. However, tool A was used for the first 2 years and tool B for the latter 3 years.

+ +

Environment: The students changed every year but the overall composition of the class was relatively same (i.e., we had the similar amount of awesome, average and underperforming teams/students). We strive for selecting client projects which are similar level of difficulty and are doable in the duration of the class. They can all be considered projects of the same 'class' (class as in category - same level of complexity etc).

+ +

Here's my hypothesis: Tool B will increase stakeholder participation as compared to Tool A (it was initially in past tense, then changed to present and now to future. Not sure what's right. Keep getting corrections for tense from advisors.)

+ +

Here are my measurements:

+ +
  1. Client (non-technical) participation using Tool B vs Tool A - via access logs
  2. Team (student) participation using Tool B vs Tool A - via access logs + observational data
  3. Number of requirements captured/negotiated (new/updated) in Tool A vs Tool B
  4. Client surveys (for ascertaining the usefulness of tool A for capturing/negotiating requirements. Already have for Tool B.)
+ +

One of my advisors suggests I use MANCOVA with 1-3 above as DVs and covariates capturing ""sense of complexity of projects"", ""average number of use-cases per project"" and ""some metric for team composition/makeup"" (which are pretty much the same across the years). Another advisor thinks that simple t-tests would work just fine: That is, I compare the average performance of groups across the years (Group 1 = Tool A; Group 2 = Tool B) and it should be sufficient, or maybe an ANOVA at most. Another advisor says not to do anything since the data itself is highly skewed, i.e., using Tool B has really increased each of the above! He said that doing a statistical test would only serve to increase the perceived success of the tool and just makes a pompous show of rigor in the analysis.

+ +

I'm not really sure what would be a good approach here. I'm familiar with t-tests but have never done a MANCOVA and am afraid that I may just crunch the numbers and violate the underlying assumptions. What would be an appropriate test for such an experimental design, which is done across multiple years, with different groups, keeping the environment relatively constant? I have many such hypotheses w.r.t. tool B since there are many things it enables from a process standpoint that were not doable before. It's really confusing with 3 advisors giving different advice and me not being a statistician able to decide.

+",2013-10-27 19:04:49.590 +58450,503.0,2,,58444.0,,,,CC BY-SA 3.0,"

Given your further comment, I am not surprised at this result. BIC is a penalized log likelihood. It is useful for comparing models on one data set (here, each participant), but not for comparing across data sets.

+ +

What this result is telling you, in essence, is that the model fits very differently for different people, but that the amount of improvement in the fit by adding two parameters is about the same for each person.

+",2013-10-29 18:42:43.290 +58307,16046.0,1,,,,MCMC for an explicitly uncomputable prior?,,CC BY-SA 3.0,"

I am trying to sample from a posterior distribution and I only have an explicit formula for likelihood but I can sample from the prior distribution. How can I sample from the posterior distribution with such a restriction. Is there any specific method?

+ +

After seeing the answers I've decided to write out my exact question to clarify things: It's about learning the hyper-parameters $\alpha$ and $\beta$ and the parameters $\theta_i$ in the following case:

+ +

$\alpha$ and $\beta$ are uniformly chosen from the perimeter of the square with the following vertices: $(0,0),(0,1),(1,0),(1,1)$. Now $\theta_i$ is uniformly chosen from this line. $\theta_i$ itself is the parameter for the data $y_i\sim\text{Bin}(n_i,\theta_i)$.

+ +

In my first attempt, and maybe being foolish, I wrote a neat vectorized algorithm which would sample from $p(\alpha,\beta,\theta)$ where $\theta=(\theta_1,\theta_2,...)$. But afterwards I realised that it is hardly related to sampling from $p(y|\alpha,\beta,\theta)$, maybe as a result of the answers here.

+ +

So what I am doing now is that I have ignored the whole sampling algorithm I had for the joint priors. My plan to solve the problem is to make an MC random walk on the parameter space $(\alpha,\beta)$ and sub-sample from it at each step (according to the discussion on another question of mine), then sample from $p(\theta|\alpha,\beta)$, then calculate the likelihood, and then test the new sample according to Metropolis-Hastings! I am not even sure this is correct, but after my studies, this is what I can think of!

+",2013-10-27 20:00:57.170 +58308,16046.0,1,58319.0,,,MCMC on a bounded parameter space?,,CC BY-SA 3.0,"

I am trying to apply MCMC to a problem, but my priors (in my case $\alpha\in[0,1],\beta\in[0,1]$) are restricted to an area. Can I use normal MCMC and ignore the samples that fall outside of the restricted zone (which in my case is $[0,1]^2$), i.e. reuse the transition function when the new proposal falls outside the restricted (constrained) area?

+",2013-10-27 20:07:02.930 +58309,9554.0,2,,58306.0,,,,CC BY-SA 3.0,"

Guess what, all 3 of them might have a point. The issues with ""A causes B"" are tricky. :)

+ +

But first things first. If your hypothesis is:

+ +
+

Tool B will increase stakeholder participation as compared to Tool A.

+
+ +

Stick with it. That's probably the most straightforward and honest thing to measure (honest, in my opinion, since you don't use any covariates such as ""sense of complexity of projects"" that are unreliable in terms of how well can you measure that etc.). As you correctly pointed out, and so did Prof. 2, a two sample t-test is the right tool to measure mean differences between two groups.

+ +

However, you then seem to want to analyze the other four things you are measuring, of which three weren't measured for both tools according to your description: 1:B, 2:B, 3:{A,B}, 4:A. Statistically, I don't see how you want to determine differences between groups for anything other than measurement 3. Which leaves you with a t-test again.

+ +

Your main problem is that a t-test will allow you to say that there is a significant difference. You always need a controlled experiment to claim causality, which in your case is tricky, as you didn't deploy both tools simultaneously using a control group, but rather you first tested one, then the other. The obvious problem with this approach is that in the meantime the usage pattern might have changed due to a number of factors as diverse as better smartphones or more savvy users; you don't know.

+ +
    +
  • But I would still claim that what you have is more than observational data.
  • I would roll with a t-test for the things measured for both tools.
  • I would avoid metrics such as ""perceived complexity of XY"".
  • If the densities of the ""Number of requirements"" of A and B are visibly different, plot them.
+ +

You can also run one of the plethora of tests for testing whether your two empirical distributions are the same. (I agree with Prof. 3 that if you can clearly see a different distribution shape, centered around a different value, formal tests are a bit of showmanship, but he is a professor and can get away with saying that; you probably won't.)

+ +

Best of luck!

+",2013-10-27 20:09:36.460 +58310,23078.0,1,,,,Does JMP aggregate data in graph-building?,,CC BY-SA 3.0,"

I am using JMP to find a relationship between drought index values and yearly corn yields for a 30 year period. I have drought data for seven different indices and each index ranges from -6 (severe drought) to +6 (extremely high precipitation).

+ +

The drought data contains monthly drought index values for March-September for every year from 1981 to 2011. The corn yield data contains one number (bushels/acre) for every year from 1981 to 2011. I built two side-by-side graphs in JMP, with one showing corn yield data over the 30 year period and the other showing drought data from one particular index over the same time period. Both are smooth curves and visually line up and seem to have a relationship.

+ +

My problem is that I have no idea how JMP is processing the drought data. There are eight separate drought index values per year, every year, for each drought index, and only one value per year for the corn yield data. I assumed that JMP was smoothing out the drought curve by taking an average of the eight index values for every year, then using that number in the graph construction but I haven't been able to find any literature that can tell me this for certain.

+",2013-10-27 20:16:54.170 +58311,9554.0,2,,58305.0,,,,CC BY-SA 3.0,"

You are just confusing $f(x)$ and $F(X)$. The density function $f(x)$ can be greater than 1. It just integrates to 1. It is $F(X) \in [0, 1]$.
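A concrete case (my own added example): if $X \sim U(0, \tfrac12)$ then $f(x) = 2$ on $(0,\tfrac12)$, and $$h(X) = -\int_0^{1/2} 2 \ln 2 \, dx = -\ln 2 < 0,$$ so a density that exceeds 1 on its support immediately produces a negative differential entropy.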

+ +

Best

+",2013-10-27 20:30:28.140 +58313,503.0,4,,,,,,CC BY-SA 3.0,ggplot is an R package for creating graphics. It was developed by Hadley Wickham and based on the Grammar of Graphics by Leland Wilkinson.,2013-10-27 21:12:43.780 +58312,503.0,5,,,,,,CC BY-SA 3.0,,2013-10-27 21:12:43.780 +58314,19750.0,1,58315.0,,,Why does entropy increase with dispersion for continuous but not for discrete distributions?,,CC BY-SA 3.0,"

For a pdf $f(x)$ (i.e. continuous distribution), Entropy (differential entropy) is defined as:

+ +

$H_C(X) = -\int_\mathbb{X} f(x)\log f(x)\,dx.$

+ +

For a discrete distribution with p.m.f $F(x)$, Entropy is defined as:

+ +

$H_D(X) = -\sum_{i=1}^n {F(x_i) \log F(x_i)}.$

+ +

The definitions look analogous to each other. However, entropy increases with dispersion for continuous but not for discrete distributions. Why?

+",2013-10-27 21:15:30.070 +58324,855.0,2,,58310.0,,,,CC BY-SA 3.0,"

Just to make sure I understand you, I think you're plotting the index value by the year value, and there are several index values per year value. And you're in Graph Builder using the Smoother element. Here's a quick mock-up:

+ +

+ +

Actually in my mock-up, I also have the Points element turned on to emphasize the multiple Y values by X.

+ +

From the JMP 11 doc:

+ +
+

The smoother is a cubic spline with a default lambda of 0.05 and + standardized X values. You can change the value of lambda using the + slider. You can obtain the same spline in the Bivariate platform...

+
+ +

Cubic splines are technically only defined for data sets with unique X values. In case of duplicate Xs, JMP first takes the weighted mean of the corresponding Y values. Use the Freq drop zone in Graph Builder if you want to control the weighting, otherwise each Y is weighted equally.

+",2013-10-27 23:50:31.847 +58315,633.0,2,,58314.0,,,,CC BY-SA 3.0,"

It seems you might be asking why spreading out discrete data has no effect on entropy. Because entropy is a measure of expected surprise, the various labels or values that a thing can take are immaterial. So, the discrete values $x_i$ don't matter, merely their masses, and spreading the $x_i$s has no effect.

+ +

In the continuous case, spreading things out by scaling inevitably reduces the densities, which affects the entropy as defined in your question. The definition is consistent with our intuition of entropy, Shannon explains, because we typically compare two entropies, and since both are scaled, this effect cancels out. Differential entropy is also consistent with discrete entropy in the sense that it approximates what would happen if the entropy of the quantized distribution were measured.

+ +

Note that in the continuous case, spreading things out by other methods can leave entropy unchanged. For example, a uniform distribution over $[-\frac12,\frac12]$ has entropy zero. ""Spreading it out"" so that it is uniform over $[-10, -9.5], [9.5,10]$ still has entropy zero. Spreading never matters; only the expected surprisal does.

+ +
+

There is one important difference between the continuous and discrete entropies. In the discrete case the entropy measures in an absolute way the randomness of the chance variable. In the continuous case the measurement is relative to the coordinate system [and] the entropy can be considered a measure of randomness relative to an assumed standard, namely the coordinate system chosen with each small volume element $dx_1, …, dx_n$ given equal weight. When we change the coordinate system to $y_1, …, y_n$, the entropy in the new system measures the randomness when equal volume elements $dx_1, …, dx_n$ in the new system are given equal weight.

+ +

In spite of this dependence on the coordinate system the entropy concept is as important in the continuous case as the discrete case. This is due to the fact that the derived concepts of information rate and channel capacity depend on the difference of two entropies and this difference does not depend on the coordinate frame, each of the two terms being changed by the same amount.

+ +

The entropy of a continuous distribution can be negative. The scale of measurements sets an arbitrary zero corresponding to a uniform distribution over a unit volume. A distribution which is more confined than this has less entropy and will be negative. The rates and capacities will, however, always be nonnegative. — Shannon 1948

+
+",2013-10-27 21:32:09.783 +58316,9554.0,2,,58290.0,,,,CC BY-SA 3.0,"

Since you are using the ROC, I presume that you are running 5 classifiers. Frank is right about the ROC: that's not the way people compare models. For linear and generalized linear models you can apply the likelihood ratio test.

+ +

However, in case you are after the best prediction performance, and particularly in case you are not using a parametric model, but say a random forest classifier, I would do the following:

+ +
  • generate data
  • split it randomly into a training and testing set
  • train all your 5 models and test their performance
  • repeat the entire procedure as many times as the run time permits and store all 5 ROC curves (I would pick 1000, or 10000 as a minimum, depending on the convergence of the mean predictions)
  • report the means of the 5 ROC curves together with a 90% pointwise confidence interval around them (a minimal sketch of this loop is given below)
+ +
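A minimal sketch of that loop (my own illustration, shown for a single logistic-regression classifier; AUC is computed with the rank/Mann-Whitney formula to avoid extra packages):

auc <- function(score, y) {
  r  <- rank(score)
  n1 <- sum(y == 1); n0 <- sum(y == 0)
  (sum(r[y == 1]) - n1 * (n1 + 1) / 2) / (n1 * n0)
}

set.seed(42)
n <- 400
x <- matrix(rnorm(n * 3), ncol = 3)
y <- rbinom(n, 1, plogis(x %*% c(1, -0.5, 0.25)))

aucs <- replicate(1000, {
  idx   <- sample(n, 0.7 * n)                    # random 70/30 split
  fit   <- glm(y[idx] ~ x[idx, ], family = binomial)
  score <- cbind(1, x[-idx, ]) %*% coef(fit)     # linear predictor on the test set
  auc(score, y[-idx])
})
c(mean = mean(aucs), quantile(aucs, c(0.05, 0.95)))  # mean AUC and a 90% interval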

The idea is of course that you pick a model that seems like the best combination of high AUC and low variance (narrow intervals around the mean) of the estimates.

+ +

Best

+",2013-10-27 21:32:43.350 +58317,7007.0,2,,58307.0,,,,CC BY-SA 3.0,"

Can you sample from the conditional distribution of $X\mid\Theta$? If you can, try using ABC to sample (approximately) from the posterior. The ABC rejection algorithm does not use the value of the prior density at each candidate point.
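A minimal ABC-rejection sketch in R (my own illustration, with a made-up toy model): only sampling from the prior and simulating data given $\theta$ are needed; the prior density is never evaluated.

set.seed(1)
y_obs <- rbinom(30, size = 10, prob = 0.3)      # toy observed data
sample_prior <- function() runif(1)             # we can only sample the prior
simulate     <- function(th) rbinom(30, size = 10, prob = th)

keep <- numeric(0)
for (i in 1:1e5) {
  theta <- sample_prior()                       # candidate from the prior
  y_sim <- simulate(theta)                      # data simulated under the candidate
  if (abs(mean(y_sim) - mean(y_obs)) < 0.2)     # accept if summary statistics are close
    keep <- c(keep, theta)
}
length(keep); mean(keep)                        # approximate posterior sample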

+",2013-10-27 21:43:48.200 +58318,22843.0,1,58321.0,,,Using expectation to detect bias,,CC BY-SA 3.0,"

I was going through Penn State's online notes and noticed this expression:

+ +

$ v^2 = \frac{1}{n} \sum_{i=1}^n (y_i - \bar{y})^2$

+ +

In the line below it they stated that the $E[v^2] = (1 - \frac{1}{n})\sigma^2$. I was wondering how would you get that?

+ +

Would it be wrong for me to say that, since $\sum_{i=1}^n (y_i - \bar{y})^2 = (n-1)s^2$, then $E[v^2] = E[\frac{(n-1)s^2}{n}] = (1 - \frac{1}{n})\sigma^2$? But this line of reasoning forces me to assume that $E[s^2] = \sigma^2$ and I don't even know why that's true.

+",2013-10-27 22:41:43.817 +58319,5448.0,2,,58308.0,,,,CC BY-SA 4.0,"

You have several nice, more-or-less simple, options. Your uniform prior helps make them simpler.

+

Option 1: Independence sampler. You can just set your proposal distribution equal to a uniform distribution over the unit square, which ensures that samples won't fall outside the restricted zone, as you call it. Potential downside: if the posterior is concentrated in a very small region of the unit square, you may have a very low acceptance rate. OTOH, it's hard to generate random numbers faster than from a U(0,1) distribution. Potential upside: less work for you.

+

Option 2: Transform your parameters to something that isn't bounded, make proposals for the transformed parameters, then transform the parameters back for use in the likelihood functions. Note that in this case the prior is going to be on the transformed parameters, because that's what you're making proposals for, so you'll have to mess with the Jacobian of the transform to get the new prior. For your analysis, of course, you'll transform the MCMC-generated parameter random numbers back to the original parameters. Potential downside: more initial work for you. Potential upside: better acceptance rate for your proposals.

+

Option 3: Construct a proposal distribution other than an independence sampler that is on the unit square. This allows you to keep your uniform prior, but at the cost of greater complexity when calculating the proposal probabilities. An example of this, letting $x$ be the current value of one of your parameters, would be a Beta distribution with parameters $(nx, n(1-x))$. The larger $n$ is, the more concentrated your proposal will be around the current value. Potential downside: more initial work for you. Potential upside: better acceptance rate for your proposals - but if you make $n$ too large, and move near to a corner, you might wind up making lots of small moves in the corner before getting out.
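A minimal sketch of this option in R (my own illustration, with a made-up Beta(3, 7) stand-in for the posterior): a Metropolis-Hastings step on $(0,1)$ with the Beta proposal never leaves the unit interval, but the Hastings correction is needed because the proposal is not symmetric.

log_target <- function(x) dbeta(x, 3, 7, log = TRUE)     # replace with your log-posterior

n_iter <- 5000; conc <- 50
x <- numeric(n_iter); x[1] <- 0.5
for (t in 2:n_iter) {
  cur  <- x[t - 1]
  prop <- rbeta(1, conc * cur, conc * (1 - cur))
  log_acc <- log_target(prop) - log_target(cur) +
    dbeta(cur,  conc * prop, conc * (1 - prop), log = TRUE) -   # q(cur | prop)
    dbeta(prop, conc * cur,  conc * (1 - cur),  log = TRUE)     # q(prop | cur)
  x[t] <- if (log(runif(1)) < log_acc) prop else cur
}
mean(x)   # should be near 3 / (3 + 7) = 0.3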

+

Option 4: Just reject any proposals that fall outside the unit square (Xian's half-hearted suggestion). Note that this is not the same as just generating another proposal; in this case you are rejecting the proposal, which means your next value for the parameter is the same as the current value for the parameter. This works because it's what would happen if you had a zero prior probability for some region of your parameter space and generated a random number that fell in that region. Potential downside: if you get near a corner, you may have a low acceptance probability and get stuck for a while. Potential upside: less work for you.

+

Option 5: Create an extended problem on the plane which, on the unit square, is the same as the actual problem you face, do everything right, then, when post-processing the results of the MCMC sampling, throw out all the samples outside of the unit square. Potential upside: If it's very easy to create that extended problem, it may be less work for you. Potential downside: if the Markov chain wanders off somewhere outside the unit square for a while, you may have, in effect, horrible acceptance probabilities, as you will throw out most of your samples.

+

No doubt there are other options, I'd be interested to see what other people suggest!

+

The difference between 2 and 3 is to some extent conceptual, although with real implications for what you actually do. I'd probably go with 3, as I'd just let R tell me what the proposal probabilities are (if I'm programming in R) and the amount of extra effort, aside from some tuning of the proposal distribution parameter $n$, looks small to me. If I was using JAGS or BUGS, of course, that would be a whole different matter, since those tools handle their own proposals.

+",2013-10-27 23:05:50.693 +58320,3894.0,1,,,,Computing non-central moments and normalizer of a quartic exponential distribution,,CC BY-SA 3.0,"

Consider a random variable $X$ which has quartic exponential distribution: $$X \sim P(x)=\frac{1}{Z}e^{ax + bx^2 + cx^3 + dx^4}$$ How can one compute $Z$ or non-central moments $E X^k$ given that they exist? As far as I understand, there are no closed-form formulas for these quantities, but is there a good numerical procedure for estimating them?

+ +

Since I'm very far from being any kind of expert in numerical integration, I'm open to any suggestions that will get the job done with reasonable precision.

+",2013-10-27 23:19:58.153 +58321,9554.0,2,,58318.0,,,,CC BY-SA 3.0,"

OK, though I'm not sure what the course is, or what your $s^2$ is exactly, the course notes seem to use $v^2$ to denote the sample variance.

+ +

The proof (see the bit on sample variance) of that line is not hard, but I guess ""hard"" is relative. I remember a lot of people struggling with this one in undergrad. The key is not to forget, after writing out the second power of the term in brackets and taking the expected value operator inside the sum, that you have to use $E[X^2] = \operatorname{Var}[X] + E[X]^2$ to substitute for all the terms $E[y_i^2]$; the rest is re-arranging the sums.
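For completeness, a compact version of that calculation (my own addition, using the identity just mentioned): since $\sum_i (y_i - \bar{y})^2 = \sum_i y_i^2 - n\bar{y}^2$, we get $E\big[\sum_i (y_i - \bar{y})^2\big] = n(\sigma^2 + \mu^2) - n(\frac{\sigma^2}{n} + \mu^2) = (n-1)\sigma^2$. Dividing by $n$ gives $E[v^2] = (1-\frac{1}{n})\sigma^2$, and dividing by $n-1$ instead gives $E[s^2] = \sigma^2$, which answers the second part of the question.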

+",2013-10-27 23:27:18.270 +58322,23081.0,1,,,,How to interpret the results of a t-test?,,CC BY-SA 3.0,"

I recently performed an experiment observing ants. For five days I measured the amount of food they ate, 25 ants in the fruits group and 25 ants in the vegetables group. Each ant was kept isolated in its own container.

+ +

My hypothesis was that the ant (3 mg in body weight) would be able to eat 20 times more food than what it weighed; I came to that assumption by relating it to the amount of weight they can lift.

+ +

The average for the fruits was 357 mg and the average for the vegetables was 358 mg. Keep in mind that I didn't take into account all the weight lost from the food by loss of water evaporating; I just assumed it was all eaten by the ants.

+ +

For my results I get a t-value of -0.0981 ; df=223 ; standard error of difference=0.015 ; p-value of 0.9220.

+ +

Since this is my first experiment dealing with a t-test, I don't know how to interpret and present the results. Can someone please explain what they mean?

+",2013-10-27 23:36:55.623 +58323,22637.0,2,,58322.0,,,,CC BY-SA 3.0,"

The p-value indicates that the null hypothesis cannot be rejected. It seems that the ant is indeed able to eat 20 times more food than its weight. While I can only speak to the statistical significance of the test, I would advise you to incorporate as many parameters as possible in your research, including water evaporation.

+",2013-10-27 23:47:12.423 +58340,10450.0,2,,58337.0,,,,CC BY-SA 3.0,"

1) The error, $err_{m}$, in step 4 captures the relative ratio of weighted prediction errors for each iterated pass, $m$, of the sequential learning algorithm ($0 < err_{m} < 1$). The errors are equivalent in both equations. You can see the weight distribution $D_{t}(i) = w_{i,t} / {\sum_{i=1}^m w_{i,t}}$. Only the chosen variable names are reversed ($m=t$, the iteration step, in one case, and $m=N$, the number of observations, in the other).

+ +

$\text{err}_m = \frac{\sum_{i=1}^N w_{i,m} \mathbb{I}(\hat{y} \neq \phi(\mathbf{x}_i))}{\sum_{i=1}^N w_{i,m}} = \epsilon_{t} = \sum_{i=1}^{m} D_{t}(i)I(y_i \ne h_{t}(x_{i}))$
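To make the correspondence concrete, a tiny numeric sketch in R (my own addition, using the Wikipedia-style formulation with labels in $\{-1,+1\}$; with a normalized weight distribution the denominator in Murphy's error is simply 1):

N <- 10
y <- c(-1, -1, 1, 1,  1, -1, 1, -1, 1, 1)   # true labels
h <- c(-1,  1, 1, 1, -1, -1, 1, -1, 1, 1)   # current weak learner errs on cases 2 and 5
D <- rep(1 / N, N)                          # current weight distribution D_t(i)

eps   <- sum(D * (h != y))                  # epsilon_t, i.e. the weighted error err_m (= 0.2 here)
alpha <- 0.5 * log((1 - eps) / eps)         # weak learner coefficient
D_new <- D * exp(-alpha * y * h)            # Freund-Schapire weight update
D_new <- D_new / sum(D_new)                 # renormalize; misclassified points get upweighted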

+ +

2) While the stopping rule is not shown in Kevin's algorithm, one of the theoretical requirements of the AdaBoost learner is that the weak learners perform slightly better than chance. He does mention it in the text.

+ +

3) The unclosed parenthesis does appear to be a typo.

+ +

4) see 2)

+",2013-10-28 06:22:23.923 +58325,15430.0,1,,,,Hypothesis testing with exponential family,,CC BY-SA 3.0,"

I'm interested in running hypothesis tests for a variety of members of the exponential family with continuous support, for different values of the parameter/s, for a sample of n i.i.d random variables (distributed according to a particular member and parametrization of the exponential family).

+ +

In general, it seems the sufficient statistic (at least for a single parameter) is a sum of the variables in the sample. I know this can be obtained at least theoretically from n-fold convolution, but in general, there does not seem to be a nice form for the distribution of this sum, which I will need to compute p-values.

+ +

So I'm wondering which distributions have known distributions for the sum of $n$ i.i.d. variables, and if there is a good reference for doing hypothesis testing with the exponential family.

+ +

I know that:

+ +
    +
  • sums of iid exponentials are distributed Erlang
  • sums of iid normals are distributed normal
  • sums of iid gamma are distributed gamma (but what about the product of gammas, which is the sufficient statistic for the shape parameter?)
+ +

So what about beta, pareto, log normal, and so on?

+ +

Is the difficulty of this the reason why no one does hypothesis tests anymore?

+",2013-10-27 23:57:23.037 +58326,15430.0,2,,58314.0,,,,CC BY-SA 3.0,"

In the continuous case, because they are continuous, spreading out the $x$ requires that you dampen the densities, and so this affects the entropy, since the density is tied in an explicit way to the values of $x$.

+ +

In the discrete case, the values of $x_i$ are more like indices, and there is no explicit connection between the density and the $x_i$ (unlike for continuous, where there is a function connecting them), so spreading out the $x_i$ doesn't affect the densities, and hence doesn't affect the entropy.

+ +

All that being said, it was my understanding that the entropy of a continuous distribution (with unbounded support) tends to diverge...

+",2013-10-28 00:04:20.103 +58327,8926.0,2,,58322.0,,,,CC BY-SA 3.0,"

A commonly used level of statistical significance at which the null hypothesis can be considered rejected is 5%, or p-value = 0.05, though this number differs depending on the problem or discipline; 0.1%, 1% or 10% are also used. In your case you can reject the null hypothesis only at the 92.2% significance level, or p-value = 0.922. Therefore, you fail to reject the null hypothesis for any meaningful significance level.

+ +

t-stat and p-value are inversely related, large t-stat corresponds to small p-value and vice-versa, but p-value is easier to interpret.

+",2013-10-28 00:08:25.780 +58328,21932.0,1,58363.0,,,hypothesis testing using poisson distribution,,CC BY-SA 3.0,"

At a nuclear plant great care is taken to measure the employees' health. These are the numbers of visits made by each of the 10 employees to the doctor during a calendar year: 3, 6, 5, 7, 4, 2, 3, 5, 1, 4.

+ +

Assuming the number of visits made by employee has a poisson distribution ,test the hypothesis that the annual mean per employee is greater than 3.

+ +

I am using the graphical method and I am not sure which $P[X=x]$ I should consider.

+ +

X: no.of visits by each employee to the doctor.

+ +
H0:lambda=3
+H1:lambda>3
+X follows a Poisson(3)
+
+ +

Then what is the probability that I should check?

+ +

What I did was, as the average of the sample data is 4.73636, I calculated $P[X\ge 4]$ and checked whether it was in the critical region. Is this the correct probability to calculate? In a Poisson distribution the expected value is calculated as $x \cdot P[X=x]$, right? Not as $(\sum x\, f(x))/(\sum x)$?

+",2013-10-28 01:17:32.970 +58329,22591.0,1,,,,Search in TF-IDF,,CC BY-SA 3.0,"

I want to find the similarity between a document and documents coded as TF-IDF in a pickle file (Python). The TF-IDF is done offline so there is no problem, but when I send a new document for a similarity check it takes around 2 minutes, while I need something real-time (< 2 seconds). For this purpose I used the following code:

+ +
for p_tf in p_tfidf:
+    temp_similarity = 0
+    for item in p_tf:
+        (score,word) = item
+        if word in input_text:
+            temp_similarity += score
+
+    similarity_score.append([temp_similarity, id])
+
+ +

Any clue how to improve system?

+",2013-10-28 01:38:35.207 +58330,23085.0,1,59733.0,,,Prove/counter example: A minimax decision rule is always Bayes wrt some proper prior,,CC BY-SA 3.0,"

Not sure whether the claim is true or false.

+ +

If claim is true, intuitively, it might have something to do with ""least favorable priors"", but am not able to figure out the connection.

+ +

If the claim is false, one example is when $X_i|\theta \sim $ Poisson$(\theta)$; then $\bar{X}$ is minimax. But a Gamma$(\alpha, \beta)$ prior fails since that would require $\beta = 0$, which is improper. But how do we know there is no other prior that gives $\bar{X}$ as a Bayes rule?

+",2013-10-28 02:20:34.733 +58331,855.0,2,,40121.0,,,,CC BY-SA 3.0,"

There is a discussion of the circle construction in the JMP help/manuals. See Statistical Details for Comparison Circles.

+",2013-10-28 02:39:50.397 +58332,8926.0,2,,58329.0,,,,CC BY-SA 3.0,"

You can make use of sklearn.feature_extraction.text.TfidfVectorizer

+ +

A simple example:

+ +
from sklearn.feature_extraction.text import TfidfVectorizer
+vectorizer = TfidfVectorizer(min_df=1)
+
+my_phrases = [""boring answer phrase"",
+              ""exciting phrase"",
+              ""phrase on stackoverflow"",
+              ""answer on stackoverflow""]
+
+my_features = vectorizer.fit_transform(my_phrases)
+
+ +

Result:

+ +
>>> import numpy as np
+>>> np.set_printoptions(precision=4)
+>>> my_features.A
+array([[ 0.5535,  0.702 ,  0.    ,  0.    ,  0.4481,  0.    ],
+       [ 0.    ,  0.    ,  0.8429,  0.    ,  0.538 ,  0.    ],
+       [ 0.    ,  0.    ,  0.    ,  0.6137,  0.4968,  0.6137],
+       [ 0.5774,  0.    ,  0.    ,  0.5774,  0.    ,  0.5774]])
+>>> vectorizer.get_feature_names()
+[u'answer', u'boring', u'exciting', u'on', u'phrase', u'stackoverflow']
+
+ +

As a side note, you can remove ""stop words"" like ""on"", by passing stop_words='english' parameter:

+ +
vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
+
+ +

Edit:

+ +
from sklearn.feature_extraction.text import TfidfVectorizer
+import numpy as np
+
+# each phrase here could be document in your list 
+# of documents
+my_phrases = [""boring answer phrase"",
+              ""exciting phrase"",
+              ""phrase on stackoverflow"",
+              ""answer on stackoverflow""]
+
+#  and you want to find the most similar document
+#  to this document             
+phrase = [""stackoverflow answer""]
+
+# You could do it like this:
+vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
+all_phrases = phrase + my_phrases
+my_features = vectorizer.fit_transform(all_phrases)
+scores = (my_features[0, :] * my_features[1:, :].T).A[0]
+best_score = np.argmax(scores)
+answer = my_phrases[best_score]
+
+ +

Result:

+ +
>>> answer
+'answer on stackoverflow'
+
+",2013-10-28 02:52:52.830 +58341,21985.0,1,,,,How to get exact distribution of estimated p for binomial distribution?,,CC BY-SA 3.0,"

This question is kind of a follow up of another question I had: Asymptotic normal distribution via the central limit theorem

+ +

There I had to calculate the estimator for $p$ (meaning $p$, the probability of success) and approximate its distribution with a normal distribution.

+ +

Now I would like to get the exact distribution of $\hat{p}$.

+ +

I got already the following hint: ""You have the functional form of $\hat{p}$. Look up how we derive the distribution of a function of a discrete random variable."" Unfortunately that did not lead me to a solution...

+",2013-10-28 07:21:43.657 +58333,23086.0,1,58335.0,,,R- Analysis of homogeneity of slopes,,CC BY-SA 3.0,"

I'd like to analyse the effect of a treatment (treatment: Factor w/ 2 levels ""ambient"",""elevated"") on tree diameter increment. Tree diameter increment is influenced by tree size. To do so, I performed the following lm:

+ +
Call:
+lm(formula = BAI2013 ~ diameterJul12 * treatment, data = bandNA)
+
+Residuals:
+     Min       1Q   Median       3Q      Max 
+-16.6493  -3.1740  -0.3767   3.3631  22.7267 
+
+Coefficients:
+                                 Estimate Std. Error t value Pr(>|t|)    
+(Intercept)                     -20.49357    2.12883  -9.627  < 2e-16 ***
+diameterJul12                     1.24194    0.08876  13.992  < 2e-16 ***
+treatmentelevated                10.72336    3.45783   3.101 0.002295 ** 
+diameterJul12:treatmentelevated  -0.54953    0.14795  -3.714 0.000285 ***
+---
+Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+
+Residual standard error: 6.461 on 153 degrees of freedom
+Multiple R-squared:  0.6035,    Adjusted R-squared:  0.5958 
+F-statistic: 77.63 on 3 and 153 DF,  p-value: < 2.2e-16
+
+ +

How can I interpret the results? I need to figure out whether the slope of BAI~diameter is steeper for elevated trees than for ambient ones. Thanks

+",2013-10-28 03:04:51.660 +58334,5448.0,2,,58307.0,,,,CC BY-SA 3.0,"

One specific method is importance sampling. The key slide in the link is slide 3.

+ +

In this case, you'd:

+ +
  1. Generate a large random sample from your prior, let us denote it $\theta_i, i = 1, \dots, N$.
  2. Each element of that sample will have associated with it a value of the likelihood function, let us say $l_i$. Calculate them.
  3. We can then form resampling acceptance probabilities $p_i = l_i / \max l_i$.
  4. Generate your posterior sample as follows (a small R sketch of the whole procedure is given below):

     a) For each $j = 1, \dots,$ some large $M$, select some index $k$ uniformly from $\{1,\dots,N\}$.
     b) Generate $u \sim \text{U}(0,1)$.
     c) If $u < p_k$, then $\theta_k$ is put into your posterior sample. Otherwise, go to the next $j$, and nothing is put into your posterior sample.
+ +
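A small R sketch of the whole procedure (my own illustration, for a toy Bernoulli problem where we only sample from the prior and never evaluate its density):

set.seed(7)
y <- rbinom(50, size = 1, prob = 0.3)       # toy observed data
N <- 1e5
theta <- runif(N)                           # step 1: a large sample from the prior
s <- sum(y)
loglik <- s * log(theta) + (length(y) - s) * log(1 - theta)
p <- exp(loglik - max(loglik))              # step 3: l_i / max l_i, computed in logs

M <- 1e5
k <- sample.int(N, M, replace = TRUE)       # step 4a: pick indices uniformly
u <- runif(M)                               # step 4b
post <- theta[k[u < p[k]]]                  # step 4c: keep the accepted theta_k
length(post); mean(post)                    # approximate posterior sample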

Of course, if you can't generate a large random sample from the prior, or you have a few relatively large values of the likelihood and a lot of very small ones (which would happen if your posterior is very concentrated with respect to the prior), you won't get very good results. No panaceas here, I'm afraid! But this method works quite well in many cases.

+",2013-10-28 03:07:04.160 +58335,23087.0,2,,58333.0,,,,CC BY-SA 3.0,"

The maximum likelihood fitted model is

+ +
BAI2013 = 1.24194 * diameterJul12 + 10.72336 * treatmentelevated - 0.54953 * diameterJul12 * treatmentelevated - 20.49357
+
+ +

Here treatmentelevated is a binary variable which is 1 for ""elevated"". All terms have significant p-values (these come from Wald statistics), suggesting they should be kept in the model. The interaction term is negative, suggesting the slope of BAI~diameter is less steep (by 0.54953) in the elevated treatment group.
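To make the two slopes explicit (my own arithmetic from the coefficients above): the fitted slope of BAI2013 on diameterJul12 is 1.24194 for the ambient group and 1.24194 - 0.54953 = 0.69241 for the elevated group, so the elevated slope is roughly half as steep.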

+",2013-10-28 04:12:45.507 +58336,23087.0,2,,58307.0,,,,CC BY-SA 3.0,"

How high dimensional is your state space? If it's a univariate problem I would suggest slice sampling. If it's higher dimensional you might be able to use slice sampling within Gibbs sampling still. If not, the other suggestions of ABC or importance sampling (some versions of ABC use importance sampling as an inner loop also), may be your best bet if there really isn't any additional structure you can exploit. As @jbowman says however, if your prior and posterior are very mismatched these methods will struggle.

+",2013-10-28 04:20:45.933 +58337,19750.0,1,58340.0,,,Multiple definitions of AdaBoost,,CC BY-SA 3.0,"

The description of AdaBoost in Kevin Murphy's Machine Learning book (shown in a snapshot below) differs from the one in Wikipedia. I am trying to relate both definitions. Step by step, my questions are:

+ +
  1. What exactly is $\text{err}_m$ (step 4) supposed to capture below? Is this equivalent to the $\epsilon_t$ in Wikipedia's definition?

  2. Why isn't there a stopping rule in Kevin Murphy's method but there is one in Wikipedia's definition?

  3. There seems to be a typo in the parenthesis in the denominator, just in case - is it supposed to say the following?

     $\text{err}_m = \frac{\sum_{i=1}^N w_{i,m}\, \mathbb{I}(\hat{y}_i \neq \phi(\mathbf{x}_i))}{\sum_{i=1}^N w_{i,m}}$

  4. Most importantly, Wikipedia provides the following criteria for choosing the weak learner and for stopping:

     $h_{t} = \underset{h_{t} \in \mathcal{H}}{\operatorname{argmax}} \; \left\vert 0.5 - \epsilon_{t}\right\vert$ where $\epsilon_{t} = \sum_{i=1}^{m} D_{t}(i)\, I(y_i \ne h_{t}(x_{i}))$

     If $\left\vert 0.5 - \epsilon_{t}\right\vert \leq \beta$, where $\beta$ is a previously chosen threshold, then stop.

     while Kevin's book defines the full algorithm as follows, and I don't see those two steps above in it:
+ +

+",2013-10-28 04:28:34.460 +58338,23089.0,1,,,,How to interpret the results of ADF test using SAS ARIMA?,,CC BY-SA 3.0,"
                                       The SAS System      14:11 Thursday, October 6, 2013   1
+
+                                      The ARIMA Procedure
+
+                                Name of Variable = ln_G_S_Index
+
+                     Period(s) of Differencing                           1
+                     Mean of Working Series                       0.094293
+                     Standard Deviation                           0.316757
+                     Number of Observations                             15
+                     Observation(s) eliminated by differencing           1
+
+
+                                        Autocorrelations
+
+ Lag    Covariance    Correlation    -1 9 8 7 6 5 4 3 2 1 0 1 2 3 4 5 6 7 8 9 1      Std Error
+
+   0      0.100335        1.00000    |                    |********************|             0
+   1     0.0026693        0.02660    |          .         |*        .          |      0.258199
+   2     -0.018517        -.18456    |          .     ****|         .          |      0.258382
+   3      0.029440        0.29342    |         .          |******    .         |      0.267025
+
+                                 ""."" marks two standard errors
+
+
+                                    Inverse Autocorrelations
+
+               Lag    Correlation    -1 9 8 7 6 5 4 3 2 1 0 1 2 3 4 5 6 7 8 9 1
+
+                 1       -0.14763    |          .      ***|         .          |
+                 2        0.19526    |          .         |****     .          |
+                 3       -0.27516    |          .   ******|         .          |
+
+
+                                    Partial Autocorrelations
+
+               Lag    Correlation    -1 9 8 7 6 5 4 3 2 1 0 1 2 3 4 5 6 7 8 9 1
+
+                 1        0.02660    |          .         |*        .          |
+                 2       -0.18539    |          .     ****|         .          |
+                 3        0.31522    |          .         |******   .          |
+
+
+                               Phillips-Perron Unit Root Tests
+
+              Type           Lags         Rho    Pr < Rho        Tau    Pr < Tau
+
+              Zero Mean         0    -11.6883      0.0066      -3.23      0.0033
+                                1    -11.4504      0.0074      -3.23      0.0034
+              Single Mean       0    -13.7527      0.0129      -3.71      0.0171
+                                1    -12.6667      0.0218      -3.76      0.0157
+              Trend             0    -14.5288      0.0601      -3.25      0.1144
+                                1    -13.1531      0.1022      -3.20      0.1239
+
+
+                            Augmented Dickey-Fuller Unit Root Tests
+
+    Type           Lags         Rho    Pr < Rho        Tau    Pr < Tau          F    Pr > F
+
+    Zero Mean         0    -11.6883      0.0066      -3.23      0.0033
+                      1    -12.4302      0.0041      -2.42      0.0197
+    Single Mean       0    -13.7527      0.0129      -3.71      0.0171       6.91    0.0157
+                      1    -25.2133      <.0001      -3.63      0.0214       6.59    0.0206
+    Trend             0    -14.5288      0.0601      -3.25      0.1144       6.44    0.0799
+                      1    -45.0252      <.0001      -3.20      0.1265       6.92    0.0622
+
+",2013-10-28 05:34:54.517 +58339,23090.0,1,,,,Is this test answer good enough to show Granger Causality?,,CC BY-SA 3.0,"

I used an inbuilt Matlab function to check for Granger Causality between two time series P and T that have a correlation coefficient of 0.6.

+ +

The function is : +https://www.mathworks.com/matlabcentral/fileexchange/25467-granger-causality-test/content/granger_cause.m

+ +

On running it, I got the following answer:

+ +
>> [fs,cv] = granger_cause(P, T,0.05,2)
+
+fs =
+
+  1.0281e+003
+
+
+cv =
+
+    2.9966
+
+ +

where FS is the F-Statistic and cv is the critical value from the F-Distribution.

+ +

Does this show causality? Actually... what does it show?

+",2013-10-28 05:47:22.693 +58344,23091.0,1,,,,Exponential regression: calculating p-value and F significance,,CC BY-SA 3.0,"

I have a set of independent data and dependent data $(X,Y)$, where I would like to do an exponential regression to obtain its p-value and significant $F$ (already obtained $R^2$ and also the coefficients through mathematical calculation).

+ +

Often exponential data, $y = b e^{mx}$, are first converted to linear form, $\ln y = mx + \ln b$. Then a linear regression is done on the converted data, obtaining its p-value etc. Assuming we use a statistical tool such as Excel's Analysis ToolPak (Data Analysis: Regression), it will produce a result such as the one below,

+ +

+ +

I believe the p-value and Significance $F$ value above represent the converted linear data and not the original exponential data.

+ +

Questions:

+ +
  1. What is the approach/steps used by Excel to get the p-value and Significance F value for the converted linear data, as shown in the statistics output in the image above? It is not clear in their help page or website.

  2. Can the p-value and Significance F be mathematically calculated for exponential regression without using a statistical tool? Can you point me to the right link if this has been answered before?
+ +

I have spent a week studying this on the internet but have been unable to find the right answer.

+",2013-10-28 08:36:36.887 +58345,21886.0,1,,,,Gaussian process estimation,,CC BY-SA 4.0,"

The stochastic process $(X_t)_{t\in T}$ is called Gaussian if for all $t_1,\dots,t_k\in T$, for all $k$, the joint distribution of $X_{t_1},\dots,X_{t_k}$ is multivariate normal. The process is completely characterized by its mean function $$\mu(t) = \mathbb{E}[X_t]$$ and its covariance function $$\sigma(s,t) = \operatorname{Cov}[X_s,X_t].$$

+

Given a centered (0-mean) Gaussian process, is it possible to estimate its covariance function?

+",2013-10-28 09:44:43.930 +58346,20144.0,1,,,,"Using cross correlation to infer dependence, can it be done?",,CC BY-SA 3.0,"

I have a very particular question, I have seen a similar one here, but my knowledge is too limited to make use of it. I will try to explain myself as clearly as possible... Wish me luck!

+ +

I have a sequence of (a priori) probabilities of a binary variable, we could say $X_i \sim Ber(p_i)$, i.e. $P(X_i = 1) = 1 - P(X_i = 0)=p_i$. I know that $X_i$ are not independent, but I don't know exactly $P(X_i|X_{j\neq i})$ nor $P(X_i|X_{j_1\neq i},...,X_{j_n\neq i})$ (I don't know anything about their dependence), and I want to know $$P(\sum_{j=-K}^{j=K} X_{i+j} =0)$$ +What kind of knowledge/hypothesis do you think I need in order to approximate this more efficiently? Assuming that they are independent does not work well enough. What would you do if you found a similar situation? I am not an expert at all in this matters, and the more I learn about statistics the less I know!

+ +

For example, I can see that the probabilities form small ""triangle"" shapes, so maybe something like $P(X_i = 1 | X_{i-1} = 1, X_{i-2} = 0) = P(X_i = 0 | X_{i-1} = 0, X_{i-2} = 1) $ can help? If so, is there any way to use this? As I said, I have no idea...

+ +

Thank you very much for your help!

+ +

edit: I think the title is not very good, but I don't know how to explain it better... One more question, can I say $Y_i = X_i - X_{i-1}$ and try to see $P(X_i|Y_{i-1})$?

+",2013-10-28 10:00:02.867 +58347,11772.0,2,,58345.0,,,,CC BY-SA 3.0,"

If the form of the kernel is known (many real applications use an RBF kernel for example), it is possible, given a set of observations $(x_t, y_t)$ to estimate its hyperparameters (the length-scale for RBF) via maximisation of the marginal likelihood.

+ +

You should take a look at Chapter 5 of Gaussian Processes for Machine Learning. You will find an example of MATLAB code in the gpml documentation, in the paragraph of the ""Regression"" section starting with ""Typically, we would not a priori know the values of the hyperparameters...""
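
Since gpml is MATLAB, here is a rough standalone R sketch of the same idea (maximising the log marginal likelihood of an RBF kernel on toy data; the data, names and starting values are my own illustration, not code from gpml):

set.seed(1)
x <- seq(0, 5, length.out = 40)
y <- sin(2 * x) + rnorm(length(x), sd = 0.2)    # toy observations
rbf <- function(x1, x2, ell, sf) sf^2 * exp(-0.5 * outer(x1, x2, ""-"")^2 / ell^2)
neg_log_marglik <- function(par) {
  ell <- exp(par[1]); sf <- exp(par[2]); sn <- exp(par[3])  # log scale keeps them positive
  K <- rbf(x, x, ell, sf) + diag(sn^2, length(x))
  L <- chol(K)                                   # K = t(L) %*% L
  alpha <- backsolve(L, forwardsolve(t(L), y))   # alpha = K^{-1} y
  0.5 * sum(y * alpha) + sum(log(diag(L))) + 0.5 * length(x) * log(2 * pi)
}
fit <- optim(c(0, 0, log(0.1)), neg_log_marglik, method = ""BFGS"")
exp(fit$par)   # estimated length-scale, signal sd and noise sd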

+",2013-10-28 10:06:38.203 +58349,503.0,5,,,,,,CC BY-SA 3.0,,2013-10-28 10:42:29.940 +58348,503.0,4,,,,,,CC BY-SA 3.0,See `continuous-data`,2013-10-28 10:42:29.940 +58350,20144.0,2,,58346.0,,,,CC BY-SA 3.0,"

O.K., I think I found a way of doing it with the correct assumptions. Although it is only useful for my particular problem, maybe somebody can tell me if I am being too ""sloppy"", correct me or maybe my approach might be useful to somebody in the future.

+ +

First of all, I had not realized that the ""triangles"" account for ""long independent events"". This means that $P(X_i = 0 | \sum_{j\neq i, |j-i| < D} X_j = 1) = 1$ in my notation ($X_i$ represents that an event starts in the moment $i$). What I started doing was a smoothing of $2D$ of $p_i$, so I took the average (can I do this?) in windows. This gave me a way of seeing how many events are there in a sequence. +

+ +

The orange line is the original $p_i$ sequence, the black one is the smoothed. After this, in a window of length $2K$, I count how many peaks are there (how many possible events), and the probability of each event is the sum of probabilities from the beginning until the end of the ""hill"", although, as can be seen in the picture, sometimes they can overlap, but I have no idea of how can I take that into account. Then, the probability of no event happening in the window of length $2K$ is the product of probabilities of no ""long event"" happening in this window.

+ +

Do you think it is a good answer? Do you have any comment/suggestion? Thank you very much.

+",2013-10-28 10:49:42.880 +58351,503.0,2,,52126.0,,,,CC BY-SA 3.0,"

First, we'll need to know whether you are interested in the response to each Likert question or to a sum of Likert questions; if the latter, it matters how many questions and what the distribution of the scale looks like.

+ +

Either way, you will have to account for the nonindependence of the data, because the same people are answering the questions multiple times. Repeated measures ANOVA is one solution to this, but it makes unrealistic assumptions including sphericity, and would only be usable for the scale score, and only if the scores ranged fairly widely so that you could pretend they were continuous.

+ +

A better option is a mixed model. If you treat the scores as continuous data, then this would be a linear mixed model; if you treat them as ordinal (as you would have to do if you were interested in each question) then you would need a nonlinear mixed model.
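
As a minimal sketch of the linear-mixed-model route (my own illustration using the lme4 package and made-up long-format data, not something from your study):

library(lme4)
set.seed(1)
dat <- data.frame(
  subject   = factor(rep(1:30, each = 2)),
  condition = factor(rep(c(""pre"", ""post""), times = 30)),
  score     = sample(1:5, 60, replace = TRUE)      # toy Likert-type scores
)
m <- lmer(score ~ condition + (1 | subject), data = dat)  # random intercept per person
summary(m)
# For the ordinal (per-question) route, a cumulative link mixed model such as
# ordinal::clmm(factor(score) ~ condition + (1 | subject), data = dat) is one option.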

+ +

Unfortunately, these models are not simple to implement. If you currently know only about t-tests, then you may need to hire a consultant to help.

+",2013-10-28 10:53:25.943 +58352,6708.0,1,,,,Spearman's Rho - from partial ranked variables,,CC BY-SA 3.0,"

I have two variables which represent two performance measures.
I ranked a finite set of elements according to these two variables.
Therefore, I have two ranks. Suppose the ranks are in descending order (the higher the measure, the higher the value in the decision process of each element).

+ +

For instance, consider an ordered set $\Omega$ of $10$ elements ranked according to the two measures A and B.

+ +
R#A = [5 3 1 9 2 10 6 7 4 8];  
+R#B = [9 7 4 8 5 6 1 3 2 10];
+
+ +

Suppose that now I truncate R#A and R#B in order to select the ""top 5"" elements:

+ +
R#A_5 = [5 3 1 9 2];
+R#B_5 = [9 7 4 8 5];
+
+ +

In your opinion it is still possible to get the Spearman's rho correlation coefficient with these two partial orders?

+ +

I know that:
1) We are in the second step of Spearman's procedure because we are already dealing with ranks.
2) The sample size is very low, but it is just for explanation.

+",2013-10-28 11:21:57.617 +58353,23094.0,1,59809.0,,,(Standalone) Software for plotting graphs of large amounts of data and allowing you to scroll/zoom,,CC BY-SA 3.0,"

I've got some temporal data taken from a data logger that I'm trying to plot in a graphical form (as a line graph). Because it's a large amount of data, plotting it on one big graph (e.g. in Excel) makes it difficult to explore the visualised data as you can't really zoom in and scroll through the data. What I'm looking for is some standalone software that can plot the data as a line graph, but also allow the user to easily scroll through the graph along the horizontal (time) axis and be able to zoom that axis in and out. Ideally, the software would be free and be GUI driven. Does anyone know of any such software?

+ +

Thanks,

+",2013-10-28 11:29:09.030 +58354,23095.0,1,,,,How to mix probability estimators of the same phenomenon?,,CC BY-SA 3.0,"

Also posted here and here.

+ +

I have the following problem:

+ +

I have N models that give me an estimation of the probability distribution function p(x) of a certain phenomenon x. Let's call them: $p_1(x),...,p_N(x)$. They come from different sources of information, so they can give different values, but they all refer to the same observable fact. Is there a formal and valid way to combine them into a single formula?

+ +

I have read about mixture distributions, are they applicable to my case?

+ +

Suppose that my models can be more or less reliable, depending on the source of information they are based on, can I also combine them giving more weight to one model rather than another?

+ +

One possible solution I have thought is to make a weighted average of all the PDFs: $p(x) = w_1*p_1(x) + ... + w_N * p_N(x)$ where $\sum\limits_{i=0}^N w_i = 1$, does it make sense?

+ +

Thanks very much for your suggestions!

+ +

PS: to be more concrete my models give me the probability distribution that a certain person is in the position (x,y) and they rely on different sources of information like the power of a received signal or some other observable fact.

+",2013-10-28 11:59:45.803 +58355,503.0,2,,58352.0,,,,CC BY-SA 3.0,"

I think you have one of two problems, depending on what exactly R#A and R#B are. For example, does the 5 in R#A mean that the first element has a rank of 5, or does it mean that the fifth element has a rank of 1?

+ +

If it is the former, then you have not selected the top 5 elements. If it is the latter then R#A and R#B contain different elements.

+ +

You can certainly run a correlation on the two vectors; but what will the output mean?

+ +

Perhaps you can tell us what you are trying to accomplish.

+",2013-10-28 12:25:34.520 +58356,23097.0,1,,,,How can to compare 1750 samples between 3 groups by R?,,CC BY-SA 3.0,"

I have 1750 proteins and I want to compare their expression levels between 3 groups (cell types) using R. How can I do it?

+",2013-10-28 12:27:32.710 +58357,22159.0,1,58358.0,,,What are the limitations of Kernel methods and when to use kernel methods?,,CC BY-SA 3.0,"

Kernel methods are very effective in many supervised classification tasks. What are the limitations of kernel methods, and when should kernel methods be used? Especially in the large-scale data era, what are the recent advances in kernel methods? What is the difference between kernel methods and multiple instance learning? If the data are 500x10000, where 500 is the number of samples and 10000 is the dimension of each feature vector, can we use kernel methods in this setting?

+",2013-10-28 12:33:15.317 +58358,17740.0,2,,58357.0,,,,CC BY-SA 3.0,"

Kernel methods can be used for supervised and unsupervised problems. Well-known examples are the support vector machine and kernel spectral clustering, respectively.

+ +

Kernel methods provide a structured way to use a linear algorithm in a transformed feature space, for which the transformation is typically nonlinear (and to a higher dimensional space). The key advantage this so-called kernel trick brings is that nonlinear patterns can be found at a reasonable computational cost.

+ +

Note that I said the computational cost is reasonable, but not negligible. Kernel methods typically construct a kernel matrix $\mathbf{K} \in \mathbb{R}^{N\times N}$ with $N$ the number of training instances. The complexity of kernel methods is therefore a function of the number of training instances, rather than the number of input dimensions. Support vector machines, for example, have a training complexity between $O(N^2)$ and $O(N^3)$. For problems with very large $N$, this complexity is currently prohibitive.

+ +

This makes kernel methods very interesting from a computational perspective when the number of dimensions is large and the number of samples is relatively low (say, less than 1 million).

+ +

Related: Linear kernel and non-linear kernel for support vector machine?

+ +

SVM for Large Scale Problems

+ +

For very high dimensional problems, such as the 10000 dimensions you mention in the question, there is often no need to map to a higher dimensional feature space. The input space is already good enough. For such problems, linear methods are orders of magnitude faster with almost the same predictive performance. Examples of these methods can be found in LIBLINEAR or Vowpal Wabbit.

+ +

Linear methods are particularly interesting when you have many samples in a high dimensional input space. When you have only $500$ samples, using a nonlinear kernel method will also be cheap (since $N$ is small). If you had, say, $5.000.000$ samples in $10.000$ dimensions, kernel methods would be infeasible.
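
As a rough stand-in illustration of this point in R (my own toy example using the e1071 package rather than LIBLINEAR/Vowpal Wabbit, and a smaller matrix than 500x10000 so it runs quickly):

library(e1071)
set.seed(42)
n <- 500; p <- 2000                          # small n, large p toy data
X <- matrix(rnorm(n * p), n, p)
y <- factor(ifelse(X[, 1] + 0.5 * X[, 2] + rnorm(n) > 0, 1, -1))
system.time(fit_lin <- svm(X, y, kernel = ""linear""))
system.time(fit_rbf <- svm(X, y, kernel = ""radial""))
mean(predict(fit_lin, X) == y)               # training accuracy, linear kernel
mean(predict(fit_rbf, X) == y)               # training accuracy, RBF kernel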

+ +

For low-dimensional problems with many training instances (so-called large $N$ small $p$ problems), linear methods may yield poor predictive accuracy. For such problems, ensemble methods such as EnsembleSVM provide nonlinear decision boundaries at significantly reduced computational cost compared to standard SVM.

+",2013-10-28 12:57:31.210 +58359,192.0,2,,58353.0,,,,CC BY-SA 3.0,"

Gnuplot is free, open source and highly versatile and what I use and I think it will meet your needs. You can point and click with the mouse to zoom in and out on any part of a graph, and you can even write a script to scroll through the data as if watching a film.

+",2013-10-28 13:08:05.197 +58360,9554.0,2,,58354.0,,,,CC BY-SA 3.0,"

I think Bayesian model comparison might be what you are looking for. See for example Bishop, Chapter 3.4. +Generally speaking, given a set of $N$ models, you choose your weights to correspond to the posterior probability of each model. +$$ p(M_i | D) \propto p(M_i)p(D | M_i) $$ +where $p(M_i)$ is the prior of model importance, you can assume this to be uniform, and $D$ is your data. Hence $p(D | M_i)$ is simply the likelihood of model $i$ given data.

+ +

The predictive probability for a new value $y^*$ and explanatory values $\mathbf{x}$ is then: +$$ p(y^* | \mathbf{x}, D) = \sum_{i=1}^N p(y^*|\mathbf{x},D,M_i)p(M_i | D)$$ +where we use the posteriors as weighting between models.

+ +

That's the theory. Now in practice, unless you happen to be dealing with conjugated probabilities, you won't get a closed form solution for those posteriors and the above is pretty much useless.

+ +

When people fit a mixture of distributions, they usually use a mixture of Gaussians and in rare occasions a mixture of t distributions. I think the reason is simply that the mixture is fit using the EM algorithm (again, see Bishop) and Gaussians are particularly useful since their posterior is again a Gaussian and you can get all the required updates for the EM algorithm in closed form solution. And when I say ""fit"", they don't fit them individually, but learn the best parameters for all mixture components from the data, which is not what you are doing.

+ +

Don't worry about that for now and simply check whether you can get the posteriors for your model, or whether you don't want to fit your mixture using the EM and some well known distribution, such as the Gaussian, or t distribution in case of many outliers.

+ +

EDIT (to your comment):

+ +

So first of all, my notation: $y$ is the quantity you are trying to model, $x$ is a vector of data used to model $y$.

+ +

The data $D$ is just a tuple of vectors $x$, used to model a tuple of $y$'s. (basically think any normal dataset with a dependent variable and multiple independent ones)

+ +

OK. now, if I understand correctly you are trying to fit a complex distribution and you somehow already know the models that explain the data (perhaps because you know the generating mechanism) so you only need the mixing proportions.

+ +

You could try the following, which is guaranteed to work for Gaussians, and I don't see why it shouldn't work for other distributions too. (though there is a big caveat here!)

+ +

Calculate for each point x the likelihood that the point was generated by model $f1$ and $f2$, where the parameters are known. You end up with a matrix of likelihoods $2 x N$ where N is the number of your points. sum each row and divide by N. You should get the responsibility of each of the models for generating the data.

+ +

Use that to weight your mixture. You should also check if the resulting density integrates to 1, and if not normalize appropriately.
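
A small sketch of that weighting recipe as I read it (two known Gaussian components and toy data of my own; this is just one way to implement it):

set.seed(1)
x <- c(rnorm(70, 0, 1), rnorm(30, 3, 2))          # toy sample
l1 <- dnorm(x, mean = 0, sd = 1)                  # likelihood of each point under f1
l2 <- dnorm(x, mean = 3, sd = 2)                  # likelihood of each point under f2
resp <- cbind(l1, l2) / (l1 + l2)                 # per-point responsibilities
w <- colMeans(resp)                               # mixture weights, sum to 1
w
mixture_pdf <- function(t) w[1] * dnorm(t, 0, 1) + w[2] * dnorm(t, 3, 2)
integrate(mixture_pdf, -Inf, Inf)                 # check it integrates to 1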

+",2013-10-28 13:20:37.797 +58361,,2,,58356.0,anon,,,CC BY-SA 3.0,"

Bioconductor project produces software (add-ons for R) for bioinformatics. Bioconductor offers several solutions to your problem. Possibly the easiest one is to use the limma package (http://www.bioconductor.org/packages/release/bioc/html/limma.html). It has an extensive user guide that walks you through the basics of the analysis.

+ +

In addition, see the answer eBayes() lmFit() for a quick overview of the workflow and the functions in the limma package.

+ +

What you need to consider is the question you are interested in. These questions are coded in a design matrix. For example, if you have one control group (C), and two treatment groups (T1, T2), and you are interested in comparing both treatments with the control group, you might generate the following model matrix. But, before generating the model matrix, let's assume your data matrix (containing the expression values) contains the controls in the three first columns, then three T1 columns, and last three T2 columns. A vector (listing the groups in the same order they appear in the data matrix) containing the group information can be turned into a model matrix as follows:

+ +
# ""group"" vector
+groups<-c(""C"",""C"",""C"",""T1"",""T1"",""T1"",""T2"",""T2"",""T2"")
+design<-model.matrix(~groups)
+design
+  (Intercept) groupsT1 groupsT2
+1           1        0        0
+2           1        0        0
+3           1        0        0
+4           1        1        0
+5           1        1        0
+6           1        1        0
+7           1        0        1
+8           1        0        1
+9           1        0        1
+
+ +

If you do not have a control group, but some comparison group anyhow, just specify it as the first group (alphabetically) in the groups vector. The model matrix will automatically use it as a baseline with which all others are compared.

+ +

This does a simple comparison of groups (T1 v. C and T2 v. C). If you have something more complex in mind, please elaborate your question a bit to address this.
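
To connect this to the lmFit()/eBayes() workflow mentioned above, a hypothetical continuation (the toy expression matrix below stands in for your real 1750 x 9 protein data):

library(limma)
set.seed(1)
exprs_mat <- matrix(rnorm(1750 * 9), nrow = 1750,
                    dimnames = list(paste0(""protein"", 1:1750), NULL))
fit <- lmFit(exprs_mat, design)      # ""design"" as constructed above
fit <- eBayes(fit)
topTable(fit, coef = ""groupsT1"")    # top proteins for T1 vs C
topTable(fit, coef = ""groupsT2"")    # top proteins for T2 vs C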

+",2013-10-28 13:22:00.490 +58433,23131.0,1,,,,Can we make the Irwin-Hall distribution more general?,,CC BY-SA 3.0,"

I need to find a symmetric low-kurtosis distribution class which includes the uniform, the triangular and the normal (Gaussian) distribution. The Irwin-Hall distribution (a sum of standard uniforms) has this characteristic, but it does not handle non-integer orders $N$. However, if you simply sum up, independently, e.g. 2 standard uniforms on $[0,1]$ and a third uniform with a smaller range like $[0,0.25]$, you do obtain a nicely generalized and smoothly extended version of Irwin-Hall for an arbitrary order (like $N=2.25$ in this case). I wonder, though, whether it is possible to find a practical closed formula for the CDF?

+",2013-10-29 14:07:10.673 +58362,21362.0,1,,,,Random forest “certainty / likelihood score” - how to score records in RF mode in R?,,CC BY-SA 3.0,"

My question is similar to this link Creating a "certainty score" from the votes in random forests?

+ +

I am trying to build a random forest for a binary response (1 & 0). Let's say we have 10,000 different records and I am building 500 trees. Is there a way to score the records in terms of the certainty / confidence / likelihood of being categorized in category 1 (for example)? The link above suggests using the number of votes among all 500 trees, but this way can only give me up to 500 different scores, how can I differentiate further for these 10,000 records? (Like regression, the scores can be easily obtained).

+ +

One solution is to average the score of each tree in the forest, where the score of a tree is the proportion of 1s in its terminal node. Does anyone know how to produce that average in R? I couldn't find this in the randomForest package. I think that if I write my own code to do it, the run time may not be as fast as a built-in function.
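
To make concrete the kind of per-record score I mean, here is a toy sketch with the randomForest package (the class-probability output, i.e. the fraction of tree votes; toy data are my own, not my real 10,000 records):

library(randomForest)
set.seed(1)
n <- 1000
dat <- data.frame(x1 = rnorm(n), x2 = rnorm(n))
dat$y <- factor(ifelse(dat$x1 + dat$x2 + rnorm(n) > 0, 1, 0))
rf <- randomForest(y ~ x1 + x2, data = dat, ntree = 500)
scores <- predict(rf, newdata = dat, type = ""prob"")[, ""1""]   # per-record P(class = 1)
head(scores)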

+",2013-10-28 14:10:32.017 +58363,,2,,58328.0,user31668,,,CC BY-SA 3.0,"

Clarkson - a convenient method for this particular problem will be to recognise that the sum of poisson variables is also poisson. In this case, you would model the total number of visits in the year as poisson(30) and see what you can infer from there.

+",2013-10-28 14:12:15.860 +58364,22262.0,1,,,,Should I de-mean a predictor variable before a dummy interaction,,CC BY-SA 3.0,"

Suppose I have the following time-series linear model where $\beta$ is misspecified:

+ +

$Y(t+1) = \alpha + \beta X(t) + \sum_{i=1}^{10000}\gamma_i Z_i(T) + \varepsilon$

+ +

where all parameters are in $\mathbb{R}$ and all predictors are normally distributed and play nicely with respect to Gauss Markov.

+ +

In the population ('correct') model that corresponds to the above, the slope parameter on $X(t)$ is equal to $\beta_1$ when $X(t) > \text{Q}_{0.95}(X(t))$ and is equal to $\beta_2$ when $X(t) < \text{Q}_{0.95}(X(t))$, where $Q$ is the quantile function. This break in parameters is a priori knowledge. We also have that $\beta_1 \neq \beta_2$.

+ +

I want to model this as follows:

+ +

$Y(t+1) = \alpha + \beta_1 X(t)I(X(t) > Q_{0.95}(X(t))) $

+ +

$+ \beta_2 X(t)I(X(t) < Q_{0.95}(X(t)))$

+ +

$+ \sum_{i=1}^{10000}\gamma_i Z_i(T) + \varepsilon$

+ +

My question is whether it is good practice to de-mean $X(t)$ before estimating the equation with the indicator function breaks, supposing my objective is out of sample forecast accuracy? I ask because this has an effect on collinearity between predictors (which could impact some, but not all, feature selection algorithms). Note that feature selection and regularisation is done in an automated way.

+ +

I do not care about standard errors.

+",2013-10-28 14:16:33.520 +58365,23103.0,1,,,,Linear Discriminant Function,,CC BY-SA 3.0,"

In linear discriminant analysis, how is the linear discriminant function determined? Assuming equal variance-covariance matrices, is the linear discriminant function determined from the training data?

+",2013-10-28 14:55:58.223 +58366,12884.0,2,,58365.0,,,,CC BY-SA 3.0,"

For Linear discriminant analysis the linear discriminant function is just the inner product of a given data point $\vec{x}$ with the vector $\vec{w}$, with the criterion $\vec{x} \cdot \vec{w} > c$. The vector $\vec{w}$ is calculated as:

+ +

$$\vec{w} = \Sigma^{-1}(\vec{\mu_0} - \vec{\mu_1})$$

+ +

Where $\vec{\mu_n}$ is the vector mean of sample class $n$, that is, the mean of the training data for class $n$.
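
A small numeric sketch of this formula (toy two-dimensional training data of my own, with a pooled covariance estimate standing in for the common $\Sigma$):

set.seed(1)
X0 <- matrix(rnorm(50 * 2), ncol = 2)                  # class 0 training data
X1 <- matrix(rnorm(50 * 2, mean = 1.5), ncol = 2)      # class 1 training data
mu0 <- colMeans(X0); mu1 <- colMeans(X1)
Sigma <- ((nrow(X0) - 1) * cov(X0) + (nrow(X1) - 1) * cov(X1)) /
         (nrow(X0) + nrow(X1) - 2)                     # pooled covariance estimate
w <- solve(Sigma, mu0 - mu1)                           # discriminant direction
scores <- rbind(X0, X1) %*% w                          # x . w for every training point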

+",2013-10-28 15:03:29.057 +58367,9749.0,2,,58322.0,,,,CC BY-SA 3.0,"

I am not sure why you used a t-test at all, descriptive statistics would suffice for your hypothesis and you could lump all 50 results together, as the difference between averages of 357 and 358 is experimentally irrelevant for your sample sizes, and in any case is piffling.

+ +

If, for example, you used Graphpad's free quickcalcs, you would get a 95% confidence interval of 343 to 371, for a sample size of 50 and a conjectural and probably grossly excessive standard deviation of 50. This shows that your ants eat significantly (though probably not substantively) more than 20 times their own weight on average.

+ +

If you really wanted to compare this using a t-test against an hypothetical average of 20 times their own weight, then you could use the one sample t-test that they provide, but I don't think that was really the point of your experiment. To be sure that these tests are appropriate, you should also check that the distributions are normal, which considering the nature of your experiment, I would imagine that they almost certainly are.

+ +

In addition to the suggestions made by others, I would also want to be sure that the ants didn't leave any excreta on their uneaten foods.

+",2013-10-28 15:54:38.527 +58368,21932.0,1,58378.0,,,Writing null hypothesis and deciding on rejection criteria,,CC BY-SA 3.0,"

The number of faults in one metre of a thread is Poisson distributed.It is claimed that the average number of faults is 0.02 per metre.A random sample of 100 one metre lengths of the thread reveals a total of 6 faults.Does this information support the claim?

+ +

What I want to know is whether my null hypothesis should be $H_0: \lambda = 0.02$ or $H_0: \lambda = 2$.

+ +

My next question is about deciding the rejection criterion: to reject $H_0$, is it enough to check whether $P[X \ge 6] < 0.025$ (testing at 5% significance), or should I also have looked at $P[X \le 6] < 0.025$ and checked whether either one of these is satisfied?

+ +

Or calculate 2*P[X>=6] and check if it is less than 0.05

+",2013-10-28 15:54:41.290 +58369,22865.0,1,,,,What is an 'atom' and what are 'atomic weights'?,,CC BY-SA 3.0,"

I have come across the following statement:

+ +
+

A notable feature of the Hierarchical Dirichlet Process is that all Dirichlet Processes' $G_j$ share the same set of atoms and only the atom weights differ. This is a result of the almost sure discreteness of the top-level DP.

+
+ +

What is meant by atom and atom weights? Googling gets to me Wiki articles about measure theory and I'm not able to understand them and also not sure if they are what I'm looking for.

+ +

Could anyone provide a simple explanation of their meaning?

+",2013-10-28 15:58:37.740 +58370,23108.0,2,,54724.0,,,,CC BY-SA 3.0,"

Well, for the second part: a more flexible model will try to fit the training data very closely, and if the training data contain a lot of noise, the flexible model will also try to learn that noise and will therefore produce a higher test error. I know the source of this question, as I'm also reading the same book :)

+",2013-10-28 16:08:53.440 +58371,13165.0,1,58374.0,,,"Is ""model selection"" the same as traning?",,CC BY-SA 3.0,"

A terminology problem. In machine learning we have the following problem:

+ +

Choosing the optimal model (or training): +$$ +f^* = \arg\min_{f \in \mathcal{F}} \sum_i l(f(x_i),y_i) +$$

+ +

Is the term ""model selection"" always ""exactly"" referring to this? Or something else?

+",2013-10-28 17:06:49.973 +58372,13165.0,1,61894.0,,,"How to train in models, with efficient inferences, like belief-propagation ?",,CC BY-SA 3.0,"

There are many papers that are devoted to efficient inference in graphical models. Though many of these paper don't explicitly talk about the learning (training, etc) problem. For example:

+ +

http://videolectures.net/mlss09uk_minka_ai/

+ +

I am a little confused about how these models are being trained. I thought they are probably doing an EM-like algorithm, i.e.

+ +
  1. Inference (and calculating all marginals, using VB or EP)
  2. Maximizing the likelihood using some black-box optimization toolbox, using the marginals from the previous step
+ +

For example, consider different variants of Belief-Propagation. There are HUGE number of variants for BP, but how a graphical model could be trained?

+ +

Any comments?

+",2013-10-28 17:12:01.877 +58373,9129.0,1,58617.0,,,How to evaluate/validate clusters using multiple clustering methods,,CC BY-SA 3.0,"

From some reading I did online, I understand that different clustering algorithms use various methods for determining ""similarity"". I am curious whether it is good practice to run multiple clustering algorithms/methods (e.g. hierarchical with Ward, single linkage, centroid, etc., or maybe even K-means) on a dataset, and whether there is some automated way to get a ""consensus"" of clusters - in other words, to get some sense of confidence that the right items are clustered together. Items that tend to cluster together using various methods would be considered valid. For example, in my example below, G and Z tend to cluster together using multiple methods, as do S and F.

+ +

Label = what I am clustering; X & Y are my variables I use to cluster; Cluster1-3 are the results of three clustering algorithms.

+ +

+ +

Edit: I removed a side note I had here regarding how large the actual data set I plan to use might be so as not to detract from the main questions.

+",2013-10-28 17:28:21.230 +58374,10450.0,2,,58371.0,,,,CC BY-SA 3.0,"

The best model is not necessarily the one which minimizes training error; model selection typically also tries to reduce overfitting by adding cost-complexity penalties and by cross-validating between training and validation samples.

+ +

web.engr.oregonstate.edu/cs534 slides

+",2013-10-28 17:29:05.487 +58375,3048.0,1,58918.0,,,"In weighted least squares, how do I weight the residuals to get an accurate ""z score""",,CC BY-SA 3.0,"

I am regressing spreads in yield curves in certain countries, in the chart below, the Spanish 2-5-10 spread, against the Italian 2-5-10 spread. +

+ +

I want recent data to count more, so I weight the inputs using a decay weighting scheme with a 1 year halflife.

+ +

A ""simple"" regression line and the weighted regression line are shown.

+ +

I want to calculate the perpendicular distance of the current point (green) from the regression line, in number of standard errors. In the unweighted regression, in R, I will simply say:

+ +
> l <- lm(SP ~ IT, data = ss)
+> last(l$residuals) / sd(l$residuals)
+2013-10-28 
+-0.1817122 
+
+ +

Which gives -.18 standard errors away from the regression line.

+ +

How do I do this same analysis for the weighted regression though? I am sure the following is incorrect:

+ +
> decay
+function(len, halflife, sumone = TRUE) {
+#function generates an exponentially decaying series
+    t <- len:1 # generate a series of numbers reverse order so biggest weights last
+    lambda <- log(2) / halflife #figure out the lambda for the halflife
+    w <- exp(-lambda * t) #create the weights series  
+    if(sumone) w <- w / sum(w) #normalise sum to 1 if necessary
+    return(w) 
+}
+> d <- decay(nrow(ss), 260)
+> ld <- lm(SP ~ IT, data = ss, weights = d)
+> last(ld$residuals) / sd(ld$residuals)
+2013-10-28 
+-0.3667876 
+
+ +

I should surely weight the residuals somehow, before doing the above, is that correct? Could I for example take the weighted standard deviation of the residuals that is:

+ +
> last(ld$residuals) / wt.sd(ld$residuals, d)
+2013-10-28 
+  -0.39717 
+
+ +

where my wt.sd function looks like this:

+ +
> wt.sd
+function (x, wt) {
+    return(sqrt(wt.var(x, wt)))
+}
+
+> wt.var
+function (x, wt) {
+    s = which(is.finite(x + wt))
+    wt = wt[s]
+    x = x[s]
+    xbar = wt.mean(x, wt)
+    return(sum(wt * (x - xbar)^2) * (sum(wt)/(sum(wt)^2 - sum(wt^2))))
+}
+
+ +

Basically, I want to know how to find the distance from the weighted regression line, in standard errors, accounting for the weights.

+",2013-10-28 18:16:36.610 +58376,23110.0,1,,,,What is a reasonable process to understand a collection of data?,,CC BY-SA 3.0,"

Can someone provide their thoughts on a structured process one might go through to understand a collection of data. +The scenario is: you've been given a set of data (features and observations - with descriptions) and been told to ""tell me what kind of interesting things this data can tell me"". +I.e., what are interesting questions that this data can answer. The meaning of ""interesting"" is certainly subjective.

+ +

This appears to be classical unsupervised learning.

+ +

My initial thoughts:

+ +
  1. Cluster all pairs of variables to look for interesting clusters
  2. Run PCA to find high-variance groupings
+ +

Is there a general ""how to understand a set of data"" process that you've found successful?

+ +

Thanks

+",2013-10-28 18:36:54.563 +58377,21762.0,2,,58371.0,,,,CC BY-SA 3.0,"

Training often involves model selection (choice of model structure, set of input variables, transformations, ...). But, as @MarcClaesen pointed out, training also includes the process of fitting the model, i.e. finding best values for its parameters.

+",2013-10-28 18:38:04.773 +58378,21029.0,2,,58368.0,,,,CC BY-SA 3.0,"

For question one, I assume you want to test if the claim in the question is true or not. The claim is that there are 0.02 faults per meter. In other words, the expected value is 0.02. However, the observed value is 6 faults in 100 meters, or 0.06 faults/meter. So,

+ +
  1. $H_0 : \lambda = 0.02$
  2. $H_1 : \lambda \gt 0.02$
+ +

You can also write the alternative as $\lambda \ne 0.02$.

+ +

In the second question, the rejection criteria depends on what level of $ \alpha $ you choose. It also depends on which of the two alternative hypotheses you chose. Since the observed value is greater, it is natural to chose greater than, not equal to. Assume a 5% type-1 error. Then, the sum of iid poissons variables is a new poisson random variable. You want to test

+ +

$ P\{ X_1 + X_2 + \dots + X_{100} \ge 6 \mid \lambda' = 2 \} = 1 - P\{X_1+\dots+X_{100} \le 5 \mid \lambda'=2 \}$

+ +

You are still testing whether $\lambda = 0.02$, but indirectly, by working with the new Poisson variable with rate $\lambda' = 100\lambda$. Since this is a one-tailed test, you want to check whether $P[X \ge 6]$ is less than 5%, not $2 \cdot P$, which would correspond to the two-tailed test.
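
For reference, this tail probability is a one-liner in R (my own check, not part of the original answer):

ppois(5, lambda = 2, lower.tail = FALSE)   # P(X >= 6 | lambda' = 2), about 0.0166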

+ +

Hope this helps.

+",2013-10-28 18:45:04.827 +58434,306.0,2,,58432.0,,,,CC BY-SA 3.0,"

The degrees of freedom are n-2. This lets you test whether rho (the correlation coefficient) is 0 or not. So calculate this value, find the p-value using the degrees of freedom, and decide whether the null hypothesis that the correlation coefficient is 0 can be rejected.

+ +

The autocorrelation of the variables does not have any effect on their correlation coefficient, unless you want to check whether the lag of one variable has any effect on the other, which the question does not indicate. So leave the autocorrelation aside.

+",2013-10-29 14:21:26.503 +58379,16992.0,2,,57691.0,,,,CC BY-SA 3.0,"

Here are a couple thoughts that may be helpful:

+ +
  • Auto-correlation doesn't matter when you only look at a single t at a time. So, at a fixed time t, you could just run a t-test to check for a difference in means. If you run the t-test for each time separately, then you get a bunch of p-values. Because of auto-correlation these p-values are not independent, but each p-value considered alone is just fine.

  • So now you want to find the times for which there is a difference in means. I would try using false discovery rate (FDR) methods (see the ""Benjamini-Hochberg procedure"" at http://en.wikipedia.org/wiki/False_discovery_rate). Luckily, this procedure controls the FDR even when there is positive dependence among your p-values. (see ""The Control of the False Discovery Rate in Multiple Testing under Dependency"", free version here http://thom.jouve.free.fr/work/thesis/sitecopy_save/Biblio/ToCheck/fdr/Benjamini2001.pdf) This should give you a reasonable first answer to your original question.

  • Finally, I think the two plots you drew are very clear. They are probably more informative than any kind of statistical analysis you can run... Good luck!
+ +

Edit by Roland:

+ +

Here is an R implementation of the FDR method for the example in the question. The result looks reasonable.

+ +
dat <- setNames(cbind(stack(as.data.frame(t(a))), 
+                      stack(as.data.frame(t(b)))), 
+                c(""a"", ""i"", ""b"", ""i""))
+dat <- dat[,-4]
+library(plyr)
+p.raw <- ddply(dat, .(i), function(df) t.test(df$a, df$b)$p.value)
+p.fdr <- cbind(p.adjust(p.raw[,2], method=""fdr""),
+               t[as.numeric(gsub(""V"","""",p.raw[,1]))])
+p.fdr[order(p.fdr[,2]),]
+
+#             [,1] [,2]
+#  [1,] 0.63001435    3
+#  [2,] 0.19439226    4
+#  [3,] 0.06200315    5
+#  [4,] 0.07335654    6
+#  [5,] 0.05336699    7
+#  [6,] 0.06115999    8
+#  [7,] 0.06115999    9
+#  [8,] 0.06103370   10
+#  [9,] 0.04324050   11
+# [10,] 0.04324050   12
+# [11,] 0.04324050   13
+# [12,] 0.04324050   14
+# [13,] 0.06103370   15
+# [14,] 0.05533972   16
+# [15,] 0.15489402   17
+# [16,] 0.58234624   18
+# [17,] 0.05533972   19
+# [18,] 0.04324050   20
+
+",2013-10-28 19:10:49.717 +58380,21029.0,2,,58302.0,,,,CC BY-SA 3.0,"

The answer to your question should logicaly be ""Yes."" The group means of the PC should differ from the PC of the means. This should happen for two reasons.

+ +
  1. You're transforming your variables into PCs, which try to maximize the total inertia. This depends on the spread of the data in the different variables.
  2. Once you take the means you eliminate most of the inertia. The PCA will be much more ""accurate"" (not surprisingly, almost 100% of the inertia is explained by the first component), but the composition of the PCs will be different because there is no longer much inertia to explain.
+ +

You can think of this in terms of how a PCA operates. The PCA is computed using squared distances, maximized for the first PCA. If you remove all this variability, the estimations change.

+",2013-10-28 20:02:34.957 +58381,,2,,58376.0,user31668,,,CC BY-SA 3.0,"

John Tukey came up with an entire field devoted to this: Exploratory Data Analysis. PCA is one part of it. Take a look and I'm sure you'll find some good ideas.

+",2013-10-28 20:13:15.203 +58382,18372.0,1,58383.0,,,Large scale ridge regression,,CC BY-SA 3.0,"

I'm trying to solve a problem of the form

+ +

$\min_x \frac{1}{2}||Ax-b||^2_2 + \frac{\rho}{2}||x-z||^2_F$

+ +

where both $x$ and $b$ are high dimensional, and $b$ is much higher dimensional than $x$. The solution is given by $x^* = (A^T A+\rho I)^{-1}(A^T b + \rho z)$, but the problem is so large that even inverting $A^T A + \rho I$ is infeasible. However, due to structure in the problem we can efficiently multiply by $A$ and $A^T$. Basically this is large scale linear ridge regression. What would be the ideal algorithm for efficiently implementing this minimization? Would something like biconjugate gradient work?

+",2013-10-28 20:23:57.770 +58383,9245.0,2,,58382.0,,,,CC BY-SA 3.0,"

I've found that LSQR is ideal for problems like this - I've used it successfully for operators of about 3e5 * 1e6 or so. Check http://www.stanford.edu/group/SOL/software/lsqr.html for details. I've used Friedlander's (I think) C port and the python port, which I have (hastily and sloppily) ported to R.
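
The LSQR port itself isn't shown here; as a rough stand-in under the same ""matvec only"" assumption, a plain conjugate-gradient sketch on the normal equations $(A^TA+\rho I)x = A^Tb+\rho z$ looks like this (my own illustration; Afun and Atfun are assumed user-supplied closures that apply $A$ and $A^T$):

cg_ridge <- function(Afun, Atfun, b, z, rho, maxit = 500, tol = 1e-8) {
  apply_M <- function(v) Atfun(Afun(v)) + rho * v      # v -> (A'A + rho I) v
  rhs <- Atfun(b) + rho * z
  x <- z                                               # warm start at z
  r <- rhs - apply_M(x)
  p <- r
  rs <- sum(r * r)
  for (i in seq_len(maxit)) {
    Mp <- apply_M(p)
    alpha <- rs / sum(p * Mp)
    x <- x + alpha * p
    r <- r - alpha * Mp
    rs_new <- sum(r * r)
    if (sqrt(rs_new) < tol) break
    p <- r + (rs_new / rs) * p
    rs <- rs_new
  }
  x
}
# toy check against the closed form:
set.seed(1)
A <- matrix(rnorm(200 * 20), 200, 20); b <- rnorm(200); z <- rnorm(20)
x_cg <- cg_ridge(function(v) A %*% v, function(v) crossprod(A, v), b, z, rho = 0.5)
x_ex <- solve(crossprod(A) + 0.5 * diag(20), crossprod(A, b) + 0.5 * z)
max(abs(x_cg - x_ex))   # should be tiny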

+",2013-10-28 20:29:10.780 +58384,15782.0,1,,,,Cointegration but no Granger-Causality found,,CC BY-SA 3.0,"

I have found Cointegration based on Engle/ Granger and Johansen. However, Granger-causality is rejected for both variables. How is that possible?

+ +

According to theory, +if x and y are I(1) and cointegrated, x is Granger causal to y and/or y is Granger causal to x. +However, Granger-causality has been rejected in my bivariate case, despite their cointegration relationship.

+ +

Did I understand it correctly that there has to be at least one granger causality flow in a bivariate cointegrated system?

+ +

Thank you for your answer!

+ +

Applying an VECM, I get the following results: with only the -0.022460 being significant...

+ +
    Vector Error Correction Estimates   
+    Date: 10/28/13   Time: 23:58    
+    Included observations: 1113 after adjustments   
+    Standard errors in ( ) & t-statistics in [ ]    
+
+    Cointegrating Eq:   CointEq1
+
+    CAD(-1)             1.000000
+
+    NATGAS(-1)          0.067366
+                       (0.02646)
+                       [ 2.54615]
+
+     C                  -0.077093
+
+
+Error Correction:   D(CAD)  D(NATGAS)
+
+CointEq1        -0.022460   -0.006601
+ (0.00514)   (0.01384)
+[-4.37213]  [-0.47714]
+
+D(CAD(-1))  -0.054710    0.029241
+  (0.02998)  (0.08073)
+[-1.82508]  [ 0.36220]
+
+D(CAD(-2))   0.035656    0.101838
+ (0.02996)   (0.08070)
+[ 1.18998]  [ 1.26200]
+
+D(NATGAS(-1))   -0.004642   -0.077700
+ (0.01120)   (0.03016)
+[-0.41449]  [-2.57591]
+
+D(NATGAS(-2))    0.004712    0.056858
+ (0.01120)   (0.03016)
+[ 0.42067]  [ 1.88491]
+
+C    0.000176   -0.000850
+ (0.00019)   (0.00051)
+[ 0.92332]  [-1.65571]
+
+ R-squared   0.022437    0.011948
+
+",2013-10-28 20:33:47.817 +58385,2915.0,1,,,,Is there anything special about Gamma distribution with the shape parameter k=e?,,CC BY-SA 3.0,"

Is there any unique property of $\mathrm{Gamma}(k=e, \text{ scale})$ or a Negative binomial distribution with $r=e$? Here, $e$ is Euler's number, $e \approx 2.71828$.

+ +

The reason I'm asking is that one of the variables in my computer simulations can be fitted by $\mathrm{Gamma}(k=e, \text{ scale})$ or by $\mathrm{NB}(r=e,\ p)$ very robustly. That makes me wonder if there is something special about this process and this particular value of the shape parameter might hint to that.

+",2013-10-28 20:58:19.477 +58386,14360.0,1,,,,Why does the log likelihood need to go to minus infinity when the parameter approaches the boundary of the parameter space?,,CC BY-SA 3.0,"

In a recent lecture I was told that, in order for the maximum likelihood estimate to be valid, the log likelihood needs to go to minus infinity as the parameter goes to the boundary of the parameter space. But I don't understand why this is essential. Suppose the log likelihood goes to some kind of asymptote. Then the parameter that maximizes the likelihood is still the maximum likelihood estimate, right?

+",2013-10-28 21:05:46.417 +58387,23111.0,2,,54506.0,,,,CC BY-SA 3.0,"

Yes, logistic regression would work, but also classification trees. I don't think you need to worry about false positives. It seems that the ""confusion matrix"" the model produces will tell you what you are looking for in terms of false positives and false negatives

+",2013-10-28 21:18:20.893 +58388,15293.0,2,,53384.0,,,,CC BY-SA 3.0,"

Here is the general (semi-parametric-bootstrap) algorithm in more detail:

+ +

$\text{B}$ = number of bootstraps

+ +

the model:
+$y = x\beta + \epsilon$

+ +

let $\hat{\epsilon}$ be the residuals

+ +
  1. Run the regression and obtain the estimator(s) $\hat\beta$ and residuals $\hat\epsilon$.
  2. Resample the residuals with replacement and obtain the bootstrapped residual vector $\hat\epsilon_\text{B}$.
  3. Obtain the bootstrapped dependent variable by multiplying the estimator(s) from (1) with the original regressors and adding the bootstrapped residual: $y_\text{B} = x\hat\beta + \hat\epsilon_\text{B}$.
  4. Run the regression with the bootstrapped dependent variables and the original regressors, this gives the bootstrapped estimator, i.e. regress $y_B$ on $x$, this gives $\hat\beta_\text{B}$.
  5. Repeat the procedure $\text{B}$-times by going back to (2).
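
A compact R sketch of this procedure on toy data (my own illustration of the steps above, using the fitted values as $x\hat\beta$):

set.seed(1)
n <- 100
x <- rnorm(n)
y <- 1 + 2 * x + rnorm(n)
fit <- lm(y ~ x)                          # step 1
res <- resid(fit); fitted_y <- fitted(fit)
B <- 1000
beta_boot <- replicate(B, {
  e_b <- sample(res, replace = TRUE)      # step 2: resample residuals
  y_b <- fitted_y + e_b                   # step 3: x %*% beta_hat is just the fitted values
  coef(lm(y_b ~ x))                       # step 4: re-estimate
})
apply(beta_boot, 1, sd)                   # bootstrap standard errors of the coefficients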
+",2013-10-28 21:19:21.240 +58389,23104.0,2,,30957.0,,,,CC BY-SA 3.0,"

On a related note, you can accomplish the same objective if your ARIMA model has external regressors. This has been helpful for me on occasion.

+ +

For instance, say your first model was created as follows:

+ +
fit.arimax <- Arima(response, order=c(1, 0, 1), xreg=xreg)
+
+ +

Then suppose that after creating your model, you observe additional values in your response and external regression variables, and would like to forecast or simulate future outcomes given these new observations. E.g., say you are predicting electricity demand, and you observe another hour of demand (i.e. response) and temperature (i.e. external regression) data.

+ +

Then, you may fit the original model to the updated time series as follows, where response.new and xreg.new are your updated response and regression variables.

+ +
fit.arimax.new <- Arima(response.new, model=fit.arimax, xreg=xreg.new)
+
+ +

You can use this new model to forecast or simulate future outcomes, conditional on all observed data. Note that you must provide forecast external regressors for each. E.g.,

+ +
forecast.Arima(fit.arimax.new, h=length(xreg.forecast), xreg=xreg.forecast)
+
+simulate.Arima(fit.arimax.new, n=length(xreg.forecast), xreg=xreg.forecast)
+
+ +

Another way to accomplish all of this is to make an entirely new model using the updated data. But the method described above is appropriate in real-time applications, in which case fitting a new ARIMA model would take too long.

+",2013-10-28 21:28:33.573 +58390,9554.0,2,,58386.0,,,,CC BY-SA 3.0,"
+

in order for the maximum likelihood estimate to be valid, the log likelihood needs to go to minus infinity as the parameter goes to the boundary

+
+ +

This is equal to saying, the Likelihood of a parameter needs to become 0 at the boundary of the parameter space in order for the result to be valid.

+ +

Well first of all, you can restrict the parameter space to values that all have a positive likelihood and still obtain a valid estimate.

+ +

Secondly, even if you use, say $(-\infty,\infty)$, you don't come close to the boundary since any off the shelf optimisation package performs some sort of random initialisation and then approaches the minimum using some method such as gradient descent, conjugate gradient or another. In either case, you almost never end up approaching the boundary of the parameter space, so I don't quite understand why the boundaries matter in the first place.

+ +

And even if you do that on purpose, at one point you will hit the floating point precision of your operating system. I can guarantee you that at that point, you haven't really approached the boundary $-\infty$ by much. :)

+ +

Personally, I find the underflow problem that arises when calculating sums and products of very small likelihoods, and the log-sum-exp trick used to deal with it, a much more interesting and noteworthy issue that actually matters a lot in practice, unlike reaching the boundaries of the parameter space.
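
A tiny illustration of that trick (my own example, not part of the lecture being discussed):

logsumexp <- function(logx) {
  m <- max(logx)
  m + log(sum(exp(logx - m)))
}
loglik <- c(-1000, -1001, -1002)   # log-likelihoods far below what exp() can represent
log(sum(exp(loglik)))              # -Inf: the naive computation underflows
logsumexp(loglik)                  # about -999.59: the stable version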

+",2013-10-28 21:47:00.457 +58391,23112.0,1,,,,How to detect step changes in GPS time-series data?,,CC BY-SA 3.0,"

The graph below shows GPS heading data (sampled every second) and I am trying to find the best way to detect (right/left) turns in the data. Appreciate suggestions for algorithms/methods for it (perhaps step detection)?

+ +

+ +

While turns typically result in sharp change in heading value, heading might also change gradually due to road curvature which should not be detected.

+",2013-10-28 22:02:30.007 +58392,12358.0,2,,58252.0,,,,CC BY-SA 3.0,"

I suspect that it is just that the authors/editors got confused because the physics problem is usually described in terms of energies, which map to (play a role analogous to) negative log-likelihoods. Note that in the final sentence they refer to ""low-energy"".

+ +

In the physics literature the Ising model is canonically defined as
$$
H(Y) = - \sum_{<ij>} J_{ij} y_i y_j
$$
where the sum is over all pairs of interacting sites and $Y$ is my notation for the entire state of the sites (i.e. a given $Y$ specifies $y_i$ for all $i$). Note the minus sign. One might be inclined to write $H(Y)=\vec{y}^T W \vec{y}$.

+ +

The partition function is given by $Z=\sum_{Y} e^{-\beta H(Y)}$; yet another minus sign.

+ +

The probability that you'll see the physical system in the state $Y$ is given by
$$
P(Y) = \frac{ e^{-\beta H(Y)}}{Z} = \frac{ e^{\beta \sum_{<ij>} J_{ij} y_i y_j} } {Z} = \frac{e^{-\beta \vec{y}^T W \vec{y}}}{Z}
$$

+ +

I left $\beta=1/k_BT$ in these expressions since these are the expressions typically written by physicists, but in this problem, one can just absorb it into the definition of the $J_{ij}$ (or equivalently set $\beta=1$).

+ +

The main point is that when dealing with statistical mechanics physicists are use to dealing with ""energies"" which have the property that lower (more negative) energy states are more likely to be occupied. This is in the opposite sense from log-likelihoods (higher log-likelihoods are associated with more probable outcomes), and can lead to confusion when one tries to switch back and forth between these two conceptions.

+",2013-10-28 22:22:22.797 +58393,14110.0,1,,,,"How to estimate a pdf of x under the model of y = x+n, when the pdf of y and the pdf of n are given",,CC BY-SA 3.0,"

I suspect this is a classic question, but so far I have failed to find any useful solutions. My question is about the following model +$$y=x+n$$ +where $x$ is a hidden random variable that cannot be observed, $n$ is white noise, i.e. $n$ follows a known Gaussian distribution with variance $\sigma^2$ ($f_n= {\cal{N}}(0,\sigma^2)$), and $y$ is a random variable that we can observe.

+ +

Suppose we know the distribution of $y$ as $f_y$, and I wonder how to find the pdf of $x$. Theoretically, it seems to be equivalent to the sum of two dependent random variables, i.e. $x = y-n$, but in this question how $y$ and $n$ are correlated is unknown. Is there any existing solution?

+ +

Thanks

+",2013-10-28 22:49:35.873 +58543,1506.0,2,,58542.0,,,,CC BY-SA 3.0,"

It is certainly possible and does happen quite frequently, especially if there are many pairwise comparisons (which is likely the case if you're investigating an interaction term).

+ +

The Tukey procedure controls the Type I error rate and requires a larger difference to declare significance than if no adjustment were used. The ANOVA F-test uses the MSE in the denominator, which borrows information from all the data and is not affected by this adjustment.
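One way to see how this can play out is a small simulation sketch in R (the group setup below is arbitrary): compare the overall F-test p-value with Tukey's adjusted pairwise p-values.

set.seed(7)
g <- factor(rep(1:6, each = 8))
y <- rnorm(48, mean = rep(c(0, 0, 0, 0, 0, 0.9), each = 8))  # one group mildly shifted
fit <- aov(y ~ g)
summary(fit)    # overall F-test p-value
TukeyHSD(fit)   # adjusted pairwise p-values; these can all exceed 0.05 even when the F-test rejects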

+",2013-10-30 20:52:06.820 +58394,594.0,2,,53404.0,,,,CC BY-SA 3.0,"

F tests are most commonly used for two purposes:

+ +
  1. in ANOVA, for testing equality of means (and various similar analyses); and

  2. in testing equality of variances
+ +

Let's consider each in turn:

+ +

1) F tests in ANOVA (and similarly, the usual kinds of chi-square tests for count data) are constructed so that the more the data are consistent with the alternative hypothesis, the larger the test statistic tends to be, while arrangements of sample data that look most consistent with the null correspond to the smallest values of the test statistic.

+ +

Consider three samples (of size 10, with equal sample variance), and arrange them to have equal sample means, and then move their means around in different patterns. As the variation in the sample means increases from zero, the F statistic becomes larger:

+ +

+ +

The black lines ($^{\:_|}$) are the data values. The heavy red lines ($\color{red}{\mathbf{|}}$) are the group means.

+ +

If the null hypothesis (equality of population means) were true, you'd expect some variation in sample means, and would typically expect to see F ratios roughly around 1. Smaller F statistics result from samples that are closer together than you'd typically expect ... so you aren't going to conclude the population means differ.

+ +

That is, for ANOVA, you'll reject the hypothesis of equality of means when you get unusually large F-values and you won't reject the hypothesis of equality of means when you get unusually small values (it may indicate something, but not that the population means differ).
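A small simulation sketch in R (arbitrary setup) makes this concrete: three equal-spread samples are shifted apart by increasing amounts, and the one-way ANOVA F statistic grows with the spread of the group means.

set.seed(42)
base <- rnorm(10)                        # one set of deviations reused for all three groups
for (d in c(0, 0.5, 1, 2)) {
  y <- c(base, base + d, base + 2 * d)   # group means 0, d and 2d apart
  g <- factor(rep(1:3, each = 10))
  Fstat <- anova(lm(y ~ g))[1, "F value"]
  cat(sprintf("spread d = %.1f  ->  F = %7.2f\n", d, Fstat))
}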

+ +

Here's an illustration that might help you see that we only want to reject when F is in its upper tail:

+ +

+ +

2) F tests for equality of variance* (based on variance ratios). Here, the ratio of two sample variance estimates will be large if the numerator sample variance is much larger than the variance in the denominator, and the ratio will be small if the denominator sample variance is much larger than variance in the numerator.

+ +

That is, for testing whether the ratio of population variances differs from 1, you'll want to reject the null for both large and small values of F.

+ +

* (Leaving aside the issue of the high sensitivity to the distributional assumption of this test (there are better alternatives) and also the issue that if you're interested in suitability of ANOVA equal-variance assumptions, your best strategy probably isn't a formal test.)

+",2013-10-28 23:09:43.140 +58395,18690.0,1,58400.0,,,Stationarity in OLS time series and asymptotic properties,,CC BY-SA 3.0,"

I think I lack a somewhat deeper understanding of this topic, but I thought stationarity was required in order for OLS to have its asymptotic properties.

+ +

""But stationarity is not at all critical for OLS to have its standard asymptotic properties""(Wooldridge, 2012)

+ +

I thought stationarity is needed or otherwise OLS would not be consistent, but I guess I'm wrong. Could someone tell me why stationarity is not critical for LLN?

+ +

Thanks in advance!

+",2013-10-28 23:09:59.747 +58396,19120.0,2,,58372.0,,,,CC BY-SA 3.0,"

Training is done by EM, repeating the E-step and M-step until convergence.

+ +
  1. E-step: calculate sufficient statistics using the posterior over the hidden variables given the observed variables.
  2. M-step: update the parameters using the sufficient statistics computed in the E-step.
+ +

For example, see this paper, which shows that learning the parameters of an HMM can also be cast as belief propagation: http://homepages.inf.ed.ac.uk/csutton/notes/sutton04fbbp.pdf

+",2013-10-28 23:16:22.427 +58397,3894.0,2,,58372.0,,,,CC BY-SA 3.0,"

I'm a little confused by your question. Tom Minka's tutorial you are referring to is completely devoted to inference in graphical models. Learning the parameters of a graphical model is an inference problem, and, therefore, methods explained in the tutorial such as expectation propagation or variational inference can be applied to it. In fact, all of the examples in the tutorial show how to learn parameters in various graphical models.

+ +

The EM algorithm can be applied to this problem as well. It should be noted, however, that the EM algorithm delivers point estimates for the parameters of interest, and, therefore, can be inferior to the approximate Bayesian inference methods discussed in the tutorial, since these methods aim to capture all the uncertainty present in the posterior distribution over parameters.

+",2013-10-28 23:27:37.970 +58398,,1,,,user30490,How to plot spectra of an AR(2) process,,CC BY-SA 3.0,"

I am struggling with this problem and was hoping to find some guidance to answer it.

+ +

Let $y_t=\phi_1y_{t-1}+\phi_2y_{t-2}+\epsilon_t$, with $\epsilon_t\sim N(0,1)$. Now, I want to plot the spectra of $y_t$ in the following cases:

+ +

Case 1: When the AR(2) characteristic polynomial has two real reciprocal roots given by $r_1=0.9$ and $r_2=-0.95.$

+ +

Case 2: When the AR(2) characteristic polynomial has a pair of complex reciprocal roots with modulus $r=0.95$ and frequency $2\pi/8$.

+ +

Now, before plotting the spectra of $y_t$ in the following cases, I have tried to make use of the following important facts. The AR(2) process $y_t=\phi_1y_{t-1}+\phi_2y_{t-2}+\epsilon_t$ has the general linear process form $\psi(u)=1/(1-\phi_1u-\phi_2u^2)$ and hence +$$f(\omega)=\frac{v}{2\pi}|(1-\phi_1e^{-i\omega}-\phi_2e^{-2i\omega})|^{-2}$$ +This can be expanded to give +$$f(\omega)=\frac{v}{2\pi[1+\phi^2_1+2\phi_2+\phi_2^2+2(\phi_1\phi_2-\phi_1)\cos(\omega)-4\phi_2\cos^2(\omega)]}$$ +Now if the roots are real, then $f(\omega)$ has a mode at either zero or $\pi$; otherwise, the roots are complex conjugates and $f(\omega)$ is unimodal at $\omega=\arccos[-\phi_1(1-\phi_2)/4\phi_2]$ lying strictly between zero and $\pi$.

+ +

So if anyone could help explain to me how I am supposed to relate the above facts to the two different cases, that would be very helpful. I guess what I am struggling with is what values to plug into $f(\omega)$.
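A minimal sketch in R of one way to do this, assuming the usual reciprocal-root parametrisation (real reciprocal roots give $\phi_1 = r_1 + r_2$ and $\phi_2 = -r_1 r_2$; complex reciprocal roots of modulus $r$ and frequency $\omega_0$ give $\phi_1 = 2r\cos\omega_0$ and $\phi_2 = -r^2$) and taking $v=1$:

spec_ar2 <- function(phi1, phi2, omega) {
  z <- exp(-1i * omega)
  (1 / (2 * pi)) / Mod(1 - phi1 * z - phi2 * z^2)^2   # f(omega) with v = 1
}

omega <- seq(0, pi, length.out = 500)

# Case 1: real reciprocal roots r1 = 0.9, r2 = -0.95
f1 <- spec_ar2(0.9 + (-0.95), -(0.9 * (-0.95)), omega)

# Case 2: complex reciprocal roots, modulus 0.95, frequency 2*pi/8
f2 <- spec_ar2(2 * 0.95 * cos(2 * pi / 8), -(0.95^2), omega)

par(mfrow = c(1, 2))
plot(omega, f1, type = "l", main = "Case 1: real roots")
plot(omega, f2, type = "l", main = "Case 2: complex roots")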

+",2013-10-28 23:53:40.903 +58399,23115.0,1,,,,Bayesian and frequentist interpretations vs approaches,,CC BY-SA 3.0,"

I have been reading about the frequentist vs bayesian issue (this article has helped a lot, especially with the example; also this one), and I haven't come to terms with it. At the moment it seems like there are the frequentist and bayesian interpretations of probability; and, separately, the frequentist and bayesian approaches to problems. The former is about the belief vs frequency issue (illustrated in the second article). The latter is illustrated in the first article. Both put together seem to me like this:

+ +
  • The frequentist interpretation of the frequentist approach ensures that we are right a% of the time over a large number of trials assuming only the likelihood distribution, no matter which parameter we get, as long as we assume that we'll get a good range of data.
  • The frequentist interpretation of the bayesian approach ensures that we are right a% of the time over a large number of trials assuming the likelihood distribution and the prior, no matter which data we get, as long as we assume that we'll get a good range of parameters.
  • The bayesian interpretation of the frequentist approach says that we are right with a probability of a% assuming only the likelihood distribution, no matter which parameter we get, as long as we assume fairness in the randomness of the data.
  • The bayesian interpretation of the bayesian approach says that we are right with a probability of a% assuming the likelihood distribution and the prior, no matter which data we get, as long as we assume fairness in the randomness of the parameters.
+ +

This is the only consistent view that I have been able to form from what I've read. However, I still think I may be missing something (as I actually haven't found this view stated like this anywhere else; it's my own conclusion), so, taking the null hypothesis that I'm wrong, where's my mistake?

+",2013-10-29 00:06:34.723 +58448,21029.0,2,,58443.0,,,,CC BY-SA 3.0,"

It makes sense to take a large random sample of N cases (10,000) to estimate the real distribution (the full 1 million). The area under the curve will be an approximation, but a very good one as N increases.

+ +

If this is something that needs to be done frequently, you can try the ROC calculation with increasingly large sample sizes to find an optimally large sub-set. Optimal here would mean that the loss of information is acceptable. Be warned that a random sample needs to still be representative of the full dataset (whatever that means for your study).

+ +

I can't cite any literature off-hand, but I know this type of sampling is used often in practice for different reasons. I for one often use sampling to reduce a large data-set (1-2 million) to something more easily handled (~5-10K) before starting on a data analysis.

+",2013-10-29 18:19:04.047 +58400,20473.0,2,,58395.0,,,,CC BY-SA 3.0,"

(Stationarity of what? What kind/level of stationarity?)

+ +

Given the standard linear regression specification (without any specific stochastic assumptions)

+ +

$$\mathbf y = \mathbf X\beta +\mathbf u $$ +as a matter of mathematics we have +$$\hat\beta_{OLS} = \left(\mathbf X'\mathbf X\right)^{-1}\mathbf X'\mathbf y=\beta + \left(\mathbf X'\mathbf X\right)^{-1}\mathbf X'\mathbf u$$

+ +

For consistency of $\hat\beta_{OLS}$ we need (as sample size $n$ goes to infinity) +$$\operatorname {plim}\left [\left(\mathbf X'\mathbf X\right)^{-1}\mathbf X'\mathbf u \right ]= \mathbf 0 \Rightarrow \left(\operatorname {plim}\frac 1n\mathbf X'\mathbf X\right)^{-1} \operatorname {plim}\left (\frac 1n\mathbf X'\mathbf u \right )= \mathbf 0$$

+ +

This requires

+ +

a) that $\left(\operatorname {plim}\frac 1n\mathbf X'\mathbf X\right)^{-1} < \infty$, and that it converges to a positive definite matrix, which is a condition usually just assumed, and it will be satisfied if the ""Grenander conditions"" are satisfied (in short, as $n\rightarrow \infty$, no regressor degenerates to a sequence of zeros, no single observation dominates the sum of squares of its series, and the regressor matrix always has full rank). These conditions exclude some kinds of non-stationarity of the regressors, but they do not require covariance-stationarity (which is the one usually meant under the term ""stationarity"").

+ +

b) that $$\operatorname {plim}\left (\frac 1n\mathbf X'\mathbf u \right )= \mathbf 0\Rightarrow \left [\begin{matrix} \operatorname {plim}\frac 1n\sum_{i=1}^nx_{1i}u_i \\ ...\\ \operatorname {plim}\frac 1n\sum_{i=1}^nx_{ki}u_i \end{matrix}\right ] =\mathbf 0 $$

+ +

Now Markov's Law of Large Numbers, in order to hold requires that +$$\frac 1{n^2}\operatorname {Var}\left(\sum_{i=1}^nx_{1i}u_i\right)\rightarrow 0,\; \text {as}\; n\rightarrow \infty$$

+ +

Here too, this condition excludes some kinds of non-stationarity, but it does not require covariance stationarity (for example, both the mean and the variance of each $x_{ji}$ and each $u_i$ may be different -we only need that the variance of the sum is of smaller order than $n^2$). +If this condition holds then the Law of Large Numbers applies and we have (abusing notation a bit)

+ +

$$\operatorname {plim}\left (\frac 1n\mathbf X'\mathbf u \right )= \left [\begin{matrix} \operatorname {lim}\frac 1n\sum_{i=1}^nE(x_{1i}u_i) \\ ...\\ \operatorname {lim}\frac 1n\sum_{i=1}^nE(x_{ki}u_i) \end{matrix}\right ]$$

+ +

For this to be equal to the zero-vector we need that each regressor is contemporaneously uncorrelated with the error term, $E(x_{ji}u_i)=0,\; \forall j,i$. This is a condition related to stochastic dependence/independence, and has nothing to do with stationarity.

+ +

For asymptotic normality of $\hat\beta_{OLS}$ we examine

+ +

$$\operatorname {plim}\sqrt n(\hat\beta_{OLS} -\beta)= \operatorname {plim}\left[\sqrt n\left(\frac 1n \mathbf X'\mathbf X\right)^{-1}\frac 1n\mathbf X'\mathbf u\right] = \left(\operatorname {plim}\frac 1n\mathbf X'\mathbf X\right)^{-1} \operatorname {plim}\left (\frac 1{\sqrt n}\mathbf X'\mathbf u \right )$$

+ +

The first plim was discussed previously. For the Lindeberg-Feller Central Limit Theorem to hold for the second plim, what is required is
+a) that each regressor series is comprised of independent r.v.'s,
+b) that the errors are independent from each other,
+(both these can be relaxed)
+c) that the expected values and the variances of the rv's involved are finite, but not necessarily equal
+d) and finally that ""no term dominates the whole"", which is expressed as a condition on the relative magnitude of the variances involved.

+ +

So again, some forms of non-stationarity are excluded, but covariance-stationarity is not needed.

+",2013-10-29 01:11:19.870 +58401,22544.0,1,58422.0,,,What model should one use for this short time series?,,CC BY-SA 3.0,"

Below I have quarterly total sales on the left (dependent variable), and a sample of the sales on the right. The two variables share a correlation of 98.7%. What model should I use to predict X? For that model, should I include a constant? Seasonal adjustments? Remove outliers? The most important criterion is minimizing out-of-sample prediction error.

+ +
Q3'10   40.19   0.2386
+Q4'10   39.36   0.2000
+Q1'11   51.25   0.2173
+Q2'11   54.99   0.2630
+Q3'11   50.38   0.2242
+Q4'11   50.77   0.2623
+Q1'12   67.39   0.3548
+Q2'12   77.14   0.3716
+Q3'12   72.54   0.3451
+Q4'12   80.21   0.3816
+Q1'13   94.57   0.4661
+Q2'13   102.13  0.4919
+Q3'13E  X       0.4424
+
+",2013-10-29 01:17:39.457 +58402,1359.0,1,,,,Incremental SVD in Collaborative Filtering,,CC BY-SA 3.0,"

In the so-called incremental SVD used for collaborative filtering:

+ +

http://www.machinelearning.org/proceedings/icml2007/papers/407.pdf

+ +

http://www2.research.att.com/~volinsky/papers/ieeecomputer.pdf

+ +

http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/

+ +

The user x item matrix R is factored as QP using gradient descent. In the classical SVD there is the diagonal matrix S which holds the singular values. What happens(ed) to that matrix in this formulation? Is it just omitted and they still call it SVD or is it implicitly part of Q and/or P?

+",2013-10-29 01:21:37.867 +58403,23087.0,2,,58401.0,,,,CC BY-SA 3.0,"

I would consider using Gaussian Process regression. Carl Rasmussen's excellent book and associated Matlab software are freely available here: http://www.gaussianprocess.org/gpml/. You would probably want to use a mixture of a periodic kernel for the seasonal effect plus a linear kernel since there looks to be a roughly linear growth. If you want to use the ""sample of sales"" variable as a covariate that is possible too.

+",2013-10-29 01:27:27.993 +58404,23087.0,2,,52099.0,,,,CC BY-SA 3.0,"

As a general rule you should work with log probabilities rather than probabilities themselves, since multiplying small doubles is very imprecise. Multiplication then becomes addition, which is much more accurate. If you need to sum probabilities you need a stable ""logsumexp"" function, see [http://machineintelligence.tumblr.com/post/4998477107/the-log-sum-exp-trick] for example. SlowlyFailing is absolutely correct about using lfactorial (i.e. lgamma really) rather than factorial.
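For instance, a minimal log-sum-exp sketch in R, which computes log(sum(exp(lp))) stably by factoring out the largest term (the example values are arbitrary):

log_sum_exp <- function(lp) {
  m <- max(lp)
  m + log(sum(exp(lp - m)))   # log(sum(exp(lp))), computed stably
}

lp <- rep(-921, 1000)   # log probabilities; exp(-921) underflows to 0 in double precision
log(sum(exp(lp)))       # -Inf: the naive sum underflows
log_sum_exp(lp)         # about -914.09, i.e. -921 + log(1000)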

+",2013-10-29 01:34:57.827 +58405,22262.0,1,58412.0,,,What if Lasso selects transformed terms but not untransformed terms,,CC BY-SA 3.0,"

Suppose I have standard normal features $X_i \in \{X_i : i \in \{1,...,1000\}\}$. I extend this set of predictors with transformations as follows: $\{X_i,X_i^2,X_iI(X_i > 0) : i \in \{1,...,1000\}\}$.

+ +

What happens if the Lasso picks $X_i^2$ or $X_iI(X_i > 0)$ but not $X_i$ itself? What do I do? Is this even a problem?

+",2013-10-29 02:43:02.493 +58406,594.0,2,,44635.0,,,,CC BY-SA 4.0,"

As with many such situations, one must take care to avoid confusing sample and population quantities. (Given some particular distributional assumptions, we might choose to test for symmetry about a population mean using a statistic based on sample medians for example.)

+

We should also keep in mind that failure to reject a null of symmetry is not the same as showing symmetry.

+

Let's begin by simplifying things by assuming continuity.

+

First, what is meant by symmetry of a distribution? While it's usually conceived in the elementary treatments in terms of the density - i.e. as $f(\theta+x)=f(\theta-x)$, when we say 'that the distribution is symmetric', I often tend to conceive it in terms of the distribution function (though the distinction won't matter, generally).

+

Note that symmetry around the population mean implies symmetry about the population median, so we needn't distinguish them - if the mean exists, the two will be the same.

+

There are two cases to distinguish:

+
  1. testing for symmetry about a specified location, and

  2. testing for symmetry about an unspecified location
+

Let's consider each in turn

+
    +
  1. One example of a way to test for symmetry about a specified mean $\theta_0$ is to create a second sample, $Y=2\theta_0-X$, and compute a test statistic that measures the discrepancy between the distributions of X and Y (such as a two-sample Kolmogorov-Smirnov statistic).
+

[I'm not certain the distribution of the test statistic under the null is still the same as for the KS test $-$ and I'm not going to try to work it out right now $-$ but the distribution could easily be simulated for this circumstance, so it's not a huge issue.]

+

Note further that testing for symmetry about a known location may be reduced to testing for symmetry about 0 simply by subtracting the given location from all the observations. The test mentioned above would then be a test for symmetry about 0.

+

There are many other tests that could be used in this situation, such as a sign test (if the distribution is not symmetric about 0, there will typically tend to be an excess or deficit of positive signs, though counterexamples are certainly possible), or the signed rank test mentioned before. (They all act as a test of symmetry about the specified population mean)

+
+
  2. Some tests for symmetry about an unknown center. There are many of these; I'll mention just a few.
+

i) The triples test of Randles et al (1980)

+

This test is (IMO) intuitively appealing. It looks at sets of three observations, checking whether in each case the triple has the middle observation closer to the smaller (suggesting right skew) or larger (suggesting left skew) observation (the right-skew case gets a score of 1/3, the left-skew case gets -1/3, and anything else scores 0). Then the test statistic, $R$, is the average of the scores over all possible triples.
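A naive sketch of that scoring in R (illustration only; a formal test would also need the variance estimate mentioned below):

triples_stat <- function(x) {
  trip <- combn(x, 3)                          # all triples, one per column
  score <- apply(trip, 2, function(t)
    (sign(t[1] + t[2] - 2 * t[3]) +
     sign(t[1] + t[3] - 2 * t[2]) +
     sign(t[2] + t[3] - 2 * t[1])) / 3)        # +1/3 right skew, -1/3 left skew, 0 otherwise
  mean(score)
}

set.seed(1)
triples_stat(rnorm(50))   # roughly symmetric sample: close to 0
triples_stat(rexp(50))    # right-skewed sample: clearly positive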

+

(This test is not distribution free, but with a consistent estimator of the variance of $R/\sqrt n$ it is asymptotically distribution free.)

+

Randles, Fligner, Policello and Wolfe (1980)
+An Asymptotically Distribution-Free Test for Symmetry Versus Asymmetry
+Journal of the American Statistical Association
+Vol. 75, No. 369, Mar., pp. 168-172

+

ii) Gastwirth's (1971) modified sign test. Gastwirth considered a sign test about the sample mean. It's no longer distribution-free, but again, with a consistent estimator of the variance of an appropriately scaled statistic, it is asymptotically so. However, note that this test would have essentially no power against asymmetric distributions with $P(X>\mu) = 1/2$

+

Gastwirth, J.L. (1971)
+On the Sign Test for Symmetry.
+Journal of the American Statistical Association, 66, 821-828.

+

iii) Hotelling and Solomons' test (1932) of the Pearson skewness (scaled mean-median). Gastwirth's 1971 paper (mentioned above) gives an expression for the asymptotic variance of a suitably normalized statistic, and this, too, is thereby asymptotically distribution free.

+

Hotelling, H. and L. M. Solomons (1932)
+The Limits of a Measure of Skewness
+Ann. Math. Statist. Vol 3, No. 2, 141-142.

+

On this test, also see here

+
+

Note that Gastwirth's test in (ii) is quite similar to the test you propose, with only the substitution of the sign test for the signed rank test. Your test would also not be distribution-free, but you should probably be able to find a consistent estimator of the variance of your statistic (appropriately standardized), and thus get an asymptotically distribution free test. (Alternatively, you might be able to come up with a bootstrap test based off such a statistic.)

+
+

A review of tests of symmetry can be found here. Also see this tech report

+",2013-10-29 03:26:42.400 +58407,20752.0,2,,58405.0,,,,CC BY-SA 3.0,"

It's not a problem strictly speaking, but you should pay attention to whether that interpretation makes sense in the context of your problem. If the squared value is what's important to your dependent variable, then this is what you'd expect to happen because it would have more explanatory power than the raw variable.

+ +

For example, if you're a sports fan, you could use a player's age to predict their performance. If you mean-shift that to something like 27, then the age variable can be negative if it is below 27 and positive if it's greater than 27. Well, players' performance tends to have an arc that peaks at age 27, so if you put the raw age into the regression, chances are it would show as not significant. If you put age^2 in, though, it would. And likewise, if you gave the Lasso the choice between the two, it's a no-brainer.
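A small sketch of that scenario in R, assuming the glmnet package is available (the data-generating numbers are made up):

library(glmnet)

set.seed(1)
n    <- 500
age  <- runif(n, 20, 35)
perf <- 50 - 0.8 * (age - 27)^2 + rnorm(n, sd = 2)   # performance arcs, peaking at age 27

X   <- cbind(age = age, age_c_sq = (age - 27)^2)     # raw age vs. centred-and-squared age
fit <- cv.glmnet(X, perf)                            # lasso with cross-validated lambda
coef(fit, s = "lambda.1se")   # the squared term tends to be kept while raw age is shrunk towards 0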

+",2013-10-29 03:58:53.220 +58435,13740.0,1,58890.0,,,Classifier for uncertain class labels,,CC BY-SA 3.0,"

Let's say I have a set of instances with class labels associated. It does not matter how these instances were labelled, but how certain their class membership is. Each instance belongs to exactly one class. Let's say I can quantify the certainty of each class membership with a nominal attribute that goes from 1 to 3 (very certain to uncertain, respectively).

+ +

Is there some sort of classifier that takes into consideration such a certainty measure and if yes, is it available in the WEKA toolkit?

+ +

I imagine this situation occurs quite often, for example when instances are classified by human beings who are not always completely sure. In my case, I have to classify images, and sometimes an image could belong to more than one class. If this happens, I give the class a high uncertainty, but still classify it with only one class.

+ +

Or are there any other approaches to this problem, without a specialized classifier? E.g. only taking ""certain"" classifications for training? I fear that in this case, there will be more misclassifications because ""border"" cases are not covered.

+",2013-10-29 14:38:10.610 +58408,22923.0,1,58428.0,,,Does zero correlation between 2 differenced series implies no cointegration between original series?,,CC BY-SA 3.0,"

The question is related to this one.

+ +

In this question @mpiktas gives an answer on why checking correlation is not enough but the answer doesn't seem completely correct to me for the following reason:

+ +

If 2 time-series are cointegrated, i.e. there is a linear relation between them $$y_t = a + b x_t + \varepsilon_t$$ with stationary $\varepsilon_t$ it implies a linear relation between their differences $$\Delta y_t = b \Delta x_t + \Delta \varepsilon_t.$$ So if series are cointegrated, their differences should be correlated. And this means that if we don't see significant correlation between differences in our data, then there is no cointegration either. Is this correct or am I missing something?

+ +

The question arises because I look for relations between hundreds of time-series (mainly non-stationary) and the way I do this is by considering correlations between their differenced counterparts. And I assume that if I don't see correlation between differenced series there is no cointegration either - it suffices to check only correlations.

+",2013-10-29 04:16:50.607 +58409,22262.0,1,,,,When would I choose Lasso over Elastic Net,,CC BY-SA 3.0,"

What are the scenarios where Lasso is likely to perform better than Elastic Net (out of sample prediction)?

+",2013-10-29 05:22:25.660 +58410,23121.0,1,58411.0,,,Simple linear regression - understanding given,,CC BY-SA 3.0,"

The question is to fill out the missing numbers (A-L) of a simple linear regression model. +I am having problems with converting and interpreting the given table in terms of variables. Would it be possible for someone to confirm and clarify things for me.

+ +

The first table represents regression statistics

+ +

+ +

True model +$$ +Y_t = \beta_o + \beta_1X_t + \mu_t +$$

+ +

Estimated model +$$ +\hat Y_t = \hat\beta_0 + \hat\beta_1x_t +$$

+ +

This is what I am confused about

+ +
  • Does the first standard error (12.8478) mean $\sum\hat\mu_t^2$?
  • Does the standard error for the intercept in the last table (14.6208) mean $\sum\mu_t^2$?
  • Does 3.8508 equal $\hat\beta_1$?
  • In order to calculate RSS (for J) I need $\sum \hat\mu_t^2$; does this confirm that my first two points are incorrect?
  • I know $G=\hat\beta_1^2\sum x_t^2$; how do I find $\sum x_t^2$?
+ +

If I am wrong, would it be possible to know what those numbers mean in terms of variables

+",2013-10-29 05:47:54.477 +58411,22419.0,2,,58410.0,,,,CC BY-SA 3.0,"
  1. The standard error here refers to the standard error of the model as a whole, and it is the standard deviation divided by the square root of the sample size.

  2. Here the standard error refers to the individual standard error for the intercept. The formula is the same as that in the first point.

     (To get the answers for K and L, use this: T stat = Coefficient / Std. Error)

  3. Yes, it's the estimated coefficient of X variable 1.
+

For the rest of the confusion, watch this youtube video!

+

http://www.youtube.com/watch?v=zwr0bs8znEE

+

EDIT:

+

I used some identities to figure out G, H, I and J, so I can't guarantee this is what your lecturer wants.

+

Calculating D, E, F:

+

These are the degrees of freedom (some formulas here): D -> 1, E -> 13, F -> 14.

+

Calculating H, G:

+
  1. F statistic (=24.15) = I / J.
  2. I is G/D; similarly J is H/E.
  3. You know D and E, so just some math will get you the values for G and H.
+

This is a really weird approach, but I don't think you can get the sum of squares for regression and residual without the actual observations.

+",2013-10-29 06:19:06.987 +58412,16992.0,2,,58405.0,,,,CC BY-SA 4.0,"

You're right: in general, people don't like to put interactions into a model before putting in the primary effects. There is a recent paper that solves this problem for the lasso: "A lasso for hierarchical interactions" by Jacob Bien, Jonathan Taylor, and Robert Tibshirani. Their solution is implemented in the R package hierNet. Hope this helps!

+",2013-10-29 07:17:44.107 +58413,21434.0,1,58426.0,,,Which statistical method to use for finding systematic patterns in data,,CC BY-SA 3.0,"

As part of a broader study I am analysing 30 websites that fall into 3 categories:

+ +
  • Consumer (10 sites)
  • Commercial (10 sites)
  • Health (10 sites)
+ +

The approach I used was a 'tick and flick' spreadsheet with 24 dichotomous variables that represent features of the website that are either present or absent (i.e. a site receives a tick if it exhibits that particular feature).

+ +

Here is an example of the data.

+ +

The numbers represent how many websites from each category contain each particular feature (variable).

+ +

I want to know which kind of statistical test would be used to find if there are any systematic patterns about which 'Category' of website tends to correlate with particular variables. For example, which websites tend to share power with users to edit/contribute web content (measured by variables 2,3,4,5,6,7,8,13,16,19,23,24)?

+ +

I would rather use a more robust/rigorous statistical approach than simply counting up totals, or 'eye-balling' patterns in the data.

+ +

Thank you in advance.

+",2013-10-29 07:54:14.847 +58414,20470.0,2,,45457.0,,,,CC BY-SA 3.0,"

You need to use the Baum-Welch algorithm to learn the transition, emission and prior probabilities from your data.

+ +

If you are using the HiddenMarkov CRAN package, you can achieve this by using the BaumWelch() function.

+",2013-10-29 08:53:43.243 +58415,22804.0,1,58431.0,,,beta distributions assigned to represent uncertainty,,CC BY-SA 3.0,"

I need to calculate the Probabilistic Sensitivity Analysis for a function. +I was given this:

+ +

beta distributions assigned to represent uncertainty
+And have this parameter with this data:

+ +
variable = d_progress
+probability of variable : 0.1
+n = 100
+r = cases = 3
+
+ +

Now, I need to use this function or something?

+ +
p <- rbeta(n, shape1=alpha, shape2=beta)
+
+ +

I already have the parameters so I can find p. +If I have:

+ +
p<-rbeta(100, 1, 99) 
+
+ +

What I do is I create a vector now? Of 100 values? +So, I need to create a loop, 100 times for the following formula:

+ +
for each of 100 p generated (
+  EV = p*b_par
+  I then submit each EV in a vector or something...
+  I need to Find the different values of EV for different p,
+    so I just find the mean of all the 100 EV in the end
+)
+
+",2013-10-29 08:57:10.730 +58416,23126.0,2,,10008.0,,,,CC BY-SA 3.0,"

There are various processes in nature that involve only an interaction effect, and laws that describe them. For instance Ohm's law. In psychology you have, for instance, the performance model of Vroom (1964): Performance = Ability x Motivation. Now, you might expect to find a significant interaction effect when this law is true. Regretfully, this is not the case. You might easily end up finding two main effects and an insignificant interaction effect (for a demonstration and further explanation see Landsheer, van den Wittenboer and Maassen (2006), Social Science Research 35, 274-294). The linear model is not very well suited for detecting interaction effects; Ohm might never have found his law had he used linear models.

+ +

As a result, interpreting interaction effects in linear models is difficult. If you have a theory that predicts an interaction effect, you should include it even when insignificant. You may want to ignore main effects if your theory excludes those, but you will find that difficult, as significant main effects are often found in the case of a true data generating mechanism that has only a multiplicative effect.

+ +

My answer is: Yes, it can be valid to include a two-way interaction in a model without including the main effects. Linear models are excellent tools to approximate the outcomes of a large variety of data generating mechanisms, but their formulas cannot be easily interpreted as a valid description of the data generating mechanism.
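A small simulation sketch in R (the numbers are arbitrary, purely for illustration): even when the data are generated with only a multiplicative effect, the fitted linear model readily shows significant-looking main effects.

set.seed(123)
n <- 200
ability     <- runif(n, 1, 5)
motivation  <- runif(n, 1, 5)
performance <- ability * motivation + rnorm(n, sd = 2)   # purely multiplicative mechanism

summary(lm(performance ~ ability * motivation))
# Both main effects typically show up as significant even though the generating
# mechanism contains no additive terms at all.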

+",2013-10-29 09:19:13.857 +58417,15293.0,2,,42885.0,,,,CC BY-SA 3.0,"

Firstly, your 2SLS-estimator is wrong. Check wikipedia: +http://en.wikipedia.org/wiki/Instrumental_variable

+ +

Let n be the number of observations. Let Z be the instrument matrix and X be endogenous regressor. You say you have 2 instruments and 1 endogenous regressor, hence Z is n by 2 and X is n by 1.

+ +

Projection matrix $Pz(=Z(Z'Z)^{-1}Z')$ is n by n, hence the 2SLS-estimator $\beta_{2sls} = (X'PzX)^{-1}X'Pzy$ works like a charm. Hence there is probably something wrong with your code/data.

+ +

When you get stuck at these things go back to matrix form.

+",2013-10-29 09:57:19.897 +58418,23127.0,1,,,,Multiple regression and hypothesis test $H_0$:$\beta_2=0$,,CC BY-SA 3.0,"

Multiple regression model
+$H_0$:$\beta_2=0$, $H_1$:$\beta_2 \neq 0$
+where $\beta_2$ is the vector of elements ($\beta_2, \beta_3, \dots, \beta_k$) and the $\beta$'s are the slope coefficients of the regression.

+ +

Why is it equivalent to a test based on the statistic $$\frac{R^2/(k-1)}{(1-R^2)/(T-k)}$$ where $R^2$ is the square of the multiple correlation coefficient of the equation?

+ +

I don't know how to solve this. Please give me any suggestion or hint.

+",2013-10-29 10:03:02.447 +58419,18841.0,1,58421.0,,,Neighborhood analysis in Matlab using a dot plot,,CC BY-SA 3.0,"

I have points in a 2D graph (coordinates: X, Y; property: Z). For every point I would like to find, for example, the 5 closest points and save their properties.

+ +

What would be the easiest approach?

+ +

Update:
+Using the following code:

+ +
%Synthetic data
+A = {[1,1]; 'A'};
+B = {[2,2]; 'B'};
+C = {[3,3]; 'C'};
+
+%plot
+D = {A{1};B{1};C{1}};
+VarPlot = cell2mat(D);
+plot (VarPlot,'.');
+
+%knnsearch
+[IDX,dist] = knnsearch(VarPlot(:,1),VarPlot(:,2))
+
+ +

I receive the following result:

+ +
IDX =
+
+ 1
+ 2
+ 3
+
+dist =
+
+ 0
+ 0
+ 0
+
+ +

What does this mean? And how can I link the result to the properties A, B and C? I am new to this kind of question.

+",2013-10-29 10:33:32.380 +58420,23096.0,1,,,,Inferring an unmeasured value,,CC BY-SA 3.0,"

Suppose we have a table of finishing times for a number of 100 metre races where each competition has a different mix of entrants. We suspect that some races are slower than others due to, say, a headwind, but this wasn't measured. How do I find the underlying time for each runner and the correction for each race?

+ +

I have devised my own crude technique to solve this problem. I assume

+ +
+Tobs(e,c) = Tund(e) + Tcor(c) + E(e,c)
+
+where 
+
+Tobs(e,c) = observed time of entrant e in competition c
+Tund(e) = underlying time of entrant e
+Tcor(c) = time correction for competition c
+E(e,c) = error matrix 
+
+sum of all Tcor = 0
+sum of all rows in E(e,c) = 0
+sum of all columns in E(e,c) = 0
+
+
+ +

I then iteratively choose values for Tcor until the above relations are satisfied.

+ +

Is there a better way? Using R? Please be gentle. As you might have guessed by now, I don't know much about stats or maths. Thanks!

+",2013-10-29 11:37:29.087 +58421,22923.0,2,,58419.0,,,,CC BY-SA 3.0,"

Matlab Statistics Toolbox has a 'knnsearch' function that does exactly this: http://www.mathworks.com/help/stats/knnsearch.html

+",2013-10-29 11:52:20.217 +58422,2149.0,2,,58401.0,,,,CC BY-SA 4.0,"

You can just use the history of Y or also your suggested causal. I have not seen “sample of sales” before as a causal, so I am hesitant to want to use that variable, but I am sure you know what you are doing.

+

Yes, you should consider the adjustment of outliers. Yes, you should allow for a constant. Yes, you should consider seasonal impacts.

+

The ACF/PACF doesn't show that the lag of 4 is important, so autoregressive seasonality is weak. The data are short, so this can be expected. Q4 is flat and then the last year's Q4 is high, which might be due to the short data or a change in the behavior of Q4. Tough to tell.

+

A possible model (automatically developed using AUTOBOX, a piece of software I have helped develop) is the following. There are two seasonal dummies detected, consistent with 1st and 2nd quarter positive effects.

+

If one did not use the predictor then a very similar forecast is developed using this equation

+

+ It is interesting (at least to me !) that the two quarterly negative seasonal pulses (qtrs 3 and 4) are the “reflection” of the two quarterly seasonal pulses developed using the predictor series.

+

EDITED to respond to Nick's OLS MODEL:

+

If you take Y and divide it by X to get a new variable called Z, and THEN run an OLS model restricting the intercept to be 0, you will in fact obtain the result shown. The residuals from this assumed model (as you have wisely said in previous posts, it is always a good thing to bring the residuals to your "doctor" for a checkup) show a serious violation/malady at period 1 and clearly evident non-randomness. The whole idea is to avoid entertaining insufficient models and to adequately capture the signal. Clearly, the simple OLS model for Z ignores the very clear need for seasonal/quarterly dummies, which are lost in translation when converting Y and X to Z.

+",2013-10-29 13:00:03.613 +58436,23136.0,1,,,,Plotting variables in transformed space,,CC BY-SA 3.0,"

Suppose $A = X_1/X_2$ and $B = X_3/X_4$. Why would one plot the data in $(\log A, \log B)$ space as opposed to $(A,B)$ space?

+",2013-10-29 14:38:39.240 +58437,,2,,58436.0,user31668,,,CC BY-SA 3.0,"

If one wanted to use linear regression, then the logs of A and B are linear in the logs of the numerator and denominator (e.g. $\log A = \log X_1 - \log X_2$). That is the usual reason to use log transforms. It may also make the sampling errors better behaved, since you are converting ratios to sums.

+",2013-10-29 14:41:29.037 +58438,18513.0,1,,,,How do residuals affect the t-statistic?,,CC BY-SA 3.0,"

Could a t-statistic for a simple regression relationship with a smaller sum of squared residuals but a smaller or less positive slope be larger than that of a plot with a larger sum of squared residuals and a large slope?

+",2013-10-29 14:52:38.303 +58423,20613.0,2,,58420.0,,,,CC BY-SA 3.0,"

You are on the right path, I guess. (This is me being gentle.)

+ +

If you are looking for a quick and dirty solution (this is me being helpful), you may want to fit a linear model, specifically an ANOVA. Your response would be Tobs(e,c) and your factors of interest would be Ecor (time correction for entrant e) and Ccor (time correction for competition c). This is different from what you had posted with Tund (an underlying time of entrant e), because by default the ANOVA model includes an overall mean (mu). Make sure that both factors of interest (Ecor and Ccor) are treated as ""factors"" or ""categorical variables"" in whatever package you use. If you treat them as ""continuous"" or ""numeric"" variables, you will get nonsensical results. (Since you seem to be interested in the estimates for each race and each runner, you avoid going down the wormhole of random effects or mixed effects models.)

+ +

If you are looking for a more thorough and vetted solution (this is me being realistic), you should consult with a statistician. S/he can help ensure that your model answers your questions, suits the ""design"" of the data, meets all the necessary assumptions (independence, homogeneity, normality), and avoids commonly made mistakes.

+ +

If you are looking for a long term solution (this is me being encouraging), take a statistics or data analysis class. It can be very rewarding, especially when you have specific analysis needs in mind when taking the course.

+",2013-10-29 13:05:48.223 +58424,0.0,5,,,,,,CC BY-SA 3.0,,2013-10-29 13:11:25.997 +58425,16043.0,4,,,,,,CC BY-SA 3.0,Describes data models with a time-series component and a spatial component.,2013-10-29 13:11:25.997 +58426,22985.0,2,,58413.0,,,,CC BY-SA 3.0,"

It seems you don't really know what you want except some sort of pattern. So why not run a Principal Component Analysis to reduce the complexity and get the directions of greatest variability?

+",2013-10-29 13:20:35.720 +58427,20416.0,1,58478.0,,,Modelling relation between two persistent AR(1) processes? Is my approach reasonable and how to adjust standard errors?,,CC BY-SA 3.0,"

I want to run a simulation in which I want to find out whether there is a relation between the independent variable $x_t$ and the dependent variable $y_t$. I.e., in the following regression I want to find out if $\beta$ is signficantly different from zero:

+ +

$$ y_t = \beta x_t + \epsilon_t. $$

+ +

In empirical data, $y_t$ is stationary, but close to a random walk. I want to model it as an AR(1) process. Of course, I could just model $x_t$ as an AR(1) process and if $\beta$ is different from zero, $y_t$ would end up as such a process as well. However, I also want to consider the case in which $\beta=0$ and I don't want to confuse the reader with changing descriptions for different scenarios (i.e., ""if $\beta=1$ I simulate $y_t$ from the above equation, if $\beta=0$, I simulate $y_t$ as an AR(1) process."")

+ +

I came up with the following solution to this dilemma. Why not model both $y_t$ and $x_t$ as AR(1) processes with potentially correlated errors. That is:

+ +

$$ y_{t+1} = \tau_y y_t + u_{t+1}, \quad 0 < \tau_y < 1, \quad u_{t+1} \sim N(0, \sigma_u^2) $$ and +$$ x_{t+1} = \tau_x x_t + v_{t+1} , \quad 0 < \tau_x < 1, \quad v_{t+1} \sim N(0, \sigma_{v}^2), $$

+ +

where

+ +

$$ cov\left(\begin{bmatrix} v_{t+1} \\ u_{t+1} \end{bmatrix}, \begin{bmatrix} v_{t+1} & u_{t+1} \end{bmatrix}\right) = \begin{bmatrix} \sigma_v^2 & \sigma_{v} \sigma_u \rho_{u,v} \\ \sigma_{v} \sigma_u \rho_{u,v} & \sigma_u^2 \end{bmatrix} $$

+ +

Since $\beta = \frac{Cov(y, x)}{\sigma_x^2}$ and

+ +

$$\begin{aligned} Cov(y, x) &= E[y_{t+1}x_{t+1}] \\ &= E[(\tau_y y_t + u_{t+1})(\tau_x x_t + v_{t+1})] \\ &= \tau_y \tau_x E[y_t x_t] + E[u_{t+1} v_{t+1}] \\ &= \tau_y \tau_x Cov(y, x) + \sigma_{u} \sigma_{v} \rho_{v, u} \end{aligned}$$

+ +

we get by rearranging

+ +

$$ \beta = \frac{\sigma_u}{\sigma_{v}}\rho_{v, u} \frac{1 - \tau_x^2}{1-\tau_x \tau_y}. $$

+ +

Fair enough, this is a little bit complicated, but it allows me to control $\beta$ without changing the structure of either $y_t$ or $x_t$ and both those processes are AR(1) in this setup.

+ +

I basically have two questions:

+ +
    +
  1. Is this a reasonable approach given the requirement that preferably both $x_t$ and $y_t$ should be AR(1) processes. The interpretation of the relation should really be just like in a simple OLS setup, i.e. has $x_t$ an impact on $y_t$ (I'm not concerned about causality here, just relation). Would you consider my setup still as a reasonable way of modelling it? (I just don't make a direct link, but use the error structure for that. I don't see a problem with this approach. If $x_t$ and $y_t$ are related via correlated shocks, so be it.)
  2. I already simulated this and it works fine. However, I also want to obtain correct confidence intervals for each run. That is, I want to run the regression in the first equation, get $\widehat{\beta}$ and also standard errors that should be reasonable. Now I noticed that the errors are highly autocorrelated and this autocorrelation seems to be identical to $\tau_y$. However, I could not formalize it. So it would be great to know how I would have to adjust the standard errors. (It would be extra awesome if I could get a hint how to implement that in R.)

    + +
    set.seed(123)
    +library(MASS)
    +### Set start values
    +nrT <- 1e5
    +burnin <- 1e3
    +sd_y_shock  <- 1
    +sd_x_shock  <- 1
    +corr_x_y    <- 0.5
    +tau_y  <- 0.8
    +tau_x  <- 0.1
    +### Simulate the correlated shocks
    +shocks <- mvrnorm(nrT + burnin, 
    +                  mu = c(0,0), 
    +                  Sigma = matrix(c(sd_y_shock^2, corr_x_y * sd_y_shock * sd_x_shock, 
    +                                   corr_x_y * sd_y_shock * sd_x_shock, sd_x_shock^2), 
    +                                 nrow=2))
    +
    +vec_y <- arima.sim(list(order = c(1, 0, 0), ar = tau_y),
    +                    n = nrT + burnin, 
    +                    innov = shocks[, 1]) 
    +
    +vec_x <- arima.sim(list(order = c(1, 0, 0), ar = tau_x),
    +                   n = nrT + burnin, 
    +                   innov = shocks[,2])
    +### Check that formula derived above is correct; the two should be similar
    +sd_y_shock/sd_x_shock * corr_x_y * (1 - tau_x^2)/(1 - tau_x * tau_y) 
    +coef(lm(vec_y ~ vec_x))[2]
    +### Plot ACF
    +# Note that, independent from corr_x_y and tau_x, the autocorrelation structure seems
    +# always to be tau_y
    +acf(lm(vec_y ~ vec_x)$resid)
    +
+ +

+",2013-10-29 13:23:50.210 +58428,20473.0,2,,58408.0,,,,CC BY-SA 3.0,"

The existence or not of a linear relationship does not necessarily go hand-in-hand with co-integration. Variables co-integrated in levels won't necessarily exhibit correlation in first-differences.

+ +

Assume that the following relation holds: +$$y_t = a + b x_t + \varepsilon_t, \; \varepsilon_t=\text {i.i.d} $$

+ +

i.e. the variables are co-integrated. Then the relation

+ +

$$\Delta y_t = b \Delta x_t + \Delta \varepsilon_t$$
+also holds. Calculating the sample correlation of first-differences we will estimate the Covariance as

+ +

$$ \begin{align}\operatorname{\hat Cov}(\Delta y_t,\Delta x_t)=& \frac 1{T-1} \sum_{t=2}^{T}\left(b \Delta x_t + \Delta \varepsilon_t\right)\Delta x_t \\-&\left(\frac 1{T-1} \sum_{t=2}^{T}\left(b \Delta x_t + \Delta \varepsilon_t\right)\right)\left(\frac 1{T-1} \sum_{t=2}^{T}\Delta x_t\right)\end{align} $$

+ +

$$ \begin{align}=b\frac 1{T-1}& \sum_{t=2}^{T}\left(\Delta x_t \right)^2 + \frac 1{T-1} \sum_{t=2}^{T}\left(\Delta x_t \Delta \varepsilon_t\right) \\ -& b\left(\frac 1{T-1}\sum_{t=2}^{T} \Delta x_t\right)^2 -\left(\frac 1{T-1} \sum_{t=2}^{T}\Delta \varepsilon_t\right)\left(\frac 1{T-1} \sum_{t=2}^{T}\Delta x_t\right) \end{align}$$

+ +

To the degree that $x_t$ and $\varepsilon_t$ are independent, the terms involving the error will tend to vanish and so

+ +

$$ \operatorname{\hat Cov}(\Delta y_t,\Delta x_t)\rightarrow bs^2_{\Delta x_t} $$

+ +

where $s^2$ is the sample variance (irrespective of whether the variance of $x_t$, or $\Delta x_t$ is constant or not).

+ +

The sample variance of $\Delta y_t$ will be

+ +

$$s^2_{\Delta y_t} \approx b^2s^2_{\Delta x_t} + s^2_{\Delta \varepsilon_t}$$

+ +

again, irrespective of whether these sample moments estimate anything meaningful.

+ +

So +$$\operatorname {\hat Corr}(\Delta y_t,\Delta x_t) \approx \frac {bs^2_{\Delta x_t}}{\sqrt {\left(b^2s^2_{\Delta x_t} + s^2_{\Delta \varepsilon_t}\right)}\sqrt {s^2_{\Delta x_t} }} = \frac {bs_{\Delta x_t}}{\sqrt {\left(b^2s^2_{\Delta x_t} + s^2_{\Delta \varepsilon_t}\right)}}$$

+ +

So the magnitude of the empirically estimated correlation of first differences, will depend on the magnitude of the variance of the error term (which moreover enters the expression doubled since we consider first differences). If this (constant) variance is large compared to the variance of $x_t$, then the estimated correlation of first-differences may be small to non-existent, even though the variables are co-integrated in levels.

+",2013-10-29 13:24:15.223 +58429,,2,,58408.0,user31668,,,CC BY-SA 3.0,"

Since correlation is a measure of the degree of linear dependence, first differences should tease this out. Now, I am assuming you check cointegration across multiple lags, not just contemporaneous values, since there could be something like $y_t = a + b x_{t-1} + \varepsilon_t$ going on, which may complicate matters. Alecos' observation that there may be no detectable cointegration is also important.

+",2013-10-29 13:24:54.120 +58430,,2,,58399.0,user31668,,,CC BY-SA 3.0,"

Your notes about frequentists relying on ""repeated sampling"" properties and Bayesians relying on ""fairness"" are in line with Neyman/Pearson and de Finetti's justifications of each paradigm, respectively. Bayesian and frequentist approaches are appropriate in different contexts. A controversial aspect of frequentist approaches is the relevance of the concept of ""confidence"" in the case where it is not clear what the ""embedding series"" of experiments is (there can be many; look up the ""relevant subsets problem"" for more on this). Bayesians get criticized for applying priors when the underlying property is not random...hence there is a ""calibration"" problem with a, say, 95% posterior interval...95% of what, and why do we care?

+ +

I'd take a look at another school/paradigm as well...the Likelihood school, as described by the accessible and useful book ""In All Likelihood"" by Yudi Pawitan. This approach shows how the objective and subjective aspects of probability are related via the distinction between likelihood and probability.

+ +

Also, there is an interesting ""meeting of the minds"" when it comes to random-effects modeling. Take a look at that to see how the two, in practice, can converge in concepts.

+",2013-10-29 13:42:17.477 +58431,,2,,58415.0,user31668,,,CC BY-SA 3.0,"

Based on OPs goal of creating 100 error-dispersed EV values generated from a vector of beta errors:

+ +

Assumptions:

+ +

Function: EV($b_{par}$)= $b_{par}$

+ +

Uncertainty type: multiplicative beta-distributed, alpha=1, beta=99

+ +

Number of realizations of the error: n=100

+ +

You would do the following:

+ +

Create vector of p-values:

+ +

p <- rbeta(100, shape1=1, shape2=99); EV <- p*b_par; mean(EV)

+ +

If this is all you need to do, and if b_par is a constant, then you can take a shortcut and just say that $$\text{mean}(EV) = b_{par}\mathbb{E}[\text{Beta}(1,99)]$$ since expectation is a linear operation.

+",2013-10-29 13:50:41.777 +58432,23132.0,1,,,,T-statistic of correlation coefficient,,CC BY-SA 3.0,"

Hi I am trying to calculate the t-statistic for a correlation coefficient between two vectors $x$ and $y$.

+ +

The individual vectors show signs of autocorrelation. I have made use of the formula:

+ +

$\frac{r}{\sqrt{(1-r^2)(n-2)}}$

+ +

but I'm not sure this is the correct way of solving the problem.

+ +

The correlation between the two vectors has been calculated with the Excel formula for correlation.

+",2013-10-29 14:06:15.503 +58439,14298.0,1,,,,Why don't asymptotically consistent estimators have zero variance at infinity?,,CC BY-SA 3.0,"

I know that the statement in question is wrong because estimators cannot have asymptotic variances that are lower than the Cramer-Rao bound.

+ +

However, if asymptotic consistency means that an estimator converges in probability to a value, then doesn't this also mean that its variance becomes 0?

+ +

Where in this train of thought am I wrong?

+",2013-10-29 15:57:09.100 +58440,23115.0,2,,58399.0,,,,CC BY-SA 3.0,"

I think I may have found the problem with my argument, i.e. how what I called ""the frequentist interpretation of the bayesian approach"" and vice versa don't really make sense:

+ +

The ""frequentist interpretation of the bayesian approach"", as described in my quesion, doesn't make much sense because it says that it assumes the likelihood function (and the prior), but then says ""no matter which data we get [in the hypothetical large set of experiments]"" which is incompatible with the frequentist interpretation of the likelihood function.

+ +

The ""bayesian interpretation of the frequentist approach"", is also wrong because it doesn't ensure what I say below. For example, in the frequentist approach, I may well make a measurement and have an emtpy confidence interval for it, which clearly means that it doesn't ensure an a% probability of being right.

+ +

So I think I understand it better now. And so it seems that if you want to be as cautious as possible, frequentist is the way to go. But if you don't have much statistical significance, or have good reasons for your prior, bayesian is the way to go.

+",2013-10-29 16:13:50.840 +58441,23138.0,1,,,,What is my experimental design?,,CC BY-SA 3.0,"

I am very new to mixed/multilevel models. I have an experiment where we measured 2 scale variables (varA and varB) in 2 different groups of subjects (control and treatment) at 4 different time points.

+ +

I suspect that the relationship between varA and varB is different for the 2 groups (control vs treatment); however, it should be roughly consistent across the 4 time points. Researching on the internet, I've come to the conclusion that I need to model this using a multilevel (mixed effects) model. So, I am interested in the difference between the regression lines relating varA and varB for the 2 groups, but I want to account for the repeated measures.

+ +

What is my experimental design in this case? It's not very clear to me what the fixed and random effects are. Additionally, is it also possible to test whether the regression lines are significantly different across the time points? Would that call for a different model? Thanks very much; any help on how I should go about doing this in R/Matlab/SPSS would be greatly appreciated as well!

+",2013-10-29 16:27:36.400 +58442,19750.0,1,58455.0,,,Variational Bayes vs EP and other message-passing methods,,CC BY-SA 3.0,"

I am trying to understand the difference between:

+ + + +

Wikipedia says:

+ +
+

Expectation Propagation differs from other Bayesian approximation approaches such as + Variational Bayesian methods.

+
+ +

Why isn't EP considered a Variational Bayes method? Isn't EP Bayesian, and doesn't it rely on message-passing to approximate the posterior? What makes a method Variational Bayes?

+ +

Also, what about the following methods. Can they be considered Variational Bayes?

+ +
  • Sum-product
  • Mean-field methods
  • Bethe-Kikuchi approximations
+",2013-10-29 16:30:49.660 +58443,3868.0,1,,,,Estimating ROC/AUC on large data sets?,,CC BY-SA 3.0,"

Plotting an ROC curve of a classifier compared to cases requires that the data set be sorted first on the classifier score. I am in a position where I need to calculate ROC on a large data set very quickly, and sorting is the bottleneck (even using quicksort in C or F90). If, instead of calculating ROC by thresholding at each case in the data set, I threshold at every 100 cases, then my execution time decreases by orders of magnitude depending on how I write the code. The result is an ROC curve with, let's say, 10,000 points instead of 1,000,000. My tests show that the areas under these two curves are the same out to > 5 decimal places.

+ +

I would like to use this method but have not run into anyone trying to speed up the calculation in this way. Most of the lit. is on uses of ROC analysis where the data sets are relatively small and execution time is not an issue, so I have not found anyone else using this method or another to speed up the calculation by ""thinning"" out the points on the curve.

+ +

Has anyone run into a reference/study that has used or evaluated this or another method for speeding up ROC analysis? If so, or if you have other thoughts, please share.

+",2013-10-29 16:35:36.747 +58444,1322.0,1,58450.0,,,How to interpret BIC,,CC BY-SA 3.0,"

I am fitting two different models to the same data. In one model, there is one free parameter for three different experimental conditions. In another model, I fit three free parameters, one for each condition. I do this for 10 subjects in a dataset.

+ +

For each subject, the model with fewer free parameters has a higher BIC. But for every single subject, the difference in BIC is roughly the same (about 10). I find this very suspicious, since the BIC values themselves range from ~30 to ~1000.

+ +

I have never used BIC before, and would like to say that the model with one free parameter is better.

+",2013-10-29 17:15:05.517 +58445,23140.0,1,58567.0,,,Hierarchical Bayes Normal-Normal Model,,CC BY-SA 3.0,"

I have the following data for 8 runners in a 100 meter dash:

+ +
runner 1 88 91 87 82
+runner 2 81 85 78 91
+runner 3 75 77 83 81
+runner 4 92 89 84 82
+runner 5 78 79 84 92
+runner 6 89 75 79 83
+runner 7 91 89 92 91
+runner 8 87 86 88 91
+
+ +

The ratings represent a performance rating and are normally distributed with unknown mean and unknown variance. Each runner can be considered as a sub-group with a mean and variance.

+ +

Any guidance will be highly appreciated

+",2013-10-29 17:21:38.730 +58446,23144.0,1,58447.0,,,Assessing the power of a normality test (in R),,CC BY-SA 3.0,"

I want to assess the accuracy of normality tests over different sample sizes in R (I realize that normality tests may be misleading). For example, to look at the Shapiro-Wilk test, I'm conducting the following simulation (as well as plotting the results) and would expect that as the sample size increases the probability of rejecting the null decreases:

+ +
n <- 1000
+pvalue_mat <- matrix(NA, ncol = 1, nrow = n)
+
+for(i in 10:n){
+    x1 <- rnorm(i, mean = 0, sd = 1)
+    pvalue_mat[i,] <- shapiro.test(x1)$p.value
+}   
+
+plot(pvalue_mat)
+
+ +

My thought would be that as the sample size grows there should be a lower rejection rate, however it seems pretty uniform. I think I am misunderstanding this - any and all thoughts welcome.

+",2013-10-29 17:53:42.027 +58447,21762.0,2,,58446.0,,,,CC BY-SA 3.0,"

You are simulating under the null hypothesis (normal distribution), therefore the rejection rate will tend to the significance level as expected. To assess the power, you need to simulate under any non-normal distribution. There are infinite possibilities/scenarios (e.g. gamma distributions with increasing skewness, t-distribution with decreasing df etc.) to choose from, depending on the scope of your study.
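For instance, a minimal sketch of such a power simulation under one arbitrary alternative (a $t_3$ distribution; the sample sizes, the 1000 replicates and the 5% level are all just illustrative choices):

set.seed(123)
sizes <- c(10, 20, 50, 100, 200)
power <- sapply(sizes, function(n)
  mean(replicate(1000, shapiro.test(rt(n, df = 3))$p.value < 0.05)))
cbind(n = sizes, power = power)   # the rejection rate should grow with n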

+",2013-10-29 18:04:16.460 +58451,23147.0,1,,,,how to test most frequent variable between groups?,,CC BY-SA 3.0,"

I have different groups of dolphins, some with offspring, some without. I'll count how many types of vocalizations were emitted by each group, and want to find out if there is a statistical difference between groups with and without offspring.

+ +

My dataset is something like:

+ +
Group_number    Vocal1  Vocal2  Offspring
+Gr01    5   3   0
+Gr02    7   3   0
+Gr03    4   4   0
+Gr04    1   6   0
+Gr05    7   9   0
+Gr06    6   2   1
+Gr07    2   4   1
+Gr08    2   6   1
+Gr09    9   7   1
+Gr10    8   8   1
+
+ +

I have to do this one vocalization type at a time (I have some tens of types)? Or can I do it all in a single run (like a multiple linear regression)? Someone told me to do a chi-square, but am not sure how to do it, or if it's appropriate. T-student, ANOVA, MANOVA, GLM, what's the pros and cons of each method, supposing my data is normally distributed? And what if it's not? Sorry if it seems a silly and vague question, but maybe someone could point me to a simple tutorial or something.

+ +

Thanks in advance!

+",2013-10-29 19:18:51.163 +58452,14179.0,1,,,,Looking for the 'Elbow' in data,,CC BY-SA 3.0,"

Subitization is the rapid, accurate enumeration of low-numerosity displays, distinguished from counting by a sharp non-linearity in the plot of response times. Below is a representative plot, from Watson, D. G., Maylor, E. A., & Bruce, L. A. M. (2007). Notice that mean enumeration times for displays 1-3 increase roughly linearly, but mean enumeration time for 4 does not follow the linear trend. Some research suggests that the subitization 'limit' is dependent on task conditions and participant working memory.

+ +

+ +

I'm looking for a way to test where the elbow is, with the ultimate goal of identifying what a participant's subitization limit is. Currently, my best idea is to do something like repeated polynomial contrasts. Basically, I would test for a quadratic trend in numerosities 1-3, then in numerosities 1-4, etc. I would want to say that I have passed the subitization limit when the quadratic trend becomes significant (adjusting for repeated tests).

+ +

That's about the limits of my statistical savvy, though, so I can't evaluate this idea too well. Thoughts?

+ +

Thanks in advance.

+",2013-10-29 19:23:42.563 +58453,20304.0,1,58460.0,,,How to think of reduced dimensions in PCA on facial images (eigenfaces)?,,CC BY-SA 3.0,"

I've been reading up a bit on eigenfaces. I think I understand the basic concept of it - vectorize a set of facial images then reduce the dimensionality of the images using PCA. What I don't really understand is the visualization of the lower-dimensional representation of the images.

+ +

In the facial images, the number of dimensions is the number of pixels so if you reduce the dimensionality of an image, you reduce the number of pixels. But then how do you visualize this image? Is it just a much smaller version of the full-dimensional original? The examples that I have seen do not look like this. Or do you alternatively make each pixel bigger so that the overall image is the same size as the original?

+",2013-10-29 19:29:32.170 +58454,22804.0,1,58458.0,,,Find the difference between two vectors and the mean of the result,,CC BY-SA 3.0,"

So, with the help of someone here on Stats I could solve most of my problems.

+ +

Now I'm stuck at this part:

+ +

I have the code below, which worked very well when these values were fixed.

+ +
immediate.amput.below.EV
+watchful.wait.EV
+
+ +

Now they are vectors of size [n_psa]. What I need to do is iterate over the n_psa values, i.e. over 100 values, find the difference immediate.amput.below.EV[1] - watchful.wait.EV[1] and so on, save the results in a new vector as positive numbers (even if the output is a negative number), and then find the mean of that new vector.

+ +

This is what I used to do.

+ +
DELTA_COST <- max(immediate.amput.below.EV,watchful.wait.EV) - min(immediate.amput.below.EV,watchful.wait.EV)
+mean(DELTA_COST)
+
+",2013-10-29 19:39:47.533 +58455,3580.0,2,,58442.0,,,,CC BY-SA 3.0,"

To me VB just means we will approximate posterior by turning the problem into a minimization problem and then changing the solution space. In that sense, EP is a type of VB algorithm, because it does exactly that. When I think of VB, though, the first thing I think of is mean-field, and if I had to guess I would think that whoever wrote that EP is ""different"" from VB might have had mean-field algorithms in mind.

+ +

This source, which I take to be canonical, explicitly classifies all algorithms you mention as VB (i.e. Bethe-Kikuchi, sum-product, EP, and mean-field).

+",2013-10-29 19:42:46.853 +58456,23149.0,1,,,,Permutation test of meta-analysis : correlation coefficient,,CC BY-SA 3.0,"

I am going to estimate False Discovery Rate using permutation tests.

+ +

To my knowledge, several R packages are applicable for multiple testing.

+ +

I have several independent datasets for meta-analysis.

+ +

I computed a Pearson correlation coefficient, converted it into a Fisher-Z score and then calculated the mean effect size of each gene pair from several independent datasets.

+ +

Actually, in each dataset, there are two different subgroups - healthy subjects and patients. In this co-expression network, I analyzed the gene expression profiles of the patient group only.

+ +

Here is my question.

+ +
+

What should I do to estimate the FDR using a permutation test that creates random and independent shufflings of the gene expression values of all genes in each dataset, so as to break the inter-gene relationships while keeping the expression mean and standard deviation of the genes in every dataset intact?

+
+ +

Unfortunately, I have no idea how to do it. I am still new to statistics and as well as R.

+ +

I would appreciate it if you could answer the question with corresponding code I could run.

+",2013-10-29 19:50:50.240 +58476,23165.0,1,,,,Should I standardize my variables for this particular case of cluster analysis?,,CC BY-SA 3.0,"

I'm trying to cluster a list of records based on a (percentage) frequency distribution of variables which add up to 100%, like:

+ +
  1. Record1 - VarA(25%) VarB(25%) varC(50%) varD(0%)
  2. Record2 - VarA(50%) VarB(15%) varc(0%) VarD(35%)
+ +

and so on. I have standardized variables before while dealing with different dimensions (lengths and weights) etc. In this case I do not think standardization is appropriate. Also, is k-means clustering appropriate in this context? I wanted to use k-means and use the distribution observed at the centroid of the cluster for the whole cluster. Thanks a ton.

+",2013-10-30 05:05:39.567 +58649,4656.0,2,,58645.0,,,,CC BY-SA 3.0,"

If your datum $\alpha$ is $X$ or $Y$ with probabilities $\pi_X$ and $\pi_Y = 1 - \pi_X$ respectively, and the probability of mis-classification involves $\pi_X$ and $\pi_Y$, then you are a Bayesian and all you need to do is compare the likelihood ratio $\displaystyle \frac{p_X(\alpha)}{p_Y(\alpha)}$ (which takes on values $0, \frac{d-c}{b-a}, \infty$ depending on the value of $\alpha \in [a,d]$) to an appropriate threshold (which I will leave you to determine), and this can be reduced to a threshold test on the value of the datum $\alpha$.

+",2013-11-01 13:43:15.560 +58457,9554.0,2,,58452.0,,,,CC BY-SA 3.0,"

Depending on your definition of the ""elbow"" there are many statistical tests at your disposal. With an entire R package dedicated to this topic.

+ +

I personally tend to avoid them, since you never know in advance what they will consider an ""elbow"" and whether your and their opinions will coincide (but this might be considered an extreme position). It would also depend on whether you want to know if there is an ""elbow"" in a specific location, or whether you want to ask if there is one in general.

+ +

For the case of a specific location, you can of course fit a local regression, compare the coefficients and declare one an elbow according to your own rule about the difference in slopes.

+ +

The real problem occurs in the latter case. If you have only a couple of points anyway you can just try them all. Otherwise I would fit something non-parametric such as LOESS, calculate the gradient of the line at regular intervals (with sufficient density), such as shown here: https://stackoverflow.com/questions/12183137/calculate-min-max-slope-of-loess-fitted-curve-with-r

+ +

and use again some rule that you find convenient to declare something an ""elbow"". I view the ""elbow"" as the case when a large enough change of gradient of a function occurs over a short enough interval. Of course the thresholds for the above rules are a matter of individual taste, which is why there is no test.
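As a rough illustration of that idea (not a recommendation of specific settings: the fake data, the loess span, the grid density and the 'largest change of gradient' rule are all arbitrary):

set.seed(42)
x <- rep(1:8, each = 5)                                   # numerosities 1-8
y <- ifelse(x <= 3, 0.5 + 0.05 * x,                       # flat-ish up to 3,
            0.65 + 0.35 * (x - 3)) + rnorm(length(x), 0, 0.03)  # then steep

fit  <- loess(y ~ x, span = 0.75)
grid <- seq(min(x), max(x), length.out = 200)
pred <- predict(fit, newdata = data.frame(x = grid))
grad <- diff(pred) / diff(grid)          # numerical gradient along the fitted curve

# candidate 'elbow': where the gradient changes most over a short interval
grid[which.max(diff(grad)) + 1]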

+ +

In general, I presume this would be quite useless if the data is wiggly (as there would be a lot of changes in the gradient).

+",2013-10-29 19:52:16.080 +58458,21029.0,2,,58454.0,,,,CC BY-SA 3.0,"

To calculate the absolute value in R, use the function abs(). Below is an example for vectors a and b. The sizes can change as long as they have the same dimension, otherwise you will get strange results or errors.

+ +
a <- c(1, 4, 8, 20)
+b <- c(50, 2, 4, 21)
+DELTA_COST <- abs(a-b)
+
+DELTA_COST
+[1] 49   2   4  1
+
+mean(DELTA_COST)
+[1] 14
+
+",2013-10-29 20:06:53.583 +58459,12544.0,2,,58153.0,,,,CC BY-SA 3.0,"

You can do this in R with Lavaan by specifying the model as a structural equation model and adding constraints. I'm not sure if it's a good idea, but it can be done.

+ +
#load library and generate some data
+library(lavaan)
+
+d <- as.data.frame(matrix(rnorm(1:3000), ncol=3, dimnames=list(NULL, c(""y"", ""x1"", ""x2""))))
+
+ +

Run it with GLM:

+ +
> summary(glm(y ~ x1 + x2, data=d))
+
+Call:
+glm(formula = y ~ x1 + x2, data = d)
+
+Deviance Residuals: 
+    Min       1Q   Median       3Q      Max  
+-3.6385  -0.5899  -0.0224   0.6024   3.0131  
+
+Coefficients:
+            Estimate Std. Error t value Pr(>|t|)
+(Intercept) -0.01855    0.03021  -0.614    0.539
+x1           0.01208    0.03049   0.396    0.692
+x2          -0.03676    0.03021  -1.217    0.224
+
+(Dispersion parameter for gaussian family taken to be 0.912437)
+
+    Null deviance: 911.2  on 999  degrees of freedom
+Residual deviance: 909.7  on 997  degrees of freedom
+AIC: 2751.2
+
+ +

Then run the same model with lavaan, to check equivalence:

+ +
> model1.syntax <- '
++ y ~ x1 + x2
++ '
+> summary(sem(model1.syntax, data=d))
+lavaan (0.5-14) converged normally after   1 iterations
+
+  Number of observations                          1000
+
+  Estimator                                         ML
+  Minimum Function Test Statistic                0.000
+  Degrees of freedom                                 0
+  P-value (Chi-square)                           1.000
+
+Parameter estimates:
+
+  Information                                 Expected
+  Standard Errors                             Standard
+
+                   Estimate  Std.err  Z-value  P(>|z|)
+Regressions:
+  y ~
+    x1                0.012    0.030    0.397    0.691
+    x2               -0.037    0.030   -1.219    0.223
+
+Variances:
+    y                 0.910    0.041
+
+ +

In lavaan, you then add constraints, by naming the parameters and adding a constraint section:

+ +
> model2.syntax <- '
++ y ~ b1 * x1 + b2 * x2
++ '
+> 
+> model2.constraints <- 
++   ' 
++     b1 > 0
++     b2 > 0
++   '
+> 
+> summary(sem(model=model2.syntax, constraints=model2.constraints, data=d))
+lavaan (0.5-14) converged normally after   1 iterations
+
+  Number of observations                          1000
+
+  Estimator                                         ML
+  Minimum Function Test Statistic                1.484
+  Degrees of freedom                                 0
+  P-value (Chi-square)                           0.000
+
+Parameter estimates:
+
+  Information                                 Observed
+  Standard Errors                             Standard
+
+                   Estimate  Std.err  Z-value  P(>|z|)
+Regressions:
+  y ~
+    x1       (b1)     0.012       NA
+    x2       (b2)     0.000       NA
+
+Variances:
+    y                 0.911    0.041
+
+Constraints:                               Slack (>=0)
+    b1 - 0                                       0.012
+    b2 - 0                                       0.000
+
+ +

Instead of being negative, the b2 parameter is fixed to zero.

+ +

Notice that you don't get any standard errors - if you want them, you have to bootstrap. (That's described in the lavaan manual).

+",2013-10-29 20:34:39.747 +58460,9554.0,2,,58453.0,,,,CC BY-SA 3.0,"

Just a hint, after reading your comment. Each image (face) is represented as a stacked vector of length $N$. The different faces make up a dataset stored in a matrix $X$ of size $K\times N$. You might be confused about the fact that you use the PCA to obtain a set of eigenvectors (eigenfaces) $I = \{u_1, u_2, \ldots, u_D\}$ of the covariance matrix $X^TX$, where each $u_i \in \mathbb{R}^{N}$. You don't reduce the number of pixels used to represent a face, but rather you find a small number of eigenfaces that span a space which suitably represents your faces. The eigenfaces still live in the original space though (they have the same number of pixels as the original faces).

+ +

The idea is, that you use the obtained eigenfaces as a sort of archetypes that can be used to perform face detection.

+ +

Also, purely in terms of storage costs, imagine you have to keep an album of $K$ faces, each composed of $N$ pixels. Instead of keeping all the $K$ faces, you just keep $D$ eigenfaces, where $D \ll K$, together with the component scores and you can recreate any face (with a certain loss in precision).
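A tiny numerical sketch of that compression idea with prcomp (the 'faces' here are just random noise and the sizes are made up, so only the shapes of the objects are of interest):

set.seed(1)
K <- 100; N <- 64 * 64                     # 100 faces, each 64x64 pixels, stacked
X <- matrix(rnorm(K * N), nrow = K)        # one face per row

pca        <- prcomp(X, center = TRUE)
D          <- 10
eigenfaces <- pca$rotation[, 1:D]          # each eigenface still has N pixels
scores     <- pca$x[, 1:D]                 # D numbers per face instead of N

# approximate reconstruction of face 1 from its D component scores
face1_hat <- pca$center + as.vector(eigenfaces %*% scores[1, ])
length(face1_hat)                          # N: same number of pixels as the original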

+",2013-10-29 20:35:34.133 +58461,21029.0,2,,58451.0,,,,CC BY-SA 3.0,"

First of all, your data is not normally distributed because it takes discrete values. But, if there is enough range in the data, you may say it approximates a normal distribution. It's an assumption you need to verify.

+ +

The question is vague because this is a data mining/data analysis question, and many methods will provide you with many (similar but different) results. There are no pros/cons here, since they all test for different things. For the methods you mentioned, here's a brief description:

+ +
  1. Chi-squared. This will tell you if there is a relation between your groups. I assume the test will be offspring versus all the vocalizations. You may find significance but not be certain where it comes from. If you find a relation, you have to dig deeper to find the source. Avoid having groups with few observations, otherwise this test will not be valid.
  2. (Student) t-test, or a test of equivalence in means. You compare the mean value of two populations, versus the alternative hypothesis that they are not equal. You will need to test each vocalization individually.
  3. ANOVA. You are testing if the variance is explained across different factors (offspring). This needs to be done for each different variable. The idea is to look at the variance of the continuous variable within each class, $s_i$, and compare it to the total variance, $s_t$. The correlation coefficient for one class compared to the total is then $\nu_i=s_i/s_t$. This test also assumes normality.
  4. MANOVA. This is testing if your qualitative factor (offspring) has an effect on ALL of the other variables. It is a generalization of ANOVA to multiple variables, which sounds appropriate for your study. We still have the normality assumption.
  5. GLM. Modeling can show you the relationships in your data as well, but you need to have a predictor variable defined in advance (offspring?).
+ +

Other ideas: If you are looking to see how your vocalizations are related only to offspring, then try perhaps a discriminant analysis (normality assumption again) or a logistic regression (no normality assumption).

+",2013-10-29 20:35:36.090 +58507,23176.0,1,,,,Sampling technique to estimate how many toxic waste sites are in a country?,,CC BY-SA 3.0,"

I work for an environmental health nonprofit and I have moderate expertise in statistics. We want to estimate the total number of toxic industrial waste sites within a small African country. I would love to hear your thoughts on how we should start. At this stage, if you can recommend a book, an idea, or a general sampling method that's all I'm asking for. I basically need a place to start.

+ +

For instance, I thought that maybe we could divide the country into three zones: low, medium, and high industrial zones (based on UN data). We could then create 100 equal-sized sectors within each zone, and randomly select 10 sectors from each of the three industrial zones. We would then survey these small sector areas and find all toxic sites within those sectors. If we do this, could we estimate the total number of industrial sites along with a measure of uncertainty? Is there a name for this type of geographic sampling?

+ +

There is very little pre-existing data on the number of toxic waste sites. Also, to simplify things, assume it is very easy to identify a toxic waste site once you are on the ground with a team.
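To make that idea concrete, here is a hedged R sketch of the usual stratified expansion estimator for such a design (the sector counts below are invented purely for illustration):

# made-up counts of sites found in the 10 sampled sectors of each zone
low    <- c(0, 0, 1, 0, 0, 2, 0, 0, 0, 1)
medium <- c(1, 3, 0, 2, 1, 0, 4, 1, 2, 0)
high   <- c(5, 2, 7, 3, 6, 4, 8, 2, 5, 3)

zone_estimate <- function(counts, N_sectors = 100) {
  n     <- length(counts)
  total <- N_sectors * mean(counts)                              # expansion estimate
  v     <- N_sectors^2 * (1 - n / N_sectors) * var(counts) / n   # with finite population correction
  c(total = total, se = sqrt(v))
}

zones <- rbind(zone_estimate(low), zone_estimate(medium), zone_estimate(high))
c(total = sum(zones[, 1]), se = sqrt(sum(zones[, 2]^2)))   # country-wide estimate

Under this design the zone totals and their variances are simply summed; whether this particular design is the right one is of course a separate question.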

+",2013-10-30 15:38:39.880 +58462,5821.0,1,,,,Weights for retrospective cohort data,,CC BY-SA 3.0,"

Suppose you randomly identify individuals with a certain disease (cases) at time $T$, and then you identify a sample of individuals without disease at time $T$ (but at risk for disease) in a population of individuals undergoing routine screening for that disease. The controls are sampled in a known proportion from the population of interest. Suppose further you are able to retrospectively ascertain all screening information in both groups, and you are interested in inferring the population-level screening rates and the relative risk for disease adjusting for screening rates. Lastly, there are some subject-level effects which you believe may mediate screening behavior and disease risk.

+ +

A probability model for the screening process is:

+ +

$$t_{i,j} \sim \mbox{exponential}(\lambda_s; t_{i,j-1})$$

+ +

where an $\mbox{exponential}(\lambda, t)$ random variable is a location shifted exponential distribution with location parameter $t$. So the number of screens that a person utilizes in a fixed time period has a Poisson distribution.

+ +

Disease develops according to another approximately exponential process

+ +

$$D_i \sim \mbox{exponential} (\lambda_c)$$

+ +

and is clinically detectable with screening for a short interval, $\theta_{d}$.

+ +

$\lambda_s \gg \lambda_c$ obviously. Disease prevalence is very rare in the population.

+ +

Disease status $D_i(t)$ is 1 if the subject has either screen detected or symptomatic disease at time $t$, 0 otherwise.

+ +

The time varying subject level effects (age) we denote with $X_{i,t}$, so all $\lambda_c$, $\lambda_s$, and $\theta_d$ are conditionally dependent on $X_{i,t}$.

+ +

This is a type of analysis whose name is not familiar to me. I'm inclined to call it ""cluster case control"" since the individual observation is a single screening period. However, we did not sample screens based on case/control status, but individuals. Because of the variability in lead time and individual screening rates, cases screen above the population rate and controls screen below it. Thus it's tempting to use some kind of sampling weight to account for this and obtain an unbiased estimate of the population level screening rate. However, the subject frequency:

+ +

$$w_i = \left\{\begin{array}{ccc} 1 & \mbox{if} & D_{i}(T) = 1 \\ 1/p & \mbox{if} & D_{i}(T) = 0 \\ \end{array} \right.$$

+ +

does not produce valid weights because of the lead time bias and differential screening (controls are followed for a longer period of time and so accrue more screening instances and person years, while cases are followed for shorter times and still accrue more screening intervals than cases in shorter time periods, yet still not more overall).

+ +

Is this even an identifiable problem???

+",2013-10-29 20:53:40.900 +58463,21346.0,1,58465.0,,,Standard error of marginal effect for binary/categorical variable,,CC BY-SA 3.0,"

The delta method/bootstrap method is used to obtain the standard error of marginal effect in case of limited dependent variable model (like tobit model). I have seen these being applied for the continuous variables, but my question is whether these can be also applied for categorical variables (say with 4 categories) or binary variables (say only with two categories). Please suggest academic papers if it is relevant to the question.

+",2013-10-29 20:58:37.147 +58464,13459.0,1,,,,How to deal with data having huge disparity in number in each class,,CC BY-SA 3.0,"

I have data in which the number of negative cases in response is approximately 98% of the total sample size (total # records are approximately 1 million, Response is binary). The positive cases are roughly 2%. What are the limitations of applying 'glm' and 'cart' on such data? What option do I have in such cases?

+ +

On test data I did get a very good AUC ~0.92. How much faith should I have in this model considering such a disparity in the number of cases in the positive and negative categories?

+",2013-10-29 21:10:26.307 +58465,5045.0,2,,58463.0,,,,CC BY-SA 3.0,"

Take a look at the formulas on page 324 of Tamás Bartus' ""Estimation of Marginal Effects Using Margeff"" from Stata Journal. The formulas and explanations are not Stata-specific.

+ +

Stata now handles these calculations using the margins command.

+",2013-10-29 21:43:17.067 +58466,20981.0,2,,58464.0,,,,CC BY-SA 3.0,"

Is this a binary response variable?

+ +

One limitation of models with very small (or large) rates is the amount of data needed to get accurate and stable estimates of variance and sample errors. As a rule of thumb you would want both Np = 5 and N(1-p) = 5 (or higher), so with an estimated p of 0.02 you need N of 250. So in order to get accurate sample errors etc you want a minimum 250 observations.

+",2013-10-29 22:33:45.173 +58467,23150.0,1,58488.0,,,What is the difference in Bayesian estimate and maximum likelihood estimate?,,CC BY-SA 3.0,"

Please explain to me the difference between a Bayesian estimate and a maximum likelihood estimate.

+",2013-10-29 23:15:00.087 +58468,594.0,2,,58446.0,,,,CC BY-SA 3.0,"

(More than a comment, perhaps not a complete answer)

+ +
+

[I] would expect that as the sample size increases the probability of rejecting the null decreases

+
+ +

Leaving aside considerations of biased tests (which are not uncommon in goodness of fit, so it's worth a mention), there are three situations relating to rejection rate one might want to consider:

+ +

1) the rejection rate when simulating from the null (as you seem to be doing in your question)

+ +

Here, the rejection rate should be at or near the significance level, so, no, if you hold the significance level constant, the rejection rate doesn't decrease as n increases, but stays at/near $\alpha$.

+ +

2) the rejection rate when simulating from some alternative

+ +

Here the rejection rate should increase as n increases.

+ +

3) the rejection rate for some collection of real data

+ +

Practically, the null is never actually true, and real data will have some mixture of amounts of non-normality (as measured by the test statistic). If the degree of non-normality is not related to sample size, the rejection rate should increase as n increases.

+ +

So in fact, in none of these situations should we see the rejection rate decrease with sample size.

+",2013-10-29 23:27:51.550 +58469,,1,,,Andreas Pasternak,Finding the right model,,CC BY-SA 3.0,"

I have a data frame with several predictors, let's call them pred1 through pred3, and a result column. Now I need to specify the right model. I could randomly try:

+ +
svm.model <- svm(result ~ pred1+pred2+pred3,       data = train)
+# or
+svm.model <- svm(result ~ pred1*pred2+pred3^2,     data = train)
+# or
+svm.model <- svm(result ~ log2(pred1)+pred2*pred3, data = train)
+# etc. etc.
+
+ +

But there must be a better automatic approach in R to model selection?!

+",2013-10-30 00:05:53.997 +58477,22792.0,1,,,,Understanding d-separation theory in causal Bayesian networks,,CC BY-SA 3.0,"

I am trying to understand the d-Separation logic in Causal Bayesian Networks. I know how the algorithm works, but I don't exactly understand why the ""flow of information"" works as stated in the algorithm.

+ +

+ +

For example, in the graph above, let's say that only X is given and no other variable has been observed. Then, according to the rules of d-separation, the information flows from X to D:

+ +
  1. X influences A, which is $P(A)\neq P(A|X)$. This is OK, since A causes X and if we know about the effect X, this affects our belief about the cause A. Information flows.
  2. X influences B, which is $P(B)\neq P(B|X)$. This is OK: since A has been changed by our knowledge about X, the change at A can influence our beliefs about its cause, B, as well.
  3. X influences C, which is $P(C)\neq P(C|X)$. This is OK because we know that B is biased by our knowledge about its indirect effect, X, and since B is biased by X, this will influence all of B's direct and indirect effects. C is a direct effect of B and it is influenced by our knowledge about X.
+ +

Well, up to this point everything is OK for me, since the flow of information occurs according to intuitive cause-effect relationships. But I don't get the special behavior of so-called ""V-structures"" or ""colliders"" in this scheme. According to the d-separation theory, B and D are the common causes of C in the graph above, and it says that if we did not observe C or any of its descendants, the flow of information from X is blocked at C. Well, OK, but my question is why?

+ +

From the three steps above, starting from X, we saw that C is influenced by our knowledge about X and the information flow occurred according to the cause-effect relationship. The d-separation theory says that we cannot go from C to D since C is not observed. But I think that since we know that C is biased and D is a cause of C, D should be affected as well, while the theory says the opposite. I am clearly missing something in my thinking pattern but can't see what it is.

+ +

So I need an explanation of why the flow of information is blocked at C if C is not observed.

+",2013-10-30 07:03:41.870 +58470,5821.0,2,,58467.0,,,,CC BY-SA 3.0,"

I think you're talking about point estimation as in parametric inference, so that we can assume a parametric probability model for a data generating mechanism but the actual value of the parameter is unknown.

+ +

Maximum likelihood estimation refers to using a probability model for data and optimizing the joint likelihood function of the observed data over one or more parameters. It's therefore seen that the estimated parameters are most consistent with the observed data relative to any other parameter in the parameter space. Note such likelihood functions aren't necessarily viewed as being ""conditional"" upon the parameters since the parameters aren't random variables, hence it's somewhat more sophisticated to conceive of the likelihood of various outcomes comparing two different parameterizations. It turns out this is a philosophically sound approach.

+ +

Bayesian estimation is a bit more general because we're not necessarily maximizing the Bayesian analogue of the likelihood (the posterior density). However, the analogous type of estimation (or posterior mode estimation) is seen as maximizing the probability of the posterior parameter conditional upon the data. Usually, Bayes' estimates obtained in such a manner behave nearly exactly like those of ML. The key difference is that Bayes inference allows for an explicit method to incorporate prior information.

+ +

Also, 'The Epic History of Maximum Likelihood' makes for an illuminating read:

+ +

http://arxiv.org/pdf/0804.2996.pdf

+",2013-10-30 00:08:32.913 +58471,5480.0,1,,,,Clustering microblogs,,CC BY-SA 3.0,"

I have a microblog dataset with about 100k tweets and I would like to cluster them effectively using some less advanced algorithm. Is there anywhere I can find something like a single-pass k-means or similar algorithm? Thanks.

+ +

I have tried using K-Means and it takes about 5 minutes to cluster 100k tweets, using standard pre-processing (tokenization, stemming). I have also used Online LDA, but I can't retrieve tweets from the topics.

+",2013-10-30 01:58:09.657 +58472,22600.0,1,,,,Mean and Covariance of Office Hours,,CC BY-SA 3.0,"

I have a homework assignment with the following question. I've attempted to solve the problems and my answers are below, but I feel like they are incorrect, and have no idea how to check. I would be very grateful for any help you can provide:

+ +
Posted office hours notwithstanding, students make calls at the offices of their 
+professors and have formed the impression that professors are more likely to be away
+from their offices on Friday than any other working day. A review of calls, 
+1/5 of which are on Fridays, indicates that for 16% of Friday calls, the professor 
+is away from his or her office, while this occurs for only 12% of calls on every 
+other working day.  Define two random variables as follows:
+
+X = 1 if call is made on a Friday; otherwise, X = 0
+
+Y = 1 if professor is away from his or her office; otherwise, Y = 0
+
+
+a.  Find the mean value of X, namely, E(X).
+
+b.  Find the mean value of Y, namely, E(Y).
+
+c.  Find (to four decimal places) and interpret the covariance between X and Y.  
+
+ +
+ +

My Answers:

+ +

A : 0.2(1) = 0.2

+ +

B : (0.12)0.2 + (0.12)0.2 +(0.12)0.2 +(0.12)0.2 +(0.16)0.2 = 0.128

+ +

C : E(xy) – E(x)*E(y) :

+",2013-10-30 02:58:31.177 +58473,20473.0,2,,58063.0,,,,CC BY-SA 3.0,"

This is a digressing answer, but since computers don't do what we want them to do, only what we tell them to do, I believe that when we can become more specific, we reduce uncertainty (actual or perceived).

+ +

The implicit equation that determines $y$ gives a quadratic equation in $y$, with the roots of the quadratic being functions of the parameters (and in fact $y$ is seen to be the logistic cdf).

+ +

For compactness, denote $\ln \bigl(\frac{-y}{y - 1} \bigl)\equiv h$. Then

+ +

$$g(\theta) = 0 \Rightarrow -(h-\theta_1)(h-\theta_3) + a\theta_2(h-\theta_3) + (h-\theta_1)b\theta_4 =0$$ $$\Rightarrow -h^2+\theta_3h+\theta_1h-\theta_1\theta_3+a\theta_2h - a\theta_2\theta_3+hb\theta_4-b\theta_1\theta_4=0$$ $$-h^2+(\theta_1+\theta_3+a\theta_2+b\theta_4)h-(\theta_1\theta_3+a\theta_2\theta_3+b\theta_1\theta_4)=0 $$

+ +

Set

+ +

$$ \phi_1 \equiv\theta_1+\theta_3+a\theta_2+b\theta_4, \;\; \phi_2= \theta_1\theta_3+a\theta_2\theta_3+b\theta_1\theta_4$$

+ +

Then the roots of the polynomial $-h^2+\phi_1h-\phi_2=0$ are

+ +

$$h^*_A,h^*_B = \frac {-\phi_1 \pm \sqrt {\phi_1^2 -4\phi_2}}{-2}=\frac {\phi_1}{2}\pm\sqrt {\left(\frac {\phi_1}{2}\right)^2-\phi_2}$$

+ +

Then we obtain two equations for $y$

+ +

$$\ln \bigl(\frac{-y}{y - 1} \bigl) = h^*_A \Rightarrow y_A = \frac 1{1+e^{-h^*_A}}$$ and

+ +

$$\ln \bigl(\frac{-y}{y - 1} \bigl) = h^*_B \Rightarrow y_B = \frac 1{1+e^{-h^*_B}}$$

+ +

which is the cdf of the logistic distribution, call it $\Lambda_j,\; j=A,B$, and denote $\lambda_j$ the derivative w.r.t its argument, $\lambda_j = \Lambda_j(1-\Lambda_j)$.

+ +

Your log-likelihood becomes

+ +

$$\ln L = \ln \Bigl(\frac {n!}{k!(n-k)!}\Bigl)+k\ln\Lambda_j + (n-k)\ln(1-\Lambda_j) $$

+ +

Now even the Hessian can be calculated by hand (with patience), let alone the gradient. It is also feasible to check concavity of the log-likelihood. Since you have two equations for $y$, you maximize separately and pick the solution that gives the higher value for the log-likelihood.

+",2013-10-30 03:55:29.550 +58474,23163.0,1,,,,Analyzing repeated rank data.,,CC BY-SA 3.0,"

I have a data set of N people, T items. Let's say N=100, and T=10.

+ +

Each person goes through the following exercise.

+ +
  1. She is shown 2 random items from the set of T=10, and ranks them as rank 1 and 2.
  2. She is next shown 2 more random items from the remaining 8 out of 10 items, and ranks as rank 1 and 2.
+ +

At the end, the data set is of size 100x10, where each row has 4 numeric entries (two of which will be 1, and the other two will be 2) and 6 empty entries.

+ +

My goal is to compare the 10 items against one another, and come up with an estimated rank value for a given item.

+ +

What is the best way to analyze such data ?

+ +

Thank you.

+",2013-10-30 04:27:36.030 +58475,23002.0,1,58498.0,,,"$P(X_1 < \min(X_i,\ldots, X_n))$ across different normal random variables",,CC BY-SA 3.0,"

I have a set of mutually independent normal distributions $X_1$ to $X_5$ (with means and standard deviations) which represent finishing times for swimmers over a certain distance. The actual data is as follows:

+ +

$$X_1(60, 3.0)$$ $$X_2(61, 1.0)$$ $$X_3(58, 2.3)$$ $$X_4(63, 2.4)$$ $$X_5(61, 1.7)$$ So swimmer 1 ($X_1$) has a mean finishing time of 60 seconds with a standard deviation of 3.0 seconds.

+ +

Question 1: What is the probability of an event where $X_i$ finishes first. e.g.

+ +

$$P(X_1 \lt X_i, i=2,\ldots,n)$$

+ +

Question 2: If I calculate this for all swimmers, can I simply order the results to determine the most probable finishing order?

+ +

This is not homework.

+ +

Based on the answers to this Cross Validated question, I have tried to solve this problem based on the first answer. i.e.

+ +

$$\Pr(X_1 \le X_i, i=2,\ldots,n) = \int_{-\infty}^{\infty} \phi_1(t) [1 - \Phi_2(t)]\cdots[1 - \Phi_n(t)]dt$$

+ +

Where $\phi_i$ is the PDF of $X_i$ and $\Phi_i$ is its CDF.

+ +

Based on this formula, the results I obtained were:

+ +

$$\Pr(X_1 \le X_i, i=2\ldots5) = 0.259653$$ $$\Pr(X_2 \le X_i, i=1,3\ldots5) = 0.214375$$ $$\Pr(X_3 \le X_i, i=1\ldots2, 4\ldots5) = 0.611999$$ $$\Pr(X_4 \le X_i, i=1\ldots3, 5) = 0.0263479$$ $$\Pr(X_5 \le X_i, i=1\ldots4) = 0.0697597$$ However, the probabilities add to 1.182135 when they should add to 1.0. I’m not sure if the formula is incorrect or my implementation of the integral (I used Excel and the trapezoidal method).

+ +

I also attempted to solve the problem using Dillip’s method (from the above mentioned question) as follows:

+ +

\begin{align*} P(X_1 < \max X_i) &= P\{(X_1 < X_2) \cup \cdots \cup (X_1 < X_n)\}\\ &\leq \sum_{i=2}^n P(X_1 < X_i)\\ &= \sum_{i=2}^n Q\left(\frac{\mu_1 - \mu_i}{\sqrt{\sigma_1^2 + \sigma_i^2}}\right) \end{align*}

+ +

However, the probability results were much greater than 1 in most cases, so I abandoned this approach. By the way, what exactly does $\max X_i$ mean?

+ +

Any assistance in calculating the probability would be appreciated.
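In case it helps with checking the Excel implementation, here is a minimal R sketch of the same integral evaluated with integrate(), using the means and standard deviations from the question; the five probabilities should sum to 1 up to numerical error:

mu <- c(60, 61, 58, 63, 61)
s  <- c(3.0, 1.0, 2.3, 2.4, 1.7)

p_first <- function(i) {
  integrand <- function(t) {
    surv <- rep(1, length(t))
    for (j in seq_along(mu)[-i]) surv <- surv * (1 - pnorm(t, mu[j], s[j]))
    dnorm(t, mu[i], s[i]) * surv
  }
  integrate(integrand, lower = -Inf, upper = Inf)$value
}

probs <- sapply(seq_along(mu), p_first)
round(probs, 4)
sum(probs)    # should be very close to 1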

+",2013-10-30 05:04:28.233 +58508,2149.0,2,,58119.0,,,,CC BY-SA 3.0,"

Outliers are a reflection of an unknown/unspecified external factor. If there is a relationship between two series then there would be an increased probability that both series would be affected. My answer to your question is ""yes"" since there may be a relationship between the two series.

+",2013-10-30 15:38:58.053 +58478,1406.0,2,,58427.0,,,,CC BY-SA 3.0,"

Given the information in the question I do not see any problems with such a simulation set-up. You are modelling a VAR(1) process and then exploring the contemporaneous relationship between its components. Perfectly reasonable, although I would double check that this conforms to your empirical model.

+ +

As for the second question, the best way would be to figure out the exact covariance structure of the stationary process $\varepsilon_t=y_t-\beta x_t$ and then use GLS instead of OLS with the covariance matrix exactly specified. This way you would get the efficient estimates of $\beta$ and subsequently the smallest possible standard errors. Another way is to use autocorrelation-robust standard errors. In R you can calculate them using package sandwich:

+ +
library(lmtest)
+library(sandwich)
+mod <- lm(vec_y ~ vec_x)
+vv <- vcovHAC(mod)
+coeftest(mod,vcov=vv)
+
+
+  t test of coefficients:
+
+              Estimate Std. Error  t value Pr(>|t|)    
+(Intercept) -0.0093830  0.0111428  -0.8421   0.3998    
+vec_x        0.5333422  0.0052502 101.5857   <2e-16 ***
+---
+Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+
+ +

Here I used function coeftest from the package lmtest to get the usual summary output. Note that for large sample sizes the calculation of robust covariance matrix can take up some time.
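For completeness, a sketch of the GLS route mentioned above, under the (illustrative) assumption that an AR(1) structure describes the errors well enough; nlme::gls is one convenient way to fit it, reusing vec_y and vec_x from the simulation:

library(nlme)
dat     <- data.frame(y = vec_y, x = vec_x, obs = seq_along(vec_y))
gls_mod <- gls(y ~ x, data = dat, correlation = corAR1(form = ~ obs))
summary(gls_mod)$tTable    # GLS estimates with their standard errors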

+",2013-10-30 07:27:33.617 +58479,5671.0,2,,58476.0,,,,CC BY-SA 3.0,"

As the dimensions are on the same scale and measure the same kind of quantity (relative share), you don't need to standardize / whiten your data.

+ +

I'm not convinced that k-means is appropriate here. It will likely work though.

+ +

The reason is that since your data are histograms (they sum up to 1), you will likely get much better results with distance functions designed for this type of distribution, e.g. histogram intersection distance, Jensen-Shannon divergence, etc.

+ +

Unfortunately, k-means is really designed for squared Euclidean distance (= Variance minimization), and you shouldn't blindly combine it with other measures. Instead, use a k-means variant that will converge with arbitrary distances, such as k-medoids = PAM.
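A small sketch of that suggestion, assuming each row is a composition summing to 1; the Jensen-Shannon distance below is hand-rolled and k = 2 is arbitrary:

library(cluster)                      # for pam()

# toy data: each row is a composition over 4 variables
X <- rbind(c(.25, .25, .50, .00),
           c(.50, .15, .00, .35),
           c(.20, .30, .45, .05),
           c(.45, .20, .05, .30))

kl  <- function(p, q) sum(ifelse(p > 0, p * log(p / q), 0))
jsd <- function(p, q) { m <- (p + q) / 2; sqrt(0.5 * kl(p, m) + 0.5 * kl(q, m)) }

n <- nrow(X)
D <- matrix(0, n, n)
for (i in 1:(n - 1)) for (j in (i + 1):n)
  D[i, j] <- D[j, i] <- jsd(X[i, ], X[j, ])

pam(as.dist(D), k = 2, diss = TRUE)$clustering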

+",2013-10-30 08:13:16.343 +58480,5671.0,2,,58471.0,,,,CC BY-SA 3.0,"

With single-pass algorithms you won't be able to discover useful structure. K-means already is too simple for actual structure discovery; it's merely a vector quantization method.

+ +

100k isn't a lot, it should run in a few minutes with full k-means, actually...

+ +

Have you experimented with hashing, in particular LSH/MinHash? These seem to be fairly standard techniques. Please update your question and list what you have already tried.

+",2013-10-30 08:15:20.053 +58481,22013.0,1,58550.0,,,R package to make a linear discriminant analysis scatter plot,,CC BY-SA 3.0,"

Is there a package in R that can produce this kind of graph plot on LDA data:

+ +

original question: How does linear discriminant analysis reduce the dimensions?

+",2013-10-30 08:37:20.010 +58482,20062.0,1,58486.0,,,How to best visualize one-sample test?,,CC BY-SA 3.0,"

We are currently writing a paper with several one-sample Wilcoxon tests. While visualizing two-sample tests is easy via boxplots, I was wondering whether there is any good way to visualize one-sample test results?

+ +
# Example data
+pd <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64,
+        0.73, 1.46, 1.15, 0.88, 0.90, 0.74, 1.21)
+
+wilcox.test(pd, mu = 1.1)
+
+#   Wilcoxon signed rank test
+#
+# data:  pd
+# V = 72, p-value = 0.5245
+# alternative hypothesis: true location is not equal to 1.1
+
+ +

...and also:

+ +

I would like to get a z-value instead of the V-value. I know that if I use the coin package instead of base stats I will get z-values, but the coin package seems not to be able to perform a one-sample Wilcoxon test.

+",2013-10-30 08:55:08.630 +58483,23168.0,1,,,,Is a mixed/ random effects model required if fixed effects model shows no pattern in the residuals?,,CC BY-SA 3.0,"

I've got some data on discrete flood events (response variable - duration) on several rivers.

+ +

I've modelled the response variable by site and time using a Generalized Linear Model:

+ +
 mod = glm(duration ~ site + time)
+
+ +

I've then looked at the residual plots and cannot find any evidence for temporal correlation in the residuals. Does this mean that this modelling approach is valid, and that using a mixed/random effects model is unnecessary? Or should one always use mixed/random effects models for time-series data?

+ +

Any links to reading references regarding this subject would be brilliant,

+",2013-10-30 09:10:58.437 +58484,23169.0,1,58531.0,,,Probability of a given password,,CC BY-SA 3.0,"

A password consists of 4 alphabet letters and 4 numbers. Calculate the following two probabilities:

+ +
  1. $p_1$: the probability that the letters are all equal and that the numerical part contains one eight.
  2. $p_2$: the probability that the password has 3 numbers followed by 4 letters.
+ +

Although it sounds like an easy question, how would I apply the definition of permutations and combinations here? Here is how I thought of solving it.

+ +

$p_1= (1/21)^4*(1/10)*(9/10)^2$

+ +

Do I need to calculate all the possible combinations here?

+ +

$p_2= 1/\binom{7}{7}=1/7!/(7-7)!= 1/7!$

+ +

since we are considering only one case among a permutation of 7 elements over 7 places.

+",2013-10-30 09:32:23.277 +58485,17180.0,1,,,,Self organizing maps vs. kernel k-means,,CC BY-SA 3.0,"

For an application, I want to cluster data (potentially high dimensional) and extract the probability of belonging to a cluster. At the moment I am considering self-organizing maps or kernel k-means to do the job. What are the pros and cons of each method for this task? Am I missing other clustering algorithms that could perform well in this case?

+",2013-10-30 09:37:41.590 +58486,594.0,2,,58482.0,,,,CC BY-SA 3.0,"

Something like this?

+ +

+ +

Or were you after some interval for the median, like you get with notched boxplots (but suited to a one sample comparison, naturally)?

+ +

Here's an example of that:

+ +

+ +

This uses the interval suggested in McGill et al (the one in the references of ?boxplot.stats). One could actually use notches, but that might increase the chance that it is interpreted instead as an ordinary notched boxplot.

+ +

Of course if you need something to more directly replicate the signed rank test, various things can be constructed that do that, which could even include the interval for the pseudo-median (i.e. the one-sample Hodges-Lehmann location estimate, the median of pairwise averages).

+ +

Indeed, wilcox.test can generate the necessary information for us, so this is straightforward:

+ +
> wilcox.test(pd,mu=1.1,conf.int=TRUE)
+
+    Wilcoxon signed rank test
+
+data:  pd
+V = 72, p-value = 0.5245
+alternative hypothesis: true location is not equal to 1.1
+95 percent confidence interval:
+ 0.94 1.42
+sample estimates:
+(pseudo)median 
+        1.1775 
+
+ +

and this can be plotted also:

+ +

+ +

[The reason the boxplot interval is wider is that the standard error of a median at the normal (which is the assumption underlying the calculation based off the IQR) tends to be larger than that for a pseudomedian when the data are reasonably normalish.]

+ +

And of course, one might want to add the actual data to the plot:

+ +

+ +
+ +

Z-value

+ +

R uses the sum of the positive ranks as its test statistic (this is not the same statistic as discussed on the Wikipedia page on the test).

+ +

Hollander and Wolfe give the mean of the statistic as $n(n+1)/4$ and the variance as $n(n+1)(2n+1)/24$.

+ +

So for your data, this is a mean of 60 and a standard deviation of 17.61 and a z-value of 0.682 (ignoring continuity correction)
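(A quick R check of those numbers, reusing the pd vector from the question:)

V     <- 72
n     <- length(pd)                          # 15
mu    <- n * (n + 1) / 4                     # 60
sigma <- sqrt(n * (n + 1) * (2 * n + 1) / 24)
(V - mu) / sigma                             # about 0.68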

+ +
+ +

The code I used to generate the fourth plot (from which the earlier ones can also be done by omitting unneeded parts) is a bit rough (it's mostly specific to the question, rather than being a general plotting function), but I figured someone might want it:

+ +
notch1len <- function(x) {
+  stats <- stats::fivenum(x, na.rm = TRUE)
+  iqr <- diff(stats[c(2, 4)])
+  (1.96*1.253/1.35)*(iqr/sqrt(sum(!is.na(x))))
+}
+
+w <- notch1len(pd)
+m <- median(pd)
+
+boxplot(pd,horizontal=TRUE,boxwex=.4)
+
+abline(v=1.1,col=8)
+points(c(m-w,m+w),c(1,1),col=2,lwd=6,pch=""|"")
+
+ci=wilcox.test(pd,mu=1.1,conf.int=TRUE)$conf.int                       #$
+est=wilcox.test(pd,mu=1.1,conf.int=TRUE)$estimate
+
+stripchart(pd,pch=16,add=TRUE,at=0.7,cex=.7,method=""jitter"",col=8)
+
+points(c(ci,est),c(0.7,0.7,0.7),pch=""|"",col=4,cex=c(.9,.9,1.5))
+lines(ci,c(0.7,0.7),col=4)
+
+ +

I may come back and post more functional code later.

+",2013-10-30 09:59:34.693 +58495,503.0,2,,58494.0,,,,CC BY-SA 3.0,"

Where did you hear this? The usual reason for preferring the median is that it is less affected by extreme values than the mean. However, it is in general less sensitive to changes in the data.

+ +

I ran a tiny example in R

+ +
set.seed(1234)
+true <- rnorm(1000)
+smallerror <- true + rnorm(1000,0,.1)
+largeerror <- true + rnorm(1000, 0, 1)
+bias <- true + rnorm(1000,1, .5)
+
+mean(true) - mean(smallerror)
+quantile(true, .5) - quantile(smallerror, .5)
+
+mean(true) - mean(largeerror)
+quantile(true, .5) - quantile(largeerror, .5)
+
+ +

In this particular case, the mean was more affected than the median.

+",2013-10-30 11:19:25.383 +58693,594.0,2,,58688.0,,,,CC BY-SA 3.0,"

It's a simple matter of playing 'spot the probability function':

+ +

$$\pi(s)=e^{-\lambda}\sum^{\infty}_{i=0}\frac{e^{\lambda s}}{e^{\lambda s}}\frac{(\lambda s)^i}{i!}=e^{-\lambda}e^{\lambda s}\cdot\sum^{\infty}_{i=0}e^{-\lambda s}\frac{(\lambda s)^i}{i!}=e^{-\lambda}e^{\lambda s}\cdot 1$$

+ +

since the remaining sum is just the sum of the probability function of a Poisson$(\lambda s)$ over all its values, which equals 1

+",2013-11-02 09:44:16.833 +58487,22923.0,2,,58408.0,,,,CC BY-SA 3.0,"

So if my understanding of @Alecos very insightful analysis is correct, he has 2 points: even if returns are linearly related $\Delta y_t = b\cdot\Delta x_t + \Delta\varepsilon_t$ then

+ +
  1. true correlation between $\Delta y_t$ and $\Delta x_t$ can be anything between 0 and 1 depending on noise/signal ratio $var(\Delta \varepsilon_t)/ var(\Delta x_t)$
  2. because we estimate above true correlation from a finite sample, our estimate can be something different from the true one.
+ +

Now, for point 1 I can object that if noise/signal ratio is big then both correlation and original cointegration will be ""weak"" (not sure if there is a measure of cointegration strength - probably ADF-test p-value?).

+ +

So if we know that true correlation between differenced series is ~0 due to high $var(\Delta \varepsilon)$, we probably still can conclude that cointegration between original series is very weak due to same high $var(\varepsilon)$.

+ +

Now, the second point probably makes this conclusion less certain - if finite sample estimate of correlation is ~0 this doesn't mean true one is ~0 and thus cointegration may be in place. So the question is how far can sample correlation be from the true one given a sample size :)

+",2013-10-30 10:00:27.300 +58488,20470.0,2,,58467.0,,,,CC BY-SA 3.0,"

It is a very broad question and my answer here only begins to scratch the surface. I will use Bayes' rule to explain the concepts.

+ +

Let’s assume that a set of probability distribution parameters, $\theta$, best explains the dataset $D$. We may wish to estimate the parameters $\theta$ with the help of the Bayes’ Rule:

+ +

$$p(\theta|D)=\frac{p(D|\theta) * p(\theta)}{p(D)}$$

+ +

$$posterior = \frac{likelihood * prior}{evidence}$$

+ +

The explanations follow:

+ +

Maximum Likelihood Estimate

+ +

With MLE, we seek a point value for $\theta$ which maximizes the likelihood, $p(D|\theta)$, shown in the equation(s) above. We can denote this value as $\hat{\theta}$. In MLE, $\hat{\theta}$ is a point estimate, not a random variable.

+ +

In other words, in the equation above, MLE treats the term $\frac{p(\theta)}{p(D)}$ as a constant and does NOT allow us to inject our prior beliefs, $p(\theta)$, about the likely values for $\theta$ in the estimation calculations.

+ +

Bayesian Estimate

+ +

Bayesian estimation, by contrast, fully calculates (or at times approximates) the posterior distribution $p(\theta|D)$. Bayesian inference treats $\theta$ as a random variable. In Bayesian estimation, we put in probability density functions and get out probability density functions, rather than a single point as in MLE.

+ +

Of all the $\theta$ values made possible by the output distribution $p(\theta|D)$, it is our job to select a value that we consider best in some sense. For example, we may choose the expected value of $\theta$ assuming its variance is small enough. The variance that we can calculate for the parameter $\theta$ from its posterior distribution allows us to express our confidence in any specific value we may use as an estimate. If the variance is too large, we may declare that there does not exist a good estimate for $\theta$.

+ +

As a trade-off, Bayesian estimation is made complex by the fact that we now have to deal with the denominator in the Bayes' rule, i.e. $evidence$. Here evidence -or probability of evidence- is represented by:

+ +

$$p(D) = \int_{\theta} p(D|\theta) * p(\theta) d\theta$$

+ +

This leads to the concept of 'conjugate priors' in Bayesian estimation. For a given likelihood function, if we have a choice regarding how we express our prior beliefs, we must use that form which allows us to carry out the integration shown above. The idea of conjugate priors and how they are practically implemented are explained quite well in this post by COOlSerdash.
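As a deliberately simple illustration of the contrast, consider coin flips with a conjugate Beta prior (all numbers below are arbitrary):

k <- 7; n <- 10                 # 7 heads in 10 flips
a <- 2; b <- 2                  # Beta(2, 2) prior on theta

mle       <- k / n                          # maximizes the likelihood alone
post_mode <- (k + a - 1) / (n + a + b - 2)  # mode of the Beta(k+a, n-k+b) posterior
post_mean <- (k + a) / (n + a + b)          # posterior mean
post_var  <- (k + a) * (n - k + b) / ((n + a + b)^2 * (n + a + b + 1))
c(MLE = mle, PosteriorMode = post_mode, PosteriorMean = post_mean, PosteriorVar = post_var)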

+",2013-10-30 10:07:41.400 +58489,15827.0,2,,58482.0,,,,CC BY-SA 3.0,"

If you like boxplots, you can as readily show a single boxplot with a line or other reference showing your hypothesised value. (@Glen_b posted an answer with an excellent simple example precisely as I was first writing this.)

+ +

It is arguable that boxplots, now very popular, are massively overused for one-sample and two-sample exploration. (Their real value, in my view, is when you are comparing many sets of values, with number of samples or groups or variables more like 10, 30 or 100, and there is a major need to see overall patterns amid a mass of possible detail.)

+ +

The key point is that with just one or two samples (groups, variables), you have space on a plot to show much more detail, detail that could be interesting or important for comparison. With a good design, such detail need not be distracting in visual comparison.

+ +

Evidently, in most usual versions the box plot suppresses all detail in its box, showing the middle half of the data, except in so far as the position of the median inside the box conveys some information. Depending on the exact rules used, such as the 1.5 IQR convention of showing data points individually if and only if they are 1.5 IQR or more from the nearer quartile, it is even possible that the box plot suppresses most of the detail about the other half of the data. Often, and perhaps even usually, such detail may be irrelevant to something like a Wilcoxon test, but being prepared to see something illuminating in the data display is always a good idea.

+ +

A display that remains drastically underused in many fields is the quantile plot, a display of the ordered values against an associated cumulative probability. (For other slightly technical reasons, this cumulative probability is typically not $1/n, \cdots, n/n$ for sample size $n$ but something like $(i - 0.5)/n$ for rank $i$, 1 being the rank of the smallest value.)

+ +

Here are your example data with a reference line added for 1.1.

+ +

+ +
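(If it helps, a minimal R version of such a plot for the pd vector from the question; ppoints() supplies plotting positions of the $(i - 0.5)/n$ kind for this sample size:)

plot(ppoints(length(pd)), sort(pd),
     xlab = ""Cumulative probability"", ylab = ""Ordered values of pd"")
abline(h = 1.1, lty = 2)    # hypothesised value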

In other examples, key points include

+ +
  • For two-sample comparisons, there are easy choices between superimposing traces, juxtaposing traces, or using related plots such as quantile-quantile plots.
  • The plot performs well over a range of sample sizes.
  • Outliers, granularity (lots of ties), gaps, bi- or multimodality will all be shown as or much more clearly than in box plots.
  • Quantile plots mesh well with monotonic transformations, which is not so true for box plots.
+ +

Some will want to point out that cumulative distribution plots or survival function plots show the same information, and that's fine by me.

+ +

See W.S. Cleveland's books (details at http://store.hobart.com/) for restrained but effective advocacy of quantile plots.

+ +

Another very useful plot is the dot or strip plot (which goes under many other names too), but I wanted to blow a small trumpet for quantile plots here.

+ +

R details I leave for others. I am focusing here on the more general statistical graphics question, which clearly cuts across statistical science and all software possibilities.

+ +

Incidentally, I don't know the background story but the name wilcox.test in R seems a poor choice to me. So, you save on typing two characters, but the name encourages confusion, not least because of past and present people in statistical fields called Wilcox. Lack of justice for Mann and Whitney is another detail. The person being honoured was Wilcoxon.

+",2013-10-30 10:11:06.140 +58490,18690.0,1,,,,First differences interpretation,,CC BY-SA 3.0,"

I have a small issue regarding first difference models. Suppose we have

+ +

$$ \Delta y_{t} = \beta_{0} + \beta_{1} \Delta \log{y_{t-1}} + \Delta u_{t} $$

+ +

The interpretation of the delta would be the same as if I did with:

+ +

$$ y_{t} = \beta_{0} + \beta_{1} \log{y_{t-1}} + u_{t} $$

+ +

That is correct right?

+ +

However if we have:

+ +

$$ \Delta y_{t} = \beta_{0} + \beta_{1} \log{p} + u_{t} $$

+ +

Then the interpretation would be if p increases by 1%, the increase in $y_{t}$ would be $\beta_{1}$ right?

+ +

Thanks in advance for clearing things up!

+",2013-10-30 10:14:20.797 +58491,23171.0,1,,,,"Show that the best mean square estimator of $X$ given $(X_{1},...,X_{n})$ is $\hat X =E[X|\sigma(X_{1},...,X_{n})]$",,CC BY-SA 3.0,"

Let $X$ and $X_{i}$, $i=1,...,n$ be random variables on a probability space $(\Omega , \mathcal F,P)$. Show that the best mean square estimator of $X$ given $(X_{1},...,X_{n})$ is $\hat X =E[X|\sigma(X_{1},...,X_{n})]$

+",2013-10-30 10:37:22.313 +58492,12683.0,2,,58464.0,,,,CC BY-SA 3.0,"

For logistic regression there's no problem with imbalanced samples per se, though if the absolute number in either response class is small for separate covariate patterns the maximum-likelihood estimators of odds ratios can be rather too biased for comfort, and some prefer to use penalized-likelihood methods as discussed here. For larger models computational constraints might necessitate down-sampling the most common class, which reduces the precision of all estimates somewhat but otherwise only affects the intercept (an estimate of which for the original population can be recovered; see here). In your case a minority class numbering 20k shouldn't give cause for concern unless you're trying to estimate odds ratios for some very rare predictor categories.

+ +

For classification trees there's what seems to be a good answer here.

+",2013-10-30 10:50:44.987 +58493,23172.0,1,,,,Correlation of one binary input series and continous response series,,CC BY-SA 3.0,"

I have one input series of type 0/1 (it is an intervention time series) and some metric response series. I need to have a look at cross-correlations. Normally I would use the Box-Jenkins technique, but now I have to prewhiten an input series which is binary. Can I handle it like a metric one? Does it make sense to look at the autocorrelations and fit an ARMA model in the ""normal"" way?

+ +

Or would it be possible to fit ARMA-models for the metric response variables and then fit the same model to the input variable and then look at the cross correlations?

+ +

Nearly all of my response series are white noise - but as I understand it, I have to fit an ARMA model to the input anyway if the input is not white noise. Is this correct, or is it sufficient for the response to be white noise?

+ +

I would be very happy if anyone can help me! Thank you very much.

+",2013-10-30 11:01:36.227 +58494,6805.0,1,58527.0,,,When is the median more affected by sampling error than the mean?,,CC BY-SA 3.0,"

I'm writing a paper on making probability estimates, and it's been asserted to me that I should take the median of the estimates given by my participants, rather than the mean. I've been told I should do this because the mean is more affected by sampling error than the median.

+ +

Why is this? Is this something that is always true, or which only holds under certain circumstances?

+",2013-10-30 11:09:20.073 +58506,23057.0,2,,57403.0,,,,CC BY-SA 3.0,"

Mantel test and Moran's I refer to two very different concepts.

+ +

The reason for using Moran's I is the question of spatial autocorrelation: correlation of a variable with itself through space. One uses Moran's I when one wants to know to what extent the occurrence of an event in an areal unit makes the occurrence of an event in a neighboring areal unit more or less likely. In other words (using your example): if there is a noisy crow on a tree, how likely or unlikely is it that there are other noisy crows in the neighborhood? The null hypothesis for Moran's I is no spatial autocorrelation in the variable of interest.

+ +

The reason for using the Mantel test is the question of similarities or dissimilarities between variables. One uses the Mantel test when one wants to know whether samples that are similar in terms of the predictor (space) variables also tend to be similar in terms of the dependent (species) variable. To put it simply: are samples that are close together also compositionally similar, and are samples that are spatially distant from each other also compositionally dissimilar? Using your example: it tests whether quiet crows are located near other quiet crows, while noisy crows have noisy neighbors. The null hypothesis is no relationship between spatial location and the DV.
+Besides this, the partial Mantel test allows comparing two variables while controlling for a third one.
+For example, one needs the Mantel test when comparing:

+ +
    +
  • Two groups of organisms, which form the same set of sample units;
  • +
  • Community structure before and after disturbance;
  • +
  • Genetic/ecological distance and geographic distance.
  • +
+ +

Here is a good discussion on the Mantel test and its application.

+ +

(Edited in response to Ladislav Nado's new examples)

+ +

If I may guess, the reason for your confusion is that you keep thinking of space and noise in your examples either as of two continuous variables, or as of one distance matrix (position in space) and one continuous variable (noise). In fact, to analyze similarities between two such variables, one should think of both of them as distance matrices. That is:

+ +
    +
  • one matrix (for example, for space) describes the differences for each pair of geographic coordinates. Value for 2 crows sitting next to each other is lower than the value for crows sitting far apart;
  • +
  • another matrix (for environmental, genetic, or any other structure) describes the differences between measured outcomes at given points. The value for 2 crows with a similar level of noise (it doesn't matter if they are quiet or noisy--it's just a measure of similarity!) is lower than the value for a pair of crows with dissimilar levels of noise.
  • +
+ +

Then the Mantel test computes the cross-product of the corresponding values in these two matrices. Let me underline again that the Mantel statistic is the correlation between two distance matrices and is not equivalent to the correlation between the variables used to form those matrices.

+ +
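
If it helps to see the two ideas side by side, here is a minimal R sketch I am adding for illustration (the 'vegan' and 'ape' packages, and the toy data, are my own assumptions, not part of the original discussion):

+ +
# 20 crows with random positions and noise levels (toy data)
+set.seed(1)
+xy    <- matrix(runif(40), ncol = 2)   # spatial positions
+noise <- rnorm(20)                     # noise level of each crow
+
+space.d <- dist(xy)                    # matrix 1: pairwise geographic distances
+noise.d <- dist(noise)                 # matrix 2: pairwise differences in noise
+
+library(vegan)
+mantel(space.d, noise.d, permutations = 999)   # correlation of the two distance matrices
+
+library(ape)
+w <- 1 / as.matrix(space.d)            # inverse-distance spatial weights
+diag(w) <- 0
+Moran.I(noise, w)                      # autocorrelation of the raw variable itself
+
+ +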

Now let's take two structures you showed in pictures A and B.
+In picture A, the distance in each pair of crows corresponds to similarities in their level of noise. Crows with small differences in their level of noise (each quiet crow vs. another quiet crow, each noisy crow vs. another noisy crow) stay close, while each and every pair of crows with big difference in their level of noise (a quiet crow vs. a noisy crow) stay away from each other. The Mantel test correctly shows that there is a spatial correlation between the two matrices.
+In picture B, however, the distance between crows does not correspond to the similarities in their level of noise. While all noisy crows stay together, quiet crows may or may not stay close. In fact, the distance in some pairs of dissimilar crows (one quiet+one noisy) is smaller than the distance for some pairs of similar crows (when both are quiet).
+There is no evidence in picture B that if a researcher picks up two similar crows at random, they would be neighbors. There is no evidence that if a researcher picks up two neighboring (or not so distant) crows at random, they would be similar. Hence, the initial claim that ""on both plots the hypothesis is valid"" is incorrect. The structure as in picture B shows no spatial correlation between the two matrices and accordingly fails the Mantel test.

+ +

Of course, different types of structures (with one or more clusters of similar objects or without clear cluster borders at all) exist in reality. And the Mantel test is perfectly applicable and very useful for testing what it tests. If I may recommend another good reading, this article uses real data and discusses Moran's I, Geary's c, and the Mantel test in quite simple and understandable terms.

+ +

Hope everything is slightly more clear now; though, I can expand this explanation if you feel like there is still something missing.

+",2013-10-30 15:07:43.040 +58725,7155.0,2,,58669.0,,,,CC BY-SA 3.0,"

The obvious answer is to pick the polynomial order with the best cross-validated error.

+ +
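
For instance, here is a minimal sketch of that idea (the 'boot' package and the toy data are my own assumptions):

+ +
library(boot)
+set.seed(1)
+d <- data.frame(x = runif(200))
+d$y <- sin(2 * pi * d$x) + rnorm(200, sd = 0.3)
+
+# 10-fold CV error for polynomial degrees 1 to 8
+cv.err <- sapply(1:8, function(k)
+  cv.glm(d, glm(y ~ poly(x, k), data = d), K = 10)$delta[1])
+which.min(cv.err)   # the degree with the lowest estimated prediction error
+
+ +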

A slight improvement is to use something like Gaussian Processes, which tend to do the right thing without tuning parameters.

+ +

If you don't need function approximation and you're trying to lump them into groups, you could perform some kind of kernel-based clustering, like Spectral Clustering. The use of a kernel is crucial here because you can convey to the algorithm exactly what it is that makes these observations similar.

+",2013-11-02 21:28:52.147 +58496,20473.0,2,,58119.0,,,,CC BY-SA 3.0,"

NARROW VIEW 1
+A narrow view of this question (since I believe the other answers and comments have adequately covered the various general approaches) is: ""Assume two random variables $X$ and $Y$ are dependent. Is the variance of $X$ conditional on $Y$ a function of the variance of $Y$?""

+ +

To take refuge in the normal distribution, if both $X$ and $Y$ are normal and dependent, denote $\sigma^2_x$, $\sigma^2_y$, $\sigma_{xy}$ their unconditional variances and their covariance respectively. Then

+ +

$$\operatorname {Var}(X\mid Y) = \sigma^2_x-\frac {\sigma_{xy}^2}{\sigma^2_y} $$

+ +

which is increasing in $\sigma^2_y$. Note that the direction of their covariance (positive/negative) doesn't matter.

+ +

But also, note that the conditional variance is lower than the unconditional variance...

+ +
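
For anyone who wants to verify that first formula numerically, here is a quick simulation sketch I am adding (the 'MASS' package is an assumption on my part):

+ +
library(MASS)
+set.seed(1)
+S  <- matrix(c(4, 1.5, 1.5, 2), nrow = 2)   # Var(X)=4, Var(Y)=2, Cov(X,Y)=1.5
+xy <- mvrnorm(1e5, mu = c(0, 0), Sigma = S)
+var(residuals(lm(xy[, 1] ~ xy[, 2])))       # empirical Var(X | Y)
+S[1, 1] - S[1, 2]^2 / S[2, 2]               # the formula: 4 - 1.5^2/2 = 2.875
+
+ +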

NARROW VIEW 2
+A 2nd narrow view of the matter is: consider pairs of random variables $(X_i,Y_i),\; i=1,...,n$, and $\sigma^2_x(i)$ and $\sigma^2_y(i)$ their unconditional variances. Assume that for some indices $k,j \in [1,...,n]$, we have $\sigma^2_y(k) > \sigma^2_y(j), \forall j\neq k$.
+Should we ""expect"" that $\sigma^2_x(k) > \sigma^2_x(j), \forall j\neq k$ also?
+This formalization of the OP's question requires us to consider directly the dependence of each $(X_i,Y_i)$ on a common source, say a vector of random variables $\mathbf Z_i$. So we have

+ +

$$X_i = h_i(\mathbf Z_i),\;\; Y_i = g_i(\mathbf Z_i)$$

+ +

Then we ask ""does (or when) +$$ \operatorname {Var}[g_k(\mathbf Z_k)]>\operatorname {Var}[g_j(\mathbf Z_j)] \Rightarrow ? \operatorname {Var}[h_k(\mathbf Z_k)]>\operatorname {Var}[h_j(\mathbf Z_j)],\; \forall j\neq k$$

+ +

To be able to say something about such an inequality, it must be the case that at least some of the variables that appear in $\mathbf Z_j$ also appear in the other $\mathbf Z_i$'s. So, as a necessary condition, we not only require that the elements of each pair $(X_i,Y_i)$ of rv's are dependent; we also require that the pairs are dependent between them: good-bye i.i.d. samples...

+",2013-10-30 11:28:40.740 +58497,22262.0,1,,,,Pearson correlation between discrete variable that's mostly 0 and a standard normal variable,,CC BY-SA 3.0,"

Suppose I want to estimate the correlation between $X\sim N(0,1)$ and $Y$, where $Y \in \{-1,0,1\}$ and is equal to zero for 99 per cent of the sample. Sample size is 10 million.

+ +

What are the properties of the Pearson correlation in this instance (bias, usefulness, etc)? Would an alternative correlation estimator be better (e.g. a rank correlation)?

+",2013-10-30 11:41:41.313 +58498,21638.0,2,,58475.0,,,,CC BY-SA 3.0,"

I always find it best in these situations to run a Monte Carlo simulation to check (roughly) what the correct answer should be. Here is some R code for doing that:

+ +
doRace <- function()
+{
+  times <- rnorm(mean=c(60,61,58,63,61),sd=c(3,1,2.3,2.4,1.7),n=5)
+  winner <- which.min(times)
+  winner
+}
+
+winners <- replicate(n=10000,expr=doRace())
+table(winners) / length(winners)
+
+ +

Which gives the following output for me (of course you will get slightly different answers depending on the state of your random number generator):

+ +
winners
+ 1      2      3      4      5 
+0.2573 0.0317 0.6108 0.0282 0.0720
+
+ +

This indicates that the issue is with swimmer 2, as these results otherwise agree well with yours. I suspect you just have an incorrect cell reference somewhere. Note that a reasonable resolution to the problem is to use the Monte Carlo simulation not just as a verification method but as the final implementation for calculating the probabilities. After all, numerical integration is itself an approximate and computationally expensive procedure.

+ +

In order to be absolutely sure, we can use the integrate() function in R. First define the integral:

+ +
integral<- function(x,whichSwimmer)
+{
+  means <- c(60,61,58,63,61)
+  sds <- c(3,1,2.3,2.4,1.7)
+
+  dnorm(x,mean=means[whichSwimmer],sd=sds[whichSwimmer]) *
+    (1 - pnorm(x,mean=means[-whichSwimmer][1],sd=sds[-whichSwimmer][1])) *
+    (1 - pnorm(x,mean=means[-whichSwimmer][2],sd=sds[-whichSwimmer][2])) *
+    (1 - pnorm(x,mean=means[-whichSwimmer][3],sd=sds[-whichSwimmer][3])) *
+    (1 - pnorm(x,mean=means[-whichSwimmer][4],sd=sds[-whichSwimmer][4]))
+}
+
+ +

Then we can calculate the probability for each swimmer in turn:

+ +
>integrate(integral,whichSwimmer=1,lower=0,upper=100)
+0.2596532 with absolute error < 2.5e-05
+
+>integrate(integral,whichSwimmer=2,lower=0,upper=100)
+0.03223977 with absolute error < 6.4e-06
+
+>integrate(integral,whichSwimmer=3,lower=0,upper=100)
+0.6119995 with absolute error < 1.5e-06
+
+>integrate(integral,whichSwimmer=4,lower=0,upper=100)
+0.02634785 with absolute error < 1.4e-06
+
+>integrate(integral,whichSwimmer=5,lower=0,upper=100)
+0.06975967 with absolute error < 8.1e-05
+
+ +

Which gives very good agreement with the Monte Carlo simulation.

+ +

Note that although you can technically give lower and upper bounds of negative/positive infinity to integrate() I found that this caused the procedure to break down, giving clearly incorrect results.

+ +

EDIT: I've just noticed you had a second question regarding the most likely ordering of swimmers. Again, we can easily check whether the intuition about just ordering the probability of each swimmer winning is correct by running a Monte Carlo simulation. We just need to adapt the sampling function to return the order of the swimmers rather than only the winner:

+ +
doRace <- function()
+{
+  times <- rnorm(mean=c(60,61,58,63,61),sd=c(3,1,2.3,2.4,1.7),n=5)
+  finishOrder <- order(times)
+
+  paste(finishOrder,collapse="""")
+}
+
+finishOrders <- replicate(n=1e6,expr=doRace())
+which.max(table(finishOrders) / length(finishOrders))
+
+ +

I get the output:

+ +
31254 
+   50
+
+ +

In other words, the most likely order is $3,1,2,5,4$ which is not the same as ordering the swimmers by their probability of winning!

+ +

For me, this is another reason to prefer the Monte Carlo approach as the final implementation as you can easily answer this and other questions - e.g. what is each swimmer's probability of finishing second or, given that swimmer $1$ finishes first, what is the most likely ordering of the remaining swimmers?

+ +

EDIT 2: To be able to answer these other questions, we need to change the sampling function again, this time to return the complete order in which the swimmers finish:

+ +
doRace <- function()
+{
+  times <- rnorm(mean=c(60,61,58,63,61),sd=c(3,1,2.3,2.4,1.7),n=5)
+  finishOrder <- order(times)
+
+  finishOrder
+}
+
+finishOrders <- replicate(n=1e6,expr=doRace())
+
+ +

finishOrders is a matrix where each column corresponds to a single simulated race, the first row gives the winner of each race, the second row the second placed swimmer of each race and so on. So, to get the probability that each swimmer finishes second we do:

+ +
> table(finishOrders[2,]) / ncol(finishOrders)
+
+       1        2        3        4        5 
+0.271749 0.198205 0.235460 0.075165 0.219421
+
+ +

To find the most likely order given that swimmer $1$ wins the race is a little more fiddly. First, extract all races where the first row is equal to $1$:

+ +
finishOrdersWhen1WinsRace <- finishOrders[,finishOrders[1,]==1]
+
+ +

Then turn the finish order of each race from a vector of numbers into a character string so we can use the table function to find the most frequent one:

+ +
> which.max(table(apply(finishOrdersWhen1WinsRace,2,paste,collapse="""")))
+13254 
+    8
+
+ +

In other words, given that swimmer $1$ wins the race, the most likely order of the remaining swimmers is $3,2,5,4$, which occurs:

+ +
> max(table(apply(finishOrdersWhen1WinsRace,2,paste,collapse="""")) / ncol(finishOrdersWhen1WinsRace))
+[1] 0.2341227
+
+ +

$23.4\%$ of the time.

+ +

I'm not sure whether a Monte Carlo approach is the only way to answer these questions but it seems likely that even if you can obtain closed-form expressions for the probabilities you'll need to rely on numerical integration like you did to find the winning probabilities.

+",2013-10-30 11:42:35.240 +58499,503.0,2,,58497.0,,,,CC BY-SA 3.0,"

There are a couple issues here:

+ +

First, the Pearson correlation would be assuming that both variables are continuous. This is not the case here.

+ +

Second, when 99% of one variable takes on only 1 value, correlation measures will be limited.

+ +

For example:

+ +
set.seed(1234)
+x <- rnorm(10000)
+y <- cut(x, breaks = c(-100,-3,3,100), labels = FALSE)
+boxplot(x~y)
+cor(x,y, method = ""spearman"")
+
+ +

The boxplot shows the perfect relationship, but the correlation is 0.09 (and Spearman's correlation is 0.17). This doesn't seem useful.

+ +

What is it you are trying to do, in non-statistical terms?

+",2013-10-30 11:59:51.077 +58500,7333.0,2,,58477.0,,,,CC BY-SA 3.0,"

Let's forget about X for a moment and consider just the collider of B, C and D. The reason that the v-structure can block the path between B and D is that, in general, if you have two independent random variables (B and D) that affect the same outcome (C), then knowing the outcome can allow you to draw conclusions about the relationship between the random variables, thus allowing for information flow.

+ +

Using an example from Pearl's book on causality, let C be the observation that the lawn is wet, D the event that it rained, and B the event that the sprinkler was on. Then if you don't know whether the lawn is wet or not, B and D are clearly independent (no information flows). But if you do know that the lawn is wet, then if it rained, it's less likely that the sprinkler was on ($P(B|D) \neq P(B)$) and if the sprinkler was on, it's less likely that the wet grass was caused by the rain ($P(D|B) \neq P(D)$). Hence, knowing that the lawn is wet unblocks the path and makes B and D dependent.

+ +
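
If a numerical illustration helps, here is a small simulation sketch of that example (my own addition, with made-up probabilities):

+ +
set.seed(1)
+n <- 1e5
+D <- rbinom(n, 1, 0.3)            # rain
+B <- rbinom(n, 1, 0.3)            # sprinkler on
+C <- as.integer(D | B)            # lawn is wet if either happened
+cor(B, D)                         # approximately 0: marginally independent
+cor(B[C == 1], D[C == 1])         # clearly negative: dependent once we know C
+
+ +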

To understand this better, it might be useful to have a look at Berkson's Paradox, which describes the same situation.

+",2013-10-30 12:02:11.747 +58501,2071.0,2,,58453.0,,,,CC BY-SA 3.0,"

To clarify a bit more: in your original high-dimensional space, pixels are the dimensions. In the new space, each image is represented as a linear combination of a relatively small number of basis images, the eigenfaces. So in the new space, the eigenfaces are the dimensions.
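
+ +

A schematic sketch with fake data may make this concrete (nothing below is from the original answer; the numbers are arbitrary):

+ +
imgs <- matrix(rnorm(100 * 256), nrow = 100)   # 100 fake images, each 16x16 = 256 pixels
+pc   <- prcomp(imgs, center = TRUE)
+eigenfaces <- pc$rotation[, 1:10]              # the new basis images (dimensions)
+coords     <- pc$x[, 1:10]                     # each image expressed in that basis
+
+ +

Each row of coords gives the coefficients of the linear combination of eigenfaces that approximately reconstructs the corresponding image.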

+",2013-10-30 13:09:12.900 +58502,22637.0,1,58504.0,,,Understanding Sufficient Statistics,,CC BY-SA 3.0,"

As I began my study of sufficient statistics I stumbled upon a definition that puzzled me. The conditional probability distribution of the sample values given an estimator $\hat{\Theta}=\hat{\theta} $ is given by

+ +

$$ f\left( x_1,x_2,\ldots,x_n|\hat{\theta} \right) = \frac{f \left(x_1,x_2,\ldots,x_n,\hat{\theta} \right)}{g\left( \hat{\theta} \right)}=\frac{f\left( x_1,x_2,\ldots,x_n \right) }{g \left(\hat{\theta} \right)} $$

+ +

The first equality is of course the definition of the conditional distribution $P \left(A| B \right) = \frac{P\left( A \cap B \right)}{P \left( B \right)} $. What I do not understand is where the numerator in the second equality comes from.

+ +

It looks like we are assuming that $A \subset B \Rightarrow A \cap B =A $. But how is that possible in our case? Any insight on that? Thank you!

+",2013-10-30 13:25:39.063 +58503,23173.0,1,,,,Test if a variable is a good predictor of a transition to a state,,CC BY-SA 3.0,"

I have a dataset of the wealth of 10 different countries in the world since 1800, one data point per year.

+ +

Let's say I have noticed that when the wealth of a country goes above $1,000,000, this country goes from state A to state B (no matter what those states really mean).

+ +

What kind of statistical test can I use to try to test the hypothesis that wealth above $1,000,000 is a good predictor of the transition to state B, and in particular how do I test this hypothesis against an alternative hypothesis that it's not wealth, but population size for example that matters to make the transition?

+",2013-10-30 14:06:37.333 +58504,,2,,58502.0,user31668,,,CC BY-SA 3.0,"

In short, the value of a statistic is completely determined by the observed data: given the values of the $x_i$, the sufficient statistic is guaranteed to take a particular value, since $\hat \theta = h(x_1,\ldots,x_n)$ for some function $h$.

+ +

Theoretically, a sufficient statistic ""encapsulates"" the information in your data about a particular parameter, so the conditional distribution of the data should no longer depend on the parameter you have estimated. In actual calculations, $\theta$ will drop out of your final formula.
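
+ +

A standard worked example may help (I am adding it here purely as an illustration): if $X_1,\ldots,X_n$ are iid Bernoulli$(\theta)$ and $T=\sum_{i} X_i$ is the sufficient statistic, then

+ +

$$ P\left(X_1=x_1,\ldots,X_n=x_n \mid T=t\right)=\frac{\theta^{t}(1-\theta)^{n-t}}{\binom{n}{t}\theta^{t}(1-\theta)^{n-t}}=\frac{1}{\binom{n}{t}}, $$

+ +

which no longer involves $\theta$, exactly as described above.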

+",2013-10-30 14:08:30.877 +58505,23175.0,1,,,,Estimating the effects of cumulative odds ratios - additive or exponential?,,CC BY-SA 3.0,"

I have run some regression models that examine the effect of variable X (a 15 point continuous scale) on Y (a binary variable - not perform/perform a particular behaviour) using logistic regression.

+ +

For the sake of this example, let’s say that the OR in one of the models is 1.08. I can argue that for each extra exposure on scale X, the respondents are 8% more likely to do Y. So far, so good.

+ +

The average score on the X scale is 7. I originally considered that somebody with an average score on the X scale (7) was 56% more likely to do Y (7 x 8%) than somebody with a 0 score on the X scale. However, I have been told that this is incorrect – that the correct calculation is 1.08 to the power of 7 (= 1.71), or in other words that they are 71% more likely to do Y if they have an average score on X compared to somebody who has a 0 score on X.

+ +

Can anybody confirm (1) that this is correct and (2) provide a good reference for this?

+ +

Thanks!

+",2013-10-30 15:07:26.137 +58509,8958.0,2,,58485.0,,,,CC BY-SA 3.0,"

This has the potential to be an interesting question. Clustering algorithms perform 'well' or 'not-well' depending on the topology of your data and what you are looking for in that data. What do you want the clusters to represent? I attach a diagram which sadly does not include kernel k-means or SOM, but I think it is of great value for understanding the major differences between the techniques. You probably need to ask and answer this yourself before you dig into measuring the ""pros"" and ""cons"".

+ +

This is the source of the image.

+",2013-10-30 15:49:08.193 +58510,22200.0,1,58615.0,,,Discriminant analysis for the validation of cluster analysis,,CC BY-SA 3.0,"

I did a discriminant analysis for the validation of my cluster analysis. The cluster analysis is based on a PCA, so I used the components as the independent variables in the discriminant analysis. My question is: is there a restriction concerning the number of independent variables (components) compared to the number of my (cluster) groups?

+ +

Thanks a lot!

+",2013-10-30 16:00:04.767 +58511,19750.0,1,,,,"In message-passing methods, what is the actual content of the messages?",,CC BY-SA 3.0,"

In message-passing methods, factors and random variables exchange messages that typically encode marginals, but as much as I look at their formulas, I still don't understand what those messages actually look like.

+

For example, in belief propagation, when using the factor graph representation, we have the following scheme:

+
    +
  • Messages from a variable node $v$ to a factor node $u$

    +

    $\forall x_v\in Dom(v),\; \mu_{v \to u} (x_v) = \prod_{u^* \in N(v)\setminus\{u\} } \mu_{u^* \to v} (x_v).$

    +
  • +
  • Messages from a factor node $u$ to a variable node $v$ are the product of the factor with messages from all other nodes, marginalised over all variables except the one associated with $v$:

    +

    $\forall x_v\in Dom(v),\; \mu_{u \to v} (x_v) = \sum_{\mathbf{x}'_u:x'_v = x_v } f_u (\mathbf{x}'_u) \prod_{v^* \in N(u) \setminus \{v\}} \mu_{v^* \to u} (x'_{v^*}).$

    +
  • +
+

I understand that these messages represent marginals and that eventually we end up with the marginals in every node/factor, but my question is:

+
    +
  • Are these messages vectors holding real numbers?
  • +
  • What is the dimensionality of those vectors?
  • +
  • What does this dimensionality represent?
  • +
+

To make things more concrete let's assume a distribution over real random variables (e.g. gaussian), although I am also interested in the discrete case (e.g. categorical random variables). Also, how are these messages initialized?

+",2013-10-30 16:05:23.563 +58512,668.0,2,,58446.0,,,,CC BY-SA 4.0,"

Understanding power analysis of statistical hypothesis tests can be enhanced by carrying some out and looking closely at the results.

+ +
+ +

By design, a test of size $\alpha$ is intended to reject the null hypothesis with a chance of at least $\alpha$ when the null is true (its expected false positive rate). When we have the ability (or luxury) of choosing among alternative procedures with this property we would prefer those that (a) actually come close to the nominal false positive rate and (b) have relatively higher chances of rejecting the null hypothesis when it is not true.

+ +

The second criterion requires us to stipulate in what way(s) and by how much the null fails to be true. In textbook cases this is easy, because the alternatives are limited in scope and clearly specified. With distribution tests like the Shapiro-Wilk, the alternatives are much more vague: they are ""non-normal."" When choosing among distribution tests, then, the analyst is likely to have to conduct their own one-off power study to assess how well the tests work against more specific alternative hypotheses that are of concern in the problem at hand.

+ +

An example motivated by Michael Mayer's answer posits that the alternative distribution may have qualities similar to those of the family of Student t distributions. This family, parameterized by a number $\nu\ge 1$ (as well as by location and scale) includes in the limit of large $\nu$ the Normal distributions.

+ +

In either situation--whether evaluating the actual test size or its power--we must generate independent samples from a specified distribution, run the test on each sample, and find the rate at which it rejects the null hypothesis. However, there is more information available in any test result: its P-value. By retaining the set of P-values produced during such a simulation, we can later assess the rate at which the test would reject the null for any value of $\alpha$ we might care about. The heart of the power analysis, then, is a subroutine that generates this P-value distribution (either by simulation, as just described, or--occasionally--with a theoretical formula). Here is an example coded in R. Its arguments include

+ +
    +
  • rdist, the name of a function to produce a random sample from some distribution

  • +
  • n, the size of samples to request of rdist

  • +
  • n.iter, the number of such samples to obtain

  • +
  • ..., any optional parameters to be passed on to rdist (such as the degrees of freedom $\nu$).

  • +
+ +

The remaining parameters control the display of the results; they are included mainly as a convenience for generating the figures in this answer.

+ +
sim <- function(rdist, n, n.iter, prefix="""",
+                breaks=seq(0, 1, length.out=20), alpha=0.05,
+                plot=TRUE, ...) {
+
+  # The simulated P-values.
+  # NB: The optional arguments ""..."" are passed to `rdist` to specify
+  #     its parameters (if any).
+  x <- apply(matrix(rdist(n*n.iter, ...), ncol=n.iter), 2, 
+             function(y) shapiro.test(y)$p.value)
+
+  # The histogram of P-values, if requested.
+  if (plot) {
+    power <- mean(x <= alpha)
+    round.n <- 1+ceiling(log(1 + n.iter * power * (1-power), base=10) / 2)
+    hist(x[x <= max(breaks)], xlab=paste(""P value (n="", n, "")"", sep=""""), 
+         breaks=breaks, 
+         main=paste(prefix, ""(power="", format(power, digits=round.n), "")"", sep=""""))
+    # Specially color the ""significant"" part of the histogram
+    hist(x[x <= alpha], breaks=breaks, col=""#e0404080"", add=TRUE)
+  }
+
+  # Return the array of P-values for any further processing.
+  return(x)
+}
+
+ +

You can see the computation actually takes just one line; the rest of the code plots the histogram. To illustrate, let's use it to compute the expected false positive rates. ""Rates"" is in the plural because the properties of a test usually vary with the sample size. Since it is well-known that distributional tests have high power against qualitatively small alternatives when sample sizes are large, this study focuses on a range of small sample sizes where such tests are often applied in practice: typically about $5$ to $100.$ To save computation time, I report only on values of $n$ from $5$ to $20.$

+ +
n.iter <- 10^5                 # Number of samples to generate
+n.spec <- c(5, 10, 20)         # Sample sizes to study
+par(mfrow=c(1,length(n.spec))) # Organize subsequent plots into a tableau
+system.time(
+  invisible(sapply(n.spec, function(n) sim(rnorm, n, n.iter, prefix=""DF = Inf "")))
+)
+
+ +

After specifying the parameters, this code also is just one line. It yields the following output:

+ +

+ +

This is the expected appearance: the histograms show nearly uniform distributions of P-values across the full range from $0$ to $1$. With the nominal size set at $\alpha=0.05,$ the simulations report that between $0.0481$ and $0.0499$ of the P-values were actually less than that threshold: these are the results highlighted in red. The closeness of these frequencies to the nominal value attests that the Shapiro-Wilk test does perform as advertised.

+ +

(There does seem to be a tendency towards an unusually high frequency of P-values near $1$. This is of little concern, because in almost all applications the only P-values one looks at are $0.2$ or less.)

+ +

Let's turn now to assessing the power. The full range of values of $\nu$ for the Student t distribution can adequately be studied by assessing a few instances from around $\nu=100$ down to $\nu=1$. How do I know that? I performed some preliminary runs using very small numbers of iterations (from $100$ to $1000$), which takes no time at all. The code now requires a double loop (and in more complex situations we often need triple or quadruple loops to accommodate all the aspects we need to vary): one to study how the power varies with the sample size and another to study how it varies with the degrees of freedom. Once again, though, everything is done in just one line of code (the third and final):

+ +
df.spec <- c(64, 16, 4, 2, 1)
+par(mfrow=c(length(n.spec), length(df.spec)))
+for (n in n.spec) 
+  for (df in df.spec)
+    tmp <- sim(rt, n, n.iter, prefix=paste(""DF ="", df, """"), df=df)
+
+ +

+ +

A little study of this tableau provides good intuition about power. I would like to draw attention to its most salient and useful aspects:

+ +
    +
  • As the degrees of freedom reduce from $\nu=64$ on the left to $\nu=1$ on the right, more and more of the P-values are small, showing that the power to discriminate these distributions from a Normal distribution increases. (The power is quantified in each plot title: it equals the proportion of the histogram's area that is red.)

  • +
  • As the sample size increases from $n=5$ on the top row to $n=20$ on the bottom, the power also increases.

  • +
  • Notice how as the alternative distribution differs more from the null distribution and the sample size increases, the P-values start collecting to the left, but there is still a ""tail"" of them stretching all the way to $1$. This is characteristic of power studies. It shows that testing is a gamble: even when the null hypothesis is flagrantly violated and even when our sample size is reasonably large, our formal test may fail to produce a significant result.

  • +
  • Even in the extreme case at the bottom right, where a sample of $20$ is drawn from a Student t distribution with $1$ degree of freedom (a Cauchy distribution), the power is not $1$: there is a $100 - 86.57 \approx 13\%$ chance that a sample of $20$ iid Cauchy variates will not be considered significantly different from Normal at a level of $5\%$ (that is, with $95\%$ confidence).

  • +
  • We could assess the power at any value of $\alpha$ we choose by coloring more or fewer of the bars on these histograms. For instance, to evaluate the power at $\alpha=0.10$, color in the left two bars on each histogram and estimate its area as a fraction of the total.

    + +

    (This won't work too well for values of $\alpha$ smaller than $0.05$ with this figure. In practice, one would limit the histograms to P-values only in the range that would be used, perhaps from $0$ to $20\%$, and show them in enough detail to enable visual assessment of power down to $\alpha=0.01$ or even $\alpha=0.005$. (That is what the breaks option to sim is for.) Post-processing of the simulation results can provide even more detail.)

  • +
+ +
+ +

It is amusing that so much can be gleaned from what, in effect, amounts to three lines of code: one to simulate i.i.d. samples from a specified distribution, one to apply that to an array of null distributions, and the third to apply it to an array of alternative distributions. These are the three steps that go into any power analysis: the rest is just summarizing and interpreting the results.

+",2013-10-30 16:21:55.193 +58513,23177.0,1,,,,Correlated features produce strange weights in Logistic Regression,,CC BY-SA 3.0,"

I have a data set with highly positively correlated features that I'm classifying with LR. AFAIK correlated weights are not a problem in the same way they are in Naive Bayes - overcounting will not occur with LR.

+ +

The strange thing that I'm seeing is that some of the highly correlated features assume opposite weights: feature A might be highly positive and feature B highly negative, though not as much. Is this a symptom of something going wrong with the optimization, or is this expected? (A priori, I expect both A and B to be positive class indicators.)

+",2013-10-30 16:29:34.377 +58514,3999.0,2,,58505.0,,,,CC BY-SA 3.0,"

It's additive on the log scale.

+ +

The natural log of your odds ratio is 0.07696 (this should be the estimate from your model). As such, the odds ratio of going from 0 to 7 is exp(0.07696*7) = exp(0.53872) = 1.71
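
+ +

Just to spell the arithmetic out (a quick check, not a new analysis):

+ +
log(1.08)            # 0.07696..., the coefficient on the log-odds scale
+exp(7 * log(1.08))   # 1.71...
+1.08^7               # identical: the per-unit odds ratio multiplied seven times
+
+ +

So the per-unit odds ratios multiply (equivalently, the log-odds add); the percentages themselves do not simply add.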

+",2013-10-30 16:37:02.997 +58515,23180.0,1,,,,Econometrics - choosing the best model when removing variables,,CC BY-SA 3.0,"

So I am looking into a regression model that is supposed to predict the value of a house based on numerous independent variables. What I don't quite understand is how to select the ""best"" model when eliminating insignificant variables. +The original model contained four independent variables, of which three turned out to be insignificant (p-value > 0.05). So I removed, say, X1, which had the largest p-value (small t-stat). The new model containing three variables clearly appeared to be better, as Adjusted R2 rose (I understand Adjusted R Square is suitable for comparing models with different numbers of parameters). The F-tests also seemed to confirm that removing X1 was the right thing to do due to its insignificance. +However, in the three variable model there was still one insignificant variable, X3. So I removed it as well. Now, in the two variable model, both variables were reliable (small p-values), but Adjusted R Square was actually somewhat lower than that of the previous model. F-tests seemed to indicate that X3 had a small enough impact on the model to be eliminated. +Which is actually the better model, statistically and economically? The three variable model with the highest Adjusted R Square, or the two variable model with no insignificant variables but lower Adjusted R2? I couldn't quite figure out the logic behind this.

+",2013-10-30 16:41:10.440 +58516,1889.0,2,,58475.0,,,,CC BY-SA 3.0,"

You had two questions

+ +
+

Question 1: What is the probability of an event where $X_i$ finishes first.

+
+ +

Your proposed solution looks correct but, as you say, you clearly have an error in the implementation, as the probabilities do not add up to $1$. M. Berk has shown that it is likely to be an issue with Swimmer 2.

+ +
+

Question 2: If I calculate this for all swimmers, can I simply order the results to determine the most probable finishing order?

+
+ +

Not quite - if you have two swimmers with the same mean time in the middle of the group then the chance of one beating the other is $\frac12$, but the one with the higher standard deviation is more likely to be the overall winner: in your particular example, the chance of $X_2$ beating $X_5$ is $0.5$ but $X_5$ is more likely than $X_2$ to be the overall winner, so $X_5$ is less likely than $X_2$ to be third because of its higher standard deviation.

+ +
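
As a quick check of that pairwise claim (a one-line sketch I am adding), the difference of the two times is normal with mean $61-61=0$:

+ +
pnorm(0, mean = 61 - 61, sd = sqrt(1^2 + 1.7^2))   # = 0.5, so P(X2 beats X5) is one half
+
+ +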

Using simulation, the most likely finishing order is $X_3,X_1,X_2,X_5,X_4$ (with a probability of about 9.0%) above $X_3,X_1,X_5,X_2,X_4$ (about 8.0%), $X_1,X_3,X_2,X_5,X_4$ (about 6.1%), $X_3,X_2,X_5,X_1,X_4$ (about 5.8%), $X_1,X_3,X_5,X_2,X_4$ (also about 5.8%), $X_3,X_5,X_1,X_2,X_4$ (about 4.5%) and other less likely outcomes. Your idea would have predicted $X_3,X_1,X_5,X_2,X_4$, only the second most likely outcome.

+",2013-10-30 16:46:19.640 +58517,22049.0,1,,,,Finding similarities using Wavelet transform,,CC BY-SA 3.0,"

I have a time series and I want to find similarities in it. As a first step I have calculated Haar wavelet coefficients for this time series, and now I don't know exactly how I should continue.

+ +

Should I extract features from this transformed data to find similarities? How can I do that?

+",2013-10-30 16:52:29.967 +58534,13165.0,1,58541.0,,,Why semi/nonparametric models?,,CC BY-SA 3.0,"

Increasing the flexibility of models makes them prone to overfitting. On the other hand, it seems to me that if the function class $\mathcal{F}$ is too big, it is hard to prove bounds on empirical risk and that sort of thing. That's why I am questioning the necessity/importance/applicability of non-parametric models.

+ +

Here by nonparametrics I mostly mean Dirichlet Processes and Beta Processes (and related families).

+ +

Any comments?

+",2013-10-30 19:13:32.363 +58518,23179.0,1,,,,Similarity distance score to remove outliers for survey data,,CC BY-SA 3.0,"

I'm still a beginner at data mining. I'm working on finding the association rules from hypothesis X to conclusion Y. To this end, I've conducted a survey with questions that go something like this:

+ +
Q1: Do you have any relatives with ABC property? (Ans: yes or no)
+Q2: Do you have an interest in XYZ field? (Ans: yes or no)
+Q3: Which institute are you from? (Ans: Option 1, 2, 3, 4 or 5)
+
+ +

and so on.. So there are lots of ""parameters"" or ""features"" or ""dimensions"" to my data.

+ +

My data is formatted similar to this: http://www.hakank.org/weka/weather.arff, and I'll be using WEKA.

+ +

However I'm still currently in the data pre-processing stage. Removing duplicate entries and dealing with missing values is no issue. What I'm worried about is removing outliers.

+ +

Firstly, how can I represent this type of record data in such a way that similarity measures like Euclidean or Minkowski (or perhaps any!) distance can even be applied to it?

+ +

And secondly, what's the most reasonable similarity measure to use in this type of case? I've looked at the Mahalanobis distance and it seems useless for my project because I have no ""ideal"" set of features against which I could compare other sections of data. Is it usual to even need to detect outliers before finding association rules? Or are outliers usually detected after the rules have been learned?

+ +

I've been thinking about this for a while but can't seem to reach a sensible conclusion. Could any of the more experienced data miners help please?

+",2013-10-30 16:52:48.587 +58519,22637.0,1,58647.0,,,Order Statistics-Expected Value of Random Length,,CC BY-SA 3.0,"

Let $Y_1<Y_2 $ denote the order statistics of a random sample of size 2 from a distribution that is $N\left( \mu,\sigma^2 \right) $, where $\sigma^2$ is known. Compute the expected value of the random length $Y_2-Y_1$.

+ +

I can see that the answer is $\frac{2\sigma}{\sqrt{\pi}}$ but I do not know how to get there since I cannot evaluate the double integral:

+ +

$$ \int_{-\infty}^{\infty} \int_{-\infty}^{y_2} \left( y_2-y_1 \right) \frac{1}{2\pi \sigma^2} exp \left\{ -\frac{1}{2\sigma^2}\left[ \left( y_1-\mu \right)^2 +\left( y_2-\mu \right)^2 \right]\right\} \mathrm {dy_1 dy_2}$$

+ +

Any ideas on how to compute this are greatly appreciated, thank you!

+",2013-10-30 16:53:13.367 +58520,23082.0,1,58559.0,,,Multivariate normal density function,,CC BY-SA 3.0,"

I am trying to compute multivariate normal distributions at some points.

+ +

I am using Matlab's mvnpdf function: y = mvnpdf(X,MU,SIGMA)

+ +

The first argument is the point where I compute the density, MU is the mean and SIGMA the covariance.

+ +

I am puzzled by the following result:

+ +
 mvnpdf([0 0 0],[0 0 0],0.001*eye(3))
+
+ans =
+
+   2.0078e+03
+
+>> mvnpdf([0 0 0.002],[0 0 0],0.001*eye(3))
+
+ans =
+
+   2.0038e+03
+
+ +

I am going at $2\sigma^2$ from the mean and the density is almost the same? +Shouldn't the result be close to zero?

+",2013-10-30 16:55:59.407 +58521,22682.0,1,58562.0,,,Expectation of Quotient of Sums of IID Random Variables (Cambridge University worksheet),,CC BY-SA 3.0,"

I'm preparing for an interview which requires a decent knowledge of basic probability (at least to get through the interview itself). I'm working through the sheet below from my student days as revision. It's mostly been fairly straightforward, but I am completely stumped on question 12.

+ +

http://www.trin.cam.ac.uk/dpk10/IA/exsheet2.pdf

+ +

Any help would be appreciated.

+ +

Edit: the question is:

+ +

Suppose that $X_1, X_2, ... $ are independent identically distributed positive random variables with $\mathbb{E}(X_1) = \mu < \infty$ and $\mathbb{E}(X_1^{-1}) < \infty$. Let $S_n = \sum_{i=1}^n X_i$. Show that $\mathbb{E}(S_m/S_n) = m/n$ when $m \le n$, and $\mathbb{E}(S_m/S_n) = 1 + (m-n)\mu\,\mathbb{E}(S_n^{-1})$ when $m \ge n$.

+ +

In fact, in the process of typing this up, I've solved the second part.

+ +

For $m \ge n$, $\mathbb{E}(S_m/S_n) = \mathbb{E}\left(\dfrac{X_1+ \dots +X_m}{X_1+ \dots +X_n}\right)$

+ +

$=\mathbb{E}(1 + (X_{n+1} + ... + X_m)/(X_1 + ... + X_n)) $

+ +

and the numerator and denominator of the ratio above are clearly independent, so:

+ +

$ = 1 + \mathbb{E}(X_{n+1} + ... + X_m)\mathbb{E}(S_n^{-1})$

+ +

and we obtain the desired result.

+ +

I'm still stuck on the first part though.

+",2013-10-30 17:07:39.457 +58522,23181.0,2,,27194.0,,,,CC BY-SA 3.0,"

In fact, Gujarati states: ""Today, however, the term multicollinearity is used in a broader sense to include the case of perfect multicollinearity, (...), as well +as the case where the X variables are intercorrelated but not perfectly so"" and proceeds to give a definition in correspondence to RockScience's definition. So, my guess would be that the two terms are related to each other by the argument given by him.

+",2013-10-30 17:09:56.460 +58523,13165.0,1,,,,Which toolbox for Belief Propagation and other inference methods in graphical models?,,CC BY-SA 3.0,"

Which open-source software (toolbox) do you think is the best for modelling graphical models (e.g. factor graphs), and doing inference on them? (the language doesn't matter)

+",2013-10-30 17:19:54.253 +58524,23182.0,1,,,,Which values to use for scaling out-of-sample PCA data?,,CC BY-SA 3.0,"

I have centered and scaled inputs via prcomp ():

+ +
prOut<-prcomp(trainSet[,2:4],scale = TRUE,scores=TRUE)
+
+ +

I now want to use my completed model on new (future) data. +I assume the correct approach is to use the training data prOut$scale and prOut$center values and apply them prior to calculating the principal component scores for my new data?

+ +
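
To make it concrete, something like this is what I have in mind (newData is just a placeholder name for the future data set):

+ +
newScaled <- scale(newData[, 2:4], center = prOut$center, scale = prOut$scale)
+newScores <- newScaled %*% prOut$rotation
+
+# or, letting prcomp apply its stored centering and scaling:
+newScores2 <- predict(prOut, newdata = newData[, 2:4])
+
+ +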

It doesn't seem right to scale the new data using the scale and center values for the new data set.

+",2013-10-30 17:27:52.937 +58525,19996.0,2,,58513.0,,,,CC BY-SA 3.0,"

It is possible you are up against collinearity here (I'm assuming that when you say ""correlated"" you mean positive correlation, otherwise the positive/negative difference may make sense). In any case, caution should be used when confronting collinearity in logistic regression. Parameter estimates are often difficult to obtain and unreliable. Of course, this depends on how highly correlated your predictors are. To rule out collinearity, you might want to check something like the Variance Inflation Factor.

+ +
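
For example, something along these lines (a sketch with simulated data; the 'car' package is an assumption on my part):

+ +
library(car)
+set.seed(1)
+n <- 500
+A <- rnorm(n)
+B <- A + rnorm(n, sd = 0.1)              # highly correlated with A
+y <- rbinom(n, 1, plogis(A + B))
+fit <- glm(y ~ A + B, family = binomial)
+vif(fit)   # values much above roughly 5-10 are usually taken as a warning sign
+
+ +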

If your variables have a high correlation coefficient, but are not truly collinear, then it still isn't incredibly surprising to get the opposite sign behavior you observe (I say this without knowing more details of your problem), depending on what other variables are in your model. Remember that fitting an LR model fits all variables simultaneously to the outcome, so you typically have to interpret the weights as a whole. They may be correlated with each other, but have opposite effects in predicting an outcome, especially if grouped with other variables.

+",2013-10-30 17:46:57.703 +58526,10756.0,1,,,,Algorithms for keyphrase clustering,,CC BY-SA 3.0,"

Are there any standard algorithms for keyphrase clustering? There are several algorithms for keyphrase extraction from a corpus; for example, this publication reviews some of the popular keyphrase extraction algorithms. Examples of possible keyphrases extracted from a corpus of real-estate data would be 'house prices', 'car parking', 'foreclosure', 'victorian houses', etc. A clever algorithm can cluster 'hardwood floors' and 'hdwd flrs' together. Another example would be clustering 'basketball court' and 'b-ball court' together.

+ +

Are there any algorithms to group semantically similar keyphrases together?

+",2013-10-30 18:03:37.327 +58527,15827.0,2,,58494.0,,,,CC BY-SA 3.0,"

Imagine that a variable takes values 0 and 1 with probability both 0.5. Sample from that distribution and most of the medians will be 0 or 1 and a very few exactly 0.5. The means will vary far less. The mean is much more stable in this circumstance.

+ +
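
In case it is useful, here is a minimal sketch of that bootstrap comparison (my own addition; the exact numbers vary with the seed):

+ +
set.seed(1)
+x     <- rep(0:1, each = 500)                       # 1000 values, half 0 and half 1
+boots <- replicate(10000, sample(x, replace = TRUE))
+sd(apply(boots, 2, mean))     # roughly 0.016: the bootstrap means barely move
+sd(apply(boots, 2, median))   # roughly 0.49: the medians flip between 0 and 1
+
+ +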

Here is a sample graph of results. The plots are quantile plots, i.e. ordered values versus plotting position, a modified cumulative probability. The results are for 10,000 bootstrap samples from 1000 values, 500 each 0 and 1. The means range fortuitously but nicely from 0.436 to 0.564 with standard error 0.016. The medians are as said, with standard error 0.493. (Closed-form results are no doubt possible here too, but a graph makes the point vivid for all.)

+ +

+ +

But that is exceptional. It illustrates the least favourable case for medians, a symmetric bimodal distribution such that the median is likely to flip between different halves of the data. However, symmetric bimodal distributions are not especially common, but watch out for so-called U-shaped distributions in which the extremes are most common and intermediate values uncommon. Distributions that are unimodal, or in which the number of modes has only a small effect on median or mean, are more common.

+ +

As advised by every treatment of robust statistics, a very common situation is that your data come with tails heavier than Gaussian and/or with outliers, and in those circumstances median will almost always be more robust. The point is that that is not a universal general result.

+ +

All that said, what relevance is a general result? You can at a minimum establish by bootstrapping the relative variability of mean and median for your own data. That's what you care about.

+",2013-10-30 18:04:23.850 +58528,,2,,58507.0,user31668,,,CC BY-SA 3.0,"

Your approach seems reasonable, especially your choice to stratify your sampling. This will make it more efficient provided you can easily delineate the different industrial zones.

+ +

I don't have a book to recommend you, but you could model your uncertainty using the Poisson distribution, with $\lambda =$ the number of toxic waste sites per square kilometer. +You could carry out your sampling program as you described and then find the maximum likelihood estimator for $\lambda_{Ai}$, where $A$ is the area of a sampling sector in zone $i$. In particular, you would maximize the following formula wrt $\lambda_{Ai}$, where $N_i$ = the number of sectors sampled from zone $i$:

+ +

$\max\limits_{\lambda_{Ai}} \prod\limits_{j=1}^{N_i} \frac{e^{-\lambda_{Ai}}\lambda_{Ai}^{n_{ij}}}{n_{ij}!}$ where $n_{ij}$ is the number of toxic sites in sector $j$ of zone $i$. The value of $\lambda_{Ai}$ that maximizes the product is $\lambda_{Ai}^* = \frac{1}{N_i}\sum\limits_{j=1}^{N_i}{n_{ij}}$

+ +

You will get one estimate per zone, $\lambda_{Ai}^*$, which you can interpret as the frequency of toxic waste sites within a region of area $A_i$. Your uncertainty for the total number of sites in Zone $i$ with total area $A_{Ti}$ can be modeled using your estimated $\lambda_{Ai}^*$ in the Poisson distribution: $Poisson(\lambda_{Ai}^*\frac{A_{Ti}}{A_i})$.

+ +
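
A small numerical sketch of one zone (every number below is made up purely for illustration):

+ +
counts   <- c(0, 2, 1, 0, 3, 1)              # observed counts n_ij in the sampled sectors
+lambdaA  <- mean(counts)                     # MLE of sites per sector of area A_i
+A.sector <- 1                                # area of one sampled sector
+A.total  <- 250                              # total area of the zone
+lambdaT  <- lambdaA * A.total / A.sector     # Poisson mean for the whole zone
+qpois(c(0.025, 0.975), lambdaT)              # a rough 95% range for the zone total
+
+ +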

To get a country-wide estimate, you would need to combine the $\lambda_{Ai}^*$ into another Poisson distribution: Total No. of Sites ~ $Poisson(\sum\limits_{i=1}^{N_{zones}}\lambda_{Ai}^*\frac{A_{Ti}}{A_i})$.

+ +

Refinements

+ +

The above should get you a decent estimate. However, if your country is small enough that your sample will cover an appreciable portion of the total land area or of the area within a zone, then you should reduce the total area for each zone by the sampled area in the above formula, so you are modeling the uncertainty on the remaining area (which is actually more accurate in both cases); then you add this uncertainty to your actual counts in the areas you've sampled.

+ +

Also, you will notice that you're using a point estimate of $\lambda$. There is some uncertainty in the actual value of this quantity, but including it requires using extended likelihood for predicting a Poisson variable. The formula is pretty simple: if $Y_i$ is the total number of sites in zone $i$, then the likelihood function for $Y_i$ is:

+ +

$L(Y_i) = e^{-(N+1)\hat\theta(Y_i)}\frac{\hat\theta(Y_i)^{Y_i+\sum\limits_{j=1}^{N_i}{n_{ij}}}}{Y_i!}$ Where $\hat\theta(Y_i) = \frac{A_{Ti}}{A_i}(Y_i + \frac{\sum\limits_{j=1}^{N_i}{n_{ij}}}{N_i+1})$ You need to normalize this formula to sum to 1 over the range of relevant Y. To get the country-wide estimate, you would need to use Monte-Carlo simulation for the sum of the $Y_i$ from each area based on the above formula. There are a couple inexpensive/free simulators out there.

+",2013-10-30 18:22:03.667 +58529,10957.0,1,,,,pattern of ROC curve and choice of AUC,,CC BY-SA 3.0,"

I am using ROC curves and full AUC values to compare different models, using simulated data. Now I think I am confused with the interpretations of ROC curves and AUC values. Please see the figure below (sorry it is partial from screen shots...)

+ +

There are three models compared, and I know that the model shown in green should perform best of all. However, as you can see, the green curve is superior to the other two before the FPR reaches around 0.2. This cut-off of 0.2 is quite interesting: it is the percentage of differentially expressed genes that I specify in my simulation (i.e. 20% of the observations are simulated to be positives).

+ +

My concerns are:

+ +
    +
  1. given that people in reality will seldom choose a FPR cut-off of 0.5 or higher, why would people prefer a ROC curve with FPR ranging from 0 to 1 and use the full AUC value (i.e. calculate the entire area under the ROC curve), instead of just reporting the area from, say, 0 to 0.25 or 0 to 0.5? Is that called ""partial AUC""?

  2. +
  3. in the figure below, what can we say about the performances of the three models? The AUC values are: green (0.805), red (0.815), blue (0.768). The red curve turns out to be superior, but as you see, the superiority is only reflected after FPR > 0.2. Thanks :)

  4. +
+ +

+",2013-10-30 18:25:16.317 +58530,3999.0,2,,58119.0,,,,CC BY-SA 3.0,"

The answer, in my mind, is ""Yes and No, and if Yes then this isn't actually interesting"".

+ +

For variables with no dependency structure, the answer is no - a very high, or very low, value of a particular variable doesn't imply a very high, or very low value of another variable.

+ +

For variables with a dependency structure, the answer is often (but not always) yes. But this isn't a property of ""outlier-ness"", it's a property of correlation itself. What you're showing is not that ""Being an outlier begets being an outlier in other areas"" but that two associated variables behave exactly like associated variables should.

+",2013-10-30 18:26:09.790 +58531,7007.0,2,,58484.0,,,,CC BY-SA 3.0,"

Suppose that each symbol of the password occupies one of $8$ numbered boxes. First you choose $4$ of the $8$ boxes to put the letters in, and each choice gives you $26^4$ possible letter configurations. Now, in the remaining $4$ boxes you put the digits, and each choice gives you $10^4$ possible digit configurations. Therefore, the total number of passwords is +$$ + \binom{8}{4} \times 26^4 \times \binom{4}{4} \times 10^4 \, . +$$ +For the password with the letters all equal and one digit $8$, from the $8$ boxes you choose $4$ to put the letters in, and each choice gives you $26$ possible letter configurations. From the remaining $4$ boxes you choose $3$ to put digits in, and each choice gives you $10^3$ digit configurations. In the last remaining box you put the digit $8$, which gives us +$$ + \binom{8}{4} \times 26 \times \binom{4}{3} \times 10^3 \times \binom{1}{1} \times 1 +$$ +passwords. I'm supposing that we can have more than one digit $8$.

+ +

P.S. For Huber's simplified problem, the number of possible passwords is +$$ + \binom{4}{2} \times 1 \times \binom{2}{2} \times 2^2 = 24 \, . +$$

+",2013-10-30 18:30:41.327 +58532,18331.0,1,58551.0,,,"Why do they call it ""sampling distribution of the sample mean""?",,CC BY-SA 3.0,"

Ok, I understand that there is a true population mean and one that I get from the sample. It is different for every sample and, thus, I can build the distribution of the sample means. I arrive at a distribution of sample means. But why is it a sampling distribution? Is the whole point that the moniker must be longer than necessary? What do I lose if I omit the extra qualifier and get away with distribution of sample mean alone?

+",2013-10-30 18:44:55.693 +58533,5045.0,2,,58513.0,,,,CC BY-SA 4.0,"

This might be a case of ceteris paribus confusion. It's hard to know without knowing more about your analysis.

+ +

Example 5 from Peter Kennedy might be relevant here:

+ +
+

In a linear regression of racehorse auction prices on various + characteristics of the horse and information on its sire (father) and + dam (mother), Robbins and Kennedy found that although the estimated + coefficient on dam dollar winnings was positive, the coefficient on + number of dam wins was negative, suggesting that yearlings from dams + with more race wins are worth less. This wrong sign problem is + resolved by recognizing that the negative sign means that holding dam + dollar winnings constant, a yearling is worth less if its dam required + more wins to earn those dollars.

+
+",2013-10-30 19:01:50.803 +58535,9554.0,2,,58529.0,,,,CC BY-SA 3.0,"

Usually, it will be your application that will determine whether your focus is on precision or recall.

+ +

@2 +These will be dramatically different in the medical field, where you will often tolerate having a bad precision for the sake of a very good recall, when it comes to prevention, i.e. you prefer to label a lot of healthy people as sick and make additional tests, rather than to let someone die (here sickness is considered ""relevant"", and labeled as sick ""retrieved"").

+ +

On the other hand, in production, you can tolerate a certain quota of bad apples and you might prefer a test that does not catch all the faulty products but is much more precise in identifying the bad apples - usually the costs associated with inspecting the items can not be disregarded. This corresponds to a high precision, and low recall scenario.

+ +

For your models, either you know what you need and pick a better model for that purpose, or you pick the one with better AUC. Of course there are also other things you might take into consideration, such as, which model is more parsimonious (has fewer explanatory variables), where are the assumptions better met, etc.

+ +

@1 +I don't see the advantage of putting less information in a plot, especially if it could be misleading. (unless you work in marketing)

+",2013-10-30 19:20:43.900 +58536,21346.0,1,58537.0,,,Integral in Stata when upper limit is infinity,,CC BY-SA 3.0,"

How can we calculate the integral of the integrand with the lower limit 0 and upper limit infinity in Stata? I am aware of the integ command, but I am not sure whether I can use that when the upper limit is infinity.

+",2013-10-30 19:41:33.683 +58537,5045.0,2,,58536.0,,,,CC BY-SA 3.0,"

You can use integrate from SSC to do numerical integration for one dimensional functions like this:

+ +
. integrate, f(normalden(x)) l(.) u(.)
+
+    Note: The function to be integrated will be compiled using Mata and stored in your personal directory ~/ado/personal/ (make sure this is writeable)
+
+The integral = 1
+
+",2013-10-30 20:07:49.170 +58538,17660.0,2,,58515.0,,,,CC BY-SA 3.0,"

From what I can tell, it generally depends on your purpose. If you are trying to infer causality, then you need to choose variables that specify some structure that actually has some theory behind it. Do you actually have good, intuitive reasons for including one variable or the other? (However, this is useless without some sort of identification strategy to disentangle other effects and biases.)

+ +

On the other hand, if your goal is only to create a model with predictive power, it seems like in general you want to choose variables that make significant contributions to the fit of the model. For prediction and descriptive analysis, it seems to me like different fields have different rules. See this discussion about hierarchical and stepwise regressions for an example of different kinds of rules. In the case you have described above, there isn't a hard rule to determine which model to choose.

+",2013-10-30 20:08:39.210 +58539,23187.0,1,,,,Nonnegativity of model optimism,,CC BY-SA 3.0,"

The following sounds like a very basic question in learning theory to me, so I am hoping someone can point out the obvious.

+ +

Efron's ""expected optimism"" is the expected difference between the prediction and training errors. Efron (2004) shows that for a wide range of loss functions (the ""q-family""), and virtually all modeling approaches, the expected optimism in modeling the mean of a vector is proportional to the sum of covariances of fitted and observed components. It thus seems natural that any sensible modeling approach would have non-negative expected optimism (because one tries at least in part to minimize the training error).

+ +

I wonder if a more general result exists, though. Suppose that we have an arbitrary optimization criterion (not necessarily a loss in Efron's q-family, maybe not even convex/differentiable/separable; just a function of the data and a tunable parameter). We minimize this criterion on the training data over all possible parameter values in some feasible set. Is the expected minimum obtained this way (where the expectation is taken over all training sets) necessarily an underestimate of the expected criterion value applied to new data, unseen during training (where now the expectation is over all training and all iid testing datasets, as in Efron's definition of optimism)?

+ +

In other words, is the expected optimism of a sensible modeling approach always non-negative? Or maybe one has to really require something of the distribution of the data and/or know something about the criterion?

+ +

Counter examples may also be enlightening.

+",2013-10-30 20:18:03.027 +58540,13165.0,1,,,,Why beta process is useful ?,,CC BY-SA 3.0,"

Why is the Beta process useful/important? How is it different from the Dirichlet process?

+",2013-10-30 20:29:01.840 +58541,5821.0,2,,58534.0,,,,CC BY-SA 3.0,"

Your first sentence is not necessarily correct. First off, an increase in the number of parameters does indeed increase the degrees of freedom and the standard errors of point estimates (hence their degree of generalization). An example of this is the nested classes of Exponential and Weibull models. It is not universally agreed that ""model complexity"" necessarily means the size of the parameter space, but it is a good place to start for discussion.

+ +

Semiparametric and nonparametric inference make overfitting a nonissue by generalizing the likelihood function into a new type of function where such extraneous parameters are ancillary. The only caveat is that the statistician has to correctly identify such models. Examples of such extended likelihoods are conditional likelihood (in mixed modeling), partial likelihood (in Cox models), pseudo likelihood (forgetting some applications for that...), profile likelihood, quasilikelihood (and the list goes on). The parameter spaces for such likelihoods are seen as projections of high (possibly infinite) dimensional (compact) parameter spaces.

+ +

It's only in fully parametric inference where every causal relationship needs to be specified, such as the correlation structure for teeth within a mouth, or the correlation between failure times in a prospective cohort among denominators of individuals counted more than once. Many of these likelihoods are overly complex or intractable hence inference about them is non-existent or otherwise not popular.

+ +

Modeling processes is a fully parametric endeavor. You must be able to simulate data from an estimated data generating mechanism. SP/NP often cannot achieve this: they can neither produce fitted effects nor claim to simulate realizations from any data generating process. SP/NP focuses on the point estimation of a specific parameter and efficiently calculates estimates and standard errors for that parameter, cancelling out all other parameters in the data generating process through conditioning, estimating them as nuisance parameters, or some other process.

+ +

SP/NP inference examples are the log-rank test (NP), the plain vanilla asymptotic t-test without normality assumptions (NP), conditional logistic regression (SP), generalized estimating equations (GEE), and Cox proportional hazards models (SP).
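
+ +

To make the Cox example concrete, a minimal R sketch with the survival package: the baseline hazard is never specified (it is eliminated via the partial likelihood), yet you still get estimates and standard errors for the covariate effects:

+ +

library(survival)
+# semiparametric: only the covariate effects are parametrized,
+# the baseline hazard is left completely unspecified
+fit <- coxph(Surv(time, status) ~ age + sex, data = lung)
+summary(fit)   # hazard ratios with standard errors, no fitted baseline required
+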

+ +

Examples where semi-parametric inference breaks down is in the case of missing at random data (as opposed to missing completely at random data), where the value of some observed outcome or covariate depends on the things which we deemed to be ancillary (such as informative censoring in Cox models). A fully likelihood based survival analysis would require separate models (and their correlation) for survival and censoring outcomes.

+",2013-10-30 20:30:32.093 +58542,19043.0,1,58543.0,,,Significant ANOVA interaction but non-significant pairwise comparisons,,CC BY-SA 3.0,"

I ran a two-way ANOVA and got a significant interaction. I ran a TukeyHSD() post-hoc test in R and no pairwise comparisons were significant. Is this an error on my part? My adviser insists that this is not possible. If it is possible, why does this happen?

+",2013-10-30 20:32:52.157 +58544,4656.0,2,,43458.0,,,,CC BY-SA 3.0,"

Given a data set $\{y_1, y_2, \ldots, y_n\}$ with $n$ entries, the +mean $\mu$ satisfies +$n\mu = \sum_{i=1}^n y_i, $ while the mean of the expurgated set +$\{y_1, y_2, \ldots, y_{n-1}\}$ is +$$\hat{\mu} = \frac{1}{n-1}\sum_{i=1}^{n-1} y_i += \frac{1}{n-1}\left(n\mu - y_n\right)$$ +which will equal $\mu$ exactly if and only if the deleted +entry $y_n$ equals $\mu$. Thus, deleting an entry from a data +set will change the mean unless the point deleted happens to +equal the mean of the original data set.

+ +

The variance $\sigma^2$ of the original data set +satisfies $(n-1)\sigma^2 = \sum_{i=1}^{n}(y_i-\mu)^2$. +If we delete an entry (say $y_n$) which happens +to have value $\mu$ (so that the mean remains the same), +then the expurgated data set has variance +$$\begin{align} +\hat{\sigma}^2 &= \frac{1}{n-2}\sum_{i=1}^{n-1} (y_i-\mu)^2\\ +&= \frac{1}{n-2}\sum_{i=1}^{n} (y_i-\mu)^2 &\text{since}~ y_n-\mu = 0,\\ +&= \frac{n-1}{n-2}\sigma^2\\ +&> \sigma^2. +\end{align}$$ +Thus, our effort to preserve the mean necessarily increases the variance.
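
+ +

A quick numerical check of this in R (the deleted entry is constructed to equal the mean, so the mean is preserved and the variance grows by exactly $(n-1)/(n-2)$):

+ +

set.seed(42)
+y <- rnorm(19)
+y <- c(y, mean(y))        # append one entry exactly equal to the mean; now n = 20
+y_del <- y[-20]           # delete that entry again
+mean(y) - mean(y_del)     # 0: the mean is unchanged
+var(y_del) / var(y)       # 19/18, i.e. (n-1)/(n-2): the variance has increased
+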

+",2013-10-30 21:31:45.690 +58545,20286.0,2,,58515.0,,,,CC BY-SA 3.0,"

If your goal is prediction rather than trying to infer causality, there is no need to remove any predictor variables, as @Michael Mayer has already commented, unless there is a significant cost to obtaining the values of some of them. And although your predictor variables may be the ""independent"" variables in your analysis, it's likely that many are correlated to each other (e.g., number of bedrooms, number of bathrooms, size of lot). In those cases trying to remove variables that are ""insignificant"" in analysis of a particular sample may lead to weird, counter-intuitive results that do not generalize well for future predictions.

+ +

If you do need to remove predictor variables for some reason, follow specific defined methods for hierarchical/stepwise analyses as suggested by @jmbejara, rather than trying to make up the rules as you go. These methods use better ways to compare models than the adjusted R-squared values, and they are available in R and other statistical analysis software.

+",2013-10-30 21:34:28.380 +58546,22682.0,2,,58521.0,,,,CC BY-SA 3.0,"

Thanks to whuber for the hint for the first part.

+ +

Consider $nS_m/S_n$ for the case $m \le n$.

+ +

We have $\mathbb{E}(nS_m/S_n) = \mathbb{E}\left(\frac{nX_1 + \cdots + nX_m}{X_1 + \cdots + X_n}\right)$

+ +

$= \mathbb{E}\left(\frac{nX_1}{X_1 + \cdots + X_n}\right) + \cdots + \mathbb{E}\left(\frac{nX_m}{X_1 + \cdots + X_n}\right)$

+ +

and by the iid property, this is equal to:

+ +

$m\,\mathbb{E}\left(\frac{X_1+ \cdots + X_n}{X_1+ \cdots + X_n}\right) = m$

+ +

Therefore $\mathbb{E}(S_m/S_n) = m/n$ for $m \le n$.
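
+ +

A quick Monte Carlo sanity check of this result in R (exponential variables are used purely as an example of iid positive $X_i$):

+ +

set.seed(1)
+m <- 3; n <- 10
+ratios <- replicate(1e5, {
+  x <- rexp(n)             # iid positive random variables
+  sum(x[1:m]) / sum(x)     # S_m / S_n with m <= n
+})
+mean(ratios)               # approximately m/n = 0.3
+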

+",2013-10-30 21:38:05.033 +58547,21762.0,2,,58532.0,,,,CC BY-SA 3.0,"

For a given data set, the sample mean provides a single estimate of the population mean. This estimate is a constant and thus its distribution is rather boring.

+ +

In contrast, the sampling distribution of the mean refers to the frequentist approach of considering the distribution of the sample means between many hypothesized samples drawn from the same population.

+ +

So it kind of makes sense to use a 'new' word.

+",2013-10-30 21:55:27.957 +58548,23191.0,1,,,,Negative Binomial Regression and Heteroskedasticity test,,CC BY-SA 3.0,"

I am running a negative binomial regression in Stata and would like to know if I need to include the vce(robust) option in the model. I know the negative binomial regression is the best for the data I have. I've combed through so many sources and it seems like negative binomials already seem to take care of some heteroskedasticity, but I would really like a test that would let me know if there is still some in the model that needs to be taken care of via the vce(robust) option. I know there is the hettest but I can only use that in a regress model, not a nbreg (negative binomial regression) model. I also saw this advice:

+ +

""You could try plotting the absolute value of Pearson residuals from your NB regression against covariates (at least, covariates you think might affect the overdispersion) or against the fitted means. If there's a discernible trend, this suggests non-constant variance.""

+ +

But what constitutes 'a discernible trend'? How do I know if that discernible trend would be significantly better taken care of by the vce(robust) option or it wouldn't matter?

+",2013-10-30 22:12:17.233 +58549,21497.0,1,,,,Estimating if a distribution is significantly less conserved than others when one group is involved.,,CC BY-SA 3.0,"

I am looking at levels of genes in a dataset and want to identify genes that do not vary much in terms of their expression level. While I can do this using the coefficient of correlation, calculating the covariance, or looking for the number of genes within the bottom x percent of genes using median absolute deviation, those methods appear to be arbitrary.

+ +

What I am interested in is defining a cutoff based on P.values - is there any way of finding out which genes show significantly less variability than would be expected by chance without having external controls to compare it to?

+ +

Cheers, +Ankur.

+",2013-10-30 22:13:04.033 +58550,11197.0,2,,58481.0,,,,CC BY-SA 4.0,"

Yes, there is a package adegenet. For example:

+ +
library(adegenet)
+data(dapcIllus)
+x <- dapcIllus$a
+grp <- find.clusters(x, max.n.clust = 40)
+dapc1 <- dapc(x, grp$grp)
+scatter(dapc1)
+
+ +

+ +

For more information read this.

+",2013-10-30 22:14:01.137 +58551,668.0,2,,58532.0,,,,CC BY-SA 3.0,"

Within a particular setting where the type of distribution is known or implied, ""distribution of the sample mean"" works just fine. But in general would the ""distribution of the sample mean"" be its sampling distribution, a bootstrap distribution, a permutation distribution, or perhaps something else?

+ +

The existence of different kinds of distribution of a sample statistic requires some linguistic method of disambiguation. Without that, you lose precision and perhaps miscommunicate with your audience.

+",2013-10-30 22:14:58.057 +58552,23192.0,1,58558.0,,,Random walk on simplex as part of Metropolis-Hastings,,CC BY-SA 3.0,"

I would like to perform a random walk on a J-dimensional simplex. However, since this is part of a metropolis-hastings algorithm application, my understanding is that the steps need to be drawn from a symmetric distribution (is this correct?)

+ +

I was wondering if there is a standard/established way to approach this.

+ +

Any help/pointers are greatly appreciated!

+ +

Noushin

+",2013-10-30 22:25:32.417 +58553,2958.0,2,,58524.0,,,,CC BY-SA 3.0,"

Yes, you should use the center and scaling from the prcomp object.

+ +

An easy way to realize that center and variance of the new data set is not suitable is:
+assume the new data set consists of one row only. Its center would be the case itself and a scaling cannot be calculated.
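
+ +

A small R sketch of the recommended approach: keep the centring and scaling inside the prcomp object and let predict() apply them to the new cases (train and newdata are placeholders):

+ +

p <- prcomp(train, center = TRUE, scale. = TRUE)   # centring/scaling stored in p
+scores_new <- predict(p, newdata = newdata)        # reuses p$center and p$scale
+# equivalent by hand:
+scores_new2 <- scale(newdata, center = p$center, scale = p$scale) %*% p$rotation
+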

+",2013-10-30 22:25:36.973 +58554,2958.0,2,,58529.0,,,,CC BY-SA 3.0,"

I agree with your concerns.

+ +
+

given that people in reality will seldom choose a FPR cut-off of 0.5 or higher, why people would prefer a ROC curve with FPR ranging from 0 to 1 and use the full AUC value (i.e. calculate the entire area under the ROC curve) instead of just reporting the area made from, say, 0 to 0.25 or to 0.5? Is that called ""partial AUC""?

+
+ +
  • I'm a big fan of having the complete ROC, as it gives much more information than just the sensitivity/specificity pair of one working point of a classifier.
  • For the same reason, I'm not a big fan of summarizing all that information even further into one single number. But if you have to do so, I agree that it is better to restrict the calculations to parts of the ROC that are relevant for the application.
+ +
+

in the figure below, what can we say about the performances of the three models? The AUC values are: green (0.805), red (0.815), blue (0.768). The red curve turns out to be superior, but as you see, the superiority is only reflected after FPR > 0.2. Thanks :)

+
+ +
  • That depends entirely on your application. In your example, if high specificity is needed, then the green classifier would be best. If high sensitivity is needed, go for the red one.
+ +

As to the comparison of classifiers: there are lots of questions and answers here discussing this. Summary:

+ +
  • classifier comparison is far more difficult than one would expect at first
  • not all classifier performance measures are good for this task. Read @FrankHarrell's answers, and go for so-called proper scoring rules (e.g. Brier's score/mean squared error).
+",2013-10-30 22:35:14.527 +58555,22974.0,1,,,,Is it always bad to retrain your model to include predicted data?,,CC BY-SA 3.0,"

I understand intuitively why this is a horrible idea - you assume your model is correct and then increase your number of observations which will likely result in a poor fit on future data.

+ +

I'm wondering if there is some mathematical/statistical property to describe this, or if there is any rare case where this may not be as fatal as I am thinking?

+",2013-10-30 23:29:33.807 +58556,22564.0,2,,56445.0,,,,CC BY-SA 3.0,"

If data in the treatment group is not normal while the control group is, it sounds like the treatment may only be affecting a subset of the sample or having variable levels of effect. Comparing means under such circumstances would be losing out on this information. You should attempt to offer explanations for why this change of distribution occurred rather than only comparing means. The rank tests assume that both groups come from distributions of the same shape. If you believe the distributions are different, the tests are not useful for your purposes.

+ +

Let us take an example of what can happen with the U-test. We will make our control group come from a normal distribution with mean=0. Meanwhile the treatment will have negative effects on half the subjects and positive effects on the other half. So the treatment group will come from two normal distributions. The first with mean=-5, the second with mean=5. All distributions have sd=1 and both groups have sample size=100. Red shows the treatment group while blue shows the control group:

+ +

+ +

Results of doing a U-test (which is also called the Wilcoxon test):

+ +
        Wilcoxon rank sum test with continuity correction
+
+data:  a and b 
+W = 4999, p-value = 0.999
+alternative hypothesis: true location shift is not equal to 0
+
+ +

We can see it returns ""not significant"". Would you really want to conclude the treatment had no effect?

+ +

R code for generating the above:

+ +
##Generate Data
+control<-rnorm(100,0,1) # create control data
+treatment<-c(rnorm(50,-5,1),rnorm(50,5,1)) # create treatment data
+
+
+##Plot data
+# Get min/max values (for plotting)
+min.val<-min(control,treatment)
+max.val<-max(control,treatment)
+
+# make plots
+hist(treatment, breaks=seq(min.val-.1,max.val+.1,.5), col=""Red"",
+xlab=""Value"", ylim=c(0,20),
+main=""Results""
+)
+hist(control, add=T, breaks=seq(min.val-.1,max.val+.1,.5),col=""Blue"")
+
+##perform U-test
+wilcox.test(treatment,control)
+
+",2013-10-30 23:32:08.437 +58557,22564.0,2,,58306.0,,,,CC BY-SA 3.0,"

The tests you mention are not appropriate for your situation. They will only tell you the probability of getting a difference between years 1-2 and years 3-5 as extreme as or more extreme than the difference you observed if there was exactly zero difference from year to year. This null hypothesis is highly unlikely to be true regardless of whether tools were changed. It is also unlikely that classes in the future will be exactly the same type of students as in the past.

+ +

What you care about should be (I think) whether tool B will lead to higher participation in the future than tool A. This is an ""analytic"" problem, while the statistical tests you are attempting to use are meant for ""enumerative problems"". Yes, this type of use is very common and it has led to about 80 years of misleading results in many fields.

+ +

The only way to make rational decisions is to have understanding of the underlying data generating process. If there is little background knowledge all you can do is plot the data and look for patterns that indicate there may be some lurking/confounding variable that offers an alternative explanation for the increase in participation. You should try to break up the data into as many plausible subgroups as possible (e.g., type of student) and look for patterns.

+ +

If you and other experts cannot think of any plausible alternative explanations then it would be rational to decide to continue using tool B. If an alternative explanation is available then further study is necessary to determine which is responsible. A good source on this issue would be William Edwards Deming.

+ +

https://en.wikipedia.org/wiki/Analytic_and_enumerative_statistical_studies

+ +

https://www.deming.org/media/pdf/081.pdf

+ +

EDIT: +Here is a quote from Deming (decide for yourself whether he is a credible source, but he knew both Fisher and J Neyman):

+ +
+

Limitations of statistical inference. All results are conditional on + (a) the frame whence came the units for test; (b) the method of + investigation (the questionnaire or the test-method and how it was + used) ; (c) the people that carry out the interviews or measurements. + In addition (d), the results of an analytic study are conditional also + on certain environmental states, such as the geographic locations of + the comparison, the date and duration of the test, the soil, rainfall, + climate, description and medical histories of the patients or subjects + that took part in the test, the observers, the hospital or hospitals, + duration of test, levels of radiation, range of voltage, speed, range + of temperature, range of pressure, thickness (as of plating), number + of flexures, number of jolts, maximum thrust, maximum gust, maximum + load.

+ +

The exact environmental conditions for any experiment will never + be seen again. Two treatments that show little difference under one + set of environmental circumstances or even within a range of + conditions, may differ greatly under other conditions-other soils, + other climate, etc. The converse may also be true: two treatments that + show a large difference under one set of conditions may be nearly + equal under other conditions.

+ +

There is no statistical method by which + to extrapolate to longer usage of a drug beyond the peritd of test, + nor to other patients, soils, climates, higher voltages, nor to other + limits of severity outside the range studied. Side effects may develop + later on. Problems of maintenance of machinery that show up well in a + test that covers three weeks may cause grief and regret after a few + months. A competitor may stop in with a new product, or put on a blast + of advertising. Economic conditions change, and upset predictions and + plans. These are some of the reasons why information on an analytic + problem can never be complete, and why computations by use of a + loss-function can only be conditional. The gap beyond statistical + inference can be filled in only by knowledge of the subject-matter + (economics, medicine, chemistry, engineering, psychology, agricultural + science, etc.), which may take the formality of a model [12], [14], + [15].

+
+ +

Deming, W. Edwards ""On probability as a basis for action"" The American Statistician, volume 29, 1975

+ +

https://www.deming.org/media/pdf/145.pdf

+",2013-10-31 00:28:18.800 +58558,23087.0,2,,58552.0,,,,CC BY-SA 3.0,"

I think you're asking whether the MH proposal distribution has to be symmetric. No, it doesn't have to be symmetric, it just can't depend on the current state. For sampling on a constrained space it's perfectly valid just to use a Gaussian proposal distribution and reject any proposals that fall outside the space. However, this may not work well in practice, particularly if J is large or the mass is concentrated towards the corners of the simplex.
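
+ +

One concrete way to implement such a random-walk proposal (my own sketch, not the only option): perturb the first J-1 coordinates with a Gaussian step, set the last coordinate by the sum-to-one constraint, and treat proposals with any negative component as having zero target density. The move is symmetric in the J-1 free coordinates, so the plain Metropolis ratio applies:

+ +

# one Metropolis step for a target with log-density logf on the simplex
+mh_step <- function(theta, logf, step = 0.05) {
+  J <- length(theta)
+  prop <- theta
+  prop[1:(J - 1)] <- theta[1:(J - 1)] + rnorm(J - 1, 0, step)   # symmetric random walk
+  prop[J] <- 1 - sum(prop[1:(J - 1)])                           # enforce sum-to-one
+  if (any(prop < 0)) return(theta)                              # outside the simplex: reject
+  if (log(runif(1)) < logf(prop) - logf(theta)) prop else theta # usual Metropolis ratio
+}
+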

+",2013-10-31 00:47:09.833 +58559,12501.0,2,,58520.0,,,,CC BY-SA 3.0,"

What you are specifying with the second argument to mvnpdf is, as you correctly state, the (co-) variance. The standard deviation corresponding to a variance of 0.001 is about 0.0316. The point you are looking at is at a distance of 0.002 from the center of the distribution, or about 0.0632 standard deviations from the center, i.e. it is very close to the center. It is therefore to be expected that the density is only slightly smaller than at the center.
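
+ +

A quick sanity check of that statement (done in R for convenience; with a spherical covariance only the Gaussian kernel matters): the density at a point 0.002 away from the centre is barely smaller than at the centre when the variance is 0.001.

+ +

sigma2 <- 0.001
+d <- 0.002
+exp(-d^2 / (2 * sigma2))   # ratio of the density at the point to the density at the centre
+                           # about 0.998, i.e. only slightly smaller
+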

+",2013-10-31 00:48:13.660 +58560,20981.0,2,,58532.0,,,,CC BY-SA 3.0,"

You can have a sampling distribution of other statistics than the mean, such as the estimated median, or estimated variance.

+ +

Sometimes ""sampling distribution"" might be a loose term referring to the estimated mean and estimated variance of the sample taken together (with the unspoken assumption that the distribution of sample means is approximately normal).

+",2013-10-31 00:49:14.583 +58561,6384.0,1,58564.0,,,How to prove that $X^T$e = 0,,CC BY-SA 4.0,"

I need to prove that $X^T e = 0$, where $e$ is the residual in the multiple linear regression model, using matrix algebra.

+ +

I need some guidance on how to do it. Is there any good PDF with the proofs for the multiple linear regression model in matrix algebra?

+",2013-10-31 01:32:10.817 +58562,15972.0,2,,58521.0,,,,CC BY-SA 3.0,"

Spotting to add $n$ identical copies of $S_m/S_n$ is very clever! But some of us are not so clever, so it is nice to be able to ""postpone"" the Big Idea to a stage where it is more obvious what to do. Without knowing where to start, there seem to be a number of clues that symmetry could be really important (addition is symmetric and we have some summations, and iid variables have the same expectation so maybe they can be swapped around or renamed in useful ways). In fact the ""hard"" bit of this question is how to deal with the division, the operation which isn't symmetric. How can we exploit the symmetry of summation? From linearity of expectation we have:

+ +

$\mathbb{E}(S_m/S_n) = \mathbb{E}\left(\frac{X_1 + ... + X_m}{X_1 + ... + X_n}\right) = \mathbb{E}\left(\frac{X_1}{X_1 + .... + X_n}\right) + ... + \mathbb{E}\left(\frac{X_m}{X_1 + .... + X_n}\right)$

+ +

But then on symmetry grounds, given that $X_i$ are iid and $m \le n$, all the terms on the right-hand side are the same! Why? Switch the labels of $X_i$ and $X_j$ for $i, j \le n$. Two terms in the denominator switch position but after reordering it still sums to $S_n$, whereas the numerator changes from $X_i$ to $X_j$. So $\mathbb{E}(X_i/S_n) = \mathbb{E}(X_j/S_n)$. Let's write $\mathbb{E}(X_i/S_n)=k$ for $1 \le i \le n$ and since there are $m$ such terms we have $\mathbb{E}(S_m/S_n) = mk$.

+ +

It looks as if $k=1/n$ which would produce the correct result. But how to prove it? We know

+ +

$k=\mathbb{E}\left(\frac{X_1}{X_1 + .... + X_n}\right)=\mathbb{E}\left(\frac{X_2}{X_1 + .... + X_n}\right)=...=\mathbb{E}\left(\frac{X_n}{X_1 + .... + X_n}\right)$

+ +

It's only at this stage it dawned on me I should be adding these together, to obtain

+ +

$nk = \mathbb{E}\left(\frac{X_1}{X_1 + .... + X_n}\right) + \mathbb{E}\left(\frac{X_2}{X_1 + .... + X_n}\right) + ... + \mathbb{E}\left(\frac{X_n}{X_1 + .... + X_n}\right)$ +$\implies nk = \mathbb{E}\left(\frac{X_1 + ... + X_n}{X_1 + .... + X_n}\right) = \mathbb{E}(1) = 1$

+ +

What's nice about this method is that it preserves the unity of the two parts of the question. The reason symmetry is broken, requiring adjustment when $m>n$, is that the terms on the right-hand side after applying linearity of expectation will be of two types, depending on whether the $X_i$ in the numerator lies in the sum in the denominator. (As before, I can switch the labels of $X_i$ and $X_j$ if both appear in the denominator as this just reorders the sum $S_n$, or if neither does as this clearly leaves the sum unchanged, but if one does and one doesn't then one of the terms in the denominator changes and it no longer sums to $S_n$.) For $i \le n$ we have $\mathbb{E}\left(\frac{X_i}{X_1 + .... + X_n}\right)=k$ and for $i>n$ we have $\mathbb{E}\left(\frac{X_i}{X_1 + .... + X_n}\right)=r$, say. Since we have $n$ of the former terms, and $m-n$ of the latter,

+ +

$\mathbb{E}(S_m/S_n) = nk + (m-n)r = 1 + (m-n)r$

+ +

Then finding $r$ is straightforward using independence of $S_n^{-1}$ and $X_i$ for $i>n$: $r=\mathbb{E}(X_i S_n^{-1})=\mathbb{E}(X_i) \mathbb{E}(S_n^{-1})=\mu \mathbb{E}(S_n^{-1})$

+ +

So the same ""trick"" works for both parts, it just involves dealing with two cases if $m>n$. I suspect this is why the two parts of the question were given in this order.

+",2013-10-31 01:55:16.093 +58563,11775.0,1,,,,Data normalization and sufficient statistic,,CC BY-SA 3.0,"

I was taught that when we feed our data to machine learning algorithm (e.g. SVM), we should first normalize our data.

+ +

Suppose I have a set of data $X = \{x_1,x_2,...,x_n\}$. I know two ways of normalizing them. Let $\hat{\mu}$ and $\hat{\sigma}^2$ be the sample mean and sample variance of $X$. I can normalize each data point by

+ +

$$ +y_k = \frac{x_k-\hat{\mu}}{\hat{\sigma}} +$$

+ +

I think I know this when I first learn PCA.

+ +

I can also normalize it using the minimum and maximum of the data:

+ +

$$ +y_k = \frac{x_k-m}{M-m} +$$

+ +

where $M = \max(X)$ and $m = \min(X)$. By using this normalization, I can make sure the normalized data will be in the interval $[0,1]$.
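
+ +

In R the two normalizations look like this (a toy vector, just to fix notation):

+ +

x <- c(2, 5, 9, 1, 7)                    # toy data
+z  <- (x - mean(x)) / sd(x)              # z-score normalization
+mm <- (x - min(x)) / (max(x) - min(x))   # min-max normalization, lies in [0, 1]
+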

+ +

I observe a fact that:

+ +

($\hat{\mu}$,$\hat{\sigma}^2$) is a sufficient statistic for a normal distribution, and the projection matrix of PCA is a solution to a minimization problem that minimizes the $l^2$-norm.

+ +

On the other hand, $(M,m)$ is a sufficient statistic for a uniform distribution.

+ +

My questions are:

+ +
  1. The observation gives me an intuition that which normalization technique I should employ depends on my belief (or the learning algorithm's assumption) about the underlying distribution of my data. If I believe the distribution is normal, I should use z-score normalization; if I believe the distribution is uniform, I should use min-max normalization. Is my thought correct?

  2. If I do not know the underlying distribution, how should I do data normalization?

  3. If I am going to feed my data to an online learning algorithm (like the winnow algorithm), are there any online data normalization techniques?
+",2013-10-31 01:59:37.967 +58564,20473.0,2,,58561.0,,,,CC BY-SA 3.0,"

$$\mathbf X'\mathbf e = \mathbf X'(\mathbf y -\mathbf {\hat y})= \mathbf X'(\mathbf y -\mathbf X\hat \beta) =...$$

+ +

ADDENDUM

+ +

$$=\mathbf X'\left(\mathbf y -\mathbf X (\mathbf X'\mathbf X)^{-1}\mathbf X' \mathbf y\right) =\mathbf X'\mathbf y -\mathbf X'\mathbf X (\mathbf X'\mathbf X)^{-1}\mathbf X' \mathbf y$$

+ +

$$\mathbf X'\mathbf y -\mathbf X' \mathbf y = \mathbf 0$$
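
+ +

A quick numerical check of this identity in R (any regression will do):

+ +

fit <- lm(mpg ~ wt + hp, data = mtcars)
+X <- model.matrix(fit)
+e <- residuals(fit)
+crossprod(X, e)   # t(X) %*% e: all entries are zero up to rounding error
+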

+",2013-10-31 02:40:29.167 +58565,23087.0,2,,58555.0,,,,CC BY-SA 3.0,"

I can give you a probabilistic/Bayesian interpretation of why this is not helpful. A probabilistic model for data $X$ and parameters $\theta$ is defined by a likelihood $P(X|\theta)$ and a prior $P(\theta)$. Now imagine I have some training data $X_\text{train}$ and want to make predictions about future data $X_\text{future}$, which means I need to calculate, or approximate +$$ +P(X_\text{future}|X_\text{train}) = \int P(X_\text{future}|\theta) P(\theta|X_\text{train}) d\theta +$$ +where $P(\theta|X_\text{train})$ is the posterior. What you suggest is to sample predictions $X_\text{pred}$ from $P(X_\text{pred}|X_\text{train})$ (which can be represented in the same way as the above equation). However, since $X_\text{pred}$ is not observed you can integrate it away and your posterior on $\theta$ will be unchanged. Conditioning on $X_\text{pred}$ is therefore not a reasonable thing to do.

+ +

To speculate on the typical effect it might have: if you sample from $P(X_\text{pred}|X_\text{train})$ then you are both adding noise to your estimate, and reducing the uncertainty in the estimate of $\theta$ (so you would probably be both overconfident and more wrong!), whereas if you optimise $P(X_\text{pred}|X_\text{train})$ I expect the main effect would be reducing the uncertainty in the posterior and thereby making your predictions overly confident (i.e. overfitting).
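
+ +

A small simulation sketch of that second effect: refitting a regression after appending its own predictions as if they were observed data leaves the point estimates roughly unchanged but shrinks the reported standard errors, i.e. the model becomes overconfident.

+ +

set.seed(1)
+n <- 30
+x <- rnorm(n); y <- 1 + 2 * x + rnorm(n)
+fit1 <- lm(y ~ x)
+
+xnew  <- rnorm(n)                               # unlabeled inputs
+ypred <- predict(fit1, data.frame(x = xnew))    # plug in our own predictions
+fit2  <- lm(c(y, ypred) ~ c(x, xnew))           # refit treating predictions as data
+
+summary(fit1)$coefficients[, ""Std. Error""]
+summary(fit2)$coefficients[, ""Std. Error""]      # noticeably smaller: false precision
+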

+",2013-10-31 03:14:25.267 +58566,23193.0,1,,,,Hypothesis test for difference in preference pre and post treatment,,CC BY-SA 3.0,"

Two large beach holiday destinations near Bangkok, Thailand are Pattaya and Hua Hin. In a random sample of 100 individuals in Bangkok who have been on a weekend away to nearby beach destinations 6 preferred Pattaya and 3 preferred Hua Hin. The other 91 did not have a preference. These individuals were then exposed to certain tourism promotional material and after such exposure 12 preferred Pattaya and 1 preferred Hua Hin. The remaining 87 had no preference. I would like to set up a hypothesis test to determine if the promotional material had an effect on preference. Your assistance or a referral to a URL for a tutorial on this kind of hypothesis testing would be greatly appreciated.

+",2013-10-31 03:19:00.427 +58567,22564.0,2,,58445.0,,,,CC BY-SA 3.0,"

My example here may help: When making inferences about group means, are credible Intervals sensitive to within-subject variance while confidence intervals are not?

+ +

I modified the model slightly for your data. Note that with such little data your results will be heavily dependent on the priors you use so I would attempt modifying the priors on the group means and precisions (1/variance) and seeing the different results to learn.

+ +

Here are the results I got: +

+ +

+ +

+ +

This is modified from John Krushke's example here: +http://psy2.ucsd.edu/~dhuber/cr_SimpleLinearRegressionRepeatedBrugs.R

+ +

He has a helpful website and blog: +http://www.indiana.edu/~kruschke/DoingBayesianDataAnalysis/

+ +
#Note. To use rjags you need to first install JAGS from here: 
+#http://sourceforge.net/projects/mcmc-jags/files/
+
+install.packages(""rjags"") #run first time to install package
+
+require(rjags) #load rjags package
+
+
+#Format your data
+subID<-rep(1:8,each=4)
+
+dat<-rbind(88, 91, 87, 82,
+81, 85, 78, 91,
+75, 77, 83, 81,
+92, 89, 84, 82,
+78, 79, 84, 92,
+89, 75, 79, 83,
+91, 89, 92, 91,
+87, 86, 88, 91
+)
+
+dat<-cbind(subID,dat)
+colnames(dat)<-c(""Subject"",""Value"")
+dat<-as.data.frame(dat)
+
+
+
+#Jags fit function
+jags.fit<-function(dat){
+
+  #Create JAGS model
+  modelstring = ""
+
+  model{
+  for(n in 1:Ndata){
+  y[n]~dnorm(mu[subj[n]],tau[subj[n]]) T(0, )
+  }
+
+  for(s in 1:Nsubj){
+  mu[s]~dnorm(muG,tauG) T(0, )
+  tau[s] ~ dgamma(5,5)
+  }
+
+
+  muG~dnorm(80,.01) T(0, )
+  tauG~dgamma(1,1)
+
+  }
+  ""
+  writeLines(modelstring,con=""model.txt"")
+
+#############  
+
+  #Format Data
+  Ndata = nrow(dat)
+  subj = as.integer( factor( dat$Subject ,
+                                 levels=unique(dat$Subject ) ) )
+  Nsubj = length(unique(subj))
+  y = as.numeric(dat$Value)
+
+  dataList = list(
+    Ndata = Ndata ,
+    Nsubj = Nsubj ,
+    subj = subj ,
+    y = y
+  )
+
+  #Nodes to monitor
+  parameters=c(""muG"",""tauG"",""mu"",""tau"")
+
+
+  #MCMC Settings
+  adaptSteps = 1000             
+  burnInSteps = 1000            
+  nChains = 1                   
+  numSavedSteps= nChains*10000          
+  thinSteps=20                      
+  nPerChain = ceiling( ( numSavedSteps * thinSteps ) / nChains )            
+
+
+  #Create Model
+  jagsModel = jags.model( ""model.txt"" , data=dataList, 
+                          n.chains=nChains , n.adapt=adaptSteps , quiet=FALSE )
+  # Burn-in:
+  cat( ""Burning in the MCMC chain...\n"" )
+  update( jagsModel , n.iter=burnInSteps )
+
+  # Getting DIC data:
+  load.module(""dic"")
+
+
+  # The saved MCMC chain:
+  cat( ""Sampling final MCMC chain...\n"" )
+  codaSamples = coda.samples( jagsModel , variable.names=parameters , 
+                              n.iter=nPerChain , thin=thinSteps )  
+
+  mcmcChain = as.matrix( codaSamples )
+
+  result = list(codaSamples=codaSamples, mcmcChain=mcmcChain)
+
+}
+
+
+output<-jags.fit(dat) # fit the model to your data
+
+
+
+###make plots
+##Overall plots
+par(mfrow=c(2,1))
+#Plot overall means
+hist(output$mcmcChain[,""muG""],col=""Grey"", freq=F,
+main=""Overall Mean"", xlab=""Performance""
+)
+#Plot overall variance
+hist(1/output$mcmcChain[,""tauG""],col=""Grey"", freq=F,
+main=""Overall Variance"", xlab=""Performance"")
+
+
+##Indidvidual Mean Plots
+dev.new()
+par(mfrow=c(2,4))
+for(i in 1:8){
+hist(output$mcmcChain[,paste(""mu["",i,""]"",sep="""")],
+main=paste(""Mean of Runner"", i), xlab=""Performance"", freq=F, col=""Grey""
+)
+}
+
+
+##Indidvidual Variance Plots
+dev.new()
+par(mfrow=c(2,4))
+for(i in 1:8){
+hist(1/output$mcmcChain[,paste(""tau["",i,""]"",sep="""")],
+main=paste(""Variance of Runner"", i), xlab=""Performance"", freq=F, col=""Grey""
+)
+}
+
+# see what is in the output
+attributes(output$mcmcChain)
+
+ +

Edit: +To see the percent of time the model predicts each runner will win we can take the mean and variance estimated for each individual at each mcmc step, then sample a performance from a distribution determined by those parameters. We can then simply count the number of times each runner had the highest performance.

+ +

+ +
nSamps<-length(output$mcmcChain[,paste(""mu["",1,""]"",sep="""")])  # number of saved MCMC samples (fixed index, not the leftover loop variable i)
+out=matrix(nrow=nSamps*8,ncol=3)
+cnt<-1
+for(j in 1:nSamps){
+for(i in 1:8){
+m<-output$mcmcChain[,paste(""mu["",i,""]"",sep="""")][j]
+v<-1/output$mcmcChain[,paste(""tau["",i,""]"",sep="""")][j]
+t<-rnorm(1,m,sqrt(v))
+out[cnt,]<-cbind(j,i,t)
+cnt<-cnt+1
+}
+}
+colnames(out)<-c(""N"",""RunnerID"",""Time"")
+
+
+winners=matrix(nrow=nSamps,ncol=1)
+for(i in 1:nSamps){
+sub<-out[which(out[,""N""]==i),]
+winners[i]<-sub[which(sub[,""Time""]==max(sub[,""Time""])),""RunnerID""]
+}
+
+dev.new()
+barplot(100*table(winners)/nSamps, xlab=""Runner ID"", ylab=""% of Wins"")
+
+",2013-10-31 03:47:01.427 +58568,23196.0,1,,,,Ridge regression results different in using lm.ridge and glmnet,,CC BY-SA 3.0,"

I applied ridge regression to some data in R to find the best variable solution for my regression model. I have used lm.ridge and glmnet (with alpha=0), but the results are very different, especially when lambda=0. I expected both parameter estimates to have the same values in that case. So, what is the problem here? + Best regards

+",2013-10-31 04:07:29.983 +58569,16464.0,2,,52567.0,,,,CC BY-SA 3.0,"

To give an example in line with @neverKnowsBest's response, consider that in a $2^3$ factorial experiment there are 3 factors, each treated as categorical variables with 2 levels, and each possible combination of the factor levels is tested within each replication. If the experiment were only administered once (no replication) this design would require $2^3=8$ runs. The runs can be described by the following 8x3 matrix: +$$ +\left[\begin{array}{rrr} +0 & 0 & 0 \\ +1 & 0 & 0 \\ +0 & 1 & 0 \\ +1 & 1 & 0 \\ +0 & 0 & 1 \\ +1 & 0 & 1 \\ +0 & 1 & 1 \\ +1 & 1 & 1 \\ +\end{array} +\right] +$$ +where the rows represent the runs and the columns represent the levels of the factors: +$$ +\left[\begin{array}{rrr} +A & B & C \\ +\end{array} +\right]. +$$ +(The first column represents the level of factor A, the second column B, and the third column C). This is referred to as the Design Matrix because it describes the design of the experiment. The first run is collected at the 'low' level of all of the factors, the second run is collected at the 'high' level of factor A and the 'low' levels of factors B and C, and so on.

+ +

This is contrasted with the model matrix, which if you were evaluating main effects and all possible interactions for the experiment discussed in this post would look like: +$$ +\left[\begin{array}{rrrrrrrr} +1 & 0 & 0 & 0 & 1 & 1 & 1 & 0 \\ +1 & 1 & 0 & 0 & 0 & 0 & 1 & 1 \\ +1 & 0 & 1 & 0 & 0 & 1 & 0 & 1 \\ +1 & 1 & 1 & 0 & 1 & 0 & 0 & 0 \\ +1 & 0 & 0 & 1 & 1 & 0 & 0 & 1 \\ +1 & 1 & 0 & 1 & 0 & 1 & 0 & 0 \\ +1 & 0 & 1 & 1 & 0 & 0 & 1 & 0 \\ +1 & 1 & 1 & 1 & 1 & 1 & 1 & 1 \\ +\end{array} +\right] +$$ +where the columns represent independent variables: +$$ +\left[\begin{array}{rrrrrrrr} +I & A & B & C & AB & AC & BC & ABC \\ +\end{array} +\right]. +$$ +Although the two matrices are related, the design matrix describes how data is collected, while the model matrix is used in analyzing the results of the experiment.
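
+ +

In R the distinction is easy to see: expand.grid() builds the design matrix (one row per run) and model.matrix() expands it into the model matrix for whatever model you intend to fit. Note that with 0/1 numeric factors the interaction columns are treatment coded, so the entries differ from the plus/minus-style coding displayed above, but the design-versus-model distinction is the same.

+ +

design <- expand.grid(A = c(0, 1), B = c(0, 1), C = c(0, 1))  # the 8 runs of the 2^3 design
+design                                     # design matrix: how the data are collected
+model.matrix(~ A * B * C, data = design)   # model matrix: intercept, main effects, interactions
+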

+ +

Citations

+ +

Montgomery, D. (2009). Design and Analysis of Experiments, 7th Edition. John Wiley & Sons Inc.

+",2013-10-31 04:28:15.877 +58570,23198.0,1,,,,Probability of winning money with a pair of loaded dice,,CC BY-SA 3.0,"

You go gambling with a pair of loaded dice. Because of this, your odds of winning are 53% on every throw. Assuming the game pays 2:1 and you keep betting the same amount, how many games do you need to play to ensure an 80% likelihood of winning money?

+ +

I am lost with where to start on this problem. I would appreciate some help so I can figure it out. Thanks.

+",2013-10-31 05:32:00.743 +58571,22507.0,2,,58570.0,,,,CC BY-SA 3.0,"

I assume this is a ""self-study"" problem. Here are the hints for you:

+ +
  1. Suppose you play N throws. What is the probability of winning exactly M of them?
  2. How many of the N throws do you need to win in order to win money?
+",2013-10-31 06:27:11.663 +58738,503.0,5,,,,,,CC BY-SA 3.0,"

Tag usage

+ + +",2013-11-03 00:18:10.927 +58572,21029.0,2,,58549.0,,,,CC BY-SA 3.0,"

If you're only interested in checking the variability, it sounds like an analysis of variance would be a good start.

+ +

Idea 1: Perform an F-test against the different variances for each control. The null hypothesis is that the two controls come from the same normal distribution, but potentially with different means.
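
+ +

A minimal R sketch of such an F-test comparing the variances of two samples (the vectors are placeholders, simulated here only to make the code self-contained):

+ +

g1 <- rnorm(20, mean = 5, sd = 1)   # placeholder expression values, condition 1
+g2 <- rnorm(20, mean = 5, sd = 2)   # placeholder expression values, condition 2
+var.test(g1, g2)                    # F test of the null that the two variances are equal
+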

+ +

Idea 2: an analysis of variance (ANOVA) on the continuous variable separated into the controls. The idea is to look at the variance of the continuous variable within each class $s_i$ and compare it to the total variance $s_t$. The correlation coefficient for one class compared to the total is then $\eta_i = \sqrt{s_i / s_t}$. The test is then an F-test. There is an assumption of a normal distribution here also.

+ +

As for the p-values, they represent the level where you can accept or reject the null hypothesis (the variances are all equal). If the p-value is low (below a 5% level, for example) you reject the null hypothesis and assume that the variables have different variances based on the control groups.

+",2013-10-31 06:42:01.353 +58573,13740.0,1,58576.0,,,Transform multiclass classification to binary - benefits?,,CC BY-SA 3.0,"

I have 400 instances which must be categorized into 4 classes. Using WEKA, I tried out a couple of multiclass classifiers like J48 and Random Forests, but never made it above Kappa 0.6 and ~65% correctly classified instances (10-fold X-V)

+ +

Then I thought about transforming the problem into a 1-vs-all classification, which usually yields accuracies of ~90%. I would then remove the one ""single"" class and keep the merged ones. Then, again, having only instances with 3 classes, I would perform 1-vs-2 and remove the instances classified as belonging to the single class, ending up with a binary classification problem. As I said - I always have like 90% correctly classified instances, but I fear that the 10% incorrectly classified instances add up and propagate through the splitting and dataset reduction process ---

+ +

so in the end I would maybe end up with the same garbage output I'd have when performing the original multiclass classification?! What's the stand on this approach? Does it have any benefits at all?

+",2013-10-31 07:26:22.987 +58574,21823.0,2,,58566.0,,,,CC BY-SA 3.0,"

You can apply a chi-squared test of independence, but you will face an issue with the fact that only 1 respondent chose Hua Hin after receiving the stimulus.

+ +

Your alternative would be to run a Fisher's exact test to accommodate less than a frequency of 5 in each cell.

+ +

Note: Since your samples are paired, you would want to use a McNemar test instead.

+",2013-10-31 07:28:29.130 +58575,21823.0,2,,57636.0,,,,CC BY-SA 3.0,"

An alternate method of finding variable importance is using random forests.

+ +

A package called varSelRF was built specifically for this purpose. This method isn't designed to be right all the time, but is a rather quick way of dealing with large dimensions to get a semblance of which variables could at the first level affect the response variable.
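
+ +

If you only want the importance ranking without the automated selection step, the plain randomForest package already provides it; a minimal sketch, where x and y are placeholders for the predictor matrix and the response:

+ +

library(randomForest)
+set.seed(1)
+rf <- randomForest(x, y, importance = TRUE)
+importance(rf)   # per-variable importance measures
+varImpPlot(rf)   # quick visual ranking of the variables
+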

+ +

Combine this with a MANOVA and you stand a decent chance of finding your key variables.

+",2013-10-31 07:35:55.953 +58576,17740.0,2,,58573.0,,,,CC BY-SA 3.0,"

Translating a multiclass problem into a set of binary ones (using 1-vs-all or 1-vs-1) is typically done when you want to use algorithms that don't actually have a multiclass formulation, such as SVM.

+ +

If you do not plan to change the classification algorithm, you will probably end up with similar results after transforming your problem.

+ +

Note that changing algorithm will not necessarily improve your performance.

+",2013-10-31 08:01:38.190 +58577,22970.0,1,58580.0,,,Generalized linear mixed models: model selection,,CC BY-SA 3.0,"

This question/topic came up in a discussion with a colleague and I was looking for some opinions on this:

+ +

I am modeling some data using a random effects logistic regression, more precisely a random intercept logistic regression. For the fixed effects I have 9 variables that are of interest and come into consideration. I would like to do some sort of model selection to find the variables that are significant and give the “best” model (main effects only).

+ +

My first idea was to use the AIC to compare different models, but with 9 variables I was not too excited about comparing 2^9=512 different models (keyword: data dredging).

+ +

I discussed this with a colleague and he told me that he remembered reading about using stepwise (or forward) model selection with GLMMs. But instead of using a p-value (e.g. based on a likelihood ratio test for GLMMs), one should use the AIC as entry/exit criterion.

+ +

I found this idea very interesting, but I did not find any references that further discussed this and my colleague did not remember where he read it. Many books suggest using the AIC to compare models but I did not find any discussion about using this together with a stepwise or forward model selection procedure.

+ +

So I have basically two questions:

+ +
  1. Is there anything wrong with using the AIC in a stepwise model selection procedure as entry/exit criterion? If yes, what would be the alternative?

  2. Do you have some references that discuss the above procedure (also to use as a reference for a final report)?
+ +

Best,

+ +

Emilia

+",2013-10-31 08:48:04.587 +58578,23205.0,1,,,,random forest classification in R - no separation in training set,,CC BY-SA 3.0,"

Originally posted on Stack Overflow, but suggested to move here...

+ +

I'm new to machine learning, but I've tried to perform a Random Forest classification (randomForest package in R) on some metabolomics data with bad results. My normal approach in this case would be to employ a PLS-DA strategy. However, I decided to try both RF and SVM as there are some publications highly recommending these machine learning approaches for Omics data.

+ +

In my case, 'X' is a 16*100 data frame (16 individuals with 100 recorded features/predictors) read from a CSV file. 'Y' is a factor vector (length=16) with 8 'high' and 8 'low'. In both PLS-DA and SVM (both linear and radial kernel) I get excellent separation. However, I get 3 misclassifications out of 16 in the RF model.

+ +

The RF model looks like: RFA1=randomForest(X,Y)

+ +
## read file and fix data frame
+in.data = read.csv2(file='Indata.csv', header = FALSE, skip=5)[,-4] # Col 1-3 are identifiers. Features/predictors from col 4
+names(in.data)=names(read.csv2(file='Indata.csv',header=T)[,-4])
+# str(in.data)
+ # $ ID                       : Factor w/ 27 levels ""2"",""3"",""4"",""5"",..: 2 3 4 6 8 10 20 23 5 11 ...
+     # $ period                   : Factor w/ 2 levels ""A"",""B"": 1 1 1 1 1 1 1 1 1 1 ...
+ # $ consumption          : Factor w/ 2 levels ""high"",""low"": 1 1 1 1 1 1 1 1 2 2 ...
+     # $ FEATURES...
+
+## Sort DF into X (features) and Y (classifier based on consumption)
+y = in.data$consumption                   # Classifier based on high vs low consumption
+x = in.data[,-1:-3]                       # 100 features/predictors into X NB Contains many NAs
+nr=nrow(x)
+nc=ncol(x)
+x.na = as.data.frame(is.na(x))            # Find NAs in X
+col.min=apply(x,2,min,na.rm=T)            # Find min value per feature (omitting NAs)
+## Deal with zero/missing data-situation
+x2=x                                      # Compute new x2 matrix without NA
+for (i in 1:nc) {
+    x2[x.na[,i],i]=col.min[i]             # Substitute missing data with col.min
+}
+
+## Make classifiers according to period (A vs B)
+a.ind = in.data$period=='A'
+    b.ind = in.data$period=='B'
+
+## Choose data from period A only & transform/scale X
+x2a=x2[a.ind,]                 # Original data
+x2a.scale=scale(x2a)           # Scaled
+x2a.log=log(x2a)               # Log-transformed
+x2a.logscale=scale(log(x2a))   # Log-transformed and scaled
+ya=y[a.ind]
+
+## Perform analysis for period A
+library(randomForest)
+(rfa1=randomForest(x2a,ya))
+(rfa2=randomForest(x2a.scale,ya))
+(rfa3=randomForest(x2a.log,ya))
+(rfa4=randomForest(x2a.logscale,ya))
+
+ +

This generates output like:

+ +
Call:
+ randomForest(x = x2a, y = ya) 
+               Type of random forest: classification
+                 Number of trees: 500
+No. of variables tried at each split: 10
+
+        OOB estimate of  error rate: 18.75%
+Confusion matrix:
+     high low class.error
+high    6   2       0.250
+low     1   7       0.125
+
+ +

I have played around with both mtry (5-50) and ntree (500-2000) with no apparent success. I've also tried combinations of transforms and scaling of 'X'. But as I understand it, RF is a non-parametric method and as such, transformations and scaling won't do anything for the results.

+ +

For comparison, using the exact same data, PLS-DA using SIMCA13 provides excellent separation already in the 1st component. SVM using the kernlab package in R provides 0 training error. At this stage I'm not looking at validation or using test sets. I want to first make sure I get good classification on my training set.

+ +

I'm sure I'm missing something, but I don't know what. I hope to have supplied sufficient information to describe the problem.

+ +

Thanks in advance for any help!

+ +

Sincerely,

+ +

Calle

+",2013-10-31 09:48:41.600 +58579,1575.0,1,,,,What languages are commonly used in medical statistics?,,CC BY-SA 3.0,"

A doctor friend has asked if I would teach them some basic programming, because they think it will be useful when they start to do research in the future.

+ +

To give some background - they don't know what language their lab-mates will use (because they don't know what lab they are joining) but they would like to learn something that will be generally useful. They are not interested in doing statistical research, but rather applying statistical techniques to medical data.

+ +

There are some obvious options (I know enough about all of these to teach them)

+ +
  • Python
  • R
  • MATLAB
+ +

and a few less obvious ones (I don't know anything about these)

  • Stata
  • SAS
  • SPSS
  • +
+ +

I'd like to know what languages are most commonly used in medical statistics - do people use general purpose languages like Python, or more specialized languages like R and MATLAB, or statistical software like Stata/SAS/SPSS?

+ +

I should clarify that I'm thinking of someone whose primary job is as a doctor/medical researcher who will be applying statistics, not someone who is a professional statistician or software developer who happens to be working in medicine (so e.g. I think that C/C++/Java would all be poor choices).

+",2013-10-31 10:09:54.433 +58580,503.0,2,,58577.0,,,,CC BY-SA 3.0,"

Stepwise selection is wrong in multilevel models for the same reasons it is wrong in ""regular"" regression: The p-values will be too low, the standard errors too small, the parameter estimates biased away from 0 etc. Most important, it denies you the opportunity to think.

+ +

9 IVs is not so very many. Why did you choose those 9? Surely you had a reason.

+ +

One initial thing to do is look at a lot of plots; which precise ones depends a little on whether your data are longitudinal (in which case plots with time on the x-axis are often useful) or clustered. But surely look at relationships between the 9 IVs and your DV (parallel box plots are one simple possibility).

+ +

The ideal would be to build a few models based on substantive sense and compare them using AIC, BIC or some other measure. But don't be surprised if no particular model comes forth as clearly best. You don't say what field you work in, but in many (most?) fields, nature is complicated. Several models may fit about equally well and a different model may fit better on a different data set (even if both are random samples from the same population).
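
+ +

In R with lme4, comparing a handful of pre-specified candidates is only a few lines (the formulas and data are placeholders for your own variables):

+ +

library(lme4)
+m1 <- glmer(y ~ x1 + x2 + (1 | cluster), family = binomial, data = dat)
+m2 <- glmer(y ~ x1 + x2 + x3 + (1 | cluster), family = binomial, data = dat)
+m3 <- glmer(y ~ x1 * x2 + (1 | cluster), family = binomial, data = dat)
+AIC(m1, m2, m3)   # compare a small, substantively motivated set of models
+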

+ +

As for references - there are lots of good books on nonlinear mixed models. Which one is best for you depends on a) What field you are in b) What the nature of the data is c) What software you use.

+ +

Responding to your comment

+ +
  1. If all 9 variables are scientifically important, I would at least consider including them all. If a variable that everyone thinks is important winds up having a small effect, that is interesting.

  2. Certainly plot all your variables over time and in various ways.

  3. For general issues about longitudinal multilevel models I like Hedeker and Gibbons; for nonlinear longitudinal models in SAS I like Molenberghs and Verbeke. The SAS documentation itself (for PROC GLIMMIX) also provides guidance.
+",2013-10-31 10:11:45.777 +58698,21762.0,2,,58694.0,,,,CC BY-SA 3.0,"

The distribution of the response is irrelevant. Inference based on small samples requires the errors to be approximately normal (better look at the QQ-plot of the residuals than at its density because the tails are important). If you are only interested in descriptive results or if the sample size is not too small, you therefore do not need to worry about normality.

+ +

Much more important are the other assumptions of linear regression (correct model structure, no large outliers in the predictors and, if you are interested in inference, homoscedastic and uncorrelated errors).

+",2013-11-02 11:05:26.930 +58581,12503.0,1,58607.0,,,Measure of similarity/distance of data points in geographic space,,CC BY-SA 3.0,"

Given two points $p_1=(x_1,y_1,t_1)$ and $p_2=(x_2,y_2,t_2)$, where $x$ and $y$ refer to the geographic coordinates in the plane, and $t$ to some measured value. +Two distance measures to evaluate the similarity between these two points come to my mind: +$$d_1(p_1,p_2) = \sqrt{ (x_1-x_2)^2+(y_1-y_2)^2+(t_1-t_2)^2 }$$ +$$d_2(p_1,p_2) = \sqrt{ (x_1-x_2)^2+(y_1-y_2)^2 }+ \sqrt{ (t_1-t_2)^2 }$$

+ +

Measure $d_1$ is simply Euclidean distance in 3d-space, while $d_2$ is the sum between spatial distance and attribute distance.

+ +

Which measure does make more sense and should be applied for e.g. clustering?

+",2013-10-31 10:13:33.410 +58582,503.0,2,,58579.0,,,,CC BY-SA 3.0,"

This question seems to me to conflate two issues: Programming and statistics.

+ +

I don't know what programming languages are used in medical labs, although I get the sense that none are. In terms of statistics, I'd say R and SAS dominate. These are radically different languages. The problem with trying to teach SAS is 1) Since it isn't on your list you probably don't know it and 2) You'd have to have access to it, and it's expensive. That would lead to teaching R. But if they wind up in a place that uses SAS exclusively, I don't think R would help much.

+ +

However, if the person will be doing their own analysis, then R is fine. I would try to teach reproducible research methods.

+",2013-10-31 10:17:18.677 +58583,23208.0,1,,,,Logistic Regression: wald chi-square,,CC BY-SA 3.0,"

I am running a logistic regression with a binary dependent variable and 5 class independent variables.

+ +

One information the software I use returns, is the following:

+ +

Effect / DF / Wald / Pr > ChiSq

+ +

Var1 / 1 / 150 / <.0001

+ +

Var2 / 3 / 119 / <.0001

+ +

Var3 / 8 / 157 / <.0001

+ +

Var4 / 6 / 1553 / <.0001

+ +

Var5 / 4 / 15975 / <.0001

+ +

Concerning the data above I have two questions:

+ +

I) How can I interpret the fact that for var5 the value of the wald chi-square is so much higher than the values of the remaining variables? And is it a bad indicator of the regression quality?

+ +

II) I saw the impact of a regression variable calculated as the ratio between the Wald chi-square value of the variable and the total sum of all the variables' Wald chi-square values (shown as a percentage; in this example var5 would represent 88.9% of the total from the logistic regression), but found no statistical grounds for this methodology. Does anyone know this methodology or others?

+ +

Any guidance will be helpful

+",2013-10-31 10:51:28.807 +58584,10735.0,1,58590.0,,,"Is this a correct use of ""optimal""?",,CC BY-SA 3.0,"

During fine-tuning some classifier parameters, is it correct to talk about ""optimal"" configuration?

+ +

For example, if perfect classification is impossible, but the error converges to some relatively good small value, is it correct to call it an ""optimal"" solution, or is it ""suboptimal"" because there's no single optimal one?

+ +

If there are several configurations that perform best and equally good, would it be correct to call either of them ""optimal""?

+",2013-10-31 11:43:14.730 +58585,23211.0,1,,,,Multiprocess models,,CC BY-SA 3.0,"

I would like to use multi-process models. in particular, I would like to apply this model: Kulu, Hill. ""Migration and fertility: Competing hypotheses re-examined."" European Journal of Population/Revue européenne de Démographie.

+ +

In the fertility equations I don't understand how the estimation procedure differs between the two models (conceptually I get the difference, but I am rather confused when it comes to the actual methodological application). In particular, to let the errors vary across the three migration equations, is the estimation procedure extended in a discrete-time setting?

+ +

Additionally, I don't understand whether the author distinguishes between a first birth after a first migration and a first birth after a second migration, or whether he is just considering a first birth after any migration.

+ +

Can someone help me to better understand? Textbooks about the topic are welcome

+ +

thank you!

+",2013-10-31 11:57:07.880 +58586,23171.0,1,58612.0,,,"Is $X_{i}$ a martingale, submartingale, or supermartingale?",,CC BY-SA 3.0,"

Let $X_{i}$, $i=0,1,\cdots$ be a sequence of random variables generated by $X_{i+1}=\rho X_{i}+W_{i}$, where $\rho$ is constant and $W_{i}$ are i.i.d random variables. Suppose $X_{0}$ is independent of $W_{i}$. Is $X_{i}$ a martingale, submartingale, or supermartingale?

+",2013-10-31 12:45:51.320 +58587,23179.0,1,,,,coding survey data for cosine similarity and euclidean distance?,,CC BY-SA 3.0,"

I want to know how to code survey data such that a similarity function can be applied on it.

+ +

Say I want to use cosine similarity. All the search results and QA I've found while in my search deal only with the similarity between documents, with vectors consisting of word frequencies or tf/idf.

+ +

What about survey data? What is the sensible/common/useful way of coding survey data such that similarity can be compared? (Is it even sensible to use functions like cosine similarity for this?)

+ +

My data is record data, purely categorical, neither binary nor numerical. Should I code it into numerical data? My data looks like this (3 sample records):

+ +
Do you like Technology?  | Current GPA       | Institute name
+Y                        | Band 1 (3.75-4)   | UUIC
+N                        | Band 3 (3.0-3.5)  | ADU
+N                        | Band 2 (3.5-3.75) | UUIC
+
+ +

etc. These are just 3 questions, my survey had a lot more questions, but I hope you get the idea.

+ +

Is it sensible to code the data into numerical vectors, where for eg. I represent yes/no values as binary variables, and assign numbered categories to other values? In which case the above 3 records would become:

+ +
(1, 1, 1)
+(0, 3, 2)
+(0, 2, 1)
+
+ +

Where UUIC = 1, ADU = 2, and the GPA bands are represented simply by 1, 2, 3, 4, etc..

+ +

And then apply cosine similarity or euclidean distance? Would this make sense? I've been searching for similar examples for a while now but everything that comes up seems to be about document similarity. There doesn't seem to be much beginner's help on how to deal with survey data.

+",2013-10-31 13:20:51.977 +58588,9755.0,1,,,,How to get proper randomization in website A/B testing?,,CC BY-SA 3.0,"

In the statistical practice of experimental design, you separate your tests by 'blocks' if you can control the factors (say routing to one page or another, or categorizing by browser) or through randomization of trials to counteract variables you don't control.

+ +

When doing A/B testing, you obviously can't control who comes to your site when. How can good statistical practice be done?

+",2013-10-31 13:24:46.067 +58699,15827.0,2,,58694.0,,,,CC BY-SA 3.0,"

Your distribution is not beta if your density plot is to be taken at face value. A beta distribution cannot have two modes within (0, 1). However, no density plot for a bounded variable (at a guess here from some kernel density estimation procedure) can be taken at face value unless the estimation includes adjustments for boundary artefacts, which is not typical. But, as it were, we see what you mean.

+ +

However, to focus on the major issues:

+ +
    +
  • A regression is first and foremost a model for the mean of a variable as it varies with the predictors. Even if an assumption of normal errors is made, that is not an assumption about the marginal distribution of the response and it is the least important assumption that is being made. So, it is not surprising that your regression behaves fairly well as far as can be inferred from the distribution of residuals if the functional form catches the way that conditional means behave.

  • +
  • The assertion of normality is more convincing if you show us a normal probability plot. That distribution looks to me to have higher kurtosis than a normal, although that is likely to be a minor issue.

  • +
  • You need to check that your model is predicting values within [0,1]. Some of your residuals are about 0.7 in magnitude and so it seems possible that some of the predictions are qualitatively wrong.

  • +
  • At the same time, you should be able to do better with a regression that respects the bounded nature of the response. You could try beta regression or a generalised linear model with binomial family and logit link. The latter sounds wrong but often works well in practice. For a concise introductory review, see http://www.stata-journal.com/sjpdf.html?articlenum=st0147 Beta regression is supported in R and Stata (and likely in other software), and generalised linear models are widely supported, although watch for routines that reject non-binary responses if a logit link is requested. A minimal R sketch of both options follows this list.

  • +
+ +
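
A minimal R sketch of the two modelling options mentioned in the last bullet, with hypothetical names (response y in (0, 1), predictors x1 and x2, data frame d):

+ +

library(betareg)                      # beta regression for a response in (0, 1)
+m1 <- betareg(y ~ x1 + x2, data = d)
+# fractional-logit alternative: binomial family with logit link on a proportion,
+# using quasi-likelihood since y is not a true binomial count
+m2 <- glm(y ~ x1 + x2, family = quasibinomial(link = ""logit""), data = d)
+summary(m1); summary(m2)
+
+ +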

Note: The exact form of your density plot for the response is a side-issue, so I will make this an added note. It's clear that the density for a variable bounded by 0 and 1 must average 1. Your graph has a useful reference line density at 1. Visually comparing the bump above 1 on the left with the area to its right underlines that some of the density has been smoothed away by the procedure beyond the support and discarded. That is, the graph shown truncates the display: the smoothed distribution has positive density below 0 or above 1, which is not shown. There are known ways to smooth a bounded variable more respectfully, in this case including (a) to smooth logit of the variable and back-transform the density (a little problematic if observed values include 0 or 1), or (b) to reflect density inwards at the extremes. Naturally, there is scope for disagreement about whether this is trivial or secondary on the one hand or incorrect on the other. (I'd rather see a quantile plot of the data, but I'll not expand on that.)

+",2013-11-02 11:06:50.860 +58737,503.0,4,,,,,,CC BY-SA 3.0,"This tag is very ambiguous. Dispersion is a general term for how spread apart values are. For questions related to disease dissemination content, use the tag epidemiology. For other meanings of dispersion consider using a related tag or creating a new one.",2013-11-03 00:18:10.927 +58589,5273.0,1,58594.0,,,Gradient boosting in R uses only a single variable,,CC BY-SA 3.0,"

I am trying to build a boosting model using the package gbm in R. I have the following code:

+ +
gb = gbm(aaa_target ~ .,
+     data=myDdata,
+     n.trees=100,
+     verbose=TRUE)
+
+ +

and when I have trained the model, I can get a summary like this:

+ +
summary(gb)
+
+ +

The issue I am having is that only a single variable (out of around 30) is selected and given 100% of the predictive power. I know for a fact that many of the variables carry information (although the selected one is the most significant), and using the randomForest package gives me a model which assigns importance to many of the variables.

+ +

Does anybody have a clue as to why this might be the case?

+",2013-10-31 13:48:40.793 +58590,20473.0,2,,58584.0,,,,CC BY-SA 3.0,"

""Optimal"", ""Best"", ""Equally good"", are all concepts relative to some criterion. Once you specify your criterion, the ranking of alternative configurations can be determined, and the first in rank will be optimal according to the specific criterion -no more than that.

+ +

And it so happens that in most cases, a configuration optimal given criterion A, is not optimal given criterion B.

+ +

If, given the same criterion, two alternative configurations ""perform equally well"", then, strictly speaking, they are both ""optimal"" -although if the criterion is some continuous quantity, it will be difficult for the two to perform exactly the same. Still, in practice, if their performance according always to the same criterion is ""very close"" (vague term), then we usually say that they are ""equivalent"" in terms of predictive or explanatory power.

+ +

Finally, the criterion is not ""achieve the ideal"" (say, zero error), but ""how close to the ideal can you be"" (minimum error).

+",2013-10-31 13:58:27.493 +58591,5211.0,1,58621.0,,,Converting 2nd order Markov chain to the 1st order equivalent,,CC BY-SA 3.0,"

Given a 2nd order Markov chain where each state takes values in the set $\mathcal{X}=\{A,C,G,T\}$, such that all transition probabilities $p(x_t|x_{t-1},x_{t-2})$ are larger than zero,

+ +

how can it be converted to an equivalent 1st order chain with all the transition probabilities defined?

+",2013-10-31 13:59:26.240 +58592,23216.0,1,,,,Regression where a subset of observations are missing data on an independent variable,,CC BY-SA 3.0,"

Consider the regression equations below:

+ +

\begin{align} +Y_i &= \beta_0 + \beta_1 X_{i1} + \varepsilon_i \\ +Y_j &= \beta_0 + \beta_1 X_{j1} + \beta_2 X_{j2} + \varepsilon_j +\end{align}

+ +

where $Y_i,\ X_{i1},\ \varepsilon_i,\ Y_j,\ X_{j1},\ \& \ X_{j2},\ \varepsilon_j$ are vectors, and $_i$ and $_j$ index distinct sets of observations. The $_i$ respondents did not meet a qualification criterion and hence were not asked the question that corresponds to $X_2$.

+ +

The dependent variable and the first independent variable are the same in both regression equations, but the second regression equation has an independent variable that is not present in the first. Obviously, I can estimate the two regressions separately, but that will not be efficient. Therefore, I was considering re-writing the first one as:

+ +

$$Y_i = \beta_0 + \beta_1 X_{i1} + \beta_2 X_{i2} + \varepsilon_i$$

+ +

where $X_{i2}$ is a vector of $0$s.

+ +

Then I can estimate the parameter estimates by using OLS with the equation below:

+ +

$\left[ \begin{array}{ccc} +Y_i\\ +Y_j\end{array} \right] = \left[ \begin{array}{ccc} +{\bf 1} & X_{i1} & X_{i2} \\ +{\bf 1} & X_{j1} & X_{j2}\end{array} \right] \left[ \begin{array}{ccc} +\beta_0\\ +\beta_1\\ +\beta_2\end{array} \right] + \left[ \begin{array}{ccc} +\epsilon_i\\ +\epsilon_j\end{array} \right]$

+ +

In the above equation, ${\bf 1}$ stands for a vector of $1$s of the appropriate dimension.

+ +

Is the above a standard approach to obtaining efficient estimates? Is there a name to this way of estimation?

+",2013-10-31 14:18:26.747 +58593,23215.0,1,,,,Weighted Random Selection with bounded replacement,,CC BY-SA 3.0,"

The problem: Let there be an array (denoted X) where each item has a label assigned to it. For example:

+ +

$$ +X= \{a_0, b_0, b_1, c_0, c_1, a_1\} +$$

+ +

Here the letter denotes a label, and the index is just another instance from the given label. Now the goal is to sample n items from X into Y, where:

+ +

$$ +|Y|=n < m = |X| +$$

+ +

and each label has a weight assigned to it, so that in the long run the weights define the distribution of the elements in Y. Elements within a label are distinct, but any of them may be selected. The multiplicity of the labels is random too (it may vary from 1 to 10). The length of X will usually be around 100, from which around 10 items need to be selected.

+ +

So, let's say if the weights are

+ +

$$W = \{a=5,\; b=45,\; c=50\}$$ then over 10,000 runs, if we add up the elements drawn into Y, about 5% of them will be from a, 45% from b, and 50% from c.

+ +

The indexes just denote that you have multiple item instances of a label. There is no frequency requirement within a label (so simple random sampling can/should be used there).

+ +

The sampling is without replacement when considering the full array X; however, it is with bounded replacement if you consider just the labels (that is, the labels a, b, c).

+ +

In the end we do not need an exact match with W; matching in expectation is okay. Successive runs may result in slightly different output, and that too is okay (although the expected frequency of the labels should approximate W).

+ +

Now, I've already tried to do this in Python via roulette wheel selection, where, if the selected label has no more items, I redo the spin; however, the resulting frequencies sometimes end up far off.
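
+ +

For reference, a hedged R sketch of that baseline scheme (redrawing the label is equivalent to renormalising the weights over the labels that still have items; function and variable names are invented):

+ +

sample_by_label <- function(items, labels, w, n) {
+  avail  <- rep(TRUE, length(items))
+  picked <- integer(0)
+  for (k in seq_len(n)) {
+    left <- unique(labels[avail])                     # labels with items remaining
+    lab  <- if (length(left) == 1) left else sample(left, 1, prob = w[left])
+    idx  <- which(avail & labels == lab)
+    j    <- if (length(idx) == 1) idx else sample(idx, 1)
+    picked <- c(picked, j); avail[j] <- FALSE
+  }
+  items[picked]
+}
+# items  <- c(""a0"", ""b0"", ""b1"", ""c0"", ""c1"", ""a1"")
+# labels <- substr(items, 1, 1); w <- c(a = 5, b = 45, c = 50)
+# sample_by_label(items, labels, w, 4)
+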

+ +
    +
  1. What's a good algorithm for this?

  2. +
  3. What's wrong with my code? (why the big difference sometimes - here's the python code http://pastebin.com/SFhV74Z8)

  4. +
+",2013-10-31 14:25:15.797 +58594,5917.0,2,,58589.0,,,,CC BY-SA 3.0,"

Because the overworked maintainers of the gbm package have not had time to implement random feature sampling at each split calculation yet. I submitted a bad patch that did this as a proof of concept, but:

+ +
    +
  • My C++ skills are non existent
  • +
  • I provided no documentation
  • +
  • I didn't integrate with the formula interface wrapper
  • +
+ +

So I feel no ill will for not picking up the patch. I haven't maintained the fork either so I'm sure it wouldn't integrate with the current gbm. You can see where I left off here: https://code.google.com/r/sheaparkes-mtry-additions/source/browse

+ +

If you really need the feature sampling functionality, it's available in Python's scikit.learn package implementation of gbm.

+",2013-10-31 14:33:02.057 +58595,20470.0,2,,58591.0,,,,CC BY-SA 3.0,"

The first order transition matrix: $T^1$ is of size $[k*k]$. And the second order transition matrix: $T^2$ is of size $[k^2*k]$. So you want to reduce the number of rows from $k^2$ to $k$ by merging.

+ +

An example is given on the Wikipedia link, you should be able to convert $T^2$ to $T^1$ simply by marginalising over the $t-2$ states (which are not needed for $T^1$) at each column.

+ +

My explanation is probably not crystal clear but I think you will understand what I mean once you see the example on the link.

+",2013-10-31 14:34:26.360 +58596,633.0,2,,58477.0,,,,CC BY-SA 3.0,"

Is it not intuitive that you cannot reason from cause to unobserved effect to another cause? If the rain (B) and the sprinkler (D) are causes of the wet ground (C), then can you argue that seeing rain implies that the ground is probably wet, and continue to reason that the sprinkler must be on since the ground is wet?! Of course not. You argued that the ground was wet because of the rain — you can't look for additional causes!

+ +

If you observe the wet ground, of course the situation changes. Now you may be able to reason from one cause to the other as Frank explains.

+",2013-10-31 14:36:28.603 +58597,306.0,2,,58413.0,,,,CC BY-SA 3.0,"

In order to check if a variable is significant for a category, do a hypothesis test for a binomial variable assuming that the probability of getting a 1 is 0.5. In order to answer the other questions, do the same for each of the constituent variables. If the p-value in the two cases is sufficient to reject the null hypothesis, then you can claim that the variable, or the set of variables, is characteristic of the category; if you cannot reject it, then the variables behave randomly for that category. Check the link Binomial test.
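
+ +

For example, in R, if a binary variable took the value 1 in 40 out of 60 documents of a category (made-up counts), the exact test would be:

+ +

binom.test(40, 60, p = 0.5)   # two-sided exact test of H0: p = 0.5
+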

+",2013-10-31 14:46:39.507 +58598,23218.0,1,58703.0,,,Finding probability density function with unknown values,,CC BY-SA 3.0,"

I am not sure if this is the place to ask. I have tried to read up on the probability density function (PDF) in order to answer this question, but to no avail.

+ +

How do I go about starting on this?

+ +

How do I generate a function based on the given $\lambda$ and that small $t$ symbol?

+ +

What is $X \sim Ga(n,\lambda)$?

+ +

An explanation before the answer would be nice. I appreciate any help.

+ +

Regards

+ +

+",2013-10-31 14:47:22.883 +58599,22970.0,1,,,,Generalized Linear Mixed Models: Diagnostics,,CC BY-SA 3.0,"

I have a random intercept logistic regression (due to repeated measurements) and I would like to do some diagnostics, specifically concerning outliers and influential observations.

+ +

I looked at residuals to see if there are observations that stand out. But I would also like to look at something like Cook's distance or DFFITS. Hosmer and Lemeshow (2000) say that due to the lack of model diagnostic tools for correlated data, one should just fit a regular logistic regression model ignoring the correlation and use the diagnostics tools available for regular logistic regression. They argue that this would be better than doing no diagnostics at all.

+ +

The book is from 2000 and I wonder if there are methods available now for model diagnostics with mixed effects logistic regression? What would be a good approach to check for outliers?

+ +

Edit (Nov 5, 2013):

+ +

Due to the lack of responses, I am wondering if doing diagnostics with mixed models is not done in general or rather not an important step when modeling data. So let me rephrase my question: What do you do once you found a ""good"" regression model?

+",2013-10-31 14:55:26.660 +58600,22511.0,1,,,,Finding maximum likelihood,,CC BY-SA 3.0,"

This is a model that is used to model soccer scores, so $i$ and $j$ are, respectively, home and away teams. Random variables $(x,y)$ are the goals scored by the home and away teams, respectively. Parameter $\lambda$ is a known mean goals scored by the home team and $\mu$ is the mean goals scored by the away team. I have managed to fix all the other parameters except for $\rho$, which I have to estimate via MLE.

+ +

$$Pr(X_{i,j}=x, Y_{i,j}=y)=\tau_{\lambda, \mu}(x,y)\frac{\lambda^x \text{exp}(-\lambda)}{x!}\frac{\mu^y\text{exp}(-\mu)}{y!}$$ +where +$$\lambda=\alpha_{i}\beta_{j}\gamma$$ +$$\mu=\alpha_{j}\beta_{i}$$ +and +$$\tau_{\lambda,\mu}(x,y)=\left\{\begin{array}{cc} +1-\lambda\mu\rho &\text{if $x=y=0$,} \\ +1+\lambda\rho &\text{if $x=0,y=1$,}\\ +1+\mu\rho &\text{if $x=1,y=0$,}\\ +1-\rho &\text{if $x=y=1$,}\\ +1 &\text{otherwise}\end{array} +\right.$$

+ +

Based on the above equations, all the parameters $(\lambda, \mu, \alpha, \beta, \gamma)$ are known constants.

+ +

So the problem I am having is that I have no clue how to estimate $\rho$ by maximum likelihood, since a piece-wise function is involved.

+ +

Also, it would be great if anyone could show how to do this in R.

+",2013-10-31 14:57:59.460 +58601,2081.0,2,,58587.0,,,,CC BY-SA 3.0,"
    +
  1. Both cosine similarity and euclidean distance require scale (= metric) level data, that is, interval or ratio level. I suppose that is what you mean by ""numeric"". Also, binary data (1 vs 0) will do (though there is theoretical controversy). Nominal data - convert it into dummy binary data first. Ordinal data - decide whether to treat it as interval or as nominal.
  2. +
  3. Squared euclidean distance and cosine similarity are exactly related. You always can transform one into the other. 1.
  4. +
  5. It is not generally a good idea to compute a (dis)similarity coefficient on a hodge-podge of different characteristics (different types and/or units), even if you do the recodings mentioned in point 1 and appropriate standardizations prior to the computation, because there remains the issue of weighting (relative importance) of the characteristics. Gower similarity (rather than cosine/euclidean) is the measure of choice if you are nevertheless determined to base the coefficient on mixed characteristics. It can ""take"" interval, ordinal, binary and nominal ones (with user-defined weighting, if necessary); a small R sketch follows this list. 1, 2.
  6. +
  7. If you are going to compute similarity based on binary characteristics only, be aware that not all of the great variety of binary ""matching"" similarity coefficients suit natural binary characteristics and dummy variables (i.e. former nominal ones) equally well. 1.
  8. +
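
+ +

As a small illustration of point 3, Gower dissimilarity on the three example records can be computed in R with daisy() from the cluster package (column names are invented; Gower similarity is 1 minus the dissimilarity):

+ +

library(cluster)
+d <- data.frame(
+  likes_tech = factor(c(""Y"", ""N"", ""N"")),
+  gpa_band   = ordered(c(""Band 1"", ""Band 3"", ""Band 2""),
+                       levels = c(""Band 3"", ""Band 2"", ""Band 1"")),
+  institute  = factor(c(""UUIC"", ""ADU"", ""UUIC""))
+)
+as.matrix(daisy(d, metric = ""gower""))   # pairwise Gower dissimilarities in [0, 1]
+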
+",2013-10-31 15:04:11.687 +58602,14965.0,1,58749.0,,,Time Varying System Matrices in Kalman Filter,,CC BY-SA 4.0,"

Kalman filter can accommodate time varying system matrices. Equations to run the filter are the same and it preserves its optimality under linear gaussian model.

+ +

My question is the following:

+ +

Can the evolution of time varying system matrices be stochastic? In some references I seem to read between the lines that they should evolve deterministically. Does it mean that the entire filter breaks or do we simply lose optimality by making them stochastic?

+ +

For reference, please peek at section 3.2 of the following paper:

+ +

http://www.ims.cuhk.edu.hk/~cis/2012.1/CIS%2012-1-05.pdf

+ +

A similar comment is in Harvey's book on Kalman Filter.

+",2013-10-31 15:33:02.747 +58603,16159.0,2,,58119.0,,,,CC BY-SA 3.0,"

If I were doing a binomial test, and I had 100% of my results indicating pass until the nth sample, at which time I had one sample indicate fail, then I would say that my estimate of the binomial distribution parameters and their confidence intervals might be very different before versus after the ""fail"".

+ +

My binomial test is whether or not the samples belong to an expected distribution which if true gets a ""pass"" or whether they qualify as an outlier in which case the result is ""fail"".

+ +

Keywords of interest may include ""zero defect sampling"", and ""acceptance sampling"".

+ +

Reference links:

+ + +",2013-10-31 15:42:29.387 +58604,20538.0,2,,58588.0,,,,CC BY-SA 3.0,"

In high volume sites just randomly sending the user to one of your test pages or control (let's call them factor levels) usually works fine. There may be some variability in the mix of browsers between the factor levels but you'll have a sample size that's large enough for it not to matter so much. This is also very easy to implement, even if you randomly load-balance or shard across multiple servers.

+ +

For low-volume sites where you have $n$ factor levels you could generate random permutations and force the next $n$ users to the factor level according to the permutation. You could do this within each group you define (e.g. browser type). This does require keeping track of a global state, and assumes you can do so fast enough that you can observe an order in which the users arrive. I don't know of anyone that does this.
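
+ +

A rough R sketch of the two assignment schemes (not tied to any particular web framework; k is the number of factor levels):

+ +

k <- 3
+# high-volume: independent random assignment per visitor
+assign_visitor <- function() sample.int(k, 1)
+# low-volume: blocked randomisation -- each block of k visitors
+# sees every level exactly once, in a fresh random order
+new_block <- function() sample.int(k)
+new_block()
+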

+",2013-10-31 15:42:31.400 +58605,7007.0,2,,58600.0,,,,CC BY-SA 3.0,"

If I understand it, your data is $\{(i_m,j_m,x_m,y_m) : m=1,\dots,n\}$, in which, for the $m$-th match, $i_m$ is the index of the home team, $j_m$ is the index of the away team ($i_m\neq j_m$), $x_m$ is the number of goals scored by the home team, and $y_m$ is the number of goals scored by the away team.

+ +

If we have $t$ teams disputing the $n$ matches, the likelihood $L(\rho,\gamma,\alpha_i,\beta_i, i=1,\dots,t)$ is proportional to +$$ + \prod_{m=1}^n \tau_{\lambda_m,\mu_m}(x_m,y_m)\,\lambda_m^{x_m} \,\mu_m^{y_m}\,e^{-(\lambda_m+\mu_m)} \, , +$$ +in which $\lambda_m = \alpha_{i_m} \beta_{j_m}\gamma$ and $\mu_m = \alpha_{j_m} \beta_{i_m}$. This likelihood is a function of $2(t+1)$ parameters. You will probably need to implement some conjugate gradient method to solve this constrained optimization problem. I wonder what happens if we put some priors on the parameters and MCMC the posterior with a random walk Metropolis. You said that you know the values of the parameters, except for $\rho$ (and I wonder how...). Hence, your first step is to write an R function that computes the product above for each value of $\rho$. Then, plot it and use the optimizer to see what happens.
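
+ +

As a hedged sketch of that first step, the profile log-likelihood in $\rho$ alone could look like this in R, assuming x, y, lambda and mu are numeric vectors holding, for each match, the observed goals and the known means:

+ +

tau <- function(x, y, lambda, mu, rho) {
+  out <- rep(1, length(x))
+  i00 <- x == 0 & y == 0; out[i00] <- 1 - lambda[i00] * mu[i00] * rho
+  i01 <- x == 0 & y == 1; out[i01] <- 1 + lambda[i01] * rho
+  i10 <- x == 1 & y == 0; out[i10] <- 1 + mu[i10] * rho
+  i11 <- x == 1 & y == 1; out[i11] <- 1 - rho
+  out
+}
+loglik_rho <- function(rho, x, y, lambda, mu)
+  sum(log(tau(x, y, lambda, mu, rho))) +
+  sum(dpois(x, lambda, log = TRUE)) + sum(dpois(y, mu, log = TRUE))
+# maximise over an interval on which every tau term stays positive, e.g.
+# optimize(loglik_rho, c(-0.2, 0.2), maximum = TRUE, x = x, y = y, lambda = lambda, mu = mu)
+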

+",2013-10-31 15:49:28.083 +58606,23220.0,1,,,,Export variance-covariance matrix using PROC GLM,,CC BY-SA 3.0,"

I have a ordinary linear regression model like this

+ +
y = b0 + b1*x + b2*z + b3*x*z
+
+ +

I used PROC GLM in SAS to fit the model. Now I want to export the variance-covariance matrix of the coefficients (b0, b1, b2, and b3), but I didn't find any option to export it. I can't use PROC REG directly because of the interaction term.

+ +

Does anyone know how to get it?

+",2013-10-31 15:57:08.483 +58607,5671.0,2,,58581.0,,,,CC BY-SA 3.0,"

Neither ultimately makes sense.

+ +

First of all, Earth is not flat. Don't use Euclidean distance on latitude, longitude coordinates, because that is highly inaccurate.

+ +

So let's assume you don't have GPS data but, e.g., positions in meters within a room; then Euclidean distance on these attributes makes sense.

+ +

Check yourself by looking at the units. Physical data has units, for a very good reason: $$\sqrt{(x_1-x_2)^2+(y_1-y_2)^2} \sim_\text{units} \sqrt{m^2 + m^2} \sim m$$

+ +

I.e. Euclidean distance, applied to two coordinates in meter, returns a distance in meters!

+ +

Now assume that your third attribute is e.g. Volt.

+ +

$$ +\sqrt{(x_1-x_2)^2+(y_1-y_2)^2+(v_1-v_2)^2} \sim_\text{units} \sqrt{m^2 + m^2 + V^2} \sim ??? +$$

+ +

You cannot add (squared) meters to (squared) volts. They are entirely different things.

+ +

You might, instead, want to look at an algorithm that can deal with multiple relations. For example Generalized DBSCAN can trivially be used to cluster this data by specifying a different $\varepsilon$ for each Relation. You would then specify ""neighbors"" as ""within 1 meter of distance and 10 Volts in the measurement"".

+ +

See how nicely this works out for some algorithms to keep different data separate?

+",2013-10-31 16:00:49.880 +58608,5374.0,1,,,,Why is Standard Deviation used to evaluate the effect of a change in an independent variable on the dependent one?,,CC BY-SA 3.0,"

I'm reading some economics papers about the relationship between inequality and growth and some of them have sentences like these:

+ +
+

an increase of 0.07 (one standard deviation in the sample) in the income share of the top 20 percent lowers the average annual growth rate just below half a percentage point

+
+ +

and

+ +
+

the estimated coefficients imply that an increase in, say, the land Gini coefficient by one standard deviation (an increase of 0.16 in the Gini index) would lead to a reduction in growth of 0.8 percentage points per year

+
+ +

Why is ""one standard deviation"" used? Why is it preferred to a unitary change? Thanks

+",2013-10-31 16:04:03.883 +58609,22564.0,1,61071.0,,,What examples of lurking variables in controlled experiments are there in publications?,,CC BY-SA 3.0,"

In this paper:

+ +

Brian L. Joiner, ""Lurking Variables: Some Examples"", The American Statistician, Vol. 35, No. 4 (Nov. 1981), pp. 227-233.

+ +

Brian Joiner claims that ""randomization is not a panacea"". This is contrary to common statements such as the one below:

+ +
+

A well-designed experiment includes design features that allow + researchers to eliminate extraneous variables as an explanation for + the observed relationship between the independent variable(s) and the + dependent variable. These extraneous variables are called lurking + variables.

+
+ +

The quote was taken from this question and does not have a source but in my experience it is representative of the prevailing attitude: +Examples of Lurking Variable and Influential Observation

+ +

One example given is that when testing the safety (specifically carcinogenesis) of red #40 food dye on rodents in the seventies an effect of cage position was found to confound the study. Now I have read many journal articles studying carcinogenesis in rodents and have never seen anyone report controlling for this effect.

+ +

Further discussion of these studies can be found here: +A case study of statistics in the regulatory process: the FD&C Red No. 40 experiments.

+ +

I could not find a non-paywalled version but here is an excerpt:

+ +
+

At the January meeting, we presented a preliminary analysis (14) that + disclosed a strong correlation between cage row and RE (reticulo-endothelial tumor) death rates, + which varied from 17% (bottom row) to 32% (top row) (table 2). We + could not explain this strong association by sex, dosage group, or + rack column or position. A subsequent analysis (18) also indicated + that cage position (front vs. back) might be correlated with non-RE + mortality and that position was correlated with time to non-RE death.

+
+ +

I am specifically interested in why there seems to be such a problem with replication in the medical literature, but examples from all fields would be welcome. Note that I am interested in examples from randomized controlled experiments, not observational studies.

+",2013-10-31 16:12:08.707 +58610,20470.0,1,58634.0,,,Effect of Wald-test and collinearity on Logistic Regression model selection,,CC BY-SA 3.0,"

A researcher is interested in how variables, such as GRE (continuous), GPA (continuous) and rank of the undergraduate institution (categorical), affect admission into graduate school. The response variable, admit/don't admit, is a binary variable. The data set is taken from UCLA stats page.

+ +
admisdata <- read.csv(""http://www.ats.ucla.edu/stat/data/binary.csv"")
+summary(admisdata) 
+admisdata$rank <- factor(admisdata$rank)
+mylogit <- glm(admit ~ gre + gpa + rank, family = ""binomial""(link=logit), data = admisdata)
+
+ +

My questions follow:

+ +

1) In the code (below), they check whether there is a statistically significant difference between the rank3 and rank4 coefficients. What would the consequence be if the difference is not significant (as below)? Are we better off merging rank3 and rank4 or leaving one out?

+ +
l2 <- cbind(0, 0, 0, 0, 1, -1)  # rank3 with rank4
+wald.test(b = coef(mylogit), Sigma = vcov(mylogit), L = l2)
+>Wald test:
+>Chi-squared test:
+>X2 = 0.29, df = 1, P(> X2) = 0.59
+
+ +

2) In another list of heuristics, it is recommended to look for collinearities by checking the correlation matrix of the estimated coefficients. And it is stated: ""If two covariates are highly correlated, do not need both of them in the model"". For the given model fit:

+ +
cov2cor(vcov(mylogit))
+
+>            (Intercept)          gre         gpa        rank2       rank3       rank4
+>(Intercept)   1.0000000 -0.241538075 -0.80278632 -0.234145435 -0.12357608 -0.18775966
+>gre          -0.2415381  1.000000000 -0.34207786 -0.004867914  0.04925080  0.02589326
+>gpa          -0.8027863 -0.342077858  1.00000000  0.043045375 -0.08263837  0.02573691
+>rank2        -0.2341454 -0.004867914  0.04304537  1.000000000  0.63655379  0.53030520
+>rank3        -0.1235761  0.049250801 -0.08263837  0.636553788  1.00000000  0.48337703
+>rank4        -0.1877597  0.025893262  0.02573691  0.530305204  0.48337703  1.00000000
+
+ +

It seems like the highest inter-coefficient correlation is between rank3 and rank2. Does that mean it is better to leave one of them out or merge them? How do we decide what correlation value is significant enough?

+ +

3) Or, should one prioritise looking at the AICs of the different models with/without these categories to compare them, instead of the issues listed in 1) and 2)?
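
+ +

For what it is worth, one way to carry out the comparison raised in 1) and 3), continuing the R code above (merging rank3 and rank4 into one level, refitting, and comparing the nested fits), could look like this:

+ +

admisdata$rank34 <- admisdata$rank
+levels(admisdata$rank34)[levels(admisdata$rank34) %in% c(""3"", ""4"")] <- ""3+4""
+mylogit2 <- glm(admit ~ gre + gpa + rank34, family = binomial(link = ""logit""),
+                data = admisdata)
+AIC(mylogit, mylogit2)                     # smaller is better
+anova(mylogit2, mylogit, test = ""Chisq"")   # likelihood-ratio test of the merge
+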

+",2013-10-31 16:25:10.943 +58611,,2,,58608.0,user31668,,,CC BY-SA 3.0,"

Practically, the value of 1 may have very different meaninings in different contexts, so the standard deviation would put the sensitivity in terms of a ""typical"" deviation from the current value.

+ +

Theoretically, if a normal approximation is being used to model the variables, then 1 standard deviation is a convenient metric for converting deviations into probabilities via the Z-score.

+",2013-10-31 16:39:06.383 +58612,7007.0,2,,58586.0,,,,CC BY-SA 3.0,"

For $n\geq 1$, since +$$X_n = \rho^nX_0 + \rho^{n-1}W_0 + \rho^{n-2}W_1 + \dots + W_{n-1} \, ,$$ +it follows that $W_n$ is independent of $X_1, \dots,X_n$, because they are functions of $X_0,W_0,W_1,\dots,W_{n-1}$ only and the $W_i$'s and $X_0$ are independent. Therefore, defining $\mathscr{F}_n=\sigma(X_1,\dots,X_n)$, we have almost surely +$$ + \mathrm{E}[X_{n+1}\mid\mathscr{F}_n] = \rho\,\mathrm{E}[X_n\mid\mathscr{F}_n] +\mathrm{E}[W_n\mid\mathscr{F}_n] = \rho\,X_n + \mathrm{E}[W_n] \, . +$$ +From here we need to know more about the value of $\rho$ and the distributions of $X_0$ and the $W_i$'s.

+ +

For example, if $\mathrm{E}[W_i]=0$, for every $i\geq 1$, and $\rho=1$, then $\{(X_i,\mathscr{F}_i)\}_{i\geq 1}$ is a martingale.

+ +

If $\mathrm{E}[W_i]\leq 0$, for every $i\geq 1$, and $\rho=1$, then $\{(X_i,\mathscr{F}_i)\}_{i\geq 1}$ is a supermartingale.

+ +

If $X_0$ and the $W_i$'s are almost surely positive, and $\rho>1$, then $\{(X_i,\mathscr{F}_i)\}_{i\geq 1}$ is a submartingale.

+",2013-10-31 16:46:26.213 +58613,5237.0,2,,58606.0,,,,CC BY-SA 3.0,"

If it is more convenient to export the variance-covariance matrix using PROC REG, then you can use that. You can get an interaction term by doing a DATA step first, and creating a new dataset with an extra variable that constitutes the x by z interaction. The code might look something like this:

+ +
DATA new.data;
+    SET data;
+    xz = x*z;
+RUN;
+PROC REG DATA=new.data;
+    MODEL y = x z xz /COVB;
+RUN;
+
+",2013-10-31 16:50:47.407 +58614,22637.0,1,58618.0,,,Cauchy Distribution and Central Limit Theorem,,CC BY-SA 3.0,"

In order for the CLT to hold, the distribution we sample from needs to have a mean $\mu$ and a finite variance $\sigma^2$. Would it be true to say that, in the case of the Cauchy distribution, whose mean and variance are undefined, the Central Limit Theorem fails to provide a good approximation even asymptotically?

+",2013-10-31 17:30:53.127 +58615,8958.0,2,,58510.0,,,,CC BY-SA 3.0,"

You can use the discriminant analysis to predict the cluster using your principal components as independent variables, so your model would be:

+ +

$Cluster=Component_1+Component_2,...,Component_n$

+ +

And no, you don't have a restriction on the number of components you can use with regard to the number of clusters you have; I would use them all. By means of cross-validation I would measure how well this model (a linear discriminant) predicts the cluster, and if the accuracy is good you would know that the clusters are separable and crisp, which could be interpreted as cluster health. Note that you could use any classifier to do this. You could also directly use separability measures for the same purpose, like Jeffries-Matusita or divergence.
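
+ +

A minimal R sketch of that idea, assuming a data frame pc_scores holding the component scores plus a factor column cluster (names are hypothetical):

+ +

library(MASS)                                          # provides lda()
+fit <- lda(cluster ~ ., data = pc_scores, CV = TRUE)   # leave-one-out CV
+mean(fit$class == pc_scores$cluster)                   # cross-validated accuracy
+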

+",2013-10-31 17:49:10.763 +58616,23223.0,1,,,,Why does asymptotic distribution of LRT not depend on specific functions used to express null set?,,CC BY-SA 3.0,"

For a likelihood-ratio test (LRT) statistic, $\Lambda(x)$, the asymptotic distribution of the statistic $-2 \log \Lambda(x)$ is a chi-squared distribution, which occurs only under certain regularity conditions. Here, we have two hypotheses for the unknown vector $\theta$:

+ +
    +
  1. $H_0: \theta \in \omega$ versus
  2. +
  3. $H_1: \theta \in \Omega \cap \omega^C$.
  4. +
+ +

Here, $\Omega$ is the full parameter space, and $\omega$ is the restricted subspace for the null hypothesis. The subspace $\omega \subset \Omega$ is defined in terms of $q$ independent constraints of the form $g_1(\theta) = a_1, \ldots, g_q(\theta) = a_q$, where $0 < q \le p$ and $a_1, a_2, \ldots, a_q$ are constants. The functions $g_1, \ldots, g_q$ must be continuously differentiable.

+ +

Why does the asymptotic distribution of $-2 \log \Lambda(x)$ not depend on what the functions $g_1, \ldots, g_q$ actually are as long as they are smooth and independent?

+",2013-10-31 17:58:29.983 +58617,8958.0,2,,58373.0,,,,CC BY-SA 3.0,"

As daniellopez46 noted, I think you are thinking of consensus clustering, where you basically form an ensemble of different clustering runs. What is a bit strange here is that you would want the ensemble to contain results from different clustering methods, which can be very misleading. I say this because, unlike supervised learning, unsupervised learning always has, to a greater or lesser degree, a subjective component, as you need an idea of what kind of grouping you would be interested in based on your data. Elaborating a bit, clustering is labeling observations based on the relationship they have with other observations in your feature space. Different clustering algorithms will understand this in totally different ways, as they are looking for different things. Depending on what kind of topology you are looking for, you will (as a human) be satisfied with what one clustering algorithm produced on some data set and be totally dissatisfied with what it did on another data set. Look at this question I recently answered, where you can see a diagram of how different clustering techniques treat the same data sets.

+ +

Another thing that should be noted is that consensus clustering is still very new and is basically just being explored, so don't take it as a panacea.

+",2013-10-31 18:03:04.320 +58618,1889.0,2,,58614.0,,,,CC BY-SA 3.0,"

The distribution of the mean of $n$ i.i.d. samples from a Cauchy distribution has the same distribution (including the same median and inter-quartile range) as the original Cauchy distribution, no matter what the value of $n$ is.

+ +

So you do not get either the Gaussian limit or the reduction in dispersion associated with the Central Limit Theorem.
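
+ +

A quick simulation in R illustrates this: the median and IQR of the sample mean stay essentially the same no matter how many Cauchy draws are averaged.

+ +

set.seed(1)
+for (n in c(1, 10, 1000)) {
+  m <- replicate(10000, mean(rcauchy(n)))
+  cat(n, median(m), IQR(m), ""\n"")    # spread does not shrink with n
+}
+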

+",2013-10-31 18:18:24.980 +58626,23233.0,1,,,,Rolling twelve-month crime rate,,CC BY-SA 3.0,"

I need to do a 12 month rolling crime rate. I have monthly crime counts and population counts every 3 months.

+ +

For a calendar year I would usually use a population from mid-year as the denominator and the crime rate for the year as the numerator and then standardise using, say, 100,000. (i.e. crime count / population * 100 000).

+ +

However, now I need to do a rolling 12-month rate, and I'm unclear which population count to use as the denominator. Each method I can think of biases the results in some way. As I see it, my options for the population denominator are: (a) an average of the population counts over the 12 months; (b) the middle population count of the 12 months; (c) the count at the end of the 12 months.

+ +

Which option is most appropriate or is there a more appropriate method?

+",2013-11-01 00:08:38.230 +58695,306.0,2,,58687.0,,,,CC BY-SA 3.0,"

Ideally you would want to estimate the missing data using some other data that you might have collected which has some form of relation with the data that you want to estimate. If there is no other data whatsoever, then you would want to look at some kind of autocorrelation kind of a structure so that you can predict the current value based on some of the previous ones. If there is no such information too, but you can assume the data to be from a particular distribution then use the mean to replace missing data. if no distribution can be assumed, use the median to replace the missing data.

+",2013-11-02 10:07:38.983 +58619,22942.0,1,58765.0,,,Problem with response optimization with three variables using Response Surface in Minitab,,CC BY-SA 3.0,"

I'm intending to do a response optimization of one response, $y$, having three predictor variables, $x_1$, $x_2$, and $x_3$. These variables are coded in the following manner:

+ +
A    B   C   y
+-1.00000    -1.00000    -1.00000    66
+ 1.00000    -1.00000    -1.00000    80
+-1.00000     1.00000    -1.00000    78
+ 1.00000    1.00000     -1.00000    100
+-1.00000    -1.00000     1.00000    70
+ 1.00000    -1.00000     1.00000    100
+-1.00000    1.00000      1.00000    60
+ 1.00000    1.00000     1.00000     75
+-1.68179    0.00000     0.00000     100
+1.68179     0.00000      0.00000    80
+0.00000    -1.68179     0.00000     68
+0.00000     1.68179     0.00000     63
+0.00000     0.00000     -1.68179    65
+0.00000     0.00000      1.68179    82
+ 0.00000    0.00000      0.00000    113
+0.00000     0.00000     0.00000     100
+0.00000     0.00000     0.00000     118
+0.00000     0.00000     0.00000     88
+0.00000     0.00000    0.00000       100
+0.00000     0.00000     0.00000     85
+
+ +

What I have tried is (in Minitab)

+ +

Stat -> DOE -> Define Custom Response Surface

+ +

And choosing 3 responses, 6 center points, 1 replicate and alpha 1.682, and subsequently choosing

+ +

Stat -> DOE -> Optimize Response variable

+ +

and choosing Maximize with a lower bound of 100 and a target of 118, I got that the maximum for $y$ is

+ +

101.711

+ +

and D=0.41

+ +

but A=0.6, B = -0.11 and C=0.15

+ +

which, if I've understood correctly, is invalid for this design.

+ +

When I've googled, all I see are people who merely make surface plots and conclude in which direction the response is optimized; however, with three variables I do not know how to perform this.

+",2013-10-31 19:53:16.933 +58620,23230.0,1,,,,Correlated variables in a math model,,CC BY-SA 3.0,"

Let's say you have 8 variables in a regression model. If some of them are correlated, at what degree or percentage of correlation should you consider removing some of the variables from the equation? How about covariance - what effect does that have?

+ +

Lastly, how many variables are too many to inform the model? 5, 15, 50, 500? The model I have is a profit/financial type model and is fairly straightforward - profit is the output.

+",2013-10-31 21:33:02.790 +58621,594.0,2,,58591.0,,,,CC BY-SA 3.0,"

Here's a way to do it:

+ +

(I may be writing my state vectors and transition matrices transposed relative to the way you might have learned them, or even the way they're usually done. If that's the case you'll need to translate back.)

+ +

The probability model gives you probabilities for 4 output states at time $t$ in terms of the 16 input states - the possible ordered pairs for $(x_{t-1},x_{t-2})$.

+ +

For speed of writing, let's write $AC$ for $(A,C)$ and so on.

+ +

\begin{array}{c|cccc|cccc|c} + & AA & AC &AT&AG& CA &CC &CT &CG& \ldots \\ \hline +A &p_{AA\to A}&p_{AC\to A}&p_{AT\to A}&p_{AG\to A}&p_{CA\to A}&p_{CC\to A}&p_{CT\to A}&p_{CG\to A}\\ +C &p_{AA\to C}&p_{AC\to C}&p_{AT\to C}&p_{AG\to C}&p_{CA\to C}&p_{CC\to C}&p_{CT\to C}&p_{CG\to C}\\ +T &p_{AA\to T}&p_{AC\to T}&p_{AT\to T}&p_{AG\to T}&p_{CA\to T}&p_{CC\to T}&p_{CT\to T}&p_{CG\to T}\\ +G &p_{AA\to G}&p_{AC\to G}&p_{AT\to G}&p_{AG\to G}&p_{CA\to G}&p_{CC\to G}&p_{CT\to G}&p_{CG\to G}\\ \hline +\end{array}

+ +

We could label the partitions with boldface versions of the state $x_{t-1}$:

+ +

\begin{array}{c|cccc|cc|c|cc} + & AA & AC &AT&AG& CA & &\ldots && \ldots& GG \\ \hline + A & & & & & & & & & & \\ + C & & \mathbf{A} & & & \quad\mathbf{C} & &\mathbf{T}& & \mathbf{G} & \\ + T & & & & & & & & & & \\ + G & & & & & & & & & & \\ \hline +\end{array}

+ +

As I mentioned in comments, you need to extend the state. Let $z_t$ consist of pairs of states $(x_t,x_{t-1})$ and now consider a Markov Chain in $z_t$; that is, you have a transition matrix $p(z_t|z_{t-1})$.

+ +

So the state at time $t$ will be one of the 16 pairs $(A,A), (A,C) \ldots (G,G)$, and the transition matrix will be a 16 $\times$ 16 matrix of transition probabilities that will be mostly zero (necessarily so, because any pair that doesn't have the second component of $z_t$ match with the first component of $z_{t-1}$ is impossible).

+ +

As above, for speed of writing, let's also write $AC$ for $(A,C)$ and so on.

+ +

For ease of display I am going to define $z_{t-1}^*$ which is simply a permuted $z_{t-1}$. We can write $p(z_t|z_{t-1}^*)$ and then arrive back at $p(z_t|z_{t-1})$ by simple permutation.

+ +

So the transition matrix for $p(z_t|z_{t-1}^*)$ is of the form:

+ +

\begin{array}{c|cccc|cc|c|cc} + & AA & AC &AT&AG& CA & &\ldots && \ldots& GG \\ \hline + AA & & & & & & & & & & \\ + CA & & \mathbf{A} & & & \quad\mathbf{0} & &\mathbf{0}& & \mathbf{0} & \\ + TA & & & & & & & & & & \\ + GA & & & & & & & & & & \\ \hline + AC & & & & & & & & & & \\ + \vdots & & \mathbf{0} & & & \quad\mathbf{C} & &\mathbf{0}& &\mathbf{0} & \\ + \vdots & & & & & & & & & & \\\hline + \vdots & & \mathbf{0} & & &\quad\mathbf{0} & &\mathbf{T}& & \mathbf{0} &\\ + \vdots & & & & & & & & & & \\ \hline + \vdots & & \mathbf{0} & & &\quad\mathbf{0} & &\mathbf{0}& & \mathbf{G} & \\ + GG & & & & & & & & & & \\ +\hline +\end{array}

+ +

We can then rearrange either the rows or columns so they're in the same order; the transition matrix no longer has that simple structure, but contains the same values.

+ +

Generally, you can use this procedure to transform any $k$-th order Markov chain to a first-order MC (also holds for Hidden Markov Models).
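
+ +

To make the construction concrete, here is a hedged R sketch. It assumes a user-supplied function p2(nxt, prev1, prev2) returning p(x_t = nxt | x_{t-1} = prev1, x_{t-2} = prev2); pairs are labelled as above, with the more recent state written first:

+ +

states <- c(""A"", ""C"", ""G"", ""T"")
+z <- expand.grid(recent = states, older = states, stringsAsFactors = FALSE)
+lab <- paste0(z$recent, z$older)       # 16 pair labels, e.g. ""AC"" = (A, C)
+T1 <- matrix(0, 16, 16, dimnames = list(from = lab, to = lab))
+for (i in 1:16) {                      # from-state: (x_{t-1}, x_{t-2})
+  for (j in 1:16) {                    # to-state:   (x_t,     x_{t-1})
+    if (z$older[j] == z$recent[i])     # shared x_{t-1} must agree
+      T1[i, j] <- p2(z$recent[j], z$recent[i], z$older[i])
+  }
+}
+rowSums(T1)                            # each row should sum to 1
+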

+",2013-10-31 22:06:08.017 +58622,16464.0,1,58902.0,,,Single or Multiple Models for Prediction with Categorical Variable,,CC BY-SA 3.0,"

For a project I need to build a model that can predict batch processing time for the steps in a manufacturing process that are performed on a specific tool type. I know that the processing time is a function of the batch size (number of individual pieces of product loaded into the equipment at once) and the step in question.

+ +

I assume that the relationship between processing time and batch size is linear and the data indicate that the relationship is quite different based on which step is being run. In a stylized example where there were two steps run on the equipment under investigation, I've assumed an accurate way to describe the relationship between processing time and batch size is +$$ +\mathbf{y=\delta_{0}+\delta_{1}x_{1}+\epsilon} +$$ +for step 1 and +$$ +\mathbf{y=\alpha_{0}+\alpha_{1}x_{1}+\epsilon} +$$ +for step 2, where $\mathbf{\delta}\neq\mathbf{\alpha}$. Segregating the data by step and creating a unique predictive model for each step is less efficient than using an indicator variable to represent the step (with the actual data I have more like 10 steps). Using a model that treated the step as an indicator variable, I would need to specify a model +$$ +\mathbf{y=\beta_{0}+\beta_{1}x_{1}+\beta_{2}x_{2}+\beta_{3}x_{1}x_{2}+\epsilon} +$$ +to accurately capture both models stated above within one model that treated the step as an independent variable.

+ +

I thought the larger model might produce predictions identical to the two smaller models but I was unsure because the parameters would be estimated with the 'full' data set in the general case and only a portion of the data set in the case of the smaller models. To test this I created a data set in R:

+ +
# create variables and define relationships
+x1 <- rnorm(100, 12, 3)              # batch size
+x2 <- vector(""character"", 100)       # processing step
+x2[1:50] <- ""A""
+x2[51:100] <- ""B""
+x2 <- as.factor(x2)
+y <- vector(""numeric"", 100)
+y[1:50] <- 50 + 25*x1[1:50] + rnorm(50, 0, 5)
+y[51:100] <- 100 + 15*x1[51:100] + rnorm(50, 0, 5)
+
+# create linear models
+fullDataLm <- lm(y ~ x1 + x2 + x1*x2)
+aLm <- lm(y[1:50] ~ x1[1:50])
+bLm <- lm(y[51:100] ~ x1[51:100])
+
+summary(fullDataLm)
+summary(aLm)
+summary(bLm)
+
+ +

This code gives the following results:

+ +
Call:
+lm(formula = y ~ x1 + x2 + x1 * x2)
+
+Residuals:
+    Min      1Q  Median      3Q     Max 
+ -35.332  -9.246   1.227  10.716  26.028 
+
+Coefficients:
+        Estimate Std. Error t value Pr(>|t|)    
+(Intercept)  49.8611     4.8206  10.343  < 2e-16 ***
+x1           24.8566     0.3481  71.410  < 2e-16 ***
+x2B          54.9103     7.1373   7.693 1.26e-11 ***
+x1:x2B      -10.1553     0.5396 -18.819  < 2e-16 ***
+---
+Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+
+Residual standard error: 14.07 on 96 degrees of freedom
+Multiple R-squared:  0.987, Adjusted R-squared:  0.9866 
+F-statistic:  2427 on 3 and 96 DF,  p-value: < 2.2e-16
+
+
+Call:
+lm(formula = y[1:50] ~ x1[1:50])
+
+Residuals:
+    Min      1Q  Median      3Q     Max 
+-35.332  -8.802  -0.608  11.255  24.448 
+
+Coefficients:
+            Estimate Std. Error t value Pr(>|t|)    
+(Intercept)  49.8611     4.7467   10.50 4.94e-14 ***
+x1[1:50]     24.8566     0.3428   72.52  < 2e-16 ***
+---
+Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+
+Residual standard error: 13.85 on 48 degrees of freedom
+Multiple R-squared:  0.991, Adjusted R-squared:  0.9908 
+F-statistic:  5259 on 1 and 48 DF,  p-value: < 2.2e-16
+
+
+Call:
+lm(formula = y[51:100] ~ x1[51:100])
+
+Residuals:
+    Min      1Q  Median      3Q     Max 
+-33.812 -10.049   3.803  10.529  26.028 
+
+Coefficients:
+            Estimate Std. Error t value Pr(>|t|)    
+(Intercept) 104.7714     5.3427   19.61   <2e-16 ***
+x1[51:100]   14.7013     0.4186   35.12   <2e-16 ***
+---
+Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+
+Residual standard error: 14.28 on 48 degrees of freedom
+Multiple R-squared:  0.9625,    Adjusted R-squared:  0.9618 
+F-statistic:  1234 on 1 and 48 DF,  p-value: < 2.2e-16
+
+ +

The results indicate that the larger model is identical to the smaller models in terms of the coefficient estimates. I.e. +$$ +\mathbf{\beta_{0}=\delta_{0}} \\ +\mathbf{\beta_{1}=\delta_{1}} \\ +\mathbf{\beta_{0}+\beta_{2}=\alpha_{0}} \\ +\mathbf{\beta_{1}+\beta_{3}=\alpha_{1}} \\ +$$ +My questions are:

+ +

(1) Can anyone explain algebraically how the above is true considering that the parameter estimation equation is: +$$ +\mathbf{\hat{\beta}=(X’X)^{-1}X’y} +$$ +and neither $\mathbf{X}$ nor $\mathbf{y}$ is the same in either of the three regressions. Obviously the indicator variables must cause the parameter estimation equations to end up being equal but I’d like to know how exactly.

+ +

(2) Does this result hold for parameter estimation techniques other than OLS?

+ +

(3) Which methodology is considered to be correct within the statistics community and why? Since the predictions from the larger model are identical to the predictions from the smaller models, I know that the residuals for the larger model are just the residuals of both smaller models (In my R example the residuals from the 100 observations used in the larger model are just the residuals from the 50 observations used in each of the smaller models). Consequently, if the regression assumptions are satisfied to a larger extent for data within one step then another it seems that the larger model may balance out the data and appear satisfactory in satisfying the regression assumption, while in fact the data lend themselves to regression analysis for one step but for another step are inconsistent with the use of regression.

+",2013-10-31 22:26:25.270 +58623,21346.0,1,,,,Transforming the explanatory variables,,CC BY-SA 3.0,"

What are the considerations that we need to take into account if we need to transform just the explanatory variables (not the dependent variable). I have data on assets and liabilities and I need to use assets-liabilities or net worth as the explanatory variable. I can't use logarithm because net worth can be negative or zero or positive. In addition, if I just use the amount of networth (raw data) the maximum likelihood estimator doesn't achieve convergence. But, with the Inverse hyperbolic sine (IHS) transformation, the convergence is achieved. My first question is if it is safe to say that we are using the IHS transformation just because we don't have convergence. Are there any other considerations that we need to look into in this regard? Second question is how do we find the scale parameter (theta) in IHS transformation. ?

+",2013-10-31 22:41:53.390 +58624,10135.0,2,,58620.0,,,,CC BY-SA 3.0,"

This is called multicollinearity in regression. Dropping a variable is just one way to handle the situation, and it is not the only remedy: sometimes you standardize your independent variables, or get more data (if possible), or even fit the regression while acknowledging the multicollinearity. To assess how strongly the predictors are correlated, you can compute the variance inflation factor (VIF), $VIF(\hat{\beta_i})=\dfrac{1}{1-R^2_i}$, where $R^2_i$ is the coefficient of determination of the regression in which you replace $Y$ (the dependent variable) with $X_i$ and regress it on all the other independent variables. Then you find the $R^2$ as usual; it is denoted $R^2_i$ to emphasize that you regressed $X_i$ on the other independent variables. As a rule of thumb, when $VIF(\hat{\beta_i})>5$ (or in some references $VIF(\hat{\beta_i})>10$), we say that multicollinearity is high.
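
+ +

A minimal R sketch of the VIF check (the car package provides vif(); the model formula and data frame names are invented):

+ +

library(car)
+fit <- lm(profit ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8, data = mydata)
+vif(fit)        # flag predictors with VIF above roughly 5 (or 10)
+# the same quantity computed by hand for x1:
+1 / (1 - summary(lm(x1 ~ x2 + x3 + x4 + x5 + x6 + x7 + x8, data = mydata))$r.squared)
+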

+ +

To answer your 2nd question, just think about the relation between covariance and correlation of two random variables. Then I guess you can find your answer. the correlation is just scaled version of covariance.

+ +

For the last question: I don't think there is a specific number of variables that counts as ""too many"" in every model. You need to fit a good regression model, but that does not mean you should add too many variables, because over-parametrization is another problem that should be avoided.

+",2013-10-31 23:26:26.763 +58625,23201.0,2,,143.0,,,,CC BY-SA 3.0,"

I used this RunningStats C++ Library in an embedded application. It is the most simple running stats library I have found yet.

+ +

From the link:

+ +
+

The code is an extension of the method of Knuth and Welford for + computing standard deviation in one pass through the data. It computes + skewness and kurtosis as well with a similar interface. In addition to + only requiring one pass through the data, the algorithm is numerically + stable and accurate.

+
+",2013-10-31 23:55:13.693 +58627,5086.0,1,58672.0,,,What method is simulating pvalues from re sampling from the data,,CC BY-SA 3.0,"

A while back I asked a question about correlating times between time stamps and received a response from Peter Ellis that said I could calculate mean distances between codes...

+
+

This already will give you some sense of which behaviours are clustered together, but you also should check that this isn't plausibly due just to chance.

+

To check that, I would create simulated data generated by a model under the null hypothesis of no relation. Doing this would require generating data for each behaviour's time from a plausible null model, probably based on resampling the times between each event (eg between each yawn) to create a new set of time stamps for hypothetical null model events. Then calculate the same indicator statistic for this null model and compare to the indicator from your genuine data. By repeating this simulation a number of times, you could find out whether the indicator from your data is sufficiently different from the null model's simulated data (smaller average time from each yawn to the nearest stretch, for example) to count as statistically significant evidence against your null hypothesis.

+
+

I finally possess the skill set to do this and have done so in R but I don't know what this method or technique is called so that I can (a) learn more about it (b) speak intelligently about the theory behind what I'm doing.

+

Some people have suggested this is called a permutation test, others say similar to but not the same as bootstrapping and some have told me it's related to Monte Carlo re sampling.

+

What is this method of resampling, given the NULL is TRUE, called? If you have a reference or two to back up your response that may be helpful but not necessary.

+",2013-11-01 01:12:07.763 +58628,,2,,58398.0,user30490,,,CC BY-SA 3.0,"

I figured it out. When the reciprocal roots are real we will have that +$$\phi_1=(r_1+r_2)\hspace{.1in}\text{ and }\hspace{.1in}\phi_2=-r_1r_2$$ +Likewise when the reciprocal roots appear as a complex pair then we have that +$$|\phi_1|=2r\cos(\omega)\hspace{.1in}\text{ and }\hspace{.1in}\phi_2=-r^2$$

+ +

Thus I can just plug that into the above equation and plot as a function of $\omega$.

+",2013-11-01 02:30:53.827 +58629,23237.0,1,,,,Does the sum of several distributions become more central or approximated to Normal,,CC BY-SA 3.0,"

As the classic CLT states Xs follow the same distribution, then the sum of Xs approximate to Normal distribution. +But what about several Xs follows different distributions (maybe the same class but not the same value of parameters or totally different kinds)? +It seems that the sum will become more central or even more Normal ""likely"". +Maybe some kinds of ""Generalized"" CLT has interests about that.

+ +

Is there any theory doing study about such kind of research? +What characteristics if the Xs follow, we will have such results. What kinds of assumptions fails, then we have another results?

+ +

We know that sum of the uniforms is triangle distribution, while sum of the Poissons is still Poisson. Sum of stable family (including Normal, Cauchy) is still stable distribution. +And about student-t, the sum is not student-t but not knowing what kind it is. But it seem that the fat-tail effect of the sum is thinner.

+ +

So could you please tell me some more general rules about such aspects.

+",2013-11-01 02:34:39.047 +58630,21804.0,2,,57636.0,,,,CC BY-SA 3.0,"

I would advise against using statistical methods for determining the ""best"" variable. You have only 98 observations (how are ""yes"" and ""no"" answers distributed?) and more variables than cases. This is a recipe for disaster in the sense that any attempt to build a model with all variables is prone to overfit the data. You will find packages that try to do the trick and some careful cross-validation might help you to avoid some obvious pitfalls, but do not assume that you will learn much on a conceptual level. My suggestion would therefore be to eliminate variables that are weak on theoretical grounds before moving to the analysis step or to collect more cases if that is possible. And: test some simple models (equal weighting of variables) as competitors to get some benchmarks.

+ +

There is less literature on the soft and fuzzy process of conceptual variable selection than on algorithmic ways to ""solve"" this problem, but this is not necessarily due to the superiority of the latter. Some pointers in the literature could be:

+ +

Dawes, R. M. (1979). The robust beauty of improper linear models in decision making. American psychologist, 34(7), 571.

+ +

Freedman, D. A. (1991). Statistical models and shoe leather. Sociological methodology, 21, 291-313.

+ +

Freedman, D. (1999). From association to causation: some remarks on the history of statistics. Statistical Science, 14(3), 243-258.

+",2013-11-01 03:14:02.883 +58631,22891.0,1,,,,Differences Between Logistic Regression in Statistics and in Machine Learning,,CC BY-SA 3.0,"

I just found out that machine learning also has logistic regression as one of its methods. Can someone please tell me the differences between logistic regression in statistics and machine learning? I've seen lecture slides on logistic regression from a machine learning course, but I can't see the difference with the coverage of logistic regression in a statistics course.

+ +

Does logistic regression in machine learning have no need to check for multicollinearity?

+ +

The reason I asked this is because I've tried to run a dataset through R's glm function with binomial logit, and then I ran the same dataset through Apache Mahout's trainlogistic. But the resulting coefficients are different.

+ +

This is the command I use in R:

+ +
w1.glm <- glm(anw ~ cs, data = w1, family = ""binomial"")
+
+ +

This is the result of summary(w1.glm):

+ +
glm(formula = anw ~ cs, family = ""binomial"", data = w1)
+
+Deviance Residuals: 
+    Min       1Q   Median       3Q      Max  
+-2.5400   0.1073   0.1924   1.0047   1.0047  
+
+Coefficients:
+            Estimate Std. Error z value Pr(>|z|)    
+(Intercept)  0.42077    0.02588   16.26   <2e-16 ***
+cs           1.89342    0.06427   29.46   <2e-16 ***
+---
+Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+
+(Dispersion parameter for binomial family taken to be 1)
+
+    Null deviance: 11762.5  on 10660  degrees of freedom
+Residual deviance:  9250.3  on 10659  degrees of freedom
+
+ +

And this is the command I use in Mahout:

+ +
/usr/local/mahout/bin/mahout trainlogistic --input w1.csv --output ./model --target anw --categories 2 --predictors cs --types numeric --features 20 --passes 100 --rate 50
+
+Running on hadoop, using /usr/local/hadoop/bin/hadoop and HADOOP_CONF_DIR=
+MAHOUT-JOB: /usr/local/mahout/mahout-examples-0.8-job.jar
+20
+anw ~ 
+-19.553*cs + -7.512*Intercept Term
+            cs -19.55265
+      Intercept Term -7.51155
+    0.000000000     0.000000000     0.000000000     0.000000000     0.000000000     0.000000000   -19.552646543     0.000000000     0.000000000     0.000000000     0.000000000     0.000000000     0.000000000    -7.511546797     0.000000000     0.000000000     0.000000000     0.000000000     0.000000000     0.000000000 
+13/11/01 02:04:47 INFO driver.MahoutDriver: Program took 22118 ms (Minutes: 0.3686333333333333)
+
+ +

Edited: Added the reason I asked the question in the title. Added the commands used to execute glm in R and trainlogistic in Mahout.

+",2013-11-01 06:49:31.823 +58632,23242.0,1,58635.0,,,show that $X_{n}= E[X|\mathcal F_{n}]$ is an $\mathcal F_{n}$-martingale,,CC BY-SA 3.0,"

Let $X$ be an integrable random variable defined on probability space $(\Omega , \mathcal F,P)$, and let $\mathcal F_{n},n\ge0$ , be a filtration on this space.
+ show that $X_{n}= E[X|\mathcal F_{n}]$ is an $\mathcal F_{n}$-martingale.

+",2013-11-01 08:33:04.497 +58648,5448.0,2,,58644.0,,,,CC BY-SA 3.0,"

Power is an attribute of a test that depends upon the true parameter values, not the estimated parameter values. It may be that the assumptions about the true parameter values that you made when calculating power are contraindicated by the data, but that does not affect the power as such, since that is specific to the null/alternative hypotheses used in the calculation. It is important to remember that power calculations take into account the randomness of the sample; in your case, for example, a power calculation based upon a true correlation of 0.5 does not assume that the sample correlation will always be 0.5.

+ +

Consequently, your thinking is correct. If your assumptions are correct, the test still has 90% power. As Michael Meyer has pointed out in comments, given your sample size, it is not even the case that your sample correlation indicates that your assumptions are false, which means there isn't any reason to revisit your power calculations.
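
+ +

To see how variable the sample correlation is in this situation, here is a small R sketch (simulated, not from the original data): with only $n=3$ pairs and a true correlation of 0.5, sample correlations of 0.2, or far lower, are entirely common, so they say little about the true value used in the power calculation.

+ +
library(MASS)
+set.seed(1)
+sigma <- matrix(c(1, 0.5, 0.5, 1), 2)                        # true correlation 0.5
+r <- replicate(10000, cor(mvrnorm(3, c(0, 0), sigma))[1, 2])
+quantile(r, c(0.1, 0.5, 0.9))   # the middle 80% of sample correlations is very wide
+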

+",2013-11-01 13:25:50.947 +58633,9408.0,1,58821.0,,,Comparing the effect of a treatment that was optional for its receivers,,CC BY-SA 3.0,"

In 2012 we collected data at our university about retention of students from first semester to second semester along with some other variables. The retention variable is binary, 'retained' or 'did not retain'.

+ +

In 2013 we introduced a new system. Student success advisers were assigned and any student when needed could go to them. We collected some new variables associated with the introduction of the student success advisers together with those collected in 2012. Now, going to these advisers was not mandatory. Those who felt a problem could go to them.

+ +

Just looking at 2013 data alone makes it seem that the advisers make things worse: the students they see do worse, and are more likely to leave university. Only the students who faced problems probably went to the advisers and those who were doing well probably didn't. If 10 students went to the advisers with problems and 8 retained to semester 2, that is surely a success. But if 80 out of 100 students who didn't have any problem and thus didn't go to the adviser retained to semester 2, that will have log odds similar to those who went to the advisers and thus tell you that going to the advisers had no significant effect!

+ +

Part of the reason was a major policy change that affected how universities in our country took in students. There was a ""cap"" in place before 2013, a limit on how many students a university could take on and expect government funding for. In 2013 that was removed. So, successful, desirable universities started opening their doors to more students, and more students were able, suddenly, to get into desirable universities. Our university is not so desirable. So what probably happened was that our university was forced in 2013 to take significantly ""worse"" students, which could well lead to greater retention difficulty.

+ +

What my bosses think is that we need to weight the 2012 data so that it matches the 2013 data on some of the variables that may have been sensitive to the removal of the cap on student entry (or weight the 2013 data so that it matches the 2012 data). Then we can compare two different cohorts as if they were a proper control group for each other. In fact, this is illusory. Some suggested using SPSSINC RAKE, but I am not getting any clue why.

+ +

What we want to find out (very broadly) is whether introducing the advisers etc. worked, and the degree to which it worked.

+",2013-11-01 08:36:37.353 +58634,12683.0,2,,58610.0,,,,CC BY-SA 3.0,"

First, if the researcher's interested in how those variables affect admission, doesn't that interest include how much difference there is between third & fourth ranks?

+ +

(1) It's not a good idea to merge categories based just on their having similar responses—you're introducing bias into the coefficient estimates. Things like this are done ruefully, when they have to be, to fix problems with the model, not as a matter of course.

+ +

(2) Think this through. A high correlation between two predictors makes it difficult to separate their effects on the response. How many values of rank can a single person have? Are you ever going to be interested in predicting admission for someone having both rank=2 & rank=3? +[Edit in response to comment: The answer is that a single person can only have one level of a categorical predictor, & you'll never be interested in predicting the response for someone with more than one level, so it's to be expected that there's correlation between the levels, & poses no problem at all. Neither leave one out (which would be equivalent to merging it with the reference level) nor merge them. (This is sometimes called structural multicollinearity to distinguish it from the problematic kind of multicollinearity).]

+ +

(3) Read the post you linked to carefully. Gung is saying that Akaike's Information Criterion can be useful to decide between a few candidate models ""of substantive interest to you""; he's not recommending trawling through all possibilities to find the model with the lowest AIC.

+",2013-11-01 09:39:01.750 +58635,21029.0,2,,58632.0,,,,CC BY-SA 3.0,"

To prove that this is a martingale, you must show three things:

+ +

    +
  1. $X_n$ is $\mathcal F_{n}$-measurable, which is given by the definition of conditional expectation.
  2. Next, that $E|X_n| < \infty $. Since $X$ is integrable, this holds as well: by (conditional) Jensen's inequality, $E|X_n| = E|E[X|\mathcal F_{n}]| \le E|X| < \infty$.
  3. Finally, show that $E[ X_n | \mathcal F_{n-1}] = X_{n-1} $.
+ +

$ E[ X_n | \mathcal F_{n-1}] = E[ E[ X | \mathcal F_{n}] | \mathcal F_{n-1}] = E[ X | \mathcal F_{n-1}] = X_{n-1}$

+ +

So $X_n$ is a $\mathcal F_{n}$ martingale.

+",2013-11-01 09:54:38.803 +58636,18085.0,2,,46894.0,,,,CC BY-SA 3.0,"

This is a kind of late answer to my question.

+ +

The short answer is that different sets of data can have (nearly) the same coefficients while the model explains a different proportion of the variance. +This explains the low variability of the coefficients together with the high variability of $R^2$.
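
+ +

As a small illustration of this point, here is an R sketch with simulated data (the variable names and noise levels are invented for the example): the two fits have essentially the same slope, but very different $R^2$.

+ +
set.seed(1)
+x <- rnorm(200)
+y1 <- 2 * x + rnorm(200, sd = 0.5)    # low noise
+y2 <- 2 * x + rnorm(200, sd = 5)      # high noise
+c(coef(lm(y1 ~ x))[2], coef(lm(y2 ~ x))[2])                      # slopes: both near 2
+c(summary(lm(y1 ~ x))$r.squared, summary(lm(y2 ~ x))$r.squared)  # R^2: very different
+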

+ +

So it's actually quite a simple thing.

+",2013-11-01 10:31:39.700 +58637,22923.0,2,,58375.0,,,,CC BY-SA 3.0,"

First, just as a side note - your yields (x and y) seem to be non-stationary - you see these two clouds on your scatter plot - they probably happened to lie at that particular slope to each other just by chance, because the yields ""randomly walked"" in those particular directions. Had they randomly walked in other directions, you could have a completely different slope (and you probably will in the future - something like a third cloud somewhere off to the side). So when you draw a regression line it may turn out to be spurious.

+ +

Next, one interpretation of weighted least squares regression is that the residuals in your underlying model have different variances in different regions (heteroscedasticity), and you put more weight where they have smaller variance in order to obtain more efficient estimates of the regression coefficients. But then the computed standard deviation of the residuals will not be interpretable as an estimate of their true standard deviation - because they come from different distributions with different variances :)

+ +

But if you still want some measure of stdev taking into account relative weights or importance of points - then Mean Squared Weighted Deviation may be a way to go.

+",2013-11-01 10:59:22.467 +58638,21728.0,1,58640.0,,,"Show that for a Geometric distribution, the probability generating function is given by $\frac{ps}{1-qs}$, $q=1-p$",,CC BY-SA 3.0,"

Suppose that $X$ has a geometric distribution with probability mass function $P(X=i) = q^{i-1}p$, $i=1,2,...$ and $q=1-p$

+ +

Show that its probability generating function is given by $ \pi(s)=\frac{ps}{1-qs}$. Hence show that $E(x)=\frac{1}{p}$ and $Var(X)=\frac{q}{p^2}$

+ +

Hi everyone, I am doing this question for exam practice, and I can't seem to get the correct answer. And to be honest, I am just working through it mechanically and don't have a great understanding of the probability generating functions.

+ +

Here is what I have:

+ +

$$\pi(s)=E(S^X)=\sum^\infty_{i=0}q^{i-1}p\cdot s^i$$ +$$= p\sum^\infty_{i=0}q^{i-1}\cdot s^i=p\sum^\infty_{i=0}\frac{q^i}{q}\cdot s^i$$ +$$=\frac{p}{q}\sum^\infty_{i=0}(qs)^i$$

+ +

Then using the sum of a geometric series formula, I get:

+ +

$$=\frac{p}{q}(\frac{1}{1-qs})$$

+ +

Now I am stuck. I feel like I am close, but am just missing something. I'll be ok with deriving the expected value and variance once I can get past this part.

+ +

As an addition I was wondering if anyone could also give me a bit of an 'idiots' explanation of the probability generating function, as I am struggling to understand it conceptually. $s$ seems to be the dependent variable, but my lecturer hasn't explained what exactly it is.

+ +

Many thanks in advance!

+",2013-11-01 11:16:29.010 +58639,23243.0,1,58641.0,,,Goodness of fit tests for quantile regression in R,,CC BY-SA 3.0,"

What goodness of fit tests are usually used for quantile regression? Ideally I need something similar to F-test in linear regression, but something like AIC in logistic regression will suite as well. I use quantreg R package, but found only some Khmaladze test in there. To be fair I hardly understand what is does.

+",2013-11-01 11:19:57.813 +58640,21029.0,2,,58638.0,,,,CC BY-SA 3.0,"

It's natural that you'd arrive at the wrong answer in this case: the problem is that your index is wrong. There are two definitions of the pmf of a geometric distribution. The one you use, where $E(X)=\frac{1}{p}$, is defined on $1, 2, \ldots$ and is not defined at zero. So the generating function's sum needs to take this into account as well.

+ +

$$\pi(s)=E(S^X)=\sum^\infty_{i=1}q^{i-1}ps^i$$ +$$= ps\sum^\infty_{i=1}(qs)^{i-1}=ps\sum^\infty_{i=0}(qs)^i$$ +$$=\frac{ps}{1-qs}$$
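
+ +

As a quick numerical sanity check of the closed form (a sketch; the values of $p$ and $s$ below are arbitrary), direct summation of the series and $ps/(1-qs)$ should agree:

+ +
p <- 0.3; q <- 1 - p; s <- 0.7
+i <- 1:10000
+sum(q^(i - 1) * p * s^i)   # E(s^X) by direct summation over the support 1, 2, ...
+p * s / (1 - q * s)        # closed form; the two numbers should match
+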

+ +

If you use the alternative definition, where $P(Y=i)=q^ip$ for $i=0,1,2,\ldots$, then the pmf is defined at zero. In this case the generating function converges to $\frac{p}{1-qs}$.

+ +

As for what $s$ represents: as far as I know, it represents nothing in itself - it is just a formal (dummy) variable. Generating functions are derived functions that hold information in their coefficients. They are sometimes left as an infinite sum, and sometimes they have a closed-form expression. Take a look at the Wikipedia articles on probability generating functions and on generating functions, which give some examples of how they can be used.

+",2013-11-01 12:00:05.277 +58641,503.0,2,,58639.0,,,,CC BY-SA 3.0,"

quantreg includes several AIC functions: ""AIC.nlrq"", ""AIC.rq"", ""AIC.rqs"" and ""AIC.rqss"" and similar log likelihood functions.

+ +

It also has a vignette at vignette(""rq"",package=""quantreg"").

+ +

Do these do what you want?

+",2013-11-01 12:28:03.257 +58642,22923.0,2,,56580.0,,,,CC BY-SA 3.0,"

SVM doesn't assume normality. But it's still a regression that minimizes some symmetric loss function (I suppose you use symmetric kernel).

+ +

So... this is just a feeling and I'm too tired to justify/prove all this but:

+ +
    +
  1. Probably your output variable has a highly skewed distribution;
  2. And you use a symmetric gaussian kernel that leads to a symmetric squared loss to minimize (squared error with a bump cut-off, if I remember correctly?);
  3. Then SVM still estimates something close to the conditional mean of your data if you minimize this loss for the original output variable;
  4. When you log-transform the output variable and minimize that symmetric loss for it, then in terms of the original variable it estimates something like a conditional median;
  5. It's well known that the mean is the quantity that minimizes average squared error and the median is the quantity that minimizes average absolute error, so when you estimate the regression using the log-transformed output you get worse MSE but better MAPE (see the sketch after this list).
+ +
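
For the claim in the last point, here is a small R sketch (the exponential sample is an arbitrary skewed example, not from the question): a grid search over constants shows that squared loss is minimized near the mean and absolute loss near the median.

+ +
set.seed(1)
+x <- rexp(10000)                           # a skewed sample
+cand <- seq(0, 3, by = 0.01)
+sq_loss  <- sapply(cand, function(cc) mean((x - cc)^2))
+abs_loss <- sapply(cand, function(cc) mean(abs(x - cc)))
+c(cand[which.min(sq_loss)], mean(x))       # squared loss minimized near the mean (about 1)
+c(cand[which.min(abs_loss)], median(x))    # absolute loss minimized near the median (about 0.69)
+
+ +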

Hope this helps.

+",2013-11-01 12:39:25.987 +58643,2666.0,2,,58529.0,,,,CC BY-SA 3.0,"

You didn't state the ultimate goal of the exercise, hence the choice of ROC curves was not well motivated. Many useful things can be done with log-likelihood and Brier scores, as well as with the distribution of predicted risks (ignoring $Y$). The use of cutoffs is questionable.

+",2013-11-01 12:40:42.403 +58644,15806.0,1,58648.0,,,Is the post hoc power of a dependant t test related to the sample correlation or actual correlation,,CC BY-SA 3.0,"

I have a question that may be fairly easy to answer but it is making my head spin. If I do a paired t test on two groups of data with sample size 3 each AND I assume that the pairs are correlated with at least a correlation coefficient of 0.5, then I should have about 90% power to detect an effect size of 2SD.

+ +

Ok, so lets say that after the experiment, I check my sample correlation and it turns out that it is only 0.2 for that sample and not 0.5. I am wondering how this would affect the power of my test since, I believe, the sample correlation value influences my p-value. Would I then have to say that I actually had less than 90% power? Or did my test still have 90% power regardless of how the sample correlation turned out? I am thinking it is the latter rather than the former but I would appreciate some help as this whole business of calculating power after a test is very confusing. Hopefully my question makes sense.

+",2013-11-01 12:52:43.070 +58645,23246.0,1,58649.0,,,How can we find the decision boundary for two overlapping continuous uniform distribution?,,CC BY-SA 3.0,"

Say I have $X \sim \text{CUnif}(a, b)$ and $Y \sim \text{CUnif}(c, d)$. The parameters of $X$ and $Y$ overlap i.e., $a < c < b < d$.

+ +

How can I find a decision boundary in such case?

+ +

I am thinking of taking an arbitrary point (say $x_o$) as a decision boundary in between $b$ and $c$, then finding the probability of misclassification, and then minimizing it.

+",2013-11-01 13:01:27.717 +58646,449.0,2,,58644.0,,,,CC BY-SA 3.0,"

Power is the probability of finding a significant effect in a sample if what you believed true about the population is true. Correlation, effect, and variance all change from sample to sample. That's why power is expressed as a probability. It's sensitive to these variations.

+ +

Here's a simulation in R that demonstrates what I'm saying. It generates data from the population you expect in your question. But each sample has a different correlation, variance in groups, and effect.

+ +
library(MASS)
+# mvrnorm requires a variance, covariance matrix
+# with variances of 1 then the covariances will equal the correlation
+sigma <- matrix(c(1.0, 0.5,
+                  0.5, 1.0), 2, byrow = TRUE)
+y <- replicate(1000, {
+     mat <- mvrnorm(3, c(0, 2*sqrt(2)), sigma, empirical = FALSE) # get a sample
+     tt <- t.test(mat[,1], mat[,2], paired = TRUE, var.equal = TRUE)
+     tt$p.value })  #return p-value of t-test
+
+sum( y < 0.1 ) / 1000
+
+ +

The typical result is that about 90% of the tests pass a significance test.

+ +

[The alpha used is 0.1 from the comments and the effect size is diff / (pooledSD / sqrt(1-r)) based on comments as well. None of that's particularly germane to the basic story but I note it because there are arguments to use different effect size calculations and this is an atypical significance cutoff.]

+",2013-11-01 13:04:17.510 +58647,17328.0,2,,58519.0,,,,CC BY-SA 4.0,"

Here is a quick check using a computer algebra system. I am using the mathStatica package for Mathematica (I am one of the developers of the former) to automate the nitty gritties for me ...

+ +

Given: The parent pdf is $N(\mu, \sigma^2)$ with pdf $f(y)$:

+ +

+ +

Then, the joint pdf of the 1st and 2nd order statistics $(Y_1, Y_2)$, in a sample of size 2, denoted say $g(y_1,y_2)$ can be easily obtained using the OrderStat function:

+ +

+ +

Note that the constant multiplier here is $\frac{1}{\pi \sigma^2}$, not $\frac{1}{2\pi \sigma^2}$ in your equation.

+ +

Because the constraint $Y_1 < Y_2$ is already entered into the pdf definition, we can enter the domain of support on the real line as:

+ +

+ +

Finally, the expectation you seek is:

+ +

+ +

which agrees with your stated solution.

+ +
+ +

@Ioannis wrote:

+ +
+

Is there a way to see how the integral is computed though?

+
+ +

One can activate VerboseMode, which shows all the integrands being sent off for calculation. With VerboseMode[On], one can see the intermediary integrands ...

+ +

+ +

You might need to open the pic manually in a separate window to see the detail ...

+",2013-11-01 13:20:39.700 +58696,23279.0,2,,58694.0,,,,CC BY-SA 3.0,"

Strictly speaking, the normality-of-residuals assumption is not needed for OLS to work; it becomes an issue mainly in hypothesis testing. Since your residuals actually seem to be normally distributed, you're fine even in this area. Additionally, OLS does not assume anything about the distribution of the variables themselves, so you do not have to worry about that.

+",2013-11-02 11:01:07.143 +58650,23250.0,1,,,,Identifying linear relationships among many interrelated variables,,CC BY-SA 3.0,"

I have a dataset with many variables. Some of these variables are linked to each other in various ways, but I don't know in advance those that are.

+ +

For example, these are some relationships:

+ +
    +
  • $A=B$ -- obvious and easy to spot/remove;

  • $A=B\cdot\mathrm{constant}$ -- again easy to remove using a correlation matrix;

  • The problem comes with $A = B+C+D+E$ or even $A = B+(0.5\cdot C)+(0.5\cdot D)+E.$
+ +

This last kind of relationship is difficult to identify, and PCA doesn't provide me with all the clues needed to identify what A is made up of.

+ +

For example, a simple $A = B + C$ will not always show a link. +If half of the B values are 0.1, with the other half distributed randomly from 0.5-1.5, and C is normally distributed with mean 1, no correlation is detected between B and A. However, correlation is often detected between A and C, because the distribution of C is still present in A.

+ +

Without knowing in advance that these relationships are present, how do I identify them?

+ +

In essence, I am trying to remove these relationships before running PCA and later modelling with remaining uncorrelated variables.

+",2013-11-01 13:57:37.697 +58651,23251.0,1,,,,Mathematical equation for a generalized linear mixed model with interactions,,CC BY-SA 3.0,"

I'd like to know how the following R code for a binomial GLMM (package lme4) translates into proper mathematical notation.

+ +
glmer(Y ~ Var1*Var2*Var3 + (Var1+Var2+Var3|i), family = ""binomial"", data = df)
+
+ +

maybe something like:

+ +

$$ +logit(Y_i) = α + β_{Var1} Var1 × β_{Var2} Var2 × β_{Var3} Var3 + a_i +b_{i Var1} Var1 + b_{i Var2} Var2 +b_{i Var3} Var3 + ε_i +$$

+",2013-11-01 13:59:52.803 +58652,7278.0,1,58654.0,,,Predictions of Random Forest on training data don't lie around x=y line,,CC BY-SA 3.0,"

I've trained a random forest on a data set where the targets are in [0, 100]. I use a 5 fold cross validation framework to find the optimum mtry and then train the model on the whole data set using that mtry. When I make predictions on the training set I find that the low targets are over-predicted and the high targets are under-predicted and I don't know why this is happening. The RMSE would be lower if the predictions were just pushed farther from the mean value.

+ +

Can anyone indicate why this is happening? When the model makes predictions on a holdout set this problem is exacerbated, but I thought it instructive to show the predictions back on the training set in this example. I could augment the model to correct for this and that would improve results on the holdout set, but I would like to understand why this is happening so that I might fix it without augmenting the model. The distribution of target values is given below:

+",2013-11-01 14:01:56.863 +58653,23253.0,1,,,,Reliability standard via relative standard error for computer science or randomized algorithms,,CC BY-SA 3.0,"

I am performing experiments on randomized algorithms, and I want to do so in a scientifically sound way.

+ +

What relative standard error of the mean (RSEM) is acceptable? I know that often RSEM<30% or RSEM<25% is used, but does that also apply to randomized algorithms?

+ +

What paper/book can I cite that gives a reliability standard? This would optimally be a standard for the field of computer science or randomized algorithms, but a general reliability standard would be fine, too.

+ +
+ +

Details: I only found

+ + +",2013-11-01 14:30:27.487 +58654,5448.0,2,,58652.0,,,,CC BY-SA 4.0,"

This is actually to be expected, not just with random forests, and comes about as a consequence of the fact that the variance of the target variable = the variance of the model (the estimates) + the variance of the residuals (for least-squares type fitting procedures.) Given that the latter is positive, unless your model fits perfectly, it must be that the variance of the model < the variance of the target variable. As a result, the prediction vs. actual plot can't lie on the 45-degree line passing through 0; if it did, the variance of the target variable would be equal to the variance of the model, and there would be no room left for residual variance.

+ +

Here are four plots to illustrate this point with linear regression. In the first one, the error variance is relatively high, and, as a consequence, the predicted - vs - actual plot isn't anywhere near the diagonal line. In the second through fourth, the error variance is much lower, and the predicted - vs - actual plot gets much closer to the diagonal line.

+ +

First, the code:

+ +
x <- rnorm(1000)
+y <- x + rnorm(1000,0,2) # rnorm(1000,0,1), rnorm(1000,0,0.5), rnorm(1000,0,0.1) 
+
+plotlim <- range(y)
+plot(predict(lm(y~x))~y,ylim=plotlim,xlim=plotlim)
+abline(c(0,1))
+
+ +

Now, the plots:

+ +

+ + +

+ +

Consequently, there's no need to alter your fitting procedure or augment your model.

+ +

Further heuristic explanation: Note that this comes about because $\sigma^2_Y > \sigma^2_X$, in this particular linear regression model. Therefore, even with the true parameter values (in this case, 0 intercept and 1 slope), the plot of $Y$ will be more spread out than the plot of $X$, and, since the estimated values of $Y$ with the true parameter values will equal $X$, it will also be the case that the plot of $Y$ will be more spread out than the plot of the estimated values of $Y$. As a result, the estimated values vs. true values plot will not lie on a 45-degree line.

+",2013-11-01 14:35:34.233 +58655,23249.0,1,58657.0,,,"Analyzing installation rates, not quite binomial. Tried bootstrapping. Is there a approximate binomial approach that would be better?",,CC BY-SA 3.0,"

I'm analyzing a data set of installation rates of lighting fixture projects and want to determine the expected installation rate and the confidence interval. Projects have various numbers of fixtures intended to be installed and the most common rate is 100% followed in frequency by a handful of 0% and then a small number of rates besides 0 and 100%. The distribution of the sample data looks like this:

+ +

+ +

In many ways it seems like this data should follow a binomial distribution except for the variable number of fixtures in each project and the possibility of values other than 0 and 1.

+ +

As a result of this confusion, I simply bootstrapped the average installation rates, weighted by project size, and arrived at this distribution of results.

+ +

+ +

Is this a reasonable approach, or is there a better way to treat this as an approximate binomial distribution with weights for project size, or use a non-parametric approach?

+ +

Thanks in advance for any assistance...

+",2013-11-01 14:36:05.607 +58697,23280.0,1,,,,"What is the difference among Indicator, Index, Variable and Measure?",,CC BY-SA 3.0,"

What is exactly the difference among Indicator, Index, Variable and Measure? I would appreciate some reference.

+",2013-11-02 11:03:54.097 +58721,22682.0,2,,58720.0,,,,CC BY-SA 3.0,"

The solution to calculating the mean:

+ +

$\mathbb{E}(S_N) = 0\cdot P(N=0) + \mathbb{E}(X_1)\cdot P(N=1) + \mathbb{E}(X_1+X_2)\cdot P(N=2) + \cdots$

+ +

$= 0 + \mu_2 P(N=1) + 2\mu_2 P(N=2) + \cdots = \mu_2\sum_{i=0}^\infty i\cdot P(N=i)$

+ +

and the infinite sum above is just equal to the expectation of $N$, hence:

+ +

$\mathbb{E}(S_N) = \mu_2\,\mathbb{E}(N) = \mu_2\mu_1$
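
+ +

As a quick numerical check of this identity (a sketch; the Poisson $N$ and exponential $X_i$ are arbitrary illustrative choices, not from the question):

+ +
set.seed(1)
+mu1 <- 3                                   # E(N)
+mu2 <- 2                                   # E(X)
+S <- replicate(100000, sum(rexp(rpois(1, mu1), rate = 1 / mu2)))
+mean(S)                                    # should be close to mu1 * mu2 = 6
+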

+",2013-11-02 19:16:47.060 +58656,23255.0,2,,58578.0,,,,CC BY-SA 3.0,"

Because of your sample size, I would recommend using leave one out cross validation to estimate model fit. The algorithm goes like this:

+ +
    +
  1. Take out an observation from the data set.
  2. Fit the model.
  3. Estimate the class label of the held-out observation.
  4. Repeat steps 1-3 until all observations have been held out.
+ +

To see the algorithm's performance, look at how well it fits the held-out data.
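
+ +

For concreteness, here is a minimal base-R sketch of that leave-one-out loop (the data frame and logistic model below are made-up stand-ins, not the original data):

+ +
set.seed(1)
+df <- data.frame(y = rbinom(16, 1, 0.5), x1 = rnorm(16), x2 = rnorm(16))
+pred <- numeric(nrow(df))
+for (i in seq_len(nrow(df))) {
+  fit <- glm(y ~ x1 + x2, data = df[-i, ], family = binomial)                       # steps 1-2
+  pred[i] <- as.numeric(predict(fit, newdata = df[i, ], type = ""response"") > 0.5)  # step 3
+}
+mean(pred == df$y)   # after step 4: proportion of held-out cases classified correctly
+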

+ +

You may find, as Simone suggested, that your algorithm is overfitting the data.

+ +

One thing that I have found is that you need much more than 16 observations to fit a good classification model.

+ +

Check out the cvTools package to perform the cross validation.

+",2013-11-01 14:36:27.640 +58657,503.0,2,,58655.0,,,,CC BY-SA 3.0,"

If you want an estimate of the average (arithmetic mean) and its error, the bootstrap seems reasonable to me.

+ +

But are you sure you want the arithmetic mean and its error?

+ +

Of course, it depends on your application, but for most purposes, it seems to me that you would want something else. You might want estimation of quantiles (say, the 10th percentile, the median, and the 90th percentile), either of the installation rate (possibly weighted, possibly not); or you might want to go straight to the ""number of fixtures"" to be installed.
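
+ +

If it helps, here is a hypothetical sketch of bootstrapping the size-weighted mean (the vectors below are invented stand-ins for the project data; replacing the weighted mean with quantiles works the same way):

+ +
set.seed(1)
+rate <- c(rep(1, 60), rep(0, 10), runif(10))          # mostly 100%, a handful of 0%, a few others
+n_fixtures <- sample(5:50, length(rate), replace = TRUE)
+boot_means <- replicate(5000, {
+  i <- sample(seq_along(rate), replace = TRUE)        # resample whole projects
+  weighted.mean(rate[i], n_fixtures[i])               # installation rate weighted by project size
+})
+quantile(boot_means, c(0.025, 0.5, 0.975))            # bootstrap interval for the weighted mean
+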

+",2013-11-01 15:09:53.100 +58658,20457.0,1,58719.0,,,Training a convolution neural network,,CC BY-SA 3.0,"

I am currently working on face recognition software that uses convolutional neural networks to recognize faces. Based on my reading, I've gathered that a convolutional neural network has shared weights, so as to save time during training. But how does one adapt backpropagation so it can be used in a convolutional neural network? In backpropagation, one uses a formula similar to this to train the weights.

+ +
New Weight  = Old Weight +  LEARNING_RATE * 1 * Output Of InputNeuron * Delta
+
+ +

However, since in convolutional neural networks, the weights are shared, each weight is used with multiple neurons, so how do I decide which Output of InputNeuron is used?

+ +

In other words, since the weights are shared, how do I decide how much to change the weights by?

+",2013-11-01 16:05:22.290 +58659,668.0,2,,14729.0,,,,CC BY-SA 3.0,"

The question asks about ""identifying underlying [linear] relationships"" among variables.

+ +

The quick and easy way to detect relationships is to regress any other variable (use a constant, even) against those variables using your favorite software: any good regression procedure will detect and diagnose collinearity. (You will not even bother to look at the regression results: we're just relying on a useful side-effect of setting up and analyzing the regression matrix.)

+ +

Assuming collinearity is detected, though, what next? Principal Components Analysis (PCA) is exactly what is needed: its smallest components correspond to near-linear relations. These relations can be read directly off the ""loadings,"" which are linear combinations of the original variables. Small loadings (that is, those associated with small eigenvalues) correspond to near-collinearities. An eigenvalue of $0$ would correspond to a perfect linear relation. Slightly larger eigenvalues that are still much smaller than the largest would correspond to approximate linear relations.

+ +

(There is an art and quite a lot of literature associated with identifying what a ""small"" loading is. For modeling a dependent variable, I would suggest including it within the independent variables in the PCA in order to identify the components--regardless of their sizes--in which the dependent variable plays an important role. From this point of view, ""small"" means much smaller than any such component.)

+ +
+ +

Let's look at some examples. (These use R for the calculations and plotting.) Begin with a function to perform PCA, look for small components, plot them, and return the linear relations among them.

+ +
pca <- function(x, threshold, ...) {
+  fit <- princomp(x)
+  #
+  # Compute the relations among ""small"" components.
+  #
+  if(missing(threshold)) threshold <- max(fit$sdev) / ncol(x)
+  i <- which(fit$sdev < threshold)
+  relations <- fit$loadings[, i, drop=FALSE]
+  relations <- round(t(t(relations) / apply(relations, 2, max)), digits=2)
+  #
+  # Plot the loadings, highlighting those for the small components.
+  #
+  matplot(x, pch=1, cex=.8, col=""Gray"", xlab=""Observation"", ylab=""Value"", ...)
+  suppressWarnings(matplot(x %*% relations, pch=19, col=""#e0404080"", add=TRUE))
+
+  return(t(relations))
+}
+
+ +

Let's apply this to some random data. These are built on four variables (the $B,C,D,$ and $E$ of the question). Here is a little function to compute $A$ as a given linear combination of the others. It then adds i.i.d. Normally-distributed values to all five variables (to see how well the procedure performs when multicollinearity is only approximate and not exact).

+ +
process <- function(z, beta, sd, ...) {
+  x <- z %*% beta; colnames(x) <- ""A""
+  pca(cbind(x, z + rnorm(length(x), sd=sd)), ...)
+}
+
+ +

We're all set to go: it remains only to generate $B, \ldots, E$ and apply these procedures. I use the two scenarios described in the question: $A=B+C+D+E$ (plus some error in each) and $A=B+(C+D)/2+E$ (plus some error in each). First, however, note that PCA is almost always applied to centered data, so these simulated data are centered (but not otherwise rescaled) using sweep.

+ +
n.obs <- 80 # Number of cases
+n.vars <- 4 # Number of independent variables
+set.seed(17)
+z <- matrix(rnorm(n.obs*(n.vars)), ncol=n.vars)
+z.mean <- apply(z, 2, mean)
+z <- sweep(z, 2, z.mean)
+colnames(z) <- c(""B"",""C"",""D"",""E"") # Optional; modify to match `n.vars` in length
+
+ +

Here we go with two scenarios and three levels of error applied to each. The original variables $B, \ldots, E$ are retained throughout without change: only $A$ and the error terms vary.

+ +

+ +

The output associated with the upper left panel was

+ +
       A  B  C  D  E
+Comp.5 1 -1 -1 -1 -1
+
+ +

This says that the row of red dots--which is constantly at $0$, demonstrating a perfect multicollinearity--consists of the combination $0 \approx A -B-C-D-E$: exactly what was specified.

+ +

The output for the upper middle panel was

+ +
       A     B     C     D     E
+Comp.5 1 -0.95 -1.03 -0.98 -1.02
+
+ +

The coefficients are still close to what we expected, but they are not quite the same due to the error introduced. It thickened the four-dimensional hyperplane within the five-dimensional space implied by $(A,B,C,D,E)$ and that tilted the estimated direction just a little. With more error, the thickening becomes comparable to the original spread of the points, making the hyperplane almost impossible to estimate. Now (in the upper right panel) the coefficients are

+ +
       A     B     C     D     E
+Comp.5 1 -1.33 -0.77 -0.74 -1.07
+
+ +

They have changed quite a bit but still reflect the basic underlying relationship $A' = B' + C' + D' + E'$ where the primes denote the values with the (unknown) error removed.

+ +

The bottom row is interpreted the same way and its output similarly reflects the coefficients $1, 1/2, 1/2, 1$.

+ +

In practice, it is often not the case that one variable is singled out as an obvious combination of the others: all coefficients may be of comparable sizes and of varying signs. Moreover, when there is more than one dimension of relations, there is no unique way to specify them: further analysis (such as row reduction) is needed to identify a useful basis for those relations. That's how the world works: all you can say is that these particular combinations that are output by PCA correspond to almost no variation in the data. To cope with this, some people use the largest (""principal"") components directly as the independent variables in the regression or the subsequent analysis, whatever form it might take. If you do this, do not forget first to remove the dependent variable from the set of variables and redo the PCA!

+ +
+ +

Here is the code to reproduce this figure:

+ +
par(mfrow=c(2,3))
+beta <- c(1,1,1,1) # Also can be a matrix with `n.obs` rows: try it!
+process(z, beta, sd=0, main=""A=B+C+D+E; No error"")
+process(z, beta, sd=1/10, main=""A=B+C+D+E; Small error"")
+process(z, beta, sd=1/3, threshold=2/3, main=""A=B+C+D+E; Large error"")
+
+beta <- c(1,1/2,1/2,1)
+process(z, beta, sd=0, main=""A=B+(C+D)/2+E; No error"")
+process(z, beta, sd=1/10, main=""A=B+(C+D)/2+E; Small error"")
+process(z, beta, sd=1/3, threshold=2/3, main=""A=B+(C+D)/2+E; Large error"")
+
+ +

(I had to fiddle with the threshold in the large-error cases in order to display just a single component: that's the reason for supplying this value as a parameter to process.)

+ +
+ +

User ttnphns has kindly directed our attention to a closely related thread. One of its answers (by J.M.) suggests the approach described here.

+",2013-11-01 16:07:11.403 +58660,11490.0,1,,,,Transformation to normality for random variables with different locations,,CC BY-SA 3.0,"

I have a (potentially infinite) sequence of random variables $X_i$, with $i = 1, 2, \dots$, which have the same distribution (in terms of ""shape""), but different locations.

+ +

I have a sample of size $N$ from only one of these distributions and I would like to find a transformation to normality that works for the whole sequence of distributions. The locations are all unknown, as is the analytic form of the true distributions. All I have is a sample from one element of the sequence.

+ +

Notice that I need a transformation that uses the same parameters for every distribution. For example if, given my initial sample, I decide to use the Box-Cox($\lambda_1$, $\lambda_2$) transform, when I get a new sample I have to transform it using exactly the same transform (otherwise I would simply shift each sample to zero).

+ +

So far I haven't found anything that works: Box-Cox requires positivity of the sample (the shifting parameter doesn't help because the location varies from sample to sample). Similar problem for the Inverse Hyperbolic Sine transform.

+ +

Any suggestion is welcome!

+ +

EDIT:

+ +

This problem arises from an application where I have a data $y_1, \dots, y_n$, generated by a stochastic model $m(\theta_0)$, where $\theta_0$ is an unknown parameter.

+ +

I'm trying to estimate $\theta_0$ by Maximum Likelihood. At each step $k$ of the optimization I'm simulating a sample $x_1, \dots, x_n$ from the model $m(\theta_k)$, and I rely on the sample being approximately normally distributed to estimate the likelihood. Unfortunately the sample often has departures from normality, which is why I'm interested in transformations. I need to use exactly the same transformation across the parameter space in order to compute the likelihood correctly. For different $\theta$s the mean of the sample changes, but the shape of the distribution seems to be fairly constant.

+",2013-11-01 16:31:55.667 +58661,23259.0,1,,,,How to find the support of the posterior distribution to apply Metropolis-Hastings MCMC algorithm?,,CC BY-SA 3.0,"

I am trying to sample from a posterior distribution using a MCMC algorithm using the Metropolis-Hastings sampler.

+ +

How should I deal with the situations where I'm stuck in regions of the posterior with zero probability?

+ +

These regions are present because the posterior distribution is truncated, and also because, due to numerical limitations on the computer, the likelihood can become zero if you are very far from the mean. That is, say the likelihood is normally distributed; if you are 100 standard deviations away from the mean, you get what appears to the computer as zero probability.
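
+ +

For illustration (an aside, not part of the original description), the underflow is easy to see in R, which is one reason samplers usually work with the log of the posterior density:

+ +
dnorm(100)              # underflows to 0 in double precision
+dnorm(100, log = TRUE)  # about -5000.9, still representable on the log scale
+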

+ +

What I want to know is how to choose the initial value of the chain in order to be sure that it is contained in the support of the posterior.

+",2013-11-01 17:34:53.247 +58662,23171.0,1,58668.0,,,How to compute conditional expectations with respect to a sigma field?,,CC BY-SA 3.0,"

Example: Toss a coin twice. Letting $\mathbb P$ be a probability measure, suppose $\mathbb P(HH)=p^2,\mathbb P(HT)=\mathbb P(TH)=p(1-p), \mathbb P(TT)=(1-p)^2.$ I would like to answer the following questions:

+ +
    +
  1. Define $Y$ to be the number of heads in the example. Derive the $\sigma$-field generated by $Y$.

  2. Find the expectation of the random variable $Y$ from the previous +exercise, and also the conditional expectation of $Y$ given $\mathcal F_{1}$ s.t $\mathcal F_{1}=\{\varnothing,\{HH,HT\},\{TH,TT\},\Omega\}$. Check that in this case $E[E[Y|\mathcal F_{1}]]=E[Y]$.
+ +

My answers: the sample space is $\Omega=\{HH,HT,TH,TT\}$. I think the answer to 1 is +$Y=\{\{TT\},\{HT,TH\},\{HH\}\}$, the $\sigma$-field generated by $Y$ is $\mathcal F(Y)=\{\varnothing,\{TT\},\{HH,HT,TH\},\{HT,TH\},\{HH,TT\},\{HH\},\{HT,TH,TT\},\Omega\}.$

+ +

But for question 2 I could only get $E(Y)=2p$. How is the conditional expectation computed?

+",2013-11-01 18:40:11.737 +58663,22705.0,2,,22797.0,,,,CC BY-SA 3.0,"

If X & Y are indeed co-integrated, then regressing X on Y will lead to spurious regression. So, I'm guessing your residuals will have a good amount of information which is not captured in the model.

+ +

But I might not have understood some parts of your question well - when you say ""how does it affect my analysis"", it will help if you're a little clearer about the specific problem, because it seems as though you aren't doing time series analysis at all and are doing plain regression?

+ +

And, what do you mean by spread? Is it residual or something else?

+",2013-11-01 18:59:22.840 +58694,23278.0,1,,,,Linear regression with strongly non-normal response variable,,CC BY-SA 3.0,"

I have carried out a linear regression. The plot below shows the distribution of the response variable:

+ +

+ +

I believe the response variable is beta distributed, therefore virtually the exact opposite of normally distributed. However, when including all my predictor variables in the linear regression, the residuals turn out to be quite normally distributed, as shown in this plot:

+ +

+ +

Has my model satisfied the assumptions of linear regression? Might there be a better model to use?

+",2013-11-02 09:56:37.087 +58664,23262.0,1,,,,Question about prediction bands for non-linear regression computation?,,CC BY-SA 3.0,"

I am interested in computing prediction bands for a non-linear regression (log-logistic function with 3 parameters). I have read the Prism help page:

+
+

The calculation of the confidence and prediction bands are fairly standard. Read on for the details of how Prism computes prediction and confidence bands of nonlinear regression.

+

First, let's define G|x, which is the gradient of the parameters at a particular value of X and using all the best-fit values of the parameters. The result is a vector, with one element per parameter. For each parameter, it is defined as dY/dP, where Y is the Y value of the curve given the particular value of X and all the best-fit parameter values, and P is one of the parameters.)

+

G'|x is that gradient vector transposed, so it is a column rather than a row of values.

+

Cov is the covariance matrix (inversed Hessian from last iteration). It is a square matrix with the number of rows and columns equal to the number of parameters. Each item in the matrix is the covariance between two parameters.

+

Now compute c = G'|x * Cov * G|x. The result is a single number for any value of X.

+

The confidence and prediction bands are centered on the best fit curve, and extend above and below the curve an equal amount.

+

The confidence bands extend above and below the curve by: = sqrt(c)*sqrt(SS/DF)*CriticalT(Confidence%, DF)

+

The prediction bands extend a further distance above and below the curve, equal to: = sqrt(c+1)*sqrt(SS/DF)*CriticalT(Confidence%, DF)

+
+

I also read the topic How to compute prediction bands for non-linear regression?.

+

But I still do not understand something: how to compute the Hessian matrix. I do not think it is (I do not get values between -1 and 1), but is it the matrix of second-order partial derivatives of my log-logistic function?

+

In Prism, it refers to the "Hessian from last iteration" while in the topic, it is written "the second derivatives of the likelihood function at your estimates".

+

The likelihood function I use is ( -N / 2 ) * log( sum( ( dat - model ) .^2 ) ), where N is the number of observations, dat is a vector with the data being fitted, and model is a vector of model values. I do not really understand how it would be possible to take second-order derivatives of this function.

+

Could anyone give me a hand with this ?

+",2013-11-01 19:11:44.703 +58665,22544.0,1,,,,What part of an ARMA model requires a stationary time series - the AR or the MA?,,CC BY-SA 3.0,"

Could I use a non-stationary time series with simply an Autoregressive model?

+",2013-11-01 19:25:49.477 +58666,17573.0,2,,58070.0,,,,CC BY-SA 3.0,"

Sorry about the delay in answering. What's going on, I think, is seasonality.

+ +

Suppose there are three temperatures. First there is ""true"" temperature---whatever average high surface air temperature actually occurs on each day. Second, there is ground measured temperature. Third, there is satellite measured temperature. The second and third temperatures are the first temperature, except measured with error. Like this: +\begin{align} +\textrm{ground}_{d,y} &= \textrm{true}_{d,y} + \nu_{d,y}\\ +\textrm{satellite}_{d,y} &= \textrm{true}_{d,y} + \eta_{d,y}\\ +\end{align}

+ +

Above, $\mathrm{true}_{d,y}$ means the true temperature in year $y$ on day $d$, where day counts up from 1 on 01/01/year to 365 on 12/31/year --- I am too lazy to think about leap years here.

+ +

Now, if you regress ground on satellite, of course you are going to get a regression with some predictive power, because both measurements are being driven by the same true temperature. However, the predictive power will not be perfect. How good will the predictions be? Well, that depends on the ratio of signal (variance of movements in true temperature) to noise (variance of movements in $\nu$ and $\eta$. In fact, you can calculate the covariance and correlation between ground and satellite (assuming that the $\nu$ and $\eta$ are iid, independent of each other, and independent of true): +\begin{align} +Cov(\mathrm{ground},\mathrm{satellite}) &= V(\mathrm{true})\\ +Corr(\mathrm{ground},\mathrm{satellite}) &= \frac{V(\mathrm{true})} + {\sqrt{(V(\mathrm{true})+V(\nu))(V(\mathrm{true})+V(\eta))}} +\end{align}

+ +

Now, in the simple bivariate model, when you just regress ground on satellite, $R^2$ is equal to the correlation above squared, or: +\begin{align} +R^2 &= \frac{(V(\mathrm{true}))^2} + {(V(\mathrm{true})+V(\nu))(V(\mathrm{true})+V(\eta))} +\end{align}

+ +

So, why is $R^2$ so much higher in the Nov to Apr sample than in the Jan sample? That's easy. As long as you are looking at a weather station far from the equator, there are big month-to-month differences in true. So, the more of the year you include in your time window, the higher is $V(\mathrm{true})$, and the higher is $R^2$.

+ +

Why, then, is the mean of the sum of squared residuals higher in the Nov to Apr sample than in the Jan sample? The mean of the sum of squared residuals goes in probability to: +\begin{align} +\frac{1}{n}\sum e_i^2 &\rightarrow V(\mathrm{ground}) - \beta^2 V(\mathrm{satellite}) +\end{align}

+ +

The $\beta$ in the above is the probability limit of the regression coefficient. Notice that this quantity goes up as the variance of true goes up. As you expand your time window from Jan only to Nov through Apr, the variance of true goes up, and this drives the variance of both ground and satellite up while the explained part $\beta^2 V(\mathrm{satellite})$ grows more slowly. In turn, this drives the variance of the residuals up.

+ +

If you prefer a numerical example, here is some R code where you can see what is going on. All I have done is implement the reasoning above in a very simple monte carlo. As you can see if you run the code, I have pretty well replicated your $R^2$ and mean squared residual results using somewhat realistic temperature data:

+ +
# This program written in response to a Cross Validated question
+# http://stats.stackexchange.com/questions/73544/multiple-linear-regression-models-comparison-based-on-r-squared-and-residual-err#comment143511_73544
+
+# The program is a toy monte carlo.
+# It generates a ""true"" but unobservable-to-the-analyst high temperature series
+# in degrees Celcius for a location in the northern hemisphere of the earth 
+# pretty far from the equator.  It is very loosely based on Des Moines, IA.
+# The series simulates seasonality with a sinusoidal curve and generates 
+# temperatures for all days from 1980-2000, ignoring leap years.
+
+# Then it generates two series which measure the true temperature with error,
+# calling them satellite and ground.  Then it regresses ground on satellite 
+# by OLS over various slices of the data.  The slices are only January,
+# only Nov through Apr, and all year long.
+
+# day is days since 12/31/1979, ie it starts at 1 on 01/01/1980
+# I ignore leap years and assume 365 day years
+# day maxes out at 20*365=7300
+
+set.seed(12344321)
+day <- seq(1:7300)
+
+
+true.temp <- 32*sin(((day-1)/365)*2*pi + 3*pi/2) + 21 + 4*rnorm(7300)
+
+plot(day[1:365],true.temp[1:365])
+
+# Two fallible measures of true temp
+
+satellite <- true.temp + 4*rnorm(7300)
+ground <- true.temp + 4*rnorm(7300)
+
+plot(day[1:365],satellite[1:365])
+plot(day[1:365],ground[1:365])
+plot(satellite[1:365],ground[1:365])
+
+# Now predict ground with satellite over the whole year
+pred.reg <- lm(ground~satellite)
+summary(pred.reg)
+
+# Now predict ground with sattelite over January
+january <- ((day-1) %% 365) %/% 31 == 0
+plot(satellite[january],ground[january],xlab=""satellite"",ylab=""ground"",main=""January Only"")
+pred.reg <- lm(ground[january]~satellite[january])
+summary(pred.reg)
+
+# Now predict ground with sattelite over Nov through April
+novtoapr <- ((day-1)%%365)>304 | ((day-1)%%365)<92
+plot(satellite[novtoapr],ground[novtoapr],xlab=""satellite"",ylab=""ground"",main=""Nov to Apr"")
+pred.reg <- lm(ground[novtoapr]~satellite[novtoapr])
+summary(pred.reg)
+
+ +

Look at the plots of ground against satellite for Jan only and Nov to Apr. You can see the higher $R^2$ in the Nov to Apr:

+ +

+

+ +

Obviously, in your application there is a lot more going on since you have other right-hand-side variables in your analysis. Also, the measurement errors are probably not nice, normally distributed, iid things like in the monte carlo. Nevertheless, I think what is going on in your data is likely to be what is going on in this monte carlo. In conclusion, I don't think your results are anything to worry about. They are exactly what we would expect in this circumstance.

+",2013-11-01 19:35:04.697 +58667,9081.0,2,,57540.0,,,,CC BY-SA 3.0,"

If you need a reference for the answer in my comment above, here is one, from Andrew Gelman's blog:

+ +
+

Which reminds me of Lucien Le Cam’s reply when I asked him once whether he could think of any examples where the distinction between the strong law of large numbers (convergence with probability 1) and the weak law (convergence in probability) made any difference. Le Cam replied, No, he did not know of any examples. Le Cam was the theoretical statistician’s theoretical statistician, so there’s your answer.

+
+ +

One could maybe add that the real importance of these different modes of convergence lies only in the mathematics: they permit the use of different mathematical techniques in the development of the theory. And that might be important enough, but it matters for the development of theory, not for concrete practical applications.

+",2013-11-01 20:34:39.070 +58668,668.0,2,,58662.0,,,,CC BY-SA 3.0,"

The formal definition of conditional expectation is that $E[Y|\mathcal{F}_1]$ is any random variable measurable with respect to $\mathcal{F}_1$ having the property that

+ +

$$\int_F E[Y|\mathcal{F}_1](\omega)d\mathbb{P}(\omega) = \int_F Y(\omega) d\mathbb{P}(\omega)$$

+ +

for all $\mathcal{F}_1$-measurable sets $F$.

+ +

In the present case, this definition invites us to inspect all the measurable subsets $F$ with respect to $\mathcal{F}_1$, which you already computed in the first problem. The trick is to begin with the smallest, most basic $\mathcal{F}_1$-measurable sets (apart from the empty set), which are $\{HH, HT\}$ and $\{TH, TT\}$. Although we don't yet know $E[Y|\mathcal{F}_1]$, we can use the right hand side to compute its integrals. Because neither of these events can be decomposed (nontrivially) into smaller ones, the conditional expectation must have a constant value on each one. For example, writing +$$E[Y|\mathcal{F}_1](HH) =E[Y|\mathcal{F}_1](HT) = z,$$

+ +

the definition gives

+ +

$$\eqalign{ +zp &= zp^2 + zp(1-p) \\ +&=E[Y|\mathcal{F}_1](HH) \mathbb{P}(HH) +E[Y|\mathcal{F}_1](HT) \mathbb{P}(HT)\\ +&=\int_{\{HH, HT\}} E[Y|\mathcal{F}_1](\omega)d\mathbb{P}(\omega)\\ +&= \int_{\{HH, HT\}} Y(\omega) d\mathbb{P}(\omega)\\ +&= Y(HH)\mathbb{P}(HH) + Y(HT)\mathbb{P}(HT) \\ +&= 2p^2 + 1p(1-p)= p+p^2, +}$$

+ +

whence we deduce

+ +

$$z = \frac{p+p^2}{p} = 1 + p.$$

+ +

A similar calculation for $F = \{TH, TT\}$ (do it!) establishes that

+ +

$$E[Y|\mathcal{F}_1](TH) =E[Y|\mathcal{F}_1](TT) = p.$$

+ +

There is a simple intuition to support these abstract calculations: $\mathcal{F}_1$ records the information available after flipping the coin the first time. If it comes up heads, the possible events (which have only partially occurred!) are $HH$ and $HT$. We already have one head and there is a chance $p$ that the second flip will be a head. Thus, at this stage, our expectation of $Y$ equals $1$ (for what has happened) plus $1\times p$ (for what could happen yet), summing to $1+p$. If instead the first flip is tails, we have seen no heads yet but there is still a chance of $p$ of seeing a head on the second flip: the expectation of $Y$ is just $0 + 1\times p = p$ in that case.

+ +

As a check, we can compute

+ +

$$\eqalign{ +E[E[Y|\mathcal{F}_1]] &= \int_\Omega E[Y|\mathcal{F}_1](\omega)d\mathbb{P}(\omega) \\ +&= (1+p)\mathbb{P}(E[Y|\mathcal{F}_1]=1+p) + (p)\mathbb{P}(E[Y|\mathcal{F}_1]=p)\\ +& = (1+p)\mathbb{P}(\{HH, HT\}) + (p) \mathbb{P}(\{TH, TT\})\\ +&= (1+p)(p^2 + p(1-p)) + p((1-p)p + (1-p)^2)\\ +&= 2p, +}$$

+ +

exactly as before.
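
+ +

A quick simulation agrees with these values (a sketch; the choice $p = 0.3$ is arbitrary):

+ +
set.seed(1)
+p <- 0.3
+flip1 <- rbinom(100000, 1, p)
+flip2 <- rbinom(100000, 1, p)
+Y <- flip1 + flip2
+mean(Y[flip1 == 1])   # close to 1 + p
+mean(Y[flip1 == 0])   # close to p
+mean(Y)               # close to 2p
+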

+ +

It should be clear that this is just a laborious way of expressing the idea that there is a $p$ chance at the outset of heads--which has a conditional expectation of $1+p$-- and a $1-p$ chance of tails--which has a conditional expectation of $p$: everything reduces to the elementary calculation of conditional expectations, which needs no sigma fields or integrals. The point of this exercise is to build on that intuition to develop an understanding that will hold up when these sigma algebras get much, much more complicated.

+",2013-11-01 20:58:01.017 +58669,1602.0,1,,,,How to algorithmically determine the best order of fit?,,CC BY-SA 3.0,"

I am doing a least squares polynomial interpolation for 10,000 data sets that look mostly like one period of a sine curve, but whose values are not evenly spaced in the time domain, and can sometimes be quite noisy. A lot of them do just fine with a 3rd order fit (a + bx + cx^2 + dx^3) but some oscillate wildly (Runge's phenomenon) and need a 2nd order instead, and some are much better fitted with a 5th or even 8th order fit.

+ +

What methods exist for determining which fit is best with respect to avoiding both underfitting and overfitting?

+ +

One thing I think that I can do is to choose the largest order $m$ such that $m<2\sqrt{n}$ where $n$ is the number of points in my data set, and also check to make sure that there are some number of points in $2\sqrt{n}$ equally sized bins spanning the space. On the other hand, however, I would also like to penalize for having more parameters than necessary. Maybe I can also minimize $SSE(m)/(n-m-1)$. What better methods are out there for doing this?
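
+ +

For concreteness, here is a small R sketch of computing the $SSE(m)/(n-m-1)$ criterion above on simulated sine-like data (the noise level and names are invented for illustration):

+ +
set.seed(1)
+t <- sort(runif(40, 0, 2 * pi))
+y <- sin(t) + rnorm(40, sd = 0.3)
+crit <- sapply(1:8, function(m) {
+  fit <- lm(y ~ poly(t, m))
+  sum(resid(fit)^2) / (length(y) - m - 1)
+})
+which.min(crit)   # polynomial order selected by the penalized-SSE criterion
+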

+",2013-11-01 20:58:06.343 +58679,20320.0,1,331473.0,,,Differences between clustering and segmentation,,CC BY-SA 3.0,"

I have read about piecewise aggregate approximation (PAA) for mining time series data, and about sliding-window, top-down, and bottom-up approaches for time series segmentation, but these are applicable to single-dimensional time series.

+ +

What are the techniques for multi-dimensional time series segmentation? Can Gaussian mixture models or K-means clustering be used for segmentation? If so, then what is the difference between segmentation and clustering?

+ +
    +
  1. What is the difference between segmenting and clustering?

  2. How do I segment motion time series data so that the temporal information is retained?

  3. What algorithms are there for doing so, i.e., what are the techniques for multi-dimensional time series segmentation?
+ +

Please provide links or ideas. Thank you.

+",2013-11-02 00:32:34.453 +58670,7007.0,2,,58662.0,,,,CC BY-SA 3.0,"

Take the sample space $\Omega$ as the Cartesian product $\{H,T\}\times\{H,T\}$, with sigma-field $\mathscr{F}$ equal to the class of all subsets of $\Omega$. The sigma-field generated by $Y$ (denoted by $\sigma(Y)$) is the smallest sub-sigma-field of $\mathscr{F}$ in which $Y$ is measurable. Since $Y\in\{0,1,2\}$, the inverse images +$$ + Y^{-1}(\{0\}) = \{(T,T)\}\, , \quad Y^{-1}(\{1\}) = \{(H,T),(T,H)\}\, , \quad \quad Y^{-1}(\{2\}) = \{(H,H)\} \, , +$$ +show that +$$ + \sigma(Y)=\sigma\left\{\{(T,T)\}, \{(H,T),(T,H)\}, \{(H,H)\}\right\} + = \left\{\emptyset,\{(T,T)\}, \{(H,T),(T,H)\}, \{(H,H)\},\{(H,T),(T,H),(H,H)\},\{(T,T),(H,H)\},\{(T,T),(H,T),(T,H)\},\Omega\right\} \, . +$$ +Define +$$ + \mathscr{G}=\{\emptyset,\{(H,H),(H,T)\},\{(T,T),(T,H)\},\Omega\}\subset\mathscr{F} \, . +$$

+ +

The conditional expectation $Z=\mathrm{E}[Y\mid \mathscr{G}]$ is a $\mathscr{G}$-measurable random variable satisfying +$$ + \mathrm{E}[Y I_A]=\mathrm{E}[Z I_A] \, , \qquad\qquad (*) +$$ +for every $A\in\mathscr{G}$. The fact that $Z$ is $\mathscr{G}$-measurable entails that it is constant in the atoms of $\mathscr{G}$ (this is the crucial idea). Let +$$ + Z(H,H) = Z(H,T) = a, \quad Z(T,T) = Z(T,H) = b \, . +$$ +Taking $A=\{(H,H),(H,T)\}$, relation $(*)$ yields +$$ + 2 \cdot p^2 + 1 \cdot p(1-p) = a \cdot p^2 + a \cdot p(1-p) \, , +$$ +implying that $a=1+p$. Similarly, we find $b=p$.

+ +

Finally, +$$ + \mathrm{E}[Y]= 1 \cdot 2 p(1-p) + 2\cdot p^2 = 2p \, , +$$ +and +$$ + \mathrm{E}[Z] = (1+p) \cdot (p^2 + p(1-p)) + p \cdot ((1-p)^2 + p(1-p)) = 2p \, , +$$ +as ""expected"".
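
+ +

As a quick numerical sanity check, the values $a=1+p$ and $b=p$ can also be recovered by brute-force simulation in R ($p=0.3$ is an arbitrary illustrative value):

+ +
set.seed(42)
p <- 0.3                        # arbitrary illustrative value
n <- 1e6
first  <- rbinom(n, 1, p)       # 1 = heads on the first toss
second <- rbinom(n, 1, p)       # 1 = heads on the second toss
Y <- first + second             # total number of heads
mean(Y[first == 1])             # close to a = 1 + p = 1.3
mean(Y[first == 0])             # close to b = p = 0.3
mean(Y)                         # close to 2p = 0.6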

+ +

P.S. May I suggest a beautiful related exercise? Let $\Omega=[0,1]$, $\mathscr{F}$ be the Borel subsets of $\Omega$, and $P$ be Lebesgue measure. Let $\mathscr{G}$ be the sub-sigma-field of $\mathscr{F}$ generated by the partition +$$\{[0,1/2],(1/2,1]\}\, . +$$ +Let $X$ be the identity map ($X(\omega)=\omega$). Plot the graph of $\mathrm{E}[X\mid\mathscr{G}]$.

+",2013-11-01 21:16:30.703 +58671,23263.0,1,,,,Cox Regression Prediction of Event Count For Given Period,,CC BY-SA 3.0,"

I have a standard event history data set, which I used to fit a Cox proportional hazards model in Stata and R. The time variable is measured in days. There are no time-varying covariates on the RHS of the equation.

+ +

I have recovered the usual output (coefficients, hazard ratios, SEs, etc.) as well as the predicted baseline survival for each observation. What I need to do now is estimate the predicted number of events in the next 30 days. Any ideas on how to do this?

+ +

Thanks very much!

+",2013-11-01 21:16:59.367 +58672,3993.0,2,,58627.0,,,,CC BY-SA 3.0,"

It seems to me that Ellis could be referring to as many as three distinct ideas here. First he says something about creating ""simulated data generated by a model under the null hypothesis of no relation."" I would call this a form of parametric bootstrapping. Then he says that this would be ""probably based on resampling the times between each event (eg between each yawn) to create a new set of time stamps for hypothetical null model events."" To be clear, this latter procedure does not ""create simulated data""; rather, if I understand correctly, it resamples from our actually observed data. Depending on how the resampling takes place, it is either a permutation test or nonparametric bootstrapping.

+ +

I guess I should say a few more words about parametric bootstrapping, permutation tests, and nonparametric bootstrapping.

+ +

Usually parametric bootstrapping is done by simulating based on the actually estimated model, and not based on a hypothetical model that is just like the estimated model except the null hypothesis is assumed true, as Ellis seems to suggest at first. By ""simulate data"" I mean something like the following example: my model states that my data come from two groups, each with a normal distribution, with means $\mu_1$ and $\mu_2$, respectively, and standard deviation $\sigma$, so I will generate many sets of data that satisfy this and use the distribution of test statistics computed from each of these simulated datasets as my sampling distribution. Note that I am creating these data using something like rnorm() in R, not directly using my observed data. Now, one could certainly do this procedure and get a sort of sampling distribution under the null hypothesis of, say, no difference in group means--we would just assume $\mu_1=\mu_2$ in all the simulated datasets, contrary to what we actually observed--and in this way we get a bootstrapped p-value (rather than a bootstrapped confidence interval, which is what the former/traditional method affords you). Again, I would just call this a way of obtaining a p-value via parametric bootstrapping.

+ +

A permutation test, on the other hand, involves shuffling your observed data over and over in a way that would be consistent with the null hypothesis. So for example, if the null hypothesis implies that group assignment makes no difference in terms of the group means, you can randomly shuffle the group labels among all your observations many many times and see what mean differences you would get for all possible ways of shuffling in this way. And then you would see where within the distribution of test statistics computed from these shuffled datasets does your actual observed statistic lie. Note that there is a finite (but usually large) number of ways that you can shuffle your actually observed data.
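
+ +

A minimal R sketch of such a permutation test for a difference in group means, on made-up data (the variable names are mine):

+ +
set.seed(123)
g1 <- rnorm(20, mean = 0)          # made-up group 1
g2 <- rnorm(20, mean = 0.5)        # made-up group 2
obs_diff <- mean(g2) - mean(g1)

pooled <- c(g1, g2)
labels <- rep(c(1, 2), each = 20)
perm_diffs <- replicate(10000, {
  shuffled <- sample(labels)       # shuffle the group labels
  mean(pooled[shuffled == 2]) - mean(pooled[shuffled == 1])
})
mean(abs(perm_diffs) >= abs(obs_diff))   # two-sided permutation p-value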

+ +

Finally, nonparametric bootstrapping is very similar to the permutation test, but we resample the observed data with replacement to try to get closer to an infinite ""population"" of values that our data might have been drawn from. There are many, many more ways to resample from your data with replacement than there are to shuffle your data (although it is technically finite in practice as well). Again, similar to parametric bootstrapping, this is usually done not under the null hypothesis, but under the model implied by the observed data, yielding confidence intervals around the observed test statistics, not p-values. But one could certainly imagine doing this under the null hypothesis like Ellis suggests and obtaining p-values in this way. As an example of nonparametric bootstrapping here (in the traditional fashion, i.e., not under the null hypothesis) using the same difference-in-group-means example I used in the parametric bootstrapping paragraph, to do this we would resample with replacement the observations within each group many times but not mixing observations between groups (unlike in the permutation test), and build up the sampling distribution of group mean differences that we get this way.

+",2013-11-01 21:44:34.163 +58673,20320.0,1,,,,Moving average filter for outlier removal,,CC BY-SA 3.0,"

I am using a moving average filter to smooth data for outlier removal. When I change the number of points averaged over, I get different results.

+ +

My data are multi-dimensional feature vectors.

+ +

I applied the moving average to the entire matrix and then on individual variables.

+ +

They give different results.

+ +

So, how should I choose the number of points to average over, and should the filter be applied to the entire matrix at once or to each variable separately?
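
+ +

For concreteness, this is the kind of computation I mean (a sketch in R with a hypothetical feature matrix; the window width k is exactly the quantity I don't know how to choose):

+ +
# X: hypothetical n x d matrix of feature vectors (rows = time points)
set.seed(1)
X <- matrix(rnorm(200), nrow = 50, ncol = 4)

k <- 5   # window width; different k gives different smoothing
smoothed <- apply(X, 2, function(col) {
  as.numeric(stats::filter(col, rep(1 / k, k), sides = 2))   # centred moving average
})
head(smoothed)   # rows near the ends are NA where the window does not fit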

+",2013-11-01 21:44:37.660 +58674,503.0,2,,58673.0,,,,CC BY-SA 3.0,"

Neither. Both. All.

+ +

Sorry. But I think this is another attempt (albeit a clever one) to automate what can't really be automated. Of course different methods give different results; the only times they wouldn't is where the outlier is so obvious that you don't need a test.

+ +

My suggestion is to use a variety of methods to identify possible outliers, then examine those outliers on an individual basis.

+",2013-11-01 21:49:31.433 +58675,15717.0,1,,,,Price elasticity of demand and time lags,,CC BY-SA 3.0,"

This is probably a very basic question so apologies in advance, but we are stuck.

+ +

We currently change prices by varying amounts (both positively and negatively) every 3 months, and would like to calculate the price elasticity of demand.

+ +

Questions:

+ +

1) Should we eliminate a period of time immediately after the price change to allow time for people to adjust behavior?

+ +

2) Should we include the entire time between quarters, i.e. the three months before the price change (everything right after the previous price change up to the current price change) and the three months after the price change (everything after the current price change up to the next price change)?

+ +

Any help would be greatly appreciated!!!!

+",2013-11-01 22:35:28.023 +58676,20949.0,1,,,,"Tried to overfit a Bayes net, but mean prediction error is worse than learned network?",,CC BY-SA 3.0,"

I have variables A, B, C, D, and E. I am interested in building a classifier for A.

+ +

I learned a Bayes net structure from the data using greedy search and BIC as a score. Call this network 1. Using cross validation, I got the mean prediction error for node A in network 1.

+ +

I then thought to create a network structure where all the arcs are incoming into A (B->A, C->A, D->A, E->A). Call this network 2.

+ +

I also learned a third network that was constrained by having all the arcs in network 2, but could allow additional arcs between B, C, D, and E. Call this network 3.

+ +

It seemed to me that networks 2 and 3 would be better at predicting A because I was forcing all the information in the data to act as a direct input for predicting A -- i.e. overfitting. So I expected the mean prediction errors (MPE) for these networks to be less than for the first network. But instead they were higher.

+ +

Also, when I repeated the cross validation several times, the mean prediction error for network 1 was always the same. But the value varied for networks 2 and 3.

+ +

Why would networks 2 and 3 have a higher MPE, and why would MPE for those networks be variable while not variable for network 1?

+",2013-11-01 22:48:23.687 +58677,20949.0,2,,32388.0,,,,CC BY-SA 3.0,"

Actually, the structure you learned and the structure you proposed are ""Markov equivalent"". This basically means you can use Bayes' theorem to go from one to the other.

+ +

Proof

+ +
P(bid)P(won|bid)P(sold|won) = 
+P(won, bid) P(won, sold)/P(won) = 
+P(bid|won)P(won)P(won|sold)P(sold)/P(won) =
+P(bid|won)P(won|sold)P(sold) 
+
+ +

which is what you want.

+ +

To show this in R, try this

+ +
bn.2 <- empty.graph(nodes(bn.hc))
+arcs(bn.2) <- matrix(c(""bid"", ""won"", ""won"", ""sold""), ncol = 2, byrow = T)
+score(bn.hc, dat) == score(bn.2, dat)
+> TRUE
+
+ +

Also check out ?cpdag.

+ +

Basically, you can change the direction of any arc in the network and get a graph of the same equivalence class, so long as that arc is not in a v-structure.
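
+ +

Assuming the bnlearn package (as in the snippet above), the CPDAG makes the equivalence class visible directly:

+ +
library(bnlearn)
# The CPDAG (completed partially directed acyclic graph) represents the whole
# Markov equivalence class: arcs whose direction is not identifiable are undirected.
cpdag(bn.hc)
cpdag(bn.2)   # should display the same partially directed graph as cpdag(bn.hc)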

+",2013-11-01 23:13:01.773 +58678,22564.0,1,,,,Is randomization reliable with small samples?,,CC BY-SA 3.0,"

Jerome Cornfield has written:

+ +
+

One of the finest fruits of the Fisherian revolution was the idea of + randomization, and statisticians who agree on few other things have at + least agreed on this. But despite this agreement and despite the + widespread use of randomized allocation procedures in clinical and in + other forms of experimentation, its logical status, i.e., the exact + function it performs, is still obscure.

+
+ +

Cornfield, Jerome (1976). ""Recent Methodological Contributions to Clinical Trials"". American Journal of Epidemiology 104 (4): 408–421.

+ +

Throughout this site and in a variety of literature I consistently see confident claims about the powers of randomization. Strong terminology such as ""it eliminates the issue of confounding variables"" is common. See here, for example. However, many times experiments are run with small samples (3-10 samples per group) for practical/ethical reasons. This is very common in preclinical research using animals and cell cultures, and the researchers commonly report p-values in support of their conclusions.

+ +

This got me wondering, how good is randomization at balancing confounds. For this plot I modeled a situation comparing treatment and control groups with one confound that could take on two values with 50/50 chance (eg type1/type2, male/female). It shows the distribution of ""% Unbalanced"" (Difference in # of type1 between treatment and control samples divided by sample size) for studies of a variety of small sample sizes. The red lines and right side axes show the ecdf.

+ +

Probability of various degrees of balance under randomization for small sample sizes: +

+ +

A few things are clear from this plot (unless I messed up somewhere).

+ +

1) The probability of getting exactly balanced samples decreases as sample size is increased.

+ +

2) The probability of getting a very unbalanced sample decreases as sample size increases.

+ +

3) In the case of n=3 for both groups, there is a 3% chance of getting a completely unbalanced set of groups (all type1 in the control, all type2 in the treatment). N=3 is common for molecular biology experiments (eg measure mRNA with PCR, or proteins with western blot)

+ +

When I examined the n=3 case further, I observed strange behaviour of the p-values under these conditions. The left side shows the overall distribution of p-values calculated using t-tests under conditions of different means for the type2 subgroup. The mean for type1 was 0, and sd=1 for both groups. The right panels show the corresponding false positive rates for nominal ""significance cutoffs"" from .05 to .0001.

+ +

Distribution of p-values for n=3 with two subgroups and different means of the second subgroup when compared via t test (10000 monte carlo runs): +

+ +

Here are the results for n=4 for both groups: +

+ +

For n=5 for both groups: +

+ +

For n=10 for both groups: +

+ +

As can be seen from the charts above there appears to be an interaction between sample size and difference between subgroups that results in a variety of p-value distributions under the null hypothesis that are not uniform.

+ +

So can we conclude that p-values are not reliable for properly randomized and controlled experiments with small sample size?

+ +

R code for first plot

+ +
require(gtools)
+
+#pdf(""sim.pdf"")
+par(mfrow=c(4,2))
+for(n in c(3,4,5,6,7,8,9,10)){
+  #n<-3
+  p<-permutations(2, n, repeats.allowed=T)
+
+  #a<-p[-which(duplicated(rowSums(p))==T),]
+  #b<-p[-which(duplicated(rowSums(p))==T),]
+
+  a<-p
+  b<-p
+
+  cnts=matrix(nrow=nrow(a))
+  for(i in 1:nrow(a)){
+    cnts[i]<-length(which(a[i,]==1))
+  }
+
+
+  d=matrix(nrow=nrow(cnts)^2)
+  c<-1
+  for(j in 1:nrow(cnts)){
+    for(i in 1:nrow(cnts)){
+      d[c]<-cnts[j]-cnts[i]
+      c<-c+1
+    }
+  }
+  d<-100*abs(d)/n
+
+  perc<-round(100*length(which(d<=50))/length(d),2)
+
+  hist(d, freq=F, col=""Grey"", breaks=seq(0,100,by=1), xlab=""% Unbalanced"",
+       ylim=c(0,.4), main=c(paste(""n="",n))
+  )
+  axis(side=4, at=seq(0,.4,by=.4*.25),labels=seq(0,1,,by=.25), pos=101)
+  segments(0,seq(0,.4,by=.1),100,seq(0,.4,by=.1))
+  lines(seq(1,100,by=1),.4*cumsum(hist(d, plot=F, breaks=seq(0,100,by=1))$density),
+        col=""Red"", lwd=2)
+
+}
+
+ +

R code for plots 2-5

+ +
for(samp.size in c(6,8,10,20)){
+  dev.new()
+  par(mfrow=c(4,2))
+  for(mean2 in c(2,3,10,100)){
+    p.out=matrix(nrow=10000)
+
+    for(i in 1:10000){
+
+      d=NULL
+      #samp.size<-20
+      for(n in 1:samp.size){
+        s<-rbinom(1,1,.5)
+        if(s==1){
+          d<-rbind(d,rnorm(1,0,1))
+        }else{
+          d<-rbind(d,rnorm(1,mean2,1))
+        }
+      }
+
+      p<-t.test(d[1:(samp.size/2)],d[(1+ samp.size/2):samp.size], var.equal=T)$p.value
+
+      p.out[i]<-p
+    }
+
+
+    hist(p.out, main=c(paste(""Sample Size="",samp.size/2),
+                       paste( ""% <0.05 ="", round(100*length(which(p.out<0.05))/length(p.out),2)),
+                       paste(""Mean2="",mean2)
+    ), breaks=seq(0,1,by=.05), col=""Grey"", freq=F
+    )
+
+    out=NULL
+    alpha<-.05
+    while(alpha >.0001){
+
+      out<-rbind(out,cbind(alpha,length(which(p.out<alpha))/length(p.out)))
+      alpha<-alpha-.0001
+    }
+
+    par(mar=c(5.1,4.1,1.1,2.1))
+    plot(out, ylim=c(0,max(.05,out[,2])),
+         xlab=""Nominal alpha"", ylab=""False Positive Rate""
+    )
+    par(mar=c(5.1,4.1,4.1,2.1))
+  }
+
+}
+#dev.off()
+
+",2013-11-01 23:24:19.330 +58690,23274.0,1,,,,Interaction of variables in regression,,CC BY-SA 3.0,"

I have two variables that predict fraud behavior (dependent variable). The independent variables are perception of fraud being wrong (1-5) and probability of being caught (1-5). The dependent variable is frequency of committing fraud in the last 5 years (never, once, 2-3 times, 4 times and more). +Two questions:

+ +
  1. What kind of regression should I use? Ordinal?
  2. Theory predicts that the interaction of these two variables predicts fraud: for those who perceive it as wrong, the probability of being caught has a different effect than for those who do not. How should I enter this into the model?
+",2013-11-02 08:55:19.447 +58680,1145.0,2,,58678.0,,,,CC BY-SA 3.0,"

You are correct to point out the limitations of randomisation in dealing with unknown confounding variables for very small samples. However, the problem is not that the P-values are not reliable, but that their meaning varies with sample size and with the relationship between the assumptions of the method and the actual properties of the populations.

+ +

My take on your results is that the P-values performed quite well until the difference in the subgroup means was so large that any sensible experimenter would know that there was an issue prior to doing the experiment.

+ +

The idea that an experiment can be done and analysed without reference to a proper understanding of the nature of the data is mistaken. Before analysing a small dataset you must know enough about the data to be able to confidently defend the assumptions implicit in the analysis. Such knowledge commonly comes from prior studies using the same or similar system, studies that can be formal published works or informal 'preliminary' experiments.

+",2013-11-02 00:34:35.583 +58681,3993.0,2,,58296.0,,,,CC BY-SA 3.0,"

Here is another geometric view of suppression, but rather than being in the observation space as @ttnphns's example is, this one is in the variable space, the space where everyday scatterplots live.

+ +

Consider a regression $\hat{y}_i=x_i+z_i$, that is, the intercept is 0 and both predictors have a partial slope of 1. Now, the predictors $x$ and $z$ may themselves be correlated. We will consider two cases: first the case where $x$ and $z$ are positively correlated, which I will call the ""confounding"" case (characterized by the secondary regression $\hat{x}_i=\frac{1}{2}z_i$), and second the case where $x$ and $z$ are negatively correlated, which I will call the ""suppression"" case (with secondary regression $\hat{x}_i=-\frac{1}{2}z_i$).

+ +

We can plot our regression equation as a plane in the variable space that looks like this:

+ +

+ +

Confounding case

+ +

Let's consider the slope for the $x$ predictor in the confounding case. To say that the other predictor $z$ is serving as a confounding variable is to say that when we look at a simple regression of $y$ on $x$, the effect of $x$ here is stronger than is the effect of x in a multiple regression of $y$ on $x$ and $z$, where we partial out the effect of $z$. The effect of $x$ that we observe in the simple regression is, in some sense (not necessarily causal), partially due to the effect of $z$, which is positively associated with both $y$ and $x$, but not included in the regression. (For the purposes of this answer I will use ""the effect of $x$"" to refer to the slope of $x$.)

+ +

We will call the slope of $x$ in the simple linear regression the ""simple slope"" of $x$ and the slope of $x$ in the multiple regression the ""partial slope"" of $x$. Here is what the simple and partial slopes of $x$ look like as vectors on the regression plane:

+ +

+ +

The partial slope of x is perhaps easier to understand. It is shown in red above. It is the slope of a vector that moves along the plane in such a way that $x$ is increasing, but $z$ is held constant. This is what it means to ""control for"" $z$.

+ +

The simple slope of $x$ is slightly more complicated because it implicitly also includes part of the effect of the $z$ predictor. It is shown in blue above. The simple slope of $x$ is the slope of a vector that moves along the plane in such a way that $x$ is increasing, and $z$ also is increasing (or decreasing) to whatever extent $x$ and $z$ are associated in the dataset. In the confounding case, we set things up so that the relationship between $x$ and $z$ was such that when we move up one unit on $x$, we also move up half a unit on $z$ (this comes from the secondary regression $\hat{x}_i=\frac{1}{2}z_i$). And since one-unit changes in both $x$ and $z$ are separately associated with one-unit changes in $y$, this means that the simple slope of $x$ in this case will be $\Delta x + \Delta z = 1 + \frac{1}{2} = 1.5$.

+ +

So when we control for $z$ in the multiple regression, the effect of $x$ appears to be smaller than it was in the simple regression. We can see this visually above in the fact that the red vector (representing the partial slope) is less steep than the blue vector (representing the simple slope). The blue vector is really the result of adding two vectors, the red vector and another vector (not shown) representing the half the partial slope of $z$.

+ +

Okay, now we turn to the slope for the $x$ predictor in the suppression case. If you followed all of the above, this is a really easy extension.

+ +

Suppression case

+ +

To say that the other predictor $z$ is serving as a suppressor variable is to say that when we look at a simple regression of $y$ on $x$, the effect of $x$ here is weaker than is the effect of $x$ in a multiple regression of $y$ on $x$ and $z$, where we partial out the effect of $z$. (Note that in extreme cases, the effect of $x$ in the multiple regression might even flip directions! But I am not considering that extreme case here.) The intuition behind the terminology is that it appears that in the simple regression case, the effect of $x$ was being ""suppressed"" by the omitted $z$ variable. And when we include $z$ in the regression, the effect of $x$ emerges clearly for us to see, where we couldn't see it as clearly before. Here is what the simple and partial slopes of $x$ look like as vectors on the regression plane in the suppression case:

+ +

+ +

So when we control for $z$ in the multiple regression, the effect of $x$ appears to increase relative to what it was in the simple regression. We can see this visually above in the fact that the red vector (representing the partial slope) is steeper than the blue vector (representing the simple slope). In this case the secondary regression was $\hat{x}_i=-\frac{1}{2}z_i$, so a one-unit increase in $x$ is associated with a half-unit decrease in $z$, which in turn leads to a half-unit decrease in $y$. So ultimately the simple slope of $x$ in this case will be $\Delta x + \Delta z = 1 + -\frac{1}{2} = 0.5$. As before, the blue vector is really the result of adding two vectors, the red vector and another vector (not shown) representing half of the reverse of the partial slope of $z$.

+ +

Illustrative datasets

+ +

In case you want to play around with these examples, here is some R code for generating data conforming to the example values and running the various regressions.

+ +
library(MASS) # for mvrnorm()
+set.seed(7310383)
+
+# confounding case --------------------------------------------------------
+
+mat <- rbind(c(5,1.5,1.5),
+             c(1.5,1,.5),
+             c(1.5,.5,1))
+dat <- data.frame(mvrnorm(n=50, mu=numeric(3), empirical=T, Sigma=mat))
+names(dat) <- c(""y"",""x"",""z"")
+
+cor(dat)
+#           y         x         z
+# y 1.0000000 0.6708204 0.6708204
+# x 0.6708204 1.0000000 0.5000000
+# z 0.6708204 0.5000000 1.0000000
+
+lm(y ~ x, data=dat)
+# 
+# Call:
+#   lm(formula = y ~ x, data = dat)
+# 
+# Coefficients:
+#   (Intercept)            x  
+#     -1.57e-17     1.50e+00  
+
+lm(y ~ x + z, data=dat)
+# 
+# Call:
+#   lm(formula = y ~ x + z, data = dat)
+# 
+# Coefficients:
+#   (Intercept)            x            z  
+#      3.14e-17     1.00e+00     1.00e+00  
+# @ttnphns comment: for x, zero-order r = .671 > part r = .387
+#                   for z, zero-order r = .671 > part r = .387
+
+lm(x ~ z, data=dat)
+# 
+# Call:
+#   lm(formula = x ~ z, data = dat)
+# 
+# Coefficients:
+#   (Intercept)            z  
+#     6.973e-33    5.000e-01 
+
+# suppression case --------------------------------------------------------
+
+mat <- rbind(c(2,.5,.5),
+             c(.5,1,-.5),
+             c(.5,-.5,1))
+dat <- data.frame(mvrnorm(n=50, mu=numeric(3), empirical=T, Sigma=mat))
+names(dat) <- c(""y"",""x"",""z"")
+
+cor(dat)
+#           y          x          z
+# y 1.0000000  0.3535534  0.3535534
+# x 0.3535534  1.0000000 -0.5000000
+# z 0.3535534 -0.5000000  1.0000000
+
+lm(y ~ x, data=dat)
+# 
+# Call:
+#   lm(formula = y ~ x, data = dat)
+# 
+# Coefficients:
+#   (Intercept)            x  
+#    -4.318e-17    5.000e-01  
+
+lm(y ~ x + z, data=dat)
+# 
+# Call:
+#   lm(formula = y ~ x + z, data = dat)
+# 
+# Coefficients:
+#   (Intercept)            x            z  
+#    -3.925e-17    1.000e+00    1.000e+00  
+# @ttnphns comment: for x, zero-order r = .354 < part r = .612
+#                   for z, zero-order r = .354 < part r = .612
+
+lm(x ~ z, data=dat)
+# 
+# Call:
+#   lm(formula = x ~ z, data = dat)
+# 
+# Coefficients:
+#   (Intercept)            z  
+#      1.57e-17    -5.00e-01  
+
+",2013-11-02 01:00:20.150 +58682,21840.0,1,,,,Find the Distribution,,CC BY-SA 3.0,"

Given $Y_1, Y_2, \ldots, Y_n$ are iid from a distribution with pmf,
+$f(y) = a^{2}$ for $y=0$,

+ +

$f(y) = 2a(1-a)$ for $y=1$ ,

+ +

$f(y) = (1-a)^{2}$ for $y=2$, where $0<a<1$.

+ +

For large n, calculate the approximate distribution of

+ +

a) $\sqrt{\bar{Y}}$ - solution to part (a) posted as an answer (awaiting confirmation)

+ +

b) $\sqrt{n}(\bar{Y}-\mu)+\bar{Y}^2$, where $\mu=E(Y_1)$

+ +

Could you please verify my solution for part (b) :

+ +

By the CLT, +$\sqrt{n}(\bar{Y}-\mu) \rightarrow N(0,\sigma^2)$ (convergence in probability)

+ +

For $\bar Y^2$, applying delta method, +$\bar Y^2 \rightarrow N(\mu^2,\frac{4\mu^2\sigma^2}{n^2})$ (converges in distribution)

+ +

{EDIT} - Can I say : +$\bar Y^2 \rightarrow \mu^2$ in probability

+ +

where $\sigma^2 = Var Y$ and $Var \bar Y^2 = \sigma^2/n$

+ +

Can I apply Slutsky's theorem, as one sequence converges in probability and the other in distribution:

+ +

By Slutsky's theorem,

+ +

$\sqrt{n}(\bar{Y}-\mu) + \bar{Y}^2 \rightarrow [\mu^2 + N(\mu^2,\frac{4\mu^2\sigma^2}{n^2})]$

+ +

Thanks!

+",2013-11-02 01:15:04.970 +58683,20473.0,2,,58675.0,,,,CC BY-SA 3.0,"

Although you talk about ""calculation"", in order to justify the presence of your question in this forum I will treat your problem as one of ""statistical estimation"".

+ +

Moreover, I understand that this is not a supply-demand interaction framework, since it seems that you change prices and then ""wait to see what happens"", fulfilling whatever demand appears. Given these preliminaries:

+ +

$1$) The fact that people may exhibit some degree of inertia is an integral part of their behavior, so trying to ""eliminate"" it is not useful, since what you would then be estimating would perhaps be the ""true"" price elasticity of demand, but one which will never materialize, since inertia will always be present and will affect the total demand response. Moreover, how would you determine the length of time to ignore?

+ +

What you need is to obtain a separate estimate of both effects. This calls for a specification like the following:

+ +

$$\frac {\Delta q_t}{q_{t-1}} = a\frac {\Delta q_{t-1}}{q_{t-2}} + b\frac {\Delta p_{t}}{p_{t-1}} + u_t $$

+ +

where $a$ captures the degree of inertia, expected positive, and $b$ is expected negative, while $u_t$ is assumed white noise. The inclusion of the error term, and the statistical approach as a whole, allows for other ""unpredictable"", ""uncontrollable"" factors that may affect demand in each quarter.
+The estimate $\hat b$ will be the estimated average price elasticity of demand, ""cleansed"" of any inertia effects and other ""random"" factors.
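
+ +

A minimal sketch of how this specification could be estimated in R; the data frame df with columns q (quantity) and p (price) is hypothetical, not something supplied in the question:

+ +
# df: hypothetical quarterly data with columns q (quantity sold) and p (price)
gq <- diff(df$q) / head(df$q, -1)   # quantity growth rate, Delta q_t / q_{t-1}
gp <- diff(df$p) / head(df$p, -1)   # price growth rate,    Delta p_t / p_{t-1}

# Align current quantity growth with its own lag and current price growth
dat <- data.frame(gq = gq[-1], gq_lag = head(gq, -1), gp = gp[-1])
fit <- lm(gq ~ 0 + gq_lag + gp, data = dat)   # no intercept, as in the equation
summary(fit)   # the coefficient on gp is the estimated price elasticity b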

+ +

$2$) Yes you should use as your time period the whole quarter. Hopefully you have data for many quarters.

+",2013-11-02 02:11:40.757 +58684,16464.0,2,,14729.0,,,,CC BY-SA 3.0,"

Not that the answer @Whuber gave really needs to be expanded on but I thought I'd provide a brief description of the math.

+ +

If $\mathbf{X'Xv}=\mathbf{0}$ for $\mathbf{v}\neq\mathbf{0}$, then $\mathbf{v}$ is an eigenvector of $\mathbf{X'X}$ associated with the eigenvalue $\lambda=0$. The eigenvectors of $\mathbf{X'X}$ are the right singular vectors of $\mathbf{X}$, and its eigenvalues are the squared singular values of $\mathbf{X}$, so the eigenvectors of $\mathbf{X'X}$ associated with eigenvalues near $\lambda=0$ represent the coefficients of approximate linear relationships among the regressors. Principal component analysis outputs the eigenvectors and eigenvalues of $\mathbf{X'X}$, so you can use the eigenvectors $\mathbf{v}$ associated with small $\lambda$ to determine whether linear relationships exist among some of your regressors.

+ +

One method of determining if an eigenvalue is appropriately small to constitute collinearity is to use the Condition Indices: +$$ +\mathbf{\kappa_j}=\frac{\lambda_{max}}{\lambda_j} +$$ +which measures the size of the smallest eigenvalues relative to the largest. A general rule of thumb is that modest multicollinearity is associated with a condition index between 100 and 1,000 while severe multicollinearity is associated with a condition index above 1,000 (Montgomery, 2009).

+ +

It's important to use an appropriate method for determining if an eigenvalue is small because it's not the absolute size of the eigenvalues, it's the relative size of the condition index that's important, as can be seen in an example. Consider the matrix +$$ +\mathbf{X'X}=\left[\begin{array}{rrr} +0.001 & 0 & 0 \\ +0 & 0.001 & 0 \\ +0 & 0 & 0.001 \\ +\end{array} +\right]. +$$ +The eigenvalues for this matrix are $\lambda_1=\lambda_2=\lambda_3=0.001$. Although these eigenvalues appear small, the condition index is +$$ +\mathbf{\kappa}=\frac{\lambda_{max}}{\lambda_{min}}=1 +$$ +indicating absence of multicollinearity and, in fact, the columns of this matrix are linearly independent.
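
+ +

A short R sketch of this computation, using a hypothetical design matrix with one nearly collinear column:

+ +
set.seed(1)
x1 <- rnorm(100)
x2 <- rnorm(100)
x3 <- x1 + x2 + rnorm(100, sd = 1e-3)   # nearly collinear column
X  <- cbind(x1, x2, x3)

e <- eigen(crossprod(X))            # eigen-decomposition of X'X
kappa_j <- max(e$values) / e$values
round(kappa_j, 1)                   # large condition indices flag near-collinearity
e$vectors[, which.max(kappa_j)]     # eigenvector showing the near-linear relation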

+ +

Citations

+ +

Montgomery, D. (2012). Introduction to Linear Regression Analysis, 5th Edition. John Wiley & Sons Inc.

+",2013-11-02 05:19:49.820 +58685,23270.0,2,,58645.0,,,,CC BY-SA 3.0,"

I would propose Otsu's (1979) or Ng's (2006) threshold detection algorithms. They were initially developed to detect manufacturing defects by testing for significant thresholds in greyscale image pixel intensities; however, they are decent at detecting thresholds in any histogram. They do so by identifying the threshold(s) that maximize the across-class variance or minimize the within-class variance of the data. Both papers are attached here.

+ +

NG's approach is a simple extension that adds what the author describes as a valley finding modification. It simply multiplies the across class variance of any possible threshold by 1 minus the probability of an observation at the threshold.

+ +

The R function below will return the threshold calculated by each method and will produce a histogram like the one displayed showing the threshold if passed a single column data frame.

+ +
###
+### NG and OTSU Thresholding of image intensity pixels.
+###
+
+threshold <- function(observations, plot=TRUE, otsu=TRUE){
+  # Ensure that observations is a data frame.
+  observations        <- as.data.frame(observations)
+  names(observations) <- c(""obs"")
+  attach(observations)
+
+  # Convert Observation List to Integer then back to Numeric (for histogram)   
+  print(""Starting Thresholding Algorithm..."")
+
+  observations$obs <- as.integer(observations$obs)                            
+  observations$obs <- as.numeric(observations$obs)                            
+  attach(observations)
+
+  # Produce some image summary variables.
+  # Distinct grey scale levels in image.
+  levels <- max(obs) - min(obs)
+  print(paste(""Distribution has"", levels,"" levels...""))
+
+  # Min and Max grey scale values in image.
+  minThreshold <- min(obs)
+  maxThreshold <- max(obs)-1
+
+  # Number of pixels/voxels in image
+  print(""Extracting frequencies for each level..."")
+  obsCount                  <- nrow(obs)
+  frequencies               <- hist(obs, breaks=levels, plot=FALSE)
+  densityVector             <- as.data.frame(frequencies$density)             #$
+  intensity                 <- (minThreshold:maxThreshold)
+  intensityFrequency        <- cbind(intensity, densityVector, 1-densityVector)
+  names(intensityFrequency) <- c(""intensity"", ""probability"", ""weight"")
+  outputArray               <- NULL
+  outputArray               <- as.data.frame(outputArray)
+
+  # For every possible threshold value... 
+  # Calculate two means, two variances.  
+  # IntraClass Variance. And Spit out to an array with Intensity, 
+  # and Probability of that intensity
+  print(paste(""Testing all possible threshold values...""))
+  for(i in minThreshold:maxThreshold){
+    lowerClassArray <- intensityFrequency[ which(intensity <= i),]
+    upperClassArray <- intensityFrequency[ which(intensity > i),]
+    lowerClassProb  <- sum(lowerClassArray$probability)                       #$
+    upperClassProb  <- sum(upperClassArray$probability)                       #$
+    lowerClassArray <- as.data.frame(lowerClassArray)
+    upperClassArray <- as.data.frame(upperClassArray)
+    lowerClassArray$product <- lowerClassArray$intensity *                    
+                               lowerClassArray$probability / lowerClassProb   #$
+    upperClassArray$product <- upperClassArray$intensity *                    
+                               upperClassArray$probability / upperClassProb   #$
+    lowerMu             <- sum(lowerClassArray$product)                       #$
+    upperMu             <- sum(upperClassArray$product)                       #$
+    lowerWithinClassVar <- lowerClassProb*(lowerMu^2)
+    upperWithinClassVar <- upperClassProb*(upperMu^2)
+    betweenClassVar     <- lowerWithinClassVar + upperWithinClassVar
+    outputBuffer        <- list(i, lowerClassProb, upperClassProb, lowerMu, 
+                                upperMu, lowerWithinClassVar, upperWithinClassVar)
+    print(paste(""Testing Threshold:"", i))
+
+    outputArray <- rbind(outputArray, outputBuffer)
+  }
+  names(outputArray)         <- c(""intensity"", ""lowerClassProb"", ""upperClassProb"",
+                                  ""lowerMu"", ""upperMu"", ""lowerWithinClassVar"",
+                                  ""upperWithinClassVar"")
+  outputArray$withinClassVar <- outputArray$lowerWithinClassVar +             
+                                outputArray$upperWithinClassVar               #$
+  outputArray <- merge(outputArray, intensityFrequency, by=c(""intensity""))
+  print(paste(""Calculating inter and intra-class variances...""))
+  outputArray$ngParam <- outputArray$weight*outputArray$withinClassVar        #$
+  attach(outputArray)
+  sortedOutputNg      <- outputArray[ order(-ngParam),]
+  sortedOutputOtsu    <- outputArray[ order(-withinClassVar),]
+  paramNG             <- sortedOutputNg[1,]
+  paramOTSU           <- sortedOutputOtsu[1,]
+  paramNG             <- as.integer(paramNG[1])
+  paramOTSU           <- as.integer(paramOTSU[1])
+  print(paste(""Ng threshold:"", paramNG))
+
+  if(otsu==TRUE){
+    print(paste(""Otsu threshold:"", paramOTSU))  
+  }
+
+  if(plot==TRUE){
+    #Histogram of pixel intensities.
+    hist(observations$obs, xlab=""Score"", breaks = levels, 
+         main=""Histogram of disease severity score frequencies"")
+
+    #Throw on a vertical line to show what a threshold looks like...
+    abline(v=paramNG, col='red', lty=4)
+    print(""Ng threshold is represented by red line, Otsu by blue."")
+    if(otsu==TRUE){
+      abline(v=paramOTSU, col='blue', lty=3)
+    }
+  }
+  resultArray <- c(paramNG, paramOTSU)
+  detach(outputArray)
+  return(resultArray)
+}
+
+ +

+",2013-11-02 05:28:21.003 +58686,22867.0,1,,,,Do Bayesians interpret the likelihood distribution as subjective as well?,,CC BY-SA 3.0,"

One of the main differences between Bayesians and frequentists is that Bayesians have a subjective interpretation of probability.

+ +

However, do Bayesians actually interpret subjectively the probabilities attached to an outcome GIVEN a set of parameters (i.e. for the likelihood), or is it just that they attach a subjective probability to the prior, and also to the posterior as a consequence (while $p(x | \theta)$ is thought of in much the same way frequentists think about it)?

+",2013-11-02 07:44:50.887 +58687,23272.0,1,,,,Estimation of Missing Observations,,CC BY-SA 3.0,"

I have monthly as well as daily data on the number of patients, and I want to fit a time series model. How can I estimate the missing counts? Could anyone please guide me?

+",2013-11-02 07:49:31.200 +58688,21728.0,1,,,,Probability Generating Function of Poisson Distribution,,CC BY-SA 3.0,"

I was just wondering if someone could help me understand this derivation of the probability generating function for a Poisson distribution (I understand it until the last step):

+ +

$$\pi(s)=\sum^{\infty}_{i=0}e^{-\lambda}\frac{\lambda^i}{i!}s^i$$ +$$\pi(s)=e^{-\lambda}\sum^{\infty}_{i=0}\frac{e^{\lambda s}}{e^{\lambda s}}\frac{(\lambda s)^i}{i!}$$ +$$= e^{-\lambda}e^{\lambda s} $$

+ +

This is a re-production from some lecture notes, but I'm not sure how it jumps from the 2nd last step to the last step?

+ +

If someone can show me the intermediate steps I would be very grateful!!
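
+ +

(If it helps others reading this, my best guess is that the jump is just the exponential series $e^{z}=\sum_{i=0}^{\infty}z^{i}/i!$ evaluated at $z=\lambda s$, so that) +$$\pi(s)=e^{-\lambda}\sum^{\infty}_{i=0}\frac{(\lambda s)^i}{i!}=e^{-\lambda}e^{\lambda s}=e^{\lambda(s-1)}.$$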

+",2013-11-02 08:06:59.833 +58689,23242.0,1,58691.0,,,Show that $T=\min\{n:X_{n}\in B\}$ is an $\mathcal F_{n}$-stopping time,,CC BY-SA 3.0,"

Let $X_{n}$ be an $\mathcal F_{n}$-martingale and let $B\in \mathcal B$.
+Show that $T=\min\{n:X_{n}\in B\}$ is an $\mathcal F_{n}$-stopping time.
+$\mathcal B$ is Borel $\sigma$-algebra and filtration is $\mathcal F=\sigma(X_{1},\dots,X_{n})$. +Thanks for help.

+",2013-11-02 08:27:20.307 +58700,15827.0,2,,58697.0,,,,CC BY-SA 3.0,"

Without context, all that can be said is to collect some common usages:

+ +
  1. An indicator in the wide sense is anything that indicates (i.e. is considered revealing or diagnostic), which could be any counted or measured variable. Consider statements such as that unemployment rate, GDP growth and government debt are good indicators of the state of an economy. In contrast, an indicator in the precise sense indicates the state of a binary or dichotomous variable (e.g. survived vs not, alive vs dead, present vs absent, male vs female) by 1 or 0. Often, which state is coded 1 and which is 0 is a matter of convention or convenience, as in the case of male or female. Indicator variables are perhaps more often called dummy variables in several fields of statistical science, although that term on occasion has been misread as offensive.
  2. An index is sometimes a scaled composite variable: for example, a price index or wages index is based on a weighted sum of prices or wages, often scaled so it is 100 in some base year or at some base time. An index is also sometimes any kind of summary measure designed to capture some property (segregation, diversity, whatever) in a single number. Ideally, it is recognised whenever one index is used that others are also possible.
  3. A variable varies, or at least in principle might vary. It is difficult to say more if that is unclear, except by contrast: anything defined to be constant cannot be a variable. Statistics makes enormous play, however, with parameters which are constant in a given situation but variable from situation to situation. So, mean temperature and the slope of a regression line are parameters.
  4. A measure may or should be something with a strict definition in measure theory (http://en.wikipedia.org/wiki/Measure_theory), but either you know that already or it is irrelevant to you. At elementary or introductory or lay level, it might just be yet another word for anything you (count or) measure (e.g. one measure of diversity in ecology is just the number of species present at a site).
+ +

I don't think any of these terms is really difficult -- at the level I have chosen here -- and any good dictionary should provide authority if you need it. That said, there are numerous formal accounts or theories of measurement, which do not all agree on terminology or definitions.

+",2013-11-02 11:31:13.207 +58701,503.0,2,,58690.0,,,,CC BY-SA 3.0,"

Regarding 1: ordinal logistic regression is the first tool that springs to mind. If you have a good sense of the distribution of the people who committed fraud 4+ times per year, then other options may be available, but ordinal logistic looks good.

+ +
  2. The usual way to enter an interaction is by multiplying the two variables. Whether that is appropriate here depends on what 1-5 mean. If these are anchored by proportions (e.g. 1 = 0%, 2 = 20% or whatever) then multiplying may be possible. If you treat them as categorical you will have a lot of terms. How big is your data set?
+",2013-11-02 11:35:52.133 +58702,13202.0,1,58884.0,,,PCA and PLS: testing variables for significance,,CC BY-SA 3.0,"

I'm trying to understand the process for statistical testing for principal component analysis or partial least squares.

+ +

Step 1. PCA: +I feel that I have a not-terrible understanding of PCA: You find the ellipsoid described by the covariance matrix of the data, and then successively take the largest axis of variation (principal component 1), then the second largest (principal component 2), and so on. If the ellipsoid is long and stretched, then the variation is mostly along the first principal component (the eigenvector corresponding to the largest eigenvalue of the ellipsoid). If the ellipsoid is a planar ""disc"", then the variation in the data is explained well by two principal components, etc.

+ +

I also understand that after choosing to use (for example) only the first two principal components, then all of the data points can be plotted on a ""Scores"" plot that shows, for each data point $D^{(i)}$, the projection of $D^{(i)}$ into the plane spanned by the first two principal components. Likewise, for the ""Loadings"" plot (I think) you write the first and second principal components as linear combinations of the input variables and then for each variable, plot the coefficients that it contributes to the first and second principal components.

+ +

Step 2. PLS or PLS-DA: +If there are labels on the data (let's say binary classes), then build a linear regression model to use the first and second principal components to discriminate class 0 (for data point $i$, that means $Y^{(i)}=0$) from class 1 (for data point $i$, that means $Y^{(i)}=1$) by first projecting all data to only lie in the plane spanned by the first and second principal components, and then regressing the projected input data $X_1', X_2'$ to $Y$. This regression could be written as (first step) the affine transformation (i.e. linear transformation + bias) that projects along $PC_1, PC_2$ (the first and second principal components), and then (second step) a second affine transformation that predicts $Y$ from $PC_1, PC_2$. Together these transformations $Y \approx Affine(Affine(X))$ can be written as a single affine transformation $Y \approx C (A X + B) + D = E X + F$.

+ +

Step 3. Testing variables from $X$ for significance in predicting the class $Y$: +This is where I could use some help (unless I'm way off already, in which case tell me!). How do you take an input variable (i.e. a feature that has not yet been projected onto the principal-component (hyper)plane) and test whether it has a statistically significant coefficient in the regression $Y \approx E X + F$? Qualitatively, a coefficient in $E$ that is further from zero (i.e. positive or negative values with large magnitude) indicates a larger contribution from that variable.

+ +

I remember seeing linear regression t-tests for normally distributed data (to test whether the coefficients were zero). Is this the standard approach? In that case, I would guess that every variable from $X$ has been transformed to have a roughly normal distribution in Step 0 (i.e. before any of these other steps are performed).

+ +

Otherwise, I could see performing a permutation test (by running this entire procedure thousands of times and each time permuting $Y$ to shuffle the labels, and then comparing each single coefficient in $E$ from the un-shuffled analysis to the distribution of coefficients from shuffled analyses).
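
+ +

To make that concrete for myself, here is a rough sketch in R of what I mean (entirely made-up data and my own naive implementation, so it may well be the wrong test):

+ +
set.seed(1)
# Made-up data: 60 samples, 5 variables, binary class label y
X <- matrix(rnorm(300), nrow = 60, ncol = 5)
y <- rbinom(60, 1, plogis(X[, 1] - X[, 2]))

coef_per_variable <- function(X, y) {
  pca    <- prcomp(X, scale. = TRUE)
  scores <- pca$x[, 1:2]                        # projections onto PC1 and PC2
  fit    <- lm(y ~ scores)                      # regress the class label on the scores
  drop(pca$rotation[, 1:2] %*% coef(fit)[-1])   # fold back to one coefficient per (standardized) variable
}

obs  <- coef_per_variable(X, y)
null <- replicate(2000, coef_per_variable(X, sample(y)))   # shuffle the labels
rowMeans(abs(null) >= abs(obs))   # two-sided permutation p-value per variable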

+ +

Can you help me see anywhere my intuition is failing? I've been trying to look through papers using similar procedures to see what they did, and as is often the case, they're clear as mud. I'm preparing a tutorial for some other researchers, and I want to do a good job.

+",2013-11-02 11:55:32.803 +58703,594.0,2,,58598.0,,,,CC BY-SA 3.0,"

One general approach which you can adapt for part (a):

+ +

Consider a random variable $X$ and a constant, $c$, where $X\sim f_X(x), a\leq x\leq b$, say.

+ +

Let $Z=X+c$.

+ +

$P(Z\leq z) = P(X+c\leq z) = P(X\leq z-c) = F_X(z-c); a\leq z-c\leq b$.

+ +

You just need to get the pdf from the cdf and get the limits sorted out.

+ +
+ +

Note that you don't need to integrate anything to answer (a) unless you don't know the cdf for an exponential ($F_X(x)=1-e^{-\lambda x};\ x>0,\ \lambda>0$).

+ +

If you know it, you just start by writing down the cdf; then $P(X\leq z-c)$ simply requires substitution, not integration. So it's nothing more than 'substitute then differentiate'... but you must take care with the limits on the random variable, since $Z$ is clearly defined over a different set of values than $X$ is; what's needed there is clear from the last part of the line of algebra above.

+ +

In part (b), note that $n\tau$ is just a constant. It's the same problem as before (a shifted distribution), but with a gamma instead of an exponential; the hint in my answer above is general enough to work for both.

+ +

The only new trick is finding the MGF, which should involve a straightforward manipulation of the integral for the MGF to get something times a Gamma MGF.

+ +

... and (b)(ii) simply involves recognizing the relationship between parts (a) and (b).

+",2013-11-02 12:47:43.727 +58704,23283.0,1,,,,Mapping a value in a normal distribution,,CC BY-SA 3.0,"

I apologize in advance for misuse of terminology.

+ +

If given a number $x$ from a normal distribution with mean 0 and standard deviation 1, how can I map that onto another distribution with a different mean $m$ and standard deviation $s$?

+ +

Is it simply $m + s\,x$ ?

+ +

Clarification - the real-world problem I'm trying to solve: +I would like to generate random numbers from a normal distribution for which I know the mean and standard deviation; however, I can only generate random floating point numbers from a distribution with mean 0.0 and stddev 1.0. How can I translate or ""map"" these values onto the other distribution?
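
+ +

For example, in R (with purely illustrative values of $m$ and $s$), my guess corresponds to:

+ +
m <- 10                 # desired mean (illustrative)
s <- 2                  # desired standard deviation (illustrative)
z <- rnorm(1000)        # draws with mean 0 and sd 1
x <- m + s * z          # my guess at the mapping
c(mean(x), sd(x))       # roughly 10 and 2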

+",2013-11-02 14:01:34.637 +58705,23285.0,1,,,,how to Normalize data(with noise) into 0-1 range good scale in mean and variance?,,CC BY-SA 3.0,"

I have a data matrix, with perhaps some points in one cluster and others in another cluster. The data are on a scale of roughly [0, 1000] (just as an example), and I want to normalize them into [0, 1] while keeping the mean and variance meaningful. That is, I want the mean and variance of the rescaled data to be chosen in a way that takes all of the data into account and does not ignore the noisy points. For example, one way is min-max scaling, $z_i=\frac{x_i-\min(x)}{\max(x)-\min(x)}$, but with this approach, if most of the data lie in one large cluster and only a few points lie in another, the mean and variance end up determined by the large cluster and pay no attention to the noisy data.
+Please point me to a paper (the newer the better) or a method that solves this problem. Thank you.

+",2013-11-02 14:23:26.503 +58706,17740.0,2,,58435.0,,,,CC BY-SA 3.0,"

This is one of the generalizations of classification that are tackled in semi-supervised learning. If you have a measurement of certainty you can use approaches that allow weighting of training instances. The higher the certainty, the larger the corresponding instance weight. Examples of such approaches include instance-weighted SVM and logistic regression.

+ +

I'm sure weka has implementations of these algorithms. If all else fails, sample multiple instances from the instances with high certainty. You can use this approach for traditional SVM or LR.

+ +

Example: SVM

+ +

If I am not mistaken, weka has interfaces to LIBSVM. LIBSVM allows you to solve class-weighted SVM in all its releases, and instance-weighted SVM in special versions of each release. I'm going to assume weka does not support the latter (which is what you need).

+ +

Class weighted SVM minimizes the following objective function: +$$ +\min_{\mathbf{w},\xi} \|\mathbf{w}\|^2 + {\color{blue}C_{pos}} \sum_{i \in \mathcal{P}} \xi_i + {\color{blue}C_{neg}} \sum_{i \in \mathcal{N}} \xi_i, +$$ +with $\mathbf{w}$ the separating hyperplane in feature space, $\xi$ the slack variables (which model training misclassification) and $\mathcal{P}$ and $\mathcal{N}$ the set of support vectors belonging to the positive and negative class, respectively. Using the weights $C_{pos}$ and $C_{neg}$ you can assign different misclassification penalties between classes.

+ +

Based on your question, it seems like you would ideally want to use 6 different weights (2 classes $\times$ 3 levels of certainty). You can achieve this for many approaches by duplicating samples of the points with high certainty.

+ +

For example, in terms of SVM, using the same data instance twice yields an identical solution to doubling its associated $C$ value. This is a very easy way to assign high misclassification penalties to certain data instances. You can follow the same approach for logistic regression.
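
+ +

A small sketch of the duplication trick for logistic regression in R (hypothetical data, with certainty levels 1-3 used directly as replication counts; note that glm also accepts case weights, which gives the same fit without physically copying rows):

+ +
set.seed(1)
# Hypothetical labelled data with a certainty level of 1, 2 or 3 per instance
n <- 200
x <- rnorm(n)
y <- rbinom(n, 1, plogis(2 * x))
certainty <- sample(1:3, n, replace = TRUE)

# Option 1: physically duplicate high-certainty instances
dup <- data.frame(x = rep(x, certainty), y = rep(y, certainty))
fit_dup <- glm(y ~ x, family = binomial, data = dup)

# Option 2: equivalent, using case weights directly
fit_w <- glm(y ~ x, family = binomial, data = data.frame(x, y), weights = certainty)

cbind(coef(fit_dup), coef(fit_w))   # identical coefficient estimates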

+",2013-11-02 15:13:50.210 +58707,23129.0,2,,16313.0,,,,CC BY-SA 3.0,"

Here's a quote from Andrew Gilpin (1993) advocating Maurice Kendall's $τ$ over Spearman's $ρ$ for theoretical reasons:

+ +
+

[Kendall's $τ$] approaches a normal distribution more rapidly than $ρ$, as $N$, the sample size, increases; and $τ$ is also more tractable mathematically, particularly when ties are present.

+
+ +

I can't add much about Goodman-Kruskal $γ$, other than that it seems to produce ever-so-slightly larger estimates than Kendall's $τ$ in a sample of survey data I've been working with lately... and of course, noticeably lower estimates than Spearman's $ρ$. However, I also tried calculating a couple partial $γ$ estimates (Foraita & Sobotka, 2012), and those came out closer to the partial $ρ$ than the partial $τ$... It took a fair amount of processing time though, so I'll leave the simulation tests or mathematical comparisons to someone else... (who would know how to do them...)

+ +

As ttnphns implies, you can't conclude that your $ρ$ estimates are better than your $τ$ estimates by magnitude alone, because their scales differ (even though the limits don't). Gilpin cites Kendall (1962) as describing the ratio of $ρ$ to $τ$ to be roughly 1.5 over most of the range of values. They get closer gradually as their magnitudes increase, so as both approach 1 (or -1), the difference becomes infinitesimal. Gilpin gives a nice big table of equivalent values of $ρ$, $r$, $r^2$, d, and $Z_r$ out to the third digit for $τ$ at every increment of .01 across its range, just like you'd expect to see inside the cover of an intro stats textbook. He based those values on Kendall's specific formulas, which are as follows: +$$ +\begin{aligned} +r &= \sin\bigg(\tau\cdot\frac \pi 2 \bigg) \\ +\rho &= \frac 6 \pi \arcsin \bigg(\frac{\sin(\tau\cdot\frac \pi 2)} 2 \bigg) +\end{aligned} +$$ +(I simplified this formula for $ρ$ from the form in which Gilpin wrote it, which was in terms of Pearson's $r$.)

+ +

Maybe it would make sense to convert your $τ$ into a $ρ$ and see how the computational change affects your effect size estimate. Seems that comparison would give some indication of the extent to which the problems that Spearman's $ρ$ is more sensitive to are present in your data, if at all. More direct methods surely exist for identifying each specific problem individually; my suggestion would produce more of a quick-and-dirty omnibus effect size for those problems. If there's no difference (after correcting for the difference in scale), then one might argue there's no need to look further for problems that only apply to $ρ$. If there's a substantial difference, then it's probably time to break out the magnifying lens to determine what's responsible.

+ +

I'm not sure how people usually report effect sizes when using Kendall's $τ$ (to the unfortunately limited extent that people worry about reporting effect sizes in general), but since it seems likely that unfamiliar readers would try to interpret it on the scale of Pearson's $r$, it might be wise to report both your $τ$ statistic and its effect size on the scale of $r$ using the above conversion formula...or at least point out the difference in scale and give a shout out to Gilpin for his handy conversion table.

+ +

References

+ +

Foraita, R., & Sobotka, F. (2012). Validation of graphical models. gmvalid Package, v1.23. The Comprehensive R Archive Network. URL: http://cran.r-project.org/web/packages/gmvalid/gmvalid.pdf

+ +

Gilpin, A. R. (1993). Table for conversion of Kendall's Tau to Spearman's Rho within the context measures of magnitude of effect for meta-analysis. Educational and Psychological Measurement, 53(1), 87-92.

+ +

Kendall, M. G. (1962). Rank correlation methods (3rd ed.). London: Griffin.

+",2013-11-02 15:42:21.763 +58722,2873.0,2,,58714.0,,,,CC BY-SA 3.0,"

In the case of the geometric distribution this will work, since it (like the exponential for continuous variables) is memoryless, meaning that the probability that $X = s + k$ given that $X > s$ is just the probability that $X = k$. This will not, however, work for other distributions.

+ +

How are you generating the geometric to begin with? That may be what is slowing you down.
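
+ +

If it helps, in R (where rgeom counts failures before the first success, i.e. has support starting at 0) the whole thing can be vectorized, which is usually much faster than a loop:

+ +
p <- 0.3               # illustrative success probability
s <- 5                 # illustrative truncation point
n <- 1e5
k <- s + rgeom(n, p)   # draws from p(k) proportional to (1-p)^(k-1) on {s, s+1, ...}

min(k)                 # never below s
table(k)[1:5] / n      # empirical pmf at k = s, ..., s+4
p * (1 - p)^(0:4)      # theoretical truncated pmf at k = s, ..., s+4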

+",2013-11-02 19:46:15.890 +58708,14919.0,1,58717.0,,,self-selection bias due to nonresponse?,,CC BY-SA 3.0,"

I have administrative data from the whole population of new doctorates, in a given year, from my region. +We have also survey data from a sample of this same population (where the whole population was contacted to participate in the survey, but only around 65% of doctorates participated: this is the sample I should work on). +There will be self-selection bias if the non-response is not random.

+ +

I want to investigate possible wage gaps between different groups in my region (the dependent variable is the log of the ratio between two average wages).

+ +

Any inputs on how to tackle this issue? +Literature on this issue?

+",2013-11-02 15:46:03.437 +58709,23129.0,2,,3646.0,,,,CC BY-SA 3.0,"

Here's a quote from Andrew Gilpin (1993) advocating Kendall's τ over Spearman's ρ for theoretical reasons:

+
+

"[Kendall's $τ$] approaches a normal distribution more rapidly than $ρ$, as $N$, the sample size, increases; and $τ$ is also more tractable mathematically, particularly when ties are present."

+
+

Reference

+

Gilpin, A. R. (1993). Table for conversion of Kendall's Tau to Spearman's Rho within the context measures of magnitude of effect for meta-analysis. Educational and Psychological Measurement, 53(1), 87-92.

+",2013-11-02 15:50:35.117 +58710,23287.0,1,,,,Correct use of cross validation in LibsSVM,,CC BY-SA 3.0,"

I am classifying data points from two different groups using LibSVM.

+ +

I do the following:

+ +
  1. Creating the input file for LibSVM. In the input file, I put all the data I have.
  2. Scaling it (using svm-scale).
  3. Using grid.py of LibSVM for choosing the gamma and c parameters.
  4. Using svm-train for training. I use the entire dataset. I also use the -v 10 option for 10-fold cross validation (svm-train flag).
+ +

My questions are:

+ +

a. Can the -v 10 cross-validation option replace the testing step?

+ +

b. The result given by the steps above is suspiciously high (96%), and so I'm wondering if I am doing something wrong?

+ +

c. Could the use of grid.py for parameter selection before the training + cross-validation step damage the results (as if I were testing on data I've already trained on)?

+",2013-11-02 15:54:55.837 +58711,13666.0,1,,,,Cleaning corpus,,CC BY-SA 4.0,"

I have a classifier that I want to use on short documents, and I need to improve the quality of the corpus used for training. I have 250,000 documents and manual inspection is expensive.

+ +

The collection of short documents has been pre-labelled as good or bad by a set of rules. The accuracy of this pre-processing step has been assessed by sampling to be around 98% or higher. When I train the classifier on 80% of the corpus and test its accuracy on the remaining 20%, it has 98% accuracy.

+ +

What I am wondering is this: if I filter the corpus and remove the documents for which there is disagreement and retrain and repeat, can I expect to arrive at a corpus for which the classifier agrees with the original labeling process within some small margin of error (less than 1%)?

+ +
EDIT:
+
+ +

After finding a corpus that appears to be clean, I would manually inspect the documents that were rejected. I expect there would be around 5000 (2% of 250,000)

+",2013-11-02 15:56:14.807 +58712,306.0,2,,58705.0,,,,CC BY-SA 3.0,"

If you know or you think that there are two clusters in the data, then fit a mixture of two normal distributions, one for each cluster. Then, for every point, determine which cluster it belongs to and perform the transformation you think is right for that cluster. When reporting or using each point, report the distribution you have assigned the point to as well as the transformed value. That way you get to preserve the natural structure of the data, yet change it to the form you want.
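A minimal R sketch of the mixture-fitting step on made-up data; mixtools::normalmixEM is just one of several tools that will do this:

library(mixtools)
set.seed(1)
x   <- c(rnorm(200, 0, 1), rnorm(150, 5, 1.5))   # toy data with two clusters
fit <- normalmixEM(x, k = 2)                     # EM fit of a two-component normal mixture
fit$mu; fit$sigma; fit$lambda                    # component means, sds and mixing weights
cluster <- apply(fit$posterior, 1, which.max)    # assign each point to its most likely cluster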

+",2013-11-02 16:09:37.953 +58713,2666.0,2,,16313.0,,,,CC BY-SA 3.0,"

These are all good indexes of monotonic association. Spearman's $\rho$ is related to the probability of majority concordance among random triplets of observations, and $\tau$ (Kendall) and $\gamma$ (Goodman-Kruskal) are related to pairwise concordance. The main decision to make in choosing $\gamma$ vs. $\tau$ is whether you want to penalize for ties in $X$ and/or $Y$. $\gamma$ does not penalize for ties in either, so that a comparison of the predictive ability of $X_{1}$ and $X_{2}$ in predicting $Y$ will not reward one of the $X$s for being more continuous. This lack of reward makes it a bit inconsistent with model-based likelihood ratio tests. An $X$ that is heavily tied (say a binary $X$) can have high $\gamma$.

+",2013-11-02 17:10:49.090 +58714,10772.0,1,,,,Sampling from truncated distribution,,CC BY-SA 3.0,"

I want to sample from a truncated distribution that appears in a Gibbs sampling scheme. +The full conditional of the distribution is given by

+ +

$p(X = k | \ldots) \propto (1 - p)^{k - 1} \mathbb{1} ( s \leq k)$, where $s$ is a positive integer.

+ +

This is a truncated geometric distribution. The technique I am following to simulate from this is first to sample a random number from a geometric distribution and then add the number $s$ to it.

+ +

First of all, I want to ask if what I am doing is right. And second, is there any obvious reason for this simulation to be slow? Or is it slow because I am doing something wrong?

+",2013-11-02 17:53:46.303 +58723,20473.0,1,,,,For which distributions does uncorrelatedness imply independence?,,CC BY-SA 3.0,"

A time-honored reminder in statistics is ""uncorrelatedness does not imply independence"". Usually this reminder is supplemented with the psychologically soothing (and scientifically correct) statement ""when, nevertheless the two variables are jointly normally distributed, then uncorrelatedness does imply independence"".

+ +

I can increase the count of happy exceptions from one to two: when two variables are Bernoulli-distributed, then again, uncorrelatedness implies independence. If $X$ and $Y$ are two Bernoulli rv's, $X \sim B(q_x),\; Y \sim B(q_y)$, for which we have $P(X=1) = E(X) = q_x$, and analogously for $Y$, their covariance is

+ +

$$\operatorname{Cov}(X,Y)= E(XY) - E(X)E(Y) = \sum_{S_{XY}}p(x,y)xy - q_xq_y $$

+ +

$$ = P(X=1,Y=1) - q_xq_y = P(X=1\mid Y=1)P(Y=1)-q_xq_y$$

+ +

$$= \Big(P(X=1\mid Y=1)-q_x\Big)q_y $$

+ +

For uncorrelatedness we require the covariance to be zero so

+ +

$$\operatorname{Cov}(X,Y) = 0 \Rightarrow P(X=1\mid Y=1) = P(X=1)$$

+ +

$$\Rightarrow P(X=1,Y=1) = P(X=1)P(Y=1) $$

+ +

which is the condition that is also needed for the variables to be independent.

+ +

So my question is: Do you know of any other distributions (continuous or discrete) for which uncorrelatedness implies independence?

+ +

Meaning: Assume two random variables $X,Y$ whose marginal distributions belong to the same distribution family (perhaps with different values for the distribution parameters involved), but let's say with the same support, e.g. two exponentials, two triangulars, etc. Are all solutions to the equation $\operatorname{Cov}(X,Y) = 0$ such that they also imply independence, by virtue of the form/properties of the distribution functions involved? This is the case with normal marginals (given also that they have a bivariate normal distribution), as well as with Bernoulli marginals. Are there any other cases?

+ +

The motivation here is that it is usually easier to check whether covariance is zero, compared to check whether independence holds. So if, given the theoretical distribution, by checking covariance you are also checking independence (as is the case with the Bernoulli or normal case), then this would be a useful thing to know.
+If we are given two samples from two r.v's that have normal marginals, we know that if we can statistically conclude from the samples that their covariance is zero, we can also say that they are independent (but only because they have normal marginals). It would be useful to know whether we could conclude likewise in cases where the two rv's had marginals that belonged to some other distribution.

+",2013-11-02 20:13:49.093 +58724,1145.0,2,,58179.0,,,,CC BY-SA 3.0,"

This is a great question!

+ +

In my opinion the most important place to start an answer is to point out the distinction that should be drawn between P-values which are the result of a significance test and the decisions that are the results of hypothesis tests. Hypothesis tests are designed to control long term errors and are entirely frequentist in that the Frequentist or Repeated Sampling Principle is respected above all other considerations. In contrast the P-value from a significance test is an index of the evidence in the data and should be used for inference that is consistent with the Likelihood Principle.

+ +

Hypothesis tests and significance tests are usually assumed to be the same thing. Among the scientists with whom I hang out, the most common test employed is a hybrid of the two which offers most of the disadvantages of both with few advantages. That hybrid has been written about by many, including myself. See this paper for a description of the issues aimed at biomedical scientists: http://www.ncbi.nlm.nih.gov/pubmed/22394284.

+ +

The various scenarios with ad hoc alterations to experimental design that you present are very problematical to a frequentist analyst because the long run error rates associated with decisions depend on pre-specified criteria of $\alpha$ (usually 0.05), sample size etc. If the experiment changes between the design phase and implementation it is difficult to know what characteristics of design to impute. Thus the alterations in design affect the error rates and it is not always easy to make an appropriate adjustment to the analysis that would be acceptable to everyone.

+ +

For example, if an experiment is extended beyond the initial design sample size then it begs the question of whether the extension is an attempt to gain 'significance' from data that didn't quite make it at the designed end point. Such behaviour increases the risk of false positive results. So does stopping prematurely with a significant result.

+ +

If you view the data as evidence then the matching up of experimental design and implementation is much less important. A larger sample will provide more reliable and more convincing evidence. The P-value need not be 'adjusted' or 'corrected' for deviations from the planned sample size when it is viewed as an index of evidence.

+ +

Of course, what it means to be an index of evidence is not clear to most people. It turns out that for any P-value from a significance test of a certain sample size (pre-specified or not) there is a specific likelihood function. That likelihood function depicts the evidence that the P-value indexes. I have written a long paper on this topic that, by happy coincidence, I arXived just two days ago. Its title is ""To P or not to P: on the evidential nature of P-values and their place in scientific inference"" http://arxiv.org/abs/1311.0081. The next paragraph is a relevant extract:

+ +
+ +

The long-run error rates associated with an experiment are a property of the experimental design and the behaviour of the experimenter rather than of the data. The 'size' of the experiment, $\alpha$, is properly set before the data is available, so it cannot be data-dependent. In contrast, the P-value from a significance test is determined by the data rather than the arbitrary setting of a threshold. It cannot logically be an error rate because it doesn't force a decision in the way that inductive behaviour does, and if a decision is made to discard the null hypothesis when a small (presumably) P-value is observed, the decision is made on the basis of the smallness of the P-value in conjunction with whatever information that the experimenter considers relevant. Thus the rate of erroneous inferences is a function of not only the P-value but the quality and availability of additional information and, sometimes, the intuition of the experimenter. P-values are not error rates, whether 'observed', 'obtained' or 'implied'.

+ +
+ +

So, answers. Yes, P-values are still useful in your scenarios. Unadjusted P-values still index the likelihood functions that depict the evidence in the data and you can still make inferences using that evidence. You should not adjust the P-values for multiple comparisons, but instead understand that the evidence needs to be interpreted in light of the number of comparisons made. Deviations from the plan will invalidate frequentist hypothesis tests, but not necessarily invalidate a likelihood-based evaluation of the evidence.

+ +

Getting the most out of the evidence and avoiding inflated type I errors are not the same thing, and you cannot comply with the frequentist principle and the likelihood principle at the same time in many circumstances. Your edit makes it sound like you have a Bayesian model in mind. That is usually a good idea (and will usually comply with the likelihood principle).

+",2013-11-02 20:56:45.060 +58715,17740.0,2,,58710.0,,,,CC BY-SA 3.0,"

It seems like you are mixing a couple of things up. First of all, cross-validation is used to get an accurate idea of the generalization error when certain tuning parameters are used.

+ +

You can use svm-train in k-fold cross-validation mode using the -v k flag. In this mode, svm-train does not output a model -- just a cross-validated estimate of the generalization performance.

+ +

grid.py is basically a wrapper around svm-train in cross-validation mode. It allows you to easily assess the best parameter tuple out of a given set of options via cross-validation. It is essentially a loop over the specified parameter tuples which performs cross-validation.

+ +
+

a. Can the -v 10 cross-validation option replace the testing step?

+
+ +

Not entirely. Cross-validation is indeed used to get an estimate of the generalization performance of a model, but when performing cross-validation the entire training set is never used to construct a single model. The typical steps are (i) find optimal tuning parameters using cross-validation, (ii) train a model using these optimal parameters on the full training set and (iii) test this model on the test set.
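As a rough sketch of those three steps, here is the same workflow using R's e1071 interface to libsvm rather than the command-line tools, with iris standing in for your data:

library(e1071)
set.seed(1)
idx   <- sample(nrow(iris), 100)
train <- iris[idx, ];  test <- iris[-idx, ]
# (i) choose gamma and cost by cross-validation on the training set
tuned <- tune.svm(Species ~ ., data = train, gamma = 2^(-4:2), cost = 2^(-2:6))
# (ii) refit a single model on the full training set with the chosen parameters
fit <- svm(Species ~ ., data = train,
           gamma = tuned$best.parameters$gamma,
           cost  = tuned$best.parameters$cost)
# (iii) assess that single model on the held-out test set
mean(predict(fit, newdata = test) == test$Species)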

+ +
+

b. The result given by the steps above is suspiciously high (96%), and so I'm wondering if I am doing something wrong?

+
+ +

Don't worry, be happy. Such classification accuracies are quite feasible for a wide range of problems.

+ +
+

c. Could the use of grid.py for parameter selection before the train + cross validation damage the results (as if I were testing on data I've already trained)?

+
+ +

grid.py does cross-validation for you. There is no point to perform cross-validation again after you ran grid.py.

+",2013-11-02 17:59:06.290 +58716,23289.0,1,58733.0,,,Wilcoxon signed rank test on anonymous sample,,CC BY-SA 3.0,"

I carried out a test on a group of 100 people. The test contained 10 questions with 5-point Likert scale answers (1 = Strongly Agree, 5 = Strongly Disagree). After the test was conducted, the group was educated over a period of time on how to answer the questions. After the education was completed, the same test was carried out again on the same group, but with 10 people absent. Given that the pre- and post-tests were conducted anonymously, I would like to test whether there is a significant difference between the two sets of test results. If I use the Wilcoxon signed rank test for a paired sample to compare results for each individual question of each test, the results will differ depending on how the test results are paired up when entered. In other words, I can't tell whether a given student improved because I can't link his first test to his second test.

+ +

I don't know if using the Wilcoxon Signed Rank Test for a paired sample would be possible given the tests were done anonymously.

+ +

Any help is appreciated. +Thanks

+",2013-11-02 18:11:51.227 +58717,503.0,2,,58708.0,,,,CC BY-SA 3.0,"

A few points:

+ +

1) The log of a ratio is simply the difference of the logs. Is this really what you want?

+ +

2) As to your main point; although it is not completely clear, the two methods that come to mind here are multiple imputation and propensity scores. There is a huge amount of literature on both of these. You could start by looking up both those terms right here on CrossValidated. That should get you to places where you can access the wider literature.

+ +

Here are 28 posts about propensity scores.

+ +

Here are 53 posts about multiple imputation.

+ +

and here is one post about using both (with links to several articles)

+",2013-11-02 18:22:39.377 +58718,23292.0,2,,1760.0,,,,CC BY-SA 3.0,"

Do you want to know if his forecast is more accurate than another forecast? If so, you can look at basic accuracy metrics for probabilistic classification like cross-entropy, precision/recall, ROC curves, and the f1-score.

+ +

Determining if the forecast is objectively good is a different matter. One option is to look at calibration. Of all the days where he said that there would be a 90% chance of rain, did roughly 90% of those days have rain? Take all of the days where he has a forecast and then bucket them by his estimate of the probability of rain. For each bucket, calculate the percentage of the days where rain actually occurred. Then for each bucket plot the actual probability of rain against his estimate for the probability of rain. The plot will look like a straight line if the forecast is well calibrated.
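A small R sketch of such a calibration plot, using simulated data from a perfectly calibrated forecaster just to show the mechanics:

set.seed(7)
pred <- runif(1000)                              # hypothetical forecast probabilities
rain <- rbinom(1000, 1, pred)                    # outcomes from a perfectly calibrated forecaster
bucket    <- cut(pred, breaks = seq(0, 1, 0.1), include.lowest = TRUE)
observed  <- tapply(rain, bucket, mean)          # actual frequency of rain per bucket
predicted <- tapply(pred, bucket, mean)          # average forecast per bucket
plot(predicted, observed, xlim = c(0, 1), ylim = c(0, 1),
     xlab = "Forecast probability", ylab = "Observed frequency")
abline(0, 1, lty = 2)                            # well-calibrated forecasts lie near this line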

+",2013-11-02 19:04:43.113 +58719,4320.0,2,,58658.0,,,,CC BY-SA 3.0,"

You need to first calculate all your updates as if the weights weren't shared, but just store them; don't actually do any updating yet.

+ +

Let $w_k$ be some weight that appears at locations $I_k = \{(i,j) \colon w_{i,j} = w_k\}$ in your network and $\Delta w_{i,j} = -\eta \frac{\partial J}{\partial w_{i,j}} $ where $\eta$ is the learning rate and $J$ is your objective function. Note that at this point, if you didn't have weight sharing, you would just update $w_{i,j}$ as +$$ + w_{i,j} = w_{i,j} + \Delta w_{i,j}. +$$ +To deal with the shared weights you need to sum up all the individual updates. So set +$$ + \Delta w_k = \sum_{(i,j) \in I_k} \Delta w_{i,j} +$$ +and then update +$$ + w_k = w_k + \Delta w_k. +$$
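A tiny numerical illustration of the summing step in R; all of the names and values here are made up:

set.seed(1)
eta  <- 0.1
grad <- matrix(rnorm(12), nrow = 3)        # pretend per-position gradients dJ/dw_{i,j}
I_k  <- rbind(c(1, 2), c(2, 3), c(3, 4))   # the (i,j) positions that all share the weight w_k
w_k  <- 0.5
delta_wk <- -eta * sum(grad[I_k])          # sum of the individual (unshared) updates
w_k <- w_k + delta_wk                      # single update applied to the shared weight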

+",2013-11-02 19:10:34.450 +58720,22682.0,1,58743.0,,,Variance of sum of random number of random variables (Cambridge University Worksheet),,CC BY-SA 3.0,"

In the vein of my last question, I'm now at a roadblock on question 3 of this sheet:

+ +

http://www.trin.cam.ac.uk/dpk10/IA/exsheet3.pdf

+ +

(note: it's not my intention to ask every question I get stuck on here, merely the ones which have interesting general results; it just so happens that the two I've struggled with so far fit this criteria)

+ +

It goes as follows:

+ +

Let $N$ be a non-negative integer-valued random variable with mean $\mu_1$ and variance $\sigma_1^2$, and let $X_1, X_2, ...$ be random variables, each with mean $\mu_2$ and variance $\sigma_2^2$; furthermore, assume that $N, X_1, X_2, . . .$ are independent. Without using generating functions, calculate the mean and variance of the random variable $S_N = X_1 + ... + X_N$ (when $N=0$ interpret $S_N$ as $0$).

+ +

I have the answer to calculating the mean, which I'll write up as an answer for future reference. Here's where I've got to with the variance bit:

+ +

$\mathbb{E}(S_N^2) = 0.P(N=0) + \sum_{r=1}^\infty\mathbb{E}((\sum_{i=1}^rX_i)^2).P(N=r)$

+ +

For $r=1$, we have the inner expectation as: $\mathbb{E}(X_1^2)$ which is equal to $\sigma_2^2+\mu_2^2$.

+ +

For $r > 1$, we have the inner expectation equal to: $r(\sigma_2^2 + 2\mu_2^2)$

+ +

So $\mathbb{E}(S_N^2) = (\sigma_2^2+\mu_2^2)P(N=1) + (\sigma_2^2 + 2\mu_2^2)\sum_{r=2}^\infty rP(N=r)$

+ +

which is almost in the form where I can use the same trick as in calculating the mean, but not quite.

+ +

Any help is much appreciated.

+",2013-11-02 19:16:47.060 +58726,23296.0,1,58746.0,,,Covariance with conditional expectation,,CC BY-SA 3.0,"
+

Suppose $X$ and $Y$ are random variables, $E(Y^2) < \infty$ and +$\varepsilon = Y - E(Y|X)$ so that $Y = E(Y|X) + \varepsilon$.

+

Given that $E(\varepsilon | X) = E(\varepsilon) = 0$, show that +$Cov(\varepsilon , E(Y|X)) = 0$.

+
+

This question has multiple parts so $E(Y^2) < \infty$ may or may not be applicable in this case.

+

Here's what I tried so far. I used the fact that $Cov(X,Y) = E(XY) - E(X)E(Y)$ and $Cov(X,Y) = E[(X - E(X))(Y-E(Y))]$ and concluded that $Cov(\varepsilon , E(Y|X)) = E(\varepsilon E(Y|X))$ or in other words, $E(\varepsilon E(Y)) = 0$.

+

From there, I guess the only thing I have to show is that: $E(\varepsilon E(Y|X)) = 0$, but I'm having trouble doing this.
+Am I on the right track, or is this completely the wrong approach to tackling this problem?

+",2013-11-02 21:34:27.800 +58727,9063.0,1,58729.0,,,Survival regression and prediction using median,,CC BY-SA 3.0,"

I'm using artificially generated hazard curves (that is, I know the true hazard curve) and Aalen's additive model to fit the covariates. For example, below is an individual's hazard curve and my estimate of it:

+

It's a good fit to my eye, and here is the individual's estimated survival curve (exp of the negative of the above )

+

+

Again, this looks fine to me. What happens next, though, is odd. To make predictions of individuals' lifetimes, I use the median of the survival curve (the intersection of the curve and 0.5). I then plot this prediction against the observed value, and what I was expecting to see was a high correlation between the two; instead I see this:

+

+

This seems very wrong to me. A few questions I have:

+
    +
  • Is using the median correct? Intuitively I see nothing wrong with it, but my results suggest otherwise.
  • Is predicting lifetimes just too noisy a challenge, and is this indeed the best I can do?
  • Has anyone experienced something like this before?
+",2013-11-02 21:50:56.473 +58728,23284.0,1,58734.0,,,Frequency of time series in R,,CC BY-SA 3.0,"

I have wait-time data at 10-minute intervals for a coffee shop, covering 4 weeks between 9am and 5pm. I use R's ts for my analysis. What should the frequency parameter be?

+ +

Is it (48=# of intervals per day) or just 1?

+",2013-11-02 22:01:03.497 +58729,503.0,2,,58727.0,,,,CC BY-SA 3.0,"

The bottom plot is simply saying that lifetimes often vary from the median. This is inherent in the first two plots. That is, the person in the plot has a median of about 10, but he/she also has about a 20% chance of dying before 5, and about a 20% chance of making it past 20.

+ +

If curves for other people are similar, then the bottom plot agrees with the top 2.

+",2013-11-02 22:29:13.400 +58730,23297.0,1,58732.0,,,Standard Deviation and Variance in Sample and Population Formulas for all?,,CC BY-SA 3.0,"

What are the differences between the population and sample standard deviation (and variance), and what are their formulas? I was told that they are different formulas; in which situation do you use the population version versus the sample version?

+ +

If a user gives 10 random numbers, would I be using population or sample?

+",2013-11-02 22:30:16.207 +58731,503.0,2,,58730.0,,,,CC BY-SA 3.0,"

OK....

+ +

The formulas vary because the population SD has N in the denominator and the sample SD has N-1 in the denominator.

+ +

As to when to use each, well.... it's sort of inherent in the names of the two terms; or are you unclear on sample vs. population?

+",2013-11-02 22:39:59.947 +58732,594.0,2,,58730.0,,,,CC BY-SA 3.0,"

The population variance is the average squared distance from the mean. You use it when you have the population.

+ +

That is, if you have every member of the population of interest, you compute $\sigma^2 = \bar v = \frac{\sum_i v_i}{n}$ where $v_i = (x_i - \mu)^2$.

+ +

(If you're asking about variances of random variables, see here for the details of the relevant formulas)

+ +

The sample variance, $s^2$ uses the same formula, but usually the denominator of the average is taken to be one smaller because observations are closer to the sample mean than they are to the population mean, which makes the squared deviations too small on average; replacing $\frac{}{n}$ with $\frac{}{n-1}$ makes them right on average. The $n-1$ denominator version is sometimes called $s^2_{n-1}$ by contrast from the version with the $n$ denominator, $s^2_n$. You use sample variance when you have a sample.

+ +

The relevant standard deviations are simply the square roots of the corresponding variances.
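A quick R illustration with made-up numbers (R's built-in var() and sd() use the $n-1$ denominator):

x <- c(2, 4, 4, 4, 5, 5, 7, 9)
n <- length(x)
var(x)                          # sample variance, denominator n - 1
mean((x - mean(x))^2)           # population variance, denominator n
sd(x)                           # sample standard deviation
sqrt(mean((x - mean(x))^2))     # population standard deviation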

+",2013-11-02 22:49:02.983 +58733,5448.0,2,,58716.0,,,,CC BY-SA 3.0,"

The short answer, in three parts, is a) no, you can't do a paired test, as has been pointed out in comments, b) yes, you can do an unpaired test, and c) that 10% of non-respondents to the second test may be important.

+ +

Let us consider a simplistic hierarchical model of response, where there is an individual-level characteristic $\theta_i$ which has some distribution $f(\theta)$ and a test-specific response $y_{ij}$ for tests $j \in {1,2}$ that depends upon the individual-level characteristic through its distribution $p_j(y_{ij} | \theta_i)$. If we know $i$ for each $y_{ij}$ we can obviously do the paired test, and the difference between $y_{i1}$ and $y_{i2}$ obviously are not influenced by the differences between the $\theta_i$, since it is for the same $i$.

+ +

If, on the other hand, we don't know the individual $i$, we are faced with draws from two distributions $p^*_j(y_{ij}) = \int_\Theta p_j(y_{ij} | \theta_i) f(\theta_i)\text{d}\theta_i$. The scores $y_{ij}$ are still independent across $i$ and, under the null, independent across $j$ as well. The distribution itself no longer varies across $i$. Under alternative hypotheses, the distributions $p_j$ will still differ across $j$, it's just that they are population-level distributions rather than individual-level distributions.

+ +

Consequently, we can still perform an (unpaired) test for differences between $j$, but it's going to be less powerful than if you could get rid of the extra variability introduced by not knowing the individuals. It's just a matter of what you can condition on; more conditioning reduces variability and thereby increases power.

+ +

Personally, I'd use the unpaired version of the Wilcoxon, as you can't lose much relative to the unpaired version of the $t$-test and you might gain a lot. See this question for a little more information.

+ +

Of greater concern is that missing 10% of the original sample. You'd really like to understand the missing data mechanism, if any. Consider the possibility that the 10 who dropped out were among the poorest performers on the original test, and that the amount of improvement was strongly negatively related to how well an individual performed on the first test (i.e., poor performers improved a lot more on average than good performers). That, combined with regression-to-the-mean effects, means you'd likely be missing data on some of your largest gains, thus a) weakening your ability to detect a significant difference, and b) biasing your estimate(s) of how much improvement there was downwards. OTOH, under the null hypotheses, we expect to see a gain, because we are including the low scorers in sample 1 but removing 10 likely low scorers from sample 2. So there's an upwards bias there too. Which effect dominates isn't clear, but what is clear is that your test and associated estimates would almost certainly be biased.

+ +

For example, if I simulate from the simple model above assuming $\theta_i \sim \text{N}(0,1)$ and $y_{ij} \sim \text{N}(\theta_i,1)$, and drop the $y_{i2}$ for which $y_{i1}$ was in the 10 lowest values, the expected value of $y_{i2} \approx 0.136$ while that of $y_{i1} = 0$. 0.136 is about 0.96 standard deviations above 0, relative to the std. dev. of the difference between the means of $y_{i1}$ and $y_{i2}$, which would obviously have a huge impact on your type I and type II error probabilities.

+ +
e2 <- rep(0,100000)
+for (i in 1:100000) {
+   theta <- rnorm(100)
+   y1 <- rnorm(100, theta)
+   y2 <- rnorm(100, theta)
+   y2[order(y1)[1:10]] <- NA
+   e2[i] <- mean(y2, na.rm=TRUE)
+}
+mean(e2)
+[1] 0.1360364
+
+",2013-11-03 00:03:21.377 +58734,132.0,2,,58728.0,,,,CC BY-SA 3.0,"

Most likely you have two seasonal periods: 48 (number of intervals per day) and 48x5 (number of intervals per week assuming a 5-day week).

+ +

The tbats() function from the forecast package in R will handle multiple seasonal periods. For example (where x is the data):

+ +
library(forecast)
+x <- msts(x, seasonal.periods=c(48, 48*5))
+fit <- tbats(x)
+fc <- forecast(fit, h=48*5)
+plot(fc)
+
+ +

An alternative (and the only easy option if there are missing data) is to use Fourier terms for the seasonal periods and ARMA errors to handle any remaining serial correlation. The ARIMA functions in R do not automatically handle multiple seasonal periods, but the following R code should work:

+ +
x <- ts(x, frequency=48)
+seas1 <- fourier(x, K=3)
+seas2 <- fourier(ts(x, freq=48*5), K=3)
+fit <- auto.arima(x, xreg=cbind(seas1,seas2))
+
+seas1.f <- fourierf(x, K=3, h=48*5)
+seas2.f <- fourierf(ts(x, freq=48*5), K=3, h=48*5)
+fc <- forecast(fit, xreg=cbind(seas1.f,seas2.f))
+
+ +

The number of Fourier terms (arbitrarily set to 3 for both seasonal periods in the above code) can be selected by minimizing the AIC.

+",2013-11-03 00:08:43.977 +58735,503.0,5,,,,,,CC BY-SA 3.0,"

The formula for the sample mean is

+ +

$$\bar{x} = \frac{\sum_i x_i}{n}$$

+ +

where $x_i$ are the individual values and $n$ is the number of values in the sample.

+ +

Sample versions exist for other types of means as well, but without further qualification, the sample mean refers to the arithmetic mean.

+",2013-11-03 00:16:49.967 +58736,503.0,4,,,,,,CC BY-SA 3.0,The sample mean refers to the arithmetic mean of a sample from a population.,2013-11-03 00:16:49.967 +58739,17900.0,1,,,,Multiple hypothesis testing of same hypothesis,,CC BY-SA 3.0,"

I read an interesting statement: running the same data through multiple tests no longer counts as multiple comparisons.

+ +

I am fairly confident this is incorrect if you are testing different hypotheses e.g. testing for hundreds of different lags on the same data will yield some false significant results (if you have enough data to test for large lags).

+ +

However, if you are testing exactly the same hypothesis, but with two different tests, is this still a multiple comparison?

+ +

My thinking so far is that, firstly, it is rarely if ever true that two tests have exactly the same null hypothesis. E.g. the Shapiro-Wilk test and Jarque–Bera test, while both ostensibly ""normality tests"", are testing slightly different null hypotheses: that the data came from a normal distribution, and that the data came from a distribution with skew and kurtosis equal to that of a normal distribution. Are there any tests with exactly the same null hypothesis?

+ +

If there are tests against exactly the same null hypothesis, would this count as a multiple comparison? And if the tests have exactly the same null hypothesis, is it possible for them to yield different results given the same data?

+",2013-11-03 00:22:49.317 +58740,16992.0,2,,58435.0,,,,CC BY-SA 3.0,"

The difficulty of the problem is highly dependent on how erroneous the uncertain labels can be. If the uncertain labels are right, say, 90% of the time, you can probably get away with just using logistic regression. On the other hand, if the labels are wrong almost half the time, you may need to resort to some special techniques. Here's one stab I took at a very similar problem. (We had multiple observations per label, but otherwise the setup is quite similar.)

+",2013-11-03 00:58:09.247 +58741,23298.0,1,58745.0,,,Kolmogorov-Smirnov test strange output,,CC BY-SA 3.0,"

I am trying to fit my data to one of the continuous PDFs (I suspect it to be gamma- or lognormally distributed). The data consist of about 6000 positive floats. But the results of the Kolmogorov-Smirnov test completely refute my expectations, providing very low p-values.

+ +

Data empirical distribution

+ +

+ +

Distribution fitting +

+ +

Python code:

+ +
import numpy
+import sys
+import json
+import matplotlib.pyplot as plt
+import scipy
+from scipy.stats import *
+
+dist_names = ['gamma', 'lognorm']
+limit = 30
+
+def distro():
+    #input file
+    with open(sys.argv[1]) as f:
+        y = numpy.array(json.load(f))
+
+    #output
+    results = {}
+    size = y.__len__()
+    x = scipy.arange(size)
+    h = plt.hist(y, bins=limit, color='w')
+    for dist_name in dist_names:
+        dist = getattr(scipy.stats, dist_name)
+        param = dist.fit(y)
+        goodness_of_fit = kstest(y, dist_name, param)
+        results[dist_name] = goodness_of_fit
+        pdf_fitted = dist.pdf(x, *param) * size
+        plt.plot(pdf_fitted, label=dist_name)
+        plt.xlim(0, limit-1)
+        plt.legend(loc='upper right')
+    for k, v in results.iteritems():
+        print(k, v)
+    plt.show()
+
+ +

This is the output:

+ +
    +
  • 'lognorm': (0.1111486360863001, 1.1233698406822002e-66) -- the p-value is almost 0
  • 'gamma': (0.30531260123096859, 0.0) -- the p-value is 0
+ +

Does this mean that my data do not fit the gamma distribution? But they seem so similar...

+",2013-11-03 01:00:53.333 +58742,9483.0,2,,10541.0,,,,CC BY-SA 3.0,"

The Statistics Toolbox implements the gap statistic as a class in the package clustering.evaluation since R2013b:

+ +
load fisheriris;
+rng('default');  % For reproducibility
+eva = evalclusters(meas,'kmeans','gap','KList',[1:6])
+figure;
+plot(eva);
+
+ +

+ +

You can also use this file exchange.

+",2013-11-03 01:02:29.910 +58743,594.0,2,,58720.0,,,,CC BY-SA 3.0,"

The easy way is to use the law of total variance:

+ +

$$\text{Var}(S) = E_N\left[\text{Var}(S|N)\right] + \text{Var}_N\left[E(S|N)\right] =\text{E}_N\left[N\cdot \text{Var}(X)\right] + \text{Var}_N\left[N\cdot\text{E}(X)\right]$$

+ +

Can you do it from there? It's pretty much just substitution (well, that and really basic properties of expectation and variance).

+ +

(The first part is even more straightforward using the law of total expectation.)

+ +

--

+ +

As Spy_Lord notes, the answer is $\text{E}(N)\cdot \text{Var}(X) + \text{Var}(N)\cdot\text{E}(X)^2$

+ +
+ +

Alternative approach is to evaluate $E(S_N^2)$. Following the approach you seem to be aiming at:

+ +

\begin{eqnarray} +E(S_N^2) &=& \sum_r E(S_N^2|N=r) p_r\\ + &=& \sum_r (r\sigma_2^2+r^2 \mu_2^2) p_r\\ + &=& \sigma_2^2\sum_r rp_r+\mu_2^2\sum_rr^2 p_r \\ + &=& \sigma_2^2 \text{E}N+\mu_2^2\text{E}(N^2) +\end{eqnarray}

+ +

and I assume you can do it from there.

+ +

However, to be honest, I think this way is easier (it's actually the same approach, you just don't need to sum over all the mutually exclusive events that way). The law of total expectation says $\text{E}(X) = \text{E}_Y[\text{E}_{X|Y}(X|Y)]$, so

+ +

\begin{eqnarray} +\text{E}(S^2_N) &=& \text{E}_N[\text{E}(S^2_N|N)]\\ + &=& \text{E}_N[N\sigma_2^2+N^2\mu_2^2]\\ + &=& \sigma_2^2\text{E}(N)+\mu_2^2\text{E}(N^2) +\end{eqnarray}
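A quick Monte Carlo sanity check of that result in R, using $N \sim \text{Poisson}(3)$ and $X_i \sim \text{Exponential}(2)$ as an arbitrary example, so that $\text{E}(N)=\text{Var}(N)=3$, $\text{E}(X)=1/2$ and $\text{Var}(X)=1/4$:

set.seed(1)
s <- replicate(1e5, sum(rexp(rpois(1, 3), rate = 2)))   # simulate S_N many times
var(s)                                                  # simulated variance of S_N
3 * 1/4 + 3 * (1/2)^2                                   # E(N)Var(X) + Var(N)E(X)^2 = 1.5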

+",2013-11-03 01:03:41.747 +58744,15972.0,2,,58723.0,,,,CC BY-SA 4.0,"

"Nevertheless if the two variables are normally distributed, then uncorrelatedness does imply independence" is a very common fallacy.

+

That only applies if they are jointly normally distributed.

+

The counterexample I have seen most often is normal $X \sim N(0,1)$ and independent Rademacher $Y$ (so that it is 1 or -1 with probability 0.5 each); then $Z=XY$ is also normal (clear from considering its distribution function), $\operatorname{Cov}(X,Z)=0$ (the problem here is to show $\mathbb{E}(XZ)=0$ e.g. by iterating expectation on $Y$, and noting that $XZ$ is $X^2$ or $-X^2$ with probability 0.5 each) and it is clear the variables are dependent (e.g. if I know $X>2$ then either $Z>2$ or $Z<-2$, so information about $X$ gives me information about $Z$).

+
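A quick simulation of this counterexample in R, just to make it concrete:

set.seed(42)
x <- rnorm(1e5)
y <- sample(c(-1, 1), 1e5, replace = TRUE)   # Rademacher, independent of x
z <- x * y                                   # also standard normal
cor(x, z)                                    # close to zero: uncorrelated
cor(abs(x), abs(z))                          # exactly 1: clearly not independent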

It's also worth bearing in mind that marginal distributions do not uniquely determine joint distribution. Take any two real RVs $X$ and $Y$ with marginal CDFs $F_X(x)$ and $G_Y(y)$. Then for any $\alpha<1$ the function:

+

$$H_{X,Y}(x,y)=F_X(x)G_Y(y)\left(1+\alpha\big(1-F_X(x)\big)\big(1-G_Y(y)\big)\right)$$

+

will be a bivariate CDF. (To obtain the marginal $F_X(x)$ from $H_{X,Y}(x,y)$ take the limit as $y$ goes to infinity, where $G_Y(y)=1$. Vice-versa for $Y$.) Clearly by selecting different values of $\alpha$ you can obtain different joint distributions!

+",2013-11-03 01:37:09.837 +58745,5237.0,2,,58741.0,,,,CC BY-SA 4.0,"

Yes. Neither of these distributions is a good fit for your data by that criterion. There are some other distributions you could try, but it strikes me as (ultimately) unlikely that real data come from any of the well-studied distributions, and you have 6,000 data points, so even a trivial discrepancy will make the test 'significant'. (For more along those lines, see: Is normality testing 'essentially useless'?)

+

On the other hand, instead of checking to see if your data significantly diverge from these distributions, you could see how well your data correlate with the distributions you are interested in--the fit may well be 'good enough' for your purposes. (For more along these lines, see my answer here: Testing randomly generated data against its intended distribution.)
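For instance, a rough R sketch of a visual check against a fitted gamma; MASS::fitdistr does the maximum-likelihood fit, and the simulated y below is only a stand-in for your data vector:

library(MASS)
y   <- rgamma(6000, shape = 2, rate = 1)     # stand-in for your data vector
fit <- fitdistr(y, "gamma")                  # maximum-likelihood gamma fit
qqplot(qgamma(ppoints(length(y)), shape = fit$estimate["shape"],
              rate = fit$estimate["rate"]),
       y, xlab = "Fitted gamma quantiles", ylab = "Observed data")
abline(0, 1)                                 # points close to the line suggest an adequate fit in practice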

+",2013-11-03 01:40:09.613 +58746,20473.0,2,,58726.0,,,,CC BY-SA 3.0,"

$$E\Big((Y-E(Y\mid X))E(Y\mid X)\Big) = E\Big(YE(Y\mid X) - E(Y\mid X)^2\Big) $$

+ +

$$E\Big(YE(Y\mid X)\Big) - E\Big(E(Y\mid X)^2\Big) $$

+ +

Now by the law of total (or is it iterated - I always forget) expectation, we have for the first term

+ +

$$E\Big(YE(Y\mid X)\Big) = E\Big[E\Big(YE(Y\mid X)\mid X \Big)\Big] = E\Big[E(Y\mid X)E(Y\mid X ) \Big] = E\Big(E(Y\mid X)^2\Big)$$

+ +

so the whole expression equals zero.

+",2013-11-03 01:43:01.940 +58747,18403.0,2,,56780.0,,,,CC BY-SA 4.0,"

There's a mathematical point of view that is very simple. What you have is a projection problem in a Hilbert space, much like projecting a vector in $\mathbb{R}^n$ onto a subspace.

+ +

Let $(\Omega, \mathcal{F}, \mu)$ denote the underlying probability space. For the problem to make sense, consider the random variables with finite second moments, that is, the Hilbert space $L^2(\Omega, \mathcal{F}, \mu)$. The problem now is this: given $X, Y \in L^2(\Omega, \mathcal{F}, \mu)$, find the projection of $Y$ onto the subspace $L^2(\Omega, \mathcal{F}_X, \mu)$, where $\mathcal{F}_X$ is the $\sigma$-subalgebra of $\mathcal{F}$ generated by $X$. (Just as in the finite dimensional case, minimizing $L^2$-distance to a subspace means finding the projection). The desired projection is $E(Y|X)$, by construction. (This actually characterizes $E(Y|X)$, if one inspects the proof of existence).

+ +

Edit

+ +

Re ""...,by construction.""

+ +

By definition, the conditional mean of $Y$ on $X$ is a random variable $\psi$ +with the following two properties:

+ +
    +
  1. $\psi$ lies in $L^2(\Omega, \mathcal{F}_X, \mu)$.
  2. $E[\psi 1_{A}] = E[Y 1_{A}]$ for all $A \in \mathcal{F}_X$, which implies that $E[\psi g] = E[Y g]$ for all $g \in L^2(\Omega, \mathcal{F}_X, \mu)$, by a standard argument using denseness of simple functions.
+ +

Standard Hilbert space projection arguments show that such a $\psi$ always exists and is unique.

+ +

This applies to any Hilbert space. The above can be re-phrased verbatim for, say, $\mathbb{R}^n$:

+ +
+

Let $Y \in \mathbb{R}^n$ and $V$ be a subspace. Then the projection of + $Y$ onto $V$ is characterized by the same two properties:

+ +
    +
  1. $\psi$ lies in $V$.
  2. $\langle \psi, g \rangle = \langle Y, g \rangle$, for all $g \in V$.
+
+ +

Note: This discussion is restricted to $L^2$ random variables, as the original question implicitly is. The conditional mean in general is defined for $L^1$ random variables, which is a larger space than $L^2$. $L^1$ is a Banach space and the conditional mean is still a projection, in an appropriate sense.

+",2013-11-03 03:17:13.590 +58748,23249.0,1,,,,ANOVA with very uneven sample sizes-- Can fit be improved through reference to a more complete dataset?,,CC BY-SA 3.0,"

I am analyzing a set of costs for contractor services for a particular city. These data have been extremely difficult to collect due the difficulty of contacting contractors and reluctance of contractors to divulge their pricing information. Factors are ""service type"" and ""contractor"".

+ +

We generally have quotes from more than one contractor on each service and a number of contractors perform most of the services. For a few services we have fairly good replication (i.e. ~15 quotes), for some of the services we have some degree of replication (i.e. 5-10 quotes per service). But for about a third of the services replication is very poor, i.e. <5 quotes per service.

+ +

The data have proven extremely challenging and time consuming to collect and I doubt if there will be more.

+ +

The upside to this story is that there are several construction cost estimation tools that can give us estimates of the costs of these services. And while these costs wouldn't be specific to the area we're interested in, we think we can assume that the prices of the services relative to each other would be similar to what we expect to see in the area we're looking at.

+ +

Sometimes elegance is the first casualty of pragmatism, but this is what I'm thinking of doing-- I am planning to collect a parallel set of cost estimates from these cost estimation tools and compare the relationships between the results of our surveys to the relationships of costs within this set of parallel cost data.

+ +

Essentially using the better replicated data points in our dataset as anchors and generating expectations for our poorly replicated data points based on the relationships from the parallel dataset.

+ +

I'm not even sure what this approach would be called and it feels like a very specific type of issue, so it's difficult to research online. But I'm wondering if there's any theoretical basis for this type of approach that's been worked out. Shall I buy a book on Bayesian stats?

+ +

Most of my experience in grad school was analyzing data from designed experiments, so this sort of situation is interesting, but definitely foreign to me.

+ +

Thanks in advance for your help.

+",2013-11-03 03:18:54.287 +58749,22923.0,2,,58602.0,,,,CC BY-SA 3.0,"

If your dynamic system is +$$ x_t = A_t x_{t-1} + \eta_t $$ +$$ y_t = B_t x_t + \varepsilon_t $$ +Then when people say system matrices $A_t, B_t$ should be deterministic, this means that Kalman Filter gives you an estimate of state $x_t$ conditional on past and current values of parameters $$\mathbf E\left(x_t|\,y_t,\dots,y_1, \,A_t,\dots,A_1, \,B_t, \dots, B_1\right).$$
+So when you do a filtering step to estimate this conditional expectation of state, you consider those matrices to be already known (observed) rather than unknown and random. Of course they can be realizations of some external random process (which is often the case) or be deterministic functions of time - this doesn't matter much.

+ +

What the authors of the above paper seem to describe in 3.2 is an extension of the KF where they assume $A_t, B_t$ to be random but do not want to condition on their values when filtering. So they don't assume the matrices to be known at the moment of filtering, but rather assume that they come from a distribution with known mean/variance.

+",2013-11-03 03:20:40.113 +58750,9049.0,2,,58288.0,,,,CC BY-SA 3.0,"

Using your example for your first question: yes, parent's education level is a fixed effect, as you set its levels to whatever values you want to use. There is no ""randomness"" regarding the levels; you fix them and you fully observe them.

+ +

Regarding your second question: The Nunn & Wantchekon article is a good clarification for what you want. I am no expert on the matter but I think your intuition is correct. Ultimately it is an OLS; the way they test for significance is somewhat specialized but otherwise nothing crazy. So... ""Yes"" is also your second question's answer. N. & W. strive for a somewhat rigorous falsification of their findings, checking their ""unknown unknowns"", but yes, otherwise nothing too exotic. In general, if there is an observable fixed effect you can include it in the OLS to start with. If you suspect there are some unobservable ones you try to control for them by adding dummies; surrogate variables if you like.

+ +

As mentioned, I am not an expert on panel data and I have never worked in economics, so take it with a grain of salt. I believe your statistical intuitions are not wrong, though.

+",2013-11-03 05:20:06.133 +58751,23292.0,2,,13631.0,,,,CC BY-SA 3.0,"

The hidden Markov model is the sequential version of naive Bayes. In naive Bayes, you have a label with several possible values (in your case 0/1) and a set of features. The value for the label is selected by modeling p(features | label) * p(label).

+ +

In a hidden Markov model, a sequence of labels is predicted by modeling p(label | previous label) and p(features | label).

+",2013-11-03 06:07:01.217 +58772,21823.0,2,,58435.0,,,,CC BY-SA 3.0,"

I had a brief run in with image recognition and classification.

+ +

Random Forests is an easy to use technique. I've implemented it on R, it should be available on Weka as well. Ease of use trumps prediction accuracy though. If you have a large enough training set, it can classify multiple labels.

+ +

It worked to recognize handwritten digits quite well, but if your images are more complex, then only a trial would tell you if it does well.

+",2013-11-03 17:32:41.363 +58752,14253.0,1,,,,Comparing means with unequal variance and very different sample sizes,,CC BY-SA 3.0,"

I am trying to compare the means of the same variable between men and women. This is the statistics:

+ +
     N        Mean        Variance    Coef. Var.     Gender    
+   2000      26.12         10.89         0.13         Male        
+     50      56.10         25.01         0.09        Female
+
+ +

The variable is not normally distributed in either group, but taking the log makes it pretty darn close. What is the appropriate way to compare the means between males and females? Should I use the log or not? Any additional advice using Stata would be helpful.

+ +

My initial reaction is that females fare better than men, but I want to be statistically rigorous.

+",2013-11-03 06:22:09.433 +58753,21029.0,2,,58752.0,,,,CC BY-SA 3.0,"

The traditional test for comparing two sample means is the t-test. There are no assumptions about the sizes of the samples, so it is OK if they are different.

+ +

However, you touch upon the normality assumption. Even if the population is not normally distributed, the Central Limit Theorem lets us treat the sample means as approximately normal as the sample sizes increase. This means your test will be approximate, but the sample size for females is a little low.

+ +

Finally, the result of the t-test will be different for the original and logged data. Do you have a specific reason based on your data to use the logarithm? Perhaps there is another assumption you would like to test about the behavior of the log of your data? Do not take the log simply to create a normal curve if there is no deeper meaning, but for fun compare the difference between the two results anyway!
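For example, a minimal R sketch of both comparisons on stand-in data; t.test defaults to Welch's unequal-variance version, and in Stata the analogue is ttest with the unequal option:

set.seed(1)
d <- data.frame(wage   = c(rlnorm(2000, 3.2, 0.15), rlnorm(50, 4.0, 0.1)),
                gender = rep(c("Male", "Female"), c(2000, 50)))   # stand-in data
t.test(wage ~ gender, data = d)        # Welch's t-test on the raw scale
t.test(log(wage) ~ gender, data = d)   # Welch's t-test on the log scale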

+",2013-11-03 06:50:17.713 +58754,10372.0,1,,,,Naive SE vs Time Series SE: which statistics should I report after Bayesian estimation?,,CC BY-SA 3.0,"

I am new to Bayesian estimation.

+ +

When I do some estimations with JAGS, I find there are statistics called Naive SE and Time Series SE.

+ +

What exactly do they mean? Is it necessary that I report one or both of them as part of the estimation result?

+",2013-11-03 07:09:54.383 +58755,21840.0,1,,,,Show $Y$ converges to $a$,,CC BY-SA 3.0,"

Given: +$f_{Y_{(1)}}(y) = nbe^{-nb(y-a)}$, where $b> 0$ and $y \geq a$.

+ +

Show that as $n \rightarrow\infty$, $Y_{(1)}$ converges to $a$ in probability.

+ +

I have calculated $E[Y_{(1)}] = \frac{1}{nb} + a$

+ +

Which theorem should I apply to show convergence? I was trying to use Chebyshev's inequality.

+ +

[EDIT]

+ +

If I want to find to what $Y_{(1)}$ converges in distribution, is this the right way to do it:

+ +

$F_{Y_{(1)}} = 1-e^{-nb(y-a)}$

+ +

As $n \rightarrow \infty $

+ +

$F_{Y_{(1)}} = 1, y < a$

+ +

$F_{Y_{(1)}} = 0, y \geq a$

+ +

$P(|Y_{(1)}|< y) = P(|Y_{(1)}|< \epsilon)$ [replacing $y$ with $\epsilon$] = $1-e^{-nb(\epsilon-a)}$

+ +

As $n \rightarrow \infty $

+ +

$P(|Y_{(1)}|< y) \rightarrow 1$; So $ Y_{(1)} \rightarrow Y$ in distribution. So the limiting distribution is degenerate.

+ +

Please let me know if this approach is correct.

+",2013-11-03 07:43:41.483 +58756,21840.0,1,,,,Convergence in distribution,,CC BY-SA 3.0,"

For a statistic $T_n = \frac{1}{n} \sum_{i=1}^nY_i - \frac{1}{a}$, prove directly (without the CLT) that a scaled and appropriately shifted version of $T_n$ converges in distribution to $N(0,1)$.

+ +

[EDIT]

+ +

$f(y|a,b)=ae^{-a(y-b)}$ for $ y\geq b$

+ +

How should I approach the problem?

+ +

[EDIT]

+ +

I thought that if I could find the expected value and variance of $T_n$ and then represent it in $N(\mu,\sigma^2)$

+ +

for $f(y)$, $E[Y] = b+1/a$, $Var [Y] = 1/a^2$

+ +

For $Tn$,

+ +

$Var[T_n] = Var[\frac{1}{n} \sum_{i=1}^nY_i - \frac{1}{a}] = \frac{1}{na^2}$

+ +

$E[T_n] = E [\frac{1}{n} \sum_{i=1}^nY_i - \frac{1}{a}] = b$

+ +

[EDIT]

+ +

$M_X(t)=\frac{ae^{bt}}{(a-t)}$

+ +

How should I go from here?

+",2013-11-03 07:52:04.963 +58757,594.0,2,,58752.0,,,,CC BY-SA 3.0,"

Taking logs and testing the mean on the log scale would normally not correspond to a difference in means on the original scale.

+ +

However:

+ +

[Edit: my comments apply to an earlier version of the data, and don't apply to the data that are presently in the question. As such, my comments really apply to the situation where the coefficient of variation in two close-to-lognormal samples are very similar, rather than to the case now at hand.]

+ +

The coefficient of variation is almost identical in the two samples, which does suggest that you might consider these as having a scale shift; if you think the logs look reasonably close to normal, then that would suggest lognormal distributions with common coefficient of variation. In that case a difference of means on the log-scale would actually indicate a scale-shift on the original scale (and hence that one of the means is a multiple of the other mean on the original scale).

+ +

That is, under an assumption of equal variance and normal distribution on the log-scale, a rejection of equality of means implies that the means on the original scale have a ratio that differs from 1.

+ +

It seems like that would be a reasonable assumption.

+ +

There are other things you could do, though.

+",2013-11-03 09:20:02.827 +58759,503.0,4,,,,,,CC BY-SA 4.0,"A matrix is singular when its determinant is 0; for such matrices, the inverse is not defined. Also, related topics like singular fits",2013-11-03 11:27:30.657 +58758,503.0,5,,,,,,CC BY-SA 3.0,,2013-11-03 11:27:30.657 +58761,503.0,5,,,,,,CC BY-SA 3.0,"

A kernel in the context of kernel smoothing is a local similarity function $K$, which must integrate to 1 and is typically symmetric and nonnegative. Kernel smoothing uses these functions to interpolate observed data points into a smooth function.

+ +

For example, Watson-Nadaraya kernel regression estimates a function $f : \mathcal X \to \mathbb R$ based on observations $\{ (x_i, y_i) \}_{i=1}^n$ by +$$ +\hat{f}(x) = \frac{\sum_{i=1}^n K(x, x_i) \, y_i}{\sum_{i=1}^n K(x, x_i)} +,$$ +i.e. a mean of the observed data points weighted by their similarity to the test point.

+ +

Kernel density estimation estimates a density function $\hat{p}$ from samples $\{ x_i \}_{i=1}^n$ by +$$ +\hat{p}(x) = \frac{1}{n} \sum_{i=1}^n K(x, x_i) +,$$ +essentially placing density ""bumps"" at each observed data point.

+ +

The choice of kernel function is of theoretical importance but typically does not matter much in practice for estimation quality. (Wikipedia has a table of the most common choices.) +Rather, the important practical problem for kernel smoothing methods is that of bandwidth selection: choosing the scale of the kernel function. Undersmoothing or oversmoothing can result in extremely poor estimates, and so care must be taken to choose an appropriate bandwidth, often via cross-validation.

+ +
+ +

Note that the word ""kernel"" is also used to refer to the kernel of a reproducing kernel Hilbert space, as in the ""kernel trick"" common in support vector machines and other kernel methods. See [kernel-trick] for this usage.

+",2013-11-03 11:37:38.377 +58760,503.0,4,,,,,,CC BY-SA 3.0,"Kernel smoothing techniques, such as kernel density estimation (KDE) and Nadaraya-Watson kernel regression, estimate functions by local interpolation from data points. Not to be confused with [kernel-trick], for the kernels used e.g. in SVMs.",2013-11-03 11:37:38.377 +58762,23302.0,1,,,,How can I test the difference of two Weibull distributions?,,CC BY-SA 3.0,"

I have Weibull distributions fitted to two wind datasets, and I want to check whether they are the same.

+ +

I thought a two-sample t-test might be applicable, but I couldn't find a way to do that on the Internet.

+ +

Does anyone know what type of test is applicable to my purpose? and what R function can you recommend?

+ +

Plus, if it turned out that there is a difference between the two datasets, can I just fit a linear line between the datasets?

+",2013-11-03 12:14:55.333 +58763,23303.0,2,,53261.0,,,,CC BY-SA 3.0,"

This is eta-squared, and it is a fairly poor measure of effect size (partial eta-squared is often reported in statistical software such as SPSS when calculating an ANOVA).

+",2013-11-03 13:02:01.453 +58764,15870.0,1,,,,The mode of multivariate Gamma distribution,,CC BY-SA 3.0,"

Let $X$, $Y$, $Z$ be i.i.d. Gamma random variables. What could the mode of the vector $(X, X+Y, X+Y+Z)$ be?

+ +

Does the mode of a random vector equal the combination of the marginal modes?

+",2013-11-03 13:51:08.163 +58765,12683.0,2,,58619.0,,,,CC BY-SA 3.0,"

This is a central composite design so I assume you're fitting a full second-order model for the mean response $\mathrm{E}(Y)$ on continuous predictors $x_1$, $x_2$, & $x_3$

+ +

$$\mathrm{E}(Y)= \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_3 + +\beta_{12} x_1 x_2 + \beta_{13} x_1 x_3 + \beta_{23} x_2 x_3 + +\beta_{11} x_1^2 +\beta_{22} x_2^2 +\beta_{33} x_3^2$$

+ +

& estimating the coefficients $\beta$ by ordinary least squares.
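For reference, a sketch of how such a fit can be obtained in R with the rsm package; the data frame dat with coded predictors x1, x2, x3 and response y is a placeholder for the actual design data:

library(rsm)
# `dat` is assumed to be a data frame with the coded factors x1, x2, x3 and the response y
fit <- rsm(y ~ SO(x1, x2, x3), data = dat)   # SO() = full second-order model
summary(fit)   # coefficient table, lack-of-fit test, stationary point, canonical analysis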

+ +

I fitted the model: the estimated mean response has a stationary point, a maximum, of $\mathrm{E}(Y)=102$ at $x_1=0.62, x_2=-0.11, x_3=0.13$. (You can check for yourself by differentiating the equation for the response with respect to each predictor, setting each derivative to zero (all slopes are zero at a stationary point), & solving the resulting simultaneous equations.) Contour plots at slices through the stationary point are a good way of visualizing the fitted model: +

+ +

The $95\%$ confidence interval for $\mathrm{E}(Y)$ at the maximum is $(89,114)$, & the $95\%$ prediction interval for $Y$ at the maximum is $(68,136)$. These are rather wide compared to the range of the response across the whole design. Indeed the residual standard deviation is $14$ (just look at the spread of responses over your centre points). I don't know the context of your experiment, but in many situations this would be cause for concern—are there other factors significantly contributing to process variability that you haven't taken into account?

+",2013-11-03 14:14:02.213 +58766,9175.0,2,,58755.0,,,,CC BY-SA 3.0,"

You have $E[Y_{(1)}] = \frac{1}{nb} + a$.

+ +

$\lim_{n\to\infty} E[Y_{(1)}] = \lim_{n\to\infty}\left(\frac{1}{nb} + a\right) = a$. Since $\operatorname{Var}[Y_{(1)}] = \frac{1}{(nb)^2} \to 0$ as well, Chebyshev's inequality then gives $Y_{(1)} \to a$ in probability.

+",2013-11-03 14:41:17.737 +58767,21346.0,1,58788.0,,,Derivative of the transformed explanatory variable,,CC BY-SA 3.0,"

I have an explanatory variable that is transformed as suggested in footnote 25 of the article as follows (the explanatory variable is continuous and can take negative, zero or positive values) +\begin{equation} +y=\text{sign}(x)\,\log{(|x|+1)} +\end{equation} +where $\text{sign}(x)$ takes a value of $1$ if $x>0$, $0$ if $x=0$, and $-1$ if $x<0$. Let's suppose my dependent variable is $y$ (not transformed). Now I need to take the derivative of $y$ with respect to $x$ to find out the marginal effect and elasticity.

+ +

I did this as follows: +\begin{equation} +\frac{\partial y}{\partial x}=\text{sign}(x)\,\frac {1}{(|x|+1)}\,\frac{\partial|x|}{\partial x} +\end{equation}

+ +

As far as I understand, $\frac{\partial|x|}{\partial x} =1$ if $x>0$ and $\frac{\partial|x|}{\partial x} =-1$ if $x<0$, and it is not defined for $x=0$. My question is how to compute the marginal effect for observations with the value $x=0$ under this transformation.

+",2013-11-03 15:01:42.123 +58768,20304.0,1,58773.0,,,k-fold cross-validation for large data sets,,CC BY-SA 3.0,"

I am performing 5-fold cross-validation on a relatively large data set, and I have noticed that the validation errors for each of the 5 training sets are very similar. So I guess, in this case, cross-validation is not very useful (it would be about the same as just using one training and test set). So I was wondering if I am working with a special case, or if this is the case for all large data sets. I'm thinking that perhaps if you have enough training examples, the average cross-validation score would not be very different from the score for one training and test set. Is this intuition correct?

+",2013-11-03 16:36:30.537 +58769,22415.0,1,58795.0,,,Expected value of a series of random variables in a markov chain,,CC BY-SA 3.0,"

I have a Markov chain such that $X_n = \max(X_{n-1}+\xi_n, 0)$, where the $\xi_n$ are independent and identically distributed. I want to show that if $\mathbb E(\xi_n) > 0$ (where $\mathbb E(\xi_n)$ is the expected value of $\xi_n$) then $\frac{X_n}{n}$ tends to $\mathbb E(\xi_n)$ as $n$ approaches infinity for any choice of $X_0$.

+ +

And... I have no idea how to show this. I know that repeated application of the above will yield $X_n = max(X_0+\sum\limits_{i=1}^n\xi_i, \sum\limits_{i=2}^n\xi_i, \sum\limits_{i=3}^n\xi_i, ..., 0)$, but I'm stuck here.

+",2013-11-03 16:57:39.573 +58770,23307.0,1,59372.0,,,Probability of drawing a given word from a bag of letters in Scrabble,,CC BY-SA 3.0,"

Suppose you had a bag with $n$ tiles, each with a letter on it. There are $n_A$ tiles with letter 'A', $n_B$ with 'B', and so on, and $n_*$ 'wildcard' tiles (we have $n = n_A + n_B + \ldots + n_Z + n_*$). Suppose you had a dictionary with a finite number of words.

+ +

You pick $k$ tiles from the bag without replacement.

+ +

How would you compute (or estimate) the probability that you can form a given word of length $l$ (with $1 < l \leq k$) from the dictionary, given the $k$ tiles selected?

+ +

For those not familiar with Scrabble (TM), the wildcard character can be used to match any letter. Thus the word 'BOOT' could be 'spelled' with the tiles 'B', '*', 'O', 'T'. The order in which the letters are drawn does not matter.

+ +

Suggestion: in order to simplify the writing of answers, it might be better to just answer the question: what is the probability of having the word 'BOOT' among your possible moves after drawing 7 letters from a fresh bag.

+ +

(the problem's introduction has been copied from this similar question)

+",2013-11-03 17:15:02.737 +58771,23308.0,1,,,,Statistical significance in canonical correlation analysis,,CC BY-SA 3.0,"

I do canonical correlation analysis between two multivariate datasets $X$ and $Y$. For each pair of canonical variates (x-y pair) I get the canonical correlation coefficient. How can I test its statistical significance?

+",2013-11-03 17:24:26.470 +58773,,2,,58768.0,user88,,,CC BY-SA 3.0,"

It certainly adds value over a single test, because you get a stronger justification that your estimated accuracy is correct.

+ +

A large dataset certainly helps in making robust, accurate models, though it won't bias the cross-validation on its own. The only possible problem you should check for is whether the set contains a significant fraction of duplicated objects -- this may happen if the number of attributes is very small in comparison.

+",2013-11-03 18:09:20.110 +58774,21523.0,2,,54915.0,,,,CC BY-SA 3.0,"

After reading Pettitt's paper carefully, I understand the method and have checked my code.

+ +

Pettitt starts with a general statistic expressed as:

+ +

$$U_{t,T} = \sum_{i=1}^{t} \sum_{j=t+1}^{T} D_{ij}$$

+ +

where $D_{ij} = sign(X_i - X_j)$

+ +

Pettitt then expresses a test for discrete Bernoulli and binomial data, and two variants for continuous data.

+ +

For the discrete test, he uses $U_{t,T} = T \cdot (S_t - \frac{t\,S_T}{T})$, where $S_t = \sum_{j=1}^{t} X_j$, $S_T = \sum_{j=1}^{T} X_j$, and $X_j$ is the evaluated function expressed as a Bernoulli series whose values are 0 or 1.

+ +

For the continuous test, he uses:

+ +
  1. $U_{t,T} = 2 \cdot W_t - t(T+1)$, where $W_t=\sum_{j=1}^{t} R_j$ and $R_j$ are the ranks of the data. (There is a variation when ties exist.)
  2. $U_{t,T} = U_{t-1,T} + V_{t,T}$, with $t=2,\dots,T$ and $V_{t,T} = \sum_{j=1}^{T} \text{sign}(X_t - X_j)$.
+ +

My original code implements variant 1 of the continuous test, and the results are correct, except for the critical values reported for Pettitt's example series.

+ +

I calculate these critical values by solving for $K_T$ from +$P_{OA} = 2e^{(\frac{-6{K_{T}}^2}{T^3+T^2})}$, but although $P_{OA}$ is computed correctly, the critical value is not. Why? Actually, I don't know, and I cannot access other papers that show how to calculate the table given in, for example, the Sahin et al. paper (link in previous comments).

+ +

So I changed my function code to include all the variants and the $P_{OA}$ calculation. Here it is:

+ +
<!-- language: lang-R -->
+pettitt<-function(x,alpha=0.01,method=c(""discrete"",""continuous""),
+                  alternative=c(""rank"",""variation"")) {
+   # Pettitt AN. 1979 A non-parametric approach to the change point detection.
+   # (Section 2.3)
+   #
+   # x is a numeric vector
+   # alpha is the significance level
+   x<-na.omit(x)
+   orden<-rank(x)
+   T<-length(x)
+   method<-match.arg(method)
+   alternative<-match.arg(alternative)
+   #
+   U.t.T<-c()
+   V.i.T<-c()
+   P.o.a<-NULL
+   P.o.a.p<-NULL
+   P.o.a.n<-NULL
+   k.T<-NULL
+   k.T.p<-NULL
+   k.T.n<-NULL
+   #
+   if (!is.numeric(x))
+      stop(""'x' must be a numeric vector"")
+   #
+   # Discrete values
+   if (method == ""discrete"") {
+      x.j<-sign(x)
+      x.j[which(x.j==-1)]<-0
+      S.T<-sum(x.j)
+      for (i in 1:T) {
+        S.t<-sum(x.j[1:i])
+        U.t.T<-c(U.t.T,
+           T*(S.t-(i*S.T/T)))
+      }
+      k.T<-max(abs(U.t.T))
+      k.T.p<-max(U.t.T)
+      k.T.n<-min(U.t.T)
+      P.o.a<-exp((-2*k.T.p^2)/(S.T*(T^2-T*S.T)))
+      critical<-sqrt((log(alpha)*(S.T*(T^2-T*S.T)))/-2)
+   }
+   #
+   # Continuous case.
+   if (method == ""continuos"" & alternative == ""rank"") {
+      TIES<-length(unique(x)) < T
+      if (!TIES) { 
+        for (i in 1:T) {
+           U.t.T<-c(U.t.T,
+              2*(colSums(as.matrix(orden[1:i])))
+                 -(i*(T+1))
+           )
+         }
+      } else {
+        frequency<-as.vector(table(x))
+        total.frequency<-sum(frequency)
+        for (i in 1:length(frequency)) {
+           U.t.T<-c(U.t.T,
+              1-(total.frequency*(frequency[i]^2-1))/(T*(T^2-1))
+           )
+        }
+      }
+      k.T<-max(abs(U.t.T))
+      P.o.a<-2*exp((-6*k.T^2)/(T^3+T^2))
+      critical<-sqrt((log(alpha/2)*(T^3+T^2))/-6)
+   }
+   if (method == ""continuos"" & alternativa == ""variation"") {
+      V.i.T<-matrix(rep(NA,T^2),ncol=T)
+      for (i in 1:T) {
+        for (j in 1:T) {
+           V.i.T[j,i]<-sign(x[i]-x[j])
+        }
+        if (i==1) {
+           U.t.T<-sum(V.i.T[,i],na.rm=T)
+        } else {
+           U.t.T<-c(U.t.T,
+              U.t.T[(i-1)]+sum(V.i.T[,i],na.rm=T)
+           )
+        }
+      }
+      V.i.T<-colSums(V.i.T,na.rm=T)
+      k.T.p<-max(U.t.T)
+      k.T.n<-min(U.t.T)
+      k.T<-max(abs(U.t.T))
+      P.o.a.p<-exp((-6*k.T.p^2)/(T^3+T^2))
+      P.o.a.n<-exp((-6*k.T.n^2)/(T^3+T^2))
+      P.o.a<-2*exp((-6*k.T^2)/(T^3+T^2))
+      critical<-sqrt((log(alpha/2)*(T^3+T^2))/-6)
+   }
+   output<-list(U.t.T,V.i.T,P.o.a,P.o.a.p,P.o.a.n,k.T,k.T.p,k.T.n,critical)
+   return(output)
+}
+
+ +

If anyone knows how to calculate the critical values, suggestions are welcome.

+ +

So many thanks.

+",2013-11-03 18:31:04.590 +58775,23309.0,1,58776.0,,,Beyond the normal distribution: what if a particular distribution can't be assumed,,CC BY-SA 3.0,"

In a random sample of 150 community college students, the mean number of hours spent studying per week is 11.7 hours and the standard deviation is 4 hours.

+ +

Without assuming anything about the distribution of the number of hours community college students study per week, at least what percentage (approximately) of the students study between 5.3 and 18.1 hours per week?

+",2013-11-03 18:56:00.920 +58776,15827.0,2,,58775.0,,,,CC BY-SA 3.0,"

For your sample data, you can say directly what percentage lies between 5.3 and 18.1 hours/week.

+ +

You may be reaching for Chebyshev's inequality. If so, see e.g. http://en.wikipedia.org/wiki/Chebyshev's_inequality and don't return here. The ""at least"" suggests a problem of this flavour.

+ +

Otherwise, you can't say that without making some assumptions. If you do assume a normal distribution, you can easily calculate the probability that values lie between 5.3 and 18.1 hours/week, which is 1.6 SD either side of the mean.

+ +

Your question sounds like self-study, so I will stop there.

+",2013-11-03 19:14:08.280 +58777,9456.0,1,,,,Confusion related to intractability in topic models,,CC BY-SA 3.0,"

+ +

I was reading this paper related to topic models. I am a bit confused about why the marginal likelihood is not tractable and how converting the graphical model into the new one actually helps. First, I don't understand how the coupling between $\beta$ and $\theta$ results in making it intractable. The paper points to a reference, but I cannot access the article (Dickey 1983), so I cannot understand why it is intractable. Can anyone please provide me some information?

+",2013-11-03 19:28:19.663 +58778,22942.0,1,,,,Error implemented a specific factorial design in Minitab (default generators),,CC BY-SA 3.0,"

I'm intending to implement the following factorial design

+ +

I wish to obtain this alias structure - with a 2^(5-2) factorial design.

+ +
I+ACE+BDE+ABCD
+A+CE+ABDE+BCD
+B+ABCE+DE+ACD
+C+AE+BCDE+ABD
+D+ACDE+BE+ABC
+E+AC+BD+ABCDE
+AB+BCE+ADE+CD
+AD+CDE+ABE+BC
+
+ +

I've obtained this (the alias structure) by hand, using the design generators I=ACE and I=BDE. I've checked my alias structure and confirmed it is the correct one.

+ +

However, using Stat->DOE->Create factorial design-> 2 level design (specify generators)

+ +

and using 5 factors

+ +

And subsequently using

+ +

Default generators: ACE, BDE

+ +

Gives the error ""Block are aliased with the mean"".

+ +

Why doesn't this work?

+",2013-11-03 19:43:29.497 +58779,15120.0,1,,,,Issue in graph construction,,CC BY-SA 3.0,"

I have a symbolic representation of a time series obtained from the SAX toolbox. I was wondering if it is possible to construct a graph where each node represents a unique symbol and the edges represent transitions, provided that there is no transition from a node to itself. For example, let the time series T of n=20 data points be represented as + T=[1 1 2 1 2 1 3 1 1 1 2 2 3 3 3 1 1 2 3 3 1]', where the alphabet size used for symbolization is 3 and the symbols are (1,2,3). I have combined co-occurring symbols, so the compressed time series becomes

+ +
T' = [1 2 1 2 1 3 1 2 3 1 2 3 1]'
+
+ +

To construct a graph (especially a fuzzy cognitive map) with fuzzy membership values from T', the nodes will be (1,2,3), there will be an edge from $Node_i$ to $Node_j$, and it will have a weight $W_{ji}$. How do I find the weights? I do not know which theory to search for this kind of problem, so any ideas as to what could form the weights are welcome. Thank you.

+",2013-11-03 20:04:07.333 +58780,2015.0,1,58844.0,,,Community finding algorithms in large heterogeneous networks,,CC BY-SA 3.0,"

Consider a network that consists of vertices with various meanings. For example: stack overflow users, keywords and user location when asking/answering a question. In this network, when a user asks a question its ID node is linked to several keyword vertices and to a vertex representing users' geographical location.

+ +

I would like to ask a general and vague question: are there any meaningful communities in this network? One approach would be to link (let's say) users using other vertices and then analyze the resulting graph of users. However, are there approaches that retain the heterogeneous types of information in the graph? Are there metric measurements that take into consideration the various types of nodes?

+",2013-11-03 20:11:09.887 +58781,211.0,1,58783.0,,,"How is this sampling called? (getting a ""representative"" sample set for teaching)",,CC BY-SA 4.0,"

I know of random sampling and stratified sampling, but I am not sure what the name is of the type of sampling I need:

+ +

I wish to work out an example of simple correlation/regression with my students. I found the following nice dataset: ""When do Babies Start to Crawl"", but it has too many observations (12) for that purpose; what I want is something even smaller that we can do calculations on (let's say 6). I would like to sample these points, but I would rather do it in such a way that the result of the analysis remains as similar as possible to what I would get from the larger dataset.

+ +

So of course the SE would change, but I would like the correlation to be as close as possible, and if possible to have the range stay similar (I'm ignoring the outlier issue).

+ +

Does this type of situation have a known name which I can use when searching for good solutions?

+",2013-11-03 22:20:23.053 +58797,23322.0,2,,58338.0,,,,CC BY-SA 3.0,"

Look at the ADF Unit Root Test section.

+ +

If your data is a random walk with drift, then it will be under the type 'Single Mean'.

+ +

For the ADF test, +H0: Non-stationary +Ha: Stationary

+ +

If the p-value < 0.05, you reject the null hypothesis (H0) and conclude that the data series is stationary. It should be, as you have already differenced the data once.

+ +

Under 'Pr < Rho', which gives the p-value for Rho (the autocorrelation), the values are 0.0129 and <0.0001; thus we reject the null hypothesis and conclude that the data are stationary.

+",2013-11-04 03:44:35.610 +58782,22507.0,2,,55361.0,,,,CC BY-SA 3.0,"

I am not going to analyze the code, but below is the solution.

+ +

Let

+ +
  • P(loc60) be the probability that a random locomotive has number 60
  • P(N) be the prior probability that there are exactly N locomotives
  • P(loc60|N) be the probability that a random locomotive has number 60, if the total number of locomotives is N
  • P(N|loc60) be the probability that there are exactly N locomotives, if a random locomotive has number 60
+ +

Then

+ +

$$ P(N|\text{loc60}) = {P(\text{loc60}|N) P(N) \over P(\text{loc60})} = {P(\text{loc60}|N) P(N) \over \sum_M P(\text{loc60}|M)\,P(M)}$$

+ +

But $$ P(\text{loc60}|N) = \cases {1/N & if $N\ge 60$ \\ 0 & otherwise } $$

+ +

From now on, we assume that $N \ge 60$.

+ +

$$ P(N|\text{loc60}) = {P(N)/N \over \sum_{M=60}^\infty P(M)/M} $$

+ +

Now we should select P(N), otherwise we are stuck. Since we don't know even the order of magnitude of N, it is reasonable to assume that $\log N$ is uniformly distributed between 0 and some $\log N_\max$ (i.e. the probability that $10^2\le N<10^3$ is the same as the probability that $10^3\le N<10^4$). Guesstimating $N_\max$ is a tricky task, but from my prior knowledge about railroads and locomotives, I can assume that $N_\max \gg 60$.

+ +

The uniform distribution of $\log N$ means that $$P(N) = c(\log (N+1)-\log N) \approx c/N,$$ where $c$ is a constant independent of $N$.

+ +

Substituting this to the previous formula, we have: +$$ P(N|\text{loc60}) \approx {c/N^2 \over \sum_{M=60}^{N_\max} c/M^2} $$

+ +

But $$\sum_{M=60}^{N_\max} c/M^2 \approx \int_{60}^{N_\max} {c\over M^2}dM = {c \over 60} - {c \over N_\max} \approx {c\over60} $$

+ +

Now we have

+ +

$$ P(N|\text{loc60}) \approx {60/N^2} $$

+ +

What is the median value of N? Let it be $N_\text{med}$ , then

+ +

$$ \int_{60}^{N_\text{med}} {60 \over N^2} dN = 1/2 $$

+ +

$$ 1 - {60 \over N_\text{med}} = 1/2 $$

+ +

$$ N_\text{med} = 120 $$

+ +

If what we need is mathematical expectation rather than median, then

+ +

$$ E(N) = \int_{60}^{N_\max} {60\over N^2} N dN = 60 \log {N_\max \over 60} $$

+ +

From what I know about railroads, $N_\max$ should be between $10^3$ and $10^6$, so E(N) is somewhere between 170 and 600.
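
+ +

For readers who want a quick numerical check of this argument, here is a minimal R sketch (my own addition, not part of the original reasoning). It assumes the truncated 1/N prior above with an arbitrary cutoff N.max of 10^4, which sits inside the range I guessed:

+ +

<!-- language: lang-R -->
+# Posterior over N under the assumed prior P(N) proportional to 1/N, truncated at N.max
+N.max <- 1e4
+N <- 60:N.max
+post <- (1 / N) * (1 / N)           # prior 1/N times likelihood P(loc60 | N) = 1/N
+post <- post / sum(post)            # normalise
+N[which(cumsum(post) >= 0.5)[1]]    # posterior median, close to 120
+sum(N * post)                       # posterior mean, close to 60 * log(N.max / 60)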

+",2013-11-03 23:06:15.900 +58783,668.0,2,,58781.0,,,,CC BY-SA 3.0,"

I don't know whether it has a name, but I have used similar techniques to create synthetic datasets to answer questions on this site: they frame the problem as an optimization and then carry it out.

+ +

The methods to use for optimization depend on the problem. In this case you can explore the entire collection of possible six-element samples of the dataset, because $\binom{12}{6} = 924$ is small. In general you cannot perform an exhaustive search and have to be content with some form of randomized search, guided perhaps by methods of simulated annealing or genetic algorithms. But for your purposes you don't need a truly optimal solution, so a blind search ought to work fine.

+ +
+ +

To illustrate, I saved the ""crawling"" dataset in a file and applied the R script below. Its output lists the case numbers of the optimal subset and compares the statistics you would like to reproduce. (I assumed these were the coefficients of the regression of mean crawling age against temperature, but almost any small set of statistics will work provided they are not sample-size dependent, as recognized in the question itself.)

+ +
Sample: 2 3 4 5 9 12
+         (Intercept) temperature
+Original    35.70254 -0.07560731
+Sample      35.70062 -0.07548532
+
+ +

In this plot of the data, the optimal subset is shown in red and the two fits in corresponding colors; they are indistinguishable.

+ +

+ +

To compare the exhaustive search to the blind random search, I set the random number seed to 17, the number of combinations to search to 924, and forced the code to perform the randomized search (thereby going to exactly the same computational effort, but with no guarantee of optimality). The output this time was

+ +
Sample: 3 5 8 10 11 12
+         (Intercept) temperature
+Original    35.70254 -0.07560731
+Sample      35.70047 -0.075770
+
+ +

It is a different sample, but the results are almost as good as before.

+ +
+ +
f <- read.csv(""f:/research/R/crawling.txt"", sep=""\t"")
+#
+# Function to return the statistics to match in a sample.
+#
+get.coef <- function(g) {
+  coef(lm(avg_crawling_age ~ temperature, weights=n, data=g))
+}
+#
+# Compute these statistics for all possible samples of a specified size.
+#
+n.max <- 10^4 # Limits the execution time.
+sample.size <- 6
+system.time( {
+  if (choose(nrow(f), sample.size) > n.max) {
+    print(""Using randomized search."", quote=FALSE)
+    samples <- replicate(n.max, sample.int(nrow(f), sample.size))
+  } else {
+    samples <- combn(1:nrow(f), sample.size)
+  }
+  x <- apply(samples, 2, function(i) get.coef(f[i, ]))
+})
+#
+# Compare these statistics, using their variation across all possible 
+# samples to establish relative scales, to their values for the data
+# and select the closest.  (One might retain the several best, rather
+# than just the one best, and choose among them using additional qualitative
+# criteria.)
+#
+delta <- apply((x - get.coef(f)) / apply(x, 1, sd), 2, function(y) sum(y*y))
+sample.cases <- sort(samples[, which.min(delta)])
+g <- f[sample.cases, ]
+#
+# Check that the best match `g` reproduces the coefficients reasonably closely.
+#
+z <- rbind(get.coef(f), get.coef(g))
+rownames(z) <- c(""Original"", ""Sample"")
+cat(""Sample:"", sample.cases)
+print(z)
+#
+# Plot the data and the subset to compare visually.
+#
+col.red <- ""#ff000080""
+plot(subset(f, select=c(temperature, avg_crawling_age)), cex=1.2)
+points(subset(g, select=c(temperature, avg_crawling_age)), pch=19, col=col.red)
+abline(get.coef(f), lwd=3, col=""Gray"")
+abline(get.coef(g), lwd=3, lty=2, col=col.red)
+
+",2013-11-03 23:14:32.760 +58784,14227.0,1,,,,Topic models evaluation in Gensim,,CC BY-SA 3.0,"

I've been experimenting with LDA topic modelling using Gensim. I couldn't seem to find any topic model evaluation facility in Gensim that could report the perplexity of a topic model on held-out evaluation texts and thus facilitate subsequent fine-tuning of LDA parameters (e.g. the number of topics). It would be greatly appreciated if anyone could shed some light on how I can perform topic model evaluation in Gensim. This question has also been posted on Stackoverflow.

+",2013-11-03 23:23:45.773 +58785,15972.0,2,,58720.0,,,,CC BY-SA 3.0,"

The law of total variance is the easiest way to do this. But there are several occasions when we don't know how many random variables we are dealing with (e.g. branching processes such as Galton-Watson, birth-death processes, queues) where probability-generating functions are a useful technique. It is possible to derive mean and variance using the PGF, so I want to demonstrate how this can serve as an alternative. Why bother? One motivation is that this method will generalize easily to find any factorial moment, and hence any moment, of distribution.

+ +

A few general results: a PGF $G_X(z)=\mathbb{E}(z^X)$ has $\lim_{z\uparrow 1}\, G_X(z)=\lim_{z\uparrow 1}\, \mathbb{E}(z^X)=1$. Factorial moments are found by taking the limit of the appropriate derivative of the PGF as $z$ goes to 1 from below. So for a random variable $X$:

+ +

\begin{eqnarray} +\mathbb{E}(X) &=& \lim_{z\uparrow 1}\, G'_X(z)\\ +\mathbb{E}(X(X-1)) &=& \lim_{z\uparrow 1}\, G''_X(z)\\ +\mathbb{E}(X(X-1)(X-2)) &=& \lim_{z\uparrow 1}\, G'''_X(z) +\end{eqnarray}

+ +

And so on for higher moments. The key here is that if $S_N=\sum_{i=1}^N X_i$ with iid $X_i$ then $G_{S_N}(z)=G_N(G_X(z))$. Proof:

+ +

\begin{eqnarray} +G_{S_N}(z) &=& \mathbb{E}_N(\mathbb{E}(z^{\sum_{i=1}^N X_i})) &=& \mathbb{E}_N(\mathbb{E}(\prod_{i=1}^N z^{X_i})) &=& \mathbb{E}_N(\prod_{i=1}^N (\mathbb{E}(z^{X_i})) \\ + &=& \mathbb{E}_N(\prod_{i=1}^N G_X(z)) &=& \mathbb{E}_N(G_X(z)^N) &=& G_N(G_X(z)) +\end{eqnarray}

+ +

Also note $\lim_{z\uparrow 1}\, G_X(z)=\lim_{z\uparrow 1}\, G_N(z)=1$, $\lim_{z\uparrow 1}\, G'_X(z)=\mu_X$, $\lim_{z\uparrow 1}\, G'_N(z)=\mu_N$, $\lim_{z\uparrow 1}\, G''_X(z)=\mathbb{E}(X^2-X)=\sigma_X^2+\mu_X^2-\mu_X$ and $\lim_{z\uparrow 1}\, G''_N(z)=\sigma_N^2+\mu_N^2-\mu_N$.

+ +

Since $G_{S_N}(z)=G_N(G_X(z))$ we can use the chain rule to find the mean and variance of $S_N$:

+ +

\begin{eqnarray} +\mathbb{E}(S_N) &=& \lim_{z\uparrow 1}\, \frac{d}{dz}G_N(G_X(z))=\lim_{z\uparrow 1}\, G'_X(z)G'_N(G_X(z))=\mu_X \mu_N\\ +\mathbb{E}(S_N(S_N-1)) &=& \lim_{z\uparrow 1}\, \frac{d^2}{dz^2}G_N(G_X(z))\\ +\mathbb{E}(S_N^2-S_N) &=& \lim_{z\uparrow 1}\, \left(G''_X(z)G'_N(G_X(z))+G'_X(z)^2 G''_N(G_X(z))\right) \\ +\mathbb{E}(S_N^2)-\mu_X \mu_N &=& (\sigma_X^2+\mu_X^2-\mu_X)(\mu_N)+(\mu_X)^2(\sigma_N^2+\mu_N^2-\mu_N) \\ +\mathbb{E}(S_N^2) &=& \mu_N \sigma_X^2 + \mu_X^2 \sigma_N^2 + \mu_X^2 \mu_N^2 \\ +\operatorname{Var}(S_N) &=& \mathbb{E}(S_N^2)-\mathbb{E}(S_N)^2=\mu_N \sigma_X^2 + \mu_X^2 \sigma_N^2 + \mu_X^2 \mu_N^2-(\mu_X \mu_N)^2 \\ +\operatorname{Var}(S_N) &=& \mu_N \sigma_X^2 + \mu_X^2 \sigma_N^2 +\end{eqnarray}

+ +

It's a little gruesome and there's no doubt the law of total variance is easier. But if the standard results are taken for granted, this is only a couple of lines of algebra and calculus, and I've given more detail than some of the other answers which makes it look worse than it is. If you wanted the higher moments, this is a viable approach.
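
+ +

As a quick sanity check of the final formulas (my own sketch, not part of the derivation), here is a small simulation with $N \sim \text{Poisson}(3)$ and $X_i \sim \text{Poisson}(2)$, so that $\mu_X\mu_N = 6$ and $\mu_N \sigma_X^2 + \mu_X^2 \sigma_N^2 = 3\cdot2 + 4\cdot3 = 18$:

+ +

<!-- language: lang-R -->
+set.seed(1)
+sims <- replicate(2e5, {
+  n <- rpois(1, 3)                        # N ~ Poisson(3)
+  if (n == 0) 0 else sum(rpois(n, 2))     # S_N = sum of N iid Poisson(2) variables
+})
+mean(sims)   # close to mu_X * mu_N = 6
+var(sims)    # close to mu_N * sigma_X^2 + mu_X^2 * sigma_N^2 = 18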

+",2013-11-03 23:38:19.507 +58786,594.0,2,,58781.0,,,,CC BY-SA 3.0,"

I don't think there's a specific name for it, but it's the kind of task I've undertaken a number of times in various guises.

+ +

While the number of ways of choosing a sample of size 6 from 12 (924) is already manageable, you greatly cut down the search space by saying ""of similar range"".

+ +

Looking at the plot:

+ +

+ +

That pretty much limits you to choosing the leftmost point from the three points at the left and the rightmost point from the two points at the right, and the four interior points from the remaining points inside the ones you choose.

+ +

If we simplify that a little and simply split the data into three subgroups of size (from left to right) 3, 7 and 2 from which we choose 1,4 and 1 point, we get a total of

+ +

$\binom{3}{1} \binom{7}{4} \binom{2}{1} = 210$ combinations, which is less than a quarter of the original size (almost small enough to examine by hand).

+ +

Edit: Since whuber has given a comprehensive answer (one which I happily upvoted), I won't go through the details of how to do it in this cut-down case, but you could apply similar techniques to the search over a smaller space.

+ +
+ +

Working by hand

+ +

In the past when I've done it, I often do a hand search first, because it often turns out that by starting with a reasonable choice and tweaking it (swapping a few points in and out to improve the characteristics I want) I can often get something quite good enough in a few tries, saving the effort of coding a more formal search.

+ +

So for example, just by eye, I'd start with (say) these six points:

+ +

+ +

And then by looking at the direction of deviations of the values of the sample statistics I want (like the correlation) from the values I want them to have, and the likely effect of swapping a point in the set with a point not in, I can usually get close enough for my purposes in a minute or two.

+ +

Perhaps surprisingly, a spreadsheet like Excel is often a good tool for this task. I mark what's in (or not in) the sample with, say, a column of 1s and 0s and compute the statistics I want from that. A few moments of choosing which points to put in or out (swapping the 1s and 0s) and it's done.

+ +

Such approaches are also useful when making up data - simulate something close to what's desired, and then manipulate a few points (adding or removing or altering) to make it look closer to what you need. Again, a spreadsheet is often a handy tool for this type of task.

+",2013-11-04 00:14:48.820 +58787,22507.0,2,,58752.0,,,,CC BY-SA 3.0,"

From the data you cannot infer that the variance for males and females is the same; in fact, the opposite is almost certainly true. Also, since 50 is indeed a bit low, suppose you cannot assume normality.

+ +

Compare each female's value with the median of the men's values. If the median female is neither better nor worse than the median male (the null hypothesis), then each female has a 1/2 chance of being better than the median male. The chance that K or fewer females are worse than the median male is $P(K) = 2^{-50} \sum_{m=0}^K {50 \choose m}$. Here we consider the error in the males' median to be negligible, since there are many more males than females, and the variance among males is smaller than the variance among females.
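
+ +

In R this probability is just a binomial tail; a minimal sketch, where K = 10 is a hypothetical count of females below the male median (not taken from the question):

+ +

<!-- language: lang-R -->
+K <- 10                            # hypothetical number of females below the male median
+pbinom(K, size = 50, prob = 0.5)   # P(K or fewer of the 50 below the median) under the null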

+",2013-11-04 00:16:05.017 +58798,14253.0,1,,,,Understanding Differences of Mean between Two Groups,,CC BY-SA 3.0,"

I have a test score variable that is measured for an organization consisting of 2050 people. The goal is to test whether men outperform women (or vice versa). Unfortunately, only 50 of the 2050 people are women. I was told that, in order to run inference when one group is so disproportionately small within the sample, I should pursue some sort of matching strategy.

+ +

First, is it true that if I match on a few characteristics I can run inference (e.g., ordinary least squares, linear probability models) and be less worried about the fact that there are so few females, since many males will drop out?

+ +

Second, is there anything I can do to compare the groups before the match even as an anecdote?

+ +

This is the data:

+ +
     N    Test Score Mean    Test Score Variance    Coef. Var.    Gender
+ 2000             26.12                  10.89           0.13      Male
+   50             56.10                  25.01           0.09    Female
+
+",2013-11-04 03:58:12.543 +58788,594.0,2,,58767.0,,,,CC BY-SA 3.0,"

While it looks like, as written, the derivative isn't defined there, let's look more carefully. Note that, apart from at zero,

+ +

$\text{sign}(x)$ and $\frac{\partial|x|}{\partial x}$

+ +

are both the same quantity and the inverse of each other.

+ +

That is, except at 0,

+ +

\begin{equation} +\frac{\partial y}{\partial x}=\frac {1}{(|x|+1)} +\end{equation}

+ +

+ +

And then the question is: can we reasonably replace the missing point with its limiting value (the value 1 at $x=0$)?

+ +

Let's look at the original function $z$:

+ +

+ +

And we then realize that the function is not only continuous but actually smooth right through 0.

+ +

That is, the way we've written the function down and then performed our manipulations has led us to fool ourselves about the derivative being a problem. Clearly the correct value for that derivative at zero is actually 1, and we should fill the point in with its limit without being overly concerned that we did something nefarious.

+ +

The derivative of the function can simply be taken as

+ +

\begin{equation} +\frac{\partial y}{\partial x}=\frac {1}{(|x|+1)} +\end{equation}

+ +

including at zero. (If you're in doubt, start with that derivative, and integrate it to produce the original function (using that y=0 at x=0 fixes the ""+C""), and then work from there.)

+ +

(Indeed, if you go back to brass tacks and do it like when we were very first learning how to take derivatives, as $\lim_{h\to 0} \frac{f(x+h) - f(x)}{h}$, there's no difficulty.)
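
+ +

A small numerical illustration of that last point (my own addition): the central difference quotient of $f(x)=\text{sign}(x)\log(|x|+1)$ at $x=0$ tends to 1, and a plot shows the function is smooth through the origin.

+ +

<!-- language: lang-R -->
+f <- function(x) sign(x) * log(abs(x) + 1)
+h <- 10^-(1:8)
+(f(h) - f(-h)) / (2 * h)        # central differences at x = 0: all close to 1
+curve(f, from = -5, to = 5)     # smooth through the origin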

+",2013-11-04 01:06:33.153 +58789,13165.0,1,58952.0,,,The junction tree theorem,,CC BY-SA 3.0,"

In [1] (page 31, equation 2.12) it is claimed that, in a graph which is processed by the junction tree algorithm, the joint distribution of the variables can be found by +$$ +p(x_1, ..., x_m) = \frac{ \prod_{C \in \mathcal{C}} \mu_{C}(x_C) }{ \prod_{S \in \mathcal{S}} [\mu_{S}(x_S)]^{d(S)-1} } +$$ +in which $d(S)$ denotes the number of maximal cliques to which $S$ is adjacent. +The problem is that I don't really see where this distribution comes from. Any ideas?

+ +

[1] http://www.eecs.berkeley.edu/~wainwrig/Papers/WaiJor08_FTML.pdf

+",2013-11-04 01:12:24.783 +58790,14597.0,1,67041.0,,,What is the loss function of hard margin SVM?,,CC BY-SA 3.0,"

People say that soft-margin SVM uses the hinge loss function: $\max(0,1-y_i(w^\intercal x_i+b))$. However, the actual objective function that soft-margin SVM tries to minimize is +$$ +\frac{1}{2}\|w\|^2+C\sum_i\max(0,1-y_i(w^\intercal x_i+b)) +$$ +Some authors call the $\|w\|^2$ term the regularizer and the $\max(0,1-y_i(w^\intercal x_i+b))$ term the loss function.

+ +

However, for hard-margin SVM, the whole objective function is just +$$ +\frac{1}{2}\|w\|^2 +$$ +Does that mean hard-margin SVM only minimizes a regularizer without any loss function? That sounds very strange.

+ +

Well, if $\frac{1}{2}\|w\|^2$ is the loss function in this case, can we call it a quadratic loss function? If so, why does the loss function of hard-margin SVM become the regularizer in soft-margin SVM, with the loss changing from quadratic to hinge?

+",2013-11-04 01:31:36.213 +58791,22507.0,2,,58469.0,,,,CC BY-SA 3.0,"

For unknown nonlinear dependences there are nonlinear models. The most commonly used are tree models, e.g. random forests (package randomForest) and boosted trees (package gbm), and neural networks.

+ +

If you want to use linear regression for a nonlinear dependence, one possibility is to create a big polynomial with lots of terms and then do stepwise elimination. Use logarithms if (and only if) the features change over orders of magnitude, e.g. when you try to predict the life span of an arbitrary animal from its weight.

+ +

Another possibility is to try Kernel Ridge Regression.

+ +

You may also want to read about Generalized Additive Models.
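
+ +

A minimal sketch of these suggestions in R, assuming a data frame dat with a numeric response y and predictors x1, x2 (all placeholder names); the GAM uses the mgcv package, which is my choice and not specified above:

+ +

<!-- language: lang-R -->
+library(randomForest)
+library(gbm)
+library(mgcv)
+rf  <- randomForest(y ~ ., data = dat)                                    # random forest
+bst <- gbm(y ~ ., data = dat, distribution = ""gaussian"", n.trees = 1000)  # boosted trees
+gm  <- gam(y ~ s(x1) + s(x2), data = dat)                                 # generalized additive model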

+",2013-11-04 01:49:18.887 +58792,10448.0,2,,58748.0,,,,CC BY-SA 3.0,"

This sounds more like a data collection problem. If you have smaller sample sizes there is nothing you can do from a statistical perspective to extract more information.

+",2013-11-04 02:22:58.883 +58793,22507.0,2,,58032.0,,,,CC BY-SA 3.0,"

Define the error as the difference between the real and the observed value.

+ +

Suppose your errors at different points are independent and normally distributed (i.e. no systematic error).

+ +

If you know the standard deviation of each error, you also know the standard deviation of the difference between the values on the two curves. Now you have a vector of differences, with a known standard deviation for each difference. Divide each difference by its standard deviation, and you have a vector of normalized values, each with standard deviation 1. The null hypothesis is that they are distributed as N(0,1). Test it with any normality test.
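
+ +

A minimal sketch in R, assuming y1 and y2 hold the two curves evaluated at the same points and sd1, sd2 are the known standard deviations of their errors (placeholder names):

+ +

<!-- language: lang-R -->
+z <- (y1 - y2) / sqrt(sd1^2 + sd2^2)   # normalised differences; N(0,1) under the null
+ks.test(z, ""pnorm"")                    # test against a standard normal
+shapiro.test(z)                        # or any normality test, plus a look at mean(z), sd(z)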

+",2013-11-04 02:29:30.647 +58794,14553.0,1,,,,Unsupervised Dimensional reduction for mixed data types,,CC BY-SA 3.0,"

I have a data set with about 50K rows and 100 columns. You can consider every row to be representing one restaurant.

+ +

My goal is to calculate dissimilarities between all the restaurants - Gower's coefficient.

+ +

Of those 100 columns (features), a few are numeric or nominal data. The problem is that the other columns (about 90) are very sparse binary data (1/0).

+ +

I do think that those 90 columns of binary data can be reduced to some smaller number of columns, so that the computational time can be reduced significantly. But I don't know what method I should use to reduce such a large amount of binary data.

+ +

Can anyone give me some suggestions?

+ +

It will be most helpful if you can provide me some references and R code.

+",2013-11-04 02:45:08.003 +58795,20473.0,2,,58769.0,,,,CC BY-SA 3.0,"

$$X_n = \max(X_0+\sum\limits_{i=1}^n\xi_i, \sum\limits_{i=2}^n\xi_i, \sum\limits_{i=3}^n\xi_i, ..., 0)$$

+ +

$$\Rightarrow \frac 1nX_n = \max\left(\frac 1n(X_0+\sum\limits_{i=1}^n\xi_i), \frac 1n\sum\limits_{i=2}^n\xi_i, \frac 1n\sum\limits_{i=3}^n\xi_i, ..., \frac 1n\cdot0\right)$$

+ +

$$\Rightarrow \operatorname{plim} \frac 1nX_n = \operatorname{plim}\max\left(\frac 1n(X_0+\sum\limits_{i=1}^n\xi_i), \frac 1n\sum\limits_{i=2}^n\xi_i, \frac 1n\sum\limits_{i=3}^n\xi_i, ..., \frac 1n\cdot0\right)$$

+ +

The $\max$ is a continuous function. Also, since the $\xi$-series is i.i.d, the Law of Large Numbers holds. Then

+ +

$$\operatorname{plim} \frac 1nX_n = \max\left(\operatorname{plim}\frac 1n(X_0+\sum\limits_{i=1}^n\xi_i), \operatorname{plim}\frac 1n\sum\limits_{i=2}^n\xi_i, \operatorname{plim}\frac 1n\sum\limits_{i=3}^n\xi_i, ..., \operatorname{plim}\frac 1n\cdot0\right)$$

+ +

$$\rightarrow_{p} \max\left(\frac 1n\sum\limits_{i=1}^nE(\xi_i), \frac 1n\sum\limits_{i=2}^nE(\xi_i), \frac 1n\sum\limits_{i=3}^nE(\xi_i), ..., 0\right)$$

+ +

$$= \max\left(\frac 1nnE(\xi), \frac 1n(n-1)E(\xi), \frac 1n(n-2)E(\xi),..., 0\right) = E(\xi)$$

+ +

under the assumption that $E(\xi)>0$. QED

+",2013-11-04 03:32:50.263 +58796,23321.0,1,,,,Compare dispersion between two distributions with thick tails,,CC BY-SA 3.0,"

What tests would I use to see if two distributions are 'significantly' different in dispersion if they are thick-tailed, like a t-distribution? Thanks.

+",2013-11-04 03:41:59.057 +58799,23292.0,1,,,,Distribution of Product of Normal and Poisson?,,CC BY-SA 3.0,"

Suppose that X is Poisson distributed with a known rate and Y is normally distributed with a known mean and variance. My goal is to approximate the distribution of Z, where P(Z) = P(X) * P(Y) and Z is a non-negative integer. I could get a good approximation by sampling, but I'd really like to have a fast solution, ideally closed-form.

+",2013-11-04 04:16:42.723 +58800,22843.0,1,58899.0,,,In finding the moment generating function why do we multiply by $e^{tx}$ for each pmf term?,,CC BY-SA 3.0,"

The moment generating function that is associated with the discrete random variable $X$ and pmf $f(x)$ is defined as:

+ +

$$M(t) = E\left[e^{tX}\right] = \sum_{x \in S} e^{tx} f(x).$$

+ +

Where does this $e^{tx}$ come from? This vaguely looks like an integrating factor from differential equations.

+ +

Also, I find it strange that when we take the derivative of $M(t)$ we don't say ""lets take the partial derivative with respect to t"" but isn't this what we're doing, treating $x$ as a constant? Why do we use the derivative notation instead of the partial derivative operator?

+",2013-11-04 04:19:45.700 +58801,14227.0,2,,58784.0,,,,CC BY-SA 3.0,"

Found the answer on the gensim mailing list.

+ +

In short, the bound() method of LdaModel computes a lower bound on perplexity, based on a held-out corpus.

+",2013-11-04 05:05:47.587 +58802,23325.0,1,,,,Bayesian model selection in PyMC,,CC BY-SA 3.0,"

I'm trying to do model selection using PyMC (v2.2), but having difficulty assessing the models using various Information Criteria and/or Bayes Factor. My model is similar to a typical regression, with several parameters (~10) with priors modelled by uniform distributions, and a single observation modelled by a normal distribution with a deterministic mean and uniform standard deviation (through a precision deterministic). The mean (dynamic) response of the system is actually generated by an ordinary differential equation and typically yields around 1500 data points. I can get reasonably accurate results using either Adaptive or Non-Adaptive MCMC, with around 50K samples following 50K burn-in samples.

+ +

Based on issues reported with BF for complex models, I started looking at the DIC values produced after performing the MCMC analysis, but for models consisting of various combinations of the true parameter set, they were quite large with very little difference between them. For example, -14623.9 and -14624.8. Are the DIC, and other similar criterion such as BPIC, normally so insensitive to different (sub-)models?

+ +

Using some code from the sandbox, I also computed the sample likelihood for my models in an attempt to exploit Bayes Factor. However, the log-likelihoods produced were very large, ranging from -1e5 to 1e5. Regularization further decreased the minimum and exponentiating resulted in overflow (see function weight), so the array exp(loglikes[m]) ended up comprising all zeros and one one! Why would my log-likelihoods, calculated by logp after a call to draw_from_prior, be so large?

+ +

I'm fairly new to Bayesian estimation, so any help would be greatly appreciated!

+",2013-11-04 05:42:50.047 +58803,2081.0,2,,58771.0,,,,CC BY-SA 3.0,"

Let $p_x$ and $p_y$ be the number of variables in your sets $X$ and $Y$. $N$ is the sample size. You have obtained $m=\min(p_x,p_y)$ canonical correlations $\gamma_1, \gamma_2,...,\gamma_m$. Testing them usually goes as follows.

+ +

Given $\gamma_j$, its corresponding eigenvalue is $\lambda_j= \frac{1}{1-\gamma_j^2}-1$.

+ +

Wilks' lambda statistic for it is $w_j= \frac{1}{1+\lambda_j}w_{j+1}$. So, first compute $w_m$, which is $\frac{1}{1+\lambda_m}$, then compute $w_{m-1}$ using $w_m$, etc., working backwards.

+ +

This statistic has approximately a Chi-square distribution (under assumptions of normality and large $N$) with $df_j= (p_x-j+1)(p_y-j+1)$. To convert Wilks' lambda into the Chi-square: $\chi_j^2= -\ln(w_j)(N-(p_x+p_y+3)/2)$.

+ +

So, evaluate the Chi-square cdf with $df_j$ degrees of freedom at $\chi_j^2$, subtract from 1, and you have the p-value for correlation $\gamma_j$.

+ +

What does this p-value mean in fact? Nonsignificant p-value for $\gamma_1$ tells that all canonical correlations $\gamma_1$ through $\gamma_m$ are not significant (i.e. the hypothesis that they all are zero should not be rejected). Significant p-value for $\gamma_1$ and nonsignificant p-value for $\gamma_2$ tells that $\gamma_1$ is significant (likely to be nonzero in the population), while the rest $\gamma_2$ through $\gamma_m$ are all not significant; etc. Sometimes, p-value for $\gamma_{j+1}$ is lower than for $\gamma_{j}$. That should not be taken in the sense ""$\gamma_{j+1}$ is more significant"" because a more junior correlation cannot be more significant than more senior one. As said already, if $\gamma_{j}$ is not significant for you, all the remaining junior correlations must automatically be considered not significant too.

+ +

For an algorithm of CCA, look here.
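
+ +

To make the recipe concrete, here is a minimal R sketch of the scheme above (added purely as an illustration; cca_pvalues is a helper name I made up). It takes a vector of canonical correlations gamma, the sample size N, and the numbers of variables px and py:

+ +

<!-- language: lang-R -->
+cca_pvalues <- function(gamma, N, px, py) {
+  m <- length(gamma)
+  lambda <- 1 / (1 - gamma^2) - 1
+  w <- rev(cumprod(rev(1 / (1 + lambda))))      # w_j = product over k >= j of 1/(1 + lambda_k)
+  chisq <- -log(w) * (N - (px + py + 3) / 2)
+  df <- (px - seq_len(m) + 1) * (py - seq_len(m) + 1)
+  pchisq(chisq, df, lower.tail = FALSE)         # p-value for each canonical correlation
+}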

+",2013-11-04 05:50:42.140 +58804,12152.0,1,58808.0,,,"In stochastic gradient descent, is there only one update to $\theta$ for each iteration?",,CC BY-SA 3.0,"

I have read that the update equation for stochastic gradient descent is as shown below, for each iteration, k. Does one iteration correspond to one training example? So for each example is there only one update to $\theta$?

+ +

$ \theta^{k+1} = \theta^k - \epsilon_k \frac{\partial L(\theta^k,z)}{\partial \theta^k} $

+ +

Update: Is it different for Online learning?

+",2013-11-04 07:14:44.243 +58805,11117.0,1,,,,Use the improper prior $p(v) \propto 1/v$ into Jags,,CC BY-SA 3.0,"

I know that one can approximate this density ($p(v) \propto 1/v$) using its truncated version and implement it this way:

+ +
   B~dunif(log(BInf),log(BSup))
+   v<-exp(B)
+
+ +

but I would like to use the exact form (I checked that under this prior my posterior is proper). Is there any way to achieve this?

+",2013-11-04 08:03:10.103 +58806,1406.0,2,,58756.0,,,,CC BY-SA 3.0,"

Exploit the fact that convergence of characteristic functions is equivalent to convergence in distribution, i.e. show that

+ +

$$E\exp(i\tilde T_{n}t)\to \exp(-t^2/2),$$

+ +

for any $t$, where $\tilde T_n$ is an appropriately scaled and shifted version of $T_n$. Another useful fact is that the characteristic function of a sum of independent random variables is the product of the characteristic functions of the summands.

+",2013-11-04 08:04:13.173 +58830,2149.0,2,,58824.0,,,,CC BY-SA 3.0,"

You should investigate ARMAX/Dynamic Regression/Transfer Functions making sure that you deal with any Outliers/Level Shifts/Seasonal Pulses/Local Time Trends while testing for constancy of parameters and constancy of error variance. If you wish to post your data, do so and I will send you some results illustrating these ideas.

+",2013-11-04 16:44:17.273 +58807,23328.0,1,58829.0,,,Can I simply subtract an index from another?,,CC BY-SA 3.0,"

I am currently using a measure of the price level (CPI) from which I would like to remove the effects of food prices (also an index). Notice that food prices are included in the CPI (so that CPI is the total index).

+ +

First, I made sure that both were based the same way (that is, the base value of 100 is at the same time point). Now, if I simply subtract the food price index from the CPI, I end up with small values hovering around 2, so I am quite sure that is not the way to do it.

+ +

I tried the following: CPI+(CPI-food price) and CPI-(CPI-food price) in order to get rid of the food price index. However, I am not sure whether it is the first or the second method since, obviously, both give me a value of 100 on the base date.

+ +

My hunch is, it is the CPI-(CPI-food price) but I would like your take on this.

+",2013-11-04 08:39:40.640 +58808,23320.0,2,,58804.0,,,,CC BY-SA 3.0,"
+

Does one iteration correspond to one training example?

+
+ +

Yes.

+ +
+

So for each example is there only one update to $\theta$?

+
+ +

It is possible for a single example to be picked and used to update theta many times.
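
+ +

A toy R sketch (my own illustration) of SGD for least squares with a single coefficient: each iteration picks one random training example, so the same example may be used for many updates.

+ +

<!-- language: lang-R -->
+set.seed(1)
+n <- 100; x <- rnorm(n); y <- 2 * x + rnorm(n)
+theta <- 0; eps <- 0.01
+for (k in 1:5000) {
+  i <- sample(n, 1)                          # one training example per iteration
+  grad <- -2 * x[i] * (y[i] - theta * x[i])  # gradient of the squared error on example i
+  theta <- theta - eps * grad                # one update of theta per iteration
+}
+theta                                        # close to 2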

+",2013-11-04 08:49:48.073 +58809,1959.0,1,,,,"p-value of hypothesis ""a home court advantage exists""",,CC BY-SA 3.0,"

The practical POV:

+ +

I sample 35 matches: 15 turn out as home wins, 10 as ties and 10 as home losses. I want to conduct a test of the hypothesis ""A home court advantage does exist"" with a significance level of, say, 5%. What would be a reasonable test statistic / calculation of the p-value to accept or reject the hypothesis?

+ +

Clarification:

+ +

The matches I sample are independent of a specific team. I just take a number of matches and count for the home teams - wins, ties and defeats. Due to symmetry the guest teams will have deterministically corresponding results and hence can be ignored.

+ +
+ +

The generalized POV:

+ +

Let $A_1$ be the random variable for the number of matches the local team won in a total sample of $N$ matches. Naturally this is a Bernoulli experiment (given we only assume the possibilities of win and lose), and the p-value of the hypothesis ""a home court advantage exists, i.e. the probability of a home win $p_1$ is larger than 0.5"", given $a_1$ observed wins, is:

+ +

$$\Pr[A_1 \ge a_1 \mid p_1 = 0.5]=\sum_{k=a_1}^N{N\choose{k}}(0.5)^N$$

+ +

So far so good (though please correct me if I am wrong or my wording is problematic) - now the aspect that puzzles me is how to factor in ties.

+ +

My idea would be to create a test statistic $\tilde{A}_1 := A_1 + \lfloor{A_0/2}\rfloor$ with $A_1$ and $A_0$ being the random variables for number of home wins and ties in $N$ matches.

+ +

$$\Pr[\tilde{A}_1 \ge \tilde{a}_1 \mid \tilde{p}_1 = 0.5]=\sum_{k=\tilde{a}_1}^N{N\choose{k}}(0.5)^N$$

+ +

Now in my mind this makes sense somehow because it reflects the indecisive and symmetric nature of a tie.

+ +

On the other hand, I am not sure how to bend the Bernoulli experiment to reflect the notion of $\tilde{p}_1$ being the probability of a home win in the second model (which means a home win or ""50% of a tie"") while keeping up the pretence of formal correctness.

+ +

Does this make sense?
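
+ +

For the concrete sample above (15 wins, 10 ties, 10 losses, counting half of the ties as wins, as in my proposed statistic), a quick R calculation of the one-sided p-value would be:

+ +

<!-- language: lang-R -->
+N <- 35
+a1.tilde <- 15 + floor(10 / 2)                    # observed wins plus half the ties = 20
+sum(dbinom(a1.tilde:N, N, 0.5))                   # one-sided binomial p-value
+pbinom(a1.tilde - 1, N, 0.5, lower.tail = FALSE)  # the same, via the tail function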

+",2013-11-04 08:57:54.620 +58810,22511.0,1,,,,Helmert Transformations,,CC BY-SA 3.0,"

Let $Y_f, Y_1,..., Y_n$ denote measurements taken from a father and his n sons. It is assumed that $Y_f, Y_1,...,Y_n$ have equal variance with father-son correlation $ρ_f$ and equal correlation $ρ_S$ between sons. Transform $Y_f, Y_1,..., Y_n$ linearly to $Z_0, Z_1,..., Z_n$ to remove the correlations.

+ +

The hint is to let $Z_0=Y_f+c(Y_1+...+Y_n)$ where c is a suitably chosen constant.

+ +

Here is what I have tried. +I let $Z_0, Z_1,..., Z_n$ be the Helmert transformations of $Y_f, Y_1,..., Y_n$ where $Z_j=H_jY$ for j=1,2,...,n and thus, +

+ +

where $Y = Y_1,..., Y_n$

+ +

I am stuck at this step and I don't have a clue how to find the constant c to solve the problem. Thanks for the help.

+",2013-11-04 09:02:41.087 +58811,22423.0,1,58834.0,,,Simple linear regression with a random predictor,,CC BY-SA 3.0,"

We understand an SLR model as +$$y_i = \alpha + \beta x_i + \varepsilon_i$$ +with $\varepsilon_i$ i.i.d. with equal variance.

+ +

Suppose I have two instruments measuring a common entity, say, density of samples of different liquids, and assume that we do not know the exact density of each liquid (nor is there any way to accurately measure it).

+ +

My objective is to construct a model that will summarise the difference between the two instruments.

+ +

Here is how I go about it: (for simplicity sake, all models are SLR models)

+ +

Let the readings from instrument 1 and instrument 2 when measuring the $i$-th liquid be $y_i$ and $w_i$ respectively, and the actual density of the $i$-th liquid be $x_i$. Then we have: +$$y_i=\alpha_1+\beta_1 x_i + \varepsilon_{1i} \\ +w_i=\alpha_2+\beta_2 x_i + \varepsilon_{2i}$$ +where $\varepsilon_{1i}$ and $\varepsilon_{2i}$ are i.i.d. with equal variance w.r.t. $i$.

+ +

Rearranging from the second equation we have: +$$x_i= \frac{w_i- \alpha_2- \varepsilon_{2i}}{\beta_2 } $$ +Plugging into the first equation, +$$ \begin{align*} y_i &=\alpha_1+\beta_1 \frac{w_i- \alpha_2- \varepsilon_{2i}}{\beta_2 } + \varepsilon_{1i} \\ +&= \alpha_1 - \frac{\beta_1}{\beta_2} \alpha_2 + \frac{\beta_1}{\beta_2} w_i + \varepsilon_{1i}-\frac{\beta_1}{\beta_2}\varepsilon_{2i}\\ +&= \alpha + \beta w_i + \varepsilon_i +\end{align*} +$$

+ +

where $\beta = \frac{\beta_1}{\beta_2}$, $\alpha = \alpha_1 - \beta \alpha_2$, and $ \varepsilon_i = \varepsilon_{1i}-\beta \varepsilon_{2i}$, and $ \varepsilon_i $ are consequently i.i.d. with equal variance w.r.t. $i$.

+ +

I would like to ask,

+ +
  1. Is this a valid SLR model for my objective? Do I just treat readings from instrument 2 as an 'independent variable' and proceed like a normal SLR analysis?

  2. If so, can I interpret the slope ($\beta$) as the ratio of the individual slopes of the two instruments?

  3. Can I then test the hypothesis that the two instruments are similar by testing $\alpha = 0$ and $ \beta =1 $?

  4. Suppose there are N total liquids we are interested in. All N liquids have been measured by instrument 1. However, we only have time/money to measure n ($<$N) liquids using instrument 2. How should I select the n liquids from N such that I would minimise variance for $\beta$ and $\alpha$?
+",2013-11-04 09:12:04.710 +58812,23331.0,2,,48103.0,,,,CC BY-SA 3.0,"

The usual methods for fitting a sine curve to a given data set require a first guess of the parameters, followed by an iterative process; this is a non-linear regression problem. +A different method consists in transforming the non-linear regression into a linear regression thanks to a convenient integral equation. Then there is no need for an initial guess and no need for an iterative process: the fit is obtained directly. +For the function y = a + r*sin(w*x+phi) or y = a + b*sin(w*x) + c*cos(w*x), see pages 35-36 of the paper ""Régression sinusoidale"" published on Scribd: +http://www.scribd.com/JJacquelin/documents +For the function y = a + p*x + r*sin(w*x+phi), see pages 49-51 of the chapter ""Mixed linear and sinusoidal regressions"". +For more complicated functions, the general process is explained in the chapter ""Generalized sinusoidal regression"", pages 54-61, followed by a numerical example y = r*sin(w*x+phi) + (b/x) + c*ln(x) on pages 62-63.

+",2013-11-04 10:32:01.167 +58813,22049.0,1,,,,Finding anomalies using moving average in a time series,,CC BY-SA 3.0,"

I want to find anomalies in a time series. Is it possible to find anomalies using a moving average?

+",2013-11-04 10:58:38.770 +58814,16665.0,1,58820.0,,,"Fligner test in R, with several variables indicating the grouping",,CC BY-SA 3.0,"

I have to run a multi-way ANOVA and want to test the assumption of homoscedasticity.

+ +

How can I run fligner.test with several independent variables (the variables indicating the grouping) in R? Does it make sense to do such a thing?

+",2013-11-04 11:10:15.180 +58831,23339.0,1,,,,Beginner's Question about Plotting in R,,CC BY-SA 3.0,"

Alright, so I'm new to R, and I imagine what I'm trying to do should be very simple. I want to use plot(x,y), but then add the curve from lm(y~x^2) over the plot. I'm sure I could add more detail to this, but I am not sure what is needed, so I'll be vigilant in answering any questions you all may have. I appreciate any help I can get!

+",2013-11-04 16:48:47.160 +58815,23333.0,1,,,,Double integration diverges due to the non-Gaussian noise,,CC BY-SA 3.0,"

I added Gaussian noise to my input data and then I integrated it twice (I used the trapezoidal rule). I was wondering if the integration itself transforms the Gaussian noise into something else. I looked at the probability density function, which is

+ +

$$f(x)= \frac {1} {\sigma \sqrt{2 \pi}} e^{- \frac{1}{2}(\frac{x- \mu}{\sigma})^2}$$

+ +

When we integrate the probability density function it equals 1 (a property of the Gaussian distribution): +$$\int f(x)\,dx=1$$ +Thus the distribution of the noise is Gaussian. +When we integrate again (double integration), the result is not equal to 1 anymore, and it depends on the domain over which we integrate. I was wondering if we can conclude, from the fact that the double integral of the probability density function is not 1, that after the second integration the noise is not Gaussian anymore.

+",2013-11-04 11:29:16.110 +58816,2149.0,2,,58813.0,,,,CC BY-SA 3.0,"

Anomalies can be easily detected while using ""moving averages"". Please review the work of Tsay http://www.unc.edu/~jbhill/tsay.pdf . You might also search for ""AUTOMATIC DETECTION OF INTERVENTION VARIABLES"" using Google. Post your actual data and I will post the results.

+",2013-11-04 12:02:31.710 +58817,9081.0,2,,58799.0,,,,CC BY-SA 4.0,"

There is one book dedicated to the problem of products of random variables: +http://www.amazon.com/Products-Random-Variables-Applications-Arithmetical/dp/0824754026/ref=sr_1_1?s=books&ie=UTF8&qid=1383564424&sr=1-1&keywords=product+of+random+variables

+

Maybe you can find it in a library. (Or search google scholar with the author names)

+

There is a connection between products of independent random variables and the Mellin transform; see the paper "Some Applications of the Mellin Transform in Statistics" by Benjamin Epstein, which is on JSTOR. There is a Wikipedia article on the Mellin transform, and searching Google Scholar for "Mellin transform product of random variables" gives some relevant papers.

+",2013-11-04 12:06:40.070 +58818,16441.0,1,58837.0,,,Log or square-root transformation for ARIMA,,CC BY-SA 3.0,"

With the below dataset, I have a series which needs transforming. Easy enough. However, how do you decide which of the SQRT or LOG transformations is better? And how do you draw that conclusion?

+ +
x<-c(75800,54700,85000,74600,103900,82000,77000,103600,62900,60700,58800,134800,81200,47700,76200,81900,95400,85400,84400,103400,63000,65500,59200,128000,74400,57100,75600,88300,111100,95000,91500,111400,73700,72800,64900,146300,83100,66200,101700,100100,120100,100200,97000,120600,88400,83500,73200,141800,87700,82700,106000,103900,121000,98800,96900,115400,87500,86500,81800,135300,88900,77100,109000,104000,113000,99000,104500,109400,92900,88700,90500,140200,91700,78800,114700,100700,113300,122800,117900,122200,102900,85300,92800,143800,88400,75400,111200,96300,114600,108300,113400,116600,103400,87300,88200,149800,90100,78800,108900,126300,122000,125100,119600,148800,114600,101600,108800,174100,101100,89900,126800,126400,141400,144700,132800,149000,124200,101500,106100,168100,104200,79900,126100,121600,139500,143100,144100,154500,129500,109800,116200,171100,106700,85500,132500,133700,135600,149400,157700,144500,165400,122700,113700,175000,113200,94400,138600,132400,129200,165700,153300,141900,170300,127800,124100,206700,131700,112700,170900,153000,146700,197800,173800,165400,201700,147000,144200,244900,146700,124400,168600,193400,167900,209800,198400,184300,214300,156200,154900,251200,127900,125100,171500,167000,163900,200900,188900,168000,203100,169800,171900,241300,141400,140600,172200,192900,178700,204600,222900,179900,229900,173100,174600,265400,147600,140800,171900,189900,185100,218400,207100,178800,228800,176900,170300,251500,149900,150300,192000,185100,184500,228800,219000,180000,241500,184300,174600,264500,166100,151900,194600,214600,201700,229400,233600,197500,254600,194000,201100,279500,175800,167200,235900,207400,215900,261800,236800,222400,281500,214100,218200,295000,194400,180200,250400,212700,251300,280200,249300,240000,304200,236900,232500,300700,207300,196900,246600,262500,272800,282300,271100,265600,313500,268000,256500,318100,232700,198500,268900,244300,262400,289200,286600,281100,330700,262000,244300,309300,246900,211800,263100,307700,284900,303800,296900,290400,356200,283700,274500,378300,263100,226900,283800,299900,296000,327600,313500,291700,333000,246500,227400,333200,239500,218600,283500,267900,294500,318600,318700,283400,351600,268400,251100,365100,249100,216400,245500,232100,236300,275600,296500,296900,354300,277900,287200,420200,299700,268200,329700,353600,356200,396500,379500,349100,437900,350600,338600,509100,342300,288800,378400,371200,395800,450000,414100,387600,486600,355300,358800,526800,346300,295600,361500,415300,402900,484100,412700,395800,491300,391000,374900,569200,369500,314900,422500,436400,439700,509200,461700,449500,560600,435000,429900,633400,417900,365700,459200,466500,488500,531500,483500,485400,575700,458000,433500,642600,409600,363100,430100,503900,500400,557400,565500,526700,628900,547700,520400,731200,494400,416800,558700,537100,556200,686700,616600,582600,725800,577700,552100,806700,554200,455000,532600,693000,619400,727100,684700)
+y<-ts(x,frequency=12, start=c(1976,1))
+#Transforming the data to log or sqrt and plotting it
+log.y<-log(y)
+plot(log.y)
+sqrt.y<-sqrt(y)
+plot(sqrt.y)
+
+",2013-11-04 12:55:18.167 +58819,23284.0,1,,,,Time Series Analysis for Non-Uniform Data,,CC BY-SA 3.0,"

I have wait-time data for a coffee shop for 4 weeks. Since the data is crowdsourced, it is sparse and non-uniformly spaced in time. So my question is: how should I deal with this non-uniform data? What are some methods in time series forecasting that can handle non-uniform data? Or is there any way to make the data uniform?

+",2013-11-04 12:57:26.843 +58820,16665.0,2,,58814.0,,,,CC BY-SA 3.0,"

There are two ways of using the fligner.test() function. You can either do

+ +
+

fligner.test(x=…, g=…)

+
+ +

or

+ +
+

fligner.test(formula= a~b+c+d, data=your_data)

+
+ +

The second form allows you to check an ANOVA model with several independent variables.

+ +

And yes, it totally makes sense to run a Fligner test with several grouping variables.
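
+ +

A minimal sketch, assuming a data frame dat with a response y and grouping factors f1 and f2 (placeholder names); one safe way to use several grouping variables is to combine them into a single factor with interaction():

+ +

<!-- language: lang-R -->
+fligner.test(y ~ interaction(f1, f2), data = dat)          # formula interface
+fligner.test(x = dat$y, g = interaction(dat$f1, dat$f2))   # default interface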

+",2013-11-04 13:10:34.553 +58821,12683.0,2,,58633.0,,,,CC BY-SA 3.0,"

Propensity score matching: +First you'd model the probability of students' going to an advisor, based on available predictors—perhaps course marks, financial situation, living arrangements, medical records, &c. For each student the predicted probability of going to an advisor is the propensity score. Next you'd match each student who did go to an advisor with one who didn't but has the same, or as near as possible, propensity score. So you'd now proceed with an analysis of retention based on matched treatment-control pairs.

+ +

Of course, this approach doesn't provide as compelling evidence as an experiment in which students are randomly assigned to go to an advisor or not—you'd need to consider whether predictors you hadn't accounted for could be significantly contributing to the effect.

+ +

Rosenbaum & Rubin (1983), ""The Central Role of the Propensity Score in Observational Studies for Causal Effects"", Biometrika, 70, 1

+ +

Rosenbaum & Rubin (1985), ""Constructing a Control Group Using Multivariate Matched Sampling Methods that Incorporate the Propensity Score"", The American Statistician, 39, 1

+ +

And the Matching package for R might be useful.
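
+ +

A minimal sketch with the Matching package, assuming a data frame dat with a 0/1 treatment indicator advisor, an outcome retained, and covariates x1, x2 (all placeholder names):

+ +

<!-- language: lang-R -->
+library(Matching)
+ps_model <- glm(advisor ~ x1 + x2, family = binomial, data = dat)    # propensity score model
+m <- Match(Y = dat$retained, Tr = dat$advisor, X = fitted(ps_model), # match on the propensity score
+           estimand = ""ATT"")
+summary(m)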

+",2013-11-04 13:52:25.530 +58822,4537.0,1,58843.0,,,Why does the Lasso provide Variable Selection?,,CC BY-SA 3.0,"

I've been reading Elements of Statistical Learning, and I would like to know why the Lasso provides variable selection and ridge regression doesn't.

+ +

Both methods minimize the residual sum of squares and have a constraint on the possible values of the parameters $\beta$. For the Lasso, the constraint is $||\beta||_1 \le t$, whereas for ridge it is $||\beta||_2 \le t$, for some $t$.

+ +

I've seen the diamond vs ellipse picture in the book and I have some intuition as for why the Lasso can hit the corners of the constrained region, which implies that one of the coefficients is set to zero. However, my intuition is rather weak, and I'm not convinced. It should be easy to see, but I don't know why this is true.

+ +

So I guess I'm looking for a mathematical justification, or an intuitive explanation of why the contours of the residual sum of squares are likely to hit the corners of the $||\beta||_1$ constrained region (whereas this situation is unlikely if the constraint is $||\beta||_2$).

+",2013-11-04 14:39:19.147 +58823,23336.0,1,,,,"Fitting a ""sigmoid"" function: Why is my fit so bad?",,CC BY-SA 3.0,"

I tried to fit a curve to the black points using the following code. Why is the fit so bad? Do I need to fit another type of function?

+ +
fit <- nls(grad ~ theta1/(1 + exp(-(theta2 + theta3*x1))), 
+           start=list(theta1 = 4, theta2 = 0.09, theta3 = 0.31), trace=TRUE)
+
+p = predict(fit)
+
+plot(x1, grad)
+points(x1, p, col = ""red"")
+
+ +

+",2013-11-04 14:51:50.083 +58855,12980.0,1,58878.0,,,How to generate from the copula by inverse conditional cdf function of the copula?,,CC BY-SA 3.0,"

I am trying to write a code (I am using MATLAB) for estimating the goodness of fit of the copula based on a Rosenblatt transformation ( Dobrić and Schmid 2007, http://dx.doi.org/10.1016/j.csda.2006.08.012) my question is this:

+ +

In the algorithm it says: ""Generate i.i.d. observations from the copula with parameter theta."" (I can't use the copularnd function because it only covers a few families.) If my copula is bivariate, like C(u,v,theta), how can I generate these i.i.d. observations? What will be my input to the copula function?

+ +

Thanks

+",2013-11-04 22:00:27.077 +58824,16504.0,1,,,,Time Series Modeling with Lagged Variables,,CC BY-SA 3.0,"

I have a dataset with columns that represent lagged values of predictors. To illustrate with a simple example, suppose we had car sales data for 3 years and the only predictors available were income and population for a number of car dealers, the dataset could be represented as follows,

+ +
ID  IncLag1  PopLag1  SalesLag1  IncLag2  PopLag2 SalesLag2  IncCurrent  PopCurr  SalesCurr
+a       100      1000     200        150      2000    300        500       2500         450
+b       10        300      50         60       900     80         90       1000         100
+
+ +

...

+ +
k       30        60      10        200      2000     60         80          800         ??
+
+ +

My dependent variable is SalesCurr - i.e., given a history of past sales and corresponding Income and Population values (which we can use as the train-test data), predict what the Sales will be in the current year (SalesCurr).

+ +

My question is as follows -- Using R or GRETL, how is it possible to create an ARIMA/TimeSeries model with the above data to predict the SalesCurrent variable. Using simple Linear Regression, one could simply have a formula such as say, lm (SalesCurrent ~ ., data=mytable), but it would not be a time-series model since it does not take into account the relationship between the different variables.

+ +

Alternatively, I am quite familiar with Machine Learning models and wanted to get your thoughts on how such a dataset could be modeled using say, randomForest, GBM, etc.

+ +

Thanks in advance.

+",2013-11-04 15:05:10.873 +58825,306.0,2,,58819.0,,,,CC BY-SA 3.0,"

Although not designed specifically for coffee-shop data, you can have a look at the Autoregressive Conditional Duration (ACD) model. It was originally developed for waiting times between trades in finance, but I suggest you give it a shot, and please do report the results you get. Thanks.

+",2013-11-04 15:05:19.823 +58826,4910.0,1,227108.0,,,"Is there any ""standard"" for statistical model notation?",,CC BY-SA 4.0,"

In, for example, the BUGS manual or the upcoming book by Lee and Wagenmakers (pdf) and in many other places a type of notation is used that to me seems very flexible in that it can be used to succinctly describe most statistical models. An example of this notation is the following:

+

$$ y_i \sim \text{Binomial}(p_i,n_i) \\ \log\left(\frac{p_i}{1 - p_i}\right) = b_i \\ b_i \sim \text{Normal}(\mu_p,\sigma_p) $$

+

which would describe a hierarchical logistic model with no predictors, but with $i = 1\dots n $ groups. This way of describing models seem to work equally well for describing frequentist and Bayesian models, for example, to make this model description fully Bayesian you would just have to add priors on $\mu_p$ and $\sigma_p$.

+

Is this type of model notation/formalism described in detail in some article or book?

+

If you want to use this notation to write models there are many different ways of doing things and it would be really useful with a comprehensive guide both to follow and to reference others to. Some differences I've found in how people use this type of notation:

  • What do you call distributions? E.g., I've seen $\mathcal{N},\text{N},\text{Norm},\text{Normal}$, etc.
  • How do you deal with indexes? E.g. I've seen $y_{ij}$,$y_{i[j]}$,$y_{j|i}$, etc.
  • Which parameter symbols are usually used for parameters? For example, it is common to use $\mu$ as the mean for the normal distribution, but what about other distributions? (For this I usually check the distributions on Wikipedia.)

Follow up question: Does this notation have a name? (For lack of a better name I called it the probability distribution centric convention +in a blog post I wrote...)

+",2013-11-04 15:05:21.753 +58827,306.0,2,,58807.0,,,,CC BY-SA 3.0,"

What I am suggesting is not the right answer, and I agree with the comments that it cannot be calculated. However, if you still had to do it, and nothing else is given, then I would suggest fitting a regression model with the CPI as the dependent variable and the food price index as the independent variable. If you get a good fit, then you can remove the effect of the food price index simply by subtracting the food price index multiplied by the coefficient you get. Once again, this is just an attempt if nothing else is provided and you get a good fit.

+",2013-11-04 15:10:23.797 +58828,23337.0,1,,,,Large coefficients in logistic regression,,CC BY-SA 3.0,"

This is from the book The statistical sleuth--A course in methods of Data analysis Chapter 20, Exercise 12(c)-(e). I am using logistic regression to predict carrier with possible predictors CK and H. Here is my solution:

+ +
Carrier <- c(0,0,0,0,0,1,1,1,1,1)  
+CK      <- c(52,20,28,30,40,167,104,30,65,440)  
+H       <- c(83.5,77,86.5,104,83,89,81,108,87,107)  
+logCK   <- log(CK)  
+fit4    <- glm(Carrier~logCK+H, family=""binomial"", control=list(maxit=100))  
+Warning message:  
+glm.fit: fitted probabilities numerically 0 or 1 occurred   
+summary(fit4)
+## 
+## Call:
+## glm(formula = Carrier ~ logCK + H, family = ""binomial"", control = list(maxit = 100))
+## 
+## Deviance Residuals: 
+##        Min          1Q      Median          3Q         Max  
+## -1.480e-05  -2.110e-08   0.000e+00   2.110e-08   1.376e-05  
+##
+## Coefficients:  
+##              Estimate Std. Error z value Pr(>|z|)  
+## (Intercept)   -2292.8  4130902.8  -0.001        1  
+## logCK           315.6   589675.2   0.001        1  
+## H                11.5    21279.6   0.001        1
+
+ +

This results appear to be weird, because it seems that all coefficients are not significant. Also the next question is to do a drop-in-deviance test for this full model and the reduced model that neither of logCK and H is useful predictor. I get:

+ +
fit5 <- glm(Carrier~1, family=""binomial"")  
+1-pchisq(deviance(fit5)-deviance(fit4), df.residual(fit5)-df.residual(fit4))  
+## [1] 0.0009765625
+
+ +

So the p-value indicates that at least one of logCK and H is useful. Then I'm stuck at the next question, it asks me to calculate odds ratio for a woman with (CK, H)=(300,100) over one with (CK, H)=(80, 85).

+ +

But how can I get a meaningful result with all coefficients in this model ranging so wildly? Is there anything wrong with the way I did this logistic regression? Are there any remedial measures?

+",2013-11-04 15:29:17.310 +58829,17573.0,2,,58807.0,,,,CC BY-SA 3.0,"

Picking up on the comment of @user31264, CPI is calculated with a formula like this:
\begin{align}
CPI_t = \frac{\sum_i w_i p_{it}}{\sum_i w_i p_{it^*}}
\end{align}
Price of item $i$ at time $t$ is $p_{it}$. Prices in the baseline year are $p_{it^*}$. Each price's weight in the CPI is $w_i$. These weights are determined from a ""market basket"" which represents an attempt to match the spending habits of a typical US, urban household (assuming you are talking about the CPI-U for the US) --- you can productively think of these weights as representing the quantity of item $i$ bought by an average US household. The weights are not literally these quantities, but it is OK to think about them this way most of the time.

+ +

Some of the items (some of the $i$) are food items, and some are not. Let's denote the food items as $i \in F$ and the non-food items as $i \not\in F$. Then CPI-food and CPI ex food look like:
\begin{align}
CPI_t^F &= \frac{\sum_{i \in F} w_i p_{it}}{\sum_{i \in F} w_i p_{it^*}}\\
 &\strut \\
CPI_t^{\tilde{}F} &= \frac{\sum_{i \not\in F} w_i p_{it}}{\sum_{i \not\in F} w_i p_{it^*}}
\end{align}

+ +

Now, we can do some algebra:
\begin{align}
CPI_t &= \frac{\sum_i w_i p_{it}}{\sum_i w_i p_{it^*}}\\ \strut \\
 &= \frac{\sum_{i \in F} w_i p_{it}+\sum_{i \not\in F} w_i p_{it}}{\sum_iw_ip_{it^*}} \\ \strut\\
 &= \frac{\sum_{i \in F} w_i p_{it^*}}{\sum_iw_ip_{it^*}} \frac{\sum_{i \in F} w_i p_{it}}{\sum_{i \in F} w_i p_{it^*}}
 +\frac{\sum_{i \not\in F} w_i p_{it^*}}{\sum_iw_ip_{it^*}} \frac{\sum_{i \not\in F} w_i p_{it}}{\sum_{i \not\in F} w_i p_{it^*}} \\ \strut\\
 &=\frac{\sum_{i \in F} w_i p_{it^*}}{\sum_iw_ip_{it^*}} \cdot CPI_t^F
 +\frac{\sum_{i \not\in F}w_i p_{it^*}}{\sum_iw_ip_{it^*}}\cdot CPI_t^{\tilde{}F} \\ \strut \\
CPI_t^{\tilde{}F} &= \frac{\sum_iw_ip_{it^*}}{\sum_{i \not\in F}w_i p_{it^*}} \cdot CPI_t
 -\frac{\sum_{i \in F} w_i p_{it^*}}{\sum_{i \not\in F}w_i p_{it^*}} \cdot CPI_t^F
\end{align}

+ +

So, to back out CPI ex food from CPI and CPI food, you divide overall CPI by the proportion of spending on non-food in the baseline year (the year the market basket was measured in), then you subtract off the CPI food times the ratio of food to non-food spending in the baseline/market-basket year.
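
A tiny numerical illustration of that last step (all numbers invented, with the baseline index normalised to 1):

s_food    <- 0.15   # hypothetical baseline share of spending on food
s_nonfood <- 1 - s_food
cpi       <- 1.30   # hypothetical overall CPI at time t
cpi_food  <- 1.50   # hypothetical CPI-food at time t

cpi_exfood <- cpi / s_nonfood - cpi_food * s_food / s_nonfood
cpi_exfood                           # about 1.26

# sanity check: the weighted average of the components recovers overall CPI
s_food * cpi_food + s_nonfood * cpi_exfood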

+ +

You need to be careful that each of the years of CPI data you use are calculated based on the same market basket. The Bureau of Labor Statistics does occasionally update the market basket. Currently, CPI is based on a 2009/10 market basket. Furthermore, if you use a ""chained"" index, the market basket is updated every year, so that the method outlined here does not apply.

+ +

Finally, a caveat. What the BLS actually does to calculate the CPI is more complicated than what I have laid out above. What I have laid out above is the basic idea, but the implementation details are dizzying. Spend a long time reading the various reports and documentation on the BLS website if you want to actually understand.

+",2013-11-04 16:16:58.027 +58832,11808.0,1,,,,Why Bayes Rule in Naive Bayes compared to simple P(class|features),,CC BY-SA 3.0,"

I would like to improve on my recommendation system. Imagine I have training data of $M=7,000,000$ samples. Each training sample contains a variable number of words in the body, and a variable amount of tags assigned to the case. There are roughly 100 million unique words, and roughly 40,000 unique tags.

+ +

The goal is to, given a set of words, to correctly classify the tag(s) when the tags are not observed.

+ +

Intuition tells me to simply compute the probability for each word, and then use the conditional probability for each tag given the set of words for a tag. More concretely:

+ +

Imagine one case has 5 unique words. Each word will have a $P(W_i)$. This is a constant value for the training set, and can be stored in memory. The probability of $T_1$ given $P(W_1, W_2....W_5)$, is then simply the number of occurrences in the intersection of $T_1$ for each word divided by $M$.

+ +

Why does Naive Bayes suggest using (according to the Bayes rule) $\frac{P(W_1..W_5|T_1) * P(T_1)}{P(W_1...W_5)}$ when $P(T_1|W_1...W_5)$ seems easier to obtain, and more intuitive*?

+ +

* +""What is the probability of a tag given some combination of words?"" seems to be exactly what we want to know.

+",2013-11-04 16:52:24.793 +58833,9047.0,2,,58831.0,,,,CC BY-SA 3.0,"
x <- 1:10
+set.seed(42)
+y <- x^2+3 + rnorm(10)
+plot(x,y)
+
+#I assume you actually want this, i.e., fit a quadratic function:
+fit <- lm(y~I(x^2))
+pred.fit <- function(xnew) predict(fit, newdata=list(x=xnew))
+
+#add curve
+curve(pred.fit, from=min(x), to=max(x), n=1e3, add=TRUE)
+
+ +

+",2013-11-04 16:55:23.353 +58834,17573.0,2,,58811.0,,,,CC BY-SA 3.0,"

The answer to 1 is no which makes the answers to all the others not applicable.

+ +

Let me start with your last equation:
\begin{align}
y_i = \alpha + \beta w_i + \epsilon_i
\end{align}

+ +

Now, let's assume that your earlier equations for $y$ and $w$ are valid classical linear regression models, so that $Cov(x,\epsilon_1)=0$ and $Cov(x,\epsilon_2)=0$. I'm not sure what SLR stands for---Simple Linear Regression?

+ +

Anyway, now let's calculate $Cov(w,\epsilon)$ in order to verify whether your new equation is part of a valid classical linear regression model (recall that we need this to be zero):
\begin{align}
Cov(w,\epsilon) &= Cov(w,\epsilon_1-\frac{\beta_1}{\beta_2}\epsilon_2) \\ \strut \\
 &= Cov(w,\epsilon_1) - \frac{\beta_1}{\beta_2}Cov(w,\epsilon_2) \\ \strut \\
 &= Cov(\epsilon_2,\epsilon_1) - \frac{\beta_1}{\beta_2}V(\epsilon_2)
\end{align}

+ +

The second term is not zero unless $\beta_1=0$, and that would make the example pretty silly. Even the first term is not likely to be zero in most physical applications. For that term to be zero, you would have to make the additional assumption that the errors made by the two instruments were completely uncorrelated. You could get wildly lucky (in a stopped-clock-is-right-twice-a-day kind of sense) and the two terms could magically cancel out, but there is no systematic tendency of the two terms to cancel out.

+ +

The bias in estimating $\beta$ will be:
\begin{align}
\frac{Cov(\epsilon_2,\epsilon_1) - \frac{\beta_1}{\beta_2}V(\epsilon_2)}{V(w)}
\end{align}

+ +

Below, I attach a bit of R code which runs a toy Monte Carlo to demonstrate the effect. The theoretical bias in the Monte Carlo is -0.25, and the coefficient we estimate comes out too low by about 0.23, so it demonstrates the point pretty well.

+ +

In general, even if you can't see how to evaluate the bias in an example like this, you can always run a little monte carlo to see what is going on. This is one of the great things about statistical software languages. Monte Carlo simulations are amazingly powerful tools to give you feedback as to whether your ideas are really good or really not.

+ +
# This program written in response to a Cross Validated question
+# http://stats.stackexchange.com/questions/74527/simple-linear-regression-with-a-random-predictor
+
+# The program is a toy monte carlo.
+# It generates a ""true"" but unobservable-to-the-analyst physical state x.
+# Then it generates two measurements of that state from different instruments.
+# Then it regresses one measurement on the other.
+
+set.seed(12344321)
+
+# True state, 1000 runs of the experiment
+x <- rnorm(1000)
+
+# Set the various parameters of the monte carlo
+# Play with these for fun and profit:
+
+alpha_1 <- 0
+alpha_2 <- 0
+beta_1  <- 1
+beta_2  <- 1
+stddev_e1 <- 1
+stddev_e2 <- 1
+corr_e1e2 <- 0.5
+
+# Fallible measurements
+e_1 <- stddev_e1*rnorm(1000)
+e_2 <- stddev_e2*(corr_e1e2*e_1+sqrt(1-corr_e1e2^2)*rnorm(1000))
+y <- alpha_1 + beta_1*x + e_1
+w <- alpha_2 + beta_2*x + e_2
+
+var(data.frame(e_1,e_2))
+var(data.frame(x,w,y))
+
+lm(y~x)
+lm(w~x)
+
+# By the bias formula in the answer, this regression should have a bias of
+# -0.25 = (0.5-1*1)/2.  That is, the coefficient should not be close to 1,
+# the correct value of beta_1/beta_2.  Instead, it should be close 
+# to 0.75 = 1 - 0.25
+
+lm(y~w)
+
+",2013-11-04 17:00:21.280 +58835,20470.0,2,,58832.0,,,,CC BY-SA 3.0,"

As the number of words that you condition $P(T_1|W_1,..,W_N)$ on increases, you will find that calculating this probability will no longer be easy. The more conditions you add, the sparser your conditional probability tables will get.

+ +

Naive Bayes makes the assumption that the probability of words occurrences are independent of each other conditioned on the tag, thereby reducing $P(W_1,...,W_N|T_1)$ to $P(W_1|T_1)*...*P(W_N|T_1)$. This 'naive' independence assumption results in:

+ +

$P(T_1|W_1,..,W_N) \propto P(W_1|T_1) * ... * P(W_N|T_1) * P(T_1) $

+ +

Note how the denominator: $P(W_1,...,W_N)$ is dropped since it only serves as a normalising constant that ensures the probability values you get are between $0$ and $1$. You do not really need it to calculate the $T_i$ that maximises the formula given above.
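
As a toy illustration of that computation (all counts below are invented), the per-tag score can be built directly from word-tag co-occurrence counts:

# count_wt[w, t]: number of training cases containing word w and carrying tag t
count_wt <- matrix(c(30,  5,
                     20, 10,
                      1, 40),
                   nrow = 3, byrow = TRUE,
                   dimnames = list(c('w1', 'w2', 'w3'), c('t1', 't2')))
count_t <- c(t1 = 60, t2 = 70)    # number of cases carrying each tag
words   <- c('w1', 'w2')          # words observed in a new case

# unnormalised naive Bayes score: P(T) * prod_i P(W_i | T)
score <- sapply(names(count_t), function(t)
  (count_t[t] / sum(count_t)) * prod(count_wt[words, t] / count_t[t]))
score / sum(score)                # normalise only if actual probabilities are wanted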

+",2013-11-04 17:15:52.913 +58836,23341.0,1,58908.0,,,How to create forecast data prediction interval bands,,CC BY-SA 3.0,"

I have seasonal data from which I create forecasts. The steps I perform are: deseasonalizing the data, finding the linear regression for the deseasonalized points, predicting a few points from the linear regression and adding seasonality to the predicted values to get forecast data. My input is quite sinusoidal so all works well.

+ +

The problem is that the more in the future you predict, the more prediction errors increase. I'd like to show that on a chart, but I am not sure how to calculate these errors. I was thinking something like prediction interval bands for forecast data (whatever they are called). These bands would increase the further you predict in the future.

+ +

Here are some images that show what I'm trying to do: sample bands image1 and sample bands image2 (the images themselves are not reproduced here).

+ +

My question is what is the name for these bands? (then I can do a google search for it) +I'd also appreciate the formulas needed for the band calculations. I'm guessing there is a standard deviation in there somewhere.

+ +

I've looked at confidence interval, but that seems to be for the data already present, not for the forecast data.

+",2013-11-04 17:47:34.543 +58837,2149.0,2,,58818.0,,,,CC BY-SA 3.0,"

Transformations are like drugs! Some are good for you and some aren't. Haphazard selection of transformations should be studiously avoided.

+ +

a) One of the requirements in order to perform valid statistical tests of necessity is that the variance of the errors from the proposed model must not be proven to be non-constant. If the variance of the errors changes at discrete points in time then one has recourse to Generalized Least Squares or GLM .

+ +

b) If the variance of the errors is linearly relatable to the level of the observed series, then a Logarithmic Transformation might be appropriate. If the square root of the variance of the errors is linearly relatable to the level of the original series, then a Square Root transformation is appropriate. More generally, the appropriate power transformation is found via the Box-Cox procedure, which identifies the optimal lambda. Note that the Box-Cox test is universally applicable and doesn't solely require time series or spatial data.
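
As a minimal sketch of one common way to profile a Box-Cox lambda in R (assuming the MASS package and using the series y defined in the question; this is a generic illustration, not the specific procedure described in this answer):

library(MASS)

# Profile the Box-Cox log-likelihood for a simple trend model of y
bc <- boxcox(y ~ time(y), lambda = seq(-1, 1, 0.05))
bc$x[which.max(bc$y)]   # lambda with the highest profile likelihood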

+ +

All of the above ( a and b ) require that the mean of the errors cannot be proven to differ significantly from zero for all points. If your data is not time series or spatial in nature then the only anomaly you can detect is a pulse. However if your data is time series or spatial then Level Shifts , Seasonal Pulses and/or Local Time Trends might be suggested to render the mean value of the error term to be 0.0 everywhere or at least not significantly different from 0.0 .

+ +

In my opinion one should never willy-nilly transform the data unless one has to in order to satisfy (in part) the Gaussian assumptions. Some econometricians take logs for the simple, and simply wrong, reason of obtaining direct estimates of elasticities rather than assessing the % change in Y for a % change in x from the best model.

+ +

Now one caveat: if one knows from theory, or at least thinks one knows from theory, that transformations are necessary, i.e. proven by previous well-documented research, then by all means follow that paradigm, as it may prove to be more beneficial than the empirical procedures I have laid out here.

+ +

In closing, use the original data, minimize any warping of the results by mindless transformations, test all assumptions and sleep well at night. Statisticians, like doctors, should never do harm to their data/patients by providing drugs/transformations that have nasty and unwarranted side-effects.

+ +

Hope This Helps .

+ +

Data Analysis using time series techniques on time series data:

+ +

A plot of the data (not reproduced here) suggests a series that has structural change, and the Chow test yielded a significant break point. Analysis of the most recent 147 values, starting at 1999/5, yielded a residual plot and ACF (not reproduced here). The forecast plot and the final model (also not reproduced) have all parameters statistically significant and no unwarranted power transformations, which often unfortunately lead to wildly explosive and unrealistic forecasts. Power transforms are justified when it is proven via a Box-Cox test that the variability of the ERRORS is related to the expected value, as detailed here. N.B. that the variability of the original series is not used, but rather the variability of the model errors.

+ +

+",2013-11-04 17:51:13.340 +58838,855.0,2,,24506.0,,,,CC BY-SA 3.0,"

You can automate the process within JMP using the JSL (JMP Scripting Language). Start by selecting Script > Save to Script Window from the analysis contextual red-triangle menu. That will give you a script to rerun the platform. You can also issue other commands to the report object that the script creates, such as to save the prediction interval or fit formula.

+",2013-11-04 17:53:08.993 +58839,22710.0,1,,,,Is AIC appropriate for model selection when the parameters are fitted by least-squares rather than MLE,,CC BY-SA 3.0,"

I want to compare the fit of a linear model (M1) and nonlinear model (M2):

+ +
  • M1: $y = b_0 + b_1x_1 + b_2x_2 + b_3x_1x_2 + \epsilon, \epsilon \sim N(0, \sigma^2)$
  • M2: $y = b_0 + b_1x_1 + b_2x_2 + b_1 b_2x_1x_2 + \epsilon, \epsilon \sim N(0, \sigma^2)$
+ +

In particular I want to know whether M1 is significantly different from M2.

+ +

To estimate the parameters, I am minimizing the least-squares errors rather than maximizing the likelihood through MLE procedures. In particular, I am using the R function nls() as follow:

+ +
# Creating a sample data set
+n <- 50
+x1 <- rnorm(n, 1:n, 0.5)
+x2 <- rnorm(n, 1:n, 0.5) 
+b0 <- 1
+b1 <- 0.5
+b2 <- 0.2
+y <- b0 + b1*x1 + b2*x2 + b1*b2*x1*x2 + rnorm(n,0,0.1)
+# Actual model fit
+M1 <- nls(y ~ b0 + b1*x1 + b2*x2 + b3*x1*x2, start=list(b0=1, b1=0.5, b2=0.5, b3=0.5))
+M2 <- nls(y ~ b0 + b1*x1 + b2*x2 + b1*b2*x1*x2, start=list(b0=1, b1=0.5, b2=0.5))
+
+ +

I want to compare the models using a measure of relative fit such as AIC, which can be done in R as follow:

+ +
AIC(M1, M2)
+   df       AIC
+M1  5 -88.47849
+M2  4 -90.46491
+
+ +

Because $\Delta AIC \approx 2$ and the models differ by only one parameter, I would conclude that both of them fit the data similarly well.

+ +

In addition, I want to know whether the parameter $b_3$ from M1 significantly add to the fit using a statistical test such as an F-test. This can be done in R as follow:

+ +
anova(M1, M2)
+Analysis of Variance Table
+
+Model 1: y ~ b0 + b1 * x1 + b2 * x2 + b3 * x1 * x2
+Model 2: y ~ b0 + b1 * x1 + b2 * x2 + b1 * b2 * x1 * x2
+  Res.Df Res.Sum Sq Df      Sum Sq F value Pr(>F)
+1     46    0.40843                              
+2     47    0.40855 -1 -0.00011097  0.0125 0.9115
+
+ +

My general question is:

+ +
    +
  • Are these analyses appropriate?
  • +
+ +

More specifically:

+ +
    +
  • Can I use AIC to compare least-squares fitted models?
  • +
+ +

From a few posts such as this one it looks like AIC should be appropriate. However, I've seen posts such as this one that indicates that using AIC on non-MLE fitted models might be a problem. I understand that least-squares is equivalent to MLE if the error is normally distributed, but is this true even for non-linear models?

+ +
    +
  • Can I use a F-test to test whether $b_3$ is significantly different from $b_1 b_2$?
  • +
+ +

I know such F-test makes sense if the model are nested, but I'm unsure whether it is appropriate in this case.

+",2013-11-04 17:58:04.380 +58856,21985.0,1,,,,Likelihood ratio for normal distribution,,CC BY-SA 3.0,"

Let $X_1, \dots , X_n \sim \mathrm{N}(\mu,\sigma^2)$. $\sigma^2$ is known. We want to test $\mathrm{H}_0: \mu = 0$ versus $\mathrm{H}_1: \mu > 0$.

+ +

For the likelihood ratio I got: $\Lambda_1 = \exp(\frac{n(\mu_0^2 - \mu_1^2)}{2 \sigma^2}) \cdot \exp(\frac{\mu_1 - \mu_0}{\sigma^2} \cdot \sum x_i)$. Where the first term is a constant. Hope this is correct.

+ +

Now we know that for the mean of a normal random sample, $T(X_{1:n}) = \sum X_i$ is a sufficient statistic. I have to rewrite $\Lambda_1$ as a function of $T$, which gives me $\Lambda_2$. Can I now just exchange $\sum x_i$ in $\Lambda_1$ with $T$?

+ +

Another question is what I can say about the rejection region of $\Lambda_1$ and $\Lambda_2$, keeping in mind that $\mu$ is zero or bigger. I do not know what is meant here...

+",2013-11-04 22:01:17.580 +58857,594.0,2,,58796.0,,,,CC BY-SA 3.0,"

Here's the situation as I understand it:

+ +

1) you want to test for equality of spread with heavy tailed distributions.

+ +

2) you assume they will only differ in spread.

+ +

3) you have a measure of spread already in mind (median absolute deviation); what you need is a single statistic that identifies how much those spreads are unalike (since it's a measure of scale, their ratio is a reasonable choice, but in fact for what I am going to suggest we're going to be able to do something simpler).

+ +

4) the usual assumptions like independence etc apply.

+ +

(5) it sounds like you are using R; if I give any example code I'll use R also)

+ +

With these assumptions, under the null, the distributions of the two samples are the same; the null is a test of identical distributions, the test statistic is a measure that's sensitive to a particular kind of deviation from identical distributions.

+ +

If you want a test where dispersion is measured by median absolute deviation, then in these circumstances you could just do a permutation test (or randomization test if the samples are too large to evaluate all permutations).

+ +

Because it's a permutation test, we don't have to take the ratio of mads, anything whose size is sensitive to deviations from the null would also work. In this case I'm going to discuss looking at the mad of the smaller-sized sample, in the hopes of calculating a bit faster (though calculation speed shouldn't be the only consideration, it allows me to make some useful points about what we're doing along the way).

+ +

If the null were true, the labels for the two groups are 'arbitrary' - we could combine the samples and randomly relabel the observations without altering their underlying distribution. However, under the alternative, the labels signify something important - that we are sampling from distributions that differ in spread. In that case, the sample test statistic won't be 'typical' of the permutation distribution.

+ +

Basic outline of how to do a test of equality of distributions using a specified statistic

+ +

Permutation test:

+ +
  1. Combine both samples and allocate the set of group labels in all possible combinations to find the distribution of the test statistic under the null of identical distributions, calculating the test statistic each time.

  2. Find how far 'in from the end(s)' the sample value occurs, and calculate the proportion of null values from the permutation distribution that are at least as extreme as it (the p-value).

  3. If the sample statistic is sufficiently extreme in the null distribution (if that p-value is low enough), reject the null.

In this case I assume you want a two-tailed test and then (with the mad of the smaller group as the statistic) we have the problem of what to count up to in the other tail (this would be easy with the ratio, since we just take the reciprocal). However we'll take the ratio to the combined mad (which doesn't change so we don't have to sample that), and use the reciprocal from that to get the cutoff in the other tail.

+ +

The randomization test is basically the same as the above but instead of evaluating all possible permutations it simply samples (with replacement) from the set of permutations.

+ +

[Another alternative would be to use bootstrapping; in that case you're seeing whether a ratio of median absolution deviations of 1 could plausibly have come from the bootstrap distribution of the ratio. In this case you would resample with replacement within groups and compute the ratio each time, and see whether the null value is consistent with it (would be inside a confidence interval for the ratio). But bootstrapping tends not to work so well on medians, and you would require fairly large samples. A smoother (but similarly robust) statistic might work better.]

+ +
+ +

Example (here I've chosen the sample sizes to be just a little bit too large to do a permutation test; there's 3.2 billion combinations - you could do it if you were determined to get the exact permutation distribution, but we won't)

+ +
# create some data with heavy tail and different spread
+set.seed(3498732)
+nrand <- 100000
+x <- rt(20,df=3)*5+50
+y <- rt(15,df=3)*20+50
+print(x,d=4)
+print(y,d=4)
+xy <- c(x,y)
+
+# do the randomization
+madrand <- replicate(nrand,mad(sample(xy,15),constant=1))
+mm <- mad(xy,constant=1)
+t <- mad(x,constant=1)/mm
+tr <- range(t,1/t)
+madrand <- madrand/mm
+hist(madrand,n=200)
+abline(v=mad(x,constant=1)/mm,col=3)
+abline(v=1/(mad(x,constant=1)/mm),col=3)
+
+(pvalue <- (sum(madrand <= tr[1])+sum(madrand >= tr[2]))/nrand)
+
+ +

This produces a p-value of 0.114 (with a binomial standard error of 0.001); the histogram looks like this:

+ +

+ +

[This looks 'rough' like I haven't sampled enough. Not so - the exact permutation distribution will look like this too; it's a result of using a median (of the absolute deviations from the median) on small samples -- the resulting statistic has a somewhat ""clumpy"" distribution even though the original data was continuous.]

+ +

If I'd been clever I'd have used a constant in the calls to mad of 1/mm.

+ +

If we wanted to retain the scale of the mad in our results, we could still do the other tail by computing mm^2/mad(x,constant=1) as the cutoff and get the same resulting p-value.

+ +

Hopefully that's clear enough, but I can elaborate if needed.

+ +

The ratio of mads that I originally discussed can be done almost as easily - the sampling will be slower (it will take maybe twice as long to run) but the fiddling around at the end would be simpler.

+ +
+ +

Edit: If you're not wedded to the mad, the package exactRankTests has some alternative robust permutation tests of equality of spread.

+",2013-11-04 22:39:54.083 +58840,23343.0,1,,,,Correct calculation of effect size after Friedman test in a skewed population,,CC BY-SA 3.0,"

I have some data that compares behaviour in four conditions (repeated measures). I have several different behavioural variables, each behaviour gets analysed separately, and the behaviours are currently in % of time format. The behavioural variables are very non-normal, even after transformation, so I have run Friedman tests for the omnibus tests (friedman.test in R). For post hocs I have used friedman from the agricolae package, but I would also like to report effect sizes. I am not sure which is the correct calculation to use, because the data is not only non-normal but also very positively skewed. I have tried the rFromWilcox function from Field, but I am not sure it is the right one to use; also I am getting r results such as -0.564NULL. I am quite new to doing my own analysis, but keen to learn, so please don't be offended if I haven't phrased something quite right.

+ +

@Michael: I am trying to determine if certain behaviours occur more or less often under the different conditions, and where those differences are (nothing too deep). Two similar but different treatments are applied between condition one and two, and between three and four. I am predicting that there will be certain changes (some behaviours will increase and some will decrease) between all conditions but particularly between one and two, and three and four. My posthocs test between all pair combinations and I have used a Bonferroni correction. I was planning on reporting the summary stats, for each condition for each behaviour, the test statistic and p value, and posthoc results where Friedman is significant, and effect sizes on those posthocs.

+",2013-11-04 18:03:06.073 +58841,9804.0,1,58852.0,,,Probability as a dependent variable in a time-series regression,,CC BY-SA 3.0,"

Are there any issues with running a Newey-West time-series regression on a dependent variable that is a probability? What biases am I facing? I can't find anything online that can help me with this issue.

+",2013-11-04 18:13:22.133 +58842,20828.0,1,58851.0,,,Bayesian model averaging for variable selection in R,,CC BY-SA 3.0,"

I am trying to use Bayesian model averaging for variable selection with a large number of variables. In R, the BMS package allows to apply the method, with the option of using MCMC sampler (Metropolis Hastings algorithm) when the number of covariates is large.

+ +

Here is a sample code:

+ +
 data(datafls)
+ fls1 = bms(datafls, burn = 50000, iter=100000, g = ""BRIC"", mprior = ""uniform"", nmodel = 2000, mcmc=""bd"", user.int=F)
+ result = coef(fls1)
+
+ +

However, if you run the same code twice, the results (i.e. the posterior probabilities) would be completely different. Does anyone know how to tune the code, so that the results are consistent for every run?

+ +

Thank you,

+",2013-11-04 18:52:57.227 +58843,5448.0,2,,58822.0,,,,CC BY-SA 4.0,"

Let's consider a very simple model: $y = \beta x + e$, with an L1 penalty on $\hat{\beta}$ and a least-squares loss function on $\hat{e}$. We can expand the expression to be minimized as:

+ +

$\min y^Ty -2 y^Tx\hat{\beta} + \hat{\beta} x^Tx\hat{\beta} + 2\lambda|\hat{\beta}|$

+ +

Keep in mind this is a univariate example, with $\beta$ and $x$ being scalars, to show how LASSO can send a coefficient to zero. This can be generalized to the multivariate case.

+ +

Let us assume the least-squares solution is some $\hat{\beta} > 0$, which is equivalent to assuming that $y^Tx > 0$, and see what happens when we add the L1 penalty. With $\hat{\beta}>0$, $|\hat{\beta}| = \hat{\beta}$, so the penalty term is equal to $2\lambda\beta$. The derivative of the objective function w.r.t. $\hat{\beta}$ is:

+ +

$-2y^Tx +2x^Tx\hat{\beta} + 2\lambda$

+ +

which evidently has solution $\hat{\beta} = (y^Tx - \lambda)/(x^Tx)$.

+ +

Obviously by increasing $\lambda$ we can drive $\hat{\beta}$ to zero (at $\lambda = y^Tx$). However, once $\hat{\beta} = 0$, increasing $\lambda$ won't drive it negative, because, writing loosely, the instant $\hat{\beta}$ becomes negative, the derivative of the objective function changes to:

+ +

$-2y^Tx +2x^Tx\hat{\beta} - 2\lambda$

+ +

where the flip in the sign of $\lambda$ is due to the absolute value nature of the penalty term; when $\beta$ becomes negative, the penalty term becomes equal to $-2\lambda\beta$, and taking the derivative w.r.t. $\beta$ results in $-2\lambda$. This leads to the solution $\hat{\beta} = (y^Tx + \lambda)/(x^Tx)$, which is obviously inconsistent with $\hat{\beta} < 0$ (given that the least squares solution $> 0$, which implies $y^Tx > 0$, and $\lambda > 0$). There is an increase in the L1 penalty AND an increase in the squared error term (as we are moving farther from the least squares solution) when moving $\hat{\beta}$ from $0$ to $ < 0$, so we don't, we just stick at $\hat{\beta}=0$.

+ +

It should be intuitively clear the same logic applies, with appropriate sign changes, for a least squares solution with $\hat{\beta} < 0$.

+ +

With the least squares penalty $\lambda\hat{\beta}^2$, however, the derivative becomes:

+ +

$-2y^Tx +2x^Tx\hat{\beta} + 2\lambda\hat{\beta}$

+ +

which evidently has solution $\hat{\beta} = y^Tx/(x^Tx + \lambda)$. Obviously no increase in $\lambda$ will drive this all the way to zero. So the L2 penalty can't act as a variable selection tool without some mild ad-hockery such as ""set the parameter estimate equal to zero if it is less than $\epsilon$"".

+ +

Obviously things can change when you move to multivariate models, for example, moving one parameter estimate around might force another one to change sign, but the general principle is the same: the L2 penalty function can't get you all the way to zero, because, writing very heuristically, it in effect adds to the ""denominator"" of the expression for $\hat{\beta}$, but the L1 penalty function can, because it in effect adds to the ""numerator"".
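
For readers who want to see this numerically, a small sketch with glmnet (assuming the package is available; the data are simulated and the lambda value is arbitrary) shows the L1 penalty sending coefficients exactly to zero while the L2 penalty only shrinks them:

library(glmnet)

set.seed(1)
n <- 100; p <- 10
x <- matrix(rnorm(n * p), n, p)
y <- x[, 1] - 2 * x[, 2] + rnorm(n)

lasso <- glmnet(x, y, alpha = 1, lambda = 0.5)   # L1 penalty
ridge <- glmnet(x, y, alpha = 0, lambda = 0.5)   # L2 penalty

b_lasso <- as.numeric(coef(lasso))
b_ridge <- as.numeric(coef(ridge))
sum(b_lasso[-1] == 0)   # several slopes are exactly zero
sum(b_ridge[-1] == 0)   # typically none are exactly zero, only shrunk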

+",2013-11-04 18:59:26.697 +58844,18358.0,2,,58780.0,,,,CC BY-SA 4.0,"

The first graph you describe here is a tripartite graph, which means it has three types of nodes, and links only between nodes of different types. The second graph you describe, containing only user nodes, is the result of the so-called projection over the user dimension. However, performing such an operation results in a loss of information, because several multipartite graphs can lead to the same projection (as shown in Guillaume'06 for bipartite graphs). So, it is better to directly detect communities in the original graph.

+

However, certain questions arise if you want to do so. The most important is: what is a community for you? A group of nodes of any type, or a group of same-type nodes? Depending on this, you can apply various methods. A lot of them were developped to handle folksonomy graphs, in which the three types of nodes respectively represent users, annotations and shared resources (the users associated annotations to resources). But they can be applied to networks representing other systems, such as yours.

+

Here're some community detection algorithms designed for tripartite graphs (the list is obviously not complete):

+ +

However, I don't think any implementation is freely available, if that's what you're looking for. Also, note there's a generalized version of the modularity (see Murata'10), defined for tripartite network. The modularity is a metric measuring the "quality" of a community structure. Many community detection methods are based on the optimization of this metric. If you want to program your own tool, implementing this modularity and then applying a classic optimization method might be the easiest way.

+",2013-11-04 19:06:17.763 +58845,23345.0,1,,,,Posterior probability,,CC BY-SA 3.0,"

Suppose that we have have scoring functions $f(\textbf{x})$ and $g(\textbf{x})$ for classifying an object as red or blue. These are based on linear discriminant analysis. So if $f(\textbf{x}) > g(\textbf{x})$, then the object is red. Why is the posterior probability that an object is red the following:

+ +

$$\frac{\exp(f(\textbf{x}))}{\exp(f(\textbf{x}))+\exp(g(\textbf{x}))}$$

+ +

Why do we exponentiate the functions?

+",2013-11-04 19:33:36.103 +58858,13037.0,2,,58849.0,,,,CC BY-SA 3.0,"

I think I got it from the hints you guys gave:

+ +

$$\begin{align*}
P(|X_{(1)} -\theta| > \epsilon) &= P(X_{(1)} -\theta > \epsilon)\\
&= P(X_{(1)} > \epsilon + \theta) \\
&= P(X_1 > \epsilon+\theta)^n\\
&= (1-\epsilon^2)^n
\end{align*} $$

+ +

and $\lim_{n\to\infty}{(1-\epsilon^2)^n}=0$ for $\epsilon \in (0,1)$.

+",2013-11-04 22:53:44.773 +58859,668.0,2,,58818.0,,,,CC BY-SA 3.0,"

This question is answered beautifully by means of a spread-versus-level plot: a cube root transformation will stabilize the spreads of the data, providing a useful basis for further exploration and analysis.

+ +
+ +

The data show a clear seasonality:

+ +
plot(y)
+
+ +

+ +

Take advantage of this by slicing the data into annual (or possibly biennial) groups. Within each group compute resistant descriptors of their typical value and their spread. Good choices are based on the 5-letter summary, consisting of the median (which splits the data into upper and lower halves), the medians of the two halves (the ""hinges"" or ""fourths""), and the extremes. Because the extremes are not resistant to outliers, use the difference of the hinges to represent the spread. (This ""fourth-spread"" is the length of a box in a properly constructed box-and-whisker plot.)

+ +
spread <- function(x) {
+  n <- length(x)
+  n.med <- (n + 1)/2
+  n.fourth <- (floor(n.med) + 1)/2
+  y <- sort(x)[c(floor(n.fourth), ceiling(n.fourth), 
+               floor(n+1 - n.fourth), ceiling(n+1 - n.fourth))]
+  return( y %*% c(-1,-1,1,1)/2 )
+}
+years <- floor((1:length(x) - 1) / 12)
+z <- split(x, years)
+boxplot(z, names=(min(years):max(years))+1976, ylab=""y"")
+
+ +

+ +

The boxplots clearly get longer over time as the level of the data rises. This heteroscedasticity complicates analyses and interpretations. Often a power transformation can reduce or remove the heteroscedasticity altogether.

+ +

A spread versus level plot shows whether a power transformation (which includes the logarithm) will be helpful for stabilizing the spread within the groups and suggests an appropriate value for the power: it is directly related to the slope of the spread-vs.-level plot on log-log scales.

+ +
z.med <- unlist(lapply(z, median))
+z.spread <- unlist(lapply(z, spread))
+fit <- lm(log(z.spread) ~ log(z.med))
+plot(log(z.med), log(z.spread), xlab=""Log Level"", ylab=""Log Spread"", 
+     main=""Spread vs. Level Plot"")
+abline(fit, lwd=2, col=""Red"")
+
+ +

+ +

This plot shows good linearity and no large outliers, attesting to a fairly regular relationship between spread and level throughout the time period.

+ +

When the fitted slope is $p$, the power to use is $\lambda=1-p$. Upon applying the suggested power transformation, the spread is (approximately) constant regardless of the level (and therefore regardless of the year):

+ +
lambda <- 1 - coef(fit)[2]
+boxplot(lapply(z, function(u) u^lambda), names=(min(years):max(years))+1976, 
+        ylab=paste(""y^"", round(lambda, 2), sep=""""),
+        main=""Boxplots of Re-expressed Values"")
+
+ +

+ +
plot(y^lambda, main= ""Re-expressed Values"", ylab=paste(""y^"", round(lambda, 2), sep=""""))
+
+ +

+ +

Often, powers that are reciprocals of small integers have useful or natural interpretations. Here, $\lambda = 0.32$ is so close to $1/3$ that it may as well be the cube root. In practice, one might choose to use the cube root, or perhaps round it to the even simpler fraction $1/2$ and take the square root, or sometimes go all the way to the logarithm (which corresponds to $\lambda = 0$).

+ +
+ +

Conclusions

+ +

In this example, the spread-versus-level plot (by virtue of its approximate linearity and lack of outliers) has shown that a power transformation will effectively stabilize the spread of the data and has automatically suggested the power to use. Although powers can be computed using various methods, none of the standard methods provides the insight or diagnostic power afforded by the spread-versus-level plot. This should be in the toolkit of every data analyst.

+ +
+ +

References

+ +

Tukey, John W. Exploratory Data Analysis. Addison-Wesley, 1977.

+ +

Hoaglin, David C., Frederick Mosteller, and John W. Tukey, Understanding Robust and Exploratory Data Analysis. John Wiley and Sons, 1983.

+",2013-11-04 22:59:09.357 +26070,5911.0,1,26657.0,,,Euclidean distance is usually not good for sparse data (and more general case)?,,CC BY-SA 4.0,"

I have seen somewhere that classical distances (like Euclidean distance) become weakly discriminant when we have multidimensional and sparse data. Why? Do you have an example of two sparse data vectors where the Euclidean distance does not perform well? In this case which similarity should we use?

+",2012-06-01 13:55:13.253 +58846,21762.0,2,,58798.0,,,,CC BY-SA 3.0,"

This answer will not be very complete.

+ +

1) If the performance scores are reliable (in the sense that they do not change considerably when measured repeatedly within worker), then there is absolutely no point in doing statistical inference (i.e. estimates, tests, confidence intervals) about population parameters, because your sample is equivalent to the corresponding population. Your descriptive presentation of means and variances could be complemented by adding medians, quartiles and boxplots etc.

+ +

2) If the performance scores are not very reliable, then you could think of a very simple model of the form
$$
\text{Performance} = \alpha + \beta \cdot \text{Gender} + \varepsilon
$$
with Gender = 1 if female and 0 if male. The random error $\varepsilon$ accounts for the fact that performance varies within person (depending on day, mood etc.) and that the true ""gender-effect"" $\beta$, i.e. the true average overperformance of women, is not exactly $\hat\beta = 56.10 - 26.12 = 29.98$ but might change from day to day. With a two-sample t-test (with Welch correction for unequal variances) you might test whether $\beta \ne 0$. If the p-value is below 5%, you can assert with 95% confidence that women systematically outperform men. (We could now discuss whether we should pick another two-sample test, or log-transform the data, etc.)

+ +

3) As almost always when men are compared to women, the question of confounding arises: Is the difference really due to gender, or can it be explained at least partially by other factors such as age? It's possible that in your company women are typically older than men, which could already partially explain the difference in performance. One simple way to try to adjust for confounding is to modify the simple model from 2) to
$$
\text{Performance} = \alpha + \beta' \cdot \text{Gender} + \gamma \cdot \text{Age} + \varepsilon
$$
and infer about the age-corrected gender effect $\beta'$ by linear regression.
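
In R, the crude comparison from 2) and the age-adjusted comparison from 3) could be sketched like this (assuming a data frame d with columns performance, gender and age; the names are illustrative only):

# Crude gender effect (2): Welch two-sample t-test
t.test(performance ~ gender, data = d)

# Age-adjusted gender effect (3): linear regression
fit <- lm(performance ~ gender + age, data = d)
summary(fit)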

+ +

An alternative to the modelling approach is matching by age followed by a paired t-test (or similar). This also adjusts for potential age-confounding but leads to another estimated ""gender-effect"" $\hat \beta''$. Depending on the variability of the performance difference between pairs (which is directly linked to the intra-pair correlation) and the difference between $\hat \beta$ and $\hat \beta''$, the power of such test is higher or lower than the two-sample test in 2).

+ +

Summa summarum: Matching leads to inference about a confounder-adjusted effect of gender, whereas the unmatched comparison infers about the ""crude"" gender effect without considering any confounders. Both parameters might be interesting, so you could easily summarize both of them.

+",2013-11-04 19:57:30.790 +58847,22144.0,1,,,,Imbalanced data classification using boosting algorithms,,CC BY-SA 3.0,"

I am working on a binary classification problem. The dataset is imbalanced: it consists of 92% 'false' labels and 8% 'true' labels. There are 18 features and only about 650 data points. I want to use boosting algorithms in MATLAB, such as GentleBoost, to solve this problem. I assign a uniform prior as follows:

+ +
ada = fitensemble(Xtrain,Ytrain,'GentleBoost',10,'Tree','LearnRate',0.1, 'prior', 'uniform')
+
+ +

but the performance is consistently poor. How should I set the parameters? Is it necessary to set a cost, and if so, how can I do this? Is there any classifier that performs better than this?

+",2013-11-04 20:25:27.970 +58848,13037.0,1,58883.0,,,Convergence in probability of reciprocal,,CC BY-SA 3.0,"

This is a homework problem. If $X_n$ converges in probability to 1, show $X_n^{-1}$ converges in probability to 1.

+ +

My attempt:

+ +

$$\begin{align*} P(|X_n^{-1}-1| > \epsilon) &= P(|X_n^{-1}-X_n + X_n-1| > \epsilon)\\
 &\leq P(|X_n^{-1} - X_n| > \epsilon/2) + P(|X_n - 1| > \epsilon/2)\\
 &= \end{align*} $$

+ +

I know I can bound the 2nd term, but I am not sure how to bound the first term. Perhaps another approach is necessary. Any suggestions would be appreciated.

+",2013-11-04 20:45:14.520 +58849,13037.0,1,58858.0,,,Convergence in probability of minimum,,CC BY-SA 3.0,"

This is a homework problem. Suppose we have a random sample $X_1,\ldots,X_n \overset{iid}{\sim} F$ with density $f(x) = 2(x-\theta)$ for $x\in (\theta,\theta+1)$. Let $X_{(1)} = \min{\{X_1,\ldots,X_n\}}$. Show $X_{(1)} \overset{P}{\longrightarrow} \theta$.

+ +

My attempt:

+ +

A few calculations first:

+ +

$F(x) = (x-\theta)^2$ for $x\in(\theta,\theta+1)$

+ +

$f_{X_{(1)}} = n(1-F(x))^{n-1}f(x) = 2n(x-\theta)(1-(x-\theta)^2)^{n-1}$

+ +

$$ \begin{align*} P(|X_{(1)} -\theta| > \epsilon) &= P(|X_1-\theta|> \epsilon, \ldots, |X_n-\theta|> \epsilon)\\
&= P(|X_1-\theta|> \epsilon)^n\\
&= \end{align*}$$

+ +

Not exactly sure where to go from here. Any suggestions would be appreciated.

+",2013-11-04 20:54:16.367 +58850,22144.0,1,,,,Imbalanced data classification using Random Forest in matlab,,CC BY-SA 3.0,"

I want to solve an imbalanced data classification problem with a small number of data points (approximately 600) and a ratio of true to false labels of about 1:12. Is there any function or MATLAB code for using random forests to classify imbalanced data? How should I access it and set its parameters? Thank you for your help.

+",2013-11-04 20:54:41.707 +58851,6204.0,2,,58842.0,,,,CC BY-SA 3.0,"

This is an artifact of the stochasticity of MCMC sampling. The best you can do is set a specific seed before each run, using set.seed():

+ +
set.seed(123)
+fls1 = bms(datafls, burn = 50000, iter=100000, g = ""BRIC"", mprior = ""uniform"", nmodel = 2000, mcmc=""bd"", user.int=F)
+
+set.seed(123)
+fls2 = bms(datafls, burn = 50000, iter=100000, g = ""BRIC"", mprior = ""uniform"", nmodel = 2000, mcmc=""bd"", user.int=F)
+
+ +

This causes the initial conditions of R's random number generator to be the same before you run your samplers, so the results of fls1 and fls2 will be the same.

+",2013-11-04 21:06:03.067 +58852,23348.0,2,,58841.0,,,,CC BY-SA 3.0,"

For dependent variables distributed continuously between 0 and 1, you'll want to use a beta regression. I don't know what statistical software you are using, but this article should be helpful, especially if you are using R: http://cran.r-project.org/web/packages/betareg/vignettes/betareg.pdf

+ +

In order to help control for auto-correlation, try including a lagged version of the outcome variable as a predictor in the model. Then you can run a set of diagnostics to see if auto-correlation is still a problem.
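
A minimal sketch of that idea (the data frame d and variable names below are invented; d is assumed to hold the probability series p and any predictors):

library(betareg)

d$p_lag <- c(NA, head(d$p, -1))              # one-period lag of the outcome
fit <- betareg(p ~ p_lag + x1 + x2, data = d)
summary(fit)

# quick autocorrelation check on the residuals
acf(residuals(fit))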

+",2013-11-04 21:09:46.613 +58853,16046.0,1,,,,Can posterior distribution for a continuous variable be greater than one?,,CC BY-SA 3.0,"

I already asked this question here, but I am not sure where it would be better to ask it. This might sound like a dumb question, but I am really confused about it. According to Bayes' rule we have the following:
$$p(\theta|X)=\frac{p(\theta)p(X|\theta)}{\int{p(\theta)p(X|\theta)d\theta}}$$
I know that a probability density function can be greater than one in general, but it seems to me that because there exist discrete sums approximating the denominator integral that are greater than the numerator, the posterior probability density function cannot be greater than one at any point. Is this correct?

+ +

To explain reasoning a bit in more detail:

+ +

Suppose we are interested in $p(\theta=\theta_0|X)$ and we know that:
$$\int{p(\theta)p(X|\theta)}d\theta\approx\sum\limits_{n}{p(\theta_i)p(X|\theta_i)}$$
Now consider only the sums whose terms include $p(\theta_0)p(X|\theta_0)$. Then the denominator will obviously be larger than the numerator.

+",2013-11-04 21:52:19.260 +58854,23349.0,2,,58822.0,,,,CC BY-SA 3.0,"

Suppose we have a data set with y = 1 and x = [1/10 1/10] (one data point, two features). One solution is to pick only one of the features; another solution is to weight both features. That is, we can pick either w = [10 0] or w = [5 5].

+ +

Note that for the L1 norm both have the same penalty, but the more spread out weight has a lower penalty for the L2 norm.

+",2013-11-04 21:59:58.647 +58860,7246.0,1,58864.0,,,"Are non-replacing draws from an infinite, non-shuffling pool independent?",,CC BY-SA 3.0,"

An infinite number of opaque jars are each filled with N spheres, p N of which are gold and (1 - p) N of which are tungsten (0 ≤ p < 1; N > 0). You have drawn from a jar x spheres without replacement, all of which were made of tungsten (0 < x < N). N and E[p] are constant across jars.

+ +

Should you (a) keep drawing from the same jar, (b) switch to a different jar (assuming zero switching cost) or (c) not care?

+ +

Why I'm asking: trying to figure out if the probability of finding a BTC block is always independent between hashes. A block is ""found"" when the hash of the block header is below a threshold. Three variables in the block header: the nonce value, time stamp, and coinbase address. The nonce value and time stamps are in a manageably finite discrete space. The coinbase address is in a practically infinite discrete space.

+",2013-11-04 23:10:10.990 +58861,1804.0,1,59515.0,,,Question on calculating power for a prediction based study,,CC BY-SA 3.0,"

I've calculated the required power for a study in the past, but I've come upon a scenario that I can't quite figure out. Essentially, I have a procedure that results in a particular complication 15% of the time. My study uses an imaging intervention on all study participants that we believe can predict the occurrence of that complication >= 50% of the time. How would I go about calculating the required power/number of patients needed for a study like this?

+ +

Thank you.

+",2013-11-04 23:13:24.217 +58862,9554.0,2,,57755.0,,,,CC BY-SA 3.0,"

First of all, this is a great description of your project and of the problem. And I am a big fan of your home-made measurement framework, which is super cool... so why on earth does it matter what you call ""averaging the integrals""?

+ +

In case you are interested in some broader positioning of your work, what you would like to do is often referred to as anomaly detection. In its simplest setting it involves comparing a value in a time series against the standard deviation of the previous values. The rule is then
$$x[n] > \alpha\, SD(x[1:n-1]) \Rightarrow x[n]\text{ is an outlier}$$
where $x[n]$ is the $n^{th}$ value in the series, $SD(x[1:n-1])$ is the standard deviation of all previous values between the $1^{st}$ and $(n-1)^{th}$ value, and $\alpha$ is some suitable parameter you pick, such as 1 or 2, depending on how sensitive you want the detector to be. You can of course adapt this formula to work only locally (on some interval of length $h$):
$$x[n] > \alpha\, SD(x[n-h-1:n-1]) \Rightarrow x[n]\text{ is an outlier}$$
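
A minimal R sketch of this local rule (alpha and h are tuning choices; x is whatever series of values or deltas is being monitored, so the input name is only a placeholder):

flag_outliers <- function(x, alpha = 2, h = 20) {
  flags <- rep(FALSE, length(x))
  for (n in seq_along(x)) {
    if (n > 3) {                              # need a few earlier points for the SD
      window <- x[max(1, n - h - 1):(n - 1)]
      flags[n] <- x[n] > alpha * sd(window)
    }
  }
  flags
}

# e.g. on the series of measured deltas (hypothetical input vector 'deltas'):
# which(flag_outliers(deltas))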

+ +

If I understood correctly, you are looking for a way to automate the testing of your devices, that is, declare a device as good/faulty after it performed the entire test (drew the entire diagonal). In that case simply consider the above formulas as comparing $x[n]$ against the standard deviation of all values.

+ +

There are also other rules you might want to consider for the purpose of classifying a device as faulty:

+ +
  • if any deviation (delta) is greater than some multiple of the SD of all deltas
  • if the square sum of the deviations is larger than a certain threshold
  • if the ratio of the sums of the positive and negative deltas is not approximately one (which might be useful if you prefer smaller errors in both directions rather than a strong bias in a single direction)

Of course you can find more rules and concatenate them using boolean logic, but I think you can get very far with the three above.

+ +

Last but not least, once you set it up, you will need to test the classifier (a classifier is a system/model mapping an input to a class, in your case the data of each device, to either ""good"", or ""faulty""). Create a testing set by manually labelling the performance of each device. Then look into ROC, which basically tells you the offset between how many devices your system correctly picks up out of the returned, in relation to how many of the faulty devices it picks up.

+",2013-11-04 23:31:30.570 +58863,23355.0,1,,,,Visualizing/explaining multilevel model (dichotomous IV),,CC BY-SA 3.0,"

I am using multilevel modeling (xtmixed in Stata) to predict a quasi-continuous level-3 DV using a dichotomous IV (sex). My results are markedly different from what the basic difference between men and women would suggest. No problem there - this is why I'm using MLM and not OLS regression or a t-test.

+ +

However, I'm having trouble describing the results. In particular, one of my graphs shows the difference in means, and it's more than a little confusing when you compare that to the formal test results. Does anyone know of a way to graphically present this kind of information in an intuitive way, or explain the apparent disparity of results without getting too technical?

+",2013-11-04 23:35:58.820 +58864,1889.0,2,,58860.0,,,,CC BY-SA 3.0,"

It depends on whether (i) there are exactly $pN$ gold spheres at the start in your jar, or (ii) each of the $N$ spheres had an independent probability $p$ of being gold when put into the jar.

+ +

In case (i), you should stick with the jar you have partly emptied. The probability the next sphere you draw from it is gold is $\frac{pN}{N-x}$ which is greater than the probability $p$ that a sphere drawn from another jar is gold.

+ +

In case (ii), it does not matter. The probability that your next sphere is gold is $p$ whether you draw it from the same jar or a different jar.

+",2013-11-05 01:45:08.497 +58865,10684.0,2,,45534.0,,,,CC BY-SA 3.0,"

The cost of compliance for the company is $c_{ij}$ and the penalty for not complying is $\Lambda$. Since $c_{ij}$ is a random variable with cdf $F$, we have that $F(\Lambda)$ is another way of writing $P(c_{ij} < \Lambda)$, in other words the probability that the compliance cost will be less than the cost of not complying, i.e. the probability that the company will comply.

+ +

So saying that $F(\Lambda) <1$ is just saying that the probability that the company will comply is less than $1$. Similarly, $F(\Lambda) < 0.4$ means that there is less than a $40\%$ chance that the company will comply.

+ +

For your questions in the comments about how to derive (3) and (4), to get (4) you observe that the only way in which a company can do more damage under dealing is if it's in category $\beta$ on page 365 of the paper. This is the same as $\mathrm{max}\{c_{i1}, c_{i2}\} < \Lambda$. Since $c_{i1}$ and $c_{i2}$ are independent, the probability of both of them being less than $\Lambda$ is +$$P(c_{i1}, c_{i2} < \Lambda) = P(c_{i1} <\Lambda)P(c_{i2} < \Lambda) = F(\Lambda)^2$$ +which gives (4).

+ +

To get (3), the company needs to be in category $\alpha$ on page 364, which means that one $c_{ij}$ has to be between $\Lambda$ and $2\Lambda$ and the other $c_{ij}$ has to be greater than $\Lambda$. The desired probability is +$$\alpha(\Lambda) = P((\Lambda < c_{i1} < 2\Lambda \text{ and } c_{i2} > \Lambda) \text{ OR } (\Lambda < c_{i2} < 2\Lambda \text{ and } c_{i1} > \Lambda))$$ +but when you have the ``OR"" of two events you have to take into account that they might have outcomes in common, so you need to use the formula $P(X \text{ or } Y)=P(X) + P(Y) - P(X \text{ and } Y)$. Here, this gives you +$$\alpha(\Lambda) = P(\Lambda < c_{i1} < 2\Lambda \text{ and } c_{i2} > \Lambda) + P(\Lambda < c_{i2} < 2\Lambda \text{ and } c_{i1} > \Lambda) - P(\Lambda < c_{i1} < 2\Lambda \text{ and } c_{i2} > \Lambda \text{ and } \Lambda < c_{i2} < 2\Lambda \text{ and } c_{i1} > \Lambda)$$ +which using independence reduces to +$$P(\Lambda < c_{i1} < 2\Lambda)P(c_{i2} > \Lambda) + P(\Lambda < c_{i2} < 2\Lambda)P(c_{i1} > \Lambda) - P(\Lambda < c_{i1} < 2\Lambda)P(\Lambda < c_{i2} < 2\Lambda)$$ +which gives +$$\alpha(\Lambda) = 2(1-F(\Lambda))(F(2\Lambda)-F(\Lambda))-(F(2\Lambda)-F(\Lambda))^2$$ +which is (3).
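If it helps to see the algebra confirmed numerically, here is a small Monte Carlo check of (3) in R, using an arbitrary exponential distribution for $F$ (purely illustrative):

set.seed(1)
Lam <- 1                                   # the penalty Lambda (illustrative value)
c1 <- rexp(1e6); c2 <- rexp(1e6)           # independent compliance costs with cdf F = pexp
in_alpha <- (c1 > Lam & c1 < 2 * Lam & c2 > Lam) |
            (c2 > Lam & c2 < 2 * Lam & c1 > Lam)
mean(in_alpha)                             # simulated alpha(Lambda)
Fd <- pexp
2 * (1 - Fd(Lam)) * (Fd(2 * Lam) - Fd(Lam)) - (Fd(2 * Lam) - Fd(Lam))^2   # formula (3)
# the two numbers should agree up to simulation error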

+",2013-11-05 01:46:04.520 +58866,7007.0,2,,58848.0,,,,CC BY-SA 3.0,"

+ +

Consider the case $0<\epsilon<1$. Defining $\delta = \epsilon / (1+\epsilon)$, from the figure we have +$$ + |x-1|<\delta \Rightarrow \left| \frac{1}{x}-1\right| < \epsilon \, . +$$ +Hence, +$$ + P\left(\left| \frac{1}{X_n}-1\right| < \epsilon\right) \geq P\left(\left| X_n-1\right| < \delta\right) \to 1 \, , +$$ +when $n\to\infty$. The case $\epsilon\geq 1$ is easy, because +$$ + P\left(\left| \frac{1}{X_n}-1\right| < \epsilon\right) \geq P\left(\left| \frac{1}{X_n}-1\right| < \frac{1}{2}\right) \, , +$$ +and we can use the previous case.

+",2013-11-05 02:29:05.937 +58875,17196.0,1,,,,Calculating the Kolmogorov-Smirnov coefficient,,CC BY-SA 3.0,"

I need to calculate the coefficient of the Kolmogorov-Smirnov test for any given null hypothesis rejection level.

+ +

For example, have a look at the table in this wikipedia entry. It only gives $c(\alpha)$ for $\alpha=0.10, 0.05, 0.025, 0.01, 0.005, 0.001$. How can I calculate $c(\alpha)$ for a wider range of $\alpha$s?

+ +

If it is not trivial, can anyone point me to a table for $c(\alpha)$ covering a wider range of $\alpha$ values?

+ +

Thanks in advance.

+",2013-11-05 10:15:58.833 +58867,23360.0,1,,,,Variance of sample mean of bootstrap sample,,CC BY-SA 3.0,"

Let $X_{1},...,X_{n}$ be distinct observations (no ties). Let $X_{1}^{*},...,X_{n}^{*}$ denote a bootstrap sample (a sample from the empirical CDF) and let $\bar{X}_{n}^{*}=\frac{1}{n}\sum_{i=1}^{n}X_{i}^{*}$. Find $E(\bar{X}_{n}^{*})$ and $\mathrm{Var}(\bar{X}_{n}^{*})$.

+ +

What I have so far is that $X_{i}^{*}$ is $X_{1},...,X_{n}$ each with probability $\frac{1}{n}$ so +$$ +E(X_{i}^{*})=\frac{1}{n}E(X_{1})+...+\frac{1}{n}E(X_{n})=\frac{n\mu}{n}=\mu +$$ and +$$E(X_{i}^{*2})=\frac{1}{n}E(X_{1}^{2})+...+\frac{1}{n}E(X_{n}^{2})=\frac{n(\mu^{2}+\sigma^{2})}{n}=\mu^{2}+\sigma^{2}\>, +$$ +which gives +$$ +\mathrm{Var}(X_{i}^{*})=E(X_{i}^{*2})-(E(X_{i}^{*}))^{2}=\mu^{2}+\sigma^{2}-\mu^{2}=\sigma^{2} \>. +$$

+ +

Then, +$$E(\bar{X}_{n}^{*})=E(\frac{1}{n}\sum_{i=1}^{n}X_{i}^{*})=\frac{1}{n}\sum_{i=1}^{n}E(X_{i}^{*})=\frac{n\mu}{n}=\mu +$$ +and +$$ +\mathrm{Var}(\bar{X}_{n}^{*})=\mathrm{Var}(\frac{1}{n}\sum_{i=1}^{n}X_{i}^{*})=\frac{1}{n^{2}}\sum_{i=1}^{n}\mathrm{Var}(X_{i}^{*})$$ +since the $X_{i}^{*}$'s are independent. This gives $\mathrm{Var}(\bar{X}_{n}^{*})=\frac{n\sigma^{2}}{n^{2}}=\frac{\sigma^{2}}{n}$

+ +

However, I don't get the same answer when I condition on $X_{1},\ldots,X_{n}$ and use the formula for conditional variance: +$$ +\mathrm{Var}(\bar{X}_{n}^{*})=E(\mathrm{Var}(\bar{X}_{n}^{*}|X_{1},...,X_{n}))+\mathrm{Var}(E(\bar{X}_{n}^{*}|X_{1},\ldots,X_{n})) \>. +$$

+ +

$E(\bar{X}_{n}^{*}|X_{1},\ldots,X_{n})=\bar{X}_{n}$ and $\mathrm{Var}(\bar{X}_{n}^{*}|X_{1},\ldots,X_{n})=\frac{1}{n^{2}}(\sum X_{i}^{2}-n\bar{X}_{n}^{2})$ so plugging these into the formula above gives (after some algebra) $\mathrm{Var}(\bar{X}_{n}^{*})=\frac{(2n-1)\sigma^{2}}{n^{2}}$.

+ +

Am I doing something wrong here? My feeling is that I am not using the conditional variance formula correctly but I'm not sure. Any help would be appreciated.

+",2013-11-05 03:18:40.277 +58868,18998.0,1,58869.0,,,Calculating the probability of 31 of 628 items are sampled (no replacement) more than 10x amongst 150 participants drawing 50 items each,,CC BY-SA 3.0,"

I am sorry for having to ask a simple probability question, but I have been thinking about it for weeks and an extensive google search has given no answers.

+ +

I have a group of 628 questions. 31 Questions belong to category A. 150 participants will each be given 50 randomly sampled questions from the total 628 questions (no replacement). What is the probability that questions belonging to category A will be answered more than 10 times by the group of participants?

+ +

I looked at a binomial distribution using R's binomial density function, but the best I could come up with was the probability of 1 item from category A being administered to the 150 participants if only 1 question was sampled from the total of 628 questions.

+ +
    x <- seq(1,50,by=1)
+    high.biDen <-dbinom(x,size=150,prob=((31/628)))
+    round(high.biDen,2)
+
+    round(sum(high.biDen[10:50]),2)
+
+ +

The answer obtained is .208 or a 20.8% probability that a question from category A will be administered to the participants more than 10 times if 1 question is sampled from the pool of 628 questions. I would like to know the probability if 50 questions are sampled instead of 1.

+ +

Is the answer obtained by some sort of Bayesian technique? It seems to rely heavily on conditional probability, since once one category-A item has been sampled, only 30 remain to be randomly selected.

+ +

Thanks so much for sharing your knowledge and expertise!

+ +

-Xander

+",2013-11-05 03:58:53.787 +58869,22507.0,2,,58868.0,,,,CC BY-SA 3.0,"

(1) What is the probability $p_m$ that one participant receives exactly $m$ A-questions? For $m>31$ it is zero, otherwise

+ +

$$ p_m = { {31 \choose m} {597 \choose 50-m} \over {628 \choose 50} } $$

+ +

(Explanation: we should select $m$ A-questions and then $(50-m)$ non-A-questions, and there are ${31 \choose m} {597 \choose 50-m}$ ways to do it.)

+ +

(2) What is the probability $q_{k,m}$ that $k$ participants together receive exactly $m$ A-questions? For $k>0$:

+ +

$$ q_{k,m} = \cases {p_m & if $k=1$ \\ \sum_{n=0}^{\min(m,31)} p_n q_{k-1,m-n} & otherwise} $$

+ +

(Explanation: let $n$ be the number of A-questions received by the last participant. The probability that they received $n$ A-questions and the previous participants received $m-n$ A-questions is $p_n q_{k-1,m-n}$. Since we don't know how many A-questions the last participant receives, we sum over $n$.)
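A small R sketch of these formulas (note that $p_m$ is just the hypergeometric pmf, and the recursion is a repeated convolution; this keeps the answer's assumption that the participants' draws are independent of each other):

p <- dhyper(0:31, m = 31, n = 597, k = 50)     # p_0, ..., p_31 for a single participant

q <- 1                                         # 0 participants: total of 0 with probability 1
for (k in 1:150) q <- convolve(q, rev(p), type = "open")   # q_k from q_{k-1}
q <- pmax(q, 0)                                # guard against tiny negative FFT round-off

sum(q[-(1:11)])                                # P(group receives more than 10 A-questions in total)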

+",2013-11-05 05:38:35.593 +58870,23367.0,1,,,,Why use ${1/\sigma^2}$ as a prior for $\sigma^2$?,,CC BY-SA 3.0,"

In a lot of cases, the prior for $\sigma^2$ is chosen so that it is proportional to ${1/\sigma^2}$. I have a few queries re this:

+ +
  1. What is the intuition for choosing this prior?
  2. What is the information conveyed by this prior? Does it mean that a higher value for $\sigma^2$ is less likely?
  3. I know that this is an improper prior, but is it non-informative? Sorry, I am not entirely sure how non-informative priors are different from improper priors.
+",2013-11-05 07:22:39.053 +58871,4318.0,1,,,,Estimating (MLE) 2D Vector Entries by a Noisy Samples of its Entries and its Norm,,CC BY-SA 4.0,"

I'd like your assistance with developing the Maximum Likelihood Estimator (MLE) and CRLB of the following case:

+ +

Given a 2D vector (A Point in XY Plane) $ p = ({x}_{p}, {y}_{p}) $.
+The measurements are noisy measurements of this vector's entries and of its norm.
+Namely, the measurement vector is $ m = ({x}_{m}, {y}_{m}, {r}_{m}) $.
+Where the distributions of those are given by:

+ +

$$ {x}_{m} \sim N({x}_{p}, {\sigma}_{x}), {y}_{m} \sim N({y}_{p}, {\sigma}_{y}), {r}_{m} \sim N(\sqrt{{x}_{p}^{2} + {y}_{p}^{2}}, {\sigma}_{r}) $$

+ +

Namely, the measurements are Normally Distributed, Unbiased and Independent of each other.

+ +

The parameters to estimate are $ \Theta = ({x}_{p}, {y}_{p}) $ given the measurement vector $ m = ({x}_{m}, {y}_{m}, {r}_{m}) $ as defined above.

+ +

In simple words: estimate a point's coordinates in the 2D plane, given noisy measurements of its coordinates and of its range/distance from (0, 0).
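In case it helps to make the setup concrete, here is a minimal numerical-MLE sketch in R, assuming the noise standard deviations are known (all numerical values below are illustrative, not part of the problem statement):

sig <- c(0.5, 0.5, 0.2)                        # sigma_x, sigma_y, sigma_r (assumed known)

neg_log_lik <- function(theta, m, sig) {
  mu <- c(theta[1], theta[2], sqrt(theta[1]^2 + theta[2]^2))
  -sum(dnorm(m, mean = mu, sd = sig, log = TRUE))
}

p_true <- c(3, 4)                              # true point, for simulation only
m <- rnorm(3, mean = c(p_true, sqrt(sum(p_true^2))), sd = sig)   # one measurement vector

fit <- optim(par = m[1:2], fn = neg_log_lik, m = m, sig = sig)   # start from (x_m, y_m)
fit$par                                        # numerical MLE of (x_p, y_p)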

+ +

I'd be happy to hear your solutions, ideas, related articles and the CRLB (Or any other lower bound on the estimation).

+",2013-11-05 07:49:41.810 +58872,11117.0,2,,58870.0,,,,CC BY-SA 3.0,"

First of all, $p(\sigma^2) \propto 1/\sigma^2$ is the Jeffreys prior (http://en.wikipedia.org/wiki/Jeffreys_prior) for a scale parameter. It also coincides with the reference prior under certain conditions.

+ +

1) and 2): Intuitively, it can be understood as the only prior that correctly expresses that $\sigma$ is a scale parameter: if $X$ is distributed according to $\frac{1}{\sigma}f(x/\sigma)$, then $Y=cX$ has the same distribution as $X$ but with a different scale, and $p(\sigma^2) \propto 1/\sigma^2$ also fits this property (it is invariant under such rescalings).

+ +

3): +Jeffreys priors are generally improper. However, to my knowledge improperness is not related to informativeness (but simply to integrability considerations).

+",2013-11-05 07:50:20.087 +58873,22200.0,1,,,,Post-hoc test in the context of cluster analysis,,CC BY-SA 3.0,"

I performed a cluster analysis based on a principal components. Total sample size is around 140. Now I want to describe the differences among the clusters (by means of the components) while performing a post-hoc analysis. By using the Levene test, I found out that the homogeneity of the variances can be assumed. I have the following questions:

+ +

1) Can I assume normal distribution for the ANOVA, since the components are z-transformed?

+ +

2) Which post-hoc test is appropriate? I have unequal sample sizes across the clusters. I might go for the Tukey-Kramer test.

+",2013-11-05 08:07:24.480 +58874,11072.0,1,,,,What are the tool that can help me to find non-informative words in documents for classification and eliminate them?,,CC BY-SA 3.0,"

Simply put, I want to remove words in documents that are not informative for classification purposes, rather than relying on (possibly biased) stop-word elimination. Is there any tool that can help me with that? I know NLTK, but it only provides a simple stop-word eliminator.

+",2013-11-05 08:46:03.203 +58876,7615.0,1,69414.0,,,Interpretation of lasso recovery results,,CC BY-SA 3.0,"

When people say that lasso regression can under certain assumptions recover ""the support"", i.e. non-zero regression weights, what does this mean?

+ +

This cannot mean causal recovery, because Pearl has taught us you cannot infer causality from correlation (and lasso is just glorified correlation). So are these non-zeros in some way the minimum set of variables that cannot be explained by others? Because people tend to speak in terms of removing ""spurious"" and recovering ""non-spurious"" correlation, but really how can correlation be spurious, it's just the association between two random variables, there's no room for causal language here.

+ +

I am confused.

+",2013-11-05 10:19:28.547 +58877,18198.0,1,,,,Converting standardized betas back to original variables,,CC BY-SA 3.0,"

I realise this is probably a very simple question but after searching I can't find the answer I am looking for.

+ +

I have a problem where I need to standardize the variables, run the (ridge) regression, and calculate the ridge estimates of the betas.

+ +

I then need to convert these back to the original variables scale.

+ +

But how do I do this?

+ +

I found a formula for the bivariate case that

+ +

$$ +\beta^* = \hat\beta \frac{S_x}{S_y} \>. +$$

+ +

This was given in D. Gujarati, Basic Econometrics, page 175, formula (6.3.8).

+ +

Here $\beta^*$ is the estimator from the regression run on the standardized variables, $\hat\beta$ is the same estimator converted back to the original scale, $S_y$ is the sample standard deviation of the regressand, and $S_x$ is the sample standard deviation of the regressor.

+ +

Unfortunately the book doesn't cover the analogous result for multiple regression.

+ +

Also, I'm not sure I understand the bivariate case. Simple algebraic manipulation gives the formula for $\hat\beta$ on the original scale:

+ +

$$ +\hat\beta=\beta^* \frac{S_y}{S_x} +$$

+ +

It seems odd to me that $\beta^*$, which was calculated on variables that are already deflated by $S_x$, has to be deflated by $S_x$ again to be converted back. (Plus, why are the mean values not added back in?)

+ +

So, can someone please explain how to do this for the multivariate case, ideally with a derivation, so that I can understand the result?
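Not a derivation, but here is a small R check of the multivariate analogue, $\hat\beta_j = \beta_j^* S_y / S_{x_j}$, with the intercept recovered from the means. It uses plain OLS rather than ridge, and the data are simulated purely for illustration:

set.seed(1)
n <- 100
X <- matrix(rnorm(n * 3), n, 3)
y <- drop(1 + X %*% c(2, -1, 0.5) + rnorm(n))

fit_raw <- lm(y ~ X)                            # fit on the original scale
Xs <- scale(X); ys <- drop(scale(y))
fit_std <- lm(ys ~ Xs - 1)                      # fit on standardized variables (no intercept)

b_back <- coef(fit_std) * sd(y) / apply(X, 2, sd)      # beta_j = beta*_j * S_y / S_xj
a_back <- mean(y) - sum(b_back * colMeans(X))          # intercept recovered from the means
cbind(raw = coef(fit_raw), back = c(a_back, b_back))   # the two columns agree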

+",2013-11-05 10:56:30.327 +58878,21756.0,2,,58855.0,,,,CC BY-SA 3.0,"

A typical approach (see e.g. Nelsen 2006, p. 41) is to sample two independent uniform distributed random vectors $u$ and $y$ of the desired sample length. The conditional copula $C_u$ (conditioned on $u$) is given through the partial derivative: $$ C_u(v) = \frac{\partial}{\partial u} C(u,v) $$ +Hence, one needs to solve $C_u(v)=y$ for $v$ to get the desired pair $(u,v)$. For a ""custom made"" copula, one has to calculate its partial derivative and its quasi-inverse. In case the copula is not completely ""custom made"" it might already be covered in other statistical software. One might for instance take a look into the R packages copula and VineCopula offering a rich set of families (speaking from my R experience, there are more in R and of course in other languages).
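To make this concrete, here is a rough R sketch of the conditional-inversion step for a generic bivariate copula, using a numerical partial derivative and root finding (the Clayton copula below is only an illustrative stand-in for a ""custom made"" $C$):

clayton <- function(u, v, theta = 2) (u^(-theta) + v^(-theta) - 1)^(-1/theta)

# conditional copula C_u(v) = dC(u, v)/du, here via a numerical central difference
C_u <- function(u, v, C, h = 1e-6) (C(u + h, v) - C(u - h, v)) / (2 * h)

r_copula <- function(n, C) {
  u <- pmin(pmax(runif(n), 1e-5), 1 - 1e-5)   # keep u away from 0/1 for the finite difference
  y <- runif(n)
  v <- numeric(n)
  for (i in seq_len(n)) {
    # solve C_u(v) = y for v on (0, 1)
    v[i] <- uniroot(function(vv) C_u(u[i], vv, C) - y[i],
                    interval = c(1e-9, 1 - 1e-9))$root
  }
  cbind(u = u, v = v)
}

uv <- r_copula(500, clayton)
cor(uv[, 1], uv[, 2], method = "kendall")      # positive dependence, as expected for Clayton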

+",2013-11-05 11:01:09.727 +58879,17196.0,2,,58875.0,,,,CC BY-SA 3.0,"

I found this paper (Marsaglia et al., 2003), which has a method to approximate the probability of obtaining a certain $D_n$ for a given sample size. It also has a C program that I used in my own program, and it worked excellently.
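As a complement (this is the standard large-sample approximation rather than Marsaglia's exact method), $c(\alpha)$ can be computed for any $\alpha$ from the first term of the Kolmogorov series, $P(K > x) \approx 2e^{-2x^2}$, which gives $c(\alpha)=\sqrt{-\ln(\alpha/2)/2}$:

c_alpha <- function(alpha) sqrt(-log(alpha / 2) / 2)
c_alpha(c(0.10, 0.05, 0.025, 0.01, 0.005, 0.001))
# 1.22 1.36 1.48 1.63 1.73 1.95  -- matching the usual tabulated values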

+",2013-11-05 11:16:10.153 +58880,18690.0,1,58895.0,,,Proof that there is no autocorrelation,,CC BY-SA 3.0,"

I was wondering whether my reasoning is correct and whether I've written it down correctly:

+ +

Given is the following equation: $bread_t=\beta_{0} + \beta_{1}wage_t + u_{t}$, where $u_{t}$ is normally distributed with a mean of zero and a variance of $\sigma^{2}$ and is independent of $wage$.

+ +

Furthermore we know that there is no autocorrelation, $bread$ and $wage$ are not cointegrated, however there is a unit root present in $bread$ and $wage$.

+ +

Given a first-difference model $\Delta bread_{t} = \beta_1 \Delta wage_{t}+\epsilon_{t}$, where $\epsilon_{t} = \Delta u_{t}$: is the error term $\epsilon_{t}$ in this equation uncorrelated over time? Please motivate your answer with a formal derivation.

+ +

Well I thought the following:

+ +

Since $\epsilon_{t} = u_{t} - u_{t-1}$ and we know that there is no autocorrelation in the $u_{t}$, there will be no autocorrelation after first differencing either.

+ +

Question is of course how to show it? Using COV maybe? I am currently trying to derive it, but I'm a bit stuck.

+ +

Thanks in advance!

+",2013-11-05 11:16:50.937 +58881,12503.0,1,,,,How to evaluate a regression model's sensitivity to noise,,CC BY-SA 3.0,"

How can I investigate the sensitivity of a regression model to noise?

+ +

A basic idea is to add some (Gaussian) noise to the dependent and/or independent variables and (re)evaluate the RMSE.

+ +

However, the problem with this approach is that one is mostly interested in modeling the true data, not the noisy data. Hence, is it reasonable to measure the RMSE on the original dependent variable, rather than the noisy one, even if the latter has been used for model building?
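To illustrate the basic idea (a toy linear model; the names and noise levels are arbitrary), one can perturb the predictor, refit, and score the refitted model against the original, unperturbed data:

set.seed(42)
n <- 200
x <- runif(n)
y <- 2 + 3 * x + rnorm(n, sd = 0.3)
rmse <- function(obs, pred) sqrt(mean((obs - pred)^2))

noise_sd <- c(0, 0.05, 0.1, 0.2, 0.5)
sapply(noise_sd, function(s) {
  x_noisy <- x + rnorm(n, sd = s)                       # perturb the predictor
  fit <- lm(y ~ x_noisy)                                # build the model on the noisy data
  rmse(y, predict(fit, data.frame(x_noisy = x)))        # but evaluate it on the original data
})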

+",2013-11-05 11:42:05.483 +58882,16665.0,1,58898.0,,,Categorical response variable prediction,,CC BY-SA 3.0,"

I have the following kind of data (coded in R):

+ +
v.a = c('cat', 'dog', 'dog', 'goat', 'cat', 'goat', 'dog', 'dog')
+v.b = c(1, 2, 1, 2, 1, 2, 1, 2)
+v.c = c('blue', 'red', 'blue', 'red', 'red', 'blue', 'yellow', 'yellow')
+set.seed(12)
+v.d = rnorm(8)
+aov(v.a ~ v.b + v.c + v.d) # Error
+
+ +

I would like to know if the value of v.b or the value of v.c has any ability to predict the value of v.a. I would run an ANOVA (as shown above) but I think it does not make any sense since my response variable is not ordinal (it is categorical). What should I do?

+",2013-11-05 11:44:13.703 +58883,11383.0,2,,58848.0,,,,CC BY-SA 3.0,"

Fix $\varepsilon\gt 0$. We have for each positive $A$, $$\{|X_n^{-1}-1|\gt \varepsilon\}=\{|X_n-1|\gt |X_n|\varepsilon\}\subset\{|X_n-1|\gt A\varepsilon\}\cup\{|X_n|\leqslant A\},$$ this is because we wrote $S=(S\cap \{|X_n|\gt A\})\cup (S\cap\{|X_n|\leqslant A\})$.

+ +

Take $A:= 1/2$; then $\{|X_n|\leqslant 1/2\}\subset\{|X_n-1|\geqslant 1/2\}$ (because $[-1/2,1/2]\subset (-\infty,1/2]\cup[3/2,\infty)$). We thus obtain

+ +

$$\{|X_n^{-1}-1|\gt \varepsilon\}\subset\{|X_n-1|\gt \varepsilon/2\}\cup\{|X_n-1|\gt 1/2\}.$$ +The probability of the last two events goes to $0$ as $n\to\infty$.

+",2013-11-05 12:35:28.053 +58884,2958.0,2,,58702.0,,,,CC BY-SA 3.0,"

Inclusion/exclusion of variates (step 3):

+

I understand that you ask which of the original measurement channels to include into the modeling.

+
    +
  • Is such a decision sensible for your data?
    +E.g. I work mainly with spectroscopic data, for which PLS is frequently and successfully used. Well-measured spectra have a high correlation between neighbouring variates, and the relevant information in spectroscopic data sets tends to be spread out over many variates. PLS is well suited for such data, but deciding on a variate-by-variate basis which variates to use for the model is IMHO usually not appropriate (decisions about inclusion/exclusion of spectral ranges based on spectroscopic knowledge about the application are IMHO a far better approach).
  • +
  • If for your data and application variable selection is a natural choice, is PLS the regularization technique you want?
    +You may want to read the sections about regularization (3.4 - 3.6) in the Elements of Statistical Learning where PLS as a regularization is compared to other regularization approaches. My point here is that in contrast to e.g. the Lasso, PLS is a regularization technique that does not tend to completely exclude variables from the model. I'd thus say that PLS is probably more suitable for data where this behaviour is sensible, but in that case variable selection is not a natural choice (e.g. spectroscopic data).
  • +
  • Does your data contain enough information for such a data-driven model optimization? Doing a t-test for each input variable is a massive multiple testing situation.
    +IMHO the main point of PLS (or other regularization techniques) is to avoid the need for such a variable selection.
  • +
+

Remark to Step 2:

+

If you build a linear regression model in PCA score space, that is usually called principal component regression (PCR) in chemometrics. It is not the same as a PLS model.

+

How to find out which variates are used by the PCA/PLS model?

+

There are several ways to approach this question. Obviously, variates where the PCA loadings or PLS weights are 0 do not enter the model. Whether it is sufficient to look at the loadings or whether you need to go a step further depends on your data: if the data set is not standardized you may want to calculate how much each variate "contributes" to the respective PCA/PLS score.
+Literature where we did that with LDA (works just the same way): C. Beleites, K. Geiger, M. Kirsch, S. B. Sobottka, G. Schackert and R. Salzer: Raman spectroscopic grading of astrocytoma tissues: using soft reference information, Anal. Bioanal. Chem., 400 (2011), 2801 - 2816. The linked page has both links to the official web page and my manuscript.

+

You can also derive e.g. bootstrap distributions of the loadings (or the contributions) and have a look at them. For PCR and PLS coefficients that is straightforward, as the Y variable automatically "aligns" the coefficients. PCA and PLS scores need some more care, as e.g. flipping of the directions needs to be taken into account, and you may decide to treat models as equivalent where the scores which are then used for further modeling are just rotated or scaled versions of each other. Thus, you may want to align the scores first e.g. by Procrustes analysis. The paper linked above also discusses this (for LDA, but again, the ideas apply to the other bilinear models as well).

+

Last but not least, you need to be careful not to overinterpret the models: if there is correlation between variates, you can have situations where important variates have coefficients that frequently touch the zero mark in the bootstrap experiments. What you can or cannot conclude will depend on your type of data, though.

+",2013-11-05 12:54:06.930 +27120,568.0,1,27132.0,,,Mean square error or mean squared error,,CC BY-SA 3.0,"

As a non-native English speaker, I was wondering which of the ""square"" or ""squared"" expressions I should use, for instance in ""mean square error"" or ""mean squared error"".

+ +

According to the Internet, it seems both forms are used interchangeably. Is one expression more square than the other?

+",2012-06-20 16:41:10.737 +58885,7860.0,1,58887.0,,,Explain the meaning of $\alpha$ in a Kolmogorov-Smirnov two-sample test,,CC BY-SA 3.0,"

The Wikipedia article on the two-sample Kolmogorov-Smirnov test states that:

+ +
+

The Kolmogorov–Smirnov test may also be used to test whether two + underlying one-dimensional probability distributions differ. In this + case, the Kolmogorov–Smirnov statistic is

+ +

$$D_{n,n'}=\sup_x |F_{1,n}(x)-F_{2,n'}(x)|$$

+ +

where $F_{1,n}$ and $F_{2,n'}$ are the empirical distribution + functions of the first and the second sample respectively. The null + hypothesis is rejected at level $\alpha$ if

+ +

$$D_{n,n'}>c(\alpha)\sqrt{\frac{n + n'}{n n'}}.$$

+
+ +

The meaning of the $\alpha$ level is not clear to me. Where does it come from, and what does it mean statistically?

+",2013-11-05 13:03:04.053 +58886,23375.0,1,,,,Analytical formula for distribution of partial sum of standardized random variable,,CC BY-SA 4.0,"

I would like to know if there is an analytical formula for the distribution of partial sums of standardized random variables. (Of course, if one standardizes a random variable, the sum of all the individual observations will be zero).

+ +

I have written a Python function that estimates quantiles by bootstrapping. Each bootstrap iteration consists of three steps:

+ +
  1. n samples of a Normal(0, 1) random variable are generated.
  2. The resulting sample is then standardized.
  3. The sum of the first k elements of the standardized sample is calculated.
+ +

This procedure is followed niter times, generating a sample of size niter. The quantiles of this sample are then calculated.

+ +

In case it helps, here is the code:

+ +
import numpy as np
+import scipy.stats.mstats
+
+def boot_standardized(n, k, niter=1000, prob=None):
+    if prob is None:
+        prob = [0.05, 0.1, 0.5, 0.9, 0.95]
+
+    x = np.random.randn(n, niter)
+    x_std = (x - np.mean(x, axis=0))/np.std(x, axis=0)
+
+    boot_values = np.sum(x_std[:k,:], axis=0)
+
+    return scipy.stats.mstats.mquantiles(boot_values, prob=prob)
+
+",2013-11-05 13:08:15.937 +58887,16043.0,2,,58885.0,,,,CC BY-SA 3.0,"

The level $\alpha$ is the ""significance level"" of the test, i.e. the Type I error rate: the probability of rejecting the null hypothesis (that the two samples are drawn from the same distribution) when it is in fact true.

+",2013-11-05 13:09:57.440 +58888,11383.0,2,,58800.0,,,,CC BY-SA 3.0,"

If $X$ is a discrete random variable taking its values in $S$ and $h$ is a non-negative function, then $$\mathbb E[h(X)]=\sum_{x\in S}h(x)\mathbb P\{X=x\}.$$ Indeed, we formally have $$\mathbb E[h(X)]=\mathbb E[h(X)\chi_{\bigcup_{x\in S}\{X=x\}}]=\sum_{x\in S}\mathbb E[h(X)\chi_{\{X=x\}}].$$ Now we use this with the map $x\mapsto e^{tx}$, where $t$ is fixed.

+ +

We don't need the partial derivative as there is no ambiguity: we can only take the derivative with respect to $t$.

+",2013-11-05 13:31:07.547 +58889,23376.0,1,,,,General location model,,CC BY-SA 3.0,"

Let $Z_1$ and $Z_2$ be categorical random variables with $3$ and $2$ categories, respectively. Let $Y_1$ and $Y_2$ be $2$ continuous random variables. Define completely the GLOM (general location model) for the joint distribution of $Z=(Z_1,Z_2)^T$ and $Y=(Y_1,Y_2)^T$

+ +

I couldn't solve this problem. Can anyone help me?

+",2013-11-05 13:33:17.720 +58890,2958.0,2,,58435.0,,,,CC BY-SA 3.0,"

First of all, as @Marc Claesen already explained, semi-supervised classification is one of the techniques to take care of the situation where you know that the classes are really distinct, but you are not certain which class the case actually belongs to.

+ +

However, there are related situations as well, where the ""reality"" isn't that clear and the assumption of having really distinct classes is not met: borderline cases may be a ""physical"" reality (see below for papers about an application where we met such a condition).

+ +

There is one crucial assumption for semi-supervised classifiers that you need to make sure is met: the assumption that in feature space, class boundaries come along with low sample density. This is referred to as the cluster assumption.
+Even if the reality underlying your data has distinct classes, your data set may have disproportionately more borderline cases: e.g. if your classification technique is targeted at classifying difficult cases, while the clear and easy cases are not of interest, and your training data already reflects this situation.

+ +
+

only taking ""certain"" classifications for training? I fear that in this case, there will be more misclassifications because ""border"" cases are not covered.

+
+ +

I fully agree with you that excluding the borderline cases is often a bad idea: + by removing all difficult cases you end up with an artificially easy problem. IMHO it is even worse that excluding borderline cases usually does not stop with model training, but the borderline cases are also excluded from testing, thus testing the model only with easy cases. With that you'd not even realize that the model does not perform well with borderline cases.

+ +

Here are two papers we wrote about a problem that differs from yours in that in our application also the reality can have ""mixed"" classes (a more general version of your problem: uncertainty in reference labels is covered as well).

+ + + +

The links go to a project page of an R package I developed to do the performance calculations. There are further links to both the official web page and my manuscripts of the papers. +While I have not used Weka so far, I understand that an interface to R is available.

+ +
+ +

practical considerations:

+ +
    +
  • While the copy-and-label-differently approach is straightforward, it does not work well with all classifiers and implementations in practice. E.g. AFAIK there is no way to tell libSVMs tuning by cross validation that all copies of each data point need to be kept in the same cross validation fold. Thus libSVMs tuning would probably yield a massively overfit model.
  • +
  • Also for logistic regression, I found that many implementations did not allow the partial membership labels I needed.
  • +
  • The implementation I used for the papers above is actually an ANN without hidden layer using the logistic as sigmoidal link function (nnet::multinom).
  • +
+",2013-11-05 13:38:54.717 +58891,,2,,58856.0,user31668,,,CC BY-SA 3.0,"

For your first question: yes, you can substitute the sufficient statistic, as it is just a re-writing of $\sum x_i$.

+ +

For your second part: since your data are normal, you can use the Wilks likelihood-ratio test to get a rejection region.

+",2013-11-05 13:43:52.100 +58892,23377.0,1,,,,Transfer functions in R (TSA package),,CC BY-SA 3.0,"

In time series models' transfer functions there is a decay parameter in the formula (let's call it b). In the TSA package that decay parameter is not mentioned. When I used other software before (such as SAS), I used to determine b after analyzing the prewhitened series. But in the TSA package in R, is there no need to specify the decay parameter once you analyze the CCF?

+ +

If not how am I going to know when the decay starts?

+ +

I understand the CCF is used after prewhitening to determine how to filter the outputs, but where does b come into the picture?

+",2013-11-05 14:10:31.143 +58893,13740.0,1,58920.0,,,Comparing (and testing) two discrete distributions with different magnitudes,,CC BY-SA 3.0,"

I am comparing authoritative survey data (large amount of observations) with data gained from a social network (very small amount of observations). Particularly, I want to compare population per district as surveyed with population per district as found in a location based social network.

+ +

Example dataset:

+ +
   type variable   value
+1     1      vgi    1064
+2     2      vgi     873
+3     3      vgi       8
+4     4      vgi     246
+9     1      pop 2248360
+10    2      pop 3544721
+11    3      pop   70934
+12    4      pop 2090647
+
+ +

type is the district (1-4); variable=vgi denotes users found in the social network, while variable=pop is the actual population size per district.

+ +

Even though the scales are completely different in magnitude, is there a way to qualitatively (e.g. with a plot) and quantitatively compare both distributions?

+ +

With qualitative I mean a plot where one can easily see which district is likely under- or overrepresented on social media and with quantitative I mean something like a Chi-Square-Test in order to see whether the distributions significantly differ from each other. For example, one can see from the data that district 2 is underrepresented on vgi, or one could also say that district 1 is overrepresented on vgi -- but that is the problem - what is considered over- or underrepresented?!

+ +

I don't have experience with such data, thus I am asking. I was able to plot both distributions with R, but the different scales make them hard to compare; I should probably transform one of the two types, but I don't know how.
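For what it is worth, here is one simple way to do both in R with the example data above: compare shares rather than raw counts, and run a goodness-of-fit test of the vgi counts against the population proportions (only a sketch; other tests are possible):

vgi <- c(1064, 873, 8, 246)
pop <- c(2248360, 3544721, 70934, 2090647)

prop_vgi <- vgi / sum(vgi)
prop_pop <- pop / sum(pop)

# qualitative comparison: shares per district on a common scale
barplot(rbind(vgi = prop_vgi, pop = prop_pop), beside = TRUE,
        names.arg = 1:4, legend.text = TRUE,
        xlab = "district", ylab = "share of total")

# quantitative comparison: are the vgi counts consistent with the population shares?
chisq.test(vgi, p = prop_pop)

# representation ratio: > 1 suggests over-representation on the social network
prop_vgi / prop_pop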

+",2013-11-05 14:11:48.850 +58901,23384.0,1,,,,Equations from linear regression,,CC BY-SA 3.0,"

Notation. Let $y$, $a$, and $b$ be $n\times 1$, $p\times 1$, and $q\times1$ real vectors. Let also $X$ and $Z$ be $n\times p$ and $n \times q$ real matrices.

+ +

Suppose that there is no solution, $a$, to $y = X a$.

+ +

Question. What are the conditions on $Z$ such that $y = Xa + Zb$ has no solution for each choice of $b$?

+ +

Context. I came across this problem in the context of linear regression. The fact that $y=Xa$ has no solution can be interpreted as ""no hyperplane can perfectly fit the data"". I am analysing an extension of this problem, which has led me to the need for finding something similar for ""$y = Xa + Zb$ has no solution"", but in this case $b$ is not fixed and can actually take any value in ${\mathbb R}^q$.

+",2013-11-05 15:16:51.700 +58894,23378.0,2,,58847.0,,,,CC BY-SA 3.0,"

If you have R2012b or later, use the RUSBoost algorithm. It is recommended for imbalanced datasets.

+ +

If you go with GentleBoost, you need to optimize the tree complexity and the number of trees in the ensemble. (You could also play with the learning rate.) Both parameters are likely far off their optimal values in your code.

+ +

First, fitensemble for GentleBoost by default produces decision stumps (trees with two leaves). Since the minority class is only 8% of the data, stumps are not sensitive to observations of the minority class. I often set the minimal leaf size to one half of the size of the minority class. The optimal setting for the leaf size may not be exactly that but should be in that ballpark. Do:

+ +
tmp = ClassificationTree.template('minleaf',some_number);
+ens = fitensemble(Xtrain,Ytrain,'GentleBoost',Ntrees,tmp,'prior','uniform')
+
+ +

Second, 10 trees are most usually not enough. Inspect the ensemble accuracy by cross-validation or using an independent test set to decide how many trees are needed. Typically, a few hundred should be used for boosting.

+ +

Also, after you train the ensemble, don't just look at the classification error. Use the perfcurve function to compute a performance curve and find the optimal threshold on the classification score. It is up to you to define what ""optimal"" means. You can assign, for instance, different misclassification costs to the two classes and find the threshold minimizing the expected cost. +.....

+",2013-11-05 14:15:25.160 +58895,20473.0,2,,58880.0,,,,CC BY-SA 3.0,"

I will just use $y$ for bread and $x$ for wage. You write that ""a unit root is present in both $y$ and $x$."" Although this is rather vague, let's say that we have

+ +

$$y_t=y_{t-1}+v_t,\Rightarrow \Delta y_t = v_t$$ +$$ x_t=x_{t-1}+\omega_t \Rightarrow \Delta x_t = \omega_t$$

+ +

with $v_t$ and $\omega_t$ independent white noises.

+ +

Then in the first difference specification +$$\Delta y_t=\beta_{1}\Delta x_t + \Delta u_{t}$$ +we substitute using the expressions for $\Delta y_t$ and $\Delta x_t$ to obtain

+ +

$$v_t=\beta_{1}\omega_t + \Delta u_{t} \Rightarrow v_t-\beta_1\omega_t = \Delta u_t$$

+ +

So $\Delta u_t$ is a linear combination of independent white noises, and therefore white noise itself.

+ +

ADDENDUM

+ +

$$\operatorname {Cov}(\Delta u_t, \Delta u_{t-1}) = E\left( \Delta u_t\Delta u_{t-1}\right) = E\left( ( v_t-\beta_1\omega_t)( v_{t-1}-\beta_1\omega_{t-1})\right)$$ +$$=E(v_tv_{t-1})-\beta_1E(v_t\omega_{t-1})-\beta_1E(\omega_t v_{t-1})+ \beta_1^2E(\omega_t\omega_{t-1})$$ +$$=0+0+0+0=0$$ +since we have white noises, and independent between them.
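A quick simulation check of this in R (arbitrary parameter values), confirming that $\Delta u_t = v_t - \beta_1\omega_t$ shows no autocorrelation:

set.seed(123)
n <- 5000
beta1 <- 0.7
v <- rnorm(n); w <- rnorm(n)            # independent white noises
du <- v - beta1 * w                     # Delta u_t
acf(du, lag.max = 5, plot = FALSE)      # autocorrelations at lags 1-5 are near zero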

+",2013-11-05 14:17:00.047 +58896,15827.0,2,,58839.0,,,,CC BY-SA 3.0,"

This may not be the answer you seek, but in general:

+ +
    +
  • The first things to check are how close $b_3$ in M1 is to $b_1 b_2$ in M2 and whether predicted values match each other. AIC and F-tests tell you how well each model fits, but they say nothing about how the models differ. Simple numeric and graphical comparisons may tell you more.

  • +
  • In M1 the value of $b_3$ is unconstrained and in M2 it is constrained. If the criterion is closeness of fit to the data in some absolute sense, then it would be surprising if a constrained fit were better. Otherwise the comparison will hinge on precisely whether and how you penalise yourself for using one more parameter in M1. So, watch out: you won't get anything out of AIC or similar or dissimilar figures of merit that is not a strict consequence of how they are defined. That no doubt is obvious, but it is important.

  • +
+",2013-11-05 14:21:22.350 +58897,23380.0,2,,58850.0,,,,CC BY-SA 3.0,"

The function would be the same as the one for balanced data - TreeBagger or fitensemble. By default, either grows deep trees; the default minimal leaf size is 1 for classification. This typically gives you enough sensitivity to find a good decision boundary between the classes. The default decision boundary, at which the class posterior probabilities are equal, is most usually not what you want for imbalanced data. As I advised in your other post, use the perfcurve function to find the optimal threshold on the posterior probability for the minority class.

+ +

By the way, try MATLAB Answers http://www.mathworks.com/matlabcentral/answers for MATLAB questions. I read that site much more often than this one.

+",2013-11-05 14:29:26.380 +58898,8958.0,2,,58882.0,,,,CC BY-SA 4.0,"

You could use ANY classifier, including linear discriminants, multinomial logit (as Bill pointed out), support vector machines, neural nets, CART, random forests, or C5 trees; there is a world of different models that can help you predict $v.a$ using $v.b$ and $v.c$. Here is an example using the R implementation of random forest:

+ +
# packages
+library(randomForest)
+
+#variables
+v.a= c('cat','dog','dog','goat','cat','goat','dog','dog')
+v.b= c(1,2,1,2,1,2,1,2)
+v.c= c('blue', 'red', 'blue', 'red', 'red', 'blue', 'yellow', 'yellow')
+
+# model fit
+# note that you must turn the categorical variables into factors or R won't
+# use them properly; a data.frame keeps v.c as a factor (cbind would coerce
+# it to numeric codes)
+model <- randomForest(y=as.factor(v.a),x=data.frame(v.b,v.c=as.factor(v.c)),ntree=10)
+
+# plot of OOB error rates (overall and per class) against the number of trees
+plot(model)
+
+ +

+ +
# model confusion matrix
+model$confusion
+
+ +

Clearly these variables don't show a strong relation.

+",2013-11-05 14:33:44.167 +58899,594.0,2,,58800.0,,,,CC BY-SA 3.0,"

There's a potential ambiguity in your question:

+ +

If you're asking (a) ""Why is $e^{tX}$ inside the expectation?"" then the answer is 'that's the definition of the moment generating function'.

+ +

On the other hand, if you're asking (b) ""Given we want $E(e^{tX})$, why does $e^{tX}$ appear in the summation?"", this would be because that's how expectation works.

+ +

That is, for the continuous case $E[g(X)] = \int g(x) f(x) dx$ (indeed, some authors seem to regard that law as a definition of expectation).

+ +
+

Why do we use the derivative notation instead of the partial derivative operator?

+
+ +

Because $X$ isn't regarded an argument of $M$, $t$ is. If it were $M(X,t)$, then we'd write $\partial$, but it's $M_X(t)$, so we write $d$. This isn't some accident of notation, however - it really is a function of $t$ we're dealing with, though calculated with respect to the distribution of $X$. Inside the integral/sum, $x$ is actually a dummy variable; it could as easily be represented by $u$, or any other dummy, and $M$ is certainly not a function of the dummy variable in any sense. You might think of $X$ like an indexing variable (since it determines which $M$ you get), but if $M$ is to be regarded as a function of something to do with $X$, it's really $F_X$ (through $dF$) that $M$ is a function of.

+",2013-11-05 14:53:19.913 +58900,750.0,2,,58219.0,,,,CC BY-SA 3.0,"

I will walk through how I generated the exact statistics for the $\chi^2$ distribution and (hopefully) update later with a paper that gives tables for the exact distributions and where the exact distribution converges to the theoretical $\chi^2$ with $6$ degrees of freedom. Ditto for the K-S and Kuiper distributions that ttnphns mentions.

+ +

So the steps to generate the exact distributions are:

+ +
  1. Generate all potential combinations of outcomes allocating crimes into the $7$ weekday bins. The total number of combinations ends up being $\binom{M + N - 1}{M - 1}$ where $M = 7$ weekday bins and $N$ equals the number of crimes observed.
  2. For each of the combinations, calculate the probability of observing that outcome under the null hypothesis. Here the null is that each crime has a multinomial distribution in which the weekdays are the outcomes and are equi-probable (e.g. each day has a probability of $1/7$ of being selected).
  3. Generate the test statistic for that set of observations.
+ +

From this information you can generate critical values for the test distributions under the null. So if you observe $3$ crimes, with two occurring on Monday and one on Tuesday, the probability of that event under the null is:

+ +

$$Pr(\text{Mon.} = 2, \text{Tues.} = 1) = \frac{3!}{2!1!} \cdot ({P_m}^2)({P_t}^1) = 0.00874635568513119$$

+ +

where $P_m$ and $P_t$ denote the probability of an event on Monday and Tuesday respectively, and $P_m = P_t = 1/7$. (If you wanted to generalize to a window which, say, spans 10 days, you may want to consider unequal probabilities of $2/10$ for the overlapping days and $1/10$ for the others.)

+ +

For an example of generating the exact distribution of the test statistic, with three crimes there are only three different possible $\chi^2$ values out of the 84 different combinations (since order doesn't matter for the statistic). The below table symbolizes these potential outcomes. (Just imagine sorting the days of the week so the day of the week with the most events is in the left most column.)

+ +
A  B   C
+.
+.  .
+.  ..  ...
+
+ +

Subsequently, combinations of pattern A appear 7 times, B 42 times, and C 35 times. The table below shows the probabilities of obtaining the corresponding $\chi^2$ statistics and how to generate the CDF under the null hypothesis. From here you can see that it is actually possible to reject the null at a .05 critical level if all three events are observed on the same day.

+ +
    #  ChiSq    Prob(Sum) CDF
+C  35    4         .61    .61
+B  42    8.67      .37    .98
+A   7   18         .02   1.0 
+
+ +

Also from the set of all potential combinations you can generate the distributions under various alternative hypotheses, and this allows you to evaluate the power of the test under those circumstances. So for example for $5$ crimes, the exact $\chi^2$ distribution has a $.05$ critical value at $10.4$. So for an alternative hypothesis of data having positive probability in only two days, you have 100% power (i.e. the only observable $\chi^2$ values if 5 crimes occur in 2 or fewer days of the week is over $10.4$).
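For small $N$, these exact null distributions can also be reproduced by brute force in R, enumerating all $7^N$ equally likely day assignments (a quick sketch, practical for small $N$ only; with $N = 3$ it recovers the CDF column in the table above):

exact_chisq_null <- function(N, M = 7) {
  days <- expand.grid(rep(list(1:M), N))          # all M^N equally likely assignments
  E <- N / M
  stat <- apply(days, 1, function(d) sum((tabulate(d, nbins = M) - E)^2 / E))
  cumsum(table(round(stat, 2)) / M^N)             # CDF of the exact chi-square statistic
}
exact_chisq_null(3)
#     4  8.67    18
# 0.612 0.980 1.000   (matches the table above)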

+ +

The image below shows the CDFs for the exact $\chi^2$ distribution with $5$ crimes in $7$ weekday bins (in light grey lines), CDFs for different alternative hypotheses in dark grey lines, and the critical value $\chi^2$ highlighted with a red guideline. The alternative hypotheses are for differing numbers of days that are equi-probable for the crimes to occur on for $1$ to $4$ days during the week.

+ +

+ +

You can see from this chart that even for an alternative of equi-probable chances over three days, with just 5 crimes the power is just under 40%: 1 minus the CDF of the alternative hypothesis where it intersects the critical value. (Earlier I wrote that a tie goes to rejecting the null, but that would be incorrect, as the Type 1 error would be inflated to .39 instead of .02 in my 3-crimes example.)

+ +
+ +

I have a paper going through this same analysis and generating critical values for Kuiper's $V$ and the $\chi^2$ test now posted on SSRN, Testing for Randomness in Day of Week Crime Sprees with Small Samples.

+",2013-11-05 14:59:31.733 +58902,17573.0,2,,58622.0,,,,CC BY-SA 3.0,"

You asked for an algebraic answer to 1, and it is given below. However, a non-algebraic answer would be much better, so that is given first.

+ +

The usual technique used to run linear regressions is called Ordinary Least Squares (OLS). The formula you give above is the formula for the OLS estimator. However, the formula you give above is not the definition of the OLS estimator. The definition of the OLS estimator is ""that coefficient vector, $\hat{\beta}$, which minimizes the sum of squared residuals."" First, verbally . . .

+ +

Define the number of observations as $N$, so that each of the $\delta$ and $\alpha$ models have $N$ observations, and the $\beta$ model has $2N$. Consider the $\delta$ and $\alpha$ which solve the first two OLS problems you pose above, and ask, do they (transformed as you have transformed them) solve the third, pooled, $\beta$, OLS problem? Well, think about that third problem's objective function $\sum_{i=1}^{2N} (Y_i-\hat{Y}_i)^2$. We know that $\hat{\alpha}$ minimizes the first $N$ terms (definition of OLS) and has no effect on the last $N$ terms (by inspection). We know that $\hat{\delta}$ minimizes the last $N$ terms (definition of OLS) and has no effect on the first $N$ terms (by inspection). Last, there are ""enough"" $\beta$ such that the $\alpha$ and $\delta$ may be adjusted independently in the third problem. That is essentially a proof that the OLS estimators have to be exactly the same (again up to your transformation). The proof is not completely formal and explicit, but it's not hard from here to make it so.

+ +

To your question 2, yes this result is true much more generally. For any estimator defined by an optimization problem (any ""M Estimator,"" like maximum likelihood for example), there is a result like this. The key requirements are that the parameter spaces of the pooled and separate models are ""the same"" up to an invertible transformation (like the one you gave to transform $\alpha$ and $\delta$ to $\beta$) and that the objective function of the pooled model can be decomposed (linearly or multiplicatively) into non-interacting parts corresponding (up to an increasing transformation) to the objective functions of the separate problems. This is a lot of models.

+ +

The kind of argument I give above is incredibly useful when you are using any kind of M-estimator. For example, it is a famous result for OLS that if you re-scale (change the units of) one of the $X$ variables, that its coefficient will be rescaled by OLS in an exactly offsetting way and that nothing else about the OLS estimator will change. This can be proved by a fairly tedious algebra exercise or by a very brief optimization argument like the one I gave above. The result is not just true for OLS, though. Any M estimator which has the $X$ multiplied against a coefficient will have this magic re-scaling property as well.

+ +

Now, to the algebraic demonstration. Changing your notation a little for clarity, start by comparing estimating these two equations by OLS: +\begin{align} +Y_1 &= X_1\delta + \epsilon_1 \\ +Y_2 &= X_1\alpha + \epsilon_2 +\end{align} +With estimating this equation by OLS: +\begin{align} +\left(\begin{array}{r} Y_1 \\ Y_2 \end{array} \right) &= +\left[\begin{array}{r r} X_1 & 0 \\ X_1 & X_1 \end{array} +\right] +\left(\begin{array}{r} \gamma_1 \\ \gamma_2 \end{array} +\right) ++ \left(\begin{array}{r} \epsilon_1 \\ \epsilon_2 \end{array} +\right) \\ \strut \\ +Y &= X\gamma + \epsilon +\end{align} +My $\gamma_1$ is your $\beta_0$ and $\beta_1$ stacked up and etc. +\begin{align} +\hat{\gamma} &= (X'X)^{-1}X'Y \\ \strut \\ + &= +\left[\begin{array}{r r} X_1'X_1 & 0 \\ 2X_1'X_1 & X_1'X_1 \end{array} +\right]^{-1} +\left(\begin{array}{r} X_1'Y_1 \\ X_1'Y_1 + X_1'Y_2 \end{array} +\right) \\ \strut \\ +&= +\left[\begin{array}{r r} (X_1'X_1)^{-1} & 0 \\ -2(X_1'X_1)^{-1} & (X_1'X_1)^{-1} \end{array} +\right] +\left(\begin{array}{r} X_1'Y_1 \\ X_1'Y_1 + X_1'Y_2 \end{array} +\right) \\ \strut \\ +\left( +\begin{array}{r} \hat{\gamma}_1 \\ \hat{\gamma}_2 \end{array} +\right) +&= \left( +\begin{array}{r} (X_1'X_1)^{-1}X_1'Y_1 \\ +-2(X_1'X_1)^{-1}X_1'Y_1+(X_1'X_1)^{-1}X_1'Y_1 + (X_1'X_1)^{-1}X_1'Y_2 \end{array} +\right) \\ \strut \\ +\left( +\begin{array}{r} \hat{\gamma}_1 \\ \hat{\gamma}_2 \end{array} +\right) +&= \left( +\begin{array}{r} \hat{\delta}_1 \\ \hat{\alpha}_1 - \hat{\delta}_1 \end{array} +\right) +\end{align}

+ +

That's exactly what you got.
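If you want to see the equivalence numerically, here is a quick R illustration using the usual dummy-plus-interaction parameterisation of the pooled model (simulated data, arbitrary coefficients), which corresponds to the $\gamma$ mapping above:

set.seed(1)
n <- 50
x <- rnorm(2 * n)
g <- rep(0:1, each = n)                              # g = 0: "delta" group, g = 1: "alpha" group
y <- ifelse(g == 0, 1 + 2 * x, 3 - x) + rnorm(2 * n)

sep0 <- coef(lm(y ~ x, subset = g == 0))             # delta-hat
sep1 <- coef(lm(y ~ x, subset = g == 1))             # alpha-hat
pooled <- coef(lm(y ~ x * g))                        # gamma-hat: (delta, alpha - delta)

rbind(separate = c(sep0, sep1 - sep0), pooled = pooled)   # identical rows (up to rounding)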

+",2013-11-05 15:36:42.763 +58903,23385.0,1,,,,GPML gives too large length scale when optimising hyperparameters,,CC BY-SA 3.0,"

I recently started trying to apply Gaussian process regression to a problem, using the MATLAB GPML toolbox. The problem has five (or more) input variables, but for now I'm just looking at one of them.

+ +

With manually selected hyperparameters, I get reasonable-looking output, but my problem is that when I minimize() the hyperparameters, I get a very large length scale, thus producing a flat mean and covariance function.

+ +

If I run the following code, hyp2.cov = [5.4, 1.1] after the call to minimize, so the length scale is exp(5.4)=214, even though the data points are on the interval [-0.14, 0.06].

+ +

Could anyone tell me why this is so?

+ +
x=[-0.0350; 0.0550;-0.0830;-0.1360; 0.0190];
+y=[3.0700;3.3200;3.0400;3.0600;2.9200];
+covfunc = @covSEiso; hyp2.cov = [0; 0]; hyp2.lik = log(0.1);
+likfunc = @likGauss; sn = 0.1; hyp2.lik = log(sn);
+hyp2 = minimize(hyp2, @gp, -100, @infExact, [], covfunc, likfunc, x, y);
+
+",2013-11-05 15:56:56.647 +58904,23389.0,1,,,,Combining results with different confidence levels,,CC BY-SA 3.0,"

I have an experiment where I toss a biased coin for which, with a confidence interval of [0.38, 0.42] at confidence level 0.95, I will get heads. If I get heads, then I throw a biased 6-sided die which gives me a 1 with a probability in the interval [0.48, 0.52] at confidence level 0.90.

+ +

What is the probability of getting a 1, and what is my confidence level for that result? If there were no intervals and no confidence levels, the problem would be trivial and could be solved with simple multiplication, but I am pretty sure simple multiplication will not be the solution in the interval case.

+",2013-11-05 16:51:39.300 +58905,14965.0,1,58910.0,,,Entropy and Likelihood Relationship,,CC BY-SA 3.0,"

This is a theoretical question.

+ +

Suppose that I have a sample s1 coming from distribution K and a sample s2 coming from distribution M, but I don't know what K or M are. I hypothesize that s1 and s2 come from distribution T. I plug s1 and s2 into the pdf of T and calculate their likelihoods, say L(s1) and L(s2). If I know the differential entropies of K and M, H(K) and H(M) respectively, can I find a relation between L(s1) and L(s2) in terms of H(K) and H(M)? Basically, what I am trying to show is something along the lines of this:

+ +

$\frac {L(s1)}{L(s2)} \sim \frac {e^{H(K)}} {e^{H(M)}}$

+ +

Is this even meaningful?

+ +

Thanks

+",2013-11-05 17:08:48.903 +58906,3580.0,1,,,,Lasso-ing the order of a lag?,,CC BY-SA 3.0,"

Suppose I have longitudinal data of the form $\mathbf Y = (Y_1, \ldots, Y_J) \sim \mathcal N(\mu, \Sigma)$ (I have multiple observations, this is just the form of a single one). I'm interested in restrictions on $\Sigma$. An unrestricted $\Sigma$ is equivalent to taking +$$ +Y_j = \alpha_j + \sum_{\ell = 1} ^ {j - 1} \phi_{\ell j} Y_{j-\ell} + \varepsilon_j +$$ +with $\varepsilon_j \sim N(0, \sigma_j)$.

+ +

This is typically not done since it requires estimating $O(J^2)$ covariance parameters. A model is ""lag-$k$"" if we take +$$ +Y_j = \alpha_j + \sum_{\ell = 1} ^ k \phi_{\ell j} Y_{j - \ell} + \varepsilon_j, +$$ +i.e. we only use the preceding $k$ terms to predict $Y_j$ from the history.

+ +

What I'd really like to do is use some kind of shrinkage idea to zero out some of the $\phi_{\ell j}$, like the LASSO. But the thing is, I also would like the method I use to prefer models which are lag-$k$ for some $k$; I'd like to penalize higher order lags more than lower order lags. I think this is something we would particularly like to do given that the predictors are highly correlated.

+ +

An additional issue is that if (say) $\phi_{35}$ is shrunk to $0$ I would also like it if $\phi_{36}$ is shrunk to $0$, i.e. the same lag is used in all of the conditional distributions.

+ +

I could speculate on this, but I don't want to reinvent the wheel. Are there any LASSO techniques designed to get at this sort of problem? Am I better off just doing something else entirely, like stepwise inclusion of lag orders? Since my model space is small, I could even use an $L_0$ penalty on this problem, I guess?
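Not a full answer, but one simple thing in this direction is a weighted lasso in which the penalty weight grows with the lag order, e.g. via glmnet's penalty.factor argument. The sketch below (simulated placeholder data, arbitrary weights) penalises higher-order lags more heavily, but it does not enforce the ""same lag in every conditional"" structure, which would need a group-type penalty:

library(glmnet)
set.seed(1)
n <- 200; J <- 10
Y <- matrix(rnorm(n * J), n, J)            # placeholder longitudinal data

j <- J                                     # model the last time point given its history
X <- Y[, (j - 1):1]                        # columns ordered as lag 1, lag 2, ..., lag j-1
w <- seq_len(ncol(X))^2                    # penalty weights increasing in the lag order

fit <- cv.glmnet(X, Y[, j], penalty.factor = w)
coef(fit, s = "lambda.min")                # higher lags are shrunk to zero more readily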

+",2013-11-05 17:14:23.857 +58907,16174.0,2,,58882.0,,,,CC BY-SA 3.0,"

This is a more a partial practical answer, but it works for me to do some exercises before getting deeply into theory.

+ +

This ats.ucla.edu link is a reference that might help in beginning to understand multinomial logistic regression (as pointed out by Bill) in a more practical way.
+It presents reproducible code for understanding the multinom function from the nnet package in R and also gives a brief guide to interpreting the output.

+ +

Consider this code:

+ +
va = c('cat','dog','dog','goat','cat','goat','dog','dog') 
+     # cat will be the outcome baseline
+vb = c(1,2,1,2,1,2,1,2)
+vc = c('blue','red','blue','red','red','blue','yellow','yellow') 
+     # blue will be the vc predictor baseline
+set.seed(12)
+vd = round(rnorm(8),2)
+
+data = data.frame(cbind(va,vb,vc,vd))
+
+library(nnet)
+fit <- multinom(va ~ as.numeric(vb) + vc + as.numeric(vd), data=data)
+
+# weights:  18 (10 variable)
+initial  value 8.788898 
+iter  10 value 0.213098
+iter  20 value 0.000278
+final  value 0.000070 
+converged
+
+fit
+
+Call:
+multinom(formula = va ~ as.numeric(vb) + vc + as.numeric(vd), 
+    data = data)
+
+Coefficients:
+     (Intercept) as.numeric(vb)     vcred  vcyellow as.numeric(vd)
+dog    -1.044866       120.3495 -6.705314  77.41661      -21.97069
+goat   47.493155       126.4840 49.856414 -41.46955      -47.72585
+
+Residual Deviance: 0.0001656705 
+AIC: 20.00017 
+
+ +

This is how you can interpret the log-linear fitted multinomial logistic model:

+ +

\begin{align} \ln\left(\frac{P(va={\rm dog})}{P(va={\rm cat})}\right) &= b_{10} + b_{11}vb + b_{12}(vc={\rm red}) + b_{13}(vc={\rm yellow}) + b_{14}vd \\ &\ \\ \ln\left(\frac{P(va={\rm goat})}{P(va={\rm cat})}\right) &= b_{20} + b_{21}vb + b_{22}(vc={\rm red}) + b_{23}(vc={\rm yellow}) + b_{24}vd \end{align}

+ +

Here is an excerpt about how the model parameters can be interpreted:

+ +
+
    +
  • A one-unit increase in the variable vd is associated with a decrease in the log odds of being ""dog"" vs. ""cat"" by 21.97069 ($b_{14} = -21.97069$).
  • +
+
+ +

The same logic applies to the second line, but considering ""goat"" vs. ""cat"", with $b_{24} = -47.72585$.

+ +
+
    +
  • The log odds of being ""dog"" vs. ""cat"" will decrease by 6.705314 when moving from vc=""blue"" to vc=""red"" ($b_{12} = -6.705314$).
  • +
+
+ +

.....

+ +

There is much more in the article, but I thought this part to be the core.

+ +
+ +

Reference:

+ +

R Data Analysis Examples: Multinomial Logistic Regression. UCLA: Statistical Consulting Group.
+from http://www.ats.ucla.edu/stat/r/dae/mlogit.htm (accessed November 05, 2013).

+",2013-11-05 17:14:53.610 +58908,,2,,58836.0,user31668,,,CC BY-SA 3.0,"

Based on the OP's comment that the deseasonalized time series is a linear trend (t is a true predictor), you will want either the prediction interval for linear regression (if you are trying to predict one time period ahead) or a tolerance interval (if you are trying to capture a specific proportion of future measurements).

+ +

If the residuals from your linear fit to the deseasonalized data are approximately normal, then there are nice closed-form formulas, as you will see in the above links. Also, see this other CrossValidated post.

+ +

You would then re-seasonalize these intervals/bands to get your actual forecasts.

+",2013-11-05 17:15:42.367 +58954,1145.0,2,,58913.0,,,,CC BY-SA 3.0,"

You are correct in assuming that you can't (shouldn't, really) analyse the data with the controls having zero variance.

+ +

It sounds like you should consider using a two-way ANOVA on the raw data, with the within-day variance accounted for in the manner of a paired test. I wrote about the approach in this paper, which is intended for pharmacologists with little statistical background: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2042947/

+",2013-11-06 05:06:59.730 +58909,2149.0,2,,58892.0,,,,CC BY-SA 3.0,"

Standard notation for identifying a Transfer Function includes r, s, and b:

+ +

r is the number of denominator coefficients,
+s is the number of numerator coefficients, and
+b is the pure delay or dead time before y responds to x.
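
+ +

To make the notation concrete, here is a minimal R sketch of my own (not AUTOBOX output, and all parameter values are made up) with s = 1 numerator coefficient, r = 1 denominator coefficient, and a dead time of b = 2 periods:

+ +
set.seed(1)
+n <- 200; b <- 2                          # dead time: y responds to x after 2 periods
+omega <- 2; delta <- 0.6                  # numerator (s) and denominator (r) coefficients
+x <- as.numeric(arima.sim(list(ar = 0.5), n = n + b))
+v <- stats::filter(omega * x, filter = delta, method = ""recursive"")  # applies omega/(1 - delta*B) to x
+y <- v[1:n] + rnorm(n, sd = 0.5)          # y_t is driven by x up to time t - b
+x_obs <- x[(b + 1):(n + b)]               # the input series aligned with y
+ccf(x_obs, y)                             # cross-correlations peak at lag -b
+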

+ +

AUTOBOX, a piece of software that I have helped develop, does this (at the user's option) automatically, or a user can specify the model themselves. You can freely download a 30-day version from http://www.autobox.com and play with it, comparing your model identification with its suggestion. It is a great (free) way of learning how to identify this class of model.

+",2013-11-05 17:28:00.947 +58910,,2,,58905.0,user31668,,,CC BY-SA 3.0,"

It is known that maximizing the likelihood of the data is equivalent to minimizing the Kullback-Leibler distance between the hypothesized distribution family, T, and K or M. The Kullback-Leibler distance is related to entropy, but it's not as simple as your assumed proportional model. See also link.

+",2013-11-05 17:38:49.307 +58911,261.0,2,,8529.0,,,,CC BY-SA 3.0,"

Jim Berger's review articles: http://www.stat.duke.edu/~berger/papers.html

+ +

You might start with Could Fisher, Jeffreys and Neyman have agreed upon testing?

+",2013-11-05 17:52:09.723 +58912,,2,,58904.0,user31668,,,CC BY-SA 4.0,"

A confidence interval is a "post-data" interval estimate that is supposed to bracket the true parameter in C% of samples. What you appear to be trying to do is to predict a future event. For this, you need a little more structure on your problem. In particular, a confidence interval is insufficient. What you really need is a distribution of possible head probabilities and probabilities of rolling 1. Then, you need to calculate the probability as follows:

+

Let $C$ be the outcome of the coin toss and $X$ be the result of the die roll, $f_H(p)$ is the density function on the probability of heads (i.e., $p$), and $f_{1|C=H}(q)$ is the density function for the probability of rolling a 1 (i.e., $q$) given that you got a head. Therefore,

+

$P(X=1)=E[1_{C=H}1_{X=1|C=H}]=E[1_{C=H}]E[1_{X=1|C=H}]$ where $1_{C=H}$ and $1_{X=1|C=H}$ are indicator functions that take value 1 when the event in subscript happens, and 0 otherwise. Conditional independence between the coin toss and die roll allow you to multiply expected values.

+

Now, $E[1_{C=H}]E[1_{X=1|C=H}]=\int\limits_0^1 \int\limits_0^1pf_H(p)qf_{1|C=H}(q)dqdp$. In other words, you multiply the expected values of the distributions on P(heads) and P(X=1|Heads). So, you will need more info to solve your problem as formulated.

+

If you have the underlying data that produced each CI, then you can use methods from Bayesian prediction or predictive likelihood.

+",2013-11-05 18:05:17.900 +58913,23394.0,1,,,,Statistical test for normalised data,,CC BY-SA 3.0,"

I am working with cultured cells, where one dish has been transfected with a scrambled knockdown clone and two dishes have been transfected with two knockdown clones, each knocking down the expression of a single gene.

+ +

An example of an experiment I have performed is to measure the mitochondrial membrane potential (using a fluorescent dye) in these cells using a confocal microscope.
+This experiment was repeated on three independent occasions.

+ +

On each experimental day, the intensity of the laser which I used (the laser ""gain"") varies; therefore, I cannot combine all experimental days without expressing the dye intensity of each knockdown clone as a percent of the control ""scrambled"" clone (e.g. control = 100% mean intensity; knockdown clone 1 = 50% mean intensity).

+ +

Therefore, I need to test for a difference in means between my control scrambled clone and each of the knockdown clones, where my control scrambled clone is set to 100% dye intensity on each experimental day and my knockdown clones are normalised to this control. As a result, my control has no variance (100% for all three experimental days) while my knockdown clones do have variance.

+ +

I know an ANOVA would not be feasible given the difference in variance. I will look into the procedure suggested by Michael Lew, but would a t-test be unacceptable as well? (I have seen papers using ANOVA and t-tests in these circumstances, but in spite of this I am assuming these should not be used). Thanks in advance.

+",2013-11-05 18:28:38.613 +58914,18690.0,1,,,,Variable selection (automated),,CC BY-SA 3.0,"

I was wondering whether the following mechanical selection procedure will result in a possible bias. First let me introduce the procedure: we start with a model and only look at the t-values, possibly correcting them (for heteroskedasticity / autocorrelation). We then only add variables into our final model that are significant. However, I am well aware this gives us a bias with respect to the F-test, since even though some variables can be individually insignificant, they can be jointly significant.

+ +

However, suppose we also take that into account: we check whether adding that variable gives us a ""sensible"" subset to run an F-test on, and only add variables if they give a significant result (indicating joint significance). Would this then actually be a pretty good automated method, or does it introduce some bias? Will anything happen that we would not like?

+ +

Furthermore, is forward variable selection (starting with a small set and increasing it) better, or is backward variable selection better?

+ +

Thanks for your answer!

+",2013-11-05 18:32:49.877 +58915,14811.0,1,,,,Regression Tree with nested factors,,CC BY-SA 3.0,"

I am working on a prediction model in which I have several factor variables that have many levels. These factor variables have a nested structure, in the form of a Category, a Sub-Category, and a Sub-Sub-Category. For example, suppose that I had one field that was the type of device a user browsed the website with (pc, tablet, phone), which can then be sub-segmented into ((Apple, windows, linux), (kindle, iOS, android), (windows, iOS, android, RIM)), and each of those could then be subdivided into version numbers.

+ +

Is there a standard way of handling nested features like this in tree models? At an intuitive level, I don't think the tree should be splitting on one of the subgroupings until it has first split on one of the major groups (since a windows phone and a windows PC are quite different). Creating a single factor that describes the full tree path would have too many possible levels.

+",2013-11-05 18:43:11.333 +58916,23397.0,1,,,,Computing probability of completing a task composed of independent events,,CC BY-SA 3.0,"

This is a general question. I have a task that is composed of 3 independent events: A, B, and C. All are mutually exclusive and don't happen at the same time; first A, then B, then C. I know the probability of completing each event with respect to time (I have the pdfs). How can I calculate the probability of completing the task at hand with respect to time?

+",2013-11-05 19:20:36.173 +58917,16551.0,1,58930.0,,,Hazard Function - Survival Analysis,,CC BY-SA 3.0,"

I just started taking a survival analysis class and I'm stumped on this question.

+ +

Let $T_1,\ldots,T_n$ be independent continuous random variables with hazard functions $h_1(t),\ldots,h_n(t)$, and let $T=\min(T_1,\ldots,T_n)$.
+We need to show that the hazard function of $T$ is $\sum_j h_j(t)$.

+ +

Any help or direction are welcome :)

+",2013-11-05 19:23:03.373 +58918,5448.0,2,,58375.0,,,,CC BY-SA 3.0,"

You can do this with the lm and associated functions, but you need to be a little careful about how you construct your weights.

+ +

Here's an example / walkthrough. Note that the weights are normalized so that the average weight = 1. I'll follow with what happens if they aren't normalized. I've deleted a lot of the less relevant printout associated with various functions.

+ +
x <- rnorm(1000)
+y <- x + rnorm(1000)
+wts <- rev(0.998^(0:999)) # Weights go from 0.135 to 1
+wts <- wts / mean(wts)    # Now we normalize to mean 1
+> summary(unwtd_lm <- lm(y~x))
+
+          Estimate Std. Error t value Pr(>|t|)    
+(Intercept)  0.04238    0.031    ---
+x            1.03071    0.03268  31.539   <2e-16 ***
+Residual standard error: 1.01 on 998 degrees of freedom
+
+> summary(wtd_lm <- lm(y~x, weights=wts))
+
+            Estimate Std. Error t value Pr(>|t|)    
+(Intercept)  0.03436    0.03227   1.065    0.287    
+x            1.03869    0.03295  31.524   <2e-16 ***
+Residual standard error: 1.02 on 998 degrees of freedom
+
+ +

You can see that with this much data we don't have much difference between the two estimates, but there is some.

+ +

Now for your question. It's not clear whether you want the distance in standard errors where the standard errors are for fitted values or for prediction, so I'll show both. Let us say we are doing this for the value $x = 1$ and the target value (green dot) $y = 1.1$:

+ +
> y_eval <- 1.10
+> wtd_pred <- predict(wtd_lm, newdata=data.frame(x=1), se.fit=TRUE)
+> # Distance relative to predictive std. error
+> (y_eval-wtd_pred$fit[1]) / sqrt(wtd_pred$se.fit^2 + wtd_pred$residual.scale^2)
+[1] 0.02639818
+> 
+> # Distance relative to fitted std. error
+> (y_eval-wtd_pred$fit[1]) / wtd_pred$se.fit
+[1] 0.5945089
+
+ +

where I've deleted the warning message associated with predictive confidence intervals and weighted model fits.

+ +

Now I'll show you how to do the residual variance calculation. First, if your weights aren't normalized, you will have problems:

+ +
> wts <- rev(0.998^(0:999))
+> summary(wtd_lm <- lm(y~x, weights=wts))
+
+            Estimate Std. Error t value Pr(>|t|)    
+(Intercept)  0.03436    0.03227   1.065    0.287    
+x            1.03869    0.03295  31.524   <2e-16 ***
+Residual standard error: 0.6707 on 998 degrees of freedom
+
+> predict(wtd_lm, newdata=data.frame(x=1), interval=""prediction"")
+       fit        lwr      upr
+1 1.073049 -0.2461643 2.392262
+
+ +

Note how that residual standard error has gone way down and the prediction confidence interval has really changed, but the coefficient estimates themselves have not. This is because the calculation for the residual s.e. divides by the residual degrees of freedom (998 in this case) without regard for the scale of the weights. Here's the calculation, mostly lifted from the interior of summary.lm:

+ +
w <- wtd_lm$weights
+r <- wtd_lm$residuals
+rss <- sum(w * r^2)
+sqrt(rss / wtd_lm$df)
+[1] 0.6707338
+
+ +

which you can see matches the residual s.e. in the previous printout.

+ +

Here's how you ought to do this calculation if you find yourself in a position where you need to do it by hand, so to speak:

+ +
> rss_w <- sum(w*r^2)/mean(w)
+> sqrt(rss_w / wtd_lm$df)
+[1] 1.019937
+
+ +

However, normalizing the weights up front takes care of the need to divide by mean(w) and the various lm-related calculations come out correctly without any further manual intervention.

+",2013-11-05 19:33:01.153 +58919,16551.0,1,,,,Trying to prove Kaplan-Meier statistic with no censoring reduces to empirical survival function,,CC BY-SA 4.0,"

I just started taking a survival analysis class and I'm stumped on this question.

+

We need to show that when there are no censored observations, $\hat{S}(t)=\prod_{t_{(i)}\le t}\frac{n_i-d_i}{n_i}$ equals the empirical survival function $S_n(t)=\frac{\# \{ t_{(i)} \ge t \}}{n}.$

+

It looks pretty obvious to me but I'm not sure how to approach this.

+",2013-11-05 19:38:25.167 +58920,21762.0,2,,58893.0,,,,CC BY-SA 3.0,"

You could present relative frequencies of people found in the social network, i.e. ""value over pop""

+ +
 type Percent
+    1  0.0473
+    2  0.0246
+    3  0.0113
+    4  0.0118
+
+ +

and just compare these percentages.

+ +

+ +

As the numbers and also the barplot show, the relative frequencies of people found in the districts vary quite a bit, i.e. not all districts are equally represented in the social network.

+ +

I doubt whether it is useful to use methods from inductive statistics here because your data set does not seem to be a random sample from a population. Should my impression be wrong, then you could either think of adding binomial confidence intervals to each of those percentages and/or run a chi-squared goodness-of-fit test using the population distribution as the reference.

+ +

In R:

+ +
N <- c(2248360, 3544721, 70934, 2090647)
+n <- c(1064, 873, 8, 246)
+chisq.test(n, p = N/sum(N))
+
+# Output
+        Chi-squared test for given probabilities
+
+data:  n
+X-squared = 526.0491, df = 3, p-value < 2.2e-16
+
+ +

At the 5% level, you could reject the null hypothesis that all districts are equally represented in the social network.
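
+ +

A minimal sketch (my own addition) of the confidence-interval option mentioned above, reusing the n and N defined in the code:

+ +
ci <- sapply(n, function(k) binom.test(k, sum(n))$conf.int)   # exact 95% CI for each district's share of the sample
+rbind(lower = ci[1, ], upper = ci[2, ], population_share = N / sum(N))
+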

+",2013-11-05 20:05:58.143 +58921,10494.0,1,58987.0,,,Can I compare models from linear regression and nonlinear regression using RMSE?,,CC BY-SA 3.0,"

I am comparing multiple published equation forms, refit with independent data. I'm trying to be true to the original authors' methods as much as possible. Therefore, I have 3 linear equations (fit in R using lm()), two of which use transformed Y-variables, and one equation fit using nonlinear regression (fit in R using the gnls() function).

+ +

In all cases I'm weighting the residual variance structure using the inverse of one of the predictors to account for observed heteroskedasticity.

+ +

I have been evaluating the models using $R^2$ and RMSE, using back-transformed data for the two models with transformations.

+ +

I've calculated RMSE ""by hand"" using the following equation:

+ +
 RMSE <- sqrt(sum(residuals(Equation)^2) / (length(residuals(Equation)) - 2))
+
+ +

Should I use similar code to calculate RMSE for the linear and nonlinear regression models? Is the metric still a valid statistic for comparison, or am I missing some important assumption?

+ +

Edited: I initially stated that I was also comparing models using AIC; I later recalled that AIC would not be appropriate if the Y-variables were transformed because the models would be estimating different things.

+",2013-11-05 20:27:18.590 +58922,4910.0,2,,58770.0,,,,CC BY-SA 3.0,"

So this is a Monte Carlo solution; that is, we are going to simulate drawing the tiles a zillion times and then calculate how many of these simulated draws resulted in us being able to form the given word. I've written the solution in R, but you could use any other programming language, say Python or Ruby.

+ +

I'm first going to describe how to simulate one draw. First let's define the tile frequencies.

+ +
# The tile frequency used in English Scrabble, using ""_"" for blank.
+tile_freq <- c(2, 9 ,2 ,2 ,4 ,12,2 ,3 ,2 ,9 ,1 ,1 ,4 ,2 ,6 ,8 ,2 ,1 ,6 ,4 ,6 ,4 ,2 ,2 ,1 ,2 ,1)
+tile_names <- as.factor(c(""_"", letters))
+tiles <- rep(tile_names, tile_freq)
+## [1] _ _ a a a a a a a a a b b c c d d d d e e e e e e
+## [26] e e e e e e f f g g g h h i i i i i i i i i j k l
+## [51] l l l m m n n n n n n o o o o o o o o p p q r r r
+## [76] r r r s s s s t t t t t t u u u u v v w w x y y z
+## 27 Levels: _ a b c d e f g h i j k l m n o p q r ... z
+
+ +

Then encode the word as a vector of letter counts.

+ +
word <- ""boot""
+# A vector of the counts of the letters in the word
+word_vector <- table( factor(strsplit(word, """")[[1]], levels=tile_names))
+## _ a b c d e f g h i j k l m n o p q r s t u v w x y z 
+## 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 1 0 0 0 0 0 0 
+
+ +

Now draw a sample of seven tiles and encode them in the same way as the word.

+ +
tile_sample <- table(sample(tiles, size=7))
+## _ a b c d e f g h i j k l m n o p q r s t u v w x y z 
+## 1 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0 1 0 0 0 
+
+ +

At last, calculate what letters are missing...

+ +
missing <- word_vector - tile_sample
+missing <- ifelse(missing < 0, 0, missing)
+## _ a b c d e f g h i j k l m n o p q r s t u v w x y z 
+## 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 
+
+ +

... and sum the number of missing letters and subtract the number of available blanks. If the result is zero or less we succeeded in spelling the word.

+ +
sum(missing) - tile_sample[""_""] <= 0
+## FALSE
+
+ +

In this particular case we didn't though... Now we just need to repeat this many times and calculate the percentage of successful draws. All this is done by the following R function:

+ +
word_prob <- function(word, reps = 50000) {
+  tile_freq <- c(2, 9 ,2 ,2 ,4 ,12,2 ,3 ,2 ,9 ,1 ,1 ,4 ,2 ,6 ,8 ,2 ,1 ,6 ,4 ,6 ,4 ,2 ,2 ,1 ,2 ,1)
+  tile_names <- as.factor(c(""_"", letters))
+  tiles <- rep(tile_names, tile_freq)
+  word_vector <- table( factor(strsplit(word, """")[[1]], levels=tile_names))
+  successful_draws <- replicate(reps, {
+    tile_sample <- table(sample(tiles, size=7))
+    missing <- word_vector - tile_sample
+    missing <- ifelse(missing < 0, 0, missing)
+    sum(missing) - tile_sample[""_""] <= 0
+  })
+  mean(successful_draws)
+}
+
+ +

Here reps is the number of simulated draws. Now we can try it out on a number of different words.

+ +
> word_prob(""boot"")
+[1] 0.0072
+> word_prob(""red"")
+[1] 0.07716
+> word_prob(""axe"")
+[1] 0.05088
+> word_prob(""zoology"")
+[1] 2e-05
+
+",2013-11-05 21:39:23.123 +58955,1145.0,2,,58929.0,,,,CC BY-SA 3.0,"

A probability distribution has to have a total area of unity. If the measure is continuous then there is an infinite number of values that it can take (i.e. an infinite number of values along the x-axis of the distribution). The only way that the total area of the probability distribution can be finite is for the probability at each of the infinite number of values to be zero. One divided by infinity.

+ +

In 'real life' there can be no measures that take an infinite number of values (by several different philosophical arguments that don't matter much here) so no value need take a probability of exactly zero. A useful practical argument is based on the finite precision of real-world measurements. If you use a stopwatch that measures to one tenth of a second, the train will have one tenth of a second in which to arrive in 'exactly' five minutes.

+",2013-11-06 05:22:10.843 +58923,23348.0,2,,58633.0,,,,CC BY-SA 3.0,"

A question about your data: Do students visit the advisers more than once? What I mean is, for the students who choose to go to the advisers, do some students visit their adviser once while others choose to visit 15-20 times? If this is the case, then I would advise against using the traditional propensity score method as laid out by Rosenbaum and Rubin, and instead look at the Generalized Propensity Score as discussed by Hirano and Imbens (2004) ""The Propensity Score with Continuous Treatments"".

+ +

Also, I feel the need to insert an obligatory ""correlation is not causation"" statement here because of your observation that ""Just looking at 2013 data alone makes it seem that the advisers make things worse: the students they see do worse, and are more likely to leave university."" Something I see a LOT in education data is that, whenever students are free to self-select into a treatment, the types of students who select into that treatment are very different than those who don't, and often there is huge variation even among the students that do select into treatment. This makes evaluating the impact of an optional treatment program very difficult!

+",2013-11-05 21:42:45.707 +58924,23400.0,1,,,,What statistic procedure to use for analyzing my data?,,CC BY-SA 3.0,"

I'm currently a fourth year university student. As part of my studies, I'm taking a class called Capstone, where students design and carry out a research project. An essential part of formulating this research is choosing a statistical procedure with which to analyze and present your results.

+ +

My study focuses on measuring the increase in middle school students' awareness of bullying.

+ +

To do that, we will have a group of students who will take an initial questionnaire which has multiple choice questions about different situations and what type of bullying they represent. After that test, the same students will be given a workshop where we will discuss bullying: the types that exist, how to recognize them, and the negative impact they can have. After those workshops, the students will take another test, consisting of exactly the same questions as the first.

+ +

The goal is that, by comparing the answers on both tests, we will find that students' answers on the second test correspond to a better identification and understanding of what bullying is.

+ +

My question then is: what type of statistical test would you recommend I use to sort and analyze the data I collect?

+",2013-11-05 21:59:38.740 +58925,23401.0,2,,35097.0,,,,CC BY-SA 3.0,"

I don't see any problem with the frequentist's approach. If the null hypothesis is rejected, the p-value is the probability of a type 1 error. A type 1 error is rejecting a true null hypothesis. In this case we have a p-value of 0.028. This means that among all the hypothesis tests with this p-value ever conducted, roughly 3 out of a hundred will reject a true null hypothesis. By construction, this would be one of those cases. Frequentists accept that sometimes they'll reject true null hypothesis or retain false null hypothesis (Type 2 errors), they've never claimed otherwise. Moreover, they precisely quantify the frequency of their erroneous inferences in the long run.

+ +

Perhaps a less confusing way of looking at this result is to exchange the roles of the hypotheses. Since the two hypotheses are simple, this is easy to do. If the null is that the sun went nova, then the p-value is 35/36=0.972. This means that there is no evidence against the hypothesis that the sun went nova, so we can't reject it based on this result. This seems more reasonable. If you are thinking: why would anybody assume the sun went nova? I would ask you: why would anybody carry out such an experiment if the very thought of the sun exploding seems ridiculous?

+ +

I think this just shows that one has to assess the usefulness of an experiment beforehand. This experiment, for example, would be completely useless because it tests something we already know simply from looking up to the sky (Which I'm sure produces a p-value that is effectively zero). Designing a good experiment is a requirement to produce good science. If your experiment is poorly designed, then no matter what statistical inference tool you use, your results are unlikely to be useful.

+",2013-11-05 22:06:15.713 +58926,15624.0,1,,,,modeling cumulative effect of a winter,,CC BY-SA 3.0,"

I am trying to make a model that predicts the survival rate of wildlife based on the severity of an entire winter. The idea is that colder weather in combination with lots of snow has a negative effect (the colder it is, the longer the snow lasts). It might be a cold winter with lots of snow, but if the cold and snow did not occur in the same month it would have less of an effect. In order to preserve the interaction of snow and temp, we want to use monthly data, but our survival will always be for the entire winter. It doesn't really matter which month the bad weather occurs in, so I am hesitant to use month as a variable.

+ +

Is there some technique that I can use where it preserves the interaction of snow and temp each month but uses the cumulative effect of all months in a winter to predict survival?

+ +

example of what the data looks like:

+ +
Year1 survival = 0.9
+month      temp     snow
+ m1         -5       20
+ m2        -15       20
+ m3        -20      100
+ m4          2      100
+
+Year2 survival = 0.7
+month     temp     snow
+ m1        -5       20
+ m2       -18      110
+ m3       -20      100
+ m4       -11       20
+
+ +

Temp was the average temperature for the month, and snow was the sum of all snow in a month. I'm not sure where the comment about order statistics is going, but if it makes a difference, daily data is available.

+",2013-11-05 22:36:04.513 +58927,668.0,2,,58770.0,,,,CC BY-SA 3.0,"

Answers to the referenced question apply here directly: create a dictionary consisting only of the target word (and its possible wildcard spellings), compute the chance that a random rack cannot form the target, and subtract that from $1$. This computation is fast.

+ +

Simulations (shown at the end) support the computed answers.

+ +
+ +

Details

+ +

As in the previous answer, Mathematica is used to perform the calculations.

+ +
    +
  1. Specify the problem: the word (or words, if you like), the letters, their counts, and the rack size. Because all letters not in the word act the same, it greatly speeds the computation to replace them all by a single symbol $\chi$ representing ""any letter not in the word.""

    + +
    word = {b, o, o, t};
    +letters = {b, o, t, \[Chi], \[Psi]};
    +tileCounts = {2, 8, 6, 82, 2};
    +rack = 7;
    +
  2. Create a dictionary of this word (or words) and augment it to include all possible wildcard spellings.

    + +
    dict[words_, nWild_Integer] := Module[{wildcard, w},
    +   wildcard = {xx___, _, yy___} -> {xx, \[Psi], yy};
    +   w = Nest[Flatten[ReplaceList[#, wildcard] & /@ #, 1] &, words, nWild];
    +   Union[Times @@@ Join[w, Times @@@ words]]];
    +dictionary = dict[{word}, 2]
    +
    + +
    +

    $\left\{b o^2 t, b o^2 \psi ,b o t \psi ,o^2 t \psi ,b o \psi ^2,o^2 \psi ^2,b t \psi ^2,o t \psi ^2\right\}$

    +
  3. Compute the nonwords:

    + +
    alphabet = Plus @@ letters;
    +nonwords = Nest[PolynomialMod[# alphabet, dictionary] &, 1, rack]
    +
    + +
    +

    $b^7 + 7 b^6 o + 21 b^5 o^2 + \cdots +7 \chi \psi ^6+\psi ^7$

    +
    + +

    (There are $185$ non-words in this case.)

  4. Compute the chances. For sampling with replacement, just substitute the tile counts for the variables:

    + +
    chances = (Transpose[{letters, tileCounts/(Plus @@ tileCounts)}] /. {a_, b_} -> a -> b);
    +q = nonwords /. chances;
    +1 - q
    +
    + +
    +

    $\frac{207263413}{39062500000}$

    +
    + +

    This value is approximately $0.00756036.$

    + +

    For sampling without replacement, use factorial powers instead of powers:

    + +
    multiplicities = MapThread[Rule, {letters, tileCounts}];
    +chance[m_] :=  (ReplaceRepeated[m , Power[xx_, n_] -> FactorialPower[xx, n]] 
    +               /. multiplicities);
    +histor = chance /@ MonomialList[nonwords];
+q0 = Plus @@ histor  / FactorialPower[Total[tileCounts], rack];
    +1 - q0
    +
    + +
    +

    $\frac{2381831}{333490850}$

    +
    + +

    This value is approximately $0.00714212.$ The calculations were practically instantaneous.

+ +
+ +

Simulation results

+ +

Results of $10^6$ iterations with replacement:

+ +
simulation = RandomChoice[tileCounts -> letters, {10^6, 7}];
+u = Tally[Times @@@ simulation];
+(p = Total[Cases[Join[{PolynomialMod[u[[All, 1]], dictionary]}\[Transpose], 
+       u, 2], {0, _, a_} :> a]] / Length[simulation] ) // N
+
+ +
+

$0.007438$

+
+ +

Compare it to the computed value relative to its standard error:

+ +
(p - (1 - q)) / Sqrt[q (1 - q) / Length[simulation]] // N
+
+ +
+

$-1.41259$

+
+ +

The agreement is fine, strongly supporting the computed result.

+ +

Results of $10^6$ iterations without replacement:

+ +
tilesAll = Flatten[MapThread[ConstantArray[#1, #2] &, {letters, tileCounts}]];
+simulation = Table[RandomSample[tilesAll, 7], {i, 1, 10^6}];
+u = Tally[Times @@@ simulation];
+(p0 = Total[Cases[Join[{PolynomialMod[u[[All, 1]], dictionary]}\[Transpose], 
+       u, 2], {0, _, a_} :> a]] / Length[simulation] ) // N
+
+ +
+

$0.00717$

+
+ +

Make the comparison:

+ +
(p0 - (1 - q0)) / Sqrt[q0 (1 - q0) / Length[simulation]] // N
+
+ +
+

$0.331106$

+
+ +

The agreement in this simulation was excellent.

+ +

The total time for simulation was $12$ seconds.

+",2013-11-05 22:42:57.193 +58928,17038.0,1,58933.0,,,Can the selling price be treated as a random variable?,,CC BY-SA 3.0,"

I have some data from my company that I have been looking at. I have been comparing deals that we have ""lost"" to deals that we have ""won"". I have been comparing the product averages and standard deviations, and have found that the average and standard deviation (for some industries) are lower for the ""lost"" deals.

+ +

We sell each product at set prices, but products can have special discounts when requested which causes ""random prices."" Some products will be sold higher or lower depending on the customer. When I look at the data, the prices are slightly negatively skewed but seem to be normally distributed.

+ +

Does this mean that I can treat the price as random and apply the central limit theorem? Also, I will be suggesting more aggressive lower pricing if I can treat the range as normally distributed. Is this logical?

+",2013-11-05 22:58:55.747 +58995,3999.0,1,59094.0,,,"What aspects of the ""Iris"" data set make it so successful as an example/teaching/test data set",,CC BY-SA 3.0,"

The ""Iris"" dataset is probably familiar to most people here - it's one of the canonical test data sets and a go-to example dataset for everything from data visualization to machine learning. For example, everyone in this question ended up using it for a discussion of scatterplots separated by treatment.

+ +

What makes the Iris data set so useful? Just that it was there first? If someone was trying to create a useful example/testing data set, what lessons could they take away from it?

+",2013-11-06 19:03:35.617 +58929,23406.0,1,58938.0,,,Probability that a continuous random variable assumes a fixed point,,CC BY-SA 4.0,"

I'm in an introductory statistics class in which the probability density function for continuous random variables has been defined as $P\left\{X\in B\right\}=\int_B f\left(x\right)dx$. I understand that $\int_a^a f(x)\,dx=0$, but I can't reconcile this with my intuition about a continuous random variable.

+

Say X is the random variable equal to the number of minutes from time t that the train arrives. How do I calculate the probability that the train arrives exactly 5 minutes from now? How can this probability be zero? Is it not possible? What if the train does arrive exactly 5 minutes from now, how could it occur if it had probability 0?

+",2013-11-05 23:21:57.263 +58930,20473.0,2,,58917.0,,,,CC BY-SA 3.0,"

Before obtaining the hazard function of $T=\min\{T_1,...,T_n\}$, let's first derive its distribution and its density function, i.e. the CDF and PDF of the first-order statistic from a sample of independently but not identically distributed random variables.

+ +

The distribution of the minimum of $n$ independent random variables is

+ +

$$F_T(t) = 1-\prod_{i=1}^n[1-F_i(t)]$$

+ +

(see the reasoning in this CV post, if you don't know it already)

+ +

We differentiate to obtain its density function:

+ +

$$f_T(t) =\frac {\partial}{\partial t}F_T(t) = f_1(t)\prod_{i\neq 1}[1-F_i(t)]+...+f_n(t)\prod_{i\neq n}[1-F_i(t)]$$

+ +

Using $h_i(t) = \frac {f_i(t)}{1-F_i(t)} \Rightarrow f_i(t) = h_i(t)(1-F_i(t))$ and substituting in $f_T(t)$ we have

+ +

$$f_T(t) = h_1(t)(1-F_1(t))\prod_{i\neq 1}[1-F_i(t)]+...+h_n(t)(1-F_n(t))\prod_{i\neq n}[1-F_i(t)]$$

+ +

$$=\left(\prod_{i=1}^n[1-F_i(t)]\right)\sum_{i=1}^nh_i(t),\;\;\; h_i(t) = \frac {f_i(t)}{1-F_i(t)} \tag{1}$$

+ +

which is the density function of the minimum of $n$ independent but not identically distributed random variables.

+ +

Then the hazard rate of $T$ is

+ +

$$h_T(t) = \frac {f_T(t)}{1-F_T(t)} = \frac {\left(\prod_{i=1}^n[1-F_i(t)]\right)\sum_{i=1}^nh_i(t)}{\prod_{i=1}^n[1-F_i(t)]} = \sum_{i=1}^nh_i(t) \tag{2}$$
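
+ +

As a quick sanity check of $(2)$ (my own addition), in the constant-hazard case $h_i(t)=\lambda_i$ each $T_i$ is exponential and the minimum should be exponential with rate $\sum_i\lambda_i$:

+ +
set.seed(1)
+rates <- c(0.5, 1.0, 1.5)
+Tmin <- pmin(rexp(1e5, rates[1]), rexp(1e5, rates[2]), rexp(1e5, rates[3]))
+c(empirical_mean = mean(Tmin), implied_by_summed_hazard = 1 / sum(rates))
+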

+",2013-11-05 23:30:31.687 +58931,9554.0,2,,58924.0,,,,CC BY-SA 3.0,"

I think you should do a simple pairwise difference comparison (before and after workshop) for each question separately.

+ +

Since you will probably use some Likert scale in your questionnaire (such as ""Strongly agree"", ""Agree"", etc.) your data will be ordinal.

+ +

You can use the Wilcoxon signed rank test to estimate whether there was a significant change in the responses for each question after the workshop.
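
+ +

In R, a minimal sketch with made-up ordinal scores for one question (my own illustration, not your data) would be:

+ +
pre  <- c(2, 3, 3, 4, 2, 3, 1, 4)     # made-up ordinal responses before the workshop
+post <- c(4, 3, 4, 5, 3, 4, 2, 4)     # the same students after the workshop
+wilcox.test(pre, post, paired = TRUE) # ties trigger a warning about exact p-values; that's fine
+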

+ +

I think any serious statistical package will support it. I'm sure you will have, for example, SPSS at school.

+ +

If you want to be able to claim that it was the workshop that caused the change, I would recommend you go a step further. Let the class fill out the questionnaire, then split the class into two parts randomly, and send only half of the class to the workshop. Then let the entire class repeat the questionnaire. The part of the class not taking your workshop will be your control group. You can check whether there is a significant difference even without the workshop.

+ +

(if the workshop offers some real additional value for the students, send the other half to the workshop after they have finished filling out the questionnaire a second time)

+",2013-11-05 23:47:22.403 +58932,20795.0,2,,58818.0,,,,CC BY-SA 3.0,"

@digdeep, as usual @whuber provided an excellent and comprehensive answer from a statistical viewpoint. I'm not trained in statistics, so take this response with a grain of salt. I have used the approach below on real-world data in my own practice, so I hope this is helpful.

+ +

I'll try to provide a non-statistician's view of transformation of time series data for ARIMA modeling. There is no straightforward answer. Since you are interested in knowing which transformation to use, it might be helpful to review why we do transformations. We do transformations for 3 main reasons, and there might be a ton of other reasons:

+ +
  1. Transformation makes the data's linear structure more usable for ARIMA modeling.
  2. If the variance in the data is increasing or changing, then transformation might help to stabilize it.
  3. Transformation also makes the errors/residuals in the ARIMA model normally distributed, which is a requirement in the Box-Jenkins ARIMA framework.

There are several data transformations, including Box-Cox, log, square root, quartic and inverse, plus the other transformations mentioned by @irishstat. As with all statistical methods, there is no good guidance/answer on which transformation to select for a particular dataset.

+ +

As the famous statistician G.E.P. Box said, ""All models are wrong but some are useful""; this applies to transformations as well: ""All transformations are wrong but some are useful"".

+ +

The best way to choose a transformation is to experiment. Since you have a long time series, I would hold out the last 12 - 24 months, build a model using each transformation, and see whether a particular transformation is helpful at predicting your out-of-sample data accurately. Also examine the residuals against the normality assumption of your model. Hopefully, this will guide you in choosing an appropriate transformation. You might also want to compare this with the non-transformed data and see if the transformation helped your model.

+ +

@whuber's excellent graphical representation of your data motivated me to explore this data graphically using a decomposition method. I might add that R has an excellent decomposition method called STL, which is helpful in identifying patterns that you would normally not notice. For a dataset like this, STL decomposition is helpful not only in selecting an appropriate method for analyzing your data, it might also help in identifying anomalies such as outliers, level shifts, or changes in seasonality. See below. Notice that in the remainder (irregular) component of the data it looks like there is stochastic seasonality and the variation is not random; there appears to be a pattern. See also the change in the level of the trend component after 2004/2005 that @whuber is referencing.

+ +

Hopefully this is helpful.

+ +
g <- stl(y,s.window = ""periodic"")
+plot(g)
+
+ +

+",2013-11-05 23:49:58.747 +58933,20473.0,2,,58928.0,,,,CC BY-SA 3.0,"

Prices cannot be normally distributed, since they are non-negative. But you write that actual prices fluctuate below or above the preset prices.

+ +

Denote by $p_s$ the preset price and by $u$ the special discount or increase above the preset price. Then the realized price is

+ +

$$p=p_s+u \Rightarrow u=p-p_s$$

+ +

Now $u$ is distributed around zero, but not necessarily symmetrically. There is no reason to ignore any skewness present in the data. Go for the Skew-normal Distribution, which has been conceived exactly for modelling skewness.
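
+ +

A minimal sketch of what such a distribution for $u$ can look like, assuming Azzalini's sn package (xi, omega and alpha are its location, scale and slant parameters; the values below are made up):

+ +
library(sn)   # assumes the sn package is installed
+u_grid <- seq(-30, 30, length.out = 200)
+plot(u_grid, dsn(u_grid, xi = 5, omega = 10, alpha = -3), type = ""l"",
+     xlab = ""u = realized price - preset price"", ylab = ""density"")
+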

+",2013-11-05 23:54:19.533 +58934,23407.0,1,,,,Prediction using Naive Bayes of klaR package fails,,CC BY-SA 3.0,"

I am trying to replicate an example that I found in Tom Mitchell's book Machine Learning (1997), using R. It is an example from chapter 6.

+ +

There are 14 training examples (shown below) of the target concept PlayTennis, where each day is described by the attributes Outlook, Temperature, Humidity, and Windy.

+ +

Training examples:

+ +
Outlook,Temperature,Humidity,Windy,Play
+overcast,cool,normal,true,yes
+overcast,hot,high,false,yes
+overcast,hot,normal,false,yes
+overcast,mild,high,true,yes
+rainy,cool,normal,false,yes
+rainy,mild,high,false,yes
+rainy,mild,normal,false,yes
+sunny,cool,normal,false,yes
+sunny,mild,normal,true,yes
+rainy,cool,normal,true,no
+rainy,mild,high,true,no
+sunny,hot,high,false,no
+sunny,hot,high,true,no
+sunny,mild,high,false,no
+
+ +

Here's my code:

+ +
library(""klaR"")
+library(""caret"")
+
+data = read.csv(""example.csv"")
+
+x = data[,-5]
+y = data$Play
+
+model = train(x,y,'nb',trControl=trainControl(method='cv',number=10))
+
+Outlook <- ""sunny""
+Temperature <- ""cool""
+Humidity <- ""high""
+Windy <- ""true""
+
+instance <- data.frame(Outlook,Temperature,Humidity,Windy)
+
+predict(model$finalModel,instance)
+
+ +

The example tries to predict the outcome for

+ +
Outlook=sunny, Temperature=cool,Humidity=high and Wind=strong
+
+ +

The problem is that I am getting a different prediction from the one in the book.

+ +

Here are the probabilities I've got from my code:

+ +
no          yes
+0.001078835 0.9989212
+
+ +

Here are the book's probabilities:

+ +
no     yes
+0.0206 0.0053
+
+ +

My code classifies the unseen data as Yes and the book's classifier classifies it as No.

+ +

Shouldn't both give the same answer since we are using the same naive Bayes classifier?

+ +

EDIT:

+ +

I replicated the example using scikit-learn MultinomialNB classifier and I have got the following probabilities

+ +
no    yes
+0.769  0.231
+
+ +

which are similar to the normalized probabilities of the book.

+ +

Normalized probabilities of the book

+ +
no     yes
+0.795  0.205
+
+",2013-11-06 00:14:56.537 +58935,20473.0,2,,58929.0,,,,CC BY-SA 3.0,"
+

What if the train does arrive exactly 5 minutes from now, how could it + occur if it had probability 0?

+
+ +

A probabilistic statement is not a statement about the possibility/feasibility of an event. It only reflects our attempt to quantify our uncertainty about it happening. So when a phenomenon is continuous (or is modeled as one), our tools and current state of knowledge do not permit us to make a probabilistic statement about it taking a specific value. We can only make such a statement related to a range of values. Of course the usual trick here is to discretize the support, to consider ""small"" intervals of values rather than single values. Since continuous random variables bring great benefits and flexibility compared to discrete random variables, this has been found to be a rather small price to pay, perhaps as small as the intervals we are forced to consider.

+",2013-11-06 00:16:00.993 +58936,23408.0,2,,2509.0,,,,CC BY-SA 3.0,"

Here's one for Grandma:

+ +

In our town there are streets going north and south, some going east and west, some going northwest and southeast, and some NE to SW. One day a guy measures all the traffic on all the streets and finds that the most traffic is going diagonally, from northwest to southeast; the second biggest is perpendicular to this, going northeast to southwest, and all the rest is fairly small. So he draws a big square and puts a big line left to right and says that is the NW to SE direction, then draws another line vertically up and down through the middle. He says that's the second most crowded direction for traffic (NE to SW). The rest is small so it can be ignored.

+ +

The left-right line is the first eigenvector and the up-down line is the second eigenvector. The total number of cars going left and right is the first eigenvalue and the number going up and down is the second eigenvalue.
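
+ +

For anyone who later wants to see the same story in R (a sketch of my own, not part of the analogy; the traffic numbers are made up):

+ +
set.seed(1)
+nw_se <- rnorm(500, sd = 3)      # heavy diagonal flow
+ne_sw <- rnorm(500, sd = 1)      # lighter perpendicular flow
+xy <- cbind(x = (nw_se + ne_sw) / sqrt(2), y = (nw_se - ne_sw) / sqrt(2))
+pca <- prcomp(xy)
+pca$rotation    # the two directions (eigenvectors): roughly the diagonals
+pca$sdev^2      # the amount of ""traffic"" along each direction (eigenvalues)
+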

+",2013-11-06 00:22:17.697 +58937,9554.0,2,,58929.0,,,,CC BY-SA 3.0,"

To give you some intuition for the above, try the following (thought) experiment:

+ +

Draw a real line around zero with a ruler. Now take a sharp dart and let it fall from above randomly on the line (let's assume you will always hit the line and that only the lateral positioning matters, for the sake of the argument).

+ +

However many times you let the dart fall randomly on the line, you will never hit the point zero. Why? Think what is the point zero, think what is its width. And after you recognise that its width is 0, do you still think you can hit it?

+ +

Will you be able to hit point 1, or -2? Or any other point you pick on the line for that matter?

+ +

To get back to maths, this is the difference between the physical world and a mathematical concept such as the real numbers (represented by the real line in my example). Probability theory has a quite a bit more complicated definition of probability than you will see in your lecture. To quantify the probability of events and any combination of their outcomes, you need a probability measure. Both the Borel measure and the Lebesgue measure are defined for an interval [a, b] on the real line as:
+$$\mu([a,b])=b-a$$
+From this definition you can see what happens with the probability if you reduce the interval to a single number (setting a = b).

+ +

The bottom line is that based on our current definition of probability theory (dating back to Kolmogorov) the fact that an event has 0 probability does not mean it cannot occur.

+ +

And as far as your example with the train goes, if you have an infinitely precise watch, your train will never arrive exactly on time.

+",2013-11-06 00:34:04.570 +58938,594.0,2,,58929.0,,,,CC BY-SA 3.0,"

You may be falling into the trap of regarding 'five minutes from now' as lasting some finite period of time (which would have a nonzero probability).

+ +

""Five minutes from now"" in the continuous variable sense is truly instantaneous.

+ +

Imagine that the arrival of the next train is uniformly distributed between 8:00 and 8:15. Further imagine we define the arrival of a train as occurring at the instant the front of the train passes a particular point on the station (perhaps the midpoint of the platform if there's no better landmark). Consider the following sequence of probabilities:

+ +

a) the probability a train arrives between 8:05 and 8:10

+ +

b) the probability a train arrives between 8:05 and 8:06

+ +

c) the probability a train arrives between 8:05:00 and 8:05:01

+ +

d) the probability a train arrives between 8:05:00 and 8:05:00.01 (i.e. in the space of one hundredth of a second)

+ +

e) the probability a train arrives between 8:05 and one billionth of a second later

+ +

f) the probability a train arrives between 8:05 and one quadrillionth of a second later

+ +

... and so on

+ +

The probability that it arrives precisely at 8:05 is the limiting value of a sequence of probabilities like that. The probability is smaller than every $\epsilon>0$.
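
+ +

Numerically, sticking with the uniform 8:00-8:15 example, the sequence a) through f) is just interval width divided by the 15-minute window:

+ +
widths_sec <- c(5 * 60, 60, 1, 0.01, 1e-9, 1e-15)   # the interval widths in a) through f), in seconds
+widths_sec / (15 * 60)                               # the corresponding probabilities, shrinking towards 0
+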

+",2013-11-06 00:48:06.460 +58939,16650.0,2,,58770.0,,,,CC BY-SA 3.0,"

For the word ""BOOT"" with no wildcards: +$$ +p_0=\frac{\binom{n_b}{1}\binom{n_o}{2}\binom{n_t}{1}\binom{n-4}{3}}{\binom{n}{7}} +$$ +With wildcards, it becomes more tedious. Let $p_k$ indicate the probability of being able to play ""BOOT"" with $k$ wildcards: +$$ +\begin{eqnarray*} +p_0&=&\frac{\binom{n_b}{1}\binom{n_o}{2}\binom{n_t}{1}\binom{n-4}{3}}{\binom{n}{7}} \\ +p_1&=&p_0 +\frac{\binom{n_*}{1}\binom{n_o}{2}\binom{n_t}{1}\binom{n-4}{3}}{\binom{n}{7}} + \frac{\binom{n_b}{1}\binom{n_o}{1}\binom{n_*}{1}\binom{n_t}{1}\binom{n-4}{3}}{\binom{n}{7}} + \frac{\binom{n_b}{1}\binom{n_o}{2}\binom{n_*}{1}\binom{n-4}{3}}{\binom{n}{7}}\\ +&=&p_0 +\frac{\binom{n_*}{1}\binom{n-4}{3}}{\binom{n}{7}}(\binom{n_o}{2}\binom{n_t}{1} + \binom{n_b}{1}\binom{n_o}{1}\binom{n_t}{1} + \binom{n_b}{1}\binom{n_o}{2})\\ +p_2&=&p_1 + \frac{\binom{n_*}{2}\binom{n-4}{3}}{\binom{n}{7}}(\binom{n_b}{1}\binom{n_o}{1} + \binom{n_b}{1}\binom{n_t}{1} + \binom{n_o}{2} + \binom{n_o}{1}\binom{n_t}{1})\\ +p_3&=&p_2 + \frac{\binom{n_*}{3}\binom{n-4}{3}}{\binom{n}{7}}(\binom{n_b}{1} + \binom{n_o}{1} + \binom{n_t}{1})\\ +p_4&=&p_3 + \frac{\binom{n_*}{4}\binom{n-4}{3}}{\binom{n}{7}}\\ +p_i&=&p_4, i\ge4 +\end{eqnarray*} +$$

+",2013-11-06 00:56:42.760 +58940,23411.0,1,58946.0,,,"Official name for ""symmetric percent difference"" function (x-y)/max(x,y)",,CC BY-SA 3.0,"

I frequently use this formula to compare two positive numbers $x$ and $y$ to see if they are ""more different"" than some threshold:

+ +

$$ +x-y \over \max(x,y) +$$

+ +

It is nice because it is symmetric and bounded to $[-1,1]$ (unlike relative percent difference). I call it a ""symmetric percent difference."" I see a similar formula on this Wikipedia page, apparently generalized to negative or positive numbers, but it's not named:

+ +

$$ +|x-y| \over \max(|x|,|y|) +$$

+ +

Does anyone know the official name for this function?

+ +

Note: Another similar function, bounded to $[0,1]$, is used to calculate sMAPE:

+ +

$$ +|x-y| \over x+y +$$

+",2013-11-06 00:59:09.070 +27194,1085.0,1,,,,What does that mean that two time series are colinear?,,CC BY-SA 3.0,"

I am familiar with the concept of cointegration.

+ +

But I sometimes hear people talking about colinearity (or collinearity) for time series.
+A set of points is collinear if they are on the same line. But what does that mean for time series?

+ +

Is it exactly the same as cointegration of order 1?
+Or is there something stronger/different in the concept of collinearity?

+",2012-06-22 02:20:37.490 +58941,23413.0,1,,,,Correlation among categories between categorical nominal variables,,CC BY-SA 3.0,"

I have a data set with two categorical nominal variables (both with 5 categories). I would like to know if (and how) I am able to identify potential correlations between the categories from these two variables.

+ +

In other words, whether for example the results of category $i$ in variable 1 show a strong correlation with a specific category $j$ in variable 2. Since I have two variables with 5 categories each, the total correlation analysis for all the categories would come down to 25 results (at least if it works the way I hope/expect it to work).

+ +

I have tried to formulate the problem into concrete questions:

+ +

Question 1: Let's say I transform the categorical variable into 5 different dummy variables, one per value (category). I run this same procedure for the second variable as well. Then I want to determine the correlation between dummy 1.i and 2.i (for example). Is it statistically correct for me to do this with an ordinary correlation coefficient? Does the resulting correlation coefficient provide proper insight into a correlation between the two dummy variables?

+ +

Question 2: If the procedure described in question one is a valid procedure, is there a way to execute this analysis for all categories of 2 (or perhaps more) categorical nominal variables all at once?

+ +

The program I am using is SPSS (20).

+",2013-11-06 02:00:54.243 +58942,23412.0,1,,,,Correlation assumption of Hochberg procedure in repeated measures design,,CC BY-SA 3.0,"

I'm running a 3-factor (Between by Between by Within) ANOVA with the correlation structure modeled using a Random Intercept and Gaussian Serial Correlation. I have the following variables:

+ +

Dependent variable: Bodyweight

+ +

Independent:

+ +

Size - size of the subject (between). 2 levels: normal and large
+Diet - diet given (between). 2 levels: normal or experimental diet.
+Time - weekly bodyweight measurements (within). 12 levels: 1:12

+ +

I have a significant 3-way interaction and all 2-way interactions(p<0.05), and so I am stratifying the 3-factors to test all pairwise differences between Normal and Large Size, on each Diet, and at each week. I'm doing these specific comparisons because I'm interested in whether Size further increases the weight gain known to occur on the experimental diet. Using this approach, I have a total of 24 post-hoc comparisons.

+ +

I would like to improve power by using Hochberg's procedure rather than Holm's to adjust for multiple comparisons, but I'm unclear whether my data meet the assumption of independent or positively associated test statistics/p-values (as per Simes' test and multcomp documentation in R).

+ +

I'm employing the mixed model because of the positively correlated repeated measures, so I'm tempted to conclude that the post-hoc/pairwise test statistics at each time would also be positively correlated. But I could very well be wrong.

+ +

Here are my questions:

+ +
    +
  1. Would the pairwise test statistics/p-values of these repeated measures data meet the independence/positive assumption of Hochberg's procedure?

  2. Is there a way to test or at least calculate/extract the correlations of the test statistics/p-values to empirically validate the assumption?

+",2013-11-06 02:07:17.747 +58943,16588.0,2,,58779.0,,,,CC BY-SA 3.0,"

Here's how I'd do it in R. Note, I have used your original data with self-transitions (non-transitions) included. You can do the same thing with the ""compressed time series,"" however.

+ +

Say I have already constructed an edge list from your example data in the form of the following matrix

+ +
el <- structure(c(1, 2, 3, 1, 2, 1, 2, 3, 1, 1, 1, 2, 2, 3, 3, 3, 4, 
+2, 3, 4, 1, 1, 2, 3), .Dim = c(8L, 3L), .Dimnames = list(NULL, 
+c(""i"", ""j"", ""w"")))
+el
+#     i j w
+#[1,] 1 1 4
+#[2,] 2 1 2
+#[3,] 3 1 3
+#[4,] 1 2 4
+#[5,] 2 2 1
+#[6,] 1 3 1
+#[7,] 2 3 2
+#[8,] 3 3 3
+
+ +

It captures the facts that state $1$ followed state $1$ (i.e., no transition) four times, state $1$ followed state $2$ twice, and so on. I now capture this same information in the form of a square $3 \times 3$ matrix.

+ +
X <- matrix(0,3,3)
+X[el[,1:2]]<-el[,3]
+X
+#     [,1] [,2] [,3]
+#[1,]    4    4    1
+#[2,]    2    1    2
+#[3,]    3    0    3
+
+ +

Now I want the elements to represent proportions with respect to the rest of the elements in their own row only. That is, rows should sum to $1$ meaning that in every state there will be some transition with probability $1$ and the individual elements of the row capture the distribution of that probability across all possible next states.

+ +
X <- X / rowSums(X)
+X
+#          [,1]      [,2]      [,3]
+#[1,] 0.4444444 0.4444444 0.1111111
+#[2,] 0.4000000 0.2000000 0.4000000
+#[3,] 0.5000000 0.0000000 0.5000000
+
+ +

These proportions can be taken as the transition probabilities. Here we see that the probability of transitioning from state $1$ to state $3$ is found at location $X_{13}$ and is equal to $0.11$. This is to say that of all the observations of the system being in state $1$ ($9$ times not counting the last one that offered no information about transition), in approximately $11\%$ of them (once) the process transitioned to state $3$.
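
+ +

Two quick follow-ups in R (my own addition): checking that the rows are proper probability distributions, and using the matrix to simulate a transition.

+ +
rowSums(X)                            # each row sums to 1, as it should
+set.seed(1)
+sample(1:3, size = 1, prob = X[1, ])  # simulate the next state when currently in state 1
+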

+",2013-11-06 02:11:50.383 +58944,15972.0,1,58950.0,,,Necessary and sufficient condition on joint MGF for independence,,CC BY-SA 3.0,"

Suppose I have a joint moment generating function $M_{X,Y}(s,t)$ for a joint distribution with CDF $F_{X,Y}(x,y)$. Is $M_{X,Y}(s,t)=M_{X,Y}(s,0)⋅M_{X,Y}(0,t)$ both a necessary and sufficient condition for independence of $X$ and $Y$? I checked a couple of textbooks, which only mentioned necessity:

+ +

$$F_{X,Y}(x,y)=F_X(x)\cdot F_Y(y) \implies M_{X,Y}(s,t)=M_X(s) \cdot M_Y(t)$$

+ +

That result is clear as independence implies $M_{X,Y}(s,t)=\mathbb{E}(e^{sX+tY})=\mathbb{E}(e^{sX}) \mathbb{E}(e^{tY})$. Since the MGFs of the marginals are determined by the joint MGF we have:

+ +

$$X,Y\text{ independent} \implies M_{X,Y}(s,t)=M_{X,Y}(s,0)⋅M_{X,Y}(0,t)$$

+ +

But after searching online I found only a fleeting reference, without proof, to the converse. Is the following sketch proof workable?

+ +

Given a joint MGF $M_{X,Y}(s,t)$, this uniquely determines the marginal distributions of $X$ and $Y$ and their MGFs, +$M_X(s)=M_{X,Y}(s,0)$ and $M_Y(t)=M_{X,Y}(0,t)$. The marginals alone are compatible with many other possible joint distributions, and uniquely determine a joint distribution in which $X$ and $Y$ are independent, with CDF $F_{X,Y}^{\text{ind}}(x,y)=F_X(x) \cdot F_Y(y)$ and MGF:

+ +

$$M_{X,Y}^{\text{ind}}(s,t) = M_X(s) \cdot M_Y(t) = M_{X,Y}(s,0)⋅M_{X,Y}(0,t)$$

+ +

So if we are given, for our original MGF, that $M_{X,Y}(s,t) = M_{X,Y}(s,0)⋅M_{X,Y}(0,t)$, this is sufficient to show $M_{X,Y}(s,t) = M_{X,Y}^{\text{ind}}(s,t)$. Then by the uniqueness of MGFs, our original joint distribution has $F_{X,Y}(x,y) = F_{X,Y}^{\text{ind}}(x,y) = F_X(x) \cdot F_Y(y)$ and $X$ and $Y$ are independent.

+",2013-11-06 02:13:09.800 +58963,23425.0,1,,,,For a research project I'm trying to estimate how many videos currently exist on Youtube,,CC BY-SA 3.0,"

Official stats on how many videos (rather than how many hours or what volume of data) seem quite hard to come by.

+ +

My current idea is something like this:

+ +

A youtube URL is like: http://www.youtube.com/watch?v=w1sjRD7NSec

+ +

where

+ +
 id = w1sjRD7NSec
+
+ +

Assuming that each character in the id can be an upper-case letter (26), a lower-case letter (26), or a digit (10),

+ +

the number of combinations should be

+ +
 = (26 + 26 +10 ) ^ 11
+
+ = 62^11
+
+ = 5.2 x 10 ^19
+
+ +

So if I write a script to try 100/1000/however many random video URLs, and Z% are successful, then will

+ +
 Z% x 5.2 x 10 ^19
+
+ +

give me the number of videos that exist (or at least are downloadable), albeit with a very low confidence?
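
+ +

In R, a sketch of the estimator (the counts here are made up, and the 62^11 figure assumes exactly that character set and ID length):

+ +
m <- 1000; z <- 0                       # tried m random IDs, z of them resolved to a video
+id_space <- 62^11                       # about 5.2e19 possible IDs under this assumption
+(z / m) * id_space                      # point estimate of the number of videos
+binom.test(z, m)$conf.int * id_space    # 95% interval; with z = 0 it is still astronomically wide
+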

+",2013-11-06 09:53:06.613 +58945,23414.0,2,,32038.0,,,,CC BY-SA 3.0,"

I would not use a random effects model with only 6 levels. Models using a 6-level random effect can sometimes be run using many statistical programs and sometimes give unbiased estimates, but:

+ +
    +
  1. I think there is an arbitrary consensus in the statistical community that 10-20 is the minimum number. If you want to have your research published, you'll be advised to look for a journal without statistical review (or be able to justify your decision using fairly sophisticated language).
  2. With so few clusters, the between cluster variance is likely to be poorly estimated. Poor estimation of between cluster variance usually translates into poor estimation of the standard error of the coefficients of interest. (Random effects models rely on the number of clusters theoretically going to infinity.)
  3. Often the models simply don't converge. Have you tried running your model? I would be surprised, with only 12-16 measures per subject, if the models converge. When I've managed to get this sort of model to converge, I've had hundreds of measurements per cluster.
+ +

This issue is addressed in most standard textbooks in the field, and you've sort of addressed it in your question. I don't think I'm giving you any new information.

+",2013-11-06 02:18:19.680 +58946,20473.0,2,,58940.0,,,,CC BY-SA 3.0,"

The last function you mention is the coefficient of variation (standard deviation over mean) of a sample of just two values:

+ +

$$c_v = \frac {\sigma}{\mu}$$
+and when we have only two values, $\sigma = |x-y|/2$ while $\mu = (x+y)/2$.

+ +

As for your function, although by not using an absolute value in the numerator you hint that direction may be important to you, I expect you usually subtract the smaller from the larger value.
+Then, since our sample is only these two numbers, essentially we have

+ +

$$\frac {\text {range}}{\max} = \frac {\max - \min}{\max} = 1- \frac {\min}{\max}$$

+ +

Now the $\frac{\max}{\min}$ ratio is encountered in various situations, check for example, ""dynamic range"" or ""contrast ratio"".

+ +

On a more mundane level, if $x$ is ""final price""$=p_f$ and $y$ is ""list price""$=p_l$, then

+ +

$$\frac {x-y} {\max(x,y)} = \frac {p_f -p_l}{p_l} $$

+ +

equals the ""percentage discount"" -with the negative sign to indicate the direction of revenues!

+",2013-11-06 02:23:14.020 +58947,9456.0,1,,,,Confusion related to correlation in topic models,,CC BY-SA 3.0,"

I was reading this paper related to correlated topic models. However, I didn't understand this correlation figure.

+

In the figure given below, I am confused about the level curves shown in the simplex. In the first case, it says that there is diagonal covariance with non zero mean. In the second case, it says that there is negative correlation between components 1 and 2. In the third case it says that there is positive correlation between 1 and 2.

+

I didn't understand how they interpreted it like this.

+

+

In the figure given below, the topics are uncorrelated as it says

+

+

But I didn't get why that is so. Can anyone give me some insights?

+",2013-11-06 02:34:46.223 +58948,23414.0,2,,23019.0,,,,CC BY-SA 3.0,"

I don't think the issues here can be addressed in a simple answer posted online. I would add:

+ +
  1. The inclusion of age and time is problematic and should be thought through. It is unclear to me what the benefit is of having both variables in the model. It can be done, but not by avoiding the issue by making one of the variables a random effect.

     1.5. If you want to include age, from what I understand, include age as age at the start of the experiment. This should not be collinear with other data and should be informative.

  2. I would be very reluctant to include age and time as random effects in this model. An assumption of the random effects model is that clusters are exchangeable.

     2.5. There is a tendency in the R code I've seen to include multiple random effects. I'm not sure why. Once you go beyond a single random effect, or a simple single random effect clustered in another, the model complexity is significant and often not warranted.

  3. I don't think the models as written make sense. The following make sense to me and are defensible:

         lmer(FiringRate ~ Time + (1|Subject))

         lmer(FiringRate ~ Time + (Time|Subject))

         lmer(FiringRate ~ Time + age_atstart + (Time|Subject))
+",2013-11-06 02:49:31.580 +58949,22488.0,1,58990.0,,,How do I check for bias of an estimator?,,CC BY-SA 3.0,"

I need to check if an estimator $\hat\theta$ for the parameter $\theta$ is biased. Theory says I should compare the expected value of $\hat\theta$ versus the expected value of $\theta$.

+ +

I assume the expected value of an estimator is the ""weighted average"" of the estimator over its distribution: $E[\hat\theta] = \int_0^\infty \hat\theta f(\hat\theta)\, d\hat\theta$*. If I'm right, to compute $E[\hat\theta]$ I need to know how $\hat\theta$ is distributed.

+ +

For example:

+ +
+

$X$ is a random variable with support $0 \le X \le \theta$ and pdf $f(x;\theta) = 3x^2 / \theta^3$. Check if $\delta(x)=(4\bar X)/3$ is biased.

+
+ +

*is it $E[\hat\theta] = \int_0^\infty \hat\theta f(\hat\theta)\, d\hat\theta$ or $E[\hat\theta] = \int_0^\infty \hat\theta f(x)\, dX$?

+",2013-11-06 03:08:06.123 +58950,10135.0,2,,58944.0,,,,CC BY-SA 3.0,"

Yes, that's the necessary and sufficient condition for independence, not only for two random variables but also for a (finite) sequence of random variables. Check out for example P.2 on page 242 of Probability with Statistical Applications, by Rinaldo B. Schinazi, or page 259 of Econometric Analysis of Count Data, which is based on probability generating functions. Just note that ""the moment-generating function does not always exist"".

+",2013-11-06 03:14:02.577 +58951,22564.0,1,,,,When does it make sense to reject/accept an hypothesis?,,CC BY-SA 4.0,"

In To P or not to P: on the evidential nature of P-values and their place in scientific inference, Michael Lew has shown that, at least for the t-test, the one-sided p-value and sample size can be interpreted as an "address" (my term) for a given likelihood function. I have repeated some of his figures below with slight modification. The left column shows the distribution of p-values expected from theory for different effect sizes (difference between means/pooled sd) and sample sizes. The horizontal lines mark the "slices" from which we get the likelihood functions shown by the right panels for p=0.50 and p=0.025.

+

+

These results are consistent with monte carlo simulations. For this figure I compared two groups with n=10 via t-test at a number of different effect sizes and binned 10,000 p-values into .01 intervals for each effect size. Specifically there was one group with mean=0, sd=1 and a second with a mean that I varied from -4 to 4, also with sd=1.

+

+

(The above figures can be directly compared to figures 7/8 from the paper linked above and are very similar; I found the heatmaps more informative than the "clouds" used in that paper and also wished to independently replicate his result.)

+

If we examine the likelihood functions "indexed" by the p-values, the behaviour of rejecting/accepting hypotheses or ignoring results giving p-values greater than 0.05 based on a cut-off value (either the arbitrary 0.05 used everywhere or determined by cost-benefit) appears to be absurd. Why should I not conclude from the n=100, p=0.5 case that "the current evidence shows that any effect, if present, is small"? Current practice would be to either "accept" there is no effect (hypothesis testing) or say "more data needed" (significance testing). I fail to see why I should do either of those things.

+

Perhaps when a theory predicts a precise point value, rejecting a hypothesis could make sense. But when the hypotheses are simply "mean1=mean2" versus "mean1!=mean2" I see no value. Under the conditions in which these tests are often used, randomization does not guarantee that all confounds are balanced across groups and there should always be the worry of lurking variables, so rejecting the hypothesis that mean1 exactly equals mean2 has no scientific value as far as I can tell.

+

Are there cases beyond the t-test where this argument would not apply? Am I missing something of value that rejecting a hypothesis with low a priori probability provides to researchers? Ignoring results above an arbitrary cutoff seems to have led to widespread publication bias. What useful role does ignoring results play for scientists?

+

Michael Lew's R code to calculate the p-value distributions

+ +
LikeFromStudentsTP <- function(n, x, Pobs, test.type, 
+      alt='one.sided'){
+# test.type can be 'one.sample', 'two.sample' or 'paired'
+# n is the sample size (per group for test.type = 'two.sample')
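+# x is the effect size (delta on sigma) at which the likelihood is evaluated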
+# Pobs is the observed P-value
+# h is a small number used in the trivial differentiation
+h <- 10^-7
+PowerDn <- power.t.test('n'=n, 'delta'=x, 'sd'=1,
+'sig.level' = Pobs-h, 'type'= test.type, 'alternative'=alt)
+PowerUp <- power.t.test('n'=n, 'delta'=x, 'sd'=1,
+'sig.level' = Pobs+h, 'type'= test.type, 'alternative'=alt)
+PowerSlope <- (PowerUp$power-PowerDn$power)/(h*2)
+L <- PowerSlope
+}
+
+

R code for figure 1

+
deltaOnSigma <- 0.01*c(-400:400)
+type <- 'two.sample'
+alt='one.sided'
+p.vals <- seq(0.001,.999,by=.001)
+
+#dev.new()
+par(mfrow=c(4, 2))
+for(n in c(3, 5, 10, 100)){
+  
+  m<-matrix(nrow=length(deltaOnSigma), ncol=length(p.vals))
+  cnt <- 1
+  for(P in p.vals){
+    m[, cnt] <- LikeFromStudentsTP(n, deltaOnSigma, P, type, alt)
+    cnt <- cnt+1
+  }
+  
+  #remove very small values
+  m[which(m/max(m, na.rm=TRUE) < 10^-5)] <- NA
+        
+  m2 <- log(m)
+  
+  par(mar=c(4.1, 5.1, 2.1, 2.1))
+  image.plot(m2, axes=FALSE,
+             breaks=seq(min(m2, na.rm=TRUE), max(m2,  na.rm=T), 
+             length=1000), col=rainbow(999),
+             xlab="Effect Size", ylab="P-value"
+  )
+  title(main=paste("n=",n))
+  axis(side=1, at=seq(0,1,by=.25), labels=seq(-4,4,by=2))
+  axis(side=2, at=seq(0,1,by=.05), labels=seq(0,1,by=.05))
+  axis(side=4, at =.5, labels="Log-Likelihood", pos=.95, tick=F)
+  abline(v=0.5, lwd=1)
+  abline(h=.5, lwd=3, lty=1)
+  abline(h=.025, lwd=3, lty=2)
+  par(mar=c(5.1,4.1,4.1,2.1))
+        
+  plot(deltaOnSigma, m[, which(p.vals==.025)], type="l", lwd=3, 
+         lty=2, xlab="Effect Size", ylab="Likelihood", 
+         xlim=c(-4, 4),
+         main=paste("Likelihood functions for","n=",n)
+  )
+  lines(deltaOnSigma,m[,which(p.vals==.5)], lwd=3, lty=1)
+  legend("topleft", legend=c("p=.5","p=.025"), lty=c(1,2),lwd=1, 
+           bty="n")
+}
+
+

R code for figure 2

+
p.vals <- seq(0, 1, by=.01)
+deltaOnSigma <- 0.01*c(-400:400)
+n <- 10
+n2 <- 10
+sd2 <- 1
+num.sims <- 10000
+sp <- sqrt((9*1^2 +(n2-1)*sd2^2)/(n+n2-2))
+
+p.out=matrix(nrow=num.sims*length(deltaOnSigma) ,ncol=2)
+m <- matrix(0, nrow=length(deltaOnSigma), ncol=length(p.vals))
+pb <- txtProgressBar(min = 0, max = length(deltaOnSigma) , 
+                     style = 3)
+cnt <- 1
+cnt2 <- 1
+for(i in deltaOnSigma ){
+  
+  for(j in 1:num.sims){
+    
+    m2 <- i
+    a <- rnorm(n, 0, 1)
+    b <- rnorm(n, m2, sd2)
+    p <- t.test(a, b, alternative="less")$p.value
+    
+    r <- end(which(deltaOnSigma<=m2/sp))[1]
+    
+    m[r, end(which(p.vals<p))[1]] <- 
+         m[r, end(which(p.vals<p))[1]] + 1
+    p.out[cnt,] <- cbind(m2/sp, p)
+    cnt <- cnt+1
+    
+  }
+  cnt2 <- cnt2+1
+  setTxtProgressBar(pb, cnt2)
+}
+close(pb)
+
+
+m[which(m==0)] <- NA
+
+m2 <- log(m)
+
+dev.new()
+par(mfrow=c(2,1))
+par(mar=c(4.1,5.1,2.1,2.1))
+image.plot(m2, axes=FALSE,
+           breaks=seq(min(m2, na.rm=TRUE), max(m2, na.rm=TRUE), 
+           length=1000), col=rainbow(999),
+           xlab="Effect Size", ylab="P-value"
+)
+title(main=paste("n=",n))
+axis(side=1, at=seq(0,1,by=.25), labels=seq(-4,4,by=2))
+axis(side=2, at=seq(0,1,by=.05), labels=seq(0,1,by=.05))
+axis(side=4, at =.5, labels="Log-Count", pos=.95, tick=F)
+abline(h=.5, lwd=3, lty=1)
+abline(h=.025, lwd=3, lty=2)
+abline(v=.5, lwd=2, lty=1)
+par(mar=c(5.1,4.1,4.1,2.1))
+
+
+hist(p.out[which(p.out[,2]>.024 & p.out[,2]<.026),1],
+     xlim=c(-4,4), xlab="Effect Size", col=rgb(1,0,0,.5), 
+     main=paste("Effect Sizes for","n=",n)
+)
+hist(p.out[which(p.out[,2]>(.499) & p.out[,2]<.501),1], add=T,
+     xlim=c(-4,4),col=rgb(0,0,1,.5)
+)
+
+legend("topleft", legend=c("0.499<p<0.501","0.024<p<0.026"), 
+       col=c("Blue","Red"), lwd=3, bty="n")
+
+",2013-11-06 03:23:21.103 +58952,13165.0,2,,58789.0,,,,CC BY-SA 3.0,"

First of all, observe that all of the factors together include all of the variables and their joint distributions, but there are too many factors: each variable inside one separator $S$ is repeated $d(S)$ times inside the neighbouring factors. That's why the distribution is normalized by dividing by the separators to the power of $d(S)$.

+",2013-11-06 03:40:59.913 +58953,1145.0,2,,58951.0,,,,CC BY-SA 3.0,"

I really like your rainbow versions of my clouds, and may 'borrow' them for a future version of my paper. Thank you!

+ +

Your questions are not entirely clear to me, so I will paraphrase them. If they are not what you had in mind then my answers will be misdirected!

+ +
  • Are there situations where rejection of the hypothesis like ""mean1 equals mean2"" is scientifically valuable?
+ +

Frequentists would contend that the advantage of having well-defined error rates outweighs the loss of assessment of evidence that comes with their methods, but I don't think that that is very often the case. (And I would suspect that few proponents of the methods really understand the complete loss of evidential consideration of the data that they entail.) Fisher was adamant that the Neyman-Pearson approach to testing had no place in a scientific program, but he did allow that they were appropriate in the situation of 'industrial acceptance testing'. Presumably such a setting is a situation where rejection of a point hypothesis can be useful.

+ +

Most of science is more accurately modelled as estimation than as an acceptance procedure. P-values and the likelihood functions that they index (or, to use your term, address) provide very useful information for estimation, and for inferences based on that estimation.

+ +

(A couple of old StackExchange questions and answers are relevant: What is the difference between "testing of hypothesis" and "test of significance"? and Interpretation of p-value in hypothesis testing)

+ +
  • Are you missing the point of rejection of a hypothesis (of low a priori probability)?
+ +

I don't know if you are missing much, but it is probably not a good idea to add prior probabilities into this mixture! Much of the argumentation around the ideas relating to hypothesis testing, significance testing and evidential evaluation comes from entrenched positions. Such arguments are not very helpful. (You might have noticed how carefully I avoided bringing Bayesianism into my discussion in the paper, even though I wholeheartedly embrace it when there are reasonable prior probabilities to use. First we need to settle the ""P-values provide evidence, error rates do not"" issue.)

+ +
  • Should scientists ignore results that fail to reach 'significance'?
+ +

No, of course not. Using an arbitrary cutoff to claim significance, or to assume significance, publishability, repeatability or reality of a result is a bad idea in most situations. The results of scientific experiments should be interpreted in light of prior understanding, prior probabilities where available, theory, the weight of contrary and complementary evidence, replications, loss functions where appropriate and a myriad of other intangibles. Scientists should not hand over to insentient algorithms the responsibility for inference. However, to make full use of the evidence within their experimental results scientists will need to much better understand what the statistical analyses can and do provide. That is the purpose of the paper that you have explored. It will also be necessary that scientists make a more complete account of their acquisition of evidence and the evolution of their understanding than what is usually presented in papers, and they should provide what Abelson called a principled argument to support their inferences. Relying on P<0.05 is the opposite of a principled argument.

+",2013-11-06 04:56:31.703 +58956,,2,,58916.0,user31668,,,CC BY-SA 3.0,"

First, a minor terminology correction: You can't really have independent and mutually exclusive events, as mutually exclusive implies that if one event happens, the others cannot happen, which makes them not independent. I think what you mean is that they are sequential, in that they happen one after another.

+ +

Since you have the pdf of each, what you want is the distribution of the sum of the three random times: $T = T_A+T_B+T_C$. Depending on the exact form of your pdfs, you can try one of three approaches:

+ +
  1. Recognize the sum as being equal to a particular distribution (e.g., a sum of normals is normal)
  2. Simulate the sum using Monte Carlo simulation
  3. Analytically calculate the distribution of the sum by either multiplying the characteristic functions of each pdf and then back-transforming to a density, or by directly performing two convolutions: $f_C*(f_A*f_B)$
+ +

I would recommend method 2 if you have access to a Monte Carlo simulator or know R or Matlab or any other numerical package. Method 1 is also OK. Method 3 is a real pain unless you are lucky enough to be using simple distributions, in which case you will likely find a solution as per method 1.
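
As a minimal sketch of method 2 in R (the exponential pdfs below are placeholders; substitute whatever distributions $T_A$, $T_B$ and $T_C$ actually follow):

    set.seed(1)
    n  <- 1e5
    tA <- rexp(n, rate = 1/5)    # hypothetical: mean 5
    tB <- rexp(n, rate = 1/10)   # hypothetical: mean 10
    tC <- rexp(n, rate = 1/2)    # hypothetical: mean 2
    total <- tA + tB + tC        # simulated draws of T = T_A + T_B + T_C

    hist(total, breaks = 100)            # approximate distribution of the total time
    quantile(total, c(0.05, 0.5, 0.95))  # e.g. the median and a central 90% range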

+",2013-11-06 05:22:28.300 +58957,23418.0,1,,,,Joint probability of two correlated RVs,,CC BY-SA 3.0,"

I am trying to get the joint PDF of two RVs $X$ and $Y$ where $aX<Y<bX$, so I am stuck in calculating the probability of

+ +

$\mathbb{P}(X<x,Y<y|aX<Y<bX)$

+ +

any idea?

+",2013-11-06 06:33:28.320 +58958,17459.0,1,,,,An unfair coin with different face value problem,,CC BY-SA 3.0,"

Could anyone give me some hints for the following problem? A coin has two faces, 1 and -1/2, with probabilities $P(x=1)=1/3$ and $P(x=-1/2)=2/3$. The expected value is therefore $E(x)=1\cdot(1/3)+(-1/2)\cdot(2/3) = 0$.
Supposing we toss it indefinitely, what is the chance that the average value would be bigger than $k$? (Suppose the sample size is sufficiently large.)

+ +

If the face values were 1 and 0, this would simply be a binomial distribution and could be approximated by a normal distribution, so I also want to know whether this problem can be transformed into a binomial one, i.e. $P(x>k)$?

+ +

This is not a homework problem, but one I just thought of and am trying to solve.

+",2013-11-06 06:58:05.353 +58959,594.0,2,,58957.0,,,,CC BY-SA 3.0,"

Your question is quite general (specific cases may offer shortcuts), so I'll limit myself to suggesting strategies. Typically such a question involves constructing appropriate limits on integrals and trying to evaluate them by some means. Usually there will be a bivariate integral where the limits in the inner integral involve the variable in the outer integral. Sometimes the hardest part is simply writing the correct limits down; the general case will involve ""min"" and ""max"" functions on a problem like this.

+ +

To make progress I strongly suggest you get into the habit of making diagrams of the region you're trying to integrate.

+ +

A couple of suggested strategies for approaching such a problem, by making slightly simpler problems you might see how to write integrals for.

+ +

One approach: Let $Z = Y-aX$ and work out the joint probability in terms of $X$ and $Z$.

+ +

Another approach: First, replace your $x$ and $y$ with $x_0$ and $y_0$ so I can use $x$ and $y$ as dummy variables in the integration. If $a\leq y_0/x_0\leq b$ then you have a region like this:

+ +

+ +

While you can actually write the integral by splitting it up into pieces (move a vertical line along the $x$-axis and split the integral where your line hits any 'corners' on the dark green region), you might otherwise evaluate it by working out the probability of being in the rectangle $0<X<x_0; 0<Y<y_0$ and then subtract the two triangles.

+ +

The other cases (the other arrangements of $x_0$ and $y_0$ relative to $a$ and $b$) might be worked out by drawing the relevant diagram in order to obtain the right limits on the integrals; in each case you'll do something similar, but sometimes you might not hit a corner.

+ +

With more details, more specific responses might be possible.

+",2013-11-06 07:12:01.233 +58960,20470.0,2,,58941.0,,,,CC BY-SA 3.0,"

Directly taken from a document on bivariate statistics with SPSS that lives here:

+ +
+

Chi-square is a useful technique because you can use it to see if + there’s a relationship between two ordinal variables, two nominal + variables, or between an ordinal and a nominal variable. You look at + the assymp. Sig column and if it is less than .05, the relationship + between the two variables is statistically significant.

+
+",2013-11-06 08:58:02.350 +58961,23422.0,2,,53391.0,,,,CC BY-SA 3.0,"

The formula that you refer to can be used when the distribution of classes is the same in the training and test set (which is commonly assumed with machine learning).

+ +

Take 7 classes: A, B, C, D, E, F, G. There will be #A instances with label A in your data set. And of course, #A + #B + #C + #D + #E + #F + #G = X

+ +

The chance of encountering an instance with label A, i.e. the probability of class A, pA, equals #A/X.

+ +

Now, if you consider a random baseline system, this system will assign labels to instances according to these probabilities. Because labels are assigned according to probabilities, each time you let the system label the instances a different result will be produced. A majority system or your SVM-based system will produce the same result, no matter how often they are applied. With an infinite number of runs of the random baseline system, on average, the following will happen:

+ +

Given an instance with gold label A, this instance will be labelled as A a fraction pA of the time, as B a fraction pB of the time, etc. This means that we have a (fractional) true positive count equal to the probability pA. Since there are #A instances with gold label A, the total true positive count for label A becomes #A*pA. This can be done for each label. The total number of true positives, TP, becomes:

+ +

TP = #A*pA + #B*pB + #C*pC + #D*pD + #E*pE + #F*pF + #G*pG

+ +

And the average accuracy of this baseline system becomes acc = TP/X

+ +

acc = 1/X * (#A*pA + #B*pB + #C*pC + #D*pD + #E*pE + #F*pF + #G*pG)

+ +

If the division by X is distributed over the different terms, and using the definition of the probabilities, this becomes:

+ +

acc = pA*pA + pB*pB + pC*pC + pD*pD + pE*pE + pF*pF + pG*pG

+ +

which is the formula that you refer to.

+ +
+ +

As noted before, for an SVM-based system or a majority system the average accuracy is equal to the accuracy of a single run, meaning that the accuracy of a single run can be compared with the outcome of the random baseline formula.

+ +

If your machine learner produces slightly different results with each run (because it contains an element of randomness), you should compute the average accuracy over an infinite number of runs. But this is the ideal situation, and it may be impossible to compute. In practice, differences will probably be very small and most people stick to comparing using the outcome of a single run.
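
Here is a small R sketch of the baseline calculation above, using made-up class counts:

    counts <- c(A = 50, B = 30, C = 10, D = 5, E = 3, F = 1, G = 1)
    X <- sum(counts)
    p <- counts / X    # class probabilities pA, pB, ...

    sum(p^2)           # expected accuracy of the random baseline
    max(p)             # accuracy of a majority-class baseline, for comparison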

+",2013-11-06 09:14:07.293 +58962,22507.0,2,,58958.0,,,,CC BY-SA 3.0,"

Because of the central limit theorem, for a large number of tosses $n$ the total value is approximately normally distributed as $N(0, \sigma)$, where $\sigma^2 = n \left( {1\over 3} 1^2 + {2 \over 3} (1/2)^2 \right) = {n \over 2}$.
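
Equivalently, the average of $n$ tosses is approximately $N(0, 1/(2n))$. A quick simulation sketch (with made-up $n$ and $k$) comparing the empirical tail probability with this approximation:

    set.seed(42)
    n    <- 10000   # tosses per experiment
    reps <- 2000    # number of repeated experiments
    k    <- 0.01    # threshold for the average

    avg <- replicate(reps, mean(sample(c(1, -1/2), n, replace = TRUE, prob = c(1/3, 2/3))))

    mean(avg > k)                                               # simulated P(average > k)
    pnorm(k, mean = 0, sd = sqrt(1/(2*n)), lower.tail = FALSE)  # CLT approximation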

+",2013-11-06 09:23:20.700 +58988,19752.0,2,,58984.0,,,,CC BY-SA 3.0,"

One possibility is to use a Jeffreys Interval, which has a Bayesian derivation. One nice feature is that you're guaranteed to get values between 0 and 1. When using a Wald interval on similar problems I got negative lower bounds, which made no sense.

+ +

The confidence interval is found by computing the $\alpha/2$ and $1-\alpha/2$ quantiles of a $\text{Beta}(x + 1/2, n - x + 1/2)$ distribution, where $x$ is the number of successes and $n$ is the number of trials. Computing these quantiles must be done numerically, but it's no problem in something like R.
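
For example, a minimal sketch in R (hypothetical counts; a common convention also sets the lower limit to 0 when x = 0 and the upper limit to 1 when x = n):

    jeffreys_ci <- function(x, n, conf = 0.95) {
      alpha <- 1 - conf
      c(lower = qbeta(alpha / 2,     x + 0.5, n - x + 0.5),
        upper = qbeta(1 - alpha / 2, x + 0.5, n - x + 0.5))
    }

    jeffreys_ci(10, 100000)   # e.g. 10 successes in 100,000 trials
    jeffreys_ci(0, 100)       # 0 successes in 100 trials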

+",2013-11-06 17:17:27.510 +58964,23404.0,1,58981.0,,,Interpretation of the regression coefficient of a proportion II,,CC BY-SA 3.0,"

Following question: +Interpretation of the regression coefficient of a proportion type independent variable

+ +

In my model I have a log dependent variable. As independent variables I have one proportion $X_1$ that lies within [0.001, 0.30] and a second proportion $X_2$ that lies within [0.12, 0.99].

+ +

$E(\log(y))$ = $\alpha + \beta_1X_1$ + $\beta_2X_2$

+ +

How do I interpret the coefficients? Let's say the estimated $\beta_1$ is 1.5 and the estimated $\beta_2$ is 0.8.

+ +

Or do I have to adjust the range before running the regression?

+",2013-11-06 10:54:36.910 +58965,23427.0,1,,,,Correct use of AIC,,CC BY-SA 3.0,"

It is well known that the AIC can be used to compare nested models.

+ +

Additionally, I believe I am correct in saying that you can also use the AIC to compare non-nested models on the same dataset (please correct me if I am in fact wrong). However, it is not correct to use the AIC to compare between different data sets.

+ +

In my scenario, I have 5 measurements on individuals over time plus an outcome variable. If I was to regress the outcome variable linearly on all the measurements over time, I would be able to obtain an AIC for this model which uses the entire dataset.

+ +

Now I want to consider only 2 of the measurements of all individuals plus the outcome variable. Technically, I am now using a subset of the original dataset as I have lost information on the other three measurements. However, isn't this the same as fitting a nested model since I kept all individuals but lost 3 explanatory variables? So is it justified to compare the AIC I get from this model with that obtained from the full model?

+",2013-11-06 11:01:34.377 +58966,23428.0,1,,,,"Given $n$ ratings from 0 to 1, how to calculate a weighted ""average"" estimating the ""true"" rating?",,CC BY-SA 3.0,"

I've read How Not to Sort By Average Rating regarding how to average binary positive/negative ratings in a way that takes the number of ratings into account. The author uses the ""lower bound of Wilson score confidence interval for a Bernoulli parameter"". However, the items I'm dealing with have continuous ratings from 0 to 1. Is there an analogous averaging technique for this case?

+ +

My ratings collection is long-tailed: the median item has only 2 ratings, but the average one has 80, and the most-rated item has 36,000 ratings. Intuitively ten ratings of 0.8 should ""average"" higher than one of 0.9, but I'd like a precise formulation of this intuition.

+ +

(I'm using this to design a recommender system, which has to deal with 50,000 users and 10,000 items. I'm evaluating various known recommenders, like GroupLens and LSI, and have to design one that doesn't perform too much worse than those (and hopefully better). I was reminded of this blog post on averages when using users' average ratings for a baseline RMSE calculation.)

+",2013-11-06 11:05:00.940 +58967,449.0,2,,58965.0,,,,CC BY-SA 3.0,"

To you, all of the predictors in the model are data, so from that perspective the data are changing. But the data that need to be consistent across models for AIC are the outcome or response variable. As long as that is the same data in each comparison, you can use AIC. Search for AIC model selection on this site; this has been discussed several times.

+",2013-11-06 11:34:48.943 +58968,22653.0,1,,,,What is the 95% confidence interval / ellipse in a PCA plot telling me?,,CC BY-SA 3.0,"

I am new to statistics and within the last two days I tried to get my head around PCA plots. Now, I kind of understand what they are showing but I am still not sure about the 95% confidence ellipse that is very often shown in such plots. The 2 dimensional PCA plot displays the two biggest variances (whatever these are) in the data but I don't know what the ellipse is trying to tell me and what it means if a sample/dot (whatever is displayed) is lying outside that ellipse.

+ +

Help is very much appreciated.

+",2013-11-06 11:57:06.773 +58969,503.0,2,,58965.0,,,,CC BY-SA 3.0,"

@John is correct but I think not quite explicit enough, perhaps leading to confusion (maybe not now, but if someone else reads this thread later).

+ +

AIC needs to be on the same data set: that is, it has to have the same subjects. And it needs to have the same dependent variable (John's point). To be nested, one set of independent variables must be a subset of the other, but AIC can work on non-nested models (at least according to some people; see this thread).

+",2013-11-06 12:06:35.297 +58970,503.0,2,,58968.0,,,,CC BY-SA 3.0,"

The ellipse around a scatter plot of ""component 1"" vs. ""component 2"" has a similar meaning to the ellipse around any other scatter plot. Unfortunately, there are two common uses of such ellipses: prediction ellipses and confidence ellipses. SAS documentation explains the difference (as do other sites).

+",2013-11-06 12:15:49.530 +58971,5480.0,1,,,,Ranking topics in K-Means,,CC BY-SA 3.0,"

I understand that clustering is meant to group items together. Are there any ways we can quantify a claim that Cluster A is more important than Cluster B, other than counting the number of items in each cluster?

+",2013-11-06 12:38:16.353 +58972,22972.0,2,,58963.0,,,,CC BY-SA 3.0,"

I think the answer to your question is yes. What you have is two columns: one is a long list of combinations and one holds a 1 or a 0, with a 1 if there is a video at that combination and a 0 if there is not. Were you to sum this second column, it would give you the number of videos. For the whole population this works.

+ +

The only problem I see is in drawing a random sample, as I have no idea how YouTube generates its URLs. You could of course send them an email and ask. I would just give it a shot with the script: try different ways to generate random URLs, take a large sample (n > 1,000) and see whether the estimates are close enough.

+",2013-11-06 12:40:09.360 +58973,23171.0,1,58975.0,,,Show that $R_{n}^{2}-n$ and $(-1)^{n} \cos(\pi R_{n}) $ are $\mathcal F_{n}$-martingales,,CC BY-SA 3.0,"

Let $X_{i}, i\ge 1$, be i.i.d. random variables defined on a probability space $(\Omega, \mathcal F,P)$ such that $P(X_{i}=1)=P(X_{i}=-1)=\frac{1}{2}$. Consider the filtration $\mathcal F_{n}=\sigma(X_{1},\dots,X_{n})$ on this space and the random walk $R_{n}=\sum_{i=1}^{n} X_{i}$.

+ +

Show that $R_{n}^{2}-n$ and $(-1)^{n} \cos(\pi R_{n}) $ are $\mathcal F_{n}$-martingales.

+",2013-11-06 13:02:15.287 +58974,22972.0,2,,58809.0,,,,CC BY-SA 3.0,"

First, the p-value will not tell you whether a home court advantage exists or not. It will only tell you the probability of the data given a hypothesis. Second, the data you have will not tell you anything about a home court advantage, because you have nothing to compare them to. What is the proportion of games won away, or what is the proportion of games won overall? The only thing you can try to find out is whether the proportion of games won is statistically significantly different from .5.

+ +

Now if you have data on games played away, then you can set up your null hypothesis as:

+ +

$H_{0}: \; p_{home} = p_{away}$

+ +

where $p_{home}=\frac{won_{home}}{total_{home}}$ and $p_{away}=\frac{won_{away}}{total_{away}}$ (if you want, you can count a tie as half a win or exclude the ties altogether). Then what you have is a categorical variable $home$, which can take a value of $1$ or $0$ ($1$ for home and $0$ for away), and two dependent proportions. For this problem you can then use the McNemar test or the paired t-test if your sample is large enough (see here: http://www.ats.ucla.edu/stat/stata/whatstat).

+ +

Hope this helps.

+",2013-11-06 13:02:36.247 +58975,1406.0,2,,58973.0,,,,CC BY-SA 3.0,"

Simply write $R_n=R_{n-1}+X_n$ and take the conditional expectations with respect to $\mathcal{F}_{n-1}$. Then exploit the fact that

+ +

$$E[g(X_n)f(R_{n-1})\mid\mathcal{F}_{n-1}]=f(R_{n-1})\,E[g(X_n)],$$ for any measurable functions $f$ and $g$ (assuming that the expectations exist). Also recall the formula

+ +

$$\cos(\alpha+\beta)=\cos\alpha\cos\beta-\sin\alpha\sin\beta.$$
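
For concreteness, here is the worked step for the first process, a sketch using only the facts above (with $E[X_n]=0$ and $E[X_n^2]=1$):

$$
E\left[R_n^2 - n \mid \mathcal{F}_{n-1}\right]
 = E\left[(R_{n-1}+X_n)^2 \mid \mathcal{F}_{n-1}\right] - n
 = R_{n-1}^2 + 2R_{n-1}E[X_n] + E[X_n^2] - n
 = R_{n-1}^2 - (n-1).
$$

The second process is handled the same way after expanding $\cos(\pi R_{n-1} + \pi X_n)$ with the addition formula.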

+",2013-11-06 13:19:46.403 +58976,23431.0,1,,,,Data split into training and test,,CC BY-SA 3.0,"

I am implementing an EEG classifier with 15 subjects (patients), specifically a support vector machine classifier.

+ +

I randomly chose the training and testing sets, but I was faced with the question ""how did you choose the subjects in each set?"". I looked for an answer but couldn't find a good one (cross-validation wouldn't be the best solution in my case).

+ +

Could you please help me with this problem?

+",2013-11-06 13:44:37.607 +58977,16110.0,1,,,,Using nls() function in R for exponential function,,CC BY-SA 3.0,"

I know that this issue was already discussed here, but I am facing a problem I can't solve. I have a list of persons, each represented by a time series consisting of 4-8 points. I want to approximate them all with the function $y=a\cdot x^2\cdot exp(-bx)+c$. Thus for each person I am going to find his own ""a"", ""b"" and ""c"". For most of them the following code works very well:

+ +

res=nls(p2[,2] ~ c+a*I(p2[,1]^2)*exp(b*p2[,1]),start=list(a=0.005,b=-0.005,c=5))

+ +

However, for some persons these starting values don't work; R returns ""Missing value or an infinity produced when evaluating the model"" or ""singular gradient matrix at initial parameter estimates"". For some of these people these starting values worked:

+ +

res=nls(p2[,2] ~ c+a*I(p2[,1]^2)*exp(b*p2[,1]),start=list(a=0.1,b=-0.02,c=5))

+ +

Could anybody give a clear suggestion on how to choose starting points for all the people I consider? I tried to use tryCatch to try different starting values and find those which work, but another problem appeared: this code nls(p2[,2] ~ c+a*I(p2[,1]^2)*exp(b*p2[,1]),start=list(a=5,b=0,c=5)) led to:

+ +
        a         b         c 
+ -0.00166  -0.00269 140.87366 
+
+ +

while nls(p2[,2] ~ c+a*I(p2[,1]^2)*exp(b*p2[,1]),start=list(a=0.1,b=-0.02,c=5)) led to

+ +
      a       b       c 
+ 0.2024 -0.0251 47.7811 
+
+ +

So by choosing different starting values we get different answers. How can this happen? I thought that since the NLS objective function is quadratic, it can't have more than one extremum... Do you have any suggestions about how I should proceed in this situation?

+",2013-11-06 14:19:30.577 +58978,4320.0,2,,58934.0,,,,CC BY-SA 3.0,"

The problem is small enough you can work it out by hand. For your example you have
$$
\begin{align*}
P(outlook = sunny| play=yes) &= \frac{2}{9}\\
P(temp = cool| play=yes) &= \frac{3}{9}\\
P(humidity=high| play=yes) &= \frac{3}{9}\\
P(windy=true| play=yes) &= \frac{3}{9}\\
P(play=yes) &= \frac{9}{14}.\\
\end{align*}
$$
Putting it all together you have
$$
\begin{align*}
P(play=yes|sunny, cool, high, true) &\varpropto \frac{2}{9} \left(\frac{3}{9}\right)^3 \frac{9}{14}\\
&\approx 0.0053,
\end{align*}
$$
which agrees with Mitchell. I don't use R, so I can't speak as to why the output is different. Obviously the package you're using is normalizing, but this shouldn't change the classification. If I had to guess I'd say it is the cross-validation.
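
A quick arithmetic check of this (the play=no terms below use the usual counts from the same weather table and are an assumption here, not taken from the calculation above):

    p_yes <- (2/9) * (3/9)^3 * (9/14)                  # ~ 0.0053, as above
    p_no  <- (3/5) * (1/5) * (4/5) * (3/5) * (5/14)    # ~ 0.0206, assumed play=no terms
    c(unnormalized = p_yes,
      normalized   = p_yes / (p_yes + p_no))           # what a normalizing package would report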

+",2013-11-06 14:20:07.447 +58979,1575.0,1,,,,Flaw in a conditional probability argument,,CC BY-SA 3.0,"

Imagine an experiment where you roll two fair, six-sided dice. Someone peeks at the dice, and (truthfully) tells you that ""at least one of the dice is a 4"". What is the probability that the total of the dice is 7?

+ +

It seems straightforward to calculate that the probability the total is 7 is 2/11.

+ +

However, the person who peeked at the dice could equally well have said ""at least one of the dice is a 1"" and you would come to the same conclusion - 2/11. Or they could have said ""at least one of the dice is a 2"" or ""at least one of the dice is a 3"", or indeed any number from 1 to 6, and you would still conclude that the probability that the total is 7 is 2/11.

+ +

Since you will always conclude that the probability that the total is 7 is 2/11, you could block your ears as they speak, and you'd still come up with 2/11. From there it's a short hop to conclude that even if they don't say anything the probability that the total is 7 is 2/11.

+ +

However, clearly if they don't say anything, the probability that the total is 7 is not 2/11, but rather 1/6.

+ +

Where is the flaw in the argument?

+",2013-11-06 15:07:02.440 +58989,23438.0,1,,,,Multinomial confidence interval,,CC BY-SA 3.0,"

I am trying to help out a quality auditing department and figure out sample sizes and confidence intervals and such. The sample size and confidence level (95%) is pretty well documented on the web and I am muddling through it. I even figured out how to do the exact confidence level of a binomial sample which I would use for a pass/fail condition but I wanted to figure out how to do a multinomial of that same sample so that I could generate some type of information about a situation with three possibilities: pass / fail(critical) / fail(non-critical).

+ +

Can someone point me in the right direction here? Ultimately I am going to want to program this into an application so a formula would get me going, a link to a way to program it would be a home run.

+",2013-11-06 17:21:38.667 +59003,23414.0,2,,58977.0,,,,CC BY-SA 3.0,"

Bill has given a great answer. But maybe some practical things are worth adding.

+ +
  1. nls is sensitive to starting values. Depending on your starting values you're going to get different answers or your model might not converge. That's life. No reason to be surprised. An important part of using nls is working out a methodology for deriving your starting values.
  2. nls is particularly sensitive. If you're having trouble you should always try minpack.lm before reassessing your approach; it uses an optimization algorithm that is more robust to starting values (see the sketch after this list).
  3. Try what Bill recommends.
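
A minimal sketch of point 2, using nlsLM() from the minpack.lm package (the data frame below assumes p2 as in your question, and the starting values are just your original ones):

    library(minpack.lm)

    dat <- data.frame(x = p2[, 1], y = p2[, 2])
    fit <- nlsLM(y ~ c + a * x^2 * exp(b * x),
                 data  = dat,
                 start = list(a = 0.005, b = -0.005, c = 5))
    coef(fit)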
+",2013-11-06 20:47:27.883 +58980,23302.0,1,,,,Goodness of fit shows less than 0.01 p-values at all 16 distributions for my wind data,,CC BY-SA 3.0,"

I have got two wind datasets (one for 4 months and the other one for 12 years). Typically wind speed distributions follow Weibull distributions. In order to demonstrate my data distribution in a statistically sound manner, I ran a goodness-of-fit test in Minitab. However, the p-values for all 16 distributions, including the Weibull distribution, were extremely low (<0.01). I have no idea what to do next, as none of the distributions fit my data. The Weibull distributions showed the lowest AD values, though. Can I use that to support the claim that my data follow a Weibull distribution? Below is the description of my test result.

+ +

Descriptive Statistics

+ +
   N  N*     Mean    StDev  Median    Minimum  Maximum  Skewness   Kurtosis
+2894   0  15.8579  8.69187   14.56  0.0533333    44.42  0.521124  -0.317979
+
+
+Box-Cox transformation: Lambda = 0.573785
+
+Johnson transformation function:
+0.910617 + 1.25421 * Ln( ( X + 2.42013 ) / ( 50.6894 - X ) )
+
+
+Goodness of Fit Test
+
+Distribution                  AD       P  LRT P
+Normal                    21.568  <0.005
+Box-Cox Transformation     2.035  <0.005
+Lognormal                 52.753  <0.005
+3-Parameter Lognormal      4.050       *  0.000
+Exponential              236.204  <0.003
+2-Parameter Exponential  233.814  <0.010  0.000
+Weibull                    1.779  <0.010
+3-Parameter Weibull        1.514  <0.005  0.000
+Smallest Extreme Value    77.762  <0.010
+Largest Extreme Value      4.727  <0.010
+Gamma                     10.989  <0.005
+3-Parameter Gamma          2.917       *  0.000
+Logistic                  20.538  <0.005
+Loglogistic               23.023  <0.005
+3-Parameter Loglogistic    8.249       *  0.000
+Johnson Transformation     0.622   0.106
+
+
+ML Estimates of Distribution Parameters
+
+Distribution             Location    Shape     Scale  Threshold
+Normal*                  15.85790            8.69187
+Box-Cox Transformation*   4.68271            1.59250
+Lognormal*                2.56056            0.74667
+3-Parameter Lognormal     3.37633            0.28606  -14.61547
+Exponential                                 15.85790
+2-Parameter Exponential                     15.81003    0.04787
+Weibull                            1.86662  17.81083
+3-Parameter Weibull                1.97275  18.54109   -0.58137
+Smallest Extreme Value   20.39570            9.29332
+Largest Extreme Value    11.74217            7.24790
+Gamma                              2.61645   6.06084
+3-Parameter Gamma                  5.10250   3.94816   -4.28763
+Logistic                 15.31511            5.03370
+Loglogistic               2.63670            0.38443
+3-Parameter Loglogistic   3.22306            0.19638  -10.52426
+Johnson Transformation*  -0.01235            1.01633
+
+* Scale: Adjusted ML estimate
+
+",2013-11-06 15:13:06.043 +58981,10060.0,2,,58964.0,,,,CC BY-SA 3.0,"
+

The maximum by which X1 can increase is 0.30-0.001. Following your + explanation, if I want to compare the max. value with the lowest + value, does this mean that the average log(y) increases by 1.5/100 * + 0.3-0.001, i.e. around 45%?

+
+ +

No, because the simple way of multiplying the coefficient by 100 and then interpreting as a factorial change only works when both the coefficient and the change in the independent variable are relatively small.

+ +

Consider the equation:

+ +

$\ln(y) = \alpha + 1.5X_1 + 0.8 X_2$

+ +

With one percent point increase:

+ +

$\ln(y') = \alpha + 1.5(X_1+0.01) + 0.8 X_2$

+ +

Subtracting for the difference:

+ +

$\ln(y') - \ln(y) = 1.5 \times 0.01$

+ +

$\ln(y'/y) = 0.015$

+ +

The factorial change is then:

+ +

$y'/y = 1.015113$,

+ +

which is about 1.5%. So, that approximation shortcut works.

+ +
+ +

When the change is large, like your 0.299, the approximation can deviate:

+ +

$\ln(y'/y) = 1.5 \times 0.299$

+ +

The factorial change is then:

+ +

$y'/y = 1.565961$,

+ +

about 56.6% larger instead of the approximated 45%.
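
A one-line check of these numbers in R:

    exp(1.5 * 0.01)    # ~ 1.0151, the exact factor for a 0.01 increase in X1
    exp(1.5 * 0.299)   # ~ 1.5660, the exact factor for the full 0.001 -> 0.30 change
    1.5 * 0.299        # ~ 0.4485, the naive approximation behind the 45% figure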

+",2013-11-06 15:13:40.960 +58982,21485.0,1,58985.0,,,Caret package Varimp - feature selection question,,CC BY-SA 3.0,"

I decided to use RFE using the caret package for feature selection for a logistic regression model.

+ +

The documentation says that varImp for linear models uses

+ +

the absolute value of the t-statistic for each model parameter is used.

+ +

Logistic regression is not a linear model, nor does it satisfy the linear model assumptions. Does it make sense to use linear model assumptions to reduce variables for a logistic regression problem using the caret package?

+",2013-11-06 15:41:05.977 +58983,7016.0,2,,58921.0,,,,CC BY-SA 3.0,"

From Wikipedia,

+ +
+

RMSE is a frequently used measure of the differences between values + predicted by a model or an estimator and the values actually observed.

+
+ +

Further,

+ +
+

RMSE is a good measure of accuracy, but only to compare forecasting + errors of different models for a particular variable and not between + variables, as it is scale-dependent.

+
+ +

Clearly, you can use RMSE for comparing the forecast error between different models (linear or nonlinear), as long as you are comparing models for a particular variable. I don't see any assumptions to consider.

+",2013-11-06 15:45:15.427 +58984,3693.0,1,58991.0,,,Bernoulli Confidence Intervals for p very close to 0,,CC BY-SA 3.0,"

Let's say I have the following observations from many Bernoulli distributions with different p (p1, p2, ..):

+ +

Observations from Distribution 1: 10 successes, 100,000 trials, p_hat = 0.0001
Observations from Distribution 2: 0 successes, 100 trials, p_hat = 0
Observations from Distribution 3: 4 successes, 60,000 trials, p_hat = 0.00007

+ +

I want to order these distributions by their true probabilities of success and get rid of the ones that have low probability of success. However, because of the inherent nature of these distributions, the probability of success is so low, that if I use a standard Wald and Wilson confidence interval for Bernoulli distributions, the results don't make too much sense.

+ +

Is there a standard statistical way to deal with these types of problems? Or do I have to resort to some self defined heuristics to remove distributions with low probability of success?

+",2013-11-06 15:49:47.827 +58985,2198.0,2,,58982.0,,,,CC BY-SA 4.0,"

We wouldn't be making those assumptions. The logistic regression model falls into a wider class called generalized linear models (as does linear regression).

+ +

The t-test discussed here is the generalized linear model t-statistic to test that the parameter is equal to zero.

+",2013-11-06 17:09:52.273 +58986,5906.0,1,59038.0,,,Forecast accuracy measures for different forecast horizon h in R,,CC BY-SA 3.0,"

I have yearly time series data, from 1980 to 2005. The data are split into a training sample and an out-of-sample set; the out-of-sample set consists of the 6 most recent observations and the rest is used as the training sample. I need to fit an ETS model and compare different accuracy measures for different forecast horizons h=1,2,3,4,5 and 6.

+ +

Something like this:

+ +
          h=1  h=2  h=3  h=4  h=5  h=6
    MSE    ..   ..   ..   ..   ..   ..
    MASE   ..   ..   ..   ..   ..   ..

+ +

The following code gives me the accuracy measures for h=6:

+ +
trainx<- window(x,end=1999.99)
+testx<- window(x,start=2000)
+fit<- ets(trainx)
+accuracy(forecast(fit,h=6),testx)
+
+ +

The questions are:

+ +
  1. How can I calculate the accuracy measures for h=1,2,3,4,5? For instance, when h=2, I fit a model to the training data and produce the forecasts that correspond to 2000 and 2001.

  2. Now, how should I produce the forecasts for 2002, 2003, etc.?
     Should I suppose that the observations for the years 2000 and 2001 are known and then fit a new model (this time adding the observations of 2000 and 2001 to the training set) to produce the forecasts for 2002 and 2003?
+",2013-11-06 17:13:17.030 +58987,2958.0,2,,58921.0,,,,CC BY-SA 3.0,"
  • RMSE is certainly appropriate also for nonlinear models
  • However, the RMSE expressions I know actually calculate the mean, so no -2 (looks like d.f. for linear model? - d.f. for nonlinear models would be different!)
  • In general, I'd not use the residuals for calculating RMSE but rather use independent test cases to avoid an optimistic bias.
+",2013-11-06 17:14:02.080 +58990,1636.0,2,,58949.0,,,,CC BY-SA 3.0,"

You seem to have some conceptual issues.

+ +

In the classical non-Bayesian context (the fact that you are learning about bias, and your working example, suggest that this is your context) the parameter $\theta$ is ... a parameter, a number, which is perhaps unknown to us but which nonetheless takes some determined fixed value. In short: $\theta$ is not a random variable.

+ +

The estimator, instead, is (in general) a random variable. Because $\hat{\theta}=g(\{X\})$ where $g(\cdot)$ is some function and $\{X\}$ is a list of realizations ($X_1,X_2 \cdots.. X_n$) of a random variable. (Think for example, of the sample average $(X_1+X_2+\cdots + X_n)/n$) This is to say: in different ""experiments"" (trials) we'll get different values of the estimator $\hat{\theta}$ . But in all experiments the parameter $\theta$ will be the same.

+ +

That's why it makes sense to ask if $E(\hat{\theta})=\theta$ (because the left side is the expectation of a random variable, the right side is a constant). And, if the equation is valid (it might or not be, according to the estimator) the estimator is unbiased.

+ +

In your example, you're using $\hat{\theta} = \frac{X_1+X_2+ \cdots + X_n}{n}\frac{4}{3}$. The expectation of this is $E(\hat{\theta})= \frac{n E(X)}{n} \frac{4}{3}$. Now we need to compute $E(X)$ (it will be a function of $\theta$) and check whether that gives $\theta$.
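
If you want to convince yourself numerically, here is a small Monte Carlo sketch (theta = 2 and n = 20 are arbitrary choices). Since $F(x)=x^3/\theta^3$ on $[0,\theta]$, $X$ can be simulated as $\theta U^{1/3}$ with $U\sim\text{Uniform}(0,1)$:

    set.seed(1)
    theta <- 2
    n     <- 20
    reps  <- 1e5

    est <- replicate(reps, {
      x <- theta * runif(n)^(1/3)   # draws from f(x; theta) = 3 x^2 / theta^3
      4 * mean(x) / 3               # the estimator delta(x) = 4 * xbar / 3
    })

    mean(est)   # close to theta = 2, consistent with the estimator being unbiased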

+",2013-11-06 17:34:45.527 +58991,15827.0,2,,58984.0,,,,CC BY-SA 4.0,"

Confidence intervals are great, but

  • With probabilities near zero, different takes on how to do it have to be considered. The question and discussion so far mention various possibilities. The paper by Brown and friends in Statistical Science 2001 remains the best guide I know for 21st century statisticians and data analysts.

  • With sample sizes this different, overlap of intervals is inevitable and a clear ordering a little elusive.

  • The leading evidence arguably remains the point estimates.

Putting your cases in a different order, for the point estimates 0.0001, 0.00007 and 0, Stata gives these 95% confidence intervals:

+
 
+Exact          0.0000480         0.0001839
+Agresti        0.0000515         0.0001869
+Jeffreys       0.0000514         0.0001774
+Wald           0.0000380         0.0001620
+Wilson         0.0000543         0.0001841
+
+Exact          0.0000182         0.0001707
+Agresti        0.0000192         0.0001781
+Jeffreys       0.0000225         0.0001585
+Wald           0.0000013         0.0001320
+Wilson         0.0000259         0.0001714
+
+Exact          0.0000000         0.0362167
+Agresti        0.0000000         0.0444121
+Jeffreys       0.0000000         0.0247453
+Wald           0.0000000         0.0000000
+Wilson         0.0000000         0.0369935
+
+

Notes: "Exact" here means Clopper-Pearson. Stata is explicit that it clips at 0 (or 1).

+

Normally I would add a graph, but its main point would be that the intervals for the $n = 100$ sample are massively larger, and logarithmic scale is not appropriate here.

+

If the samples were from quite different populations, you would have to take all the samples seriously. Otherwise one possible conclusion is that the sample of $n = 100$ is far too small to take seriously compared with the other samples.

+",2013-11-06 18:34:54.673 +58992,13165.0,1,,,,Fixed point iterations for expectation propagation using energy minimization,,CC BY-SA 3.0,"

EP Primal

+ +

In 1, it is finding the EP iterations by solving a saddle-point problem on the energy function. First, the primal is claimed to be +$$ +\min_{\hat{p}_i} \max_{q} \left[ \sum_i \int_{\mathbf{y}} \hat{p}_i(\mathbf{y}) \log \frac{ \hat{p}_i(\mathbf{y}) }{ t_i(\mathbf{y}) p(\mathbf{y}) } d\mathbf{y}‎ - ‎(n-1) \int_\mathbf{y} q_\theta(\mathbf{y}) \log \frac{q_\theta(\mathbf{y})}{p(\mathbf{y})} d\mathbf{y} \right]‎ +$$ +‎with the local moment matching constraints +$$ +‎\mathbb{E}_{ q_\theta(\mathbf{y}) }\left[ \phi(\mathbf{y}) \right] = \mathbb{E}_{ \hat{p}_i(\mathbf{y} ) }\left[ \phi(\mathbf{y}) \right]‎, ‎\forall i \quad \quad ‎$$

+ +

EP Dual

+ +

The dual energy function is the following; +$$ +‎\min_{\nu} \max_{\lambda} \left[ (n-1) \log \int_\mathbf{y} p(\mathbf{y}) \exp \left( \nu^\top \phi(\mathbf{y}) \right) d\mathbf{y}‎ - ‎\sum_{i=1}^{n} \log \int_\mathbf{y} \hat{t}_i(\mathbf{y}) p(\mathbf{y}) \exp \left( {\lambda_i}^\top \phi (\mathbf{y}) \right) d\mathbf{y} \right]‎, +$$ +$$ +‎(n-1) \nu = \sum_i \lambda_i‎. +$$

+ +

EP fixed point iterations

+ +

And using the dual energy function, we should be able to find the fixed point iterations:

+ +

Message elimination: Choose a $\tilde{t}_i$ to do approximation with‎. ‎

+ +

‎Remove the factor $\tilde{t}_i$ from approximation‎, ‎$\; q_\theta^{-i} = \displaystyle \frac{q_\theta}{ \tilde{t}_i }$ +‎ +Belief projection: Project the approximate posterior‎, ‎with $\tilde{t}_i$ replaced with $t_i$‎, ‎on the approximating family‎, +‎$$‎ +‎q^{new}_\theta(\mathbf{y}) = \text{proj}\left( \hat{p}_i(\mathbf{y}) \rightarrow q_\theta(\mathbf{y}) \right)‎, +‎$$‎ +‎where‎, +‎$$‎ +‎\hat{p}_i(\mathbf{y}) = \frac{1}{Z} q_\theta^{-i}(\mathbf{y}) t_i(\mathbf{y})‎, ‎\; \; Z = \int q_\theta^{-i}(\mathbf{y}) \times t_i(\mathbf{y}) d\mathbf{y}‎ +‎$$‎

+ +

$‎ +\tilde{t}_i = \arg \min_{\tilde{t}_i} \text{KL} \left( \displaystyle \frac{ t_i \prod_{j\neq i} \tilde{t}_j }{ \int t_i \prod_{j \neq i} \tilde{t}_j } \parallel q_\theta(\mathbf{y}) \right) +$. ‎

+ +

Message update: Compute the new approximating factor‎, +‎$$‎ +‎\tilde{t}_i = Z \frac{ q^{new}_\theta(\mathbf{y}) }{ q_\theta^{-i}(\mathbf{y})‎ } +‎$$‎

+ +

Here are the questions:

+ +
  1. I know how to derive the dual from the primal, but it is not clear to me where the primal is coming from.
  2. I don't see how I can find the EP iterations from the dual. Any ideas?
+",2013-11-06 18:50:24.980 +58993,17573.0,2,,58977.0,,,,CC BY-SA 3.0,"

Non-linear least squares solves $min_\beta \sum (y_i-f(x_i;\beta))^2$. This is quadratic in $\beta$ if $f$ is linear in $\beta$. Your $f$ is not linear in $\beta$, so the NLS objective function is not quadratic in $\beta$. Of course, you don't need the function to be quadratic to guarantee convergence to a unique minimum, rather you need $min_\beta \sum (y_i-f(x_i;\beta))^2$ to be convex in $\beta$. Presumably, with your $f$, the NLS objective function is not convex. It doesn't look, to me, like the kind of $f$ which generates a convex objective function. That's pretty much the explanation. You can have lots of minima or one minimum.

+ +

If I were fitting the function that you are, I would use an entirely different approach. I would not just blindly use NLS. If you look carefully at your function, $f(x_i;\beta)=a*x_i^2exp(-bx_i)+c$, it is almost linear in the parameters. If you fixed $b$ at some value, say 0.1, then you could fit $a$ and $c$ by OLS:
\begin{align}
y_i &= a*x_i^2exp(-0.1x_i)+c \\
    &= a*z_i+c
\end{align}
The variable $z_i$ is defined $z_i=x_i^2exp(-0.1x_i)$. This means that, once you have picked $b$, the optimal value of $a=\widehat{Cov}(y,z)/\hat{V}(z)$ and the optimal value of $c=\overline{y}-a*\overline{z}$.

+ +

So what, right? At the very least, this is how you should pick starting values for $a$ and $c$. But, really, this reduces the search for optimal parameters to a one dimensional search over $b$. With a modern computer, one dimensional searches are fast and easy. If you have some idea of what reasonable values for $b$ are, then you can just define an interval $[b_{low},b_{high}]$ and grid search for the b which gives the lowest sum of squared errors. Then use that $b$ and its associated optimal $a$ and $c$ to start NLS from.

+ +

Or, you could do something more sophisticated. Suppose you are searching over $b$, using the optimal $a(b)$ and $c(b)$ from OLS. Then the NLS objective function is $\sum \left(y_i - f(x_i;a(b),b,c(b))\right)^2$. The envelope theorem makes the derivative of this very easy to calculate:
\begin{align}
\frac{d}{d b} \sum \left(y_i - f(x_i;\beta)\right)^2 &= -\sum 2\left(y_i - f(x_i;\beta)\right)\frac{d}{d b}f(x_i;\beta)\\
&= \sum 2\left(y_i - f(x_i;\beta)\right)\,a x_i^3 exp(-bx_i)
\end{align}

+ +

So, you can easily write a function to calculate the NLS objective function for any given $b$ and you can easily write a function to calculate the derivative of the NLS objective function for any $b$. These two ingredients are enough to get a optimizer going on your function. Then, after you find the optimal $b$, just run NLS with that $b$ and its associated optimal $a$ and $c$. It will converge in one iteration.
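
Here is a rough sketch of that one-dimensional search in R, assuming your data are in vectors x and y and using the parameterization from your nls call (exp(b*x), so b will be negative); the grid range for b is just an assumption:

    profile_sse <- function(b, x, y) {
      z   <- x^2 * exp(b * x)     # for fixed b the model is linear: y = c + a*z
      fit <- lm(y ~ z)
      sum(residuals(fit)^2)
    }

    b_grid <- seq(-0.05, -0.001, length.out = 500)   # assumed plausible range for b
    sse    <- sapply(b_grid, profile_sse, x = x, y = y)
    b_hat  <- b_grid[which.min(sse)]

    # Recover a and c for the best b; these make good starting values for nls().
    fit_ac <- lm(y ~ I(x^2 * exp(b_hat * x)))
    list(a = unname(coef(fit_ac)[2]), b = b_hat, c = unname(coef(fit_ac)[1]))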

+",2013-11-06 18:55:39.330 +58994,14402.0,1,,,,Cohen d for 2x2 interaction,,CC BY-SA 3.0,"

There is a nice answer to this question, but it assumes that you have the ANOVA table available.

+ +

My problem is different. Say I'm reading a paper describing a male vs female, disease vs control experiment, and I know the n's, means, and standard deviations for all four groups (healthy females, ... , diseased males), but I don't have the original data.

+ +

I estimate Cohen d for the sex:health interaction. Numerator: (male_disease-female_disease)-(male_healthy-female_healthy). Denominator: I pool the standard deviations.

+ +

In simulations with perfect data (normally distributed, etc), my estimated Cohen d does not match the Cohen d calculated from eta-squared: $$2\sqrt\frac{\eta^2}{1-\eta^2}$$

+ +

I am sure there must be a closed-form estimator for Cohen's d for this case, but having looked all over google and online stats books I can't seem to find the answer. Apologies if I've missed something obvious.

+",2013-11-06 18:58:32.697 +59004,23091.0,1,,,,Total Sum of Squares (TSS) - Exponential Regression,,CC BY-SA 3.0,"

The total sum of squares is expressed as below [1][2],

+ +

\begin{align} +\rm{TSS} &= MSS + ESS +\end{align}

+ +

\begin{align} +\sum_{i=1}^n (y_{i}-\overline{y})^2 &= \sum_{i=1}^n (\hat{y}_{i}-\overline{y})^2 + \sum_{i=1}^n (y_{i}-\hat{y}_{i})^2 +\end{align}

+ +

I have a set of exponentially distributed data (x,y) as below,

+ +
x   y       
+332 7.283650
+342 7.231924
+356 6.949199
+369 7.360927
+369 7.154024
+315 7.379831
+334 7.278457
+339 7.217902
+321 7.238676
+300 7.282819
+330 7.255710
+329 7.126188
+374 7.042991
+353 7.292686
+335 7.405174
+360 7.196402
+351 7.130031
+357 7.218629
+348 6.991577
+327 7.326131
+347 7.453576
+335 7.292167
+391 7.199726
+351 7.310863
+307 7.206269
+349 7.125773
+340 7.129408
+341 7.093262
+358 7.375157
+306 7.516519
+301 7.304112
+342 7.133147
+350 7.345347
+333 7.546433
+318 7.192559
+321 7.142807
+347 7.167319
+327 7.197025
+352 7.122761
+343 7.194221
+314 10.388952
+332 7.425843
+344 8.922770
+367 7.209697
+361 7.040914
+309 7.236910
+322 7.050262
+321 7.974467
+291 12.036381
+264 10.103215
+322 7.208139
+313 8.114479
+374 7.072074
+343 7.376195
+333 7.081941
+356 7.061375
+351 7.176979
+349 7.145715
+348 6.948888
+299 13.043680
+337 7.679382
+331 7.025437
+387 7.182068
+349 7.174590
+293 7.387724
+335 7.240338
+330 7.540512
+321 9.081374
+350 8.787224
+270 12.606609
+283 8.027543
+336 7.113309
+346 7.149870
+317 10.554307
+308 8.318993
+317 7.115282
+347 7.038421
+309 7.059298
+344 7.025853
+339 7.194532
+262 12.690118
+291 9.464123
+317 12.085718
+335 7.581643
+348 7.195571
+231 8.631216
+274 7.765591
+298 12.136716
+221 8.449553
+190 11.299447
+285 7.610830
+250 9.075558
+372 6.836608
+312 9.058835
+296 6.974543
+332 7.381492
+298 7.987658
+312 7.734847
+316 6.975374
+227 13.573608
+305 9.040347
+280 8.783381
+362 7.320523
+311 6.930607
+222 9.994258
+299 10.843992
+256 9.309050
+294 9.304895
+303 11.077899
+181 14.128775
+206 11.550493
+293 8.223747
+315 8.525272
+289 10.906415
+279 8.296662
+259 9.490090
+305 8.418497
+267 9.286926
+326 8.573466
+320 7.391983
+20  46.517206
+18  46.796088
+50  33.885254
+10  53.022687
+50  33.730700
+46  33.914648
+30  39.763380
+50  34.076680
+40  40.189545
+66  27.821829
+39  40.119954
+39  39.077237
+57  27.396287
+29  46.103090
+30  39.334099
+29  40.155373
+30  40.455963
+47  33.197760
+60  39.389356
+29  40.129821
+40  40.059608
+38  39.861015
+30  46.634160
+19  45.749527
+47  33.575315
+68  33.705149
+69  33.255718
+69  33.722495
+10  52.855150
+18  46.515025
+28  40.819288
+28  39.971840
+20  47.167931
+28  41.367704
+50  34.006985
+49  33.528368
+50  33.917452
+39  39.867870
+50  33.280126
+40  39.621913
+20  46.517206
+18  46.796088
+50  33.885254
+10  53.022687
+50  33.730700
+46  33.914648
+30  39.763380
+50  34.076680
+40  40.189545
+66  27.821829
+39  40.119954
+39  39.077237
+57  27.396287
+29  46.103090
+30  39.334099
+29  40.155373
+30  40.455963
+47  33.197760
+60  39.389356
+29  40.129821
+40  40.059608
+38  39.861015
+30  46.634160
+19  45.749527
+47  33.575315
+68  33.705149
+69  33.255718
+69  33.722495
+10  52.855150
+18  46.515025
+28  40.819288
+28  39.971840
+20  47.167931
+28  41.367704
+50  34.006985
+49  33.528368
+50  33.917452
+39  39.867870
+50  33.280126
+40  39.621913
+20  46.517206
+18  46.539745
+48  33.885254
+10  53.022687
+50  33.730700
+46  33.592453
+30  39.763380
+50  34.076680
+40  40.189545
+64  27.515942
+36  40.119954
+38  39.089597
+56  27.260741
+28  46.103090
+30  39.334099
+28  40.155373
+30  40.455963
+46  33.043622
+60  39.389356
+28  40.129821
+40  40.059608
+36  39.724430
+30  46.645377
+18  45.749527
+46  33.307547
+66  33.456388
+68  33.031573
+68  33.714705
+10  52.855150
+18  46.253489
+28  40.595767
+28  39.724741
+20  47.167931
+28  41.117905
+50  34.021215
+48  33.528368
+50  33.917452
+38  39.867870
+50  33.280126
+40  39.621913
+
+ +

I am trying

+ +

a) to find the MSS, ESS and TSS values, and
b) to validate that the values I got are correct, i.e. that they satisfy TSS = MSS + ESS.

+ +

Approach 1

+ +

[Best Fit approach [3], Without converting the exponential data to linear data]

+ +

\begin{align}
y_{i} = y, \quad x_{i} = x
\end{align}
\begin{align}
\overline{y} = 23.62464471
\end{align}
\begin{align}
\hat{y}_{i} = Ae^{Bx_{i}} = 47.3826\,e^{-0.0055x_{i}}
\end{align}
where $A=\exp(a)$ and $B=b$; $a$ and $b$ are obtained from eq. (3) and eq. (4) of [3]. The equations are reproduced below,

+ +

\begin{align}
a &= \left(\sum_{i=1}^n \ln y_{i} \sum_{i=1}^n x_{i}^2 - \sum_{i=1}^n x_{i} \sum_{i=1}^n x_{i}\ln y_{i}\right) \Big/ \left(n \sum_{i=1}^n x_{i}^2 - \Big(\sum_{i=1}^n x_{i}\Big)^2\right)
\end{align}

+ +

\begin{align}
b &= \left(n \sum_{i=1}^n x_{i}\ln y_{i} - \sum_{i=1}^n x_{i} \sum_{i=1}^n \ln y_{i}\right) \Big/ \left(n \sum_{i=1}^n x_{i}^2 - \Big(\sum_{i=1}^n x_{i}\Big)^2\right)
\end{align}

+ +

Calculating the values,

+ +

\begin{align}
MSS &= \sum_{i=1}^n (\hat{y}_{i}-\overline{y})^2 = 55961.58373 \\
ESS &= \sum_{i=1}^n (y_{i}-\hat{y}_{i})^2 = 1739.728157 \\
TSS &= \sum_{i=1}^n (y_{i}-\overline{y})^2 = 62029.58951
\end{align}

+ +

Unfortunately, MSS + ESS = 57701.31189 $\neq$ TSS.
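(For reference, here is a minimal R sketch of how these three sums could be computed for a fitted exponential curve, assuming the data listed above are stored in vectors x and y; the coefficients are the Approach 1 values quoted above and are only illustrative.)

## Sketch: MSS, ESS and TSS for an exponential fit, assuming vectors x and y
## hold the data above; A and B are the Approach 1 values, used for illustration.
A <- 47.3826
B <- -0.0055
yhat <- A * exp(B * x)

MSS <- sum((yhat - mean(y))^2)   # model sum of squares
ESS <- sum((y - yhat)^2)         # residual (error) sum of squares
TSS <- sum((y - mean(y))^2)      # total sum of squares
c(MSS = MSS, ESS = ESS, TSS = TSS, MSS_plus_ESS = MSS + ESS)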

+ +

Approach 2

+ +

[Best Fit approach [3], Converting the exponential data to linear data]

+ +

\begin{align}
y_{i} = \ln y, \quad x_{i} = x
\end{align}
\begin{align}
\overline{y} = 2.868328
\end{align}
\begin{align}
\hat{y}_{i} = a + bx_{i} = 3.858255231 - 0.0055x_{i}
\end{align}
where $a$ and $b$ are obtained from eq. (3) and eq. (4) of [3]. The equations are reproduced below,

+ +

\begin{align}
a &= \left(\sum_{i=1}^n \ln y_{i} \sum_{i=1}^n x_{i}^2 - \sum_{i=1}^n x_{i} \sum_{i=1}^n x_{i}\ln y_{i}\right) \Big/ \left(n \sum_{i=1}^n x_{i}^2 - \Big(\sum_{i=1}^n x_{i}\Big)^2\right)
\end{align}

+ +

\begin{align}
b &= \left(n \sum_{i=1}^n x_{i}\ln y_{i} - \sum_{i=1}^n x_{i} \sum_{i=1}^n \ln y_{i}\right) \Big/ \left(n \sum_{i=1}^n x_{i}^2 - \Big(\sum_{i=1}^n x_{i}\Big)^2\right)
\end{align}

+ +

Calculating the values,

+ +

\begin{align}
MSS &= \sum_{i=1}^n (\hat{y}_{i}-\overline{y})^2 = 150.6993684 \\
ESS &= \sum_{i=1}^n (y_{i}-\hat{y}_{i})^2 = 3.992417623 \\
TSS &= \sum_{i=1}^n (y_{i}-\overline{y})^2 = 154.691786
\end{align}

+ +

Here we are able to validate that MSS + ESS = 154.691786 = TSS.

+ +

Approach 3

+ +

[Least Square Fit approach [3], Without converting the exponential data to linear data]

+ +

\begin{align}
y_{i} = y, \quad x_{i} = x
\end{align}
\begin{align}
\overline{y} = 23.62464471
\end{align}
\begin{align}
\hat{y}_{i} = Ae^{Bx_{i}} = 48.6062\,e^{-0.0056x_{i}}
\end{align}
where $A=\exp(a)$ and $B=b$; $a$ and $b$ are obtained from eq. (9) and eq. (10) of [3]. The equations are reproduced below,

+ +

\begin{align}
a &= \left(\sum_{i=1}^n (x_{i}^2 y_{i}) \sum_{i=1}^n (y_{i}\ln y_{i}) - \sum_{i=1}^n (x_{i} y_{i}) \sum_{i=1}^n (x_{i}y_{i}\ln y_{i})\right) \Big/ \left(\sum_{i=1}^n y_{i}\sum_{i=1}^n (x_{i}^2 y_{i}) - \Big(\sum_{i=1}^n x_{i} y_{i}\Big)^2\right)
\end{align}

+ +

\begin{align}
b &= \left(\sum_{i=1}^n y_{i} \sum_{i=1}^n (x_{i} y_{i}\ln y_{i}) - \sum_{i=1}^n (x_{i} y_{i}) \sum_{i=1}^n (y_{i}\ln y_{i})\right) \Big/ \left(\sum_{i=1}^n y_{i}\sum_{i=1}^n (x_{i}^2 y_{i}) - \Big(\sum_{i=1}^n x_{i} y_{i}\Big)^2\right)
\end{align}

+ +

Calculating the values,

+ +

\begin{align}
MSS &= \sum_{i=1}^n (\hat{y}_{i}-\overline{y})^2 = 59282.06044 \\
ESS &= \sum_{i=1}^n (y_{i}-\hat{y}_{i})^2 = 1622.208368 \\
TSS &= \sum_{i=1}^n (y_{i}-\overline{y})^2 = 62029.58951
\end{align}

+ +

Unfortunately, MSS + ESS = 60904.26881 $\neq$ TSS.

+ +

Approach 4

+ +

[Least Square Fit approach[3], Converting the exponential data to linear data]

+ +

\begin{align}
y_{i} = \ln y, \quad x_{i} = x
\end{align}
\begin{align}
\overline{y} = 2.868328
\end{align}
\begin{align}
\hat{y}_{i} = a + bx_{i} = 3.858255231 - 0.0055x_{i}
\end{align}
where $a$ and $b$ are obtained from eq. (9) and eq. (10) of [3]. The equations are reproduced below,

+ +

\begin{align}
a &= \left(\sum_{i=1}^n (x_{i}^2 y_{i}) \sum_{i=1}^n (y_{i}\ln y_{i}) - \sum_{i=1}^n (x_{i} y_{i}) \sum_{i=1}^n (x_{i}y_{i}\ln y_{i})\right) \Big/ \left(\sum_{i=1}^n y_{i}\sum_{i=1}^n (x_{i}^2 y_{i}) - \Big(\sum_{i=1}^n x_{i} y_{i}\Big)^2\right)
\end{align}

+ +

\begin{align}
b &= \left(\sum_{i=1}^n y_{i} \sum_{i=1}^n (x_{i} y_{i}\ln y_{i}) - \sum_{i=1}^n (x_{i} y_{i}) \sum_{i=1}^n (y_{i}\ln y_{i})\right) \Big/ \left(\sum_{i=1}^n y_{i}\sum_{i=1}^n (x_{i}^2 y_{i}) - \Big(\sum_{i=1}^n x_{i} y_{i}\Big)^2\right)
\end{align}

+ +

Calculating the values,

+ +

\begin{align}
MSS &= \sum_{i=1}^n (\hat{y}_{i}-\overline{y})^2 = 155.7545382 \\
ESS &= \sum_{i=1}^n (y_{i}-\hat{y}_{i})^2 = 4.053637091 \\
TSS &= \sum_{i=1}^n (y_{i}-\overline{y})^2 = 154.691786
\end{align}

+ +

Unfortunately, MSS + ESS = 159.8081753 $\neq$ TSS.

+ +

Questions:

+ +
  1. Is the above equation limited to linear data only?
  2. How can I calculate TSS and ESS for exponential data without converting it to linear form first? The TSS equation seems generic enough to fit any type of data.
+ +

Hope you can guide me on the right way to calculate TSS and ESS for exponential data.

+ +
Reference:
+[1] http://en.wikipedia.org/wiki/Explained_sum_of_squares
+[2] http://www.originlab.com/forum/topic.asp?TOPIC_ID=4823 
+[3] http://mathworld.wolfram.com/LeastSquaresFittingExponential.html
+
+",2013-11-06 20:50:43.547 +58996,22483.0,1,,,,"Simpson's Paradox, Combining data across confounding variable when few values are missing",,CC BY-SA 3.0,"

The statistical analysis of experimental data that I have to perform could be described as follows. Three drug treatments $D_1$, $D_2$ and $D_3$ were tested across three groups $G_1$, $G_2$ and $G_3$. For each group and treatment combination $D_i, G_k$ the data are exposure (the number of patients being treated with the drug) and effect (the ratio of the number of patients recovered from treatment to the number treated).
I have to find an ordered list of treatments according to their effectiveness.

+ +

This requires combining data taken from different groups. I know about Simpson's paradox, and from Bayesian belief networks (which establish the causal relationships) I know that the groups (a confounding or lurking variable) affect both the exposure to a treatment and its effectiveness, so I should base any inference on the data partitioned across groups. One way I came across to remove this paradox is to break one of the relationships: between the confounding variable and exposure, or between the confounding variable and effectiveness. The relationship between the confounding variable and exposure can be broken if the exposure is the same (or proportional) across groups, which could be achieved by scaling exposure and effectiveness (multiplying by a factor that balances out the exposure across different groups). This works fine if I have all the data for each treatment-group combination.

+ +

But the data I have contain some missing values: for a few $D_i, G_k$ I don't have their exposure and effectiveness. Should I use expectation-maximization techniques here to find the missing values and then use scaling while combining the data? If so, how should I proceed to avoid getting spurious results?

+ +

One assumption I could make from domain knowledge is that the groups affect the effectiveness of every treatment in the same way, and that the effect is multiplicative,

+ +

i.e. effect of drug for a group $D_i, G_k$ = effect of drug $D_i \times$ effect on group $G_k$

+ +

What could be other ways to find the ordered list?

+",2013-11-06 19:07:01.127 +58997,1895.0,2,,58979.0,,,,CC BY-SA 3.0,"

The flaw in the argument is that the conditioning random variable is not well-defined.

+ +

The ambiguity lies in how our friend peeking at the dice decides to report the information back to us. If we let $X_1$ and $X_2$ denote the random variables associated with the values of each of the dice, then it is certainly true that for each $k \in \{1,2,\ldots,6\}$,
$$
\mathbb P(X_1 + X_2 = 7 \mid X_1 = k \cup X_2 = k) = \frac{2}{11} \>,
$$
independently of $k$.

+ +

However, the events $\{X_1 = k \cup X_2 = k\}$ are clearly not mutually exclusive, and so clearly we cannot claim
$$
\begin{align}
\mathbb P(X_1 + X_2 = 7) &\stackrel{?}{=} \sum_{k=1}^6 \mathbb P(X_1 + X_2 = 7 \mid X_1 = k \cup X_2 = k) \mathbb P( X_1 = k \cup X_2 = k ) \cr
&\stackrel{?}{=} \frac{2}{11} \sum_{k=1}^6 \mathbb P( X_1 = k \cup X_2 = k ) \cr
&\stackrel{?}{=} \frac{2}{11}
\end{align}
$$

+ +

Formally, we need to properly define a random variable, say $K$, that encodes the knowledge imparted by our peeking friend.

+ +

Our peeking friend could always report the value of the left-most die, or the right-most, or the larger of the two. She could flip a coin and then report based on the coin flip, or employ any number of more complicated machinations.

+ +

But, once this process is specified, the apparent paradox vanishes.

+ +

Indeed, suppose that $K = X_1$. Then, we have
$$
\begin{align}
\mathbb P(X_1 + X_2 = 7) &= \sum_{k=1}^6 \mathbb P(X_1+X_2 = 7, K=k) \cr
 &= \sum_{k=1}^6 \mathbb P(X_1+X_2 = 7 \mid K=k) \mathbb P(K=k) \cr
 &= \sum_{k=1}^6 \frac{1}{36} = \frac{1}{6} \>.
\end{align}
$$

+ +

Similar arguments hold if we choose $K = X_2$ or $K = \max(X_1,X_2)$, etc.
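For anyone who wants to see this numerically, here is a small R simulation sketch checking both quantities (the unconditional $\frac{1}{6}$ and the conditional $\frac{2}{11}$):

## Simulation check of the probabilities discussed above.
set.seed(42)
n  <- 1e6
d1 <- sample(1:6, n, replace = TRUE)
d2 <- sample(1:6, n, replace = TRUE)

mean(d1 + d2 == 7)                        # close to 1/6
k <- 3                                    # any fixed face value
mean((d1 + d2 == 7)[d1 == k | d2 == k])   # close to 2/11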

+",2013-11-06 19:08:33.613 +58998,11250.0,1,,,,Goodness-of-fit for very large sample sizes,,CC BY-SA 3.0,"

I collect very large samples (>1,000,000) of categorical data each day and want to see whether the data look ""significantly"" different between days, in order to detect errors in data collection.

+ +

I thought using a goodness-of-fit test (in particular, a G-test) would be a good fit (pun intended) for this. The expected distribution is given by the distribution of the previous day.

+ +

But, because my sample sizes are so large, the test has very high power and gives off many false positives. That is to say, even a very minor daily fluctuation will give a near-zero p-value.

+ +

I ended up multiplying my test statistic by some constant (0.001), which has the nice interpretation of sampling the data at that rate. This article seems to agree with this approach. They say that:

+ +
+

Chi square is most reliable with samples of between roughly 100 to 2500 people

+
+ +

I'm looking for some more authoritative comments about this. Or perhaps some alternative solutions to false positives when running statistical tests on large data sets.

+",2013-11-06 19:17:37.127 +58999,21057.0,1,,,,An inequality involving expectation,,CC BY-SA 3.0,"

Let $f,g$ be two pdfs, and suppose $X$ is a random variable that has pdf $f$. Is it necessarily true that $E[f(X)] \ge E[g(X)]$?

+ +

Although I doubt this will help, I got this problem from studying the Kullback-Leibler divergence, which is defined as $D(f,g) = E\left[\ln (f(X)/g(X))\right]$ (with $X$ having pdf $f$ as above). It can be shown using Jensen's inequality that $D(f,g) \ge 0$, which is equivalent to saying that $E[\ln(f(X))] \ge E[\ln(g(X))]$. But my question removes the $\ln$, and I am wondering whether it will still be true.

+ +

In fact, I hypothesize that not only is it true for the $\ln$ function, but it will be true for any function that is increasing (which includes the identity function, which is my question here). So as a bonus, maybe we could ask if $E[h(f(X))] \ge E[h(g(X))]$ where $h$ is any strictly increasing function on positive reals. It seems like a continuous form of the rearrangement inequality to me.

+",2013-11-06 19:22:27.210 +59000,23443.0,1,59001.0,,,Linear regression from data that don't represent a function,,CC BY-SA 3.0,"

I have $(x,\ y)$ pairs with a strongly suspected linear correlation, so I want to fit the ""best"" linear function in order to make predictions for new $x$'s. These pairs don't represent a function, i.e. there are many different values of $y$ for some $x$. I want to know whether that is an issue when treating this as a machine learning problem. Should I use some unique ""representative"" (average, maximum, minimum, most frequent, etc.) and convert the input data into a function, or is it OK to work with the data as they are?

+",2013-11-06 19:30:31.207 +59001,16464.0,2,,59000.0,,,,CC BY-SA 3.0,"

If you perform linear regression on your raw data using Ordinary Least Squares to estimate the model parameters you will get a model that estimates the conditional mean of $Y$ given the observed value of $X$. I.e. $\hat{f}(x)=E[Y|X=x].$ Since your model outputs the conditional expectation of the response it wouldn't be necessary to convert your $Y|X=x$ into a descriptive statistic prior to your analysis.
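As a small illustration (a sketch with made-up data, not the asker's), OLS handles repeated $x$ values with differing $y$ values without any pre-aggregation:

## Sketch: repeated x values with different y values pose no problem for OLS.
set.seed(1)
x <- rep(1:10, each = 5)             # each x value appears five times
y <- 2 + 3 * x + rnorm(length(x))    # several different y values per x

fit <- lm(y ~ x)
coef(fit)                                     # intercept and slope estimates
predict(fit, newdata = data.frame(x = 4.5))   # estimated conditional mean at a new x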

+",2013-11-06 19:53:44.297 +59002,21804.0,2,,8681.0,,,,CC BY-SA 3.0,"

the following repositories hold a variety of datasets:

+ +

Statlib

+ +

UCI machine Learning Repository

+ +

KDNuggets Link List

+ +

and, as said before, many more at this question.

+",2013-11-06 20:07:24.733 +59022,5045.0,2,,59010.0,,,,CC BY-SA 3.0,"

Rob Hyndman outlines one approach here and here, with a survey reference. Both are towards the end of the links. It seems pretty straightforward to implement in Python.

+",2013-11-07 00:29:07.917 +59005,4656.0,2,,58979.0,,,,CC BY-SA 3.0,"

If $B$ is an event with the property that $P(B\mid D_i) = p$ for all events $\{D_1, D_2, \ldots\}$ in a countable partition of the sample space $\Omega$ (that is, $D_i \cap D_j = \emptyset$ for all $i \neq j$ and $\bigcup_i D_i = \Omega$), then the law of total probability tells us that
$$P(B) = \sum_i P(B\mid D_i)P(D_i) = p\sum_i P(D_i) = p.$$
However, the law of total probability does not apply if the events $D_i$ are not mutually exclusive (even though their union is still $\Omega$), and we cannot conclude that $P(B)$ equals the common value of $P(B\mid D_i)$.

+ +

Let $A_i$ denote the event that at least one of the dice shows the number $i$ and $B$ the event that the sum of the two numbers on the dice is $7$. We know that $P(B) = \frac{1}{6}$ and that $P(A_i) = \frac{11}{36}$. Also, $P(B\mid A_i) = \frac{2}{11}$. Now, $A_1\cup A_2\cup A_3 \cup A_4\cup A_5\cup A_6$ is the entire sample space $\Omega$, but we cannot use the fact that $P(B\mid A_i)$ is the same for all choices of $i$ to conclude that $P(B) = \frac{2}{11}$, because the $A_i$ are not mutually exclusive events. However, notice that, regarded as a multiset, $A_1\cup A_2\cup A_3 \cup A_4\cup A_5\cup A_6$ contains each outcome $(i,j)$ exactly twice, once as a member of $A_i$ and again as a member of $A_j$. Therefore,
$$\sum_{i=1}^6 P(B \mid A_i)P(A_i) = \sum_{i=1}^6 \frac{2}{11}\times\frac{11}{36} = \frac{1}{3},$$
which is exactly twice the value of $P(B)$.

+",2013-11-06 20:56:39.623 +59006,23447.0,1,,,,Creating a Normalization Factor,,CC BY-SA 3.0,"

I have a relatively simple problem that I can't seem to find a satisfactory solution to. I have three scales for three different sets of data: one varies from [-5, 5], another from [1, 10], and the last from [-10, 10]. What would be an appropriate way to put data from all three sets on a single scale?

+",2013-11-06 21:05:05.393 +59007,22686.0,2,,58998.0,,,,CC BY-SA 3.0,"

One approach would be to make the goodness-of-fit tests more meaningful by performing them on smaller blocks of data.

+ +

You could split your data from a given day into e.g. 1000 blocks of 1000 samples each, and run an individual goodness-of-fit test for each block, with the expected distribution given by the full dataset from the previous day. Keep the significance level for each individual test at the level you were using (e.g. $\alpha = 0.05$). Then look for significant departures of the total number of positive tests from the expected number of false positives (under the null hypothesis that there is no difference in the distributions, the total number of positive tests is binomially distributed, with parameter $\alpha$).

+ +

You could find a good block size to use by taking datasets from two days where you could assume the distribution was the same, and seeing what block size gives a frequency of positive tests that is roughly equal to $\alpha$ (i.e., what block size stops your test from reporting spurious differences).
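A rough R sketch of this blocking idea (using a chi-squared test as a stand-in for the G-test; the vectors yesterday and today are hypothetical category labels):

## Sketch of block-wise goodness-of-fit testing.  yesterday and today are
## hypothetical label vectors; chisq.test stands in for the G-test.
set.seed(1)
yesterday <- sample(letters[1:5], 1e6, replace = TRUE)
today     <- sample(letters[1:5], 1e6, replace = TRUE)

expected_p <- prop.table(table(yesterday))        # reference distribution
block_size <- 1000
blocks     <- split(today, ceiling(seq_along(today) / block_size))

pvals <- sapply(blocks, function(b) {
  obs <- table(factor(b, levels = names(expected_p)))
  chisq.test(obs, p = expected_p)$p.value
})

alpha <- 0.05
n_pos <- sum(pvals < alpha)
## Compare the number of positive blocks with what alpha alone would predict:
binom.test(n_pos, length(pvals), p = alpha)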

+",2013-11-06 21:11:38.023 +59008,22564.0,2,,58998.0,,,,CC BY-SA 3.0,"

The test is returning the correct result. The distributions are not the same from day to day. This is, of course, no use to you. The issue you are facing has been long known. See: Karl Pearson and R. A. Fisher on Statistical Tests: A 1935 Exchange from Nature

+ +

Instead you could look back at previous data (either yours or from somewhere else) and get the distribution of day to day changes for each category. Then you check if the current change is likely to have occurred given that distribution. It is difficult to answer more specifically without knowing about the data and types of errors, but this approach seems more suited to your problem.

+",2013-11-06 21:49:32.890 +59009,23450.0,2,,33598.0,,,,CC BY-SA 3.0,"

I think that $H_0$ is supported and $H_1$ is rejected. In addition, the GDP having no unit root means that the trend remains: there is no break point.

+",2013-11-06 21:54:18.113 +59010,23451.0,1,,,,How do you do time series cross-validation using python?,,CC BY-SA 3.0,"

Also, any tutorials/blogs available that you are aware of?

+",2013-11-06 22:21:51.163 +59011,23348.0,2,,59006.0,,,,CC BY-SA 3.0,"

Assuming you have enough data points, I don't see why you couldn't just normalize everything into units of standard deviations. For each group, calculate the group's average value, and the group's standard deviation. Then for each data point in each group, subtract the corresponding group average, and then divide that result by the group's standard deviation.

+ +

For example, let's say that your first group (with the scale from -5 to 5) has a mean of 1.7, and a standard deviation of 2.3. For each observation in the group, subtract 1.7 from the observation, and then divide the result by 2.3. The result is the number of standard deviations above (if the result is positive) or below (if the result is negative) the observation is from the mean of the group.

+ +

All of your observations from each of the three groups will now be on a common scale, whose mean is zero, standard deviation is one, and the units of this new scale is standard deviations.
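In R this is essentially what scale() does; a minimal sketch with hypothetical vectors g1, g2 and g3:

## Sketch: put three groups on a common z-score scale.
set.seed(1)
g1 <- runif(100, -5, 5)
g2 <- runif(100, 1, 10)
g3 <- runif(100, -10, 10)

z1 <- as.numeric(scale(g1))   # (g1 - mean(g1)) / sd(g1)
z2 <- as.numeric(scale(g2))
z3 <- as.numeric(scale(g3))

sapply(list(z1, z2, z3), mean)   # approximately 0 for each group
sapply(list(z1, z2, z3), sd)     # exactly 1 for each group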

+",2013-11-06 22:23:08.643 +59012,21652.0,1,,,,Comparing models with different number of predictors,,CC BY-SA 3.0,"

Given that the overall F-test of a multiple regression model has an F distribution, which depends on the number of predictors in the model, I understand why you cannot compare the F-statistics from models with different numbers of predictors.

+ +

However, the p-value of the F-statistic has a uniform distribution between 0 and 1 under the null hypothesis, and it tests the null that all $\beta_j = 0$ against the alternative that at least one $\beta_j \neq 0$.

+ +

Can I compare the F-statistic p-values from models with different numbers of predictors? If not, what are good alternatives ($R^2$? adjusted $R^2$?)

+",2013-11-06 22:32:59.840 +59013,23452.0,1,,,,Calculating Standard deviation of percentages?,,CC BY-SA 3.0,"

I have the following data:

X    1    2    3    4    5  ...
Y   10   12   13   14   15  ...
X/Y 10%  16%  23%  etc.

How do I find the standard deviation of the percentages (last line)? Can I treat the ratios as normally distributed and apply the regular SD formula?

+",2013-11-06 22:36:03.583 +59023,,1,,,user30490,Deriving the distribution of the sum of censored variables,,CC BY-SA 3.0,"

I want to be able to calculate the distribution of
$$Y = \sum_{i=1}^n\max\{0,X_i\}$$
where the random variable $X_i\sim N(\mu_i,\sigma_i)$. Is the calculation of $f_Y(y)$ possible, and if so what is it?

+",2013-11-07 00:42:24.927 +59024,12522.0,2,,59012.0,,,,CC BY-SA 3.0,"

I would NOT recommend the $R^2$ as this measure increases as the number of variables increases. In other words, the $R^2$ does not account for overfitting.

+ +

Among the options you mentioned the adjusted $R^2$ would be the best. If you take a look at the formula:

+ +

$R^2_{adj} = 1 - \frac{(1-R^2)\cdot(n-1)}{n-p-1}$

+ +

Since the number of parameters $p$ appears in the denominator of the formula, adding variables that do not significantly increase the $R^2$ will penalize the $R^2_{adj}$.

+ +

A better approach to compare your models would be to use the Akaike Information Criterion:

+ +

$AIC_i = -2\cdot log(\mathcal{L}_i) + 2\cdot p_i$

+ +

where $\mathcal{L}_i$ is the likelihood of model $i$

+ +

You can obtain this very easily in R by using the AIC function: AIC(model1, model2)
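A small sketch of the comparison with simulated data (the variables are made up for illustration):

## Sketch: comparing models with different numbers of predictors via AIC.
set.seed(1)
n  <- 200
x1 <- rnorm(n); x2 <- rnorm(n); x3 <- rnorm(n)
y  <- 1 + 2 * x1 + 0.5 * x2 + rnorm(n)   # x3 is irrelevant here

model1 <- lm(y ~ x1 + x2)
model2 <- lm(y ~ x1 + x2 + x3)

AIC(model1, model2)               # lower AIC indicates the preferred model
summary(model1)$adj.r.squared
summary(model2)$adj.r.squared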

+",2013-11-07 00:45:53.630 +59014,11490.0,1,,,,How to get asymptotic covariance matrix when observed information matrix is singular,,CC BY-SA 3.0,"

I'm fitting different models by maximum likelihood. To do this I'm using a stochastic version of the Newton-Raphson algorithm, where both the gradient and the Hessian of the likelihood are estimated at each step.

+ +

In most cases I reach convergence, but then I often encounter the following problem: the estimated Hessian $\hat H$ at convergence is not positive definite. This is a problem because I can't invert it to get an estimate of the observed Fisher information.

+ +

This happens because one or more parameters are weakly identified. On the other hand, other parameters seem to be well identified, and their corresponding entries in the estimated Hessian seem to make sense. Identifiability is difficult to assess beforehand for the dynamical models I'm trying to fit.

+ +

What I would like is an approach that (starting from $\hat H$) points out which parameters are not identifiable and that gives variances for the remaining parameters.

+ +

I tried to do this by tilting the smallest eigenvalue of $\hat H$ in order to get a better condition number. The results seem arbitrary: depending on the condition number I want to achieve, I get different variances for the parameters.

+ +

EDIT: This is a typical example of a Hessian I can end up with:

+ +
H <- matrix(c( 67.23586, 10.477815, 138.696877,
+               10.47782, -3.238982,   2.592774,
+              138.69688,  2.592774, 473.161347 ), 3, 3, byrow = TRUE)
+
+ +

You see that I have a negative entry in the main diagonal: the second parameter is weakly identifiable, while the others are well identified. The other situation I often encounter is that of a sub-group of parameters that are redundant (highly correlated).
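To make the tilting concrete, here is a rough sketch of one possible variant (adding a ridge to the diagonal of the matrix H defined above; the value of lambda is arbitrary, which is exactly the arbitrariness described above):

## Sketch: inspect the spectrum of H and tilt it before inverting.
e <- eigen(H, symmetric = TRUE)
e$values                            # a non-positive eigenvalue flags a weak direction
e$vectors[, which.min(e$values)]    # the poorly identified parameter combination

lambda   <- 1                       # arbitrary tilt
H_tilted <- H + lambda * diag(nrow(H))
solve(H_tilted)                     # approximate covariance; depends strongly on lambda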

+",2013-11-06 22:42:31.000 +59015,20953.0,2,,59013.0,,,,CC BY-SA 3.0,"

The classical definition of the standard deviation estimate is independent of the theoretical distribution of the data, so you can perfectly well apply it to a set of percentages: $$ s = \sqrt{\frac{1}{n-1}\sum_{i=1}^n \left( z_i - \overline z\right)^2 } .$$

+ +

Depending on the distribution, you can have other estimates, though, with different properties.

+",2013-11-06 22:45:05.033 +59016,20953.0,2,,59014.0,,,,CC BY-SA 3.0,"

It is less general than your problem, but I think this article may help: Ridge Estimators in Logistic Regression. Basically, instead of maximizing the (log-)likelihood $L$, you maximize $$ L_\lambda(\beta) = L(\beta) - \lambda \|\beta\|_2^2 .$$ This regularization increases all the eigenvalues of the Hessian matrix, so it is not so far away from what you are doing right now.

+",2013-11-06 22:59:23.020 +59017,594.0,2,,59013.0,,,,CC BY-SA 3.0,"

You should clarify your question to make clear whether you want the sample standard deviation of the collection of percentages or the estimated standard deviation of each percentage.

+ +

I'll discuss the second case. You should also clarify the question enough that I can remove some of the 'if's below.

+ +

If the Y's are total counts of objects of which the X's are counts of some particular subset (such as X='number of people with red hair in a classroom', and Y='number of people in the classroom'), and if you can assume independence of occurrence of the characteristic being counted in X and if you can assume constant probability of occurrence of that characteristic, ... +then conditional on Y, you're in a binomial sampling situation and the estimated s.d. of the fraction X/Y is $\sqrt{\frac{1}{Y} \frac{X}{Y}(1-\frac{X}{Y})}$, which you can convert to percentage terms by multiplying by 100%.

+",2013-11-06 23:03:48.483 +59018,23453.0,1,,,,Panel data: predictor variables with different observation times/frequencies,,CC BY-SA 4.0,"

I would like to estimate a standard logit (panel data):

+ +

$\text{logit}(P(y_{i,t}=1))= \alpha + \beta_1 x^1_{i,t} + \beta_2 x^2_{i,t} +\epsilon_{i,t}$.

+ +

The problem I am facing, however, is that $x^2$ is yearly, whereas $y$ and $x^1$ are monthly. Aggregating $x^1$ to the yearly level is not an option. So I naively created 12 observations for each $x^2$ in order to make it monthly and be able to match it to every $y$ and $x^1$. Yet I realize that this might cause problems:

+ +
  1. It may create serial correlation.
  2. It artificially inflates the number of observations and hence reduces the SE of $x^2$'s coefficient.
  3. The SEs of the other coefficients are not affected.
  4. All coefficients remain unbiased.
+ +

My first question is: is this diagnosis of the potential problems correct? Are there other problems I did not think about?

+ +

My second question is whether a solution to the problems outlined above is to use robust standard errors, clustering the observations by year.

+",2013-11-06 23:39:53.850 +59019,23455.0,1,,,,"Logistic regression - the model is significant in predicting the DV, yet the percent correct decreases",,CC BY-SA 3.0,"

How can this be? My thoughts are that while the percent correct decreased from 80.8% to 80% with the model, perhaps the model is regarded as a significant predictor because the specificity increased. Any thoughts?

+",2013-11-06 23:43:23.503 +59020,15827.0,2,,42513.0,,,,CC BY-SA 3.0,"

Previous answers make excellent points, but here is one fundamental point to be added.

+ +

The mean is the centre of gravity of a distribution and so the pivot point of a histogram. It is where the distribution would balance. So, there is a reciprocal relation: not only can the mean help you think about a histogram, so also can a histogram help you think about the mean. This is even perhaps more helpful when a distribution is skewed and the mean of the distribution is not necessarily in the middle.

+",2013-11-07 00:05:51.277 +59021,2217.0,1,,,,Determine separation between two modes from distribution,,CC BY-SA 3.0,"

I’ve got a sample of pairwise distances between points in a 2D picture. Some of these points lie within the same object. Their distance to each other is thus smaller than some well-defined threshold (the object’s diameter). Points that lie in different objects (predominantly) have a pairwise distance greater than said threshold. Points that lie within the same object are however rare (<10%).

+ +

I would like to determine this distance threshold empirically from my sample.

+ +

For “appropriate” parameters (well, herein lies the rub, doesn’t it?) the threshold is visible in the density plot:

+ +

+ +

The threshold is marked by the arrow. This is the objectively right cut-off for my application: it is the dip after the first tall plateau which corresponds to the distribution of the few points lying within the same object, and it corresponds to the object diameter that can be individually verified in the original picture, but not easily automatically deduced from my data.

+ +

Unfortunately, I have no idea how to determine it in an automated fashion. Even the adjust argument / bandwidth for the density function has been found by trial and error, and a different input data set I’ve tried requires a different bandwidth.

+ +

Is there any hope? Or should I just give up?

+",2013-11-07 00:15:21.987 +59025,23457.0,2,,40859.0,,,,CC BY-SA 3.0,"

The split sample validation you proposed above has become less popular in many fields because of the issue Harrell mentions (unreliable out of bag estimates). I know Harrell has mentioned this in his textbook, but other references would be Steyerberg ""Clinical Prediction Models"" p301, James et al ""An Introduction to Statistical Learning"" p175.

+ +

In the biomedical field bootstrap resampling has thus become the standard. This is implemented in Harrell's rms package and so is fairly easy to use. But you could really use any of the other resampling methods; bootstrap has just become popular because of a Steyerberg article suggesting it is the most efficient of the resampling methods (""Internal validation of predictive models: efficiency of some procedures for logistic regression analysis"").

+ +

It is worth mentioning that a benefit of the rms package is that it easily enables you to include some of the variable selection in the bootstrap (a built-in stepwise selection option). This can be awkward to achieve with most commercial packages.

+ +

My own sense is that the differences have been overemphasized. I usually get pretty reliable/consistent results irrespective of the method used. With large sample sizes the differences are really non-existent.

+ +

Bootstrap validation - as well as the other resampling methods - can also easily be done wrong. Often only some of the model building stages are included in the bootstrap, giving inaccurate estimates. On the other hand, it is fairly hard to mess up split sample validation. Given the face validity of split sampling - I know you didn't muck it up - I prefer split sample unless it is a very small dataset. In many cases the model building process is also complicated enough that it can't really be included in a resampling method.

+ +

If you want to publish in a biomedical journal though, and you aren't using a medicare size database, you will want to use a resampling method - likely bootstrapping. If the dataset is large, you can likely still get published with k-fold and save yourself some processing time.

+",2013-11-07 01:05:08.400 +59026,13165.0,2,,22674.0,,,,CC BY-SA 3.0,"

You have many options!

+ +
  1. You can form the junction tree of your tree and run conventional Belief Propagation on the resulting graph. (See chapter )
  2. Choose one of the (loopy) BP family of algorithms, based on your concern for time or complexity (this is a big family!).
+ +

See [1] for a complete list of BP-family inference methods in MRFs.

+ +

[1] http://www.nowozin.net/sebastian/papers/kappes2013energyminimization.pdf

+",2013-11-07 01:22:49.347 +59027,13165.0,2,,6788.0,,,,CC BY-SA 3.0,"

Let's say we want to bound the empirical risk of a model. Given an arbitrary $(\epsilon, \delta)$, the sample complexity is $n(\epsilon, \delta)$ such that for $n\geq n(\epsilon, \delta)$
$$
P(|\hat{L}(f) - L(f) | \geq \epsilon ) \leq \delta .
$$
The function $\delta(n,\epsilon)$ is a bound on the deviation from the true (unknown) risk (loss).

+ +

As a higher-level intuition: Sample complexity is the smallest number of samples for which we can make sure that we are close enough to the correct model.

+",2013-11-07 01:35:24.930 +59028,23461.0,1,,,,Auto-Regressional & Moving Average Model Formula Properties,,CC BY-SA 3.0,"

I am seeking help in understanding specific values underlying the formulas for the MA(q) model and the AR(p) model. I am attempting to implement the models (building up to the combined ARIMA model) in the programming language Java.

+ +

I do not come from an overly mathematical background (I'm fairly new to statistics, at least), so be gentle.

+ +

Here is the formula I am using for the AR(p) model:

+ +

$$X_t - \mu = \beta_1(X_{t-1} - \mu) + \dots + \beta_p(X_{t-p} - \mu) + Z_t$$

+ +

Where $X$ is the time series, $\mu$ is the mean of the time series, $\beta$ is the auto-correlation coefficient at a specific lag, $p$ is the order of the model and $Z$ is white noise of mean $0$ and variance $\sigma^2$.

+ +

I'm fairly certain I have the above figured out; however, the term ""$Z_t$"" confuses me. How would I implement this in code? I understand it is ""random""; however, what are its ranges? Surely there must be a maximum and minimum of the term $Z_t$. Is it somehow based on the variance of the overall dataset? How exactly is the ""$Z$"" value calculated in an implementation?

+ +

Here is the formula I am using for the MA(q) model:

+ +

$$X_t - \mu = Z_t - \theta_1 Z_{t-1} - \dots - \theta_q Z_{t-q}$$

+ +

Where $X$ is again the time series dataset, $\mu$ is the mean of the dataset, $Z$ is white noise with mean 0 and variance $\sigma^2$, $\theta$ is the correlation coefficient at a specific lag and $q$ is the order of the model.

+ +

Again, I have the same issue as with the above model in regards to the ""$Z$"" term. Also, $\theta$ is the correlation coefficient of the dataset at various different lags, correct?

+ +

Any help on this matter would be extremely welcome, and if you have any questions I would be more than happy to answer them.

+ +

Any use of examples alongside a full dataset, i.e., X = (1,2,3,4,5,6,7), would also be extremely welcome, as it helps me understand the concept much more easily. Also, please try to keep the explanation as idiot-proof and self-contained as possible.

+",2013-11-07 02:27:04.507 +59029,23460.0,1,59041.0,,,Significance test across multiple simulated experiments,,CC BY-SA 3.0,"

First time question on this site, so please bear with me, thank you:

+ +

I have 6 coin-flip-type experiments for which I can calculate 6 binomial p-values. I would now like to calculate the significance of observing at least 4 p-values < 0.05 in six experiments total.

+ +

In one approach, I used Fisher's method (http://en.wikipedia.org/wiki/Fishers_method) to combine the p-values, but I wanted to add an additional test based on simulating the original coin-flip data.

+ +

To this end, I performed random coin-flips (P=0.5) for each of the 6 experiments; the total number of flips differs between these 6 experiments but is irrelevant. I then count how many times (out of 100 simulations) the binomial p-value < 0.05. Simulating the original data 100 times, I get the following number of significant binomial p-values (""false positive hits"") from these 6 experiments:

+ +

12, 13, 9, 10, 7, 11

+ +

Or divided by 100 (= frequency of false positive hits in simulated experiments):

+ +

0.12, 0.13, 0.09, 0.1, 0.07, 0.11

+ +

How do I calculate the probability that 4 or more out of these 6 would be positive given these frequencies? I realize that for calculating the probability that 6/6 are positive, I would simply multiply 0.12 x 0.13 x 0.09 x 0.1 x 0.07 x 0.11. But for 1-5/6 it's more complicated. I'm leaning towards a hypergeometric test, since I have to draw 6 times and I think there's no replacement, but I want to double-check with you experts. Thank you!

+",2013-11-07 02:40:18.163 +59030,13165.0,2,,37748.0,,,,CC BY-SA 3.0,"

I liked your question!

+ +

The complexity of EM depends on the structure it is being applied to, and could be quite different from one problem to another. But, just like any computational problem, it might be worthy of rigorous analysis to find and compare the complexity in different cases.

+ +

For some of the problems it has been analyzed; for example, see Section 3.1 in [1]. Some of these problems are proved to be NP-hard, whether with EM or not. See [3,4].

+ + +",2013-11-07 03:05:16.890 +59031,6608.0,1,,,,Good libraries for working with probabilistic graphical models?,,CC BY-SA 3.0,"

Could someone recommend some well-maintained and up-to-date libraries for working with probabilistic graphical models?

+ +

I noticed that there are some libraries for R listed here and one for C++, but are there some other good libraries in C++ or Python?

+",2013-11-07 03:21:23.427 +59032,16588.0,2,,59019.0,,,,CC BY-SA 3.0,"

Say we have 2 observations, an event ($X_1=1$) and a non-event ($X_2=0$). Say our first model ""predicts"" probabilities $P(X_1)=.55$ and $P(X_2)=.45$. If our decision rule for prediction of an event is based on a $.5$ probability threshold, then this first model is perfect (given our decision rule and the estimated probabilities). But notice the probabilities are close to $.5$ (close to that of getting tails on a coin flip!).

+ +

Now we change the model and the new probabilities are $P(X_1) =.95$ (much closer to observed $1$) and $P(X_2)=.51$ (not very different from the previous estimate, but importantly, crossing the threshold!). Considering the same decision rule for prediction, our predictions are not perfect anymore. However, considering the change in estimated probabilities, now we have a much better fitting model overall.

+ +

This brief discussion is based on the decision rule for prediction which I have assumed you are using. If your view of your model's performance is based on correct guesses, then the performance of your model depends on how you make guesses. Chances are, you could merely change the decision rule and see a completely different evaluation of your model's performance.

+ +

There are many ways to measure model performance. Some more imperfect than others.

+ +

It's probably worth mentioning that such a decision rule is not an inherent element of logistic regression. Logistic regression models the probability of events, not dichotomous guesses about whether they occurred.

+",2013-11-07 03:28:18.260 +59033,23414.0,2,,55436.0,,,,CC BY-SA 3.0,"

If by technique you mean classification method (logistic regression, classification tree, ...), you can use any of these methods to obtain the result you want. Each method usually has a built-in cost function that you can adjust to obtain your desired results. All of these methods end up being equivalent to building an ROC curve and choosing which point on that curve you want.

+ +

Usually this is done automatically for you so you might not be aware that there is a tuning parameter that should be changed if you have an explicit cost function.

+ +

Thus logistic regression usually uses the classification split at 0.5 probability, but based on your cost function you can change this to obtain the desired sensitivity/specificity. Most standard statistical packages contain a post-estimation command that you can use after you build your regression model to provide sensitivity/specificity/PPV/NPV for all the possible cut-points from 0 to 1.

+ +

Perhaps it is worth noting that the cost function is rarely expressed as a ""goal of 100% PPV"" but often as a ratio: the cost of false negatives / the cost of false positives. In your case this ratio is low: the cost of a false positive >> the cost of a false negative. By estimating this ratio you can give a more precise measure of your cost function.

+ +

Edit: what I have called the cost function above is usually called the ""utility function"" in texts.

+",2013-11-07 04:09:59.533 +59034,23414.0,2,,51496.0,,,,CC BY-SA 3.0,"

The terminal nodes are mutually exclusive in that observations cannot be classified into more than one node.

+ +

They are not mutually exclusive in the sense that they can use the same characteristics/variables to classify observations.

+ +

The issue here may be that often, in statistical packages like R, the node number has little meaning (at least to me). So terminal nodes 23 and 24 are not necessarily two branches from a single node - in fact they are likely to be branches of different nodes, and thus they can use the same characteristics.

+ +

Hope that makes sense

+",2013-11-07 04:17:56.270 +59035,594.0,2,,59023.0,,,,CC BY-SA 3.0,"

The distribution is of mixed type (it's neither discrete nor continuous) and has neither a density nor probability mass function.

+ +

However, it is a mixture of a (degenerate) discrete and a continuous distribution.

+ +

It should be possible to compute the CDF for small $n$; it's just a sum over the cases where 0, 1, 2, ..., $n$ of the $X$ values are < 0; the number of terms to account for grows very rapidly with $n$, though.

+ +

$Y$ takes the value $0$ with probability $\prod_i F_{X_i}(0)$ and otherwise it's from a truncated-distribution based on the distribution of the $X's$. But it's going to be complicated to do exactly.

+ +

With middling to large $n$ I'd actually be inclined to use simulation on the continuous part (perhaps with smoothing by logspline density estimation).

+ +

With very large $n$ it may even be possible to come up with some kind of approximation.

+ +

What do you need it for?

+",2013-11-07 04:28:40.553 +59036,7016.0,2,,23087.0,,,,CC BY-SA 3.0,"

MA Model Estimation:

+ +

Let us assume a series with 100 time points, and say this is characterized by an MA(1) model with no intercept. Then the model is given by

+ +

$$y_t=\varepsilon_t-\theta\varepsilon_{t-1},\quad t=1,2,\cdots,100\quad (1)$$

+ +

The error term here is not observed. So to obtain this, Box et al. Time Series Analysis: Forecasting and Control (3rd Edition), page 228, suggest that the error term is computed recursively by,

+ +

$$\varepsilon_t=y_t+\theta\varepsilon_{t-1}$$

+ +

So the error term for $t=1$ is
$$\varepsilon_{1}=y_{1}+\theta\varepsilon_{0}$$
Now we cannot compute this without knowing the value of $\theta$. So to obtain this, we need to compute the initial or preliminary estimate of the model; Box et al., Section 6.3.2, page 202 of the same book, state that,

+ +
+

It has been shown that the first $q$ autocorrelations of an MA($q$) process are nonzero and can be written in terms of the parameters of the model as
$$\rho_k=\displaystyle\frac{-\theta_{k}+\theta_1\theta_{k+1}+\theta_2\theta_{k+2}+\cdots+\theta_{q-k}\theta_q}{1+\theta_1^2+\theta_2^2+\cdots+\theta_q^2}\quad k=1,2,\cdots, q$$
The expression above for $\rho_1,\rho_2,\cdots,\rho_q$ in terms of $\theta_1,\theta_2,\cdots,\theta_q$ supplies $q$ equations in $q$ unknowns. Preliminary estimates of the $\theta$s can be obtained by substituting estimates $r_k$ for $\rho_k$ in the above equation

+
+ +

Note that $r_k$ is the estimated autocorrelation. There is more discussion in Section 6.3 - Initial Estimates for the Parameters; please read up on that. Now, assume we obtain the initial estimate $\theta=0.5$. Then,
$$\varepsilon_{1}=y_{1}+0.5\varepsilon_{0}$$
Now, another problem is that we don't have a value for $\varepsilon_0$ because $t$ starts at 1, and so we cannot compute $\varepsilon_1$. Luckily, there are two methods to obtain this,

+ +
  1. Conditional Likelihood
  2. Unconditional Likelihood
+ +

According to Box et al., Section 7.1.3, page 227, the value of $\varepsilon_0$ can be set to zero as an approximation if $n$ is moderate or large; this is the Conditional Likelihood method. Otherwise, the Unconditional Likelihood is used, wherein the value of $\varepsilon_0$ is obtained by back-forecasting; Box et al. recommend this method. Read more about back-forecasting in Section 7.1.4, page 231.

+ +

After obtaining the initial estimates and the value of $\varepsilon_0$, we can finally proceed with the recursive calculation of the error term. The final stage is then to estimate the parameter of model $(1)$; remember, this is not the preliminary estimate anymore.

+ +

In estimating the parameter $\theta$, I use a nonlinear estimation procedure, particularly the Levenberg-Marquardt algorithm, since MA models are nonlinear in their parameters.
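For comparison, in R the choice between the conditional and the full (unconditional) likelihood appears in the method argument of arima(); a minimal sketch on a simulated MA(1) series (note that R parameterizes the MA part with a plus sign, opposite to the Box et al. convention used above):

## Sketch: fitting an MA(1) model, contrasting conditional-sum-of-squares
## estimation with full maximum likelihood.
set.seed(1)
x <- arima.sim(model = list(ma = -0.5), n = 100)

fit_css <- arima(x, order = c(0, 0, 1), method = 'CSS')  # conditional likelihood
fit_ml  <- arima(x, order = c(0, 0, 1), method = 'ML')   # full (exact) likelihood

coef(fit_css)
coef(fit_ml)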

+ +

Overall, I would highly recommend you to read Box et al. Time Series Analysis: Forecasting and Control (3rd Edition).

+",2013-11-07 04:31:29.557 +59037,7016.0,2,,59028.0,,,,CC BY-SA 3.0,"

The $z_t$ is the error term, and is obtained by

+ +

$$z_t = x_t-\hat{x}_t$$

+ +

Or, the difference between the observed series ($x_t$) and the predicted one ($\hat{x}_t$). To code this, you need to obtain $\hat{x}_t$, which is just the expected value of $x_t$, or $E[x_t]$. So for example, for AR($1$),
$$x_t=\beta x_{t-1}+z_t$$
where $\beta$ is the parameter, then
$$E[x_t]=\hat{x}_t=\beta x_{t-1},\quad \mathrm{since}\; E[z_t]=0.$$
Thus,
$$z_t=x_t-\beta x_{t-1}.$$
The MA case, however, is more complicated. Assuming we have MA(1),
$$x_t=z_t-\theta z_{t-1}$$
where $\theta$ is the parameter, the error term ($z_t$) is not observed, so to compute it we need to calculate it recursively using the formula,

+ +

$$z_t=x_t+\theta z_{t-1}$$

+ +

Here is my answer on the steps of calculating the error term.
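A minimal R sketch of this recursion for MA(1), assuming $\theta$ is known (or has already been estimated) and taking $z_0 = 0$ as the starting value (the conditional approach mentioned in the other answer):

## Sketch: recursive computation of the MA(1) error terms via
## z_t = x_t + theta * z_{t-1}, starting from z_0 = 0.
ma1_errors <- function(x, theta) {
  z <- numeric(length(x))
  z_prev <- 0                       # conditional approach: z_0 = 0
  for (t in seq_along(x)) {
    z[t]   <- x[t] + theta * z_prev
    z_prev <- z[t]
  }
  z
}

ma1_errors(c(1, 2, 3, 4, 5, 6, 7), theta = 0.5)   # example on a short made-up series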

+",2013-11-07 05:13:27.493 +59038,132.0,2,,58986.0,,,,CC BY-SA 3.0,"

It sounds like you need a rolling forecast origin (aka time series cross-validation). Here is an example.

+ +
library(forecast)
+x <- ts(cumsum(rnorm(26)), start=1980)
+
+k <- 10 # minimum data length for fitting a model
+n <- length(x)
+mae <- matrix(NA,n-k-1,6)
+st <- tsp(x)[1] + k - 1
+
+for(i in 1:(n-k-1))
+{
+  trainx <- window(x, end=st+i-1)
+  testx <- window(x, start=st+i, end=st+i+5)
+  fit <- ets(trainx)
+  fcast <- forecast(fit, h=6)
+  mae[i,1:length(testx)] <- abs(fcast[['mean']]-testx)
+}
+mase <- mae / mean(abs(diff(x)))
+tab <- rbind(colMeans(mae,na.rm=TRUE),colMeans(mase,na.rm=TRUE))
+rownames(tab) <- c(""MAE"",""MASE"")
+colnames(tab) <- paste(""h="",1:6,sep="""")
+tab
+
+",2013-11-07 05:33:34.667 +59039,12495.0,1,,,,Granger Causality Testing With Panel Data,,CC BY-SA 3.0,"

I'm trying to apply a Granger causality test to panel data. I've found enough literature to understand the topic. However, I've been unable to find an R package to carry out that analysis. I'm wondering if anybody knows whether there is any package around to deal with that. Thanks!

+ +

I'm adding a potential solution, but new questions arose.

+ +

The solution that I found is to apply a Granger non-causality test using the Generalized Method of Moments (GMM). In Erdil & Yetkiner's (2004) paper you can find a description of the Granger non-causality test for panel data. To perform GMM I used the plm package for R. If you have a look at its tutorial (Croissant & Millo, 2008), you'll see that the built-in function pgmm (page 17) removes the individual effect by first differencing and includes time dummies. The function's summary also provides some tests to assess the model: for instance, a check for serial autocorrelation in the residuals, Wald tests for the coefficients and for the time dummies, and the Sargan test to evaluate whether there is correlation between the instrumental variables and the residuals. Then I performed a Wald test (the first one in Erdil & Yetkiner, 2004) using the sum of squared residuals of an unrestricted model (SSRu) and of a restricted model (SSRr). Now, my questions for the audience are:

+ +

1) Do the time dummies remove the time effect? I think so.

+ +

1.1) What if the time dummies aren't significant?

+ +

2) Therefore, if I got rid of the individual and time effects, is the Wald test (on SSRr-SSRu) the same as a Wald test applied to an OLS model? I think so. If so, I'm not sure about the degrees of freedom. Let's first look at the test suggested by Erdil & Yetkiner (2004):

+ +

$$W=\frac{(SSRr-SSRu)/Np}{SSRu/[NT-N(1+p)-p]}$$

+ +

where N = number of individuals, T = years and p = number of lags. Note that they didn't get rid of the individual and time effects (at least that's what I understood). Now, if I got rid of the individual and time effects, the Wald test as applied to OLS models would be:
$$W=\frac{(SSRr-SSRu)/m}{SSRu/(n-k)}$$
where m = number of restrictions (the number of coefficients that were removed from the unrestricted model to make it restricted), k = total number of coefficients in the unrestricted model and n = number of observations.

+ +

More questions:

+ +

3) What is the number of observations?

+ +

3.1) Is it the number of years, or the number of years times the number of individuals? If it is the number of years it seems reasonable, but if it is the product of years and individuals it doesn't. For example, in my case I have 328 individuals and 13 years, so it would be 4264; therefore, the numerator in the Wald test will be very, very small and I'll be rejecting everything.

+ +

Finally,

+ +

4) Am I right to proceed as I did?

+ +

Again, any help will be much appreciated

+",2013-11-07 06:18:12.660 +59040,21119.0,1,59042.0,,,$E(x^k)$ under a Gaussian,,CC BY-SA 3.0,"

What would be the expectation of $|x|^k$, where $x\sim\mathcal{N}(0,1)$, $k>0$ and $k$ is not an integer?

+",2013-11-07 06:26:28.130 +59041,594.0,2,,59029.0,,,,CC BY-SA 3.0,"

I have a number of additional comments to make regarding issues I have with what you're describing (which I will come back to), but first let's just deal with the simple question:

+ +
+

I would now like to calculate the significance of observing at least 4 p-values < 0.05 in six experiments total.

+
+ +

By 'significance' I assume you mean 'probability of ... under the null'.

+ +

Simple version:

+ +

Imagine that the sample sizes were such that we could treat the distribution of p-values under the null as continuous. In that case they will be uniform under the null.

+ +

Then under $H_0$ each experiment has a 5% chance of giving a p-value below 0.05

+ +

The distribution of the number of experiments yielding p-values below 0.05 under the null is $\text{binomial}(6,0.05)$

+ +

Let $X\sim \text{binomial}(6,0.05)$. Then $P(X\geq 4) = 8.64\times 10^{-5}$
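For reference, this number can be reproduced in one line of R:

## P(X >= 4) for X ~ Binomial(6, 0.05)
pbinom(3, size = 6, prob = 0.05, lower.tail = FALSE)   # approximately 8.64e-05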

+ +

Less simple version:

+ +

The distribution of p-values is discrete, and a significance level of exactly 0.05 won't typically be attainable. A more accurate answer in this case involves finding the largest possible p-value less than 0.05 (which depends on the exact sample size for each experiment), and then doing a similar calculation for that case. It will give a smaller probability than the one I just calculated. This involves some slightly more complicated calculation, but it's perfectly possible to do it exactly, without simulation.

+ +

(I don't think your simulation approach looks right, by the way, but since it's possible to do this question without worrying about that, I won't labor the point.)

+",2013-11-07 06:51:46.573 +59042,594.0,2,,59040.0,,,,CC BY-SA 3.0,"

$E(x^k)$ can be worked out directly from the law of the unconscious statistician

+ +

$E(x^k) = \int_{-\infty}^\infty x^k \phi(x) dx$ where $\phi$ is the standard normal pdf

+ +

You may be able to make progress with a simple substitution.

+ +

See also:

+ +

http://mathworld.wolfram.com/NormalDistribution.html (this gives the numeric answers)

+ +

and

+ +

http://mathworld.wolfram.com/GaussianIntegral.html

+ +
+ +

Responding to the updated question: we just follow my suggestion above (simple substitution).

+ +

\begin{eqnarray}
E(|x|^k) &=& \int_{-\infty}^\infty |x|^k \frac{1}{\sqrt{2\pi}} e^{-x^2/2}dx\\
 &=& 2\frac{1}{\sqrt{2\pi}} \int_{0}^\infty x^k e^{-x^2/2}dx
\end{eqnarray}

+ +

Let $u = x^2/2;\,$ so $ du = x\,dx;\,x=(2u)^{1/2}$

+ +

\begin{eqnarray}
 &=& 2\frac{1}{\sqrt{2\pi}} \int_{0}^\infty (2u)^{\frac{k-1}{2}} e^{-u}du\\
 &=& \frac{2^{k/2}}{\sqrt{\pi}} \int_{0}^\infty u^{\frac{k-1}{2}} e^{-u}du\\
 &=& \frac{2^{k/2}}{\sqrt{\pi}} \Gamma\left(\frac{k+1}{2}\right)
\end{eqnarray}
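As a quick sanity check, the closed form can be compared against numerical integration in R:

## Numerical check of E|X|^k = 2^(k/2) * Gamma((k+1)/2) / sqrt(pi) for X ~ N(0,1)
k <- 2.7   # any non-integer k > 0
closed_form <- 2^(k / 2) * gamma((k + 1) / 2) / sqrt(pi)
numeric_int <- integrate(function(x) abs(x)^k * dnorm(x), -Inf, Inf)$value
c(closed_form = closed_form, numeric_int = numeric_int)   # the two should agree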

+ +

In fact, the second link above shows you how to do it for the non-integer case - see eqns (9)-(12); there's nothing there requiring the power to be an integer.

+",2013-11-07 07:10:50.683 +59043,23466.0,1,,,,Poisson process - calls arriving,,CC BY-SA 3.0,"

I already posted this on MSE but got no answer, so I will post it here.

+ +

Assume the number of calls per hour arriving at an answering service follows a Poisson process with $\lambda = 4$.

+ +

Question: If it is known that $8$ calls came in the first two hours, what is the probability that exactly five arrived in the first hour?

+ +

Attempt: Isn't this just a combinatorial question? So the answer is ${8 \choose 5}/2^8$
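A quick numerical check in R: conditional on 8 calls in the first two hours, each call falls in the first hour independently with probability 1/2, so the count in the first hour is Binomial(8, 1/2), matching the combinatorial answer.

## Conditional count in the first hour is Binomial(8, 1/2)
dbinom(5, size = 8, prob = 0.5)   # 0.21875
choose(8, 5) / 2^8                # 0.21875, the combinatorial expression above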

+",2013-11-07 08:19:01.863 +59044,16665.0,1,59068.0,,,Analysis of interaction tables,,CC BY-SA 3.0,"

I have two tables of interactions. One represents the number of times that a clownfish of a given species is found in an anemone of a given species. The other represents the number of times that a clownfish of a given species is found in the same anemone as a clownfish of another species.

+ +
Table_anemone_fish
+
+      Fish_a    Fish_b   Fish_c
+An_A    23        56       12
+An_B    12        5        5
+An_C    23        10       68
+
+ +

Note: Summing this table gives the total number of anemones in my sample.

+ +
Table_fish_fish
+
+        Fish_a    Fish_b   Fish_c
+Fish_a   NA         7        3
+Fish_b    7        NA        1
+Fish_c    3         1        NA
+
+ +

Note: Most of the time the fish live with no other species, and they are therefore not represented in this table. The frequency of each fish species can be obtained from Table_anemone_fish.

+ +

The questions I want to answer are:

+ +
  • Do fish species differ in their preference when choosing their anemones?
  • Do fish species differ in their preference for which other species to live with?
+ +

To answer this post, one can simply give the kind of test to use and its rationale, or even better, give an example of how to analyze these data with R.

+",2013-11-07 08:43:39.197 +59045,22049.0,1,,,,What is the impact of windowing function on time series,,CC BY-SA 3.0,"

Greeting

+ +

I would like to know what the impact of windowing functions like Hanning, etc., is on a time series.

+ +

Is it possible to find anomalies using windowing functions?

+ +

EDIT

+ +

I have a time series which contains integer values (470, 471, 472, 472, 473, 471, ...). I am trying to find anomalies in this vector of values. As I have understood, this can be done with a wavelet transform, FFT, or a moving average.

+ +

As far as I know, any discontinuity between the first and last time sample (i.e., a jump in the value) will contribute to the frequency spectrum in the FFT, but we should use windowing to detect these anomalies. Does that mean I should calculate the FFT of my time series and then run a window function like Hanning on the result to find anomalies? Is that right?

+",2013-11-07 09:01:06.470 +59046,23454.0,1,59077.0,,,Joint cdf of extreme values,,CC BY-SA 3.0,"

A die is rolled twice,

+ +
  • $X_1$ : the minimum value to appear in the two rolls
  • $X_2$ : the maximum
+ +

I would like to derive $\ F_{X_1,X_2}(x_1,x_2)$.

+ +

I know that the CDF of $X_1$ is $1-[1-F(x)]^n$, the CDF of $X_2$ is $[F(x)]^n$, and $F_{X_1,X_2}(x_1,x_2) = P(X_1<x_1 \mid X_2<x_2)\,F_{X_2}(x_2)$.

+ +

The solution seems to be :

+ +

$F_{X_1,X_2}(x_1,x_2) = 2F(\min\{x_1,x_2\})F(x_2) - F(\min\{x_1,x_2\})^2$

+ +

I want to understand how such a solution can be obtained.

+",2013-11-07 09:09:49.097 +59047,11831.0,1,,,,Markov Decision Process and its generality,,CC BY-SA 3.0,"

My major is CS and I have a question about Markov decision process.

+ +

I have been reading the book Planning with Markov Decision Processes: An AI Perspective. While reading it, a question came up regarding the definition of an MDP and its generality.

+ +

A 2nd-order Markov chain can be transformed into a 1st-order Markov chain. So any stochastic process that depends only on a limited-length history can eventually be expressed as a 1st-order Markov chain.

+ +

I think the Markov decision process is the most general formulation when we consider a discrete and finite state space. I don't know whether there is such a thing as a non-Markovian decision process.

+ +

If we want to find the optimal policy of an MDP with respect to maximum total expected utility (M.E.U.) using value iteration (dynamic programming), and the connectivity graph of the MDP is acyclic, then value iteration is the same as the Bellman-Ford shortest-path algorithm.

+ +

I am curious whether there is some different kind of decision process that cannot be solved by dynamic programming when we are still seeking an optimal policy w.r.t. M.E.U.

+ +

The question itself may be confusing, but put simply: is the MDP the most general decision process if the state space is finite?

+ +

An additional question: when we are optimizing with respect to metrics other than M.E.U., can the MDP still be solved by dynamic programming?

+ +

I hope someone can suggest directions for exploring this field.

+ +

Thanks in advance.

+",2013-11-07 09:26:30.730 +59048,23227.0,1,59093.0,,,Binning By Equal-Width,,CC BY-SA 4.0,"

I have a dataset:

+
5, 10, 11, 13, 15, 35, 50 ,55, 72, 92, 204, 215
+
+

The formula for binning into equal-widths is this (as far as I know) +$$width = (max - min) / N$$

+

I think N is a number that divides the length of the list nicely. So in this case it is 3. +Therefore:

+

width = 70

+

How do I use that 70 to make the bins?

+",2013-11-07 09:31:01.880 +59049,14873.0,1,,,,Fitting distribution to a given data,,CC BY-SA 3.0,"

I have loss data arising out of operational risk for a particular bank. The standard procedure for arriving at the capital charge w.r.t. operational risk requires that I fit some continuous distribution to this loss data.

+ +

Normally, I am able to fit some standard distributions. Once the distribution is identified, the same distribution is used to simulate future loss amounts.

+ +

My question is: assuming the data are such that I am simply not able to fit any standard statistical distribution, how do I deal with the data to simulate the loss amounts? Is there any non-parametric method available for such situations?

+ +

Kindly give me some advice.

+",2013-11-07 10:06:02.833 +59050,8361.0,2,,58976.0,,,,CC BY-SA 3.0,"

I am assuming you are seeking to classify the EEG data into one or more disease states e.g. seizure/non-seizure, pathological/non-pathological etc.

+ +

The best way to validate a classifier model for an application like this is to implement Leave One Out cross validation.

+ +

What I mean by this is to start with all the data for patient 1 as the test set and all the data for patients 2-15 as the training set, and store the results. Next, set the data for patient 2 as the test set and the remainder as the training set. Do this for each patient's data in turn so that you have 15 classification results, one for each patient. Then take the mean of these 15 values and you have an estimate of the classification performance of your classifier model on unseen data.
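
To make this concrete, here is a minimal leave-one-subject-out sketch in R; the data frame, column names and the logistic-regression classifier are hypothetical stand-ins for illustration, not your EEG setup:

set.seed(10)
d <- data.frame(patient = rep(1:15, each = 20),
                x1 = rnorm(300), x2 = rnorm(300))
d$label <- rbinom(300, 1, plogis(d$x1))                    # toy class labels

acc <- sapply(unique(d$patient), function(p) {
  train <- d[d$patient != p, ]                             # all other patients
  test  <- d[d$patient == p, ]                             # held-out patient
  fit   <- glm(label ~ x1 + x2, data = train, family = binomial)
  pred  <- ifelse(predict(fit, test, type = ""response"") > 0.5, 1, 0)
  mean(pred == test$label)                                 # per-patient accuracy
})
mean(acc)                                                  # estimated performance on unseen patients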

+",2013-11-07 10:12:41.690 +59051,22968.0,1,,,,R Multiple Linear Regression; plotting results,,CC BY-SA 3.0,"

I'm trying to do some exploratory analysis on some weather data. I'd like to do a multiple linear regression on my data and then plot the predicted values against the actual values. Here's where I've got to so far:

+ +
data<-read.csv(""Amsterdam.csv"", header=TRUE)
data2 <- data[data$SolarAltitude > 0, ]
data2.lm <- lm(DirectRadiation ~ DryBulbTemperature + RelHum
   + AtmosphericPressure, data = data2)
data2 <- data.frame(data2, fitted.value = fitted(data2.lm), residual = resid(data2.lm))
+
+ +

If you could help, I would be very grateful,

+",2013-11-07 10:15:24.467 +59052,594.0,2,,59049.0,,,,CC BY-SA 3.0,"

You could resample the observed data (bootstrapping).

+ +

The problem with that is that the real risk is in the extreme tail, and the sample doesn't really give you any information there (e.g. if you're interested in, say, a tail value at risk for $\alpha = 0.005$ but you only have a couple of hundred observations, then you have no information about the behavior of the tail out that far).

+ +

On the other hand if you had many thousands of observations +it may not be a big problem.

+ +

You may get better benefit from investigating extreme value +distributions (which do deal with the extreme tail), but that's not quite nonparametric.
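
For what it's worth, here is a minimal sketch of the bootstrap idea in R, with entirely made-up loss data; the lognormal stand-in and the choice of a 99% quantile are assumptions for illustration only:

set.seed(11)
losses <- rlnorm(300, meanlog = 10, sdlog = 1.5)   # hypothetical observed losses
boot_q99 <- replicate(2000, {
  resample <- sample(losses, replace = TRUE)       # resample with replacement
  quantile(resample, 0.99)                         # statistic of interest, e.g. a high quantile
})
quantile(boot_q99, c(0.025, 0.975))                # bootstrap interval for that quantile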

+",2013-11-07 10:39:55.037 +59053,10579.0,2,,59051.0,,,,CC BY-SA 3.0,"

The function fitted returns the fitted (predicted) values. To plot the fitted values against the actual values, you can use:

+ +
plot(data2$DirectRadiation, fitted(data2.lm))
+
+ +

This will produce a plot with the actual values on the horizontal axis and the fitted values on the vertical axis.

+ +

If the above code doesn't work due to missing data, you can try one of the following approaches:

+ +
 plot(fitted(data2.lm) + residuals(data2.lm), fitted(data2.lm))
+
+ plot(data2.lm$model[[1]], fitted(data2.lm))
+
+",2013-11-07 10:51:32.230 +59054,23468.0,1,,,,Different results by using chi square test and logistic regression,,CC BY-SA 3.0,"

I am working on a problem to identify the risk factors for infection after an operation. There are risk factors such as age, pre-existing conditions, cause of infection, etc. Since the dependent variable and most of the variables are categorical, I used logistic regression first, in which factor A is not significant.

+ +

Just to double-check, I used a chi-square test between only the dependent variable and A; this time the p-value is 0.03, which suggests they are associated. Can anyone give me a hint? Does this imply correlation between the factors?

+",2013-11-07 11:02:08.353 +59070,23476.0,1,,,,Predicting absolute risk using cox regression,,CC BY-SA 3.0,"

I am trying to use R to predict the absolute risk of developing adverse events in a cohort, and to compare that with the observed outcome. Should I use survreg or coxph to do this? Anyone kind enough to explain how to do this with R code?

+ +

The mean follow up period of my cohort is only up to 6 years, so am I able to predict the absolute risk for each individual at the end of the follow up period (including both censored and non censored data)?

+",2013-11-07 14:15:45.333 +59055,12522.0,2,,59043.0,,,,CC BY-SA 3.0,"

Thinking this through, I believe this should be calculated with a binomial distribution with $n = 8$ and $p = 0.5$ as follows:

+ +

$P = \binom{8}{5} \cdot 0.5^{5} \cdot (1-0.5)^{3} $

+ +

Let me try to prove this:

+ +

Let

+ +

$X_1$ = number of calls that arrive in the first hour

+ +

$X_2$ = number of calls that arrive in the second hour

+ +

$X_3$ = number of calls that arrive in the first two hours

+ +

What you want to calculate is the conditional probability of 5 calls arriving in the first hour given that 8 calls arrived in two hours:

+ +

$P(X_1 = 5 | X_3 = 8) = \frac {P[(X_1 = 5) \cap (X_3 = 8)]} {P(X_3 = 8)}$

+ +

This is equivalent to $\frac{P[(X_1 = 5) \cap (X_2 = 3)]}{P(X_3 = 8)}$; however, now the events occur over non-overlapping time frames, which allows us to use the independent-increments property of the Poisson process.

+ +

$\frac{P[(X_1 = 5) \cap (X_2 = 3)]}{P(X_3 = 8)} = \frac{P(X_1 = 5) \cdot P(X_2 = 3)}{P(X_3 = 8)}$

+ +

$ =\frac {\left[\frac {e^{-4} \cdot 4^5} {5!} \right] \cdot \left[\frac {e^{-4} \cdot 4^3} {3!} \right]} {\frac {e^{-(4 \cdot 2)} \cdot {(4 \cdot 2)}^8} {8!}} $

+ +

$=\frac{8!} {5! \cdot 3!} \frac {(4^5) \cdot (4^3)} {8^8} $ +$=\frac{8!} {5! \cdot 3!} \frac {(4^5) \cdot (4^3)} {(8^5) \cdot (8^3)} $ +$=\frac{8!} {5! \cdot 3!} \cdot \left(\frac {4} {8}\right)^5 \cdot \left(\frac {4} {8}\right)^3$ +$= \binom{8}{5} \cdot 0.5^{5} \cdot (0.5)^{3} $
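
A quick numerical check of this derivation in R (just a sanity check, not part of the proof):

lambda <- 4
dpois(5, lambda) * dpois(3, lambda) / dpois(8, 2 * lambda)  # conditional probability from the ratio
dbinom(5, size = 8, prob = 0.5)                             # binomial form; both give 0.21875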

+",2013-11-07 11:11:09.630 +59056,21638.0,2,,59043.0,,,,CC BY-SA 3.0,"

Judging from the comments there appears to be a lot of confusion and lack of intuition over this question. A trivial Monte Carlo simulation will give (roughly) the correct answer that can be used to gauge the validity of the solutions. Here it is in R:

+ +
> firstHour <- rpois(n=10000000,lambda=4) ; secondHour <- rpois(n=10000000,lambda=4)
+> mean(firstHour[firstHour + secondHour == 8]==5)
+[1] 0.2181712
+
+ +

Compare this to the OP's attempt:

+ +
> choose(8,5)/2^8
+[1] 0.21875
+
+ +

Personally the combinatorial approach is not obvious to me. I would have followed @Orlando Mezquita's solution. As you can see, they arrive at the same answer.

+",2013-11-07 11:18:54.330 +59057,503.0,2,,59054.0,,,,CC BY-SA 3.0,"

A few points:

+ +

1) The fact that the independent variables are categorical is irrelevant for the choice of logistic regression.

+ +

2) A significant chi-square value means the two variables are associated, but if both are categorical, it's not really correlation. If both variables have only 2 levels, there are analogues of correlation.

+ +

3) (your main question) It appears you did a logistic regression with multiple independent variables and compared it to a chi-square test between only two variables (infection and A). These ask two different questions, so they get different answers. The first asks whether A affects the odds of infection after controlling for other variables. The second does not control for any other variables. If all the independent variables were completely unrelated, then I believe the effect sizes (odds ratios) would stay the same. However, this hardly ever happens in real life (except in some very controlled experiments).

+ +

4) Just as an aside, it is better to look at effect sizes, not just p values.
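
To illustrate point 3, here is a small simulated R sketch (the variable names and effect sizes are invented): A is associated with the outcome only through a second variable C, so the unadjusted chi-square test flags A while the adjusted logistic regression does not.

set.seed(1)
n <- 2000
C <- rbinom(n, 1, 0.5)                        # e.g. a pre-existing condition
A <- rbinom(n, 1, ifelse(C == 1, 0.8, 0.2))   # factor A, strongly associated with C
y <- rbinom(n, 1, plogis(-2 + 2 * C))         # infection depends on C only

chisq.test(table(A, y))                       # unadjusted: A looks associated
summary(glm(y ~ A + C, family = binomial))    # adjusted: A is not significant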

+",2013-11-07 11:31:18.837 +59058,23470.0,1,,,,"Log-transformed variable is not significant, while variable itself is",,CC BY-SA 3.0,"

I'm doing a logistic regression at the company level, using total assets to control for company size. Due to the skewness of the data, I log-transform the asset data. While I get no significance for the log-transformed variable, the untransformed variable is highly significant (p < 0.01). What could be a possible explanation for such a result? Does it mean that the relationship does not diminish for extremely high and low values, or does it indicate an outlier problem, so that the variable should not be used for interpreting the data?

+",2013-11-07 11:32:54.817 +59059,13537.0,2,,58694.0,,,,CC BY-SA 3.0,"

Although the other answers have already addressed the question, I would like to add another powerful option that would solve most of the problems related to the distribution-assumptions: quantile regression.

+ +

Depending on the research interests, this method can be extremely powerful.

+ +

As someone has already said before, if you are merely interested in estimating the marginal mean (or any quantile) of your outcome then you don't need to worry about any assumption at all, as both quantile and ordinary regression methods perfectly estimate it.

+ +

If you are interested in inference, ordinary regression has a couple of problems with the distribution assumptions, whereas the quantile doesn't because it is distribution free. +It's true that you can try using mean regression and robust estimators, but personally I prefer quantile regression, which is by the way even more informative (because you can estimate the whole conditional distribution of the outcome instead of just one of its summary indicators, the mean).

+ +

If you are interested in both prediction and inference, then the quantile's property of invariance is quite handy. For example, suppose you are working with probabilities, or rates (or any other ""bounded"" outcome). With quantile regression you can transform the outcome Y so that its transformation is not bounded (for example, using a logit or probit function), model logit(Y), and use the same model for predictions and inference.

+ +

With ordinary methods it's not so easy, because of Jensen's inequality: E(g(Y)) is never equal to g(E(Y)).

+ +

Therefore, you either use two models (one for the prediction, one for the association) or you must use other methods (beta regression, logit normal regression) that, however, have problems related to respectively parameter interpretation and distribution assumptions.

+ +

Finally, there can always be problems related to the linearity assumption or independent data. In the former, we can solve the problem by adding splines (which, though, complicate the interpretation of parameters).

+ +

For the latter, instead, mixed effect regression models could help us (if we have hierarchical or longitudinal data).
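
As a hedged sketch of the invariance idea (assuming the quantreg package is available; the simulated data and parameter values are purely illustrative):

library(quantreg)
set.seed(2)
x <- runif(200)
p <- plogis(-1 + 2 * x + rnorm(200, sd = 0.5))               # a bounded (0,1) outcome
fit <- rq(qlogis(p) ~ x, tau = 0.5)                          # median regression on the logit scale
plogis(predict(fit, newdata = data.frame(x = c(0.2, 0.8))))  # back-transformed median predictions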

+",2013-11-07 11:40:08.050 +59060,23473.0,2,,16998.0,,,,CC BY-SA 3.0,"

http://endb-consolidated.aihit.com/datasets.htm contains 10K companies with textual descriptions

+",2013-11-07 12:45:43.713 +59061,20249.0,1,59067.0,,,Meaning of standard deviation of the mean difference,,CC BY-SA 3.0,"

I'm trying to understand the meaning of stating the standard deviation (SD) of the mean difference (MD) [or otherwise called the absolute mean difference]. This is for a paper I'm writing up where other examples also quote the SD of the MD as part of the analysis summary.

+ +

While I understand what the mean difference represents, i.e., being a ""measure of statistical dispersion"", I'm not sure of the utility of reporting the standard deviation of this value. I can kind of conceptualise what it means, but again the issue is the utility.

+ +

For reference, the analysis is of repeated measures data for equivalence. I will be reporting the correlation coefficient and also the statistical probability for equivalence (with equivalence range) using a Two One Sided T-test (TOST). Thus my question regarding the utility of the standard deviation of the mean difference.

+ +

Maybe I'm lacking in statistical knowledge and please feel free to point out a suitable reference for me to consult.

+ +

Thanks in advance.

+ +

Update - example:
+Machine A is the reference measurement system. Machine B is the new measurement system. A certain number of real life measurements are made with each machine (N) on a common object to understand how equivalent the machines are. Ideally they're identical - ""of course"" say the designers, and can even be expected - but this testing will be used to support the equivalence limit of +/- x units. (Given in reality it's not possible to say A is exactly equivalent to B.) So a paired measurement comparison.

+ +

When reporting these results it has been common to report the mean difference and the standard deviation of the mean difference, plus the correlation coefficient (Pearson's) with its p value.

+ +

Update - comments:
+1. This question does indeed refer to the standard deviation of the mean difference, not simply the difference.
+2. I well understand what the standard deviation means in terms of distributions. But my question is really ""what is the meaning of the standard deviation of the mean difference in relation to the quantity being measured?""
+3. If it's as simple as a guide to the variability of the difference, then so be it.
+4. If the result was reported as A = B +/- x units with a p < 0.05 then I understand that this is different to quoting a standard deviation type number, but I still get it intuitively. I don't get the SD of a MD in regard to the original measurement entity nearly as well.

+",2013-11-07 13:03:59.207 +59096,594.0,2,,59088.0,,,,CC BY-SA 3.0,"

Let's see what Dan Ma actually says in his blog. To quote:

+ +
+

There is uncertainty in the parameter $\theta$, reflecting the risk characteristic of the insured. Some insureds are poor risks (with large $\theta$) and some are good risks (with small $\theta$). Thus the parameter $\theta$ should be regarded as a random variable $\Theta$. The following is the conditional distribution of $N$ (conditional on $\Theta=\theta$):

+ +

$$\displaystyle (15) \ \ \ \ \ P(N=n \lvert \Theta=\theta)=\frac{e^{-\theta} \ \theta^n}{n!} \ \ \ \ \ \ \ \ \ \ n=0,1,2,\cdots$$

+
+ +

Aside from some small oddness in the wording, the gist of that is fine. The parameter of the Poisson ($\theta$ in the quoted discussion) represents the underlying rate of claims per unit time; that individuals aren't homogeneous, and have different 'riskiness' (different claim rates), isn't controversial.

+ +

So why does he think that the distribution of the claim-rate is distributed as gamma?

+ +

Well, actually he doesn't say that he thinks that at all.

+ +

What he says is:

+ +
+

Suppose that $\Theta$ has a Gamma distribution with scale parameter $\alpha$ and shape parameter $\beta$.

+
+ +

He's positing a circumstance -- discussing an assumption if you wish -- for which he then discusses the consequences.

+ +

He doesn't even assert anything about the plausibility of the assumption.

+ +
+ +

Here's some things that might be reasonable to assert/suppose about the claim-rate distribution:

+ +

1) It's necessarily non-negative and may be taken to be continuous

+ +

2) we could expect that it would tend to be right-skew

+ +

3) We might not-too-unreasonably expect there to be a typical level (a mode), around which the bulk of the distribution lies, and that it tails off as we move further away (i.e. it might be reasonable to expect that it would be unimodal, at least to a first approximation)

+ +

That's about all we could say without collecting data.

+ +

The gamma at least doesn't break any of those suppositions/expectations, and so is likely to result in a more useful distribution than assuming homogeneity of claim-rate, but any number of other distributions satisfy those conditions.

+ +

So why gamma rather than, say, lognormal? Likely a matter of convenience; the gamma works nicely with the Poisson - which, even conditional on the individual's underlying claim frequency, is itself another assumption that isn't actually true (though we can argue that the assumptions of claims following a Poisson process may not be too badly wrong, it's clear that they can't be exactly true).

+ +

There's no good reason to think it is gamma-distributed.

+ +

Indeed, I'll assert here and now that there's no real-world case where the claim rate is actually gamma distributed, in practice there will always be differences between the actual distribution of interest and some simple model for it; but that's true of essentially all our probability models.

+ +

They're convenient fictions, which may sometimes be not so badly inaccurate, and so still have some value.

+ +
+

Is there a way I can determine if my density is gamma distributed?

+
+ +

Nothing will tell you it is; in fact you can be quite sure - even when it looks like an excellent description of the distribution - that the gamma is at best merely an approximation. You can use diagnostic displays (perhaps something like a Q-Q plot) to help check that it's not too far from gamma.
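
For intuition rather than proof, here is a minimal R sketch of the mixture itself (the parameter values are arbitrary): gamma-distributed rates fed into a Poisson reproduce the negative binomial, with variance exceeding the mean.

set.seed(3)
alpha <- 2; beta <- 0.5                              # shape and rate of the mixing gamma
lambda <- rgamma(1e5, shape = alpha, rate = beta)    # heterogeneous claim rates
n <- rpois(1e5, lambda)                              # counts given those rates
c(mean = mean(n), var = var(n))                      # overdispersion: var > mean
table(n)[1:5] / 1e5                                  # empirical P(N = 0..4)
dnbinom(0:4, size = alpha, prob = beta / (beta + 1)) # implied negative binomial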

+",2013-11-07 18:52:34.853 +59062,16174.0,2,,2156.0,,,,CC BY-SA 3.0,"

Finding Groups in Data. An Introduction to Cluster Analysis from professors Leonard Kaufman and Peter J. Rousseeuw.

+ +

I am reading the book and finding it very useful because:

+ +
    +
  • As stated by the authors in the preface:
  • +
+ +
+

Our purpose was to write an applied book for the general user. We wanted to make cluster analysis available to people who do not necessarily have a strong mathematical or statistical background.

+
+ +
    +
  • It provides theoretical content to understand the functions available in the R package Cluster.

  • +
  • Chapters can be read individually according to the cluster method of interest.
    + exception is chapter 3, which is built on chapter 2

  • +
+ +

The book's chapters are:

+ +
    +
  1. Introduction
  2. Partitioning Around Medoids (Program PAM).
  3. Clustering Large Applications (Program CLARA).
  4. Fuzzy Analysis (Program FANNY).
  5. Agglomerative Nesting (Program AGNES).
  6. Divisive Analysis (Program DIANA).
  7. Monothetic Analysis (Program MONA).
+ +
+ +

References:

+ +

Kaufman, L., & Rousseeuw, P. J. (2005). Finding Groups in Data. An Introduction to Cluster Analysis (p. 342). John Wiley & Sons Inc.

+ +

Maechler, M. (2013). Cluster Analysis Extended Rousseeuw et al. CRAN.

+",2013-11-07 13:09:48.110 +59063,14900.0,1,60467.0,,,"Difference between ""in-sample"" and ""pseudo out-of-sample"" forecasts",,CC BY-SA 4.0,"

Is there an explicit difference between in-sample forecasts and pseudo out-of-sample forecasts? Both are meant in the context of evaluating and comparing forecasting models.

+",2013-11-07 13:11:37.360 +59064,2161.0,1,59802.0,,,How can I look for a correlation between dependent variables in a repeated-measures/within-subjects design?,,CC BY-SA 3.0,"

I have a 2x3 within-subjects design, with two different dependent variables (DVs). I would like to know if the two DVs are correlated or not.

+ +

Here is an example of what the data look like, e.g. a data frame in R:

+ +
# Make some data:
+set.seed(1154)
+
+data <- data.frame(id=gl(10, 6),
+                   factor1=gl(2, 3, labels=c(""A"", ""B"")),
+                   factor2=gl(3, 1),
+                   DV1=rnorm(60),
+                   DV2=rnorm(60))
+
+head(data)
+
+# Output:
+#   id factor1 factor2          DV1         DV2
+# 1  1       A       1  0.255579320  1.72318604
+# 2  1       A       2  0.133878731 -0.32694875
+# 3  1       A       3  0.890576655  0.14834580
+# 4  1       B       1 -0.007879094 -0.07145311
+# 5  1       B       2  0.976311664 -0.40686813
+# 6  1       B       3  0.701357069 -0.50813556
+
+ +

In R, I could do something like:

+ +
cor.test(data$DV1, data$DV2) # p = 0.048, significant
+
+ +

but there seem to be two problems with that.

+ +

First problem: the data are not independent (first 6 items from each DV come from the same participant in the experiment).

+ +

Second problem: we want to generalize from a sample to the population, so each id in the sample should be included only once, e.g.:

+ +
# We want:
+#  id  factor1  factor2  DV1  DV2
+#  1      X        X     ...  ...
+#  2      X        X     ...
+#  3   ...
+
+# So:
+library(plyr)
+data2 <- ddply(data, .(id), summarize, mean.DV1=mean(DV1), mean.DV2=mean(DV2))
+head(data2)
+
+# Output:
+#   id    mean.DV1    mean.DV2
+# 1  1  0.49163739  0.09302105
+# 2  2  0.66030997 -0.09344809
+# 3  3  0.38277688  0.20274906
+# 4  4 -0.35217913  0.57308528
+# 5  5 -0.13470820  0.26663012
+# 6  6 -0.04756911  0.60406950
+
+ +

Now I can look for a correlation and the responses are independent, but I have lost the individual factor levels.

+ +
cor.test(data2$mean.DV1, data2$mean.DV2) # p = .15, not significant
+
+ +

What is the correct way to check for a correlation between the two dependent variables (using R)?

+",2013-11-07 13:20:25.830 +59065,2666.0,2,,59058.0,,,,CC BY-SA 3.0,"

I don't prefer to think of this type of a problem as a ""choose between two transformations"" problem but rather I like to estimate the transformation as part of the modeling process. In doing so we take care of multiplicities (possible inflated type I error) by having a parameter in the model for everything we think might be needed. Consider expanding the predictor using a regression spline such as a restricted cubic spline (natural spline). Test for association by doing a ""chunk"" test of all the parameters jointly that involve that predictor. With a restricted cubic spline this test will have $k-1$ degrees of freedom where $k$ is the number of knots (join points), and using defaults for knots based on the marginal distribution of the predictor will work fine (this is how the R rms package's rcs function does it).

+ +

Once you fit the spline model you can plot the predicted value vs. the predictor to learn about the estimated shape in the logistic model.

+ +

Concerning $Y$ make sure that it is truly all-or-nothing and does not represent a dichotomization.
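
A minimal sketch of this approach, assuming the rms package and an invented skewed predictor called assets (nothing here comes from the original question's data):

library(rms)
set.seed(4)
assets <- exp(rnorm(500, 10, 1))                    # skewed predictor
y <- rbinom(500, 1, plogis(-3 + 0.3 * log(assets)))
dd <- datadist(assets); options(datadist = ""dd"")
fit <- lrm(y ~ rcs(assets, 4))                      # restricted cubic spline, 4 knots = 3 d.f.
anova(fit)                                          # joint (""chunk"") test for assets
plot(Predict(fit, assets))                          # estimated shape on the log-odds scale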

+",2013-11-07 13:21:27.320 +59066,20470.0,2,,1248.0,,,,CC BY-SA 3.0,"

I found this list of quotes from Gelman's famous Bayesian Data Analysis book on this link. They are more like witty, stand-up one-liners but I enjoyed them a lot. Just a few below to whet your appetite:

+ +
+

1 ""As you know from teaching introductory statistics, 30 is infinity.""

+ +

2 ""Suppose there's someone you want to get to know better, but you have + to talk to all her friends too. They're like the nuisance parameters.""

+ +

3 ""People don't go around introducing you to their ex-wives."" (on why model improvement doesn't make it into papers)

+
+",2013-11-07 13:24:37.083 +59067,11489.0,2,,59061.0,,,,CC BY-SA 3.0,"

Think of the difference like any other statistic that you are collecting. These differences are just some values that you have recorded. You calculate their mean and standard deviation to understand how they are spread (for example, in relation to 0) in a unit-independent fashion.

+ +

The usefulness of the SD is in its popularity -- if you tell me your mean and SD, I have a better understanding of the data than if you tell me the results of a TOST that I would have to look up first.

+ +

Also, I'm not sure how the difference and its SD relate to a correlation coefficient (I assume that you refer to the correlation between two variables for which you also calculate the pairwise differences). These are two very different things. You can have no correlation but a significant MD, or vice versa, or both, or none.

+ +

By the way, do you mean the standard deviation of the mean difference or standard deviation of the difference?

+ +

Update

+ +

OK, so what is the difference between SD of the difference and SD of the mean?

+ +

The former tells you something about how the measurements are spread; it is an estimator of the SD in the population. That is, when you do a single measurement in A and in B, how much will the difference A-B vary around its mean?

+ +

The latter tells us something about how well you were able to estimate the mean difference between the machines. This is why the ""standard deviation of the mean"" is sometimes referred to as the ""standard error of the mean"". It depends on how many measurements you have performed: since you divide by $\sqrt{n}$, the more measurements you have, the smaller the value of the SD of the mean difference will be.

+ +

SD of the difference will answer the question ""how much does the discrepancy between A and B vary (in reality) between measurements""?

+ +

SD of the mean difference will answer the question ""how confident are you about the mean difference you have measured""? (Then again, I think confidence intervals would be more appropriate.)

+ +

So depending on the context of your work, the latter might be more relevant for the reader. ""Oh"" - so the reviewer thinks - ""they found that the difference between A and B is x. Are they sure about that? What is the SD of the mean difference?""

+ +

There is also a second reason to include this value. You see, if reporting a certain statistic in a certain field is common, it is a dumb thing to not report it, because not reporting it raises questions in the reviewer's mind whether you are not hiding something. But you are free to comment on the usefulness of this value.
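
A tiny simulated example in R of the two quantities (the machine names and numbers are made up):

set.seed(5)
n <- 30
A <- rnorm(n, 100, 5)          # reference machine
B <- A + rnorm(n, 0.5, 2)      # new machine: small bias plus extra noise
d <- B - A                     # paired differences
mean(d)                        # mean difference (estimated bias)
sd(d)                          # SD of the differences: spread of individual discrepancies
sd(d) / sqrt(n)                # SD (standard error) of the mean difference: precision of the bias estimate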

+",2013-11-07 13:28:05.523 +59068,20613.0,2,,59044.0,,,,CC BY-SA 3.0,"

You could use a chi-square test for both ... although with the low sample sizes in the second example, you may want to use Fisher's exact test instead.

+ +

The Quick-R website gives some examples of how to code these analyses in R, http://www.statmethods.net/stats/frequencies.html
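
For example, using the anemone-fish counts from the question (a sketch only; the simulated p-value option for Fisher's test is just one way to handle larger tables):

anemone_fish <- matrix(c(23, 56, 12,
                         12,  5,  5,
                         23, 10, 68),
                       nrow = 3, byrow = TRUE,
                       dimnames = list(c(""An_A"", ""An_B"", ""An_C""),
                                       c(""Fish_a"", ""Fish_b"", ""Fish_c"")))
chisq.test(anemone_fish)                                    # test of differential anemone preference
fisher.test(anemone_fish, simulate.p.value = TRUE, B = 1e4) # exact-style alternative for small counts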

+",2013-11-07 13:39:15.223 +59069,2916.0,2,,47497.0,,,,CC BY-SA 3.0,"

As of a recent update (4 August 2013), the caret R package (see page 97) also supports the Yeo-Johnson (Y-J) power transformation.
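
A minimal sketch, assuming a recent caret installation (the data are just a skewed toy variable):

library(caret)
set.seed(6)
x <- data.frame(v = rexp(100))               # skewed variable
pp <- preProcess(x, method = ""YeoJohnson"")   # estimate the Yeo-Johnson transformation
head(predict(pp, x))                         # apply it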

+",2013-11-07 14:14:44.740 +59071,23477.0,1,59072.0,,,Student's t-distribution,,CC BY-SA 3.0,"

I have two functions that provide an implementation of the t-distribution.

+ +

A webpage with a JavaScript algorithm. The algorithm takes two inputs: an x-value and the degrees_of_freedom.

+ +

Excel's TInv function. The function also takes two inputs: probability and degrees_of_freedom.

+ +

Can you give me the relation between probability and x-value? I have doubts that the two methods calculate the same thing.

+",2013-11-07 14:39:23.997 +59072,16043.0,2,,59071.0,,,,CC BY-SA 3.0,"

The first takes an x-value and the degrees of freedom and reports the amount of probability to the left of that x-value. If you're familiar with calculus, this is the equivalent of taking the integral of the Student's t probability density function over the interval $(-\infty,x]$. If you're familiar with statistics, this is the Student's t cumulative distribution function evaluated at $x$ for some degrees of freedom.

+ +

The second does the inverse, taking some probability and returning the corresponding x-value. In statistics, this is called the quantile function.

+ +

So the mathematical relationship between the two is the same as for any function and its inverse, with the substantive knowledge that they also have probability-based interpretations and statistical applications.
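
In R notation (offered only as an analogy for the two tools you describe), the pair looks like this:

x <- 2.0; df <- 10
p <- pt(x, df)    # CDF: probability to the left of x
qt(p, df)         # quantile function: recovers x, i.e. the inverse of pt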

+",2013-11-07 14:43:37.290 +59073,11353.0,2,,7965.0,,,,CC BY-SA 3.0,"

As CHL has already explained the use of center and scale to obtain standardized variables, I'll address collinearity:

+ +

There is good reason to reduce collinear variables when clustering.

+ +

Curse of Dimensionality

+ +

The more dimensions you use, the more likely you are to fall victim of Bellman's 'curse of dimensionality'. In brief, the greater the number of dimensions, the greater the total volume, and the greater the sparsity of your data within it. (See the link for more detail.)

+ +

Dimension Reduction --- manually by inspecting of pairwise collinearity...

+ +

You mention that you have already reduced variables from some larger number down to 5 using pairwise collinearity measures.

+ +

While this will work, it is quite tedious, since in general you will have $n\choose 2$ pairs to check. (For example, with 10 variables you would have ${10 \choose 2} = 45$ different pairs to examine -- a few too many to do manually, in my opinion!)

+ +

Dimension Reduction --- automatically using Principal Components Analysis (PCA)...

+ +

One way to handle this automatically is to use the PCA (principal components analysis) algorithm. The concept is more or less what you're doing manually -- ranking the variables by how much unique information each contributes.

+ +

So you provide PCA your $n$-variable dataset as input, and PCA will rank order your variables according to the greatest variance each explains in the data -- essentially picking out the non-collinear variables.

+ +

Depending on whether you want 2-D or 3-D clusters, you would use the top 2 or 3 variables from PCA.

+ +

Principal Components in R

+ +

The PCA algorithm is available (built-in) from R.

+ +

Actually there are several functions in R that do principal components.

+ +

I've had success with prcomp().
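
A minimal sketch of that workflow (using the built-in iris data purely as a stand-in for your own variables):

d <- scale(iris[, 1:4])              # center and scale
pc <- prcomp(d)
summary(pc)                          # variance explained by each component
scores <- pc$x[, 1:2]                # keep the top 2 components
km <- kmeans(scores, centers = 3)    # cluster in the reduced space
table(km$cluster, iris$Species)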

+ +

Standard Reference available free online

+ +

One of the best references available is the classic:

+ +

Elements of Statistical Learning, by Trevor Hastie, Robert Tibshirani, and Jerome Friedman

+ +

The authors have graciously made the entire book available (for free) as a PDF download from their Stanford website.

+ +

There are excellent chapters on Clustering and Principal Components, and a great section on the curse of dimensionality.

+",2013-11-07 14:55:57.120 +59074,23478.0,1,,,,How to analyze intra-rater reliability with 31 raters who all rated 4 subjects twice,,CC BY-SA 3.0,"

I'm trying to analyze the intra-rater reliability of an occupational therapy assessment.

+ +

The assessment consists of 57 items, where some items are ratio data and some are ordinal data. In my study, 31 raters rated 4 subjects (on video) twice (time 1 and time 2) with a six-week interval. So I have scores on each of the 57 items, for each subject, for each time point and each rater.

+ +
                Rater1Item 1, Rater1Item2 .... Rater1Item 57.... Rater31Item1....Rater31Item57
+
+Subject1 Time1
+Subject2 Time1
+....
+Subject3 Time2
+Subject4 Time2
+
+ +

My first question is about the set-up of the database; I'm using SPSS. As shown above, is this the right set-up, or do I need to change it and put the raters in the rows and the subjects in the columns?

+ +

My second question: I found out I can use the ICC and Bland & Altman to analyse the intra-rater reliability. Are these two suitable, or are there other statistical methods? The kappa statistic seems not very suitable since I have 31 raters, each rating 4 subjects. Or is there a way to use the kappa statistic with such a big database?

+ +

My third question is: which ICC can I use for the ratio data to calculate the intra-rater reliability? I read different suggestions in the literature (ICC 1.1 or ICC 3.1).

+ +

My fourth question is: if I use Bland & Altman, the first thing I need to do is calculate the mean of time 1 and time 2. Do I need to do this for each item, each subject and each rater independently, or can I calculate the mean for each item on each subject for all raters together? Or would it not make sense to calculate this for the raters together, because the mean at both times can be the same without there being agreement (for example, rater 1 scores lower at time 2 and rater 2 scores higher at time 2, keeping the mean the same at both time 1 and time 2)?

+ +

My last question, when I draw the Bland & Altman plot can I do this for each item with all 4 subjects and all 31 raters, having 124 dots in my plot?

+",2013-11-07 14:57:29.900 +59075,18198.0,1,59098.0,,,Numerical example for MLE for linear regression model,,CC BY-SA 3.0,"

I need to calculate the log-likelihood for a linear regression model in MATLAB.

+ +

Although the theoretical result is well known and given in several sources, I want to find a numerical example so that I can check that my code is correct.

+ +

Can anyone point me to one?

+ +

I realize that the parameters are the same as OLS (at least asymptotically), but it's the actual log-likelihood I need.
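
One simple way to generate such a numerical example is to compute the Gaussian log-likelihood by hand in R and compare it with logLik(); the same numbers can then be used to check the MATLAB code (the simulated data here are arbitrary):

set.seed(7)
x <- rnorm(50); y <- 1 + 2 * x + rnorm(50)
fit <- lm(y ~ x)
n <- length(y)
sigma2_mle <- sum(residuals(fit)^2) / n             # ML estimate of the error variance
-n / 2 * (log(2 * pi) + log(sigma2_mle) + 1)        # hand-coded log-likelihood at the MLE
logLik(fit)                                         # should agree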

+",2013-11-07 15:23:21.270 +59076,20470.0,2,,56928.0,,,,CC BY-SA 3.0,"

Here is a really good 15-page article by Kemp et al. on hierarchical Bayesian modelling. It is more conceptual than mathematical, though, so I don't know whether it suits your taste. Having said that, it is definitely less of a commitment than reading an entire book.

+",2013-11-07 15:59:52.550 +59077,22686.0,2,,59046.0,,,,CC BY-SA 3.0,"

Let's work out the general answer: we have independent identically distributed random variables $X_i$, and a sample of $n$ instantiations of these RVs. I'll denote the sample maximum by $X_{max}$ and the sample minimum by $X_{min}$. Let the $X_i$ have the cdf $F$. Then we have the following:

+ +

$$ P(X_{max} \leq x \cap X_{min}> y) = (F(x) - F(y))^n \textbf{1}_{\{x \geq y\}} $$

+ +

where the $\textbf{1}$ is the indicator function. This holds because for $x\geq y$, the probability that the sample maximum and minimum are both in the interval $(y, x]$ is equal to the probability that each of the $n$ random variables is in this interval. Then to get the joint CDF, use:

+ +

$$ P(X_{max} \leq x \cap X_{min}\leq y) = P(X_{max} \leq x) - P(X_{max} \leq x \cap X_{min}> y) $$

+ +

$$P(X_{max} \leq x \cap X_{min}\leq y) = F(x)^n- (F(x) - F(y))^n \textbf{1}_{\{x \geq y\}}$$

+ +

If you work out what this is for $n=2$, and look at the cases $x \geq y$ and $x < y$, this is equivalent to the expression you gave.
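
A quick Monte Carlo check of the $n=2$ (two dice) case, just to confirm the formula numerically:

set.seed(8)
rolls <- matrix(sample(1:6, 2e5, replace = TRUE), ncol = 2)
x <- 4; y <- 2
mean(apply(rolls, 1, max) <= x & apply(rolls, 1, min) <= y)  # simulated joint CDF at (4, 2)
F <- function(k) k / 6                                       # CDF of a single fair die
F(x)^2 - (F(x) - F(y))^2 * (x >= y)                          # formula above: 12/36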

+",2013-11-07 16:12:12.497 +59121,23138.0,1,,,,Are growth mixture models just Gaussian mixtures applied to coefficients of polynomials fitted to time-series data?,,CC BY-SA 3.0,"

Am I understanding correctly that a growth mixture model is just a Gaussian mixture applied to the coefficients of polynomials fitted to the time-series data?

+ +

For example, we have 1000 cases, with 3 measurements each. We fit, say, a quadratic equation to each, which gives us 3 extra values per case (quadratic, linear, constant). Then we fit a Gaussian mixture model to those 3 coefficients, which gives us a clustering of the trajectories.

+ +

Is that it, or are growth mixtures something more involved?

+",2013-11-08 02:49:21.433 +59078,371.0,1,,,,Estimating non-stationary Markov chain transition probabilities from data,,CC BY-SA 3.0,"

I have some call centre outcome data: each person is called several times, and the result of the call is recorded as one of a few discrete outcomes (""no answer"", ""wrong number"", ""third party"", ""correct party"", etc.). The goal of the analysis is to try and work out how many times to call before giving up.

+ +

My current plan is to consider the outcomes as a Markov chain. If I assume that the data represents a stationary state, then it is easy to get the transition probabilities. The problem is, I don't believe that they are stationary: having ""no answer"" 20 times is a different situation to be in than having ""no answer"" once.

+ +

How do you calculate the transition probabilities if your data is non-stationary?

+ +

For bonus points, is there a function in R to do this?

+",2013-11-07 16:19:17.773 +59079,23480.0,1,,,,Interpreting logit regression with both continuous and categorical variables,,CC BY-SA 3.0,"

I have a logistic regression like this:

+ +
Y = a1 + b1*(number positive scores) + b2*(number negative scores) + b3*Z 
+       + b4*Z*(number positive scores) + b5*Z*(number negative scores) 
+       + additional non-interaction terms
+
+ +

Y is the probability of an outcome that takes on binary values (0,1). Z is a continuous variable; I am trying to determine whether it is significant. The numbers of positive and negative scores take integer values between 0 and 5.

+ +

When I run the regression using glm in matlab, I find that all coefficients are significant except for b3; b1 and b4 are positive whereas b2 and b5 are negative. I would like to draw conclusions about whether Z is in fact a significant factor in the outcome variable via interactions b4 and b5, but I understand that in a logistic regression all coefficients and particularly interactions need to be evaluated in the context of specific values of the independent variable x. So this is what I have done so far:

+ +

Let's say bhat is the estimated vector of coefficients b=(a1,b1,b2,b3,b4,b5...) and xhat is the sample mean of x=(num pos scores, num neg scores,Z,Z*num pos scores, Z*num neg scores). Also say that bi_sigma is the estimated standard error on the ith element of bhat and xi_sigma is the sample standard deviation of the ith element of x. Let's say L(bx) is the logistic cdf evaluated at bx.

+ +

Am I right to evaluate the odds ratios exp(bhat*xhat-bi_sigma*xi_hat) and exp(bhat*xhat+bi_sigma*xi_hat), then determine whether each element bi_hat is significant based on whether the range of these odds ratios is strictly greater than or less than one? In other words, my thinking is that if the odds ratios don't include one then they significantly improve or decrease the odds. For instance an odds ratio range of (1.3,2) reflects a bi that improves the odds for mean levels of x. Yes?

+ +

Secondly, am I correct to evaluate the first differences, L(bhat*xhat)-L(bhat*xhat-bi*xi_sigma) and L(bhat*xhat+bi*xi_sigma)-L(b*xhat) as two measures of the size of the impact of each xi. Can I do this for the interaction also?

+ +

Thanks very much for any advice or help. If there are references that you would suggest for this, I would appreciate that too. All the examples I've found online involve binary or categorical variables only, and no continuous variables.

+",2013-11-07 16:23:32.583 +59080,4656.0,2,,59046.0,,,,CC BY-SA 3.0,"

This is a problem in which working from first principles is better than specializing from poorly-understood general formulas. If $X$ and $Y$ are the outcomes on the two dice, then their joint mass function $p_{X,Y}(i,j)$ has value $\frac{1}{36}$ for all $i,j \in \{1, 2, \dots, 6\}$, and so $p_{X_1,X_2}(i,j)$ has value $\frac{2}{36}$ if $1 \leq i < j \leq 6$, and value $\frac{1}{36}$ if $1 \leq i=j \leq 6$. The CDF can be worked out from this, but writing it out explicitly gives a long multi-case expression that I will leave to the OP to figure out.
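
If it helps, the full enumeration is easy to do in R, which makes the multi-case CDF concrete without writing it out by hand:

outcomes <- expand.grid(die1 = 1:6, die2 = 1:6)   # all 36 equally likely outcomes
x1 <- pmin(outcomes$die1, outcomes$die2)          # sample minimum
x2 <- pmax(outcomes$die1, outcomes$die2)          # sample maximum
joint_cdf <- function(a, b) mean(x1 <= a & x2 <= b)
joint_cdf(2, 4)                                   # e.g. P(X1 <= 2, X2 <= 4) = 12/36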

+",2013-11-07 16:28:36.937 +59081,22976.0,1,,,,"A problem of probabilities, calculation of 2 events",,CC BY-SA 3.0,"

I have this problem: A student must choose exactly two out of three electives: art, French, and mathematics. He chooses art with probability 5/8, French with probability 5/8, and art and French together with probability 1/4. What is the probability that he chooses mathematics? What is the probability that he chooses either art or French?

+ +

Now my rationale was this:

+ +

p(A and F) + p(F and M) + p(A and M) = 1, since something has to happen. So 1/4 + p(F and M) + p(A and M) = 1.

+ +

p(F and M)+p(A and M)=3/4

+ +

p(M)=p(F and M)+p(A and M)

+ +

p(M)=3/4

+ +

But how do I get p(A) or p(F).

+ +

Thanks.

+",2013-11-07 16:30:35.943 +59082,14965.0,1,59085.0,,,Is $f(Y | X)$ in the same family as $f(Y)$?,,CC BY-SA 3.0,"

Is it the case that for random variables $Y$ and $X$, $f(Y \mid X)$ is in the same family as $f(Y)$?

+ +

If so how can I prove it? If not are there any situations (some families of distributions) where they can be the same? Or is it related to the link function(s)?

+ +

Thanks

+",2013-11-07 16:45:23.733 +59083,10546.0,1,,,,Cross validation for variable selection and coefficient shrinkage?,,CC BY-SA 3.0,"

Is cross validation an appropriate technique for variable selection and regression coefficient shrinkage?

+ +

A former colleague of mine used 10-fold CV to compare the regression coefficients from the 10 training models. Variables were kicked out if their coefficients changed sign. The average coefficient values for the remaining variables were then used to estimate the coefficients of the final model.

+ +

Does this homebrewed methodology sound like a valid technique?

+",2013-11-07 17:03:24.260 +59084,12544.0,1,,,,Probability of moving house (non-independence problem),,CC BY-SA 3.0,"

We are analyzing a dataset where we are looking at the probability of individuals moving house, and the factors that predict moving.

+ +

The problem is that a lot of people are married, and (usually) if one moves house, one's spouse also moves house (although not everyone is married). We have two options, neither of which seems satisfactory:

+ +
    +
  1. Model at the household level: but then including predictors at the individual level is hard (not everyone is married).
  2. Model at the individual level, and use something which corrects for clustering. This doesn't seem to account for the nearly complete clustering, and I don't feel that the estimates will be appropriate.
+ +

In addition, the probability of moving is curious. If a couple moves, and a single person doesn't, was that a 0.5 probability of moving (because 1/2 of households moved, as in (1) above), or 0.66 (because 2/3 of individuals moved, as in (2) above)?

+",2013-11-07 17:15:26.773 +59085,2873.0,2,,59082.0,,,,CC BY-SA 3.0,"

There are some specific cases where it is true, such as the bivariate normal: there, $f(Y)$ and $f(Y|X)$ are both normal.

+ +

But consider the case where $f(Y|X)$ is normal (with mean depending on $X$) and $X$ follows a uniform distribution. Then $f(Y)$ is not normally distributed.

+ +

There are also distributions where the marginals ($f(X)$ and $f(Y)$) are both $\text{uniform}(0,1)$, but there is a hole in the square where the probability is $0$, so the conditional $f(Y|X)$ would not be uniform, and for some values of $X$ it would have disjoint support.
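
A small simulation makes the second case concrete (the numbers are arbitrary): the conditional of $Y$ given $X=x$ is normal, but the marginal of $Y$ clearly is not.

set.seed(9)
x <- runif(1e5, -5, 5)
y <- rnorm(1e5, mean = 10 * x, sd = 1)   # Y | X = x is normal
hist(y, breaks = 100)                    # marginal of Y: roughly flat, not bell-shaped
shapiro.test(sample(y, 5000))            # strongly rejects normality of the marginal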

+",2013-11-07 17:15:51.580 +59122,5480.0,1,,,,How to model the probability that a tweet is spam using normal distribution?,,CC BY-SA 3.0,"

I am trying to process a set of tweets, using their length to determine the probability of spam.

+ +

The data looks like this, given 2000 tweets, n is the tweet length (number of words)

+ +
when n = 1, relevant tweets = 0
+when n = 2, relevant tweets = 10
+when n = 3, relevant tweets = 20
+when n = 4, relevant tweets = 100
+when n = 5, relevant tweets = 200,
+...
+...
+when n = 9, relevant tweets = 10
+when n = 10, relevant tweets = 0.
+
+ +

My question is, how do I build a normal distribution based on this set of data, and calculate the probability of a tweet being spam, given n?

+ +

Thanks very much. This is my own research work and it is not homework. Sorry if the questions are not asked in a proper statistical way, as I have limited knowledge of probability and the normal distribution.

+",2013-11-08 03:02:58.640 +59086,23414.0,2,,59079.0,,,,CC BY-SA 3.0,"

I'm reluctant to post because I don't quite understand what you're looking for here. Maybe this will jumpstart someone else to respond that has a better grasp of what you want.

+ +

""I would like to draw conclusions about whether Z is in fact a significant factor in the outcome variable via interactions b4 and b5, but I understand that in a logistic regression all coefficients and particularly interactions need to be evaluated in the context of specific values of the independent variable x.""

+ +

The statement above isn't quite correct. Z is significant: the p-value for the interaction term is significant, so Z is significant. You don't need to go much further (unless you tested an unreasonable number of interactions or your data size is small). It does not matter that the main effect of Z is not significant if the higher-order interactions are significant.

+ +

To estimate the effect of Z on the outcome, this needs to be done in the context of specific values of the independent variable x. This is often best done graphically.

+",2013-11-07 17:26:20.267 +59087,,2,,59081.0,user31668,,,CC BY-SA 3.0,"

You can directly solve the smaller problem: $P(A \cup F) = P(A) + P(F) - P(A\cap F) = \frac{5}{8} + \frac{5}{8} - \frac{1}{4} = \frac{8}{8} = 1$! This also makes logical sense, since he has to pick two of the three electives.

+",2013-11-07 17:32:53.813 +59088,658.0,1,59096.0,,,Intuition for Gamma-Poisson / Negative Binomial,,CC BY-SA 3.0,"

I have a data set which intuitively seems Poisson-like, but it's overdispersed. So I'm investigating the negative binomial.

+ +

From this question and this page I understand one way of viewing this is to state that we are uncertain about the density parameter $\lambda$ in Poisson, and assume $\lambda \sim \text{Gamma}(\alpha, \beta)$, in which case we get a negative binomial.

+ +

I get the math here, but I don't understand why we would think that $\lambda$ is gamma-distributed. For example, the wordpress link says that when $\lambda$ represents ""likelihood to default on a loan"", then it would be gamma-distributed. But I don't really understand why.

+ +

Is there a way I can determine if my density is gamma distributed?

+",2013-11-07 17:36:38.483 +59089,5199.0,1,,,,How to obtain the model behind a simulator?,,CC BY-SA 3.0,"

I am looking for an useful statistical approach or analysis tool in order to understand the data obtained from an aeroelastic simulator of wind turbine dynamics.

+ +

In this case, the simulation provides data about the forces that a structure has to resist against the wind. Imagine a large structure such as a wind turbine whose base is loaded (with forces and moments) at each time step. These structural loads depend on the loads at previous time instants (autocorrelation) and on further variables such as wind intensity, angle of attack, lift coefficients, relative speed, blade deflection at different sections of different blades, and others. The dataset is a time series of 12000 observations of 195 variables with many dependencies among them, some of which can be pointed out by an expert aeronautical engineer. She believes that some dependencies are non-linear.

+ +

The main goal is to quantify the degree of dependency; in other words, which variables most influence the response variable (loads at the tower base). Later on I would also like to estimate the model using a data set from a different structure and see whether the most influential variables are the same. I expect to see the same group of variables because the simulator's algorithm is always the same.

+ +

I thought about some approaches, but I want to know your experiences with similar problems:
- Generalized linear models or design of experiments: I have experience using them, but I don't know how to capture the autocorrelation and the non-linearity.
- Multivariate time series models: I do not have much experience, but I believe they can capture the dependencies.
- What about more flexible and opaque models such as neural networks?

+ +

I am a recently graduated statistician and we hardly saw such complex problems during the degree. However, I am very confident with R and I'd love to know a good approach in order to start my research on CRAN and Wikipedia. My starting point is the CRAN Time Series Task View.

+",2013-11-07 17:56:00.773 +59090,23096.0,1,,,,Group effects where there is a single group,,CC BY-SA 3.0,"

Suppose I have a table of subjects and measurements as follows

+ +
+subject measurement
+s1        1
+s1        2
+s1        3
+s1        4
+s1        5
+s1        6
+s1        7
+s1        8
+s1        9
+s1       10
+s2       10
+
+ +

Now it is unlikely that the true value of s2 is 10 as observed. It is much more likely to be closer to the mean of the s1 measurements. How do I discover the true value?

+ +

I've discovered that if I construct the following table:

+ +
+group subject measurement
+g1  s1  1
+g1  s1  2
+g1  s1  3
+g1  s1  4
+g1  s1  5
+g1  s1  6
+g1  s1  7
+g1  s1  8
+g1  s1  9
+g1  s1  10
+g1  s2  10
+g2  s1  1
+g2  s1  2
+g2  s1  3
+g2  s1  4
+g2  s1  5
+g2  s1  6
+g2  s1  7
+g2  s1  8
+g2  s1  9
+g2  s1  10
+g2  s2  10
+
+ +

and run the following R command

+ +
 fit = lmer(measurement ~ 1 + (subject|group)) 
+ +

I get a plausible answer. Is it correct? Is there a better way? In R? I had to double the table up to coerce R to solve the problem as it doesn't seem to like a single group.

+",2013-11-07 17:59:00.433 +59091,23414.0,2,,59058.0,,,,CC BY-SA 3.0,"

A. There are a number of textbooks and statisticians that use this method of plugging in different transformations of a variable and using the p-values to provide evidence for a conclusion about the nature of the relationship between the variable and the outcome. It seems appealing.

+ +

The Vittinghoff book ""Regression Methods in Biostatistics"" has a rather long section where they plug in a predictor as a categorical or continuous variable or both and discuss the p-values.

+ +

In practice I haven't found this approach meaningful. If you're doing exploratory analysis graphical methods including residual plots are usually more helpful. In this case I would be reluctant to make any assumption based on the p-value but would consider different graphical approaches.

+ +

B. There are a number of common reasons why you might log-transform a variable: (1) you think you should but are not sure why, (2) you want to linearize the relationship between x and y, (3) you’re trying to address the fact that your data are clumped or have outliers.

+ +

You’ll get better answers to your question if you’re explicit about your reasons for the transformation. Most readers, I think, will assume you are motivated by (2), since this is perhaps the most valid reason, but from the question it seems that you might be motivated by (3), which is likely the most common reason.

+",2013-11-07 17:59:08.067 +59092,22564.0,1,,,,How do I combine multiple prior components and a likelihood?,,CC BY-SA 3.0,"

Let's imagine I am comparing two groups of animals (treatment/control). There is previous data from cell cultures indicating the treatment should have a positive effect. This gives me ""prior component 1"". There are also two previous studies very similar to my own. One of them had an effect of 5 +/- 1 (prior component 2), the other of 1 +/- 2 (prior component 3). I feel the cell culture data is highly convincing, and that prior component 3 is not such a reliable study. So I choose weights of 3, 1, and 0.5 for each and multiply.

+ +

1) To calculate the ""overall prior"" do I simply add these together as shown in the lower right panel?

+ +

2) Am I supposed to normalize these components before adding them?

+ +

+ +

I then calculate a likelihood function for my current data as shown in the upper panel.

+ +

+ +

3) How do I combine this information with the prior information shown in the first figure? For the lower panel I simply multiplied overall prior*likelihood.

+ +

4) I then want to make a decision based on this outcome. If I believe the effect is between -1 and 1 then I will stop studying the drug. If the effect is < -1 then I would perform new study A, if the effect is > 1 I will perform new study B.

+ +

5) Obviously there are a number of ways of choosing a decision (% density between -1 and 1, etc) Is there a best choice?

+ +

6) I feel I am doing something incorrectly, but maybe not. Is there a name for what I am trying to accomplish?

+ +

Edit:

+ +

If it helps I am trying to use the framework proposed by Richard Royall:

+ +

1) The likelihood function tells me ""how to interpret this body of observations as evidence""

+ +

2) The likelihood function + priors tells be ""what I should believe""

+ +

3) The likelihood function + priors +cost/benefit determines """"what I should do"".

+ +

Royall R (1997) Statistical evidence: a likelihood paradigm (Chapman & Hall/CRC)

+ +

While the priors used here are subjective/nebulous they are built out of simple building blocks (uniform and normal distributions) that mathematically unsophisticated researchers can understand quickly. I think they convey my thought processes as a researcher well. Others may of course know of different background information. They should be able to build their own ""compound prior"" which may lead to a different decision than mine, but we should always agree on the likelihood function.

+ +

This approach (if implemented correctly, which I am not sure I am doing here), appears to me to model the actual thought processes of researchers and thus be suitable for scientific inference. The steps map to the common sections found in scientific papers. The priors are the introduction, the likelihood is the results, and the posterior probability is the discussion.

+ +

R code:

+ +
#Generate Priors
+x<-seq(-10,10,by=.1)
+y1<-dunif(seq(0,10,by=.1), min=-10, max=10)
+y1<-c(rep(0,length(x)-length(y1)),y1)
+y2<-dnorm(x, mean=5, sd=1)
+y3<-dnorm(x, mean=1, sd=2)
+
+#Weights for Priors
+wt1<-3
+wt2<-1
+wt3<-.5
+
+#Final Priors
+y1<-y1*wt1
+y2<-y2*wt2
+y3<-y3*wt3
+
+#Sum to get overall Prior
+y<-y1+y2+y3
+
+#Likelihood function for ""current data""
+lik<-10*dnorm(x, mean=1, sd=1)
+
+#Updated Posterior Probability?
+prob<-lik*y
+
+
+par(mfrow=c(2,2))
+plot(x,y1, ylim=c(0,1), type=""l"", lwd=4, 
+     ylab=""Density"", xlab=""Effect"", main=""Prior Component 1"")
+plot(x,y2, ylim=c(0,1), type=""l"", lwd=4, 
+     ylab=""Density"", xlab=""Effect"", main=""Prior Component 2"")
+plot(x,y3, ylim=c(0,1), type=""l"", lwd=4, 
+     ylab=""Density"", xlab=""Effect"", main=""Prior Component 3"")
+plot(x,y, ylim=c(0,1), type=""l"", lwd=4, 
+     ylab=""Density"", xlab=""Effect"", main=""Overall Prior"")
+
+
+
+
+dev.new()
+par(mfrow=c(2,1))
+plot(x,lik, type=""l"", lwd=4, col=""Red"",
+     ylab=""Likelihood"", xlab=""Effect"", main=""Likelihood"")
+plot(x,prob, type=""l"", lwd=4, col=""Blue"",
+     ylab=""Probability"", xlab=""Effect"", main=""Posterior Probability?"")
+abline(v=c(-1,1), lty=2, lwd=3)
+
+",2013-11-07 18:03:18.783 +59093,23227.0,2,,59048.0,,,,CC BY-SA 3.0,"

I found the answer; I was somewhat close in the question. The trick is that the width is not just a single overall width - it is the width of each interval.

+ +

Therefore

+ +
bin1: 5,10,11,13,15,35,50,55,72 I.e. all values between 5 and 75
+bin2: 92 I.e. all values between 75 and 145    
+bin3: 204,215 I.e. all values between 145 and 215
+
+ +

And for equal-width binning you are given the number of required bins, which in this case is 3.
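
In R, the same bins come out of cut() with the breaks implied by that width:

x <- c(5, 10, 11, 13, 15, 35, 50, 55, 72, 92, 204, 215)
width <- (max(x) - min(x)) / 3                     # (215 - 5) / 3 = 70
breaks <- min(x) + width * 0:3                     # 5, 75, 145, 215
bins <- cut(x, breaks = breaks, include.lowest = TRUE)
split(x, bins)                                     # three equal-width bins, as listed above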

+",2013-11-07 18:19:37.763 +59094,15827.0,2,,58995.0,,,,CC BY-SA 4.0,"

The Iris dataset is deservedly widely used throughout statistical science, especially for illustrating various problems in statistical graphics, multivariate statistics and machine learning.

+
  • Containing 150 observations, it is small but not trivial.
  • The task it poses of discriminating between three species of Iris from measurements of their petals and sepals is simple but challenging.
  • The data are real data, but apparently of good quality. In principle and in practice, test datasets could be synthetic and that might be necessary or useful to make a point. Nevertheless, few people object to real data.
  • The data were used by the celebrated British statistician Ronald Fisher in 1936. (Later he was knighted and became Sir Ronald.) At least some teachers like the idea of a dataset with a link to someone so well known within the field. The data were originally published by the statistically-minded botanist Edgar Anderson, but that earlier origin does not diminish the association.
  • Using a few famous datasets is one of the traditions we hand down, such as telling each new generation that Student worked for Guinness or that many famous statisticians fell out with each other. That may sound like inertia, but in comparing methods old and new, and in evaluating any method, it is often considered helpful to try them out on known datasets, thus maintaining some continuity in how we assess methods.
  • Last, but not least, the Iris dataset can be enjoyably coupled with pictures of the flowers concerned, as from e.g. the useful Wikipedia entry on the dataset.
+

Note. Do your bit for biological correctness in citing the plants concerned carefully. Iris setosa, Iris versicolor and Iris virginica are three species (not varieties, as in some statistical accounts); their binominals should be presented in italic, as here; and Iris as genus name and the other names indicating particular species should begin with upper and lower case respectively.

+

(EDIT 4 May 2022 In a generally excellent book to hand on machine learning, the Iris data are described in terms of classes, types, kinds and subspecies, but never once correctly from a biological viewpoint. Naturally that sloppiness makes not a jot of difference to the machine learning exposition.)

+

Stebbins (1978) gave an appreciation of Anderson, a distinguished and +idiosyncratic botanist, and comments on the scientific background to +distinguishing three species of the genus Iris. Kleinman (2002) +surveys Anderson's graphical contributions with statistical flavor. See also Unwin and Kleinman (2021).

+

Kleinman, K. 2002. +How graphical innovations assisted Edgar Anderson's discoveries in +evolutionary biology. +Chance 15(3): 17-21.

+

Stebbins, G. L. 1978. Edgar Anderson 1897--1969.
+Biographical Memoir. Washington, DC: National Academy of Sciences. +accessible here

+

Unwin, A. and Kleinman, K. 2021. The iris data set: In search of the source of virginica. Significance 18: 26-29. https://doi.org/10.1111/1740-9713.01589

+",2013-11-07 18:35:10.543 +59095,23487.0,1,59111.0,,,Iris dataset how to solve,,CC BY-SA 3.0,"

I am new to Machine Learning. For Iris data set problem we can solve the problem with Multinomial logistic regression and as well as neural network. Which would give better performance regarding with cost and error?

+",2013-11-07 18:44:09.757 +59097,3999.0,2,,59070.0,,,,CC BY-SA 3.0,"

So by ""absolute risk"" I'm going to assume you mean either the cumulative probability of an event at time t, or the hazard at time t.

+ +

In short, no, a Cox proportional hazards model doesn't really give you back that information - the model itself doesn't calculate the underlying hazard, just the relative difference in the hazard between covariate values. This rather nicely frees you from having to specify the underlying hazard of your outcome, which in many cases is unknown, not particularly of interest in the first place, or difficult to specify using a parametric model.

+ +

If you do want to estimate the underlying hazard function, you need to use parametric survival models, such as those used in survreg. There are a large number of tutorials online, including code, for survreg and parametric models.

+ +

This is one of my favorites, as it includes some theoretical treatment and a good bit of code. The 'Survival' package documentation is also a good place to start.
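
+ +

As a rough illustration (using the lung data shipped with the survival package purely as an example), a parametric fit and its predicted survival quantiles look like this:

+ +

library(survival)
+fit <- survreg(Surv(time, status) ~ age + sex, data = lung, dist = ""weibull"")
+summary(fit)
+#survival quantiles for a hypothetical covariate profile
+predict(fit, newdata = data.frame(age = 60, sex = 1),
+        type = ""quantile"", p = c(0.25, 0.5, 0.75))
+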

+",2013-11-07 18:53:56.177 +59098,9049.0,2,,59075.0,,,,CC BY-SA 3.0,"

The generative model under OLS is that of: $y \sim N(X\beta, \sigma^2 I)$. Fitting the model is finding the $\beta$ and $\sigma$ that maximize your log-likelihood.

+ +

Assuming your general covariance is $K$ (here $K = \sigma^2 I$), the log-likelihood $L$ is equal to: +$L = -\frac{N}{2}\log(2\pi) - \frac{1}{2} \log(|K|) - \frac{1}{2} (y-\hat{y})^T K^{-1}(y-\hat{y})$, where $N$ is your number of readings and $\hat{y}$ are the model fitted values; $(y-\hat{y})$ are your model's residuals and $|K|$ denotes the determinant of the covariance matrix $K$.

+ +

Luckily for all of us the OLS log likelihood can also be expressed as :
+$L = -\frac{N}{2}\log(2\pi) - N\log(\sigma) - \frac{1}{2\sigma^2} \sum (y-\hat{y})^2$. This and the above expression are equivalent; a bit of linear algebra can convince you of that.

+ +

OK, enough talk. Here is your numerical example in MATLAB:

+ +
clc; clear;                %just clear stuff
+
+X = (1:.02:20)'; 
+Y = cos(X);                %dependant variable
+N = length(Y);             %number of readings
+X_matrix = [ ones(N,1) X]; %make design matrix
+
+[b] =  (X_matrix\ Y);      %solve the system
+%[b2] = lscov(X_matrix,Y); %equivalent statement
+Fitted = X_matrix * b;     %find fitted values
+Residuals = Y - Fitted;    %find residuals
+sigma = std(Residuals);    %find std.dev. of residuals
+format long                %set it long for visual inspec. 
+                           %Calculate L using simple expression
+L_simple = -N*.5*log(2*pi) - N*log(sigma) - (1/(2*sigma.^2))*sum( Residuals.^2);
+                           %set K covariance matrix
+K_matrix =  eye(N) * sigma^2;
+                           %Calculate L using generic expression
+L_generic =-N*.5*log(2*pi) - sum(log(diag(chol(K_matrix)))) ...
+           - .5*Residuals' / (K_matrix)* Residuals;
+% ( sum(log(diag(chol(K_matrix)))) equals -.5*log(det(K)) 
+% but it is far more stable.
+L_generic 
+% ans = -9.929305263221722e+02
+L_generic - L_simple 
+% ans =  3.410605131648481e-12 %cool they are practically the same
+
+ +

But hey, can we check this in R so that we are quite sure it works?

+ +
X = seq(1,20, by=.02)
+Y = cos(X)
+lm_test = lm(Y ~ X)
+sigma = sd( residuals( lm_test))
+logLik(lm_test)
+#'log Lik.' -992.9303 (df=3)
+#Check difference with MATLAB answer:
+-9.929305263221722e+02 -  as.numeric(logLik(lm_test))
+#[1] -0.0002630656 #very small difference, mostly due to numerics(*).
+#Use the simple formula:
+-N*.5*log(2*pi) - N*log(sigma) - (1/(2*sigma^2))*sum(  residuals( lm_test)^2)
+#[1] -992.9305  #Cool it works as expected.
+
+ +

A standard, freely available reference for all this would be the book Elements of Statistical Learning: Data Mining, Inference, and Prediction by Hastie, Tibshirani and Friedman (Chapter 2.6).

+ +

(*) There are some small issues also about how you calculate the degrees of freedom for the variance; one uses (N-m) instead of just (N-1), m being the number of columns of your X matrix but you don't need to worry much about it at this point.

+",2013-11-07 19:27:08.170 +59099,23194.0,1,,,,Is there a textbook / handbook with full derivations for statistical / machine learning concepts?,,CC BY-SA 3.0,"

In particular, I am looking for a textbook which will go over the details of derivations (including all calculus and linear algebra) for learning models and concepts such as logistic regression, Gaussian Discriminant Analysis, with full proofs for variants like Gaussian Naive Bayes.

+ +

Books such as ""Elements of Statistical Learning"" tend to gloss over certain details. For example, when discussing L1 Regularized Logistic Regression (Section 4.4.4), it says that ""the score equations ... have the form"", and then presents the form without giving the derivation.

+",2013-11-07 19:29:11.147 +59100,23490.0,1,59106.0,,,t-test or Wilcoxon test in R,,CC BY-SA 3.0,"

I have a very small data set of web traffic to compare the effect of advertising over five days versus seven days. Yes, running a test over 7 days would definitely give me more traffic, but I would like to know if the traffic is significantly higher and worth my consideration.

+ +

I run each test for 2 weeks.

+ +

This is my dataset:

+ +
5day advertising web traffic    7day advertising web traffic
+week1   week2                     week1    week2
+184418  179650                    301978    308019
+
+ +

I ran the t-test in R and got this value

+ +
> a<-c(184418,179650)
+> b<-c(301978,308019)
+> t.test(a,b)
+
+t = -31.9557, df = 1.898, p-value = 0.001307
+alternative hypothesis: true difference in means is not equal to 0
+95 percent confidence interval:
+ -140407.5 -105521.5
+sample estimates:
+mean of x mean of y 
+ 182034.0  304998.5 
+
+ +

Is my dataset too small, and does this mean that there is a significant difference?

+ +

When I run the Wilcoxon test (via wilcox.test), I get a different output:

+ +
data:  a and b
+W = 0, p-value = 1
+alternative hypothesis: true location shift is greater than 0
+
+ +

Thanks

+",2013-11-07 20:10:36.020 +59101,13037.0,2,,59100.0,,,,CC BY-SA 3.0,"

Here is what I got when I used your data. I would go with the results of the wilcoxon test since your sample size is so small.

+ +
x1<- c(184418, 179650)
+x2<- c(202316, 196395)
+
+t.test(x1,x2)
+
+    Welch Two Sample t-test
+
+data:  x1 and x2
+t = -4.557, df = 1.913, p-value = 0.0488
+alternative hypothesis: true difference in means is not equal to 0
+95 percent confidence interval:
+ -34409.9541   -233.0459
+sample estimates:
+mean of x mean of y 
+ 182034.0  199355.5 
+
+wilcox.test(x1,x2)
+
+data:  x1 and x2
+W = 0, p-value = 0.3333
+alternative hypothesis: true location shift is not equal to 0
+
+",2013-11-07 20:29:36.977 +59102,23492.0,1,,,,Computational statistics book,,CC BY-SA 3.0,"

Can anybody recommend me a good book on Computational Statistics? I am new to this subject so I am not sure how to be more specific.

+",2013-11-07 20:46:18.193 +59103,6204.0,2,,59102.0,,,,CC BY-SA 3.0,"

This question is probably going to get closed for being off topic, but here's one:

+ +
+

Givens, GH and Hoeting, JA. Computational Statistics, 2nd ed. Wiley (2012)

+
+ +

You might also like The Elements of +Statistical Learning (Hastie, Tibshirani, and Friedman; available online as PDF).

+",2013-11-07 20:53:46.740 +59104,20538.0,2,,59102.0,,,,CC BY-SA 3.0,"

I've found Monahan's Numerical Methods of Statistics very valuable. I also have Computational Statistics by Givens and Hoeting, and Numerical Methods of Statistics by Lange, both of which are good. Statistical Computing with R by Rizzo is nice to have if you're wanting to do a lot of this stuff in R. I find myself jumping around a lot, and still doing a lot of reading outside for things like quadrature, matrix computations, and everything in Press et al.

+",2013-11-07 21:07:50.560 +59105,21161.0,1,59123.0,,,Modeling time: Probability distribution over time?,,CC BY-SA 3.0,"

I'm trying to model users' posting behavior during a day. Say we have a bunch of users, with the times they post tweets. Now, for each user, I would like to estimate the likelihood that he posts a new tweet at 9:00am according to his historical posting behavior.

+ +

I'm curious what distribution I could pose here. In the literature I saw people using a Gaussian, but I'm not sure if that's suitable since it has a single peak. (A mixture model would be too complex for this task.)

+ +

Thus I'm wondering: is there any distribution over time that I could use?

+ +

The data I have is 2 months' worth of tweets. Each tweet contains a timestamp and the author's id. What I'm trying to model is the user's daily activity in terms of posting a tweet. E.g. the data for a single user would look like [9:00, 9:12, 17:00, 17:01, 22:22, 22:37, 22:45, 22:47, 22:48...]. So this user posts more tweets at night (around 10:30 pm) but very rarely during work hours. I wish to model the probability P(user posts a tweet | time).

+ +

Would really appreciate the answers!

+",2013-11-07 21:20:06.500 +59106,503.0,2,,59100.0,,,,CC BY-SA 3.0,"

Given what you are trying to do, I am not sure a t-test is what you want.

+ +

I am guessing that advertising for 7 days costs more than advertising for 5. So, let's look at cost per day:

+ +
Week 1: 184,418/5 = 36,883 per day
+        202,316/7 = 28,902 per day
+
+Week 2: 179,650/5 = 35,930
+        196,395/7 = 28,056
+
+ +

Then what impresses me is that the difference is nearly the same for the 2 weeks. The question of how to test this is, I think, not so important.

+ +

In fact, if 5 days is M-F and 7 is every day, you might consider trying only the weekend!

+ +

Of course, cost may not be the same per day, in which case the above would have to be modified.

+",2013-11-07 21:29:38.710 +59107,19463.0,1,59108.0,,,shuffle my data to investigate differences between 3 groups,,CC BY-SA 3.0,"

I am sure this already exists but I just don't know the terminology to look for.

+ +

I have three sets of 10 measurements. Each set corresponds to a different geographic region. +So in total I have 30 measurements of my variable, and I have the factor ""region"" with 3 levels (west region, middle region, east region).

+ +

Let's say I do a simple ANOVA and I get differences between the 3 regions. But I want to play a little with the possibility of these differences being ""by chance"". Or, in another scenario, let's say I can't use ANOVA for some reason (e.g. strongly inhomogeneous variances), I use a non-parametric test, and I don't find differences.

+ +

I want to know if it's possible to do the following (or if the idea is appropriate):

+ +

If there is really no difference between the 3 regions, then I can assume that any test (e.g. ANOVA or a non-parametric equivalent) will find approximately the same results even if I randomly mix all the data again and again. So I thought I could simulate this, using my own data but in different groupings. For example:
1- take all 30 values from my own measurements
2- shuffle them into 3 groups, i.e. randomly choose 10 values and assign them to a randomly chosen group; repeat with the next 10 values and then you again have 3 groups of 10 measurements
3- run the test (e.g. ANOVA)

+ +

Now I go back to 1, and repeat this eg 1000 times, and see if there is a convergence towards a ""stable"" pattern. If there is, then there are actually no differences. +If the convergence deviates a lot from the results I found with my ""real"" dataset, then I may think there are actually differences between the 3 regions.

+ +

Is my reasoning correct/sound? I know there is something like this, I just don't remember the name.. I thought it was related to permutations but I'm not sure...

+",2013-11-07 21:31:58.190 +59108,594.0,2,,59107.0,,,,CC BY-SA 4.0,"
+

If there is really no difference between the 3 regions, then I can assume that any test (eg ANOVA or a non-parametric equivalent) will find approximate the same results even if I randomly mix all data once and again.

+
+

This is the central insight that underlies resampling methods, such as permutation tests / randomization tests.

+

e.g. see Wikipedia, for example here

+

The basic idea of a permutation test (let's take a one way ANOVA-like situation) is that if the null is true, the group labels are arbitrary - you don't change the distributions by shuffling them.

+

So if you look at all possible arrangements of the group labels and compute some test statistic of interest, you obtain what's called the permutation distribution of the test statistic. You can then see if your particular sample (which will be one of the possible permutations - or more accurately, possible combinations) is unusually far 'in the tails' of that null distribution (giving a p-values).

+

Many of the common nonparametric rank-based tests are actually permutation-tests carried out on the ranks (which is a practical way of doing permutation tests without computers, which are otherwise very tedious unless you have very small sample sizes).

+

When the sample sizes are large, an option is to sample (with replacement) from the permutation distribution, typically because there are too many combinations to evaluate them all. Generally this is achieved by randomly permuting the labels rather than systematically re-arranging them to cover every possibility. The test statistic is then computed for each such arrangement. The sample value of the statistic is then compared with the distribution (it is normally included as part of the distribution for computing the p-value, and counts in the values 'at least as extreme' as itself). Some authors call this sampled permutation test a randomization test (though other authors reserve that term for a somewhat different notion also connected to permutation tests).

+

What you described was pretty close to this randomly sampled permutation test (randomization test).

+

I advise trying such a randomization test, not least for its ability to expand your horizons in terms of the standard tools you have available for tackling problems. The procedure is distribution-free (conditional on the sample) - it requires fewer assumptions while still allowing you to use either familiar statistics or ones custom-designed to your circumstances (e.g. you could slot in a more robust measure of location).

+

In practice I'd advise more than 1000 resamples for a randomization test. Consider a test with a p-value near 5%. The standard error of an estimated p-value for a sample size of 1000 will be nearly 0.007; when the true p-value is just on one side of 5%, nearly 15% of the time you'll see a value more than 1% on the wrong side (more than 6% or less than 4% when it should be the other side). I usually regard 10000 as toward the low end of what I do unless I just want a rough idea of the ballpark of the p-value. If I was doing a formal test, I'd want to pin it down a bit better. I often do 100,000 and sometimes a million or more - at least for the simpler tests.
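
+ +

A rough sketch of such a sampled permutation test in R (placeholder data standing in for the 30 measurements):

+ +

set.seed(42)
+y <- rnorm(30)                                           #placeholder for the 30 measurements
+region <- factor(rep(c(""west"", ""middle"", ""east""), each = 10))
+obs_F <- summary(aov(y ~ region))[[1]][[""F value""]][1]   #statistic for the real grouping
+perm_F <- replicate(10000, {
+  summary(aov(sample(y) ~ region))[[1]][[""F value""]][1]  #shuffle values over the fixed labels
+})
+mean(c(obs_F, perm_F) >= obs_F)                          #p-value, counting the sample value itself
+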

+

If you search here on permutation tests or randomization tests you should find a number of relevant questions and answers and even some examples.

+",2013-11-07 21:35:01.070 +59109,23493.0,2,,45543.0,,,,CC BY-SA 3.0,"

No, the square root of the symmetrised KL divergence is not a metric. A counterexample is as follows:

+ +
    +
  • Let $P$ be a coin that produces a head 10% of the time.
  • Let $Q$ be a coin that produces a head 20% of the time.
  • Let $R$ be a coin that produces a head 30% of the time.
  • Then $d(P, Q) + d(Q, R) = 0.284... + 0.232... < 0.519... = d(P, R)$.
+ +
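
A quick numeric check of these figures in R (a small sketch, taking $d$ as the square root of the unweighted symmetrised divergence):

+ +

kl <- function(p, q) p * log(p / q) + (1 - p) * log((1 - p) / (1 - q))
+d <- function(p, q) sqrt(kl(p, q) + kl(q, p))
+d(0.1, 0.2) + d(0.2, 0.3)   #about 0.517
+d(0.1, 0.3)                 #about 0.520, so the triangle inequality fails
+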

However, for $P$ and $Q$ very close together, $D(P, Q)$ and $J(P, Q)$ and $S(P, Q)$ are essentially the same (they are proportional to one another $+ O((P-Q)^3)$) and their square root is a metric (to the same order). We can take this local metric and integrate it up over the whole space of probability distributions to obtain a global metric. The result is:

+ +

$$A(P, Q) = \cos^{-1}\left(\sum_x \sqrt{P(x)Q(x)} \right)$$

+ +

I worked this out myself, so I'm afraid I do not know what it is called. I will use A for Alistair until I find out. ;-)

+ +

By construction, the triangle inequality in this metric is tight. You can actually find a unique shortest path through the space of probability distributions from $P$ to $Q$ that has the right length. In that respect it is preferable to the otherwise similar Hellinger distance:

+ +

$$H(P, Q) = \sqrt{1 - \sum_x \sqrt{P(x)\,Q(x)} }$$

+ +

Update 2013-12-05: Apparently this is called the Bhattacharyya arc-cos distance.

+",2013-11-07 21:42:38.513 +59110,23496.0,1,59239.0,,,How to analyze this incomplete block design in R?,,CC BY-SA 3.0,"

I was just wondering if you could point me in the right direction. +I have a dataset with 5 tree clones planted at 10 different sites, i.e. each clone is replicated at two different sites.

+ +
  Site Clone  
+    1     A    
+    2     A  
+    3     B  
+    4     B  
+    5     C  
+    6     C  
+    7     D  
+    8     D  
+    9     E  
+   10     E
+
+ +

At each site each clone is replicated multiple times. Ideally, I would want to know what is the effect of clone on my dependent variable y and whether a site effect is present. To me it looks like an incomplete block design with clone as a fixed effect and site as a random effect (and block). Using lmer from the lme4 package in R, I would specify the model as follows:

+ +
lmer(y~clone + (1|site), data=mydata)    
+
+ +

Is this a correct way of analyzing this dataset? I could also average by clone over sites and eliminate sites, but that way I would lose potentially important information as to whether a site effect is present.

+ +

Any pointers are appreciated!

+",2013-11-07 21:50:35.707 +59111,7741.0,2,,59095.0,,,,CC BY-SA 3.0,"

As far as I know, the Iris data set should be (almost) linearly separable.

+ +

Multinomial logistic regression (MLR) is a linear classifier. Neural networks (NN) are nonlinear classifiers.

+ +

The problem with NNs is that they could overfit your training data and might not generalize as well as MLR. You can avoid that by adding a regularization term to the cost function (error function) of the NN, so that your error function will consist of a term that penalizes errors on the training set (e.g. cross entropy, sum of squared errors $\sum_n ||y^{(n)}-t^{(n)}||^2_2$) and a term that penalizes the model complexity (e.g. norm of the weight vector $\gamma||w||^2_2$, ...).

+ +

Adding a penalty for large weights to your error function is like using a prior $p(w)$ for your hypothesis, which means some $w$ become more likely than others.
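
+ +

A minimal sketch of the MLR baseline in R (assuming the multinom function from the nnet package):

+ +

library(nnet)
+fit <- multinom(Species ~ ., data = iris, trace = FALSE)
+mean(predict(fit) == iris$Species)   #training accuracy; judge generalization on a hold-out set
+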

+",2013-11-07 22:05:38.010 +59112,23335.0,1,,,,Repeated measurement analysis advice in R lme and / or aov and a couple of more questions,,CC BY-SA 3.0,"

I want to analyse visual performance, operationalized via contrast threshold, depending on adaptation luminance and spectrum, gathered for 29 subjects. I'm currently kind of confused about how to do that. Some of my data:

+ +
    ID C_measured Subject  LB Spectrum SpectrumLB
+  1  1 0.1339795   AHI11 0.1       HS     HS.0.1
+  2  2 0.1040440   AIC19 0.1       HS     HS.0.1
+  3  3 0.1363313   AUO13 0.1       HS     HS.0.1
+  4  4 0.1134103   BAR01 0.1       HS     HS.0.1
+  5  5 0.1117670   BAR02 0.1       HS     HS.0.1
+  6  6 0.1166350   BCL10 0.1       HS     HS.0.1
+
+ +

LB can be 0.1, 0.21, 0.3 and 1.0, Spectrum HS and LED

+ +

I know that I want to do 8 planned comparisons of Spectrum at each LB (4x) and the effect of reducing LB from 0.3 to 0.21 for both spectra with and without exchanging the spectrum (4x).

+ +

Some said I should do a two-way ANOVA, e.g. like

+ +
   aov(C_measured ~ LB * Spectrum + Error(Subject / (LB * Spectrum)), data = anovaFrameWithGlareFoveal)
+
+ +

and then a post-hoc test on the interaction variable SpectrumLB, e.g. like

+ +
anovaFrameWithGlareFoveal.lme <- lme(C_measured ~ LB, random = ~1 | Subject / LB, data = anovaFrameWithGlareFoveal)
+anovaFrameWithGlareFoveal.glht <- glht(anovaFrameWithGlareFoveal.lme, linfct = mcp(LB = ""Tukey""))
+summary(anovaFrameWithGlareFoveal.glht, test = adjusted(type = ""none""))
+
+ +

then manually bonferroni-adjusting the p-value to the number of my planned comparisons.

+ +

I do that for a couple of other parameters (with glare, without glare, old reference group, young group), which I don't want to include in the statistical analysis, because it is common knowledge that this influences visual performance, I just want to analyse whether the planned comparisons differ for those parameters.

+ +

Up to here, the question: is this how to do it? Do I need the two-way ANOVA at all?

+ +

Then I observed some things: for some of the parameters I observed significant values in the two-way ANOVA for the main effect of Spectrum (p<.05), but none of the uncorrected multiple comparisons between the two spectra at the four LBs was significant (ok, one was <.1, but I'm testing against .05), which some people commented on with ""impossible"". Is this possible?

+ +

Then people said: ""ok paired t.tests should come up with the same results"" so I did this:

+ +
df <- anovaFrameWithGlareFoveal
+df.led <- subset(df, Spectrum == ""LED"")
+df.hs <- subset(df, Spectrum == ""HS"")
+t.test(df.led$C_measured[df.led$LB==.1], df.hs$C_measured[df.hs$LB==.1], paired=T)
+t.test(df.led$C_measured[df.led$LB==.21], df.hs$C_measured[df.hs$LB==.21], paired=T)
+t.test(df.led$C_measured[df.led$LB==.3], df.hs$C_measured[df.hs$LB==.3], paired=T)
+t.test(df.led$C_measured[df.led$LB==1], df.hs$C_measured[df.hs$LB==1], paired=T)
+
+ +

then all of the t.tests were significant (<.05), but only one of the uncorrected multiple comparisons was significant (<.05).

+ +

I'm not deep enough into statistics to definitively argue for or against one method, a combination of methods, or whether the lme + glht alone is sufficient. I had a tough time with this last week and am really looking forward to your comments!

+",2013-11-07 22:17:16.140 +59113,23499.0,1,59116.0,,,Interpreting coefficients of first differences of logarithms,,CC BY-SA 3.0,"

My problem is interpreting the coefficients of the following time series model:

+ +

\begin{equation} +\ln Y_t - \ln Y_{t-1} =b_1 \cdot \left(X_{t}-X_{t-1}\right)+b_2 \cdot Z_t.\end{equation}

+ +

I don't know how to interpret coefficients $b_1$ and $b_2$.

+ +

Hope that someone can help.

+",2013-11-07 22:17:18.293 +59114,23138.0,1,,,,Best approach to classifying 3-point trajectories?,,CC BY-SA 3.0,"

I have a sample of about 300 subjects who have been measured at 3 different times (morning, afternoon, evening). The variable of interest can be assumed to be approximately normal. It appears that most subjects have an increase between morning and afternoon, followed by a decrease from afternoon to evening. Some, however, show the opposite pattern (decrease->increase), while yet others remain approximately the same.

+ +

What I am interested in is clustering, or classifying, the subjects according to their trajectories. After a bit of googling, I have discovered GMM. I don't really understand what goes on behind the scenes, but it looks like the classification is done based on straight-line fits when we only have 3 points. This seems highly inappropriate to me since a straight line does not capture the increase-followed-by-a-decrease type of behaviour. Another thing is, people seem to be using the Mplus package, which I am not familiar with and would rather avoid buying and learning (I am very comfortable with R and Matlab).

+",2013-11-07 22:22:10.430 +59115,23500.0,1,,,,Selection of sites for wind power generation using time series from 600 candidate sites,,CC BY-SA 4.0,"

I have 3-hourly power generation data for around 600 locations for a year (i.e. 8 data points per day for 365 days for each location).

+ +

I want to find a way to choose, say, 10 locations out of these 600 that produce power at different time periods.

+ +

suppose:

+ +
Location:   Hour 0, Hour 3, Hour 6, Hour 9, Hour 12,  Hour 15
+
+   A          30       00      50      70       00      20    
+   B          50       20      70     100       00      40
+   C          00      100      20      00       40      30
+   D          20       15      10      00       40      30
+
+ +

Here, A and B are highly correlated, whereas A and C are inversely correlated. So, is there a way I can identify the locations that produce power at different time periods, so that by combining these few locations the overall power generation can be more or less stable?

+ +

By sort of stable, I mean something like this which I mentioned in the comment later on to clarify my question.

+ +

Since A and C are inversely correlated, if I choose power from A and C out of the four locations, I would get some power all the time (always at least 30 in this scenario). But if I take A and B, I would get a very high amount of power when there is power (like 70+100=170 in hour 9), but I won't get any power at hour 12 (since A and B both produce 00 power in hour 12). I want to avoid choosing A and B, and I want to choose A and C. My objective is to choose 10 locations out of 600 such that, by adding the powers of the 10 locations for each hour, the total power for each hour stays above a certain threshold, like 30.

+",2013-11-07 22:49:32.913 +59116,5045.0,2,,59113.0,,,,CC BY-SA 4.0,"

For $small$ changes, you can interpret logged differences as percentage changes after multiplying by 100.

+ +

For example, $y_t=9$ and $y_{t-1}=8$. Then $\ln 9 - \ln 8=.118$ or 11.8%, which is the logarithmic approximation to the actual 12.5% increase. Note that I had to multiply by 100 here. For $y_t=9$ and $y_{t-1}=8.5$ the approximation will be much better ($5.9\% \approx 5.7\%$).
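
+ +

A quick check of these numbers in R:

+ +

100 * (log(9) - log(8))     #11.78, versus the exact 12.5% increase
+100 * (log(9) - log(8.5))   #5.72, versus the exact 5.88% increase
+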

+ +

Usually, a coefficient tells you the effect on $y$ of a one unit change in that explanatory variable, holding other variables constant. A one unit change in $\Delta \ln x$ corresponds to a 100% change (using the approximation above, which is terrible since this is not a small change). This means that $b_1$ tells you the percentage change in $y$ associated with a 1% increase in x.

+ +

But your $x$ is not logged, so the coefficient needs to be interpreted differently. When $x$ grows by one unit, you get $100 \cdot b_1\%$ more $y$.

+ +

Moreover, $100 \cdot b_2$ tells you the percentage change in $y$ associated with a 1 unit increase in $z$.

+",2013-11-07 22:57:43.387 +59117,16325.0,1,,,,Most Powerful Test; Two-Parameter Normal Distribution,,CC BY-SA 3.0,"

Is it possible to show that the two-parameter Normal distribution has monotone likelihood ratio?

+ +

EDIT:

+ +

This is actually part of a larger problem. We have a random sample from $\mathcal N(\mu, \sigma^2)$ and are testing: +\begin{align} +H_0\!:\quad&\sigma <\sigma_0 , &\mu &\in \mathbb{R} \\ +H_a\!:\quad&\sigma = \sigma_1 , &\mu &= \mu_1 +\end{align} +Construct a most powerful level-$\alpha$ test. My initial thought was to show $\mathcal N(\mu,\sigma^2)$ has the monotone likelihood ratio (MLR), and from there we can construct a uniformly most powerful (UMP) test via the Karlin-Rubin theorem. Any suggestions on general strategy would be appreciated!

+",2013-11-07 23:36:44.683 +59118,3728.0,2,,58995.0,,,,CC BY-SA 3.0,"

The dataset is big and interesting enough to be non-trivial, but small enough to ""fit in your pocket"", and not slow down experimentation with it.

+ +

I think a key aspect is that it also teaches about over-fitting. The available columns are not enough to give a perfect score: we see this immediately when we look at the scatterplots, where the species overlap and run into each other. So any machine-learning approach that gets a perfect score can be regarded as suspicious.

+",2013-11-08 00:18:50.447 +59119,21119.0,1,59120.0,,,"$E(x^k)$ under truncated $\mathcal{N}(\mu,1)$",,CC BY-SA 3.0,"

There is a similar question in $E(x^k)$ under a Gaussian. However, it doesn't seem to be trivial when $\mu\ne0$. As mentioned in the previous question $k$ is not an integer.

+ +

The integral that I need to evaluate is as follows: +$$\int_0^\infty x^k\exp\left(-\frac{(x-\mu)^2}{2}\right)dx$$

+ +

If it helps for the case that $\mu=0$ the answer is $\frac{2^{(k-2)/2}}{\sqrt{\pi}}\Gamma(\frac{k+1}{2})$

+",2013-11-08 00:29:41.700 +59120,20473.0,2,,59119.0,,,,CC BY-SA 3.0,"

This is a Mellin transform. In general notation we have ($a>0, s>0$)

+ +

$$\int_{0}^{\infty}x^{s-1}\exp\left\{-ax^2-bx\right\}dx = (2a)^{-s/2}\Gamma(s)\exp\left\{\frac {b^2}{8a}\right\}D_{-s}\left(b(2a)^{-1/2}\right)$$

+ +

where $D_{-s}()$ is (Whittaker's) parabolic cylinder function.

+ +

For your integral we have

+ +

$$I=\int_0^\infty x^k\exp\left\{-\frac{(x-\mu)^2}{2}\right\}dx = \exp\left\{\frac {-\mu^2}{2}\right\}\int_0^\infty x^k\exp\left(-\frac{x^2}{2} +\mu x\right)dx $$

+ +

Matching coefficients we get

+ +

$$s-1 = k \Rightarrow s=k+1,\;\; a=\frac 12,\;\; b=-\mu$$

+ +

Inserting into the general solution we have

+ +

$$I=\exp\left\{\frac {-\mu^2}{2}\right\}\left(2\cdot\tfrac 12\right)^{-(k+1)/2}\Gamma(k+1)\exp\left\{\frac {\mu^2}{8\cdot\frac 12}\right\}D_{-k-1}\left(-\mu\left(2\cdot\tfrac 12\right)^{-1/2}\right)$$

+ +

$$=\exp\left\{\frac {-\mu^2}{4}\right\}\Gamma(k+1)D_{-k-1}\left(-\mu\right)$$

+ +

Now set $k^* =k +\frac 12$. Then $D_{-k-1}\left(-\mu\right) = D_{-k^*-\frac 12}\left(-\mu\right)$ and for the second you can look up Abramowitz and Stegun p.687 and 686, starting with eq. $19.3.1$. You will indeed see that when $\mu\neq 0$ the situation is not trivial. The fact that $k$ is not an integer causes no special trouble.

+",2013-11-08 02:27:04.930 +59123,20473.0,2,,59105.0,,,,CC BY-SA 3.0,"

The time-stamp does not measure the magnitude of some variable, it marks points in time per se. And from what you write, you are interested in a binary variable: to tweet or not to tweet, call it $Y$.

+ +

One possible modelling approach could be the following (for each individual separately): First, you have to decide on how you will partition the day in time zones (half-hours? Hours? Morning-Noon, etc? Depends on the particulars of your case). Given this partition, your data will be grouped in each time zone, for each day, as a count ""XX tweets during time zone 2"" etc.

+ +

For each day separately, this will give you an empirical frequency distribution for the random variable $X=$""number of tweets per time zone"". If you divide the frequency of each time zone by the total tweet count of the day, you will obtain an empirical relative frequency distribution, which can be considered an estimate of the distribution of the random variable $Y$ (to tweet or not to tweet) for this particular day.

+ +

Denote by $d_{it}$ the time zone $i$ of day $t$, $i=1,...,k$, and by $p_{it}$ the corresponding empirically estimated probability that the person tweets during this time zone. +Now go across days and consider the $k$ probability series +$p_{it}=$ ""Tweet during time zone $i$ of day $t$"", $t=1,...,60$, since you have data for about 60 days.
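
+ +

A small R sketch of this grouping step (simulated timestamps standing in for the real data, and hourly time zones as one arbitrary choice of partition):

+ +

set.seed(1)
+timestamps <- as.POSIXct(""2013-09-01 00:00"") + runif(500, 0, 60 * 24 * 3600)  #fake 60 days of tweets
+zone <- factor(format(timestamps, ""%H""), levels = sprintf(""%02d"", 0:23))      #hourly time zones
+day <- as.Date(timestamps)
+counts <- table(day, zone)                  #tweets per zone, per day
+rel_freq <- prop.table(counts, margin = 1)  #empirical P(tweet in zone | day), the p_it
+round(colMeans(rel_freq), 3)                #average daily profile across the 60 days
+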

+ +

From here on you can do various things: check each time series for stability: does the pattern remain approximately the same? Here, how you have partitioned the day becomes crucial (the smaller the time interval represented in each time zone, the more instability is to be expected).

+ +

If you expect to be getting more data as days pass, you can adopt a Bayesian, updating approach, estimate a prior distribution with these first 60 days of data, and then gradually update the estimation of the distribution as new data come in: the new estimate will give you the probabilities of when the persons will tweet the next day, for each time zone.

+ +

But you can also view all the time zones together as a Vector Autoregression (VAR) of $k-1$ equations (since the probabilities add up to unity), and do what one can do with VARs, i.e. model tomorrow's probability for each time zone as depending, usually in a linear way, on the corresponding probabilities of previous days (lag length to be determined by the data and the model specification process).

+",2013-11-08 03:04:15.913 +59124,14298.0,1,,,,Implications of current debate on statistical significance,,CC BY-SA 3.0,"

In the past few years, various scholars have raised a detrimental problem in scientific hypothesis testing, dubbed ""researcher degrees of freedom,"" meaning that scientists have numerous choices to make during their analysis that bias them towards findings with p-value < 5%. These ambiguous choices are, for example, which cases to include, which cases to categorize as outliers, running numerous model specifications until something shows up, not publishing null results, etc. (The paper that sparked this debate in psychology is here, see a popular Slate article and follow-up debate by Andrew Gelman here, and the Time magazine also touches on this topic here.)

+ +

First, one clarification question:

+ +

The Time magazine wrote,

+ +
+

""A power of 0.8 means that of ten true hypotheses tested, only two + will be ruled out because their effects are not picked up in the + data;""

+
+ +

I am not sure how this fits the definition of the power function I found in textbooks, which is the probability of rejecting the null as a function of the parameter $\theta$. With different $\theta$ we have different power, so I don't quite understand the above quote.

+ +

Second, some research implications:

+ +
    +
  1. In my field of political science / economics, scholars simply use up all the country-year data available. Thus, should we not be concerned with sample fiddling here?
  2. Can the problem of running multiple tests but reporting only one model be fixed simply by the fact that someone else in the discipline will re-test your paper and strike you down immediately for not having robust results? Anticipating this, scholars in my field are more likely to include a robustness check section, where they show that multiple model specifications do not change the result. Is this sufficient?
  3. Andrew Gelman and others raise the point that, no matter the data, it would always be possible to find and publish some ""pattern"" that isn't really there. But this should not be a concern, given the fact that any empirical ""pattern"" must be supported by a theory, and rival theories within a discipline will just engage in a debate / race to find which camp is able to find more ""patterns"" in various places. If a pattern is truly spurious, then the theory behind it will be quickly struck down when there is no similar pattern in other samples / settings. Isn't this how science progresses?
  4. Assuming that the current trend of journals publishing null results actually flourishes, is there a way for us to aggregate all the null and positive results together and make an inference about the theory that they all try to test?
+",2013-11-08 03:16:29.310 +59125,6204.0,2,,59122.0,,,,CC BY-SA 3.0,"

The probability of a tweet being spam given n is just the proportion of tweets for a given n that are spam. If for n=4 you had 10 spam tweets out of the 100 tweets observed for that level of n, then P(tweet is spam|n=4) = 10/100 = 0.1 = 10%.

+ +

It's not clear to me what you mean by ""build a normal distribution based on this data set.""

+",2013-11-08 03:51:53.727 +59126,5821.0,2,,59124.0,,,,CC BY-SA 3.0,"

The field of statistical science has addressed these issues since its outset. I keep saying the role of the statistician is to ensure that the type 1 error rate remains fixed. This implies that the risk of making false positive conclusions cannot be eliminated, but can be controlled. This should draw our attention to the extremely large volume of scientific research that's being conducted rather than toward the philosophy and ethics of general statistical practice. For every incredible (uncredible) result that surfaces in the media (or in government policy) at least 19 other uncredible results were shot down for their null findings.

+ +

Indeed, if you go to, say, clinicaltrials.gov, you will observe there are (for almost any disease indication) well over 1,000 clinical trials for pharmaceutical agents going on in the US at this very moment. That means, that with a false positive error rate of 0.001, on average at least 1 drug will be put on the shelves that has no effect. The validity of 0.05 as a validated threshold for statistical significance has been challenged again and again. Ironically, it's only the statisticians who feel uncomfortable with using a 1/20 false positive error rate whereas financial stakeholders (be they PIs, or Merck) will pursue beliefs tenaciously regardless of in-vitro results, theoretical proofs, or strength of prior evidence. Honestly, that tenacity is a successful and laudable personal quality of many individuals who are successful in non-statistical roles. They are generally seated above statisticians, in their respective totems, who tend to leverage that tenacity.

+ +

I think the Time quote you put forward is completely wrong. Power is the probability of rejecting the null hypothesis given it's false. This more importantly depends on exactly how ""false"" the null hypothesis is (which in turn depends on a measurable effect size). I rarely talk of power out of the context of the effect which we would deem ""interesting"" to detect. (for instance, a 4 month survival following chemotherapeutic treatment of stage 4 pancreatic cancer is not interesting, hence there's no reason to recruit 5,000 individuals for a phase 3 trial).
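
+ +

A quick illustration in R of how strongly power depends on the assumed effect size (numbers picked arbitrarily):

+ +

power.t.test(n = 50, delta = 0.2, sd = 1, sig.level = 0.05)$power   #small effect: low power
+power.t.test(n = 50, delta = 0.8, sd = 1, sig.level = 0.05)$power   #large effect: high power
+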

+ +

To address the questions you asked

+ +
    +
  1. ???
  2. Multiplicity is difficult because it does not lead to an obvious decision rule about how to handle the data. For instance, suppose we are interested in a simple test of mean difference. Despite the infinite protestations of my colleagues, it is easy to show a t-test is well calibrated to detect differences in mean regardless of the sampling distribution of the data. Suppose we alternately pursued their path. They would begin by testing for normality using some variant of a well known distributional test (say calibration of the qqplot). If the data appeared sufficiently non-normal, they would then ask whether the data follow any well known transformation, and then apply a Box-Cox transformation to determine a power transformation (possibly logarithmic) which maximizes entropy. If an obvious numerical value pops out, they will use that transformation. If not, they will use the ""distribution free"" Wilcoxon test. For this ad-hoc sequence of events, I cannot begin to hope how to calculate the calibration and power for a simple test of mean differences when the simple, stupid t-test would have sufficed. I suspect stupid acts like this can be linked mathematically to Hodges' superefficient estimation: estimators which have high power under a specific hypothesis we want to be true. Nonetheless, this process is not statistical because the false positive error rate has not been controlled.
  3. The concept that trends can be ""discovered"" erroneously in any random set of data probably traces back to the well written article by Martin called ""Munchausen's Statistical Grid"". This is a very illuminating read and dates back to 1984, before the golden calf of machine learning was born unto us as we presently know it. Indeed, a correctly stated hypothesis is falsifiable, but type 1 errors have grown to be much more costly in our data driven society than they ever were before. Consider, for instance, the falsified evidence of the anti-vaccine research that has led to a massive sequence of pertussis deaths. The results which spurred the public defenestration of vaccines were linked to a single study (which, being wrong, was never confirmed by external research). There is an ethical impetus to conduct research and report honest-to-goodness strength of evidence. How strong is evidence? It has less to do with the p-value you obtain than with the p-value you said you would call significant. And remember, fudging your data changes the value of p, even when the final confirmatory test reports something different (often much smaller).
  4. YES! You can clearly see in meta-analyses published by journals such as the Cochrane report that the distribution of test results looks more bimodal than normal, with only positive and negative results making it into journals. This evidence is absolutely bonkers and confusing for anyone in clinical practice. If, instead, we publish null results (that come from studies whose results we would have been interested in, regardless of what they come to be), then we can expect meta-analyses to actually represent evidence that is meaningful and representative.
+",2013-11-08 03:57:53.353 +59127,5821.0,2,,9524.0,,,,CC BY-SA 3.0,"

There is a documentary about Srinivasa Ramanujan, whose life, as we know, is tremendously interesting. However, the film is Indian and I haven't actually seen it. I recall an Indian math historian speaking about this film at our university colloquium several years ago. He boasted, ""Ben Kingsley was interested in depicting Ramanujan but was turned down for the role because he was only half Indian"". As a mixed race individual, I felt a mixture of anger and pity. The latter because they basically turned down the opportunity to make a movie that would attract anyone's attention.

+",2013-11-08 04:09:57.603 +59128,23511.0,1,,,,Mixed Model t Test (Toothaker’s t-Test),,CC BY-SA 3.0,"

I'm trying to do a follow-up test for a mixed model ANOVA, and I figured that I need to do a mixed-model t-test, also known as Toothaker's test. I just don't know how to do it. I was wondering if anyone has an Excel sheet that can calculate this for me, because I know SPSS can't.

+ +

I'm doing a mixed model ANOVA and I found significant interactions between my within subjects variables and my between subjects variables. Notes from a previous year's stats course told me to use a mixed model t-test to examine these interactions, and we were given an Excel sheet to help us calculate it. However I lost this Excel sheet.

+ +

To clarify more I have two between subject conditions (drug/placebo) and two within subject conditions (memory tested at 2 different times).

+ +

In my notes the test is called Toothaker's Mixed Model t-test and is described as such ""Used when 1 (or more) within-subjects factor and 1 (or more) between subjects factor: Toothaker’s mixed-model t test (pools MS error)""

+ +

Wikipedia seems to describe a similar thing: ""when there is a significant interaction between a between-subject factor and a within-subject factor, statisticians often recommended pooling the between-subject and within-subject MSerror terms. This can be calculated in the following way: +MSWCELL = (SSBSError + SSWSError) / (dfBSError + dfWSError)""

+",2013-11-08 04:45:17.397 +59129,22082.0,1,,,,Comparison of average values of data sets,,CC BY-SA 3.0,"

I am working with two data sets of unequal sample sizes, 998 and 857. The average of the first (998 samples) comes out to be higher than that of the other dataset. To my surprise, when I split each complete dataset into unequal parts (the first into 803 and 195 samples, the other into 819 and 38 samples), the comparison of the average of the 803-sample subset of the first dataset with the average of the 819-sample subset of the second dataset showed a reverse trend in their mean values. The same reverse trend was observed with the other subsets of both datasets.

+ +

My question is: is it possible that if the mean of all items in A > the mean of all items in B, their two subsets show a reverse trend in their means, i.e. mean of A1 < mean of B1 and mean of A2 < mean of B2?

+ +

Is this because of the unequal sample sizes, or because of the shape of the sample distributions, or both?

+ +

If this also is possible, then is there any way to explain it quantitatively?

+ +

It would be really helpful, if anyone can help me on this..

+",2013-11-08 06:04:54.563 +59130,10772.0,2,,9524.0,,,,CC BY-SA 3.0,"

Good Will Hunting is also a classic. Discrete mathematics at MIT.

+",2013-11-08 06:21:30.950 +59131,21029.0,2,,59129.0,,,,CC BY-SA 3.0,"

If you split your data into smaller samples, you can definitely have your means change. If you split your data by removing all the low points, then you immediately increase your mean.

+ +

The only time this will not be true is if all points are equal. Then removing points won't change the average.
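
+ +

A toy numeric sketch (made-up values) of how unequal subset sizes can even reverse the comparison the way the question describes:

+ +

A <- c(rep(2, 5), rep(10, 95))      #overall mean 9.6
+B <- c(rep(3, 95), rep(11, 5))      #overall mean 3.4
+mean(A) > mean(B)                   #TRUE: A is higher overall
+mean(A[1:5]) < mean(B[1:95])        #TRUE: yet the first subsets reverse
+mean(A[6:100]) < mean(B[96:100])    #TRUE: and so do the second subsets
+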

+ +

Note that this does not have any significance! The fact that you took a subset does not prove anything about the overall distribution or trend. It just happened. If anything, it could imply that you incorrectly split your data.

+",2013-11-08 06:22:02.890 +59132,10450.0,2,,59099.0,,,,CC BY-SA 3.0,"

Have a look at 'A First Course in Machine Learning,' Simon Rogers and Mark Girolami. +There are many easy to follow step by step derivations of concepts that include calculus and linear algebra. Also, you can look at google book preview to see if it fits your needs.

+",2013-11-08 07:19:52.420 +59163,22415.0,2,,59161.0,,,,CC BY-SA 3.0,"

Okay, so, my first idea was using Jaynes' $A_p$ distribution, in which case we can define $A$ = next draw will be red and $N_r$ = out of $N$ draws, $r$ were red. With an ignorant prior distribution $(A_p|X) = 1$, I get that

+ +

$$(A_p|N_rX) = (A_p|X)\frac{P(N_r|A_p)}{P(N_r|X)}$$

+ +

We know that

+ +

$$P(N_r|A_p) = \binom{N}{r}p^r(1-p)^{N-r}$$

+ +

And we can find

+ +

$$P(N_r|X) = \int^1_0(N_rA_p|X)dp = \int^1_0P(N_r|A_p)(A_p|X)dp = \int^1_0\binom{N}{r}p^r(1-p)^{N-r}dp$$

+ +

from which

+ +

$$P(N_r|X) = \frac 1 {N+1}, 0 \leq r \leq N$$

+ +

And then

+ +

$$(A_p|N_r) = (N+1)\binom{N}{r}p^r(1-p)^{N-r}$$

+ +

And that looks like my distribution. Is that correct?
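
+ +

For what it's worth, the density above appears to be exactly the Beta$(r+1,\,N-r+1)$ density; a quick numerical check in R:

+ +

N <- 10; r <- 3; p <- seq(0, 1, by = 0.01)
+max(abs((N + 1) * choose(N, r) * p^r * (1 - p)^(N - r) - dbeta(p, r + 1, N - r + 1)))
+#effectively zero, so the two expressions agree
+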

+",2013-11-08 17:36:47.673 +59133,594.0,2,,356.0,,,,CC BY-SA 4.0,"

You can't really even compare the two since the Kolmogorov-Smirnov is for a completely specified distribution (so if you're testing normality, you must specify the mean and variance; they can't be estimated from the data*), while the Shapiro-Wilk is for normality, with unspecified mean and variance.

+ +

* you also can't standardize by using estimated parameters and test for standard normal; that's actually the same thing.

+ +

One way to compare would be to supplement the Shapiro-Wilk with a test for specified mean and variance in a normal (combining the tests in some manner), or by having the KS tables adjusted for the parameter estimation (but then it's no longer distribution-free).

+ +

There is such a test (equivalent to the Kolmogorov-Smirnov with estimated parameters) - the Lilliefors test; the normality-test version could be validly compared to the Shapiro-Wilk (and will generally have lower power). More competitive is the Anderson-Darling test (which must also be adjusted for parameter estimation for a comparison to be valid).
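
+ +

A small illustration in R (assuming the nortest package is available for the Lilliefors test):

+ +

set.seed(1)
+x <- rt(100, df = 5)                             #mildly heavy-tailed, non-normal data
+shapiro.test(x)$p.value
+nortest::lillie.test(x)$p.value                  #KS corrected for the estimated mean and sd
+ks.test(as.numeric(scale(x)), ""pnorm"")$p.value   #plain KS with estimated parameters: not valid (conservative)
+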

+ +
+ +

As for what they test - the KS test (and the Lilliefors) looks at the largest difference between the empirical CDF and the specified distribution, while the Shapiro Wilk effectively compares two estimates of variance; the closely related Shapiro-Francia can be regarded as a monotonic function of the squared correlation in a Q-Q plot; if I recall correctly, the Shapiro-Wilk also takes into account covariances between the order statistics.

+ +

Edited to add: While the Shapiro-Wilk nearly always beats the Lilliefors test on alternatives of interest, an example where it doesn't is the $t_{30}$ in medium-large samples ($n>60$-ish). There the Lilliefors has higher power.

+ +

[It should be kept in mind that there are many more tests for normality that are available than these.]

+",2013-11-08 08:10:48.110 +59134,21029.0,1,61392.0,,,Multiple resampling test/train dataset when choosing new models?,,CC BY-SA 3.0,"

I have been reading several posts on testing multiple models on the same dataset, which can lead to problems controlling type-1 errors. Mostly these posts have to do with data-mining on big datasets: How to draw valid conclusions from big-data and ensuring testing data doesn't influence training.

+ +

However, I know that this is done frequently. In fact, in my old work it was my job to find the best (logit) model given a dataset. In order to decide if a model is predictive or not, you have to validate against the test data. If the performance is poor, then you start from zero and create a new model. By the end you may have checked the test dataset dozens of times.

+ +

I was recently asked if multiple re-sampling of the dataset would be a possible solution. I wanted to say 'no,' but I don't actually know why this is bad. To give a specific example:

+ +

Suppose I am looking for the best linear regression given a dataset of 1,000 observations. I split the data into training and testing sets. I formulate a model which is not satisfactory on the testing sample. So, I redistribute the 1,000 observations into new training/test samples and attempt to find a new model. Each model will be trained and tested on their own specific training/test sample, which all come from the same 1,000 original observations.

+ +

My question: Why is this incorrect? What are the problems that are created with this methodology?

+",2013-11-08 08:23:59.613 +59135,1959.0,1,,,,Does full subset selection suffer from the same handicaps as stepwise regression?,,CC BY-SA 3.0,"

Let's assume $p$ potential predictor variables $X_1,...,X_p$ and a single dependent variable $Y$.

+ +

Now I evaluate the performance of all possible linear models considering all possible combinations of predictor variables ($2^p-1$ of them). The performance measure could be pretty much any statistic, but $R^2$, the $F$-statistic and MSE come to mind first. Based on them I select the ""best"" model, or a short list of top models which I can check out more closely.

+ +

Intuitively I would assume this is a great idea, but I read around a bit and came across the infamous concept of ""stepwise regression"" and how it is considered useless by many (though apparently not all) statistically trained people. The reason seems to be that the distribution assumed for the statistics involved does not hold in this scenario.

+ +

But stepwise regression is usually described as a slightly different algorithm where you start with a model, adding and removing variables from the model based on a criterion for a statistic.

+ +

So my question is whether the approach I describe would also be a type of stepwise regression and hence be handicapped by design. On the latter part (if it is SwReg) I would be interested in clarification on where the handicap comes into play and whether it is possible to amend it.

+",2013-11-08 09:09:18.163 +59136,9554.0,2,,59114.0,,,,CC BY-SA 3.0,"

In order to use a mixture of Gaussians for your problem, you have to assume that your three measurements are multivariate normal. In that setting, you have many measurements of a mixture of 3 dimensional normals, generated by k different underlying densities.

+ +

You could start by setting k=3: one for the ""/\ shape"", one for the ""\/ shape"", and one for the ""-- shape"" you mentioned having observed. This will allow you to model the underlying mixture distribution, estimate the covariance of each cluster and, of course, classify all measurements. Before comparing them, I would also think about subtracting the mean (and potentially also scaling), since you probably don't care about the general level of an individual, but about the change.

+ +

Please provide some details where you found the information about the ""quadratic curves"" you are mentioning, or why you think the Gaussian mixture would classify your measurements using a linear boundary, if you wish to follow up on that. Based on my understanding, the results are labeled based on the likelihood of being generated by a particular density (one of the k you started with). I am not aware of any condition that would restrict the boundary to be a polynomial in the general approach, or even suggest it could be so for that matter.

+ +

EDIT (in response to your comment):

+ +

The order in your measurements will be captured, since you will encode the morning, afternoon, and evening measurements for each individual as a 3-dimensional multivariate normal with unknown mean and unknown covariance. Imagine there are 3 such Gaussians - one for each characteristic shape of the measurements.

+ +

A gaussian mixture model is called a ""mixture"" model since it presumes that the measurements come from a probability, which is some weighted combination of the three underlying Gaussians. Each measurement can have a different weighting.

+ +

When you fit the model, you need to infer the parameters of all three Gaussians and simultaneously, infer the responsibility (weighting) of each Gaussian for having generated a particular measurement.

+ +

If you find this concept too hard to understand, or if my explanation is completely incomprehensible I would suggest you try a much simpler and much more straightforward approach: k-means.

+ +

You simply take the $3xN$ table of data, stick it in a k-means function and set the number of clusters to be 3. +I have written a snippet in R to illustrate:

+ +

First I generate some data along the lines of your description.

+ +
library(MASS)
+library(clue)
+library(mclust)    
+
+# Generate training data
+Sigma <- diag(3)
+mu1 <- c(3,0,3)
+mu2 <- c(0,0,0)
+mu3 <- c(0,3,0)    
+
+group1 <- mvrnorm(n = 5, mu1, Sigma)
+group2 <- mvrnorm(n = 5, mu2, Sigma)
+group3 <- mvrnorm(n = 5, mu3, Sigma)    
+
+# Generate test data
+new_measurements <- rbind(mvrnorm(n = 2, mu1, Sigma), mvrnorm(n = 2, mu2, Sigma),mvrnorm(n = 2, mu3, Sigma))    
+
+ +

Here is what it looks like: (I labeled the measurements according to the Gaussian which generated them purely for visual convenience. In your case, your data is not labeled.)

+ +
# Plot training data and cluster
+matplot(t(group1),type=""b"",col=2,lty=1,pch=1)
+matplot(t(group2),type=""b"",col=3,lty=1,pch=1,add=T)
+matplot(t(group3),type=""b"",col=4,lty=1,pch=1,add=T)
+
+ +

+ +

Now we can cluster the data using k-means, by setting the number of cluster to 3, since that is your intuition. (We could also try to investigate what is the right number of clusters using the data.)

+ +
# Cluster
+data <- rbind(group1,group2,group3)
+fit <- kmeans(scale(data), 3)
+# get cluster means 
+cluster_means <- aggregate(data,by=list(fit$cluster),FUN=mean)    
+
+ +

And we get the mean values for each cluster, which we can then use to make predictions for new measurements.

+ +
# Predict cluster
+fit$centers
+predicted_clusters <- cl_predict(fit,scale(new_measurements))
+predicted_clusters
+
+ +

We get the means of the clusters,

+ +
        [,1]       [,2]       [,3]
+1 -0.8618375 -0.6464979 -0.5279966
+2 -0.3605748  0.9793615 -0.5128992
+3  1.1221597 -0.6580355  1.0378763
+
+ +

as well as the predicted cluster for our new measurements.

+ +
Class ids:
+[1] 3 3 1 1 2 2
+
+ +

As you can see, k-means correctly predicted which cluster each new measurement belongs to, though the k-means estimates of the cluster means are relatively far off. The third cluster corresponds to our first simulated Gaussian with mean vector (3,0,3).

+ +

Lastly, we can plot the new data and the estimated means.

+ +
matplot(t(cluster_means[,-1]),type=""b"",col=3,lty=1,pch=20,lwd=2,ylim=c(-1,6))
+matplot(t(scale(new_measurements)),type=""l"",col=1,lty=1,add=T)
+legend(""topright"",legend=c(""New data"",""k-means Cluster Means""),lty=1,col=c(1,3,4))
+
+ +

+",2013-11-08 10:33:53.120 +59137,23519.0,1,,,,Missing data SPSS paired samples t-test,,CC BY-SA 3.0,"

I have approximately 20% of the data missing in my sample (n=3215). I aim to assess the pre-post differences on a psychometric scale. The post measures in particular are missing because of follow-up issues. What should I do? Should I exclude cases listwise/pairwise, or replace the missing values with the series mean or linear interpolation?

+ +

I don't have the SPSS Multiple Imputation/Missing values module installed on my SPSS.

+",2013-11-08 11:17:59.117 +59138,23520.0,2,,16366.0,,,,CC BY-SA 3.0,"

We know that parameters of LDA are estimated through Variational Inference. So

+ +

$\log p(w|\alpha, \beta) = E[\log p(\theta,z,w|\alpha,\beta)]-E[\log q(\theta,z)] + D(q(\theta,z)||p(\theta,z))$.

+ +

If your variational distribution is close enough to the original distribution, then $D(q(\theta,z)||p(\theta,z)) = 0$. So, $\log p(w|\alpha, \beta) = E[\log p(\theta,z,w|\alpha,\beta)]-E[\log q(\theta,z)]$, which is the likelihood.

+ +

$\log p(w|\alpha, \beta)$ approximates to the likelihood you got from the Variational Inference.

+",2013-11-08 11:23:00.013 +59151,20795.0,1,,,,What is the minimum historical data/sample data required for a time series forecasting analysis?,,CC BY-SA 3.0,"

Are there any statistical power analysis / sample size determination methods for time series data analysis and forecasting?

+ +

For example, if I have a time series of 30 data points, how can I confidently use a particular statistical method like exponential smoothing or ARIMA to predict the future?

+ +

I have seen a cursory mention in some textbooks that the number of historical data points required for ARIMA would be 50 or 60, but I have not encountered a formal approach to how much history is required for a particular time series forecasting method.

+ +

I did a thorough search of the major time series textbooks and the internet, but I'm unable to find any literature on this topic. Any guidance would be helpful.

+",2013-11-08 15:12:04.177 +59139,503.0,2,,59124.0,,,,CC BY-SA 3.0,"

Instead of using p-values to assess claims we should follow Robert Abelson's advice and use the MAGIC criteria:

+ +
Magnitude
+Articulation
+Generality
+Interestingness
+Credibility
+
+ +

For more on Abelson see my review of his book

+ +

And we should be concentrating on effect sizes, not p-values in statistical output (with the possible exception of some sorts of data mining, on which I am not expert at all). And effect sizes are to be judged in context:

+ +
1 in 1000 pairs of pants gets the wrong size label - not a big deal
+1 in 1000 airplanes are defective in a way that leads to crashes - a big deal
+1 in 1000 nuclear reactors is defective in a way that leads to meltdown - uh oh
+
+ +

A statistician/data analyst should not be some odd person, used like a black box into which data is put and out from which p values are gotten; he/she should be a collaborator in research designed to make a reasonable argument about the meaning of some set of data in the context of some field, given the current theories (or their lack) and current evidence (or lack of same).

+ +

Unfortunately, this approach requires thought on the part of the substantive researchers, the data analyst and whoever reviews the results (be it a pointy haired boss, a dissertation committee, a journal editor or whoever). Oddly, even academics seem averse to this sort of thought.

+ +

For more on my views, here is an article I wrote that got published in Sciences360.

+",2013-11-08 11:42:26.687 +59140,23476.0,1,,,,Calibration of Cox regression survival analysis,,CC BY-SA 3.0,"
  1. To perform calibration of a Cox regression model (i.e. assessing the agreement between the predicted and the observed outcome), what is the best method to present the accuracy of the model in predicting the actual event?

  2. As far as I understand, we can calculate the actual outcome probability by observing the number of events that occurred in a number of subjects with similar/same predicted probability from the Cox model. To perform the above calculation, do we stratify the predicted risk into several groups (<15%, 15-30%, 30-45% etc.), and within each risk group use the number of subjects as the denominator for the calculation of the actual outcome?

  3. What method do we use to compare the predicted outcome with the actual outcome? Is it good enough if we simply present the predicted and actual risk% in each risk group in table format? Can the rms package in R do all calibrations for you?

  4. Can we use pec::predictSurvProb() to give the absolute risk of event for each individual? Can we specify the time point for the risk/hazard function for each individual to be at the ENDPOINT of follow up?

  5. When interpreting the results, do we use the mean follow up period (in years) as the time point on which the predicted risk and actual risk are based? (E.g. Individual A has 30% risk of event at 6.5 years (mean follow up period))

  6. Is the goodness-of-fit test for Cox regression (Gronnesby and Borgan test) simply a means of calibration for Cox regression? Or does it mean something else?

  7. To compare models with net reclassification, how many subjects and outcomes do we need for such a method to become valid?
+",2013-11-08 12:36:14.390 +59141,12787.0,1,,,,Confidence intervals and central estimates for a functional of an estimated function with uncertain parameters,,CC BY-SA 3.0,"

I've got a problem that is leading me to dip my toes into Bayesian stats, and I've got a question about confidence (or, I suppose, credible) intervals:

+ +

Say you want to know how $X$ maps to $y$. You fit a model $y=f(X)+\epsilon$.

+ +

Then, you want to optimize $X$ to get the best $y$: $$y_{\max} = \arg\max_X \big(\hat{f}(X)\big), \quad \text{s.t. whatever constraints}$$

+ +

This gives you the model's best estimate of the optimal $X$ for getting the biggest $y$.

+ +

But obviously $\hat{f}(X)$ is uncertain. If you take a Bayesian standpoint that $\beta$ is distributed multivariate normal, you can take samples from it, which gives new coefficients (see, for example, this). Taking many samples, using them to pick new optimal values of $X$, one gets a distribution of $y_{max}$ that reflects uncertainty in $\hat{f}(X)$.

+ +

Here is the problem: the central estimate of $y_{max}$ (i.e.: optimizing based on the parameter estimates of the fitted model) is not necessarily the mean or the median of the $y_{max}$ distribution that one gets when optimizing the functions based on the posterior draws.

+ +

So what should I do with the ""central estimate""? Which estimate should I consider to be my ""best guess"" of the value of $y_{max}$? Should it be $y_{max}$ at the (ML) parameter estimates? Should it be the mean or median of the posterior simulations of $y_{max}$?

+ +

I don't know whether there is a right answer here: maybe this is a somewhat of a philosophical question? (Or am I making some relatively fundamental mistake, which makes my whole question moot? If so, I'd be grateful for replies that point it out.)

+",2013-11-08 13:14:33.773 +59142,449.0,2,,59137.0,,,,CC BY-SA 3.0,"

There is nothing you can do without more data than just a list of pairs of numbers with some missing. It might be useful to really consider what getting all of these missing values might mean though. I'm doubtful you will gain much at all in going through the process of trying to make the imputation.

+ +

Consider the mathematical impact of this on your standard error, which is controlled by the square root of N. If you had all 3215 observations then the sqrt of N is 57. If you lose 20% it's 51. That's only a difference of about 11% in terms of what your standard error will be and consequently, smaller impact on your t than you might think adding in ~643 subjects would yield. Your effect, correlation, and variance estimates should all be pretty stable by the time the N is that high so those won't change much at all, and not in any predictable direction.
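A quick sanity check of that arithmetic (just a sketch, using the n = 3215 and 20% missingness stated above):

    n_full    <- 3215
    n_reduced <- round(0.8 * n_full)        # roughly 2572 complete cases
    sqrt(n_full)                            # about 57
    sqrt(n_reduced)                         # about 51
    1 - sqrt(n_reduced) / sqrt(n_full)      # ~11% relative change in the standard error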

+ +

In other words, getting all of this data could change your t from 2 to 2.2. Given that any imputation is fraught with complication in explanation and limitations in your conclusions, is that worth it to you?

+ +

A further thing to consider is that if you don't have a significant effect already, with the N you have, but you believe the effect really exists, then you have a really small effect. Let's say you currently have a non-significant t. If that's true then Cohen's D is under 0.04. Is that a value that's meaningful in light of your theory?

+ +

Of course, this assumes there's no bias from the missing data. That could change things. In that case then perhaps you should strive to impute but you'd need some argument to support the idea. That would require knowledge about those subjects or hypotheses that are supported externally that strongly support an expectation of bias. If you do have a strong reason to expect that your missing subjects are part of a group that will bias the data then perhaps that should be a variable in the analysis.

+",2013-11-08 13:33:20.420 +59143,23524.0,1,,,,Comparing two ordinal regression models,,CC BY-SA 3.0,"

I am trying to find out if I can somehow assess if one model fits my data significantly better than another model? They are both ordinal regressions and we have introduced an additional interaction term in the second model. Now we want to know if the model has improved. Do I only look at the model fit indices? How do I know if the model has significantly improved?

+",2013-11-08 14:00:25.287 +59152,9408.0,2,,59147.0,,,,CC BY-SA 3.0,"

The major difference between time series data and cross-sectional data is that the former focuses on results gathered over an extended period of time, often within a small area, whilst the latter focuses on information received from surveys and opinions at a particular point in time, in various locations, depending on the information sought. Moreover, GDP in one period is likely to be correlated with GDP in the next period, and so on; from a cross-sectional point of view, you ignore this correlation. For your problem, I guess you will be trying to see how GDP is affected by employment over time, so that you can also estimate the future scenario.

+",2013-11-08 15:17:06.703 +59144,2161.0,1,59148.0,,,Can and should you use data from repeated responses in a linear mixed-effects model?,,CC BY-SA 3.0,"

When you collect data from participants in an experiment, sometimes you can collect repeated responses for the same condition, e.g., in R:

+ +
set.seed(2012) # keep the example the same each time.
+
+data.full <- data.frame(id=gl(10, 4),
+                        condition=gl(2, 40),
+                        response=c(rnorm(40), rnorm(40, 1)))
+head(data.full)
+
+# Output:
+#   id condition    response
+# 1  1         1 -0.77791825
+# 2  1         1 -0.57787590
+# 3  1         1  0.66325605
+# 4  1         1  0.08802235
+# 5  2         1  1.25707865
+# 6  2         1 -0.62977450
+
+ +

To analyse this (i.e. does condition predict response) I would normally take the mean response for each participant, for each condition. I would do this on the basis that we are supposed to be generalizing from a sample to a population, i.e. there should be one 'estimate' response from each participant for each condition, and the collection of these single responses (for each condition) is our sample, then we do an analysis which generalizes to the population.

+ +

I would transform the data e.g. like this:

+ +
library(plyr)
+data.means <- ddply(data.full, .(id, condition),
+                    summarize,
+                    mean.response=mean(response))
+head(data.means)
+
+# Output:
+#   id condition mean.response
+# 1  1         1    -0.1511289
+# 2  1         2     0.8658770
+# 3  2         1     0.1510842
+# 4  2         2     0.0129323
+# 5  3         1     0.1857577
+# 6  3         2     0.9859697
+
+ +

And then proceed with the within-subjects analysis (note the same process would apply if there were more conditions or a 2x2 design etc.), e.g.:

+ +
aov1 <- aov(mean.response ~ condition + Error(id/condition), data=data.means)
+summary(aov1) # F = 4.2, p = .07, not significant
+
+ +

However, I've been told that with linear mixed-effects models, you can include all the underlying data on the basis that the lme models can include correlated data. My understanding was that they could include correlated data meant they could include responses from the same participants (within-subjects effects modelled as random effects), not that you could include the underlying data that gives the participant response estimate.

+ +

My question is, can you include the underlying data collected from the multiple responses of each participant in the same condition, i.e. can you do this:

+ +
library(nlme)
+lme1 <- lme(response ~ 1, random= ~ 1|id/condition, data=data.full, method=""ML"")
+lme2 <- update(lme1, .~. + condition)
+anova(lme1, lme2)
+
+# X(1) = 3.19, p = .07, not significant
+
+ +

Or should you do this:

+ +
lme1 <- lme(mean.response ~ 1, random= ~ 1|id/condition, data=data.means, method=""ML"")
+lme2 <- update(lme1, .~. + condition)
+anova(lme1, lme2)
+
+# X(1) = 5.25, p = .02, significant
+
+ +

Which is the correct approach?

+",2013-11-08 14:01:43.553 +59145,21762.0,2,,59129.0,,,,CC BY-SA 3.0,"

The answer is ""Yes"". This is Simpson's paradox applied to mean differences instead of odds ratios. You can read Wiki's article (http://en.wikipedia.org/wiki/Simpson%27s_paradox) to understand the mechanisms behind it. It's a projection problem: If you only see a two dimensional projection of a three dimensional object, you can get quite a wrong impression about the whole picture. In balanced settings (equal group sizes), this is not possible.

+ +

Consider, for instance, the following simple setting:

+ +
  • $A_1$ consists of 99 times the value 1
  • $A_2$ consists of the value 100
  • $B_1$ consists of the value -9
  • $B_2$ consists of the value 99
+ +

The average of $A = A_1 \cup A_2$ is about 2 and thus much smaller than the average 45 of $B = B_1 \cup B_2$. On the other hand, the average 1 of $A_1$ is larger than the average -9 of $B_1$. Similarly, the average 100 of $A_2$ is larger than the average 99 of $B_2$.
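A two-line R check of this toy example (numbers taken directly from the list above):

    A1 <- rep(1, 99); A2 <- 100; B1 <- -9; B2 <- 99
    c(mean(c(A1, A2)), mean(c(B1, B2)))              # overall: A is about 2, B is 45
    c(mean(A1) > mean(B1), mean(A2) > mean(B2))      # within groups: A is larger both times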

+",2013-11-08 14:02:03.623 +59146,16665.0,1,,,,Understanding formula for the standardized selection ratio (SSR),,CC BY-SA 3.0,"

Manly et al. defined an index called the Standardized Selection Ratio (SSR). Here is the source. I don't quite understand how this index is calculated and how the p-values are calculated. Can you help me? The interesting part starts at page 40 of the book (and ends a few pages later).

+ +

Below is an example of an article where this index was used. They described the SSR index but it doesn't make much sense to me either.

+ +
+ +

In this article they used the Standardized Selection Ratio (SSR) in order to determine the preferred host (anemone) of anemonefish.

+ +

Here is a quotation coming from their methods:

+ +
+

The “preferred host” of anemonefish was assessed calculating the “Standardized + Selection Ratio (SSR)” (values between 0 and 1) (Manly et al. 1993). Manly’s standardized + selection ratio represents the probability that an individual will use a particular habitat + type, taking into account the different resource availability. For each anemonefish species ($i$) inhabiting an anemone species ($j$), SSR was calculated as:

+
+ +

$$SSR = \frac{w_i}{\sum{w_j}} $$

+ +

where $w_i = \frac{o_i}{p_j}$

+ +
+

$o_i$ is the relative frequency of the anemonefish species $i$ and $p_j$ the relative frequency + of the anemone species $j$. Higher values of SSR indicate a strong preference for the + selected resource. The Log- Likelihood statistic ($\chi^2$L) (Manly et al. 1993) was used to + check the significance of the observed distribution under a null hypothesis of a random + host choice.

+
+",2013-11-08 14:09:38.360 +59147,23525.0,1,,,,Time series as cross-sectional data,,CC BY-SA 3.0,"

I have time series, for example, gdp and unemployment(unemp), freq= 4.

+ +

What if I interpret it as cross-sectional data and do cross-sectional analysis instead of time series?

+ +

My task is to test how unemployment affects gdp.

+ +

Is it allowed to do that kind of analysis?

+ +

Do the coefficients in the model lm(gdp~unemp) have an economic explanation?

+",2013-11-08 14:46:04.277 +59148,449.0,2,,59144.0,,,,CC BY-SA 3.0,"

Not only can you use the repeated measure, you should. You'll note that the mixed model doesn't dramatically reduce the standard error when you include lots of repeated responses. That's a hint that it's at least not doing the traditionally wrong thing. You don't have to identify these multiple responses any special way in the formula.
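As a minimal sketch of that point (reusing the data.full and data.means objects constructed in the question; the exact random-effects structure for the aggregated data is a judgment call), you can compare the fixed-effect standard errors directly:

    library(nlme)
    fit.full  <- lme(response ~ condition, random = ~ 1 | id/condition,
                     data = data.full)       # all repeated responses
    fit.means <- lme(mean.response ~ condition, random = ~ 1 | id,
                     data = data.means)      # one mean per participant per condition
    summary(fit.full)$tTable                 # note the standard error for condition
    summary(fit.means)$tTable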

+",2013-11-08 14:55:33.263 +59149,23523.0,1,,,,Detecting outlier cash movements,,CC BY-SA 3.0,"

If I'm watching a series of accounts for transactions going in and transactions going out, I want to notice unusually large or small transactions for any particular account on any particular day.

+ +

So if account A typically moves a few hundred dollars and one day moves five thousand dollars, that's a clear outlier. +If account B typically moves a few million dollars in or out and one day moves 20 million dollars, that's a clear outlier.

+ +

What I'd like to do is present a measure that should highlight outliers - I was thinking number of standard deviations relative to a rolling window of the last 60 days, but I'm wondering if that's correct. I'm checking to see if it's a Gaussian distribution, but are there better ways to get at what I'm looking for?
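One way to prototype that measure (a sketch only, with a hypothetical numeric vector amount holding one account's daily net movement; the window length and threshold are assumptions):

    library(zoo)
    roll_mean <- rollapplyr(amount, width = 60, FUN = mean, fill = NA)
    roll_sd   <- rollapplyr(amount, width = 60, FUN = sd,   fill = NA)
    z_score   <- (amount - roll_mean) / roll_sd
    which(abs(z_score) > 3)   # candidate outlier days; note the window here includes the current day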

+ +

I think this poses a different set of questions than Robust outlier detection in financial timeseries.

+",2013-11-08 15:01:45.843 +59150,16474.0,2,,59143.0,,,,CC BY-SA 3.0,"

The equivalent null hypothesis is that the coefficient for the interaction term is 0. If that hypothesis is true, then your model with the interaction effect is exactly the same as your model without the interaction effect, and adding the interaction effect has thus added nothing to the model fit.

+ +

In your output there will be, next to the interaction term, a test of whether or not that coefficient is 0, so that alone will be enough to answer your question. This is a Wald test. If you insist on comparing models you can do a likelihood ratio test. The Wald test and the likelihood ratio test will give the same answer in large samples.
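As an illustration only (a sketch assuming proportional-odds models fitted with MASS::polr on a hypothetical data frame mydata with an ordered outcome y and predictors x1, x2):

    library(MASS)
    fit0 <- polr(y ~ x1 + x2, data = mydata, Hess = TRUE)   # without the interaction
    fit1 <- polr(y ~ x1 * x2, data = mydata, Hess = TRUE)   # with the interaction
    anova(fit0, fit1)   # likelihood ratio test of the added interaction term
    summary(fit1)       # t-values give the Wald-type test for each coefficient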

+",2013-11-08 15:04:33.947 +59162,23529.0,1,,,,Do interactions in mixed designs inflate the main effect of fixed factor?,,CC BY-SA 3.0,"

Can someone explain in a clear way why, in a mixed design, the presence of an interaction between a random and a fixed factor inflates the estimate of the main effect of the fixed factor?

+",2013-11-08 17:22:49.013 +59153,2149.0,2,,59149.0,,,,CC BY-SA 3.0,"

What you have to do is develop a reasonable model that may incorporate parameters reflecting day-of-the-week effects, changes in day-of-the-week parameters, week-of-the-year, month-of-the-year, week-of-the-month, day-of-the-month, and activity around known events like holidays. The model should detect and incorporate level shifts and local time trends while being robust to (i.e. unaffected by) pulses. It should also detect both parameter changes and changes in the error variance and incorporate remedies. We have been doing this for banking clients (ATM machines and elsewhere) since 2002 using AUTOBOX (http://www.autobox.com), a piece of software that I have helped develop. If you wish to post your data (or a coded version of your data), please do so and I will submit it to AUTOBOX and then post the results. If you don't wish to post your data then contact me at my email address. At a minimum you might want to look at http://www.autobox.com/cms/index.php/afs-university/intro-to-forecasting/doc_download/53-capabilities-presentation as slides 44-55 speak directly to your problem.

+",2013-11-08 15:26:51.573 +59154,23195.0,1,,,,Suggestion for doing research in data mining and machine learning,,CC BY-SA 3.0,"

I am a fresh graduate student. I want to do research in machine learning and data mining. There is no professor in our department doing this!

+ +

I want to try to do this by myself, at least for a while. But I don't know where I should start. What books or review papers should I read at the beginning?

+",2013-11-08 15:28:34.533 +59155,306.0,2,,59154.0,,,,CC BY-SA 3.0,"

First of all, learn a language like R or Python in which you can implement things as you read about them. Then finish the book called The Elements of Statistical Learning. Then pick up the chapter you liked the most and start googling; you will find what you are looking for.

+",2013-11-08 15:37:27.850 +59156,23404.0,1,,,,Interpretation of log-level difference-in-differences specification,,CC BY-SA 3.0,"

When I run a standard difference in differences specification with a log-transformed dependent variable like:

+ +

$$\log(Outcome_{it}) = \beta_1 + \beta_2Treat_i +\beta_3Post_t +\beta_4(Treat\times Post)_{it} +\varepsilon_{it}$$

+ +

How do I interpret the coefficient $\beta_4$?

+ +

Normally in log-level models, I would use the approximation $\%\Delta y= 100\beta_4\Delta x$, but this approximation is only valid for small changes in $x$ (and small $\beta$). In my case $x$ is a dummy variable and as such either $0$ or $1$ ($Treat\times Post$). Is this change considered to be 'small', or do I have to use $\%\Delta y=100(e^{\beta_4}-1)$ for interpreting the coefficient?

+",2013-11-08 16:03:25.230 +59157,2352.0,2,,9524.0,,,,CC BY-SA 3.0,"

The documentary about Andrew Wiles proof of Fermat's Last Theorem is fantastic: +http://www.pbs.org/wgbh/nova/proof/

+ +

Available on youtube: +http://www.youtube.com/watch?v=7FnXgprKgSE

+",2013-11-08 16:12:46.300 +59158,22644.0,2,,59154.0,,,,CC BY-SA 3.0,"

I'd recommend taking a relevant course at Coursera. There you can explore a myriad of topics with a good dose of flexibility. Just do a search there for relevant terms: machine learning, statistics, data science, data mining, etc.

+ +

You have an instructor and peers you can interact with, you get assigned homework (that will likely be graded) and of course, you get relevant reading material and recommendations. These are well prepared courses that are available for free so take advantage of them.

+",2013-11-08 16:12:49.973 +59159,23526.0,1,,,,How to determine a good simulation time?,,CC BY-SA 4.0,"

So I'm doing network simulations, and I'd like to know how long each simulation run needs to be. My network is quite simple: it is composed of multiple M/M/1/H queues (Markovian processes + finite waiting queues):

+ +
  • clients arrive at some node(s) and are put into waiting queues
  • each node processes one client at a time
  • a processed client is sent to another node, or goes out of the system
  • all random processes follow a Poisson distribution
+ +

The finite waiting queues prevent it from being a Poisson process: the output of a node is no longer Poisson, so neither will be the inputs of the nodes after it.

+ +

I thought I could try to plot the variance and mean of the waiting time (inside the queues) and see when they become ""stable"". Would that be a good solution? How else could I do this?

+",2013-11-08 16:15:51.660 +59160,5821.0,2,,59140.0,,,,CC BY-SA 3.0,"
  1. Cox models do not predict outcomes! ""Best"" methods depend on whether you obtain a risk score (as with Framingham) or absolute risk (as with the Gail breast cancer risk model). You need to tell us exactly what you're fitting.

  2. With absolute risk prediction, you can split groups according to their risk deciles and calculate proportions of observed vs. expected outcome frequencies (see the sketch after this list). This is basically the Hosmer-Lemeshow test. But, in order to use this test, you need to have an absolute risk prediction! You cannot, say, split the groups by risk score deciles and use the empirical risk as the risk prediction; this strips off too much information and leads to some counterintuitive results.

  3. The Bioconductor project has a suite of tools related to ROC analyses, predictiveness curves, etc.

  4. Nowhere in Ulla's package is mention made of estimating smoothed baseline hazard estimates. This is necessary to obtain risk predictions from survival models... because of censoring! Here's an example of that method being applied. I would accept no less from the package.

  5. No, don't use mean follow up. You should report total person-years of follow-up, along with the censoring rate and event rate. The Kaplan-Meier curve kinda shows you all of that.

  6. I'm sure Sir David Cox is not fond of G&B's test. The power of the Cox model is that it can give consistent inference without necessarily having predictive accuracy: a tough concept for many to grasp. Tsiatis' book ""Semiparametric Inference"" has a lot to say about this. However, if you aim to take the Cox model one step further and create predictions from it, then I think the G&B test is very good for that purpose.

  7. Reclassification indices are proportions of individuals being shuffled into different (more discriminating) risk categories comparing two competing risk prediction models (see Pencina). It's important to realize (Kerr 2011) that you can calculate confidence intervals for this value... not using the bootstrap (or any limit theory treating the model as fixed) but using the double bootstrap (bootstrap sample, refit model, bootstrap sample again, calibrate models).
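For point 2, a hedged sketch of a resampling-based calibration curve with the rms package (a hypothetical data frame d with Surv(time, status), predictors x1 and x2, and an assumed evaluation time of 365 days):

    library(rms)
    dd <- datadist(d); options(datadist = 'dd')
    f <- cph(Surv(time, status) ~ x1 + x2, data = d,
             x = TRUE, y = TRUE, surv = TRUE, time.inc = 365)
    cal <- calibrate(f, u = 365, B = 200)   # observed vs predicted survival at day 365
    plot(cal)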
+",2013-11-08 16:47:04.433 +59161,22415.0,1,,,,Urn probability function,,CC BY-SA 3.0,"

Suppose I have an urn with an infinite number of balls which can be either red or white. I do not know what the proportion of each colour is, but I do know it's a fixed proportion. After drawing $N$ balls, I have observed $r$ red ones and $w$ white ones.

+ +

I believe the probability that I will observe a red ball on the next draw is given by Laplace's Law of Succession, $\frac{r+1}{N+2}$. However, how sure should I be of that? That is, before I drew any balls, I believed any proportion other than $0$ or $1$ was the true one. After I drew those $N$ balls, what should be my estimate pdf over the possible values for the proportion of red balls in the urn?

+",2013-11-08 17:11:12.443 +59164,23414.0,2,,59134.0,,,,CC BY-SA 3.0,"

The concept that an out-of-bag estimate may be useful in choosing a model is standard practice and sounds reasonable.

+ +

I think a fair number of people will get hung up on the practical issues with your method: (1) What is the motivation? Or is this just theoretical? It seems like this would be far more complicated than just using standard model-building approaches. (2) The model-building process you describe isn't how many might use validation. Usually it is used either to assess whether one has overfit the data or to compare completely different modelling approaches. I'm not convinced that using validation as part of an iterative model-building process makes sense for a logistic model, though this practice may be common for other models (or even built into the model, as with MARS/EARTH). (3) You're using split-sample validation in a fairly small sample, so the estimates are likely to be unreliable. You may want to increase n to 10,000 or 20,000 to get better answers to the question. (4) As you start making adjustments to your method as a result of (3) above, you'll find you're describing LOOCV or k-fold validation.
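For reference, a minimal k-fold sketch for a logistic model (a hypothetical data frame d with a 0/1 outcome y and predictors x1, x2; the misclassification cost is just one possible choice):

    library(boot)
    fit  <- glm(y ~ x1 + x2, family = binomial, data = d)
    cost <- function(y, phat) mean(abs(y - phat) > 0.5)   # misclassification rate
    cv.glm(d, fit, cost = cost, K = 10)$delta             # 10-fold cross-validated estimate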

+",2013-11-08 17:43:15.263 +59165,13303.0,1,,,,Robustly standardize residuals in MM regression,,CC BY-SA 3.0,"

Does anyone know how we can robustly standardize the residuals in MM regression? First we perform MM regression and then obtain the residuals: how can we robustly standardize the residuals obtained from MM regression? I have found the method for least median of squares (LMS) and least trimmed squares (LTS), in which the scale of the errors is estimated using a formula and the residuals are then divided by that estimated scale. But for MM regression I could not find a formula for estimating the scale of the errors in order to standardize the residuals.

+",2013-11-08 18:03:57.370 +59166,3993.0,2,,57508.0,,,,CC BY-SA 3.0,"

I have discovered that the regularity I described in my question has in fact been written about by several authors in the literature on Design of Experiments (DoE). It has been called the ""hierarchical ordering principle"" and also sometimes the ""sparsity-of-effects principle.""

+ +

In the chapter on fractional factorial designs in Montgomery (2013, p. 290), he writes:

+ +
+

The successful use of fractional factorial designs is based on three key ideas:

+ +
  1. The sparsity of effects principle. When there are several variables, the system or process is likely to be driven primarily by some of the main effects and low-order interactions.
+ +

...

+
+ +

Wu & Hamada (2000, p. 143) instead call this the ""hierarchical ordering principle"", and use the phrase ""sparsity of effects"" to refer to a related but distinct observation:

+ +
+

Three fundamental principles for factorial effects:

+ +

Hierarchical ordering principle: (i) Lower order effects are more likely to be important than higher order effects, (ii) effects of the same order are likely to be equally important .

+ +

Effect sparsity principle: The number of relatively important effects in a factorial experiment is small.

+ +

...

+
+ +

Li, Sudarsanam, & Frey (2006, p. 34) give two possible explanations for why hierarchical ordering should tend to occur. First they suggest that it is ""partly due to the range over which experimenters typically explore factors"":

+ +
+

In the limit that experimenters explore small changes in factors and to the degree that systems exhibit continuity of responses and their derivatives, linear effects of factors tend to dominate. Therefore, to the extent that hierarchical ordering is common in experimentation, it is due to the fact that many experiments are conducted for the purpose of minor refinement rather than broad-scale exploration

+
+ +

They next suggest that it is ""partly determined by the ability of experimenters to transform the inputs and outputs of the system to obtain a parsimonious description of system behavior"":

+ +
+

For example, it is well known to aeronautical engineers that the lift and drag of wings is more simply described as a function of wing area and aspect ratio than by wing span and chord. Therefore, when conducting experiments to guide wing design, engineers are likely to use the product of span and chord (wing area) and the ratio of span and chord (the aspect ratio) as the independent variables

+
+ +

References

+ +
  • Li, X., Sudarsanam, N., & Frey, D. D. (2006). Regularities in data from factorial experiments. Complexity, 11(5), 32-45.
  • Montgomery, D. C. (2013). Design and analysis of experiments (Vol. 8). New York: Wiley.
  • Wu, C. J., & Hamada, M. S. (2000). Experiments: Planning, analysis, and optimization (Vol. 552). John Wiley & Sons.
+",2013-11-08 18:06:59.943 +59167,1298.0,1,,,,What is the dimension (or units) of a CDF and PDF?,,CC BY-SA 3.0,"

Given a continuous random variable $X$, what are the units of the PDF and CDF of $X$?

+",2013-11-08 18:10:39.253 +59168,23492.0,1,,,,What are the most commonly used predictive models when dealing with binary data?,,CC BY-SA 3.0,"

I know everybody uses logistic regression as the starting point, but I'm curious to know: What are the other commonly used predictive models when data is primarily binary?

+",2013-11-08 18:24:49.283 +59169,450.0,2,,59165.0,,,,CC BY-SA 3.0,"

The robust scale is normally output by the routine you used to estimate the MM. +For example, in R:

+ +
library(robustbase)
+data(starsCYG)                                        # Hertzsprung-Russell star data; has log.Te and log.light
+set.seed(0)
+RlmST <- lmrob(log.light ~ log.Te, data = starsCYG)   # MM-estimator fit
+RlmST$scale                                           # robust estimate of the residual scale
+
+ +

There is no explicit formula to compute it: it's the result +of an iterative scheme.

+",2013-11-08 18:46:51.427 +59170,22564.0,2,,59124.0,,,,CC BY-SA 3.0,"

First, I am not a statistician, just a researcher who has looked into this a lot over the last few years to figure out why the methods I observe being used around me are so lacking and why there is so much confusion about basic concepts like ""what is a p-value?"". I will give my perspective.

+ +
+

First, one clarification question:

+ +

The Time magazine wrote,

+ +
""A power of 0.8 means that of ten true hypotheses tested, only two will be ruled out > because their effects are not picked up in the
+
+ +

data;""

+ +

I am not sure how this fits with the definition of the power function I found in my textbook, which is the probability of rejecting the null as a function of the parameter θ. With different θ we have different power, so I don't quite understand the above quote.

+
+ +

Power is a function of θ, the variance, and the sample size. I am not sure what the confusion is. Also, for many cases in which significance testing is used, a null hypothesis of mean1 = mean2 is always false; in these cases significance is only a function of sample size. Please read Paul Meehl's ""Theory-Testing in Psychology and Physics: A Methodological Paradox""; it clarified many things for me and I have never seen an adequate response to it. Paul Meehl has a few other papers on this that you can find by searching his name.

+ +
+

In my field of political science / economics, scholars simply use up all the country-year data available. Thus, should we not be concerned with sample fiddling here?

+
+ +

If you read the Simmons 2011 paper this is only one of the ""p-hacking"" techniques mentioned. If it is true that there is only one data set and no one picks out selective samples from it then I guess there is no room for increasing sample size.

+ +
+

Can the problem of running multiple tests but reporting only one model be fixed simply by the fact that someone else in the discipline will re-test your paper and strike you down immediately for not having robust results? Anticipating this, scholars in my field are more likely to include a robustness check section, where they show that multiple model specifications do not change the result. Is this sufficient?

+
+ +

If replication was occurring without publication bias there would be no need for ""journals of the null result"". I would say the robustness check section is good to have but is not sufficient in the presence of researchers failing to publish what they consider null results. Also I would not consider a result robust just because multiple analysis techniques on the same data come to the same conclusion. A robust result is one that makes a correct prediction of effect/correlation/etc on new data.

+ +

A replication is not getting p<0.05 both times. The theory should be considered more robust if it predicted a different effect/correlation/etc than used in the first study. I do not refer to the presence of an effect or correlation, but the precise value or a small range of values compared to possible range of values. The presence of increased/decreased effect or positive/negative correlation are 100% likely to be true in the case of the null hypothesis being false. Read Meehl.

+ +
+

Andrew Gelman and others raise the point that no matter the data, it would always be possible to find and publish some ""pattern"" that isn't really there. But this should not be a concern, given the fact that any empirical ""pattern"" must be supported by a theory, and rival theories within a discipline will just engage in a debate / race to find which camp is able to find more ""patterns"" in various places. If a pattern is truly spurious, then the theory behind it will be quickly struck down when there is no similar pattern in other samples / settings. Isn't this how science progresses?

+
+ +

Science cannot function properly if researchers are failing to publish null results. Also just because the pattern was not discovered in the second sample/setting does not mean it does not exist under the conditions of the initial study.

+ +
+

Assuming that the current trend of journals for null results will actually flourish, is there a way for us to aggregate all the null and positive results together and make an inference on the theory that they all try to test?

+
+ +

This would be meta-analysis. There is nothing special about null results in this case other than that researchers do not publish them because the p-values were above the arbitrary threshold. In the presence of publication bias meta-analysis is unreliable as is the entire literature suffering from publication bias. While it can be useful, meta analysis is far inferior for assessing a theory than having that theory make a precise prediction that is then tested. Publication bias does not matter nearly as much as long as new predictions pan out and are replicated by independent groups.

+",2013-11-08 18:55:14.857 +59171,17740.0,2,,59168.0,,,,CC BY-SA 3.0,"

When the data is entirely binary I'd say association rule learning (aka affinity analysis or market basket analysis) and then learning a decision tree based on the result (a whole bunch of association rules).

+ +

Association rule learning attempts to find associations between predictors. The result of such an analysis is a set of rules (e.g. A ^ B) with an associated support (number of occurrences) and confidence. The amount of possible rules is exponential in terms of the amount of predictors and maximum rule length.

+ +

Subsequently it's common to learn models like decision trees from this (giant) set of rules.
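A hedged sketch of the first step with the arules package (assuming a hypothetical binary 0/1 predictor matrix X; the support and confidence thresholds are arbitrary):

    library(arules)
    trans <- as(as.matrix(X) == 1, 'transactions')   # rows = cases, columns = binary items
    rules <- apriori(trans, parameter = list(supp = 0.05, conf = 0.8))
    inspect(sort(rules, by = 'confidence'))          # the mined rules, strongest first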

+",2013-11-08 19:15:57.177 +59172,23534.0,1,,,,"non-normal data for two-way ANOVA, which transformation to choose?",,CC BY-SA 3.0,"

I need to perform a two-way ANOVA on my data, which come from a non-normal population. Apparently there is no two- or three-factor test for non-normal populations. I realize I need to transform my data, but I'm unsure which transformation to perform; I don't know what the criteria are for choosing one from the list of possible transformations.

+",2013-11-08 19:34:53.347 +59173,20870.0,2,,57508.0,,,,CC BY-SA 3.0,"

In general, I agree with the original hypotheses that higher-order terms are often associated with smaller variances. But, this also depends on the type of data.

+ +

In plant breeding, a rule of thumb (Gauch, 1996, page 90) for multi-environment trials is that the variation in the data is roughly 70% location, 20% location-by-variety, and 10% variety.

+ +

Very approximate, but it is fairly consistent that the higher-order term ""location-by-variety"" variance is larger than the main-effect ""variety"" variance.

+ +

Ref: H G Gauch and R W Zobel, 1996. Book: Genotype by Environment Interaction. Chapter: AMMI analysis of yield trials. CRC Press.

+",2013-11-08 19:52:22.647 +59174,5045.0,2,,59156.0,,,,CC BY-SA 4.0,"

You should treat the interaction variable as a dummy and follow this advice from David Giles:

+

If $Treat\cdot Post$ switches from 0 to 1, the % impact on $Y$ is $100 \cdot (\exp(\beta_4 - \frac{1}{2} \hat \sigma_{\beta_4}^2)-1).$
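Computed from a fitted model, that correction looks like this (a sketch with a hypothetical data frame d containing the log outcome ly and 0/1 dummies treat and post):

    fit <- lm(ly ~ treat * post, data = d)
    b4  <- coef(fit)['treat:post']
    v4  <- vcov(fit)['treat:post', 'treat:post']
    100 * (exp(b4 - 0.5 * v4) - 1)   # bias-adjusted % impact when the interaction dummy switches on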

+",2013-11-08 19:54:42.737 +59175,17573.0,2,,59156.0,,,,CC BY-SA 3.0,"

What's required is that $\beta \cdot \Delta x$ be small. If you know that $\Delta x$ is 1, then that means that $\beta$ has to be small. How small? The true proportionate change in $Outcome$ when the dummy rises by $1$ is $\exp(\beta)-1$. The approximate change is $\beta$. The error from the approximation is:

+ +

\begin{equation} +\textrm{Error} = \exp(\beta)-1-\beta +\end{equation}

+ +

For small $|\beta|$, this is pretty small. For example, for $\beta=0.1$ (approximate 10% change), the true percent change in $Outcome$ when the dummy turns on is 10.5%. Given the usual standard errors in empirical work, I'm happy to ignore this. By the time you get to a $\beta$ of 0.2 (approximate 20%), the true percent change is 22%. Willing to ignore this much approximation error? Again, I am, but you may not be. This is now a 10% approximation error. By the time you get to $\beta=0.3$, the true percent change in outcome is 35% rather than 30%, and I am not happy to ignore this any more.

+ +

So, my rule of thumb is to ignore this approximation error for $|\beta|<0.2$ and worry about it for $\beta$ bigger than that.
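The numbers above can be reproduced in a couple of lines of R:

    beta <- c(0.1, 0.2, 0.3)
    cbind(beta, approx_pct = 100 * beta, true_pct = 100 * (exp(beta) - 1))   # 10.5, 22.1, 35.0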

+",2013-11-08 19:59:26.343 +59176,23171.0,1,59199.0,,,"Find the distribution of (X, X+Y) when X and Y have a given joint Normal distribution",,CC BY-SA 3.0,"

Let random variables $X$ and $Y$ be independent Normal with distributions $N(\mu_{1},\sigma_{1}^2)$ and $N(\mu_{2},\sigma_{2}^{2})$. Show that the distribution of $(X,X+Y)$ is bivariate Normal with mean vector $(\mu_{1},\mu_{1}+\mu_{2})$ and covariance matrix

+ +

$$ \left( \begin{array}{ccc} +\sigma_{1}^2 & \sigma_{1}^2 \\ +\sigma_{1}^2 &\sigma_{1}^2+\sigma_{2}^2 \\ + \end{array} \right).$$

+ +

Thanks .

+",2013-11-08 19:59:35.390 +59177,23536.0,1,,,,Is it ok to correlate before-and-after data?,,CC BY-SA 3.0,"

I am asked to draw a scatterplot and to compute a correlation coefficient for the following situation. A group of subjects are measured for a blood characteristic before and after surgery.

+ +

Is it OK to correlate before-and-after data?

+ +

I know that it is not OK to perform correlations on non independent data. I feel this is such a case--the two measurements are made on the same subjects--they should be correlated.

+ +

I know that correlating data to the change over time is not OK--but that is obvious and it is not the case here.

+ +

Also correlating two variables measured repeatedly on the same sample is a huge No. But again it is not my case.

+",2013-11-08 20:01:41.077 +59178,23348.0,2,,59177.0,,,,CC BY-SA 3.0,"

I think it depends on what you are trying to do with your data. Technically, it is okay to correlate repeated measures from the same subject in the sense that it is mathematically possible. But if you are trying to draw some kind of inference (for example, about causality) from your data, simply correlating two observations from the same subject is not going to tell you anything useful.

+ +

Here's a nice little thread talking about correlations of repeated measures within subjects.

+",2013-11-08 20:13:03.187 +59179,449.0,2,,59177.0,,,,CC BY-SA 3.0,"

None of the correlations you think aren't OK are actually problematic. The correlation is just a measure of linear relationship. Sometimes you need to know the extent of a relationship that you know exists, such as this one, or any of the others you listed. In this case they may want to know the amount of correlation for a variety of reasons, ranging from needing it to report a repeated measures t-test to checking that the data are sound.

+ +

Perhaps what you mean by not OK is that it's not OK to examine such a correlation with a hypothesis test where the null is a 0 correlation. That wouldn't be OK because you know that there has to be some. But that's not what you're asked to do.

+",2013-11-08 20:28:48.323 +59180,23348.0,2,,59172.0,,,,CC BY-SA 3.0,"

Look at the distribution of your data via a histogram, and then see what type of distribution it resembles. For example, if your data is heavily skewed towards the low end of the scale, the data might benefit from a log(10) transformation:

+ +

+ +

A log transform of this particular data would make it at least close to normal. This particular example can be found here.

+ +

Examine the distribution of your outcome data, and then choose the appropriate transformation.
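A toy sketch of that kind of check (the lognormal data here are made up purely for illustration):

    set.seed(1)
    x <- rlnorm(500)                 # heavily right-skewed toy data
    par(mfrow = c(1, 2))
    hist(x, main = 'raw')
    hist(log10(x), main = 'log10')   # much closer to symmetric/normal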

+",2013-11-08 20:30:32.803 +59181,21762.0,2,,59177.0,,,,CC BY-SA 3.0,"

This is perfectly fine. You are considering two different variables each measured once per subject. One contains the 'pre' values, the other the 'post' values. I think you are mixing up independence between observations (subjects) and independence of variables.

+ +

Please note that in your situation, you might want to analyze differences between pre and post, not just looking at correlations, depending on the scientific question.

+",2013-11-08 20:40:35.470 +59182,594.0,2,,59172.0,,,,CC BY-SA 3.0,"

A transformation that changes the shape of the distribution leaves you no longer comparing means. If you really want to compare means you may want to avoid transformation (there can be some particular exceptions where, at least with some accompanying assumptions, you can compute or approximate the means on the original scale as well).

+ +

If you don't need an estimate of the difference in means on the original scale (i.e. if effect sizes aren't critical to your analysis), then full-factorial models (i.e. with all interactions present) may work well enough with transformation.

+ +

If you are happy with more general location-comparisons than just means, there are other alternatives than transformation.

+ +

If you do want to compare means there are other alternatives than transformation. I'm not saying 'never use transformation'... but 'consider alternatives'.

+ +
+

Apparently there is no two or three factor test for non-normal populations.

+
+ +

This is untrue. This could be done with GLMs for example. Or via resampling.
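For instance, a minimal sketch (assuming a positive, right-skewed response y and factors A and B in a hypothetical data frame d) of a two-factor GLM that avoids transforming the response:

    fit <- glm(y ~ A * B, family = Gamma(link = 'log'), data = d)
    summary(fit)
    anova(fit, test = 'F')   # tests for both factors and their interaction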

+ +
+ +

Non-normality may not be the biggest issue you have (heteroskedasticity tends to have a bigger impact, one that doesn't diminish so nicely with sample size)

+ +

A nonlinear transformation will change many things. In your case, the important ones are distributional shape, variance of the transformed variables, and what means on the transformed scale correspond to on the original scale and vice versa. (In a regression situation there's also the impact on linearity of relationships)

+ +

You might choose a transformation that takes you to nearly constant variance. You might choose one that takes you to near symmetry. You might choose one that does either of those things less well, but is more interpretable.

+ +

If you're very lucky, you might be in a situation that gets you more than one of those at once.

+ +

But again, my advice is to first consider alternatives. As a first step, you might want to investigate what could be done with GLMs.

+ +

What are the characteristics of your data? What makes you say they're non-normal? Do you have counts? Are the data highly skew*?

+ +

* note that it's not the unconditional distribution of the response that's crucial, but the conditional distribution.

+",2013-11-08 21:18:14.057 +1760,723.0,1,1787.0,,,Is my weatherman accurate?,,CC BY-SA 2.5,"

A question which bothered me for some time, which I don't know how to address:

+ +

Every day, my weatherman gives a percentage chance of rain (let's assume it's calculated to 9000 digits and he has never repeated a number). Every subsequent day, it either rains or does not rain.

+ +

I have years of data - pct chance vs rain or not. Given this weatherman's history, if he says tonight that tomorrow's chance of rain is X, then what's my best guess as to what the chance of rain really is?

+",2010-08-19 05:56:06.483 +2156,114.0,1,2169.0,,,Recommended books or articles as introduction to Cluster Analysis?,,CC BY-SA 2.5,"

I'm working on a small (200M) corpus of text, which I want to explore with some cluster analysis. What books or articles on that subject would you recommend?

+",2010-09-01 23:57:06.760 +59183,22903.0,1,,,,Organizing data to feed random forests,,CC BY-SA 3.0,"

I'm willing to apply machine learning with R (I will start with random forests then maybe have a look at NNs) on some data, but I don't know where to start, probably because I don't know which words to put on my problem and what to google for.

+ +

My data consist in a set of events of type A, each of which contains both some specific variables and a (variable) number of elements of type B with their own variables.

+ +

A typical example of such data would be horse racing: each race has its own parameters, along with a list of horses and their own parameters.

+ +

Now, of course the training has to be done on each element of type A independently, so tutorials using basic iris data won't work — or at least I don't understand how to apply them on events of type A instead of elements of type B.

+ +

How should I organize my data set or feed it to randomForest ? Or which keywords should I use to find relevant documentation on this kind of topic ? (I tried ""grouped data"" without much success…)

+ +

NB: For a start I can discard the common variables of each A event, if needed. But every B element still has to be considered equal to other B elements inside a single A event, and independently from other A events.

+ +

Update: I've found a workaround which may work in my particular situation (still to be tested; my DB needs reorganization). The workaround is to consider the parameters of the A events as parameters of each B element, so the problem simply becomes a set of B elements (as sketched below). However I'm not satisfied with this solution, and anyway I'm not sure it could be applied to other similar problems, so the question is still open.
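A minimal sketch of that workaround (hypothetical tables events and entrants joined on an event_id key, with the outcome column living in entrants):

    library(randomForest)
    flat <- merge(entrants, events, by = 'event_id')        # copy A-level variables onto each B row
    fit  <- randomForest(outcome ~ . - event_id, data = flat)
    print(fit)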

+",2013-11-08 21:50:55.817 +28,4.0,1,,,,The Two Cultures: statistics vs. machine learning?,,CC BY-SA 3.0,"

Last year, I read a blog post from Brendan O'Connor entitled ""Statistics vs. Machine Learning, fight!"" that discussed some of the differences between the two fields. Andrew Gelman responded favorably to this:

+ +

Simon Blomberg:

+ +
+

From R's fortunes + package: To paraphrase provocatively, + 'machine learning is statistics minus + any checking of models and + assumptions'. + -- Brian D. Ripley (about the difference between machine learning + and statistics) useR! 2004, Vienna + (May 2004) :-) Season's Greetings!

+
+ +

Andrew Gelman:

+ +
+

In that case, maybe we should get rid + of checking of models and assumptions + more often. Then maybe we'd be able to + solve some of the problems that the + machine learning people can solve but + we can't!

+
+ +

There was also the ""Statistical Modeling: The Two Cultures"" paper by Leo Breiman in 2001 which argued that statisticians rely too heavily on data modeling, and that machine learning techniques are making progress by instead relying on the predictive accuracy of models.

+ +

Has the statistics field changed over the last decade in response to these critiques? Do the two cultures still exist or has statistics grown to embrace machine learning techniques such as neural networks and support vector machines?

+",2010-07-19 19:14:44.080 +143,114.0,1,3188.0,,,Algorithms to compute the running median?,,CC BY-SA 2.5,"

On smaller window sizes, n log n sorting might work. Are there any better algorithms to achieve this?

+",2010-07-19 21:32:38.523 +356,166.0,1,59133.0,,,What is the difference between the Shapiro–Wilk test of normality and the Kolmogorov–Smirnov test of normality?,,CC BY-SA 4.0,"

What is the difference between the Shapiro–Wilk test of normality and the Kolmogorov–Smirnov test of normality? When will results from these two methods differ?

+",2010-07-21 00:24:35.500 +412,186.0,1,,,,What book would you recommend for non-statistician scientists?,,CC BY-SA 3.0,"

What book would you recommend for scientists who are not statisticians?

+ +

Clear delivery is most appreciated. As well as the explanation of the appropriate techniques and methods for typical tasks: time series analysis, presentation and aggregation of large data sets.

+",2010-07-21 15:01:21.127 +414,4.0,1,,,,"What is your favorite ""data analysis"" cartoon?",,CC BY-SA 4.0,"

Data analysis cartoons can be useful for many reasons: they help communicate; they show that quantitative people have a sense of humor too; they can instigate good teaching moments; and they can help us remember important principles and lessons.

+

This is one of my favorites:

+

+

As a service to those who value this kind of resource, please share your favorite data analysis cartoon. They probably don't need any explanation (if they do, they're probably not good cartoons!) As always, one entry per answer. (This is in the vein of the Stack Overflow question What’s your favorite “programmer” cartoon?.)

+

P.S. Do not hotlink the cartoon without the site's permission please.

+",2010-07-21 15:13:21.493 +541,,1,543.0,,user28,Why is ANOVA taught / used as if it is a different research methodology compared to linear regression?,,CC BY-SA 4.0,"

ANOVA is equivalent to linear regression with the use of suitable dummy variables. The conclusions remain the same irrespective of whether you use ANOVA or linear regression.

+

In light of their equivalence, is there any reason why ANOVA is used instead of linear regression?

+

Note: I am particularly interested in hearing about technical reasons for the use of ANOVA instead of linear regression.

+

Edit

+

Here is one example using one-way ANOVA. Suppose, you want to know if the average height of male and females is the same. To test for your hypothesis you would collect data from a random sample of male and females (say 30 each) and perform the ANOVA analysis (i.e., sum of squares for sex and error) to decide whether an effect exists.

+

You could also use linear regression to test for this as follows:

+

Define: $\text{Sex} = 1$ if respondent is a male and $0$ otherwise. +$$ +\text{Height} = \text{Intercept} + \beta * \text{Sex} + \text{error} +$$ +where: $\text{error}\sim\mathcal N(0,\sigma^2)$

+

Then a test of whether $\beta = 0$ is a an equivalent test for your hypothesis.

+",2010-07-23 15:17:56.770 +1248,399.0,1,,,,Statistics Jokes,,CC BY-SA 3.0,"

Well, we've got favourite statistics quotes. What about statistics jokes?

+",2010-08-06 01:53:47.023 +16998,5479.0,1,17000.0,,,Where to find a large text corpus?,,CC BY-SA 4.0,"

I am looking for large (>1000) text corpus to download. Preferably with world news or some kind of reports. I have only found one with patents. Any suggestions?

+",2011-11-24 21:22:19.287 +2509,628.0,1,101645.0,,,"Making sense of principal component analysis, eigenvectors & eigenvalues",,CC BY-SA 4.0,"

In today's pattern recognition class my professor talked about PCA, eigenvectors and eigenvalues.

+ +

I understood the mathematics of it. If I'm asked to find eigenvalues etc. I'll do it correctly like a machine. But I didn't understand it. I didn't get the purpose of it. I didn't get the feel of it.

+ +

I strongly believe in the following quote:

+ +
+

You do not really understand something unless you can explain it to your grandmother. -- Albert Einstein

+
+ +

Well, I can't explain these concepts to a layman or grandma.

+ +
    +
  1. Why PCA, eigenvectors & eigenvalues? What was the need for these concepts?
  2. How would you explain these to a layman?
+",2010-09-15 20:05:55.993 +3646,211.0,1,3649.0,,,Kendall Tau or Spearman's rho?,,CC BY-SA 4.0,"

In which cases should one prefer the one over the other?

+ +

I found someone who claims an advantage for Kendall for pedagogical reasons; are there other reasons?

+",2010-10-24 13:15:49.687 +4187,287.0,1,,,,What are common statistical sins?,,CC BY-SA 3.0,"

I'm a grad student in psychology, and as I pursue more and more independent studies in statistics, I am increasingly amazed by the inadequacy of my formal training. Both personal and second hand experience suggests that the paucity of statistical rigor in undergraduate and graduate training is rather ubiquitous within psychology. As such, I thought it would be useful for independent learners like myself to create a list of ""Statistical Sins"", tabulating statistical practices taught to grad students as standard practice that are in fact either superseded by superior (more powerful, or flexible, or robust, etc.) modern methods or shown to be frankly invalid. Anticipating that other fields might also experience a similar state of affairs, I propose a community wiki where we can collect a list of statistical sins across disciplines. Please, submit one ""sin"" per answer.

+",2010-11-15 18:46:37.113 +4705,1209.0,1,4714.0,,,Most famous statisticians,,CC BY-SA 3.0,"

What are the most important statisticians, and what is it that made them famous?
+(Reply just one scientist per answer please.)

+",2010-12-04 00:08:23.027 +5015,1542.0,1,5020.0,,,What if interaction wipes out my direct effects in regression?,,CC BY-SA 2.5,"

In a regression, the interaction term wipes out both related direct effects. Do I drop the interaction or report the outcome? The interaction was not part of the original hypothesis.

+",2010-12-13 23:43:17.117 +6788,1790.0,1,,,,Measuring and analyzing sample complexity,,CC BY-SA 3.0,"

I recently stumbled upon the concept of sample complexity, and was wondering if there are any texts, papers or tutorials that provide:

+ +
    +
  1. An introduction to the concept (rigorous or informal)
  2. An analysis of the sample complexity of established and popular classification methods or kernel methods.
  3. Advice or information on how to measure it in practice.
+ +

Any help with the topic would be greatly appreciated.

+",2011-02-19 22:41:23.000 +7965,1691.0,1,,,,Colinearity and scaling when using k-means,,CC BY-SA 2.5,"

I'm trying to gain a better understanding of k-means clustering and am still unclear about collinearity and scaling of data. To explore collinearity, I made a plot of all five variables that I am considering, shown in the figure below, along with a correlation calculation. +

+ +

I started off with a larger number of parameters, and excluded any that had a correlation higher than 0.6 (an assumption I made). The five I chose to include are shown in this diagram.

+ +

Then, I scaled the data using the R function scale(x) before applying the kmeans() function. However, I'm not sure whether center = TRUE and scale = TRUE should also be included, as I don't understand the difference that these arguments make. (The scale() description is given as scale(x, center = TRUE, scale = TRUE)).

+ +

Is the process that I describe an appropriate way of identifying clusters?

+",2011-03-24 14:51:27.800 +8529,793.0,1,,,,What are some interesting and well-written applied statistics papers?,,CC BY-SA 3.0,"

What are some good papers describing applications of statistics that would be fun and informative to read? Just to be clear, I'm not really looking for papers describing new statistical methods (e.g., a paper on least angle regression), but rather papers describing how to solve real-world problems.

+ +

For example, one paper that would fit what I'm looking for is the climate paper from the second Cross-Validated Journal Club. I'm kind of looking for more statistics-ish papers, rather than machine learning papers, but I guess it's kind of a fuzzy distinction (I'd classify the Netflix Prize papers as a bit borderline, and a paper on sentiment analysis as something I'm not looking for).

+ +

I'm asking because most of the applications of statistics I've seen are either the little snippets you seen in textbooks, or things related to my own work, so I'd like to branch out a bit.

+",2011-04-08 19:01:11.850 +8681,1040.0,1,8699.0,,,Where can I find good publicly available data that I could use to teach z-scores to my college students?,,CC BY-SA 3.0,"

I am sick of using the examples in the book. Is there an easy place to find data for which z-score/percentile/normal distribution stuff would be easy to see?

+",2011-04-14 01:33:55.987 +9524,2872.0,1,9529.0,,,Are there any good movies involving mathematics or probability?,,CC BY-SA 3.0,"

Can you suggest some good movies which involve math, probabilities etc? One example is 21. I would also be interested in movies that involve algorithms (e.g. text decryption). In general ""geeky"" movies with famous scientific theories but no science fiction or documentaries. Thanks in advance!

+",2011-05-07 11:13:51.243 +10008,1506.0,1,10069.0,,,Including the interaction but not the main effects in a model,,CC BY-SA 3.0,"

Is it ever valid to include a two-way interaction in a model without including the main effects? What if your hypothesis is only about the interaction, do you still need to include the main effects?

+",2011-05-20 01:19:45.107 +10541,2690.0,1,,,,Gap statistics MATLAB implementation,,CC BY-SA 3.0,"

Does anyone know a reference/link where I can find a MATLAB implementation of the gap statistic for clustering, as mentioned in this paper?

+",2011-06-05 05:32:14.513 +10911,22.0,1,57347.0,,,How to calculate the confidence interval of the mean of means?,,CC BY-SA 3.0,"

Imagine that you repeat an experiment three times. In each experiment, you collect triplicate measurements. The triplicates tend to be fairly close together, compared to the differences among the three experimental means. Computing the grand mean is pretty easy. But how can one compute a confidence interval for the grand mean?

+ +

Sample data:

+ +

Experiment 1: 34, 41, 39

+ +

Experiment 2: 45, 51, 52

+ +

Experiment 3: 29, 31, 35

+ +

Assume that the replicate values within an experiment follow a Gaussian distribution, as do the mean values of each experiment. The SD of variation within an experiment is smaller than the SD among experimental means. Assume also that there is no ordering of the three values in each experiment. The left-to-right order of the three values in each row is entirely arbitrary.

+ +

The simple approach is to first compute the mean of each experiment: 38.0, 49.3, and 31.7, and then compute the mean, and its 95% confidence interval, of those three values. Using this method, the grand mean is 39.7 with the 95% confidence interval ranging from 17.4 to 61.9.
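In R, this simple approach is just a one-sample t-test on the three experiment means; a minimal sketch using the sample data above reproduces the stated interval:

e1 <- c(34, 41, 39); e2 <- c(45, 51, 52); e3 <- c(29, 31, 35)
grand <- c(mean(e1), mean(e2), mean(e3))   # 38.0, 49.3, 31.7
t.test(grand)                              # mean 39.7, 95% CI roughly 17.4 to 61.9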

+ +

The problem with that approach is that it totally ignores the variation among triplicates. I wonder if there isn't a good way to account for that variation.

+",2011-06-16 16:58:13.537 +13058,1124.0,1,13060.0,,,Software needed to scrape data from graph,,CC BY-SA 3.0,"

Anybody have any experience with software (preferably free, preferably open source) that will take an image of data plotted on cartesian coordinates (a standard, everyday plot) and extract the coordinates of the points plotted on the graph?

+ +

Essentially, this is a data-mining problem and a reverse data-visualization problem.

+",2011-08-18 04:14:22.583 +13631,4221.0,1,,,,Forecasting binary time series,,CC BY-SA 3.0,"

I have a binary time series with 1 when the car is not moving, and 0 when the car is moving. I want to make a forecast for a time horizon up to 36 hours ahead and for each hour.

+ +

My first approach was to use a Naive Bayes using the following inputs: t-24 (daily seasonal), t-48 (weekly seasonal), hour of the day. However, the results are not very good.

+ +

Which articles or software do you recommend for this problem?

+",2011-09-01 14:56:28.933 +14729,5898.0,1,14790.0,,,Testing for linear dependence among the columns of a matrix,,CC BY-SA 3.0,"

I have a correlation matrix of security returns whose determinant is zero. (This is a bit surprising since the sample correlation matrix and the corresponding covariance matrix should theoretically be positive definite.)

+ +

My hypothesis is that at least one security is linearly dependent on other securities. Is there a function in R that sequentially tests each column of a matrix for linear dependence?

+ +

For example, one approach would be to build up the correlation matrix one security at a time and calculate the determinant at each step. When the determinant = 0, stop, as you have identified the security that is a linear combination of the other securities.

+ +

Any other techniques to identify linear dependence in such a matrix are appreciated.
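One such technique, sketched below, is to inspect a pivoted QR decomposition of the correlation matrix, which exposes the numerical rank and the offending columns directly (the object name C is an assumption):

dec <- qr(C)        # C: the correlation matrix from the question (assumed name)
dec$rank            # numerical rank; < ncol(C) indicates linear dependence
if (dec$rank < ncol(C)) {
  colnames(C)[dec$pivot[(dec$rank + 1):ncol(C)]]   # candidate dependent securities
}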

+",2011-10-01 17:46:37.323 +15281,3641.0,1,57490.0,,,How to detect structural change in a timeseries,,CC BY-SA 4.0,"

Is there a specific method to detect change points (structural breaks) in a time series, e.g., stock prices?

+",2011-10-13 11:46:03.773 +15542,4911.0,1,16537.0,,,"What are the ""hot algorithms"" for machine learning?",,CC BY-SA 3.0,"

This is a naive question from someone starting to learn machine learning. I'm reading these days the book ""Machine Learning: An algorithmic perspective"" from Marsland. I find it useful as an introductory book, but now I would like to go into advanced algorithms, those that are currently giving the best results. I'm mostly interested in bioinformatics: clustering of biological networks and finding patterns in biological sequences, particularly applied to single nucleotide polymorphism (SNP) analysis. Could you recommend me some reviews or books to read?

+",2011-10-18 21:24:39.543 +16209,5196.0,1,16212.0,,,How to convert a vector of enumerable strings into a vector of numbers?,,CC BY-SA 3.0,"

How can I convert the x below into a vector like y?

+ +
x <- [""a"", ""b"", ""b"", ""c"", ...]
+
+y <- [1, 2, 2, 3, ...]
+
+ +

UPDATE:

+ +

I ended up with:

+ +
levels(x) <- 1:length(levels(x))
+
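A note on the update above: that works when x is already a factor, but a more direct idiom (just a suggestion) is to let factor() do the mapping and take the integer codes:

x <- c(""a"", ""b"", ""b"", ""c"")
y <- as.integer(factor(x))   # 1 2 2 3; levels are numbered in sorted order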
+",2011-11-06 09:47:15.640 +16313,5234.0,1,16337.0,,,How do the Goodman-Kruskal gamma and the Kendall tau or Spearman rho correlations compare?,,CC BY-SA 3.0,"

In my work, we are comparing predicted rankings versus true rankings for some sets of data. Up until recently, we've been using Kendall-Tau alone. A group working on a similar project suggested we try to use the Goodman-Kruskal Gamma instead, and that they preferred it. I was wondering what the differences between the different rank correlation algorithms were.

+ +

The best I've found was this answer, which claims Spearman is used in place of usual linear correlations, and that Kendall-Tau is less direct and more closely resembles Goodman-Kruskal Gamma. The data I'm working with doesn't seem to have any obvious linear correlations, and the data is heavily skewed and non-normal.

+ +

Also, Spearman generally reports higher correlation than Kendall-Tau for our data, and I was wondering what that says about the data specifically. I'm not a statistician, so some of the papers I'm reading on these things just seem like jargon to me, sorry.

+",2011-11-09 02:39:58.810 +16366,5249.0,1,30434.0,,,How to calculate perplexity of a holdout with Latent Dirichlet Allocation?,,CC BY-SA 3.0,"

I'm confused about how to calculate the perplexity of a holdout sample when doing Latent Dirichlet Allocation (LDA). The papers on the topic breeze over it, making me think I'm missing something obvious...

+ +

Perplexity is seen as a good measure of performance for LDA. The idea is that you keep a holdout sample, train your LDA on the rest of the data, then calculate the perplexity of the holdout.

+ +

The perplexity could be given by the formula:

+ +

$per(D_{test})=exp\{-\frac{\sum_{d=1}^{M}\log p(\mathbb{w}_d)}{\sum_{d=1}^{M}N_d}\} $

+ +

(Taken from Image retrieval on large-scale image databases, Horster et al.)

+ +

Here $M$ is the number of documents (in the test sample, presumably), $\mathbb{w}_d$ represents the words in document $d$, $N_d$ the number of words in document $d$.

+ +

It is not clear to me how to sensibly calculate $p(\mathbb{w}_d)$, since we don't have topic mixtures for the held-out documents. Ideally, we would integrate over the Dirichlet prior for all possible topic mixtures and use the topic multinomials we learned. Calculating this integral doesn't seem an easy task, however.

+ +

Alternatively, we could attempt to learn an optimal topic mixture for each held-out document (given our learned topics) and use this to calculate the perplexity. This would be doable; however, it's not as trivial as papers such as Horster et al. and Blei et al. seem to suggest, and it's not immediately clear to me that the result will be equivalent to the ideal case above.

+",2011-11-10 03:08:12.977 +20234,12900.0,1,20240.0,,vzn,"Are machine learning techniques ""approximation algorithms""?",,CC BY-SA 3.0,"

Recently there was a ML-like question over on cstheory stackexchange, and I posted an answer recommending Powell's method, gradient descent, genetic algorithms, or other ""approximation algorithms"". In a comment someone told me these methods were ""heuristics"" and not ""approximation algorithms"" and frequently did not come close to the theoretical optimum (because they ""frequently get stuck in local minima"").

+ +

Do others agree with that? Also, it seems to me there is a sense in which heuristic algorithms can be guaranteed to come close to theoretical optima if they are set up to explore a large part of the search space (e.g., setting parameters/step sizes small), although I haven't seen that in a paper. Does anyone know if this has been shown or proven in a paper? (If not for a large class of algorithms, maybe for a small class, say NNs, etc.)

+",2012-02-10 19:03:03.517 +20561,786.0,1,,,,How to deal with gaps/NaNs in time series data when using Matlab for autocorrelation and neural networks?,,CC BY-SA 3.0,"

I have a time series of measurements (heights-one dimensional series). In the observation period, the measurement process went down for some time points. So the resulting data is a vector with NaNs where there were gaps in the data. Using MATLAB, this is causing me a problem when computing the autocorrelation (autocorr) and applying neural networks (nnstart).

+ +

How should these gaps/NaNs be dealt with? Should I just remove them from the vector? Or replace their entries with interpolated values? (If so, how in MATLAB?)

+",2012-02-15 19:25:44.330 +20667,5911.0,1,,,user995434,Looking for 2D artificial data to demonstrate properties of clustering algorithms,,CC BY-SA 3.0,"

I am looking for datasets of 2 dimensional datapoints (each datapoint is a vector of two values (x,y)) following different distributions and forms. Code to generate such data would also be helpful. I want to use them to plot / visualise how some clustering algorithms perform. Here are some examples:
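Since code to generate such data was asked for, here is a minimal sketch of three classic shapes (all names and parameters below are arbitrary choices):

set.seed(1)
n <- 200
blobs  <- rbind(cbind(rnorm(n, 0), rnorm(n, 0)),
                cbind(rnorm(n, 5), rnorm(n, 5)))                         # two Gaussian blobs
a      <- runif(n, 0, 2 * pi)
rings  <- rbind(cbind(cos(a), sin(a)),
                cbind(3 * cos(a), 3 * sin(a))) + rnorm(4 * n, sd = 0.1)  # concentric rings
b      <- runif(n, 0, 3 * pi)
spiral <- cbind(b * cos(b), b * sin(b)) + rnorm(2 * n, sd = 0.1)         # one spiral arm
par(mfrow = c(1, 3)); plot(blobs); plot(rings); plot(spiral)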

+ + +",2012-02-16 21:14:21.930 +22674,7714.0,1,,,,Belief propagation on MRF with complex cliques,,CC BY-SA 3.0,"

Is there a belief propagation algorithm for exact inference on an MRF with complex clique structures (i.e. ones involving more than 2 neighbours)?

+ +

For MRF's with cliques that only involve pairwise interaction, you could just search out far enough and cluster to form an acyclic graph and run the usual BP. With more complex cliques, this seems impossible to me as clustering might involve cutting through a clique with multiple members on either side. Is there a workaround for this? Perhaps some clever conditioning arguments?

+",2012-03-30 03:51:41.627 +22797,7769.0,1,,,,How does regression with and without intercept followed by test of stationarity affect cointegration test?,,CC BY-SA 3.0,"

For a simple two-variable (say X and Y) cointegration test, how does it affect our analysis if we perform the regression on X and Y with and without the intercept, and then test the spread for stationarity?

+ +

I am doing this analysis for stocks.

+",2012-04-02 07:39:49.013 +23019,7739.0,1,58948.0,,,"R: How to ""control"" for another variable in Linear Mixed Effects Regression model?",,CC BY-SA 3.0,"

Essentially, I have two collinear variables which could be seen as either random or as fixed effects, a dependent variable I'm fitting the model to, and a variable that's assuredly a random effect.

+ +

Dependent var: Number of neuron spikes (FiringRate) in a specific region of the mouse brain

+ +

Fixed effects:

+ +

1) Time at which data sample was taken (on a linear scale in days -- so day two would be 2, day 5 would be 5, and so on)

+ +

2) The Age of the mouse in days (so there's definitely collinearity between this and the Time variable, but there are enough mice of different ages to make this worthwhile as a separate variable)

+ +

Random effect: Subject -- ""Name"" (ID number) of the mouse

+ +

Essentially, I'm wondering if it would be appropriate to run two LMEs. In the first, I'd treat Age and Subject as random variables in order to control for the effects of Age (and thus the collinearity between Age and Time) and see if Time is a significant predictor of the # of spikes (dependent variable). In the second, I'd enter Time and Subject as random variables to see if Age was a significant predictor.

+ +
library(lme4)
+a = lmer(FiringRate ~ Time + (1|Age) + (1|Subject))
+b = lmer(FiringRate ~ Age + (1|Time) + (1|Subject))
+
+",2012-04-05 23:08:58.800 +23087,5643.0,1,59036.0,,,Moving-average model error terms,,CC BY-SA 3.0,"

This is a basic question on Box-Jenkins MA models. As I understand, an MA model is basically a linear regression of time-series values $Y$ against previous error terms $e_t,..., e_{t-n}$. That is, the observation $Y$ is first regressed against its previous values $Y_{t-1}, ..., Y_{t-n}$ and then one or more $Y - \hat{Y}$ values are used as the error terms for the MA model.

+ +

But how are the error terms calculated in an ARIMA(0, 0, 2) model? If the MA model is used without an autoregressive part and thus no estimated value, how can I possibly have an error term?

+",2012-04-07 12:48:41.467 +24506,7341.0,1,24602.0,,,Confidence interval for values for a fitted line,,CC BY-SA 3.0,"

I'm using JMP to analyze some sample data to make predictions about the population. My sample is from a destructive QC test, so I obviously want to minimize my sample. I have a response (my Y) and a known factor (a very strong and consistent correlation that is measurable by non-destructive means) but the exact relationship between them varies from lot to lot (the slope and y offset vary).

+ +

So, in JMP, I am fitting a line and then showing the ""confidence limits for an individual predicted value"" which I believe gives me an indicator of how the population is likely to behave. So I'm using that plot to make disposition decisions. I want to automate this process, perhaps using R, but I'm a total novice at R. I could do the math if I was just dealing with a mean and standard deviation, but I don't know how to do it with a fit line and a known factor. Can someone please give me either the general information on how to get the confidence limits around the line, or else tell me how to do the whole thing in R?

+ +

Thanks much.

+",2012-05-04 17:02:04.137 +30862,1805.0,1,30864.0,,,Why bother with low-rank approximations?,,CC BY-SA 4.0,"

If you have a matrix with $n$ rows and $m$ columns, you can use SVD or other methods to calculate a low-rank approximation of the given matrix. However, the low-rank approximation will still have $n$ rows and $m$ columns. How can low-rank-approximations be useful for machine learning and natural language processing, given that you are left with the same number of features?

+",2012-08-28 00:12:57.667 +30957,9446.0,1,30960.0,,,Initialize ARIMA simulations with different time-series,,CC BY-SA 3.0,"

I have a fairly long time-series of annual abundances ($N_t$) of a wildlife species (73 years of abundances). To forecast the population’s trajectory, I have used ARIMA modeling. Examination of the ACF and PACF of the first-order differenced time-series suggested a 10-year cycle exists. So I used a span 10 seasonal difference to account for this periodic pattern. Therefore, the response variable was: +$$ +Y_t=(\sqrt{N_t}-\sqrt{N_{t-1}})-(\sqrt{N_{t-10}}-\sqrt{N_{t-11}}) +$$ +Typically, I would have used a logarithmic transformation but it resulted in heteroscedastic residuals. Examination of the ACF and PACF of $Y_t$ indicated a multiplicative seasonal structure so I fit the model: +$$ +ARIMA(0,1,1)(0,1,1)_{10} +$$ +using the Forecast Package in R....library(forecast).

+ +

Example code for fitting the model:

+ +
m1=Arima(y,order=c(0,1,1),seasonal=list(order=c(0,1,1),period=10),include.mean=FALSE)
+
+ +

The residuals of this model were normally distributed, not autocorrelated, and homoscedastic.

+ +

I have been using the fitted model from above for some additional simulation work using the simulate.Arima function. However, I would like to initialize the simulation with a different time-series. The arima.sim function allows this but the arima.sim function doesn't seem to handle seasonal ARIMA models. With the simulate.Arima function one can use the future=TRUE option to simulate values that are ""future to and conditional on the data"" in the model m1. Can the data in the model object m1 simply be replaced to create a simulation that is conditional on different data?

+ +

For example:

+ +
# Create a new model object for simulation.
+m.sim=m1
+# Replace the data in the model object with the new data.
+m.sim$x=new
+# Simulation conditional on the new data.
+sim.forecasts=replicate(1000,simulate.Arima(m.sim,future=TRUE,bootstrap=TRUE))
+
+",2012-08-29 14:56:24.737 +31575,6404.0,1,31587.0,,,Estimating Markov transition probabilities from sequence data,,CC BY-SA 3.0,"

I have a full set of sequences (432 observations to be precise) of 4 states $A-D$: eg

+ +

$$Y=\left(\begin{array}{c c c c c c c} +A& C& D&D & B & A &C\\ +B& A& A&C & A&- &-\\ +\vdots&\vdots&\vdots&\vdots&\vdots&\vdots&\vdots\\ +B& C& A&D & A & B & A\\ + \end{array}\right)$$

+ +

EDIT: The observation sequences are of unequal lengths! Does this change anything?

+ +

Is there a way of calculating the transition matrix $$P_{ij}(Y_{t}=j|Y_{t-1}=i)$$ in Matlab or R or similar? I think the HMM package might help. Any thoughts?

+ +

eg: Estimating Markov chain probabilities
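A minimal R sketch of the counting approach, assuming the sequences are stored row-wise in a character matrix Y (unequal lengths only change how many transitions each row contributes):

states <- c(""A"", ""B"", ""C"", ""D"")
counts <- matrix(0, 4, 4, dimnames = list(states, states))
for (r in seq_len(nrow(Y))) {
  s <- Y[r, ]
  s <- s[s %in% states]                    # drop NA or other padding symbols
  for (t in seq_along(s)[-1]) {
    counts[s[t - 1], s[t]] <- counts[s[t - 1], s[t]] + 1
  }
}
P <- counts / rowSums(counts)              # maximum-likelihood transition matrix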

+",2012-09-11 15:29:12.027 +32038,11013.0,1,32053.0,,,What is the minimum recommended number of groups for a random effects factor?,,CC BY-SA 3.0,"

I'm using a mixed model in R (lme4) to analyze some repeated measures data. I have a response variable (fiber content of feces) and 3 fixed effects (body mass, etc.). My study only has 6 participants, with 16 repeated measures for each one (though two only have 12 repeats). The subjects are lizards that were given different combinations of food in different 'treatments'.

+ +

My question is: can I use subject ID as a random effect?

+ +

I know this is the usual course of action in longitudinal mixed effects models, to take account of the randomly sampled nature of the subjects and the fact that observations within subjects will be more closely correlated than those between subjects. But, treating subject ID as a random effect involves estimating a mean and variance for this variable.

+ +
    +
  • Since I have only 6 subjects (6 levels of this factor), is this enough to get an accurate characterization of the mean and variance?

  • Does the fact that I have quite a few repeated measurements for each subject help in this regard (I don't see how it matters)?

  • Finally, if I can't use subject ID as a random effect, will including it as a fixed effect allow me to control for the fact that I have repeated measures?
+ +

Edit: I'd just like to clarify that when I say ""can I"" use subject ID as a random effect, I mean ""is it a good idea to"". I know I can fit the model with a factor with just 2 levels, but surely this would be indefensible? I'm asking at what point it becomes sensible to think about treating subjects as random effects. It seems like the literature advises that 5-6 levels is a lower bound. It seems to me that the estimates of the mean and variance of the random effect would not be very precise until there were 15+ factor levels.

+",2012-09-20 01:56:50.007 +32317,8208.0,1,,,,Good clustering Java library,,CC BY-SA 3.0,"

I'm looking for a good Java library implementing several clustering algorithms.

+ +

I'll have to cluster some programs' execution traces, and I still don't know which algorithms I am going to need, so I'd like to use a library providing a lot of them that makes it easy to swap algorithms.

+ +

So far I have had a look at Weka, but I don't know whether there is a more complete library available that I'm missing.

+",2012-09-25 20:11:11.667 +32388,2105.0,1,,,,Odd results from Bayesian network in R,,CC BY-SA 3.0,"

Related to question here.

+ +

I've been trying to teach myself about Network Analysis, and developing DAG charts in R. Let's say that I have the following data.

+ +
dat=data.frame(sold=c(0,0,0,1,0,1), won=c(1,0,0,1,0,1), bid=c(5,3,2,5,3,4))
+dat
+
+ +

Given what I'm trying to analyze, I know that the DAG plot should be as follows:

+ +
bid => won => sold
+
+ +

However, when I utilize the bnlearn package to generate the plot, it comes out as follows. It just can't be correct, and should be in the opposite direction.

+ +
library(""bnlearn"")
+library(""Rgraphviz"")
+
+bn.hc <- hc(dat, score = ""bic"")
+graphviz.plot(bn.hc)
+
+ +

+ +

Now, I know that's just the data that I provided it to learn on, but I've messed around with the variable values, and it never turns out the way it should. Basically, a bid should determine whether you win, and whether you win should determine whether you can sell it. It just doesn't make sense.

+ +

Isn't there some way to specify what variable is the response variable? In my case, the response variable should be sold, and there should be no arcs from sold to another node.
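For what it's worth, hc() accepts a blacklist of forbidden arcs, which is one way to keep sold as a pure response; a sketch building on the code above (the score argument is left as in the question and may need to match your variable types):

bl <- data.frame(from = c(""sold"", ""sold""), to = c(""won"", ""bid""))
bn.hc2 <- hc(dat, score = ""bic"", blacklist = bl)   # no arcs out of 'sold' allowed
graphviz.plot(bn.hc2)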

+ +

Can anyone help with diagnosing the problem in R? Is there something I'm missing in the code, or in my understanding of BNs? Is this an issue with what I pass as the algorithm to use in 'score'?

+",2012-09-26 20:56:22.507 +55043,18198.0,1,,,,"Testing if low-variance components in PCA contain any ""signal""",,CC BY-SA 3.0,"

My problem is similar to this one but I am looking for a different solution: (so if it should be merged just let me know).

+ +

Measuring what's 'lost' in PCA dimensionality reduction?

+ +

In my application we have a correlation matrix of dimension 30, upon which we conduct a PCA and retain the first three eigenvectors on the basis that they typically contain 90+% of the variation.

+ +

However, this has always struck me as a little arbitrary; I would like to test whether these smaller eigenvectors do actually contain a ""signal"" rather than white noise.

+ +

I suppose one very simple method would be to split the data up and see if these smaller eigenvectors maintain a similar shape, but I would like to find a more scientifically robust way to test this hypothesis.
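One possible formalisation of that idea is a parallel-analysis-style permutation test: permute each column of the underlying data independently, recompute the eigenvalues, and see how the observed ones compare. A sketch, assuming the raw n x 30 data matrix is available as X:

obs  <- eigen(cor(X))$values                                   # observed eigenvalues
perm <- replicate(500, eigen(cor(apply(X, 2, sample)))$values) # eigenvalues after permutation
rowMeans(perm >= obs)   # per-component share of permuted eigenvalues >= observed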

+",2013-09-10 13:00:25.933 +33598,11643.0,1,,,,How to identify structural change using a Chow test on Eviews?,,CC BY-SA 3.0,"

I have this little problem and I would appreciate some help.

+ +

As part of my master's thesis, I have to identify a trend in a univariate (GDP) time series for different countries. I have to separate the trend and the stochastic element in it for each country.

+ +

I have managed to do so by doing:

+ +

variable c @trend // for each country.

+ +

And then running an AR(1) on the residuals // for each country.

+ +

However, now I need to identify structural breaks in one of these countries. I've been reading and searching all over the internet and books and I've found that the test most people use to identify these structural changes is the Chow Test.

+ +

I know how to run the test, but I haven't been able to figure out how to interpret the results and decide whether there is a structural break or not.

+ +

Here is an example of the results:

+ +

+ +

What puzzles me the most is the fact that, regardless of the point I choose to break the series, I always get

+ +

Prob. F(2,47) 0.0016 //or any very significant value, with the same degrees of freedom.

+ +

Can someone please help me understand how I should interpret these results in order to identify where the breaks lie?

+",2012-10-16 09:30:00.727 +34166,668.0,1,,,,The Sleeping Beauty Paradox,,CC BY-SA 3.0,"

The situation

+

Some researchers would like to put you to sleep. Depending on the secret toss of a fair coin, they will briefly awaken you either once (Heads) or twice (Tails). After each waking, they will put you back to sleep with a drug that makes you forget that awakening. When you are awakened, to what degree should you believe that the outcome of the coin toss was Heads?

+

(OK, maybe you don’t want to be the subject of this experiment! Suppose instead that Sleeping Beauty (SB) agrees to it (with the full approval of the Magic Kingdom’s Institutional Review Board, of course). She’s about to go to sleep for one hundred years, so what are one or two more days, anyway?)

+

+

[Detail of a Maxfield Parrish illustration.]

+

Are you a Halfer or a Thirder?

+

The Halfer position. Simple! The coin is fair--and SB knows it--so she should believe there's a one-half chance of heads.

+

The Thirder position. Were this experiment to be repeated many times, then the coin will be heads only one third of the time SB is awakened. Her probability for heads will be one third.

+

Thirders have a problem

+

Most, but not all, people who have written about this are thirders. But:

+
    +
  • On Sunday evening, just before SB falls asleep, she must believe the chance of heads is one-half: that’s what it means to be a fair coin.

  • Whenever SB awakens, she has learned absolutely nothing she did not know Sunday night. What rational argument can she give, then, for stating that her belief in heads is now one-third and not one-half?
+

Some attempted explanations

+
    +
  • SB would necessarily lose money if she were to bet on heads with any odds other than 1/3. (Vineberg, inter alios)

  • One-half really is correct: just use the Everettian “many-worlds” interpretation of Quantum Mechanics! (Lewis).

  • SB updates her belief based on self-perception of her “temporal location” in the world. (Elga, i.a.)

  • SB is confused: “[It] seems more plausible to say that her epistemic state upon waking up should not include a definite degree of belief in heads. … The real issue is how one deals with known, unavoidable, cognitive malfunction.” [Arntzenius]
+
+

The question

+

Accounting for what has already been written on this subject (see the references as well as a previous post), how can this paradox be resolved in a statistically rigorous way? Is this even possible?

+
+

References

+

Arntzenius, Frank (2002). Reflections on Sleeping Beauty Analysis 62.1 pp 53-62.

+

Bradley, DJ (2010). Confirmation in a Branching World: The Everett Interpretation and Sleeping Beauty. Brit. J. Phil. Sci. 0 (2010), 1–21.

+

Elga, Adam (2000). Self-locating belief and the Sleeping Beauty Problem. Analysis 60 pp 143-7.

+

Franceschi, Paul (2005). Sleeping Beauty and the Problem of World Reduction. Preprint.

+

Groisman, Berry (2007). The end of Sleeping Beauty’s nightmare. Preprint.

+

Lewis, D (2001). Sleeping Beauty: reply to Elga. Analysis 61.3 pp 171-6.

+

Papineau, David and Victor Dura-Vila (2008). A Thirder and an Everettian: a reply to Lewis’s ‘Quantum Sleeping Beauty’.

+

Pust, Joel (2008). Horgan on Sleeping Beauty. Synthese 160 pp 97-101.

+

Vineberg, Susan (undated, perhaps 2003). Beauty’s Cautionary Tale.

+",2012-10-25 20:10:18.553 +35097,9886.0,1,35160.0,,,What's wrong with XKCD's Frequentists vs. Bayesians comic?,,CC BY-SA 3.0,"

+ +

This xkcd comic (Frequentists vs. Bayesians) makes fun of a frequentist statistician who derives an obviously wrong result.

+ +

However it seems to me that his reasoning is actually correct in the sense that it follows the standard frequentist methodology.

+ +

So my question is ""does he correctly apply the frequentist methodology?""

+ +
    +
  • If no: what would be a correct frequentist inference in this scenario? How can ""prior knowledge"" about the sun's stability be integrated into the frequentist methodology?
  • If yes: wtf? ;-)
+",2012-11-11 15:56:03.667 +35249,11884.0,1,,,,Using PCA to reduce the number of variables split into groups,,CC BY-SA 3.0,"

First of all, sorry for the strange title, I had no idea how to describe my problem better. My issue is the following, I think it is pretty much limited to geosciences.

+ +

I have several properties for every sample, which are divided by depth.

+ +

For instance:

+ +

$ \qquad \displaystyle \small \begin{array} {r|rrr} \hline +ID & 1 & 2 &3 & ...\\ \hline +\text{var1}_{0-20cm} & 2.3 &2.0 &1.0& ...\\ +\text{var1}_{20-50cm} & 2.1 &1.1 &0.0& ...\\ +\text{var1}_{50-100cm}& 2.6 &1.1 &0.0& ...\\ \hline +\text{var2}_{0-20cm} & 10.5 &5.5 &3.5& ...\\ +\text{var2}_{20-50cm} & 10.9 &5.9 &1.9& ...\\ +\text{var2}_{50-100cm}& 15.0 &5.0 &1.0& ...\\ \hline + \vdots & \vdots & \vdots\\ \hline \end{array} +$

+ +

Basically these are geological layers going from surface down to 100 cm depth. +I am trying to decrease the number of variables, either with PCA or factor analysis. +The issue is, that I would like to handle properties together, no matter what the depth is.

+ +

(For instance I do not want to get rid of a layer in between the surface and the bottom layer.)

+ +

Is there any way to handle them together, or group them for PCA or whatever. I tried to find some relevant information, but I think the problem is limited to a small portion of the science (maybe I am wrong), so I could not find anything useful.

+",2012-11-13 22:29:58.533 +37182,11446.0,1,57320.0,,,How to specify in r spatial covariance structure similar to SAS sp(pow) in a marginal model?,,CC BY-SA 3.0,"

I'm currently translating existing code from SAS to R. I'm working on longitudinal data (CD4 count over time). I have the following SAS code :

+ +
Proc mixed data=df;
+class NUM_PAT;
+model CD4t=T /s ;
+repeated / sub=NUM_PAT type=sp(pow)(T);
+
+ +

The SAS spatial power covariance structure is useful for unequally spaced longitudinal measurements where the correlations decline as a function of time (as shown by the picture below). +

+ +

I think I have to use gls( ) from {nlme} since I don't have any random effects. As R 'only' provides ""spherical"", ""exponential"", ""gaussian"", ""linear"", and ""rational"" as correlation spatial structures, my guess is that I need to use corSpatial plus a weights argument.

+ +

I tried the following code, but it doesn't work :

+ +
gls(CD4t~T, data=df, na.action = (na.omit), method = ""ML"",
+corr=corCompSymm(form=~1|NUM_PAT), weights=varConstPower(form=~1|T))
+
+ +

What am I doing wrong?
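For reference, the nlme structure most often suggested as the analogue of SAS's sp(pow) for continuous time is corCAR1, i.e. correlation $\rho^{|t_i - t_j|}$; a sketch only, not verified against the SAS fit:

library(nlme)
fit <- gls(CD4t ~ T, data = df, na.action = na.omit, method = ""ML"",
           correlation = corCAR1(form = ~ T | NUM_PAT))   # df, CD4t, T, NUM_PAT as in the question
summary(fit)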

+ +

Thanks for any help.

+",2012-12-14 15:06:25.837 +37748,13370.0,1,,,,What is the computational complexity of the EM algorithm?,,CC BY-SA 3.0,"

In general, and more specifically for the Bernoulli mixture model (aka latent class analysis).

+",2012-12-27 07:48:43.813 +37819,12314.0,1,,,,Putting stationary variables through Johansen procedure,,CC BY-SA 3.0,"

Is it okay to feed $I(0)$ variables into the Johansen procedure? I've read three sources that seem to state that this is not what you're supposed to do. However, whenever I've done this, I notice that $\Pi$ is full rank and so it leads me to a VAR and therefore I don't see any problem with this.

+",2012-12-29 16:08:10.207 +37981,13403.0,1,,,,"""Peakedness"" of a skewed probability density function",,CC BY-SA 3.0,"

I would like to describe the ""peakedness"" and tail ""heaviness"" of several skewed probability density functions.

+ +

Would the features I want to describe be called ""kurtosis""? I've only seen the word ""kurtosis"" used for symmetric distributions.

+",2013-01-03 16:00:17.050 +40030,1790.0,1,115327.0,,,Understanding stratified cross-validation,,CC BY-SA 4.0,"

I read in Wikipedia:

+ +
+

In stratified k-fold cross-validation, the folds are selected so that the mean response value is approximately equal in all the folds. In + the case of a dichotomous classification, this means that each fold + contains roughly the same proportions of the two types of class + labels.

+
+ +
    +
  1. Say we are using CV to estimate the performance of a predictor or estimator. What would mean response value (MRV) mean in this context? Just the average value of the predictor / estimator?
  2. In what scenarios would ""achieving approximately the same MRV"" in all folds actually be important? In other words, what are the consequences of not doing so?
+",2013-02-07 20:58:31.927 +40104,14684.0,1,,,,The weighted sum of two independent Poisson random variables,,CC BY-SA 3.0,"

Using wikipedia I found a way to calculate the probability mass function resulting from the sum of two Poisson random variables. However, I think that the approach I have is wrong.

+ +

Let $X_1, X_2$ be two independent Poisson random variables with means $\lambda_1, \lambda_2$, and let $S_2 = a_1 X_1+a_2 X_2$, where $a_1$ and $a_2$ are constants. Then the probability-generating function of $S_2$ is given by
$$
G_{S_2}(z) = \operatorname{E}(z^{S_2})= \operatorname{E}(z^{a_1 X_1+a_2 X_2}) = G_{X_1}(z^{a_1})G_{X_2}(z^{a_2}).
$$
Now, using the fact that the probability-generating function for a Poisson random variable is $G_{X_i}(z) = \textrm{e}^{\lambda_i(z - 1)}$, we can write the probability-generating function of the sum of the two independent Poisson random variables as
$$
\begin{aligned}
G_{S_2}(z) &= \textrm{e}^{\lambda_1(z^{a_1} - 1)}\textrm{e}^{\lambda_2(z^{a_2} - 1)} \\
&= \textrm{e}^{\lambda_1(z^{a_1} - 1)+\lambda_2(z^{a_2} - 1)}.
\end{aligned}
$$
It seems that the probability mass function of $S_2$ is recovered by taking derivatives of $G_{S_2}(z)$: $\operatorname{Pr}(S_2 = k) = \frac{G_{S_2}^{(k)}(0)}{k!}$, where $G_{S_2}^{(k)} = \frac{d^k G_{S_2}(z)}{ d z^k}$.

+ +

Is this correct? I have the feeling I cannot just take the derivative to obtain the probability mass function, because of the constants $a_1$ and $a_2$. Is this right? Is there an alternative approach?

+ +

If this is correct, can I now obtain an approximation of the cumulative distribution by truncating the infinite sum over all $k$?

+",2013-02-09 19:31:13.290 +40121,14728.0,1,,,,Comparing many means in JMP,,CC BY-SA 3.0,"

I'm trying to compare several sets of experimental data by comparing means. I read that there are several different tests, such as Each Pair, Student's t and All Pairs, Tukey HSD, which give circles of different radii; an example is shown below.

+ +

+ +

How are the circles defined? How do I calculate the radius? And is there a rule for which test one should use for what kind of data?

+",2013-02-10 03:03:22.580 +40859,15044.0,1,,,,Validation: Data splitting into training vs. test datasets,,CC BY-SA 3.0,"

I was naively validating my binomial logit models by testing on a test dataset. I had randomly divided the available data (~2000 rows) into training (~1500) and validation (~500) datasets.

+ +

I now read a post in another thread ( Frank Harrell) that causes me to question my approach:

+ +
+

Data splitting is not very reliable unless you have more than 15,000 + observations. In other words, if you split the data again, accuracy + indexes will vary too much from what you obtained with the first + split.

+
+ +

How serious is this worry and what are the ways around it? The OP speaks of ""resampling"" but I'm not sure how that works here for validation.

+ +

Edit: Adding context as per @Bernhard's comment below:

+ +

Comparing logistic regression models

+",2013-02-22 08:40:44.930 +40870,1923.0,1,,,,False discovery rate calculation in target-decoy matching context,,CC BY-SA 3.0,"

A common strategy in mass spectrometry of biological molecules is to upload observed spectra to a server so that they can be matched to a LARGE database of theoretical spectra of known molecules (a.k.a. target database). In order to control for false positives, a decoy database consisting of incorrect/irrelevant spectra is used.

+ +

I have been reading more into this subject and have come up with some questions regarding the calculation of the FDR measure from this target-decoy strategy. The basic idea of the FDR value is very intuitive:

+ +

$FDR = \frac{FP}{FP + TP}$

+ +

where FP and TP stand for false and true positives respectively. This makes perfect sense to me; if I'm trying to guess some people's names out of a phone book, and get 8 right and 2 wrong, I would have 2 false out of 10 total guesses, and thus my false discovery rate would be 20%.

+ +

However, reading this tutorial on how this is done at large scale on the servers, I was introduced to two different calculations, depending on whether or not the target and decoy databases are concatenated (page 2).

+ +

I don't think that this is a typo as I found other occurrences * of the mysterious factor 2 in front of FP in scientific literature. However the motivation behind this is never explained (at least I couldn't find it).

+ +

I would appreciate some insight on where this doubling comes from. Likewise, I wonder whether or not calculating the FDR this way assumes that the error rate for each spectrum match is the same for the target database and the decoy database (i.e. assuming that getting 25 decoy hits implies 25 target hits are also false positives). It's not really clear to me why the error rate has to be the same for the two databases. Any comments on this subject are also appreciated.

+ +

* one such reference is Elias et al Nature Methods - 2, 667 - 675 (2005)

+",2013-02-22 12:56:28.503 +41244,15330.0,1,,,,Probability of heads in a biased coin,,CC BY-SA 3.0,"

Given $N$ flips of the same coin resulting in $k$ occurrences of 'heads', what is the probability density function of the heads-probability of the coin?

+",2013-03-01 05:09:26.123 +41914,13918.0,1,,,,Lewandowski algorithm demand forecasting,,CC BY-SA 3.0,"

I came across the Lewandowski method of demand forecasting in JDA Demand. Please help me understand at a high level the methodology it uses. I found a paper by Robert Hyndman titled ""A state space framework for automatic forecasting using exponential smoothing methods"", which uses this method as one of the methods they compare their algorithm to. Currently this is a black box for us; we want to get some high-level understanding so that we can better fine-tune the parameters provided as part of the software. It would be great if you could share some thoughts about the Lewandowski algorithm and point to some references that I could use for further research.

+",2013-03-12 11:17:01.283 +42513,15991.0,1,42517.0,,,Is it appropriate to plot the mean in a histogram?,,CC BY-SA 3.0,"

Is it ""okay"" to add a vertical line to a histogram to visualize the mean value?

+ +

It seems okay to me, but I've never seen this in textbooks and the like, so I'm wondering if there's some sort of convention not to do that?

+ +

The graph is for a term paper, I just want to make sure I don't accidentally break some super important unspoken stats rule. :)
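For what it's worth, in base R this is trivial (a minimal sketch with made-up data):

x <- rnorm(200)                # example data
hist(x)
abline(v = mean(x), lty = 2)   # dashed vertical line at the sample mean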

+",2013-03-19 21:23:49.697 +42885,2615.0,1,58417.0,,,2SLS with two instruments for one endogenous variable in MATLAB,<2sls>,CC BY-SA 3.0,"

I have one endogenous variable and two instruments for it, and I want to calculate my beta with the direct (one step) matrix formula

+ +

$\hat{\beta}_{2SLS} = \left(X'Z(Z'Z)^{-1}Z'X\right)^{-1}X'Z(Z'Z)^{-1}Z'Y$

+ +

But since I have two instruments for the one endogenous variable, $X$ and $Z$ do not have the same dimensions.

+ +

Any ideas? +Thanks!

+",2013-03-25 15:11:36.577 +43458,16452.0,1,,,,How to check if removing a sample makes a difference in mean and stdev values?,,CC BY-SA 3.0,"

I'd like to ask if someone could help me with the following problem:

+ +

we have measured the same sample 5 times and we would like to check if there are significant differences in mean and stdev values if we use:

+ +
    +
  • All 5 datapoints
  • Only the last 4 datapoints
  • Only the last 3 datapoints
+ +

We have performed an ANOVA, but we are not sure about the results because we might not have homoscedasticity.

+ +

Which tests would you do to investigate this issue?

+ +

Thanks in advance for your help.

+",2013-04-02 14:45:37.780 +44370,8063.0,1,,,,A way to test for enrichment of differentially expressed genes in a genomic location,,CC BY-SA 3.0,"

I have an experiment where I expect a certain genomic location to influence gene expression levels of nearby genes. I have data for expression levels (Agilent 4x44 microarrays, Drosophila) in two groups - one where I expect expression to be affected and the other wild-type and I would like to run a test for overrepresentation of differentially expressed genes in a genomic location.

+ +

My main problem is that I couldn't find a package (R/bioconductor) that would do it out of the box easily, so if you know about such a package, please let me know. In the meantime, this is what I figured out: I would run a sliding window over the whole genome and simply count number of differentially expressed genes in each window - this should tell me where I have the most differentially expressed genes in the genome. However, it will be dependent on gene density, so to obtain some sort of background distribution, I would run permutations of the samples (or p values), say, 1000 times, and check how often I am likely to find this number of windows with that number of differentially expressed genes compared to the observed numbers. Does this sound right?
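A stripped-down sketch of that permutation scheme, permuting the differential-expression labels across genes rather than the sample labels (a simplification); de is a logical flag per gene and pos its genomic position, both assumed to exist:

breaks <- seq(min(pos), max(pos) + 1e5, by = 1e5)    # 100 kb windows (arbitrary width)
win.counts <- function(flag) as.vector(table(cut(pos[flag], breaks, include.lowest = TRUE)))
obs  <- win.counts(de)                               # DE genes per window
null <- replicate(1000, max(win.counts(sample(de)))) # best window under permutation
mean(null >= max(obs))                               # genome-wide p-value for the top window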

+ +

I should add that while I know the location that would mess things up, I cannot exclude that other genomic regions might be affected as well. So I have to test the whole genome.

+ +

Please advise on this approach and/or propose a better one...

+",2013-04-15 13:42:17.450 +44635,728.0,1,,,,Testing symmetry of a distribution around its mean,,CC BY-SA 4.0,"

We can test the symmetry of a distribution around $0$ with the Wilcoxon signed rank test, based on a sample from it.

+

But if we want to test whether a distribution is symmetric around its mean, based on a sample $X_1, \dots, X_n$, is it valid to first center the observations at the sample mean, $Y_i := X_i - \bar{X}$, and then apply the Wilcoxon signed rank test to the $Y_i$'s?

+

If not, what are some ways?

+",2013-04-18 18:32:01.767 +44772,17076.0,1,,,,"Appropriate Analysis for ordinal variable, repeated 4 times under different conditions, by the same 2 raters",,CC BY-SA 3.0,"

I am doubting myself on which analysis to run for the following: +18 participants were evaluated at 4 time points with different conditions at each time. +They were given scores (on a discrete visual analog scale) by 2 raters.

+ +

The scores were calculated for a pair of participants: the pairs changed at each time point. +I do know which participant comprises each pair.

+ +

Is that a 2-way repeated measures ANOVA? Some variation of Friedman test?

+",2013-04-20 21:00:03.237 +45279,9095.0,1,,,,Visualizing results from multiple latent class models,,CC BY-SA 3.0,"

I am using latent class analysis to cluster a sample of observations based on a set of binary variables. I am using R and the package poLCA. In LCA, you must specify the number of clusters you want to find. In practice, people usually run several models, each specifying a different number of classes, and then use various criteria to determine which is the ""best"" explanation of the data.

+ +

I often find it very useful to look across the various models to try to understand how observations classified in model with class=(i) are distributed by the model with class = (i+1). At the very least you can sometimes find very robust clusters that exist regardless of the number of classes in the model.

+ +

I would like a way to graph these relationships, to more easily communicate these complex results in papers and to colleagues who aren't statistically oriented. I imagine this is very easy to do in R using some kind of simple network graphics package, but I simply don't know how.

+ +

Could anyone please point me in the right direction. Below is code to reproduce an example dataset. Each vector xi represents the classification of 100 observations, in a model with i possible classes. I want to graph how observations (rows) move from class to class across the columns.

+ +
x1 <- sample(1:1, 100, replace=T)
+x2 <- sample(1:2, 100, replace=T)
+x3 <- sample(1:3, 100, replace=T)
+x4 <- sample(1:4, 100, replace=T)
+x5 <- sample(1:5, 100, replace=T)
+
+results <- cbind (x1, x2, x3, x4, x5)
+
+ +

I imagine there is a way to produce a graph where the nodes are classifications and the edges reflect (by weights, or color maybe) the % of observations moving from classifications from one model to the next. E.g.

+ +

+ +

UPDATE: Having some progress with the igraph package. Starting from the code above...

+ +

poLCA results recycle the same numbers to describe class membership, so you need to do a bit of recoding.

+ +
N<-ncol(results) 
+n<-0
+for(i in 2:N) {
+results[,i]<- (results[,i])+((i-1)+n)
+n<-((i-1)+n)
+}
+
+ +

Then you need to get all the cross-tabulations and their frequencies, and rbind them into one matrix defining all the edges. There is probably a much more elegant way to do this.

+ +
library(plyr)    # count() below comes from plyr
results <- as.data.frame(results)
+
+g1           <- count(results,c(""x1"", ""x2""))
+
+g2           <- count(results,c(""x2"", ""x3""))
+colnames(g2) <- c(""x1"", ""x2"", ""freq"")
+
+g3           <- count(results,c(""x3"", ""x4""))
+colnames(g3) <- c(""x1"", ""x2"", ""freq"")
+
+g4           <- count(results,c(""x4"", ""x5""))
+colnames(g4) <- c(""x1"", ""x2"", ""freq"")
+
+results <- rbind(g1, g2, g3, g4)
+
+library(igraph)
+
+g1 <- graph.data.frame(results, directed=TRUE)
+
+plot.igraph(g1, layout=layout.reingold.tilford)
+
+ +

+ +

Time to play more with the igraph options I guess.

+",2013-04-26 17:31:28.260 +45280,17326.0,1,,,,Statistical test for measure of association not assuming monotonicity in small samples (n=6)?,,CC BY-SA 3.0,"

I have data on two continuous variables from a physics experiment.

+ +

I want to test for association between the two variables, but without assuming a monotonic relationship. I also only have 6 data points, each with a large error associated with it, and I want the test to take this into consideration.

+ +

Does anyone know of a statistical test of this type?

+",2013-04-26 17:33:27.823 +45457,17179.0,1,,,,Predicting high frequency finance time series with HMM,,CC BY-SA 3.0,"

I have a the following time series

+ +
  Price      BrokerID 632 Behaviour  BrokerID 680 Behaviour ...BrokerID XYZ Behaviour
+
+  5.6          IP                       SP                   
+  5.7          BP                       IP
+  5.8          SP                       BP
+  5.83         IP                       SP
+
+ +

where IP is idle position, BP is buying position, and SP is selling position. I want to use Broker behaviour as the known variable and price as the hidden variable and predict it using HMM. But my question is how to find the emission matrix between a character vector (broker behaviour) and price numeric vector?

+",2013-04-29 17:03:08.587 +45534,17447.0,1,45536.0,,,Question about Harrington paradox,,CC BY-SA 3.0,"
    +
  1. Model
     The firm and enforcement agency interact in more than one domain. This may arise because a single agency is responsible for enforcing more than one regulation, or because it enforces the same regulation at more than one constituent plant of a multi-plant firm. For simplicity we will assume that the number of domains is two and that they are ex ante identical. In each domain the firm is required to comply with a regulation. If it complies it inflicts no environmental damage; otherwise it inflicts damage $d$, which is commonly observed. The cost to the $i$th firm of compliance in domain $j \in \{1, 2\}$ will be denoted $c_{ij}$, where $c_{i1}$ and $c_{i2}$ are independent, privately observed draws from a distribution $f(c)$ with associated cumulative $F(c)$. $F$ is common knowledge.
     If the agency observes non-compliance by a firm in either domain it can take that firm to court (‘‘pursue’’ the firm), in which case the firm is subject to a penalty $L$, which is exogenous. Penalties are assumed to be restricted in the sense that $F(L) < 1$. This implies that a policy of full-pursuit, whereby the agency pursues all violations, will not generate full-compliance. The firm and enforcement agency are both risk neutral and aim to maximise expected profit and minimise expected environmental damage respectively.
+ +

Can someone explain to me what $F(L) < 1$ implies?

+ +

If you need the context behind this model, please tell me and I'll explain that as well.

+",2013-04-30 15:18:37.443 +45543,17454.0,1,59109.0,,,Is the square root of the symmetric Kullback-Leibler divergence a metric?,,CC BY-SA 3.0,"

It is well known that the square root of the Jensen-Shannon divergence is a true metric, but how about the square root of symmetric KL: D(P||Q)+D(Q||P)? I have reasons to believe that it also is a true metric but cannot find any references on that other than anecdotal comments such as that it behaves more like a metric when used.

+ +

Update 1

+ +

Kullback-Leibler divergence: $D(P||Q) = \sum_i p_i\log(p_i/q_i)$

+ +

Jensen-Shannon divergence: $J(P,Q) = \big(D(P||(P+Q)/2)+D(Q||(P+Q)/2)\big)/2$

+ +

Symmetric KL divergence: $S(P,Q) = D(P||Q)+D(Q||P) = \sum_i (p_i-q_i)\log(p_i/q_i)$

+ +

Square root of symmetric KL: $d_{KL}(P,Q) = \sqrt{S(P,Q)}$

+ +

Is $d_{KL}$ a metric?

+ +

Update 2

+ +

I think the following upper and lower bounds hold:

+ +

$\sum_i (p_i-q_i)^2 \leq \sum_i (p_i-q_i)\log(p_i/q_i) \leq \sum_i \log(p_i/q_i)^2$

+ +

The square roots of both bounds are metrics, I suppose, since the bounds are squared Euclidean distances in the probability space and the log-probability space respectively.

+",2013-04-30 17:27:21.667 +45804,17580.0,1,,,,Finding the similarity between two functions,,CC BY-SA 3.0,"

I am a first-year grad student in Computer Science, and I need some help with a problem that I think is statistically oriented. I have taken a statistics course, but it was abysmal and I haven't had time to rectify that. But anyway, my problem stems from a project I'm working on involving genetic programming, where I'm randomly generating functions. Please bear with my description, as it's been a while since I've had a formal theory course too.

+ +

I have two continuous (but not onto) functions F and G, both of which map N variables to a single output. The domain of the input variables is the integers between -100 and 100. The range of the output is the Real numbers. I want to find some statistical measure of how ""similar"" the two functions are; given the finite inputs (of which there will be 201^N possible), how much variance(?) there is between the two functions outputs. Two identical functions should return no variance, and two wildly different functions should return a high variance.

+ +

Since N will typically be greater than 6, I can't iterate through all the possible inputs and compare the outputs, so I figured I could take some sampling at regular intervals (e.g. every multiple of 10, so that it's only 10^N). But here's about where I realize I have no idea what I'm doing. How do I determine if two numbers are ""highly variant"" from each other? What sample size do I need to use to have confidence in my results?

+ +

My current approach is to compare the functions with a two-sided Kolmogorov-Smirnov Test. Since that test doesn't seem to scale well to multi-variate problems, I've taken advantage of my limited domains to just treat the problem as having a single variable by concatenating my variables. So the first value of the variable is (-100:100:100:100:100:100), the second is (-100:100:100:100:100:099), and the last is (100:100:100:100:100:100). Does that even make sense?
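One cheap alternative to the fixed grid, sketched below under the assumption that F and G are R functions taking a length-N integer vector, is to draw a large random sample of inputs and compare the paired outputs directly, and/or their distributions:

set.seed(1)
n <- 10000; N <- 6
X  <- matrix(sample(-100:100, n * N, replace = TRUE), ncol = N)   # random inputs
fx <- apply(X, 1, F)
gx <- apply(X, 1, G)
mean((fx - gx)^2)   # mean squared discrepancy on the common sample
ks.test(fx, gx)     # two-sample KS test comparing the two output distributions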

+",2013-05-03 21:05:29.093 +46070,17678.0,1,,,,Variance of a time series fitted to an ARIMA model,,CC BY-SA 3.0,"

I think this is a basic question, but maybe I am confusing the concepts.

+ +

Suppose I fit an ARIMA model to a time series using, for example, the function auto.arima() in the R forecast package. The model assumes constant variance. How do I obtain that variance? Is it the variance of the residuals?

+ +

If I use the model for forecasting, I know that it gives me the conditional mean. I'd like to know the (constant) variance as well.

+ +

Thank you.

+ +

Bruno

+ +
+ +

Update 1:

+ +

I added some code below. The variance given by sigma2 isn't close to the one calculated from the fitted values. I'm still wondering if sigma2 is the right option. See figure below for time series plot.

+ +
demand.train <- c(10.06286, 9.56286, 10.51914, 12.39571, 14.72857, 15.89429, 15.89429, 17.06143, 
+              17.72857, 16.56286, 14.23000, 15.39571, 13.06286, 15.39571, 15.39571, 16.56286,
+              16.21765, 15.93449, 14.74856, 14.46465, 15.38132)
+timePoints.train <- c(""Q12006"", ""Q22006"", ""Q32006"", ""Q12007"", ""Q22007"", ""Q32007"", ""Q12008"", ""Q22008"",
+                      ""Q32008"", ""Q12009"", ""Q22009"", ""Q32009"", ""Q12010"", ""Q22010"", ""Q32010"", ""Q12011"",
+                      ""Q22011"", ""Q32011"", ""Q12012"", ""Q22012"", ""Q32012"")
+
+plot(1:length(timePoints.train), demand.train, type=""o"", xaxt=""n"", ylim=c(0, max(demand.train) + 2), 
+     ylab=""Demand"", xlab=""Quadrimestre"")
+
+title(main=""Time Series Demand of Product C"", font.main=4)
+axis(1, at=1:length(timePoints.train), labels=timePoints.train)
+box()
+
+### ARIMA Fit
+library(forecast)
+
+# Time series
+demandts.freq <- 3
+demandts.train <- ts(demand.train, frequency=demandts.freq, start=c(2006, 1))
+
+# Model fitting
+demandts.train.arima <- auto.arima(demandts.train, max.p=10, max.q=10, max.P=10, max.Q=10, max.order=10)
+print(demandts.train.arima)
+summary(demandts.train.arima)
+demandts.train.arima.fit <- fitted(demandts.train.arima)
+
+# Forecast ARIMA (conditional means)
+demandts.arima.forecast <- forecast(demandts.train.arima, h = 3, level=95)
+print(demandts.arima.forecast)
+
+# Constant variance from ARIMA
+demandts.arima.var <- demandts.train.arima$sigma2
+print(demandts.arima.var)
+
+# Variance from fitted values
+print(var(demandts.train.arima.fit))
+
+ +
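
One way to see what sigma2 refers to (a hedged sketch, reusing the objects defined above): sigma2 is the variance of the one-step innovations, so it should be compared with the variance of the residuals rather than with the variance of the fitted values, which mostly reflects the spread of the series itself.

+ +

# sigma2 is the innovation (residual) variance of the fitted ARIMA model
+print(demandts.train.arima$sigma2)
+print(var(residuals(demandts.train.arima)))   # should be of the same order
+
+# By contrast, the variance of the fitted values tracks the spread of the series
+print(var(demandts.train.arima.fit))
+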

+",2013-05-07 19:48:58.193 +46384,15839.0,1,,,,gaussian mixture HMM,,CC BY-SA 3.0,"

What is the difference between a Gaussian HMM and a Gaussian mixture HMM (i.e., the emission distribution is a single Gaussian versus a Gaussian mixture)? I want to know whether they are the same thing, and what the difference means when estimating the parameters using the Baum-Welch algorithm.

+",2013-05-13 01:51:18.740 +46894,18085.0,1,58636.0,,,Imputation variance and explained variance (in vector autoregression),,CC BY-SA 3.0,"

I have a question concerning the coefficients of VAR models used on multiply imputed data (high missingness in some variables: up to 40%). In particular, I would like to know how the coefficients are related to the explained variance.

+ +

I have used vector autoregression on multiply imputed data (m=10) and have then combined the estimated coefficients with Rubin's rules. However, what confuses me is the fact that my imputation variance is quite small in relation to the estimates and variances of the coefficients, but the difference in explained variance between models is huge (17% to 0.04%).

+ +

My idea is that the highest imputation variance across all systems is at the constant (around a third of the variance value, but 3-4 times higher than in the other coefficients) and that this critically affects the explained variance. But that's just a guess.

+ +

I would be very happy if somebody could help me here.

+",2013-05-19 14:17:34.503 +47447,18356.0,1,,,,How to fit a simple count time series INAR(1) model,,CC BY-SA 3.0,"

I am trying to perform a simple time series analysis with count time series data. My data is a sequence of small integer values like 0,1,2 and 3. I learned from various sources that INAR model would be appropriate with such data.

+ +

My question is whether anyone knows R codes for fitting a simple INAR(1) model (regressing time series data on a binary dummy variable).

+ +

Appreciate any assistance.
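
+ +

In case it helps, here is a minimal base-R sketch (no special package; the function name is mine) of moment/Yule-Walker estimation for a Poisson INAR(1): the thinning parameter can be estimated by the lag-1 autocorrelation and the innovation mean by mean(y)*(1-alpha). Adding a regression on a dummy variable would need a more specialised likelihood or package, so treat this only as a starting point.

+ +

inar1.yw <- function(y) {
+  alpha  <- acf(y, lag.max = 1, plot = FALSE)$acf[2]  # thinning parameter estimate
+  lambda <- mean(y) * (1 - alpha)                     # innovation mean estimate
+  c(alpha = alpha, lambda = lambda)
+}
+
+# Example with simulated Poisson INAR(1) counts
+set.seed(1)
+n <- 500; alpha <- 0.4; lambda <- 1
+y <- numeric(n); y[1] <- rpois(1, lambda / (1 - alpha))
+for (t in 2:n) y[t] <- rbinom(1, y[t - 1], alpha) + rpois(1, lambda)
+inar1.yw(y)
+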

+",2013-05-27 18:56:55.533 +47497,18382.0,1,,,,R Code for Yeo-Johnson transformation,,CC BY-SA 3.0,"

I have written code for a Box-Cox transformation (see below). But now I want to do a Yeo-Johnson transformation because datc$plot contains zeros. I tried, but I didn't find a solution.

+ +
lambda.fm1 <- boxcox(datc$plot ~ datc$cond.evlot*datc$cond.dl*datc$version, 
+                     family=""yjPower"")
+lambda.max <- lambda.fm1$x[which.max(lambda.fm1$y)]
+require(car)
+datc$plott <- bcPower(datc$plot, lambda = lambda.max, jacobian.adjusted = FALSE)
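+

+ +

A minimal sketch of one possible route, assuming the car package's powerTransform accepts family=""yjPower"" (as in recent versions of car, which also provides yjPower alongside bcPower); the variable names follow the question and this is illustrative rather than tested on your data.

+ +

require(car)
+# Estimate the Yeo-Johnson lambda for the same model formula as above
+pt <- powerTransform(lm(plot ~ cond.evlot * cond.dl * version, data = datc),
+                     family = ""yjPower"")
+lambda.max <- coef(pt)
+# Apply the Yeo-Johnson transformation (handles zeros, unlike bcPower)
+datc$plott <- yjPower(datc$plot, lambda = lambda.max, jacobian.adjusted = FALSE)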
+
+",2013-05-28 08:54:55.873 +47846,17994.0,1,,,,"Difference between binomial, negative binomial and Poisson regression",,CC BY-SA 3.0,"

I am looking for some information about the differences between binomial, negative binomial and Poisson regression, and for which situations each of these regressions is best suited.

+ +

Are there any tests I can perform in SPSS that can tell me which of these regressions is the best for my situation?

+ +

Also, how do I run a Poisson or negative binomial in SPSS, since there are no options such as I can see in the regression part?

+ +

If you have any useful links I would appreciate it very much.

+",2013-06-02 09:36:07.877 +47981,16990.0,1,57334.0,,,Is a p-value of 0.04993 enough to reject null hypothesis?,,CC BY-SA 3.0,"

In a Wilcoxon signed-ranks statistical significance test, we came across some data that produces a $p$-value of $0.04993$. With a threshold of $p < 0.05$, is this result enough to reject the null hypothesis, or is it safer to say the test was inconclusive, since if we round the p-value to 3 decimal places it becomes $0.050$?

+",2013-06-04 09:21:32.970 +57128,22505.0,1,,,,Pooling regression results in SPSS,,CC BY-SA 3.0,"

I have to solve the following issue:

+ +
  1. I run my linear regression model many times (let's say 1000 times) with two variables: y - continuous dependent variable, x - continuous independent variable (mean of several consequent measurements).
  2. The independent variable in each model was randomly drawn using its mean and standard deviation.
  3. I have the regression coefficient and standard error for this independent variable in each of the models.
+ +

Somehow I have to combine these results into one regression result. As far as I know, the regression coefficients of the 1000 models can simply be averaged. However, it is not clear to me how I can estimate the total variance across the 1000 models.
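
+ +

This is essentially the setting of Rubin's rules for combining repeated estimates. As a minimal R sketch (function and variable names are mine), the total variance combines the average squared standard error (within-run variance) with the variance of the coefficients across runs (between-run variance):

+ +

# b: vector of 1000 regression coefficients, se: vector of their standard errors
+pool.estimates <- function(b, se) {
+  m    <- length(b)
+  qbar <- mean(b)                  # pooled coefficient
+  ubar <- mean(se^2)               # within-run variance
+  bvar <- var(b)                   # between-run variance
+  tvar <- ubar + (1 + 1/m) * bvar  # total variance (Rubin's rules)
+  c(estimate = qbar, se = sqrt(tvar))
+}
+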

+",2013-10-09 10:06:51.963 +48103,11200.0,1,48133.0,,,Fit a sinusoidal term to data,,CC BY-SA 4.0,"

Although I read this post, I still have no idea how to apply this to my own data and hope that someone can help me out.

+

I have the following data:

+
y <- c(11.622967, 12.006081, 11.760928, 12.246830, 12.052126, 
+       12.346154, 12.039262, 12.362163, 12.009269, 11.260743, 
+       10.950483, 10.522091,  9.346292,  7.014578,  6.981853,  
+       7.197708,  7.035624,  6.785289, 7.134426,  8.338514,  
+       8.723832, 10.276473, 10.602792, 11.031908, 11.364901, 
+       11.687638, 11.947783, 12.228909, 11.918379, 12.343574, 
+       12.046851, 12.316508, 12.147746, 12.136446, 11.744371,  
+       8.317413, 8.790837, 10.139807,  7.019035,  7.541484,  
+       7.199672,  9.090377,  7.532161,  8.156842,  9.329572, 
+       9.991522, 10.036448, 10.797905)
+t <- 18:65
+
+

And now I simply want to fit a sine wave

+

$$y(t)=A\cdot \sin(\omega t+\phi)+C.$$

+

with the four unknowns $A$, $\omega$, $\phi$ and $C$ to it.

+

The rest of my code looks like the following:

+
res <- nls(y ~ A*sin(omega*t+phi)+C, data=data.frame(t,y), 
+           start=list(A=1,omega=1,phi=1,C=1))
+co <- coef(res)
+
+fit <- function(x, a, b, c, d) {a*sin(b*x+c)+d}
+
+# Plot result
+plot(x=t, y=y)
+curve(fit(x, a=co["A"], b=co["omega"], c=co["phi"], d=co["C"]), 
+        add=TRUE ,lwd=2, col="steelblue")
+
+

But the result is really poor.
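
+ +

One common reason nls does poorly here is the arbitrary starting values, especially for omega. A minimal sketch (variable names are mine) that derives rough starting guesses from the data before calling nls, using the periodogram for the dominant frequency; the phase may still need tweaking, so this is only a starting point.

+ +

# Rough starting values estimated from the data
+sp     <- spectrum(y, plot = FALSE)            # periodogram
+omega0 <- 2 * pi * sp$freq[which.max(sp$spec)] # dominant angular frequency
+A0     <- sqrt(2) * sd(y)                      # rough amplitude
+C0     <- mean(y)                              # rough offset
+
+res <- nls(y ~ A * sin(omega * t + phi) + C, data = data.frame(t, y),
+           start = list(A = A0, omega = omega0, phi = 0, C = C0))
+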

+

+

I would very much appreciate any help.

+",2013-06-05 18:23:47.483 +48125,18416.0,1,,,,Loss for Kernel Ridge Regression,,CC BY-SA 3.0,"

Is $||Y-X\beta||_2^2 + \lambda\beta^T K\beta$ the standard loss function in kernel ridge regression, or is it different? Also, is the Gaussian kernel a standard choice for the kernel in practice? If not, which kernels are used more often? Also, is $\lambda$ the only parameter to be tuned via cross-validation, or is a kernel parameter such as $\sigma$ in a Gaussian kernel also tuned via cross-validation in practice? Please confirm and/or correct my understanding of kernel ridge regression!
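
+ +

For reference, in the usual dual form kernel ridge regression minimises $\lVert y - K\alpha\rVert_2^2 + \lambda\,\alpha^T K\alpha$, which has the closed-form solution $\alpha = (K+\lambda I)^{-1}y$; in practice both $\lambda$ and the kernel width are typically tuned by cross-validation. A minimal R sketch, purely illustrative (function names are mine):

+ +

rbf.kernel <- function(X1, X2, sigma = 1) {
+  d2 <- as.matrix(dist(rbind(X1, X2)))^2
+  exp(-d2[1:nrow(X1), (nrow(X1) + 1):(nrow(X1) + nrow(X2))] / (2 * sigma^2))
+}
+
+krr.fit <- function(X, y, lambda = 0.1, sigma = 1) {
+  K <- rbf.kernel(X, X, sigma)
+  alpha <- solve(K + lambda * diag(nrow(X)), y)   # (K + lambda I)^{-1} y
+  list(alpha = alpha, X = X, sigma = sigma)
+}
+
+krr.predict <- function(fit, Xnew) {
+  rbf.kernel(Xnew, fit$X, fit$sigma) %*% fit$alpha
+}
+
+# Toy usage
+X <- matrix(seq(-3, 3, length.out = 50)); y <- sin(X) + rnorm(50, sd = 0.1)
+fit <- krr.fit(X, y, lambda = 0.1, sigma = 1)
+plot(X, y); lines(X, krr.predict(fit, X), col = 2)
+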

+",2013-06-06 01:38:15.400 +48597,18905.0,1,,,,"T-test shows no differences, but the experiment group shows tendency more benefit in all variables measured than control group",,CC BY-SA 3.0,"

I've just finished an animal experiment. I compared one control group and one experimental group; the only difference between the two is the type of diet. For the statistical analysis I used the independent-groups t-test, and the result showed no significant differences between the two groups. However, the data show a tendency for the experimental group to benefit more on all variables measured. So, what should I say about my data? All data are normally distributed.

+ +

My supervisor said that maybe because I used a very small sample (n=8 per group) I could not find any significant differences. He suggested that I do some ""probability test"" or something to extrapolate my data (unfortunately, I don't have any clue what he was talking about).

+ +

So, is there any statistical analysis that I can use like what my supervisor told me to do?
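
+ +

It is only a guess, but the supervisor may be referring to a power analysis, i.e. asking what size of difference an n = 8 per group design could realistically detect. A minimal R sketch with made-up effect size values (delta and sd are placeholders to be replaced by plausible numbers for your outcome):

+ +

# Power of a two-sample t-test with n = 8 per group, for an assumed effect size
+power.t.test(n = 8, delta = 1, sd = 1, sig.level = 0.05, type = ""two.sample"")
+
+# Or: sample size per group needed to detect that effect with 80% power
+power.t.test(delta = 1, sd = 1, sig.level = 0.05, power = 0.8, type = ""two.sample"")
+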

+",2013-06-13 04:46:26.887 +48658,1926.0,1,,,,Bayesian network inference using pymc (Beginner's confusion),,CC BY-SA 3.0,"

I am currently taking the PGM course by Daphne Koller on Coursera. In that course, we generally model a Bayesian network as a cause-and-effect directed graph of the variables which are part of the observed data. But in PyMC tutorials and examples I generally see that it is not quite modeled in the same way as the PGM, or at least I am confused. In PyMC the parents of any observed real-world variable are often the parameters of the distribution that you use to model the variable.

+ +

Now my question really is a practical one. Suppose I have 3 variables for which data is observed (A, B, C) (let's assume they are all continuous variables just for the sake of it). From some domain knowledge, one can say that A and B cause C. So we have a BN here: A and B are the parents and C is the child. Now, from the BN factorization, P(A, B, C) = P(C | A, B) * P(A) * P(B).

+ +

I can say A and B are some normal distributions with some mu and sigma, but how do I model P(C | A, B)? The general idea I want to learn is how to learn this BN using PyMC so that I can query it. Or do I have to augment the BN with the parameters of the model in some fashion?

+ +

Is this problem solvable using pymc? or have I got some fundamentals wrong?

+ +

Any help would be appreciated!

+",2013-06-13 19:42:13.623 +49879,19492.0,1,53471.0,,,What is an adaptive copula?,,CC BY-SA 3.0,"

My basic question is: What is an adaptive copula?

+

I have slides from a presentation (unfortunately, I cannot ask the author of the slides) about adaptive copulae, and I do not understand what this means or what it is good for.

+

Here are the slides. The slides then continue with a change-point test. I am wondering what this is about and why I need it in connection with copulae.

+

The slides end with an adaptively estimated parameter plot.

+

This seems to show that my estimates lag behind. Any other interpretations or comments would be great!

+",2013-07-03 11:48:24.817 +51644,3733.0,1,,,,Adding random effect influences coefficient estimates,,CC BY-SA 3.0,"

I have always been taught that random effects only influence the variance (error), and that fixed effects only influence the mean. But I have found an example where random effects also influence the mean - the coefficient estimate:

+ +
require(nlme)
+set.seed(128)
+n <- 100
+k <- 5
+cat <- as.factor(rep(1:k, each = n))
+cat_i <- 1:k # intercept per category
+x <- rep(1:n, k)
+sigma <- 0.2
+alpha <- 0.001
+y <- cat_i[cat] + alpha * x + rnorm(n*k, 0, sigma)
+plot(x, y)
+
+# simulate missing data
+y[c(1:(n/2), (n*k-n/2):(n*k))] <- NA
+
+m1 <- lm(y ~ x)
+summary(m1)
+
+m2 <- lm(y ~ cat + x)
+summary(m2)
+
+m3 <- lme(y ~ x, random = ~ 1|cat, na.action = na.omit)
+summary(m3)
+
+ +

You can see that the estimated coefficient for x from model m1 is -0.013780, while from model m3 it is 0.0011713 - both significantly different from zero.

+ +

Note that when I remove the line simulating missing data, the results are the same (it is full matrix).

+ +

Why is that?

+ +

PS: please note I am not a professional statistician, so if you are about to respond with a lot of math then please make also some simple summary for dummies :-)

+",2013-07-24 09:19:26.983 +57137,21896.0,1,57209.0,,,Negative Binomial Regression: is parameter theta (R) the reciprocal of parameter kappa (SAS)?,,CC BY-SA 3.0,"

After some frantic googling I do believe the answer is yes, but I am frustrated that the relation between the two parameters seems to be described explicitly nowhere, so I do it here. (I hope this isn't against the rules of Stack Exchange.)

+ +

This very nice article states: we will denote the random variable Y having a negative binomial distribution as Y ~ NB($\mu, \kappa$) with a parameterization such that E(Y) = $\mu$, var(Y) = $\mu + \kappa \mu^2$.

+ +

I take this latter equation as the definition of $\kappa$.

+ +

Apparently this kappa is implemented in SAS.

+ +

Now turning to R, the function glm.nb in the MASS package contains a parameter $\mu$ which is obviously the same $\mu$ as above and a parameter $\theta$. The question is how $\theta$ and $\kappa$ are related. The documentation for glm.nb only refers to it as an ""additional parameter"". The answers to this and this stackexchange questions directly imply that $\theta = 1/\kappa$, but this question [EDIT: since removed] seems to suggest that $\theta = \kappa$.

+ +

The help page for negative binomial in R is nice and introduces a parameter called size that equals $1/\kappa$. Fitting glm.nb on random data generated by rnbinom for various choices of $\mu$ and size seems to support the thesis that $\theta = 1/\kappa$ (i.e. that $\theta$ = size) but also that for large values of size the estimation is poor.

+ +

Summarizing: I do believe that $\theta = 1/\kappa$, but it would be nice if there were an easily googlable place on the internet stating this explicitly. Maybe one of the answers to this question can serve as such a place?
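
+ +

A minimal simulation sketch along the lines described above (assuming MASS is available); since rnbinom's size parameter gives var = mu + mu^2/size, recovering theta close to size is consistent with $\theta = 1/\kappa$:

+ +

library(MASS)
+set.seed(42)
+mu   <- 5
+size <- 2                      # i.e. kappa = 1/size = 0.5
+y    <- rnbinom(10000, mu = mu, size = size)
+fit  <- glm.nb(y ~ 1)
+fit$theta                      # should be close to 'size', i.e. 1/kappa
+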

+",2013-10-09 12:00:22.943 +49906,5821.0,1,67660.0,,,Relationship between McNemar's test and conditional logistic regression,,CC BY-SA 3.0,"

I am interested in the modeling of binary response data in paired observations. We aim to make inference about the effectiveness of a pre-post intervention in a group, potentially adjusting for several covariates and determining whether there is effect modification by a group that received particularly different training as part of an intervention.

+ +

Given data of the following form:

+ +
id phase resp
+1  pre   1
+1  post  0
+2  pre   0
+2  post  0
+3  pre   1
+3  post  0
+
+ +

And a $2 \times 2$ contingency table of paired response information:

+ +

\begin{array}{cc|cc} & & \mbox{Pre} & \\ & & \mbox{Correct} & \mbox{Incorrect} \\ \hline \mbox{Post} & \mbox{Correct} & a & b \\ & \mbox{Incorrect} & c & d \\ \end{array}

+ +

We're interested in the test of hypothesis: $\mathcal{H}_0: \theta_c = 1$.

+ +

McNemar's Test gives: $Q = \frac{(b-c)^2}{b+c} \sim \chi^2_1$ under $\mathcal{H}_0$ (asymptotically). This is intuitive because, under the null, we would expect an equal proportion of the discordant pairs ($b$ and $c$) to be favoring a positive effect ($b$) or a negative effect ($c$). With the probability of positive case definition defined $p =\frac{b}{b+c}$ and $n=b+c$. The odds of observing a positive discordant pair is $\frac{p}{1-p}=\frac{b}{c}$.

+ +

On the other hand, conditional logistic regression uses a different approach to test the same hypothesis, by maximizing the conditional likelihood:

+ +

$$\mathcal{L}(X ; \beta) = \prod_{j=1}^n \frac{\exp(\beta X_{j,2})}{\exp(\beta X_{j,1}) + \exp(\beta X_{j,2})}$$

+ +

where $\exp(\beta) = \theta_c$.

+ +

So, what's the relationship between these tests? How can one do a simple test of the contingency table presented earlier? Looking at calibration of p-values from clogit and McNemar's approaches under the null, you'd think they were completely unrelated!

+ +
library(survival)
+n <- 100
+do.one <- function(n) {
+  id <- rep(1:n, each=2)
+  ph <- rep(0:1, times=n)
+  rs <- rbinom(n*2, 1, 0.5)
+  c(
+    'pclogit' = coef(summary(clogit(rs ~ ph + strata(id))))[5],
+    'pmctest' = mcnemar.test(table(ph,rs))$p.value
+  )
+}
+
+out <- replicate(1000, do.one(n))
+plot(t(out), main='Calibration plot of pvalues for McNemar and Clogit tests', 
+  xlab='p-value McNemar', ylab='p-value conditional logistic regression')
+
+ +

+",2013-07-03 17:50:20.467 +50739,10492.0,1,,,,Linear Regression and ANOVA,,CC BY-SA 3.0,"

I found two very useful posts about the difference between linear regression analysis and ANOVA and how to visualise them:

+ +

Why is ANOVA taught / used as if it is a different research methodology compared to linear regression?

+ +

How to visualize what ANOVA does?

+ +

As stated in the first post, to test whether the average height of males and females is the same, you can use a regression model ($y = \alpha + \beta x + \epsilon$, where $y$ denotes height and $x$ denotes gender) and test whether $\beta = 0$. If $\beta = 0$, then there is no difference in height between males and females. However, I am not quite sure how this is tested when you have three groups. Imagine the following example:

+ +
height (y) -  group (x)
+5          -  A
+6          -  A
+7          -  A
+6          -  A
+30         -  B
+32         -  B
+34         -  B
+33         -  B 
+20         -  C
+19         -  C
+21         -  C
+22         -  C
+
+ +

The regression model would look like:

+ +

$$y = a+ b x + \epsilon$$

+ +

I quickly visualized the data (see image below)

+ +

The way I understood the regression model is that it would now test whether any of the three slopes (AB, AC or BC) is significantly different from 0. If that's the case, one can conclude, as in an ANOVA, that there is at least one group in which height is significantly different from one or more other groups. Afterwards, one could of course use a post-hoc test to find out which of the groups really differ. Is my understanding of how the regression model tests this hypothesis correct?
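
+ +

A minimal R sketch of how this is usually set up, using the toy data above: with a three-level factor the model gets two dummy variables, and the overall F-test from anova() asks whether any group mean differs, before any post-hoc comparisons.

+ +

y <- c(5, 6, 7, 6, 30, 32, 34, 33, 20, 19, 21, 22)
+x <- factor(rep(c(""A"", ""B"", ""C""), each = 4))
+
+fit <- lm(y ~ x)      # intercept = mean of A; xB, xC = differences from A
+summary(fit)          # t-tests for the individual dummy coefficients
+anova(fit)            # overall F-test: do the three group means differ at all?
+
+pairwise.t.test(y, x) # one simple post-hoc option
+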

+ +

+",2013-07-14 10:14:44.800 +50982,1790.0,1,57600.0,,,Comparing distributions of generalization performance,,CC BY-SA 3.0,"

Say that I have two learning methods for a classification problem, $A$ and $B$, and that I estimate their generalization performance with something like repeated cross validation or bootstrapping. From this process I get a distribution of scores $P_A$ and $P_B$ for each method across these repetitions (e.g. the distribution of ROC AUC values for each model).

+ +

Looking at these distributions, it could be that $\mu_A \ge \mu_B$ but that $\sigma_A \ge \sigma_B$ (i.e. the expected generalization performance of $A$ is higher than $B$, but that there is more uncertainty about this estimation).

+ +

I think this is called the bias-variance dilemma in regression.

+ +

What mathematical methods can I use to compare $P_A$ and $P_B$ and eventually make an informed decision about which model to use?

+ +

Note: For the sake of simplicity, I am referring to two methods $A$ and $B$ here, but I am interested in methods that can be used to compare the distribution of scores of ~1000 learning methods (e.g. from a grid search) and eventually make a final decision about which model to use.

+",2013-07-17 13:51:44.960 +51047,17056.0,1,57714.0,,,"Confused with MCMC Metropolis-Hastings variations: Random-Walk, Non-Random-Walk, Independent, Metropolis",,CC BY-SA 3.0,"

Over the past few weeks I have been trying to understand MCMC and the Metropolis-Hastings algorithm(s). Every time I think I understand it I realise that I am wrong. Most of the code examples I find on-line implement something that is not consistent with the description. i.e.: They say they implement Metropolis-Hastings but they actually implement random-walk metropolis. Others (almost always) silently skip the implementation of the Hastings correction ratio because they are using a symmetric proposal distribution. Actually, I haven't found a single simple example that calculates the ratio so far. That makes me even more confused. Can someone give me code examples (in any language) of the following:

+ +
  • Vanilla Non-Random Walk Metropolis-Hastings Algorithm with Hastings correction ratio calculation (even if this will end up being 1 when using a symmetric proposal distribution).
  • Vanilla Random Walk Metropolis-Hastings algorithm.
  • Vanilla Independent Metropolis-Hastings algorithm.
+ +

No need to provide the plain Metropolis algorithms, because if I am not mistaken the only difference between Metropolis and Metropolis-Hastings is that the former always samples from a symmetric proposal distribution and thus has no Hastings correction ratio. No need to give a detailed explanation of the algorithms. I do understand the basics, but I am somewhat confused by all the different names for the different variations of the Metropolis-Hastings algorithm, and also by how you practically implement the Hastings correction ratio in the vanilla non-random-walk MH. Please don't copy-paste links that partially answer my questions, because most likely I have already seen them. Those links led me to this confusion. Thank you.
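
+ +

For concreteness, here is a minimal, self-contained R sketch of an independence Metropolis-Hastings sampler in which the Hastings correction q(current)/q(proposal) is computed explicitly; the target and proposal are arbitrary choices of mine, purely for illustration.

+ +

set.seed(1)
+log.target   <- function(x) dgamma(x, shape = 3, rate = 1, log = TRUE)  # target density p(x)
+log.proposal <- function(x) dexp(x, rate = 0.5, log = TRUE)             # independence proposal q(x)
+
+n.iter <- 10000
+x <- numeric(n.iter)
+x[1] <- 1
+for (i in 2:n.iter) {
+  xprop <- rexp(1, rate = 0.5)                     # draw from q, independent of the current state
+  # log acceptance ratio: [p(x') q(x)] / [p(x) q(x')]
+  log.r <- (log.target(xprop) + log.proposal(x[i - 1])) -
+           (log.target(x[i - 1]) + log.proposal(xprop))
+  x[i] <- if (log(runif(1)) < log.r) xprop else x[i - 1]
+}
+mean(x)   # should be near the target mean, here 3
+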

+",2013-07-18 06:53:11.527 +51496,19870.0,1,,,,Should CRT decision tree node be mutually exclusive?,,CC BY-SA 3.0,"

I have been trying to understand the results of a CRT decision tree; my question is whether the terminal nodes should be mutually exclusive. I am asking this because, reading the terminal nodes, some variables seem to overlap each other.

+ +

For instance some terminal nodes ""share"" the same profession:

+ +

Node 23: carpenter, plumber, sole trader, truck driver

+ +

Node 24: plumber, truck driver, teacher, retired.

+ +

Probably I am reading the results incorrectly because it should not happen, at least in theory.

+",2013-07-23 03:50:02.683 +51577,20097.0,1,,,,LIBSVM parameter search in time series,,CC BY-SA 3.0,"

I am trying to predict values for regression in LIBSVM. My data is a time series. I use the gridregression.m file in LIBSVM to find the optimal parameters c, g and p. The gridregression.m file uses cross-validation to find optimal parameters, but is it OK to use cross-validation with time series?

+ +

When I use the parameters from gridregression.m, sometimes the MSE is not better than with the default values (cmd = '-s 3 -t 2' is sometimes better).

+",2013-07-23 19:27:17.943 +51895,9384.0,1,,,,SMOTE throws error for multi class imbalance problem,,CC BY-SA 3.0,"

I am trying to use SMOTE to correct the imbalance in my multi-class classification problem. Although SMOTE works perfectly on the iris dataset as per the SMOTE help document, it does not work on a similar dataset. Here is how my data looks. Note it has three classes with values 1, 2, 3.

+ +
> data
+   looking risk every status
+1        0    1     0      1
+2        0    0     0      1
+3        0    0     0      2
+4        0    0     0      1
+5        0    0     0      1
+6        3    0     0      1
+7        0    0     0      1
+8        0    0     0      1
+9        0    1     0      1
+10       0    0     0      1
+11       0    0     0      3
+12       0    0     0      1
+13       0    0     0      1
+14       0    0     0      1
+15       0    0     0      2
+
+ +

It is in the form of dataframe, same as iris:

+ +
> class(data)
+[1] ""data.frame""
+
+ +

Here is my code using SMOTE and the error that it throws:

+ +
> newData <- SMOTE(status ~ ., data, perc.over = 600,perc.under=100)
+Error in scale.default(T, T[i, ], ranges) : subscript out of bounds
+In addition: Warning messages:
+1: In FUN(newX[, i], ...) :
+  no non-missing arguments to max; returning -Inf
+2: In FUN(newX[, i], ...) :
+  no non-missing arguments to max; returning -Inf
+3: In FUN(newX[, i], ...) :
+  no non-missing arguments to max; returning -Inf
+4: In FUN(newX[, i], ...) : no non-missing arguments to min; returning Inf
+5: In FUN(newX[, i], ...) : no non-missing arguments to min; returning Inf
+6: In FUN(newX[, i], ...) : no non-missing arguments to min; returning Inf
+
+",2013-07-26 19:31:50.407 +52099,20363.0,1,,,,Multinomial likelihood for large number of groups,,CC BY-SA 3.0,"

I am trying to investigate the following problem using multinomial likelihoods and could really do with some advice regarding its appropriateness and implementation in R.

+ +

A sequence is generated by selecting with replacement from a bag of n differently coloured balls and consists of the number of occurrences of each colour in the selection (i.e. each sequence is a vector of length n with each element a count corresponding to the number of occurrences of a particular colour in the sequence). The process is then repeated a number of times to generate a group of unique sequences (duplicate sequences are rejected).

+ +

If a single sequence is selected at random as the test subject and a multinomial model is generated for each of the other sequences, using the colour count proportions as probabilities, can the likelihood be calculated for each multinomial model in the group using the test sequence as the data and would the greatest likelihood indicate the most alike sequence from the group?

+ +

I have tried implementing this in R but am struggling with a couple of points.

+ +
  1. Calculating the likelihood fails if the number of colours is large since the factorial term falls out of bounds.
  2. If the number of occurrences of each colour relative to the total number of colours is small then the probability is small and the product of the $p^x$ terms tends to zero.
+ +

I hope this makes sense and somebody is able to offer some advice.
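
+ +

Regarding the two numerical points above, both the factorial overflow and the underflow of the product usually disappear if everything is kept on the log scale. A minimal R sketch (names are mine) that scores a test count vector against each candidate model via log-likelihoods:

+ +

# counts: matrix of colour counts, one row per sequence; test: the test count vector
+loglik.vs.models <- function(test, counts) {
+  apply(counts, 1, function(row) {
+    p <- row / sum(row)                      # colour proportions of this sequence
+    dmultinom(test, prob = p, log = TRUE)    # log scale avoids overflow/underflow
+  })
+}
+# which.max(loglik.vs.models(test, counts)) would then give the closest model
+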

+",2013-07-29 19:04:43.263 +52126,20367.0,1,,,,How to evaluate Likert scale data changes over multiple surveys of the same group?,,CC BY-SA 3.0,"

I have five surveys of the same group of students over a semester. Each survey uses a 5-point Likert scale. The first and last survey contain some questions dealing with the beginning and end of the class (first impressions, final impressions), but most of the questions are identical for all four or five of the surveys.

+ +

I want to evaluate the statistical significance of changes to students' responses over time. Unfortunately statistics is not my strong suit. I know of the t-test, but that seems to only be applicable to two groups of data (please correct me if I'm wrong). How should I go about evaluating this data? Is a repeated measures one-way ANOVA appropriate?

+",2013-07-30 00:44:55.017 +52449,18845.0,1,57768.0,,,Definition of autocorrelation time (for effective sample size),,CC BY-SA 3.0,"

I've found two definitions in the literature for the autocorrelation time of a weakly stationary time series:

+ +

$$\tau_a = 1+2\sum_{k=1}^\infty \rho_k \quad \text{versus} \quad \tau_b = 1+2\sum_{k=1}^\infty \left|\rho_k\right|$$

+ +

where $\rho_k = \frac{\text{Cov}[X_t,X_{t+k}]}{\text{Var}[X_t]}$ is the autocorrelation at lag $k$.

+ +

One application of the autocorrelation time is to find the ""effective sample size"": if you have $n$ observations of a time series, and you know its autocorrelation time $\tau$, then you can pretend that you have

+ +

$$n_\text{eff} = \frac{n}{\tau}$$

+ +

independent samples instead of $n$ correlated ones for the purposes of finding the mean. Estimating $\tau$ from data is non-trivial, but there are a few ways of doing it (see Thompson 2010).

+ +

The definition without absolute values, $\tau_a$, seems more common in the literature; but it admits the possibility of $\tau_a<1$. Using R and the ""coda"" package:

+ +
require(coda)
+ts.uncorr <- arima.sim(model=list(),n=10000)         # white noise 
+ts.corr <- arima.sim(model=list(ar=-0.5),n=10000)    # AR(1)
+effectiveSize(ts.uncorr)                             # Sanity check
+    # result should be close to 10000
+effectiveSize(ts.corr)
+    # result is in the neighborhood of 30000... ???
+
+ +

The ""effectiveSize"" function in ""coda"" uses a definition of the autocorrelation time equivalent to $\tau_a$, above. There are some other R packages out there that compute effective sample size or autocorrelation time, and all the ones I've tried give results consistent with this: that an AR(1) process with a negative AR coefficient has more effective samples than the correlated time series. This seems strange.

+ +

Obviously, this can never happen in the $\tau_b$ definition of autocorrelation time.

+ +

What is the correct definition of autocorrelation time? Is there something wrong with my understanding of effective sample sizes? The $n_\text{eff} > n$ result shown above seems like it must be wrong... what's going on?

+",2013-08-02 14:46:27.663 +52567,728.0,1,,,,"Meaning of ""design"" in design matrix?",,CC BY-SA 3.0,"

In linear regression, $Y= X\beta$, why is $X$ called the design matrix? Can $X$ be designed or constructed arbitrarily to some degree as in art?

+",2013-08-04 18:26:28.673 +52871,18447.0,1,,,,Trap 66 in WinBUGS in a hierarchical Bayesian modeling,,CC BY-SA 3.0,"

I want to analyze a multilevel multidimensional model in WinBUGS. The model is given below (N=2362 students responding to K=45 items of a test; students are nested within J=116 schools):

+ +
model{
+#responses
+for(i in 1:N){
+    for(j in 1:K){
+        logit(p[i,j])<- a1[j]*th[i,1]+a2[j]*th[i,2]-b[j]
+        y[i,j]~dbern(p[i,j] )
+    }
+    th[i,1:2]~dmnorm(mu[sc[i],1:2],tau.p[1:2,1:2])
+}
+#school level
+for(j in 1:J){  
+    mu[j,1:2]~dmnorm(m[j,1:2],tau.s[1:2,1:2])
+}    
+
+#priors
+for(j in 1:J){
+    m[j,1:2]~dmnorm(m0[1:2],cov[1:2,1:2])
+}
+
+tau.p[1:2,1:2]~dwish(cov[1:2,1:2],2)
+tau.s[1:2,1:2]~dwish(cov[1:2,1:2],2)
+sigma.p[1:2,1:2]<-inverse(tau.p[,])
+sigma.s[1:2,1:2]<-inverse(tau.s[,])
+s2p<-sum(sigma.p[,])
+s2s<-sum(sigma.s[,])
+rho<-(s2s)/(s2s+s2p)
+
+a1[1]~dlnorm(0,4)
+a2[1]<-0
+b[1]~dnorm(0,1)
+for(s in 2:K) {
+    a1[s]~dlnorm(0,4)
+    a2[s]~dlnorm(0,4)
+    b[s]~dnorm(0,1)
+}    
+}
+
+ +

I've set these functions as initial values:

+ +
ini<-function(){
+list(tau.p=matrix(rgamma(4,100,100),2,2),
+tau.s=matrix(rgamma(4,100,100),2,2),
+th=rmvnorm(N,mean=c(0,0),sigma=diag(2)),
+m=rmvnorm(J,mean=c(0,0),sigma=diag(2)),
+mu=rmvnorm(J,mean=c(0,0),sigma=diag(2)),
+a1=rlnorm(K,0, 0.4),
+a2=c(NA,rlnorm(K-1,0, 0.4)),
+b=rnorm(45,0,0.5))
+}
+
+ +

I use the rube package in R to check and run my analysis, and everything looks fine. When I run the model I receive ""Trap 66 (postcondition violated)"" or ""undefined real result"". I think the problem comes from the initial values, but I have no idea how to solve it.

+ +

Any idea?

+",2013-08-09 03:06:43.727 +52910,3731.0,1,57224.0,,,What are typically encountered condition numbers in social science?,,CC BY-SA 3.0,"

As part of my thesis, I'm proving (or attempting to prove...) a few asymptotic results. Because these results depend on the condition number, I'd like to have some idea about the typical sizes of the condition numbers that crop up in social science research. That way, I can give some guidance about how large the sample size has to be before we reach the happy land of asymptopia.

+ +

I'd be happy for any guidance.

+ +

My very specific setup is as follows. For the standard Generalized Least Squares (GLS) model

+ +

$$Y = X\beta + e \quad \quad \quad e \sim N(0, V\sigma^2) $$

+ +

where $V$ is assumed to be known and positive definite, we define

+ +

$$ X^- = (X^\top X)^{-1} X^\top \quad \quad \quad U = (I-XX^-)V$$ and the condition number $\kappa$

+ +

$$ \kappa = \frac{ \lambda_{\text{max}} }{ \lambda_{\text{min}} } $$

+ +

where the $\lambda_\star$ values are the maximum and minimum eigenvalues of the matrix $U$.

+ +

Does anyone have pointers to references for the sizes of condition numbers in social science research? I don't even know where to look. Any pointers for either

+ +
  1. OLS estimators (used incorrectly in a GLS context as posed above)
  2. GLS estimators (correctly analyzed)
  3. REML/ML estimators where $V$ is estimated and then conditioned upon, or
  4. OLS fixed effect only models where $V$ is the identity matrix
+ +

would be most welcome!

+",2013-08-09 13:48:43.087 +53261,449.0,1,53264.0,,,What is the $\mu^2$ squared effect size?,,CC BY-SA 3.0,"

I was recently looking at a paper in the journal Psychological Science and came across this:

+ +

F(1, 71) = 4.5, p = .037, $\mu^2$ = .06

+ +

F(1, 71) = 0.08, p = .78, $\mu^2$ = .001

+ +

I was wondering what the $\mu^2$ is in the above. Typically in APA the third thing should be either the MSE or it should be a standardized effect size (or you should have all 4). I'm guessing it's a standardized effect size of some sort but I'm not familiar with it and searching the net has turned up nothing. The actual effect, as near as I can tell from the graph, is about 12 for the first one.

+ +

Is this an effect size I haven't heard of yet or a typo in the article?

+ +

Farrelly, D., Slater, R., Elliott, H. R., Walden, H. R. and Wetherell, M. A. (2013) Competitors Who Choose to Be Red Have Higher Testosterone Levels. Psychological Science, DOI:10.1177/0956797613482945

+ +

Here's a screen shot of the text (p.2)

+ +

+",2013-08-14 03:06:30.697 +53384,20838.0,1,58388.0,,,Bootstrapping residuals: Am I doing it right?,,CC BY-SA 4.0,"

First of all: from what I understand, bootstrapping residuals works as follows:

+ +
  1. Fit model to data
  2. Calculate the residuals
  3. Resample the residuals and add them to 1.
  4. Fit model to new dataset from 3.
  5. Repeat n times, but always add the resampled residuals to the fit from 1.
+ +

Is that correct so far?
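
+ +

For reference, here is a minimal R sketch of the residual bootstrap as listed above, using a simple linear model as a stand-in for the algorithm (everything here is illustrative):

+ +

set.seed(1)
+x    <- 1:50
+y    <- 2 + 0.5 * x + rnorm(50)
+fit  <- lm(y ~ x)              # step 1: fit model to data
+res  <- residuals(fit)         # step 2: residuals
+yhat <- fitted(fit)
+
+B <- 1000
+boot.coef <- replicate(B, {
+  ystar <- yhat + sample(res, replace = TRUE)  # step 3: resample residuals, add to the fit
+  coef(lm(ystar ~ x))                          # step 4: refit on the new dataset
+})
+apply(boot.coef, 1, sd)        # step 5 summary: bootstrap standard errors of the coefficients
+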

+ +
+ +

What I want to do is something slightly different:

+ +

I want to estimate parameter and prediction uncertainty for an algorithm that estimates some environmental variable.

+ +

What I have is an error-free time series (from a simulation) of that variable, x_true, to which I add some noise, x_noise, in order to generate a synthetic dataset x. I then try to find optimal parameters by fitting my algorithm with the sum of squares sum((x_estimate - x_true)^2) (! not x_estimate - x !) as the objective function. In order to see how my algorithm performs and to create samples of my parameters' distributions, I want to resample x_noise, add it to x_true, fit my model again, rinse and repeat. Is that a valid approach to assess parameter uncertainty? Can I interpret the fits to the bootstrapped datasets as prediction uncertainty, or do I have to follow the procedure I posted above?

+ +

/edit: I think I haven't really made clear what my model does. Think of it as essentially something like a de-noising method. It's not a predictive model, it's an algorithm that tries to extract the underlying signal of a noisy time-series of environmental data.

+ +

/edit^2: For the MATLAB-Users out there, I wrote down some quick & dirty linear regression example of what I mean.

+ +

This is what I believe ""ordinary"" bootstrapping of residuals is (please correct me if I'm wrong): http://pastebin.com/C0CJp3d1

+ +

This is what I want to do: http://pastebin.com/mbapsz4c

+",2013-08-15 21:35:20.763 +53391,20312.0,1,,,,How do I calculate random baseline?,,CC BY-SA 3.0,"

I am a bit confused as to how to calculate random baseline. If I understand correctly the random baseline is calculated by adding up the squared probabilities of all the classes. The random baseline classifier thus picks a class at random, instead of choosing the most frequent one.

+ +

I have 7 classes, each with a certain number of items, and a total of X items. How do I find the probabilities?
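
+ +

A minimal worked sketch in R, under the stated assumption that the baseline is the accuracy of a classifier that picks class i with probability equal to its class proportion (the counts below are hypothetical placeholders):

+ +

counts <- c(120, 80, 60, 40, 30, 20, 10)   # hypothetical items per class; replace with yours
+p      <- counts / sum(counts)             # class probabilities p_i = n_i / X
+sum(p^2)                                   # random baseline accuracy = sum of squared probabilities
+max(p)                                     # for comparison: majority-class baseline
+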

+",2013-08-15 23:38:00.327 +53404,20820.0,1,,,,Why do we use a one-tailed test F-test in analysis of variance (ANOVA)?,,CC BY-SA 3.0,"

Can you give the reason for using a one tailed test in the analysis of variance test?

+ +

Why do we use a one-tail test - the F-test - in ANOVA?

+",2013-08-16 06:36:58.590 +53439,5208.0,1,,,,How would you frame this as a machine learning problem?,,CC BY-SA 3.0,"

I have a trading software that buys and sells loans. There's an auction site where borrowers ask for some money and lenders bid on them until the borrower is fully funded and the auction ends. There's lots of information on each loan request. My trading bot always bids at the highest possible interest rate, if it is outbid, then it just re-bids slightly lower. Once I win the loan parts, I can sell them at a markup. Right now, I sell at the minimum markup, so that with fees I barely make a profit.

+ +

What I'm not sure about is what markup I should sell at. The lower the markup, the faster my loan parts sell, but I will make less profit too. Which loans should I bid on? Should I bid on a loan auction with a higher interest rate, but which is not going to end for several days, thereby leaving my money idle, or should I bid on an auction with a lower interest rate, but which is going to end very soon? Sometimes in the former case, the borrower might decide to take the loan and not wait until the end of the auction, so I could secure a better interest rate than by just bidding on the loan auction due to end soon.

+ +

I was thinking of framing this problem as reinforcement learning, but I'm not sure how to do it. My goal is to maximize the profit I make from trading loans. Any ideas?

+",2013-08-16 15:35:05.747 +54234,21204.0,1,,,,Weighting time series coefficients using model's likelihood,,CC BY-SA 3.0,"

I have a question regarding to time series forecasting. In particular I've been working with a Bayesian approach, but I think the question is independent from that.

+ +

I have several time series which are very stable in time, except on specific dates that they have sudden changes. The problem is that if I use a forecasting technique that looks at the past to predict the future, such as ARIMA, the days after the sudden changes have high impact on the forecast.

+ +

Thus, to give a simple example, suppose I'm predicting $x_{t+1} = \sum \beta_j x_j, j<t+1$. I would like to add another weight which accounts for the probability of $x_j$, something like $x_{t+1} = \sum f(x_j)\beta_j x_j, j<t+1$, where $f(x_j)$ is proportional to $P(x_j)$.

+ +

Thus, a sudden change has low probability and should not contribute to the prediction.

+ +

Does anyone know how to deal with this kind of problem? I'm trying to implement this in a Bayesian model, but I'm not sure how I should do it.

+",2013-08-28 15:56:16.150 +54506,21322.0,1,,,,How to analyse these data?,,CC BY-SA 3.0,"

I am conducting an experiment investigating lineup accuracy and witness confidence.

+ +

A long story short: we want to know what the pattern of false positives, hits and misses on a lineup task are under different lineup conditions and how confidence may vary with/independently of accuracy. Logically, witness confidence may also be affected by the different conditions, and we'd like to know this as well.

+ +

The between subjects variables are: Gender (male, female), ethnicity (Asian, Caucasian), and lineup type (sequential- where people see each lineup member one at a time and make a decision about each one, and simultaneous- where people see all the lineup members and make a decision about whether they see the perpetrator or not)

+ +

The within subjects variables are: Photo type (same vs different photo of the person), lineup ethnicity (Asian vs. Caucasian lineups), confidence (5 levels of a Likert scale from 1 ""not confidence at all"" to 5 ""extremely confident)

+ +

The dependent variable is accuracy in terms of hits, misses and false positives (these could be coded as 0 or 1?) and correct recognition (hits-false positives)

+ +

One of the problems is that we want to know the relationship between confidence and accuracy, which would necessitate that confidence is an independent variable, however we also want to know if the other variables might affect confidence (such as ethnicity or lineup type), so I'm having trouble figuring out the best way to analyse this data.

+ +

Does anyone have any answers for me? Someone suggested maybe logistic regression, but they weren't really sure. I'm really not used to dealing with categorical data, so am in need of help!

+",2013-09-02 04:48:52.793 +54574,11283.0,1,,,,Log-likelihood distance measure validity for clustering,,CC BY-SA 3.0,"

I have calculated log-likelihood distances between 50 sequences according to the Formula (1):

+ +

$$D(X_i,X_j)= \frac{1}{2}\left(\log p(X_i|Mod_j)+\log p(X_j|Mod_i)\right),$$ where $p(X_i|Mod_j)$ is the likelihood of sequence $X_i$ being produced by model $Mod_j$, where $Mod_j$ is a corresponding Markov model of the given $Seq_j$, defined by its Transition Probability Matrix and Start Probabilities Vector. The measure is symmetrical as seen from the definition. To make the measure more ""legible"" and similar to the traditional measures, I compute distance$=(1-D)$ from formula (1). Thus, $D(X_i,X_i) = 0$ and the distance increases if the likelihood decreases.

+ +

Now, I have a 50x50 Distance Matrix.I have run a ""meaningfullness"" check, and it seemed ok for me - i.e. more similar sequences had smaller distance and very different ones had very large distance. The distances seemed to satisfy the triangle inequality. However, I have noticed that:

+ +

1) the shorter sequences seem to be ""closer"" to all other sequences than longer ones. It seems that this distance measure is biased to favor short distances.

+ +

2) I have tried PAM-clustering with the distance matrix by converting my distance matrix to dist object in R by using as.dist(), and my results were very bad, even for 2 clusters or 49 ( max avg.silhouette width produced by R function pam was 0.28). With some numbers of clusters the avg.silhouette widths were even negative.

+ +

I am coming to conclusion that my way of computing medoids is invalid/conceptually wrong. What could be the problem? Can log-likelihood distance matrix be used with medoids clustering at all?

+ +

edit: I am including the heatmap of the distance matrix, where x- and y-axis represent sequences (1 through 50th). It looks strange to me but I cannot pinpoint what exactly doesn't feel right.

+ +

+",2013-09-03 10:48:31.337 +54622,12744.0,1,54624.0,,,"Do ""true"" multi-level models require Bayesian methods?",,CC BY-SA 3.0,"

I've been recently learning about mixed effects models (e.g. via Fitzmaurice, Laird, and Ware 's book Applied Longitudinal Analysis) as well as Bayesian hierarchical models (e.g. via Gelman and Hill's book Data Analysis Using Regression and Multilevel/Hierarchical Models)

+ +

One curious thing I've noticed: the Bayesian literature tends to emphasize that these models can handle covariates at multiple levels of analysis. For example, if the clustering is by person, and each person is measured in multiple ""trials,"" then the Bayesian hierarchical models can investigate the main effects of covariates at both the subject and trial level, as well as interactions across ""levels.""

+ +

However, I have not seen these kinds of models in the textbooks introducing frequentist methods.

+ +

I'm not sure if this is a coincidence, or an example of where Bayesian methods can do ""more complicated things."" Is it possible to use mixed effects models (e.g. the lme4 or nlme packages in the R statistical software) to investigate interactions of covariates across ""levels"" of analysis?
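
+ +

For concreteness, here is a minimal lme4 sketch of what such a cross-level interaction might look like; the data and variable names are simulated placeholders of mine (trial_covariate varies within subjects, subject_covariate varies between them):

+ +

library(lme4)
+set.seed(1)
+# Simulated example: 30 subjects x 10 trials
+dat <- expand.grid(subject = factor(1:30), trial = 1:10)
+dat$subject_covariate <- rnorm(30)[dat$subject]   # between-subject (level-2) covariate
+dat$trial_covariate   <- rnorm(nrow(dat))         # within-subject (level-1) covariate
+dat$y <- 1 + 0.5 * dat$trial_covariate + 0.3 * dat$subject_covariate +
+  0.2 * dat$trial_covariate * dat$subject_covariate +
+  rnorm(30)[dat$subject] + rnorm(nrow(dat))       # subject random intercept + residual noise
+
+# Cross-level interaction as a fixed effect, random intercept per subject
+fit <- lmer(y ~ trial_covariate * subject_covariate + (1 | subject), data = dat)
+summary(fit)
+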

+",2013-09-03 21:37:13.153 +54637,21382.0,1,58044.0,,,How to get pooled p-values on tests done in multiple imputed datasets?,,CC BY-SA 3.0,"

Using Amelia in R, I obtained multiple imputed datasets. After that, I performed a repeated measures test in SPSS. Now, I want to pool test results. I know that I can use Rubin's rules (implemented through any multiple imputation package in R) to pool means and standard errors, but how do I pool p-values? Is it possible? Is there a function in R to do so? +Thanks in advance.

+",2013-09-04 01:06:26.173 +54724,10957.0,1,60584.0,,,Flexible and inflexible models in machine learning,,CC BY-SA 3.0,"

I came across a simple question on comparing flexible models (i.e. splines) vs. inflexible models (e.g. linear regression) under different scenarios. The question is:

+ +

In general, do we expect the performance of a flexible statistical learning method to perform better or worse than an inflexible method when:

+ +
  1. The number of predictors $p$ is extremely large, and the number of observations $n$ is small?
  2. The variance of the error terms, i.e. $\sigma^2 = \text{Var}(e)$, is extremely high?
+ +

I think for (1), when $n$ is small, inflexible models are better (not sure). For (2), I don't know which model is (relatively) better.

+",2013-09-04 20:24:41.247 +54836,20304.0,1,58209.0,,,Implementing Latent Dirichlet Allocation - notation confusion,,CC BY-SA 3.0,"

I am trying to implement LDA using the collapsed Gibbs sampler from +http://www.uoguelph.ca/~wdarling/research/papers/TM.pdf

+ +

the main algorithm is shown below

+ +

+ +

I'm a bit confused about the notation in the inner-most loop. n_dk refers to the count of the number of words assigned to topic k in document d, however I'm not sure which document d this is referring to. Is it the document that word (from the next outer loop) is in? Furthermore, the paper does not show how to get the hyperparameters alpha and beta. Should these be guessed and then tuned? Furthermore, I don't understand what the W refers to in the inner-most loop (or the beta without the subscript).

+ +

Could anyone enlighten me?

+",2013-09-06 15:56:05.860 +54915,21523.0,1,,,,Implementing Pettitt test in R,,CC BY-SA 3.0,"

I'm trying to implement the Pettitt test in R following papers like this pdf (pp. 5 & 6), or this pdf. But I'm misunderstanding something, because having tested it with some data, I think the output is not correct.

+ +

Here is the code:

+ +
pettitt <- function(x, alpha=0.99) {
+# Pettitt AN. 1979 A non-parametric approach to the change point detection.
+# x is a vector
+# alpha, integer, level of significance
+x <- na.omit(x)
+o <- rank(x)
+s <- c()
+L <- length(x)
+for (i in 1:(L-1)) {
+      s <- c(s, 2*(colSums(as.matrix(o[1:i]))) - (i*(L+1)) )
+}
+vc <- sqrt((-1) * log(alpha) * (L^3 + L^2)/6)
+output <- list(abs(s), vc)
+return(output)
+}
+
+ +

Testing with larain and tempdub dataset from TSA package:

+ +
library(TSA)
+data(larain)
+data(tempdub)
+pettitt(larain)
+[[1]]
+  [1]  78 118 180  76  30  30 144  90 124 148 224 334 314 298 362 444 356 334
+ [19] 300 302 194 121  83  55  45  57  25  95 175 195 193 287 181 231 175 213
+ [37] 301 331 421 345 392 322 282 354 372 274 194 130 188 248 175  97  85 153
+ [55] 105 171 181 189 245 297 401 375 449 557 467 551 594 576 602 490 406 354
+ [73] 262 266 362 248 244 214 208 200 247 147  89  13   9  15  97   5   9  83
+ [91]   3  95 123  63  31  12  44   6  48  34  72 108 208 164 170 282 214 148
+[109] 202 140 104   6 102  86
+
+[[2]]
+[1] 50.69224
+
+> max(pettitt(larain)[[1]])
+[1] 602
+
+pettitt(tempdub)
+[[1]]
+  [1]  83 161 226 235 164  60  80 169 220 219 188  74  57 177 266 281 228 147
+ [19]  19  82 125 140 102  41 100 197 235 254 233 141   1  97 144 153 112  26
+ [37]  73 206 255 258 235 137  28  49  98 101  46  29 149 252 281 274 247 160
+ [55]  43  70 115 126  79  22 157 248 317 328 287 224  96  27  86  79  27  82
+ [73] 225 348 407 406 351 256 125  10  58  77  32  61 200 314 381 386 353 216
+ [91] 124  40  35  70  35  36 173 302 365 386 321 242 131  10  51  38  19 146
+[109] 241 319 342 359 330 223  89  45 113 144 111   2 123 228 280 275 250 177
+[127]  34  50  89 102  59  22 131 248 334 359 302 198  73  46  83 100  73
+
+[[2]]
+[1] 70.96777
+
+> max(pettitt(tempdub)[[1]])
+[1] 407
+
+ +

I don't know if I am missing something about the Pettitt test or if there is an error in my code.

+",2013-09-08 13:37:22.690 +45536,15663.0,2,,45534.0,,,,CC BY-SA 3.0,"

It means that the fine is lower than the compliance cost.

+ +

This is what the Harrington paradox (http://en.wikipedia.org/wiki/Harrington_paradox) shows:

+ +

In the case of rational economic entities, a firm will maximize its profit. This is not what is observed in reality. In theory, if the fine is lower than the compliance cost, a rational entity will not pay the compliance cost. In reality the fine is lower than the compliance cost, but firms do comply.

+ +

This suggests image concerns (or altruism...).

+",2013-04-30 15:36:32.853 +55150,21630.0,1,55182.0,,,Mathematical definition of causality,,CC BY-SA 3.0,"

Let $Y$ and $X$ be random variables. $E(Y|X)$ is the conditional mean of $Y$ given $X$. We say $Y$ is not causally related to $X$ if $E(Y|X)$ does not depend on $X$, which implies it is equal to $E(Y)$. Now, let's go along with this definition of causality for a second. By the law of iterated expectations, $E(XE(Y|X)) = E(E(XY|X)) = E(XY)$. This means that if $E(Y|X)$ does not depend on $X$, if it is equal to $E(Y)$, then $E(X)E(Y) = E(XY)$.

+ +

In other words:

+ +

If $X$ and $Y$ are not causally related, then $X$ and $Y$ are uncorrelated! - This makes no sense and I know this must be wrong. Have I defined causality incorrectly? What have I done wrong?

+ +

In econometrics we generally assume $E(Y|X) = b_0 + b_1X$. So $E(Y|X) = E(Y)$ is equivalent to $b_1 = 0$. The logic applies in this specific scenario too.

+",2013-09-12 01:13:21.980 +55209,20222.0,1,,,,Interpretation of Kolmogorov-Smirnov Critical Value Generated Distributions,,CC BY-SA 3.0,"

As a non-statistician, I need help in interpreting a customer specified two-part reliability requirement that I think involves KS.

+ +

Requirement Part 1

+ +

R[4 years] must be greater than or equal to 0.95 and

+ +

R[8 years] must be greater than or equal to 0.85

+ +

I have plotted the reliability (survival) function of a 2-parameter Weibull distribution that meets the above requirement in Plot A below. The shape parameter is 1.664 and the characteristic life is 23.844 for this distribution.

+ +

+ +

Requirement Part 2

+ +

The confidence level shall be 90% when demonstrating the Part 1 requirement via product life testing.

+ +

It’s the Part 2 that I’m a bit shaky on. On page 8-54 of MIL-HDBK-338B (http://www.sre.org/pubs/Mil-Hdbk-338B.pdf) there is a table showing KS critical “d” values as a function of sample size, N, and significance level, alpha (also note the plot on page 8-57). From this table I took a d value of 0.264 based on a significance value of 0.10 and a sample size of 20. Plot B below shows my result. My interpretation of Plot B is that, after running a life test on 20 samples, if the resulting reliability plot does not fall below the lower boundary shown in Plot B, then we have met the requirements.

+ +

+ +

I have two questions:

+ +
  1. Did I translate the Part 2 requirement properly when I used an alpha of 0.10 to obtain the KS critical value of 0.264? In other words, does a 90% confidence equal a 0.10 significance within the KS context? If not, can someone provide guidance?
  2. How would you interpret Plot B?
+ +

Many thanks.

+ +
+ +

Response to owensmartin's answer

+ +

Thank you for your reply. I'll take your ""your reasoning is not incorrect"" statement as a big vote of confidence. I just have a few items below that I'd appreciate comments from anyone on.

+ +

a. You are right in that my reliability functions are also survival functions. I believe the Nominal curve is simply the complement of the nominal CDF.

+ +

b. Although I haven't computed the statistical power, I'm not that surprised when you say it is ""very low at this sample size"". However our customer is comfortable with the sample size of 20 and by extension the resulting low power.

+ +

c. Our customer is also comfortable with using the referenced Military Handbook and the associated KS critical values shown therein even though they may be approximate. As time permits I'm hoping to learn how to compute these critical values ""exactly"" so as not to have to rely on the handbook values.

+ +

d. To answer your question as to whether I really need the Weibull fit ? The short answer is no as there is nothing particularly ""magical"" about the Weibull distribution. The slightly longer answer is that I'm not sure how else to produce a nominal survival curve that I can then apply the d value of 0.264 to. The nominal curve shown in Plot B was constructed by solving the two simultaneous equations for the shape parameter and characteristic life knowing that R[4] =0.95 and R[8]=0.85 meet the requirements. This is shown in Plot A. I suspect that other commonly used distribution types may not be able to meet both of these requirements simultaneously. But because of the inherent flexibility of the Weibull, it is able.

+ +

e. As for being sure about the 90% confidence: as long as the fitted distribution resulting from the 20-sample life test does not drop below the lower boundary shown in Plot B, would you agree that we can say we are 90% confident that we meet the stated requirements? The only exception I can see to this is if, when fitting the test data to a distribution, the degree of fit is so poor as to call its validity into question. But in solving engineering problems, this is always a concern that needs to be dealt with.

+ +

f. Regarding failed vs didn't fail test results, we normally strive for having each of the 20 samples fail so that we can avoid dealing with the added uncertainty associated with censored or suspended data points. We achieve this via accelerated-life testing methods which essentially compresses time by increasing either the duty cycle and/or the stress value(s).

+ +

Thanks for any further insight into this.

+",2013-09-12 19:12:06.933 +55260,13459.0,1,,,,Which regression tree to use for large data?,,CC BY-SA 3.0,"

I have a dataframe with 2 million rows and approximately 200 columns / features. Approximately 30-40% of the entries are blank. I am trying to find important features for a binary response variable. The predictors may be categorical or continuous.

+ +

I started by applying logistic regression, but having so many missing entries I feel that this is not a good approach, since glm discards all records which have any item blank. So I am now looking to apply tree-based algorithms (rpart or gbm), which are able to handle missing data in a better way.

+ +

Since my data is too big for rpart or gbm, I decided to randomly fetch 10,000 records from the original data, apply rpart on that, and keep building a pool of important variables. However, even these 10,000 records seem to be too much for the rpart algorithm.

+ +

What can I do in this situation? Is there any switch I can use to make it faster? Or is it impossible to apply rpart to my data?

+ +

I am using the following rpart command:

+ +
varimp = rpart(fmla,  dat=tmpData, method = ""class"")$variable.importance
+
+",2013-09-13 15:45:00.397 +55361,227.0,1,85707.0,,,Locomotive problem with various size companies,,CC BY-SA 3.0,"

I'm working through Think Bayes (free here: http://www.greenteapress.com/thinkbayes/) and I'm on exercise 3.1. Here's a summary of the problem:

+ +

""A railroad numbers its locomotives in order 1..N. One day you see a locomotive with the number 60. Estimate how many locomotives the railroad has.""

+ +

This solution is found with the likelihood function and exponential prior like so:

+ +
class Train(Suite):
+  def __init__(self, hypos, alpha=1.0):
+    # Create an exponential prior
+    Pmf.__init__(self)
+    for hypo in hypos:
+      self.Set(hypo, hypo**(-alpha))
+    self.Normalize()
+  def Likelihood(self, data, hypo):
+    if hypo < data:
+      return 0
+    else:
+      return (1.0/hypo)
+
+ +

Conceptually this is saying, if we see a train number larger than one of our hypotheses (1...1000) then every hypothesis that's smaller has a zero chance of being correct. The rest of the hypotheses have a 1/number_of_trains chance of showing us a train with this number.

+ +

In the exercise I'm working on, the author then adds a little extra. The solution above assumes there's only one company. In real life, however, you'd have a mixture of big and small companies (both equally likely to exist). This would mean that you're more likely to see a train from a bigger company, since it would have more trains.

+ +

Now the question is how to reflect this in the likelihood function?

+ +

This isn't Stack Overflow so I'm not really asking for coding help, but instead perhaps just help about how I might think about this problem in terms of a likelihood function.

+",2013-09-15 23:02:42.580 +55436,21778.0,1,,,,Creating a high predictive value classifier,,CC BY-SA 3.0,"

I have a two-class classification problem with n-dimensional data. I would like to train a classifier (preferably but not necessarily linear) with 100% positive predictive value. In other words, I want the model to completely avoid one of the classes. For this application a low-ish sensitivity is OK as long as PPV is ~100%.
+Do you have any suggestions of good techniques to use? +Thank you!
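For reference, one generic way to trade sensitivity for PPV is to fit any probabilistic classifier and then push the decision threshold up on held-out data; a rough R sketch, where train/valid are hypothetical data frames with a 0/1 outcome y and the 0.999 target is arbitrary:

# train/valid: hypothetical data frames with a 0/1 outcome y; any probabilistic classifier would do
fit <- glm(y ~ ., data = train, family = binomial)

# Choose the decision threshold on held-out data so that precision (PPV) is ~100%,
# accepting whatever sensitivity is left over.
p          <- predict(fit, newdata = valid, type = ""response"")
thresholds <- sort(unique(p), decreasing = TRUE)
ppv  <- sapply(thresholds, function(t) mean(valid$y[p >= t] == 1))
sens <- sapply(thresholds, function(t) mean(p[valid$y == 1] >= t))
thr  <- max(thresholds[ppv >= 0.999])      # relax the 0.999 target if nothing reaches it
c(threshold = thr, ppv = ppv[thresholds == thr], sensitivity = sens[thresholds == thr])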

+",2013-09-16 22:19:43.457 +55576,21833.0,1,,,,Generating random numbers based on partial correlation data,,CC BY-SA 3.0,"

I need to generate random numbers based on already existing partial correlation data (not correlation or covariance data). Specifically, a 168*12 matrix based on a 12*12 partial correlation matrix. The idea is to simulate a data matrix that can be used for testing a few components of a project.

+ +

Any help in this regard would be appreciated. I have looked around but have not found any threads that talk about doing this with partial correlation data.

+ +

If someone has ideas about implementation in MATLAB, that would be a bonus!

+ +

Thanks a lot in advance!

+ +

Additions: +Apologies for any ambiguity.

+ +

-What I mean by a partial correlation matrix is a matrix containing the partial correlations, calculated for any pair of variables by partialling out the effect of all the other variables.

+ +

-The goal is: given a matrix of partial correlation values, is there a way I can generate a data set (168*12) that would have these partial correlation values?

+ +

-If there is a method to convert partial correlation to correlation values, that would be appreciated as well.
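On the conversion point: a partial correlation matrix can be mapped back to an ordinary correlation matrix by negating the off-diagonal entries, inverting, and re-standardizing (if I remember correctly, the corpcor package has cor2pcor/pcor2cor helpers for this). Given the correlation matrix, MASS::mvrnorm with empirical = TRUE reproduces it exactly in a 168 x 12 sample. A rough R sketch, where P stands for your 12 x 12 partial correlation matrix and its invertibility is assumed:

library(MASS)

pcor2cor <- function(P) {
  # Negate off-diagonals, set the diagonal to 1, invert, and re-standardize.
  Om <- -P
  diag(Om) <- 1
  cov2cor(solve(Om))
}

R <- pcor2cor(P)                                         # 12 x 12 correlation matrix
X <- mvrnorm(168, mu = rep(0, 12), Sigma = R, empirical = TRUE)
dim(X)                                                   # 168 x 12 simulated data set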

+ +

Thanks again!

+",2013-09-18 12:09:15.653 +55609,21842.0,1,,,,how to forecast daily sale using Excel,,CC BY-SA 4.0,"

I am trying to find a method or formula to forecast meals per day. There are 5 meal types to upload on flights, and sales, wastage and passenger numbers are what I have to consider. The old template is not complete yet and is not very good for forecasting, and I can't think of other formulas or methods to use; I have the sales for the past few months. Can anyone suggest which methods could solve this problem? I am using MS Excel to do the calculations, but if there is another program to suggest, that would be great.

+",2013-09-18 18:04:46.113 +55617,21846.0,1,57825.0,,,Regression with rank order as dependent variable,,CC BY-SA 3.0,"

I have data on 44 firms that have all been ranked by an expert. The ""best"" firm has rank 1, the second best has rank 2, ..., the last one has rank 44. I have a bunch of explanatory variables and would like to explain the rank of the firm on the basis of these variables. My inclination is to use a regression model, but I am concerned that the dependent variable is limited: it can only be a positive discrete number.

+ +

I have thought about ordinal regression, but that seems impossible since I would have as many categories as I have observations.

+ +

What regression models would be possible? (preferably to be run in R)
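One simple option among several: map the ranks to normal scores and fit an ordinary linear model, which sidesteps the 44-category ordinal problem. A sketch in R, where firms is a hypothetical data frame holding rank and the explanatory variables x1..x3:

# Rankit / normal-scores transform of the 1..44 ranks, then ordinary least squares
firms$z <- qnorm((firms$rank - 0.5) / 44)
fit <- lm(z ~ x1 + x2 + x3, data = firms)   # x1..x3 stand in for your explanatory variables
summary(fit)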

+",2013-09-18 19:30:10.233 +55722,21885.0,1,56091.0,,,Looking for a good and complete probability and statistics book,,CC BY-SA 3.0,"

I never had the opportunity to take a stats course in a math faculty. I am looking for a probability theory and statistics book that is complete and self-sufficient. By complete I mean that it contains all the proofs and does not just state results. By self-sufficient I mean that I am not required to read another book to be able to understand it. Of course it can require college-level (math student) calculus and linear algebra.

+ +

I have looked at multiple books and I didn't like any of them.

+ + + +

""Weighing the Odds"" from David Williams is more formal than DeGroot and seems to be complete and self-sufficient. However, I find the style strange. He also invents new terms that only he seems to use. All the stuff that is explained in DeGroot too is explained better there.

+ +

If you know a great book in German that's also fine as I am German.

+",2013-09-19 22:14:08.257 +56273,22126.0,1,,,,Frequency Distribution,,CC BY-SA 3.0,"

I have a question that is very important to me related to the book Basic Statistics for Business and Economics for organizing data into a frequency distribution:

+ +
+

Step 1: Decide on the number of classes. The goal is to use just enough groupings or classes to reveal the shape of the distribution. Some judgment is needed here. A useful recipe to determine the number of classes ($k$) is the ""2 to the $k$ rule"". This guide suggests you select the smallest number ($k$) for the number of classes such that $2^k$ is greater than or equal to the number of observations ($n$): $n \le 2^k$.

+
+ +

I want to know, how can I prove this formula?
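Not a proof, but a quick numerical illustration of how the rule is applied (n = 80 is just an example value):

n <- 80                  # example sample size
k <- ceiling(log2(n))    # smallest k with 2^k >= n
k                        # 7, since 2^6 = 64 < 80 <= 128 = 2^7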

+",2013-09-27 11:50:54.613 +56372,21108.0,1,57285.0,,,Item correlation for recommender system,,CC BY-SA 3.0,"

I just made an implementation of P(A|B)/P(¬A|B) for a ""people who bought this also bought..."" algorithm.

+ +

I'm doing it by

+ +
P(A|B) = count_users(bought_A_and_B)/count_users(bought_A)
+P(¬A|B) = count_users(bought_B_but_not_A)/count_users(did_not_buy_A)
+
+ +

Then dividing the top one by the bottom one I get a score which makes absolute sense, but what kind of correlation am I calculating? What is this method called? Where can I read more about it?

+ +

[EDIT] This is not for use in a production environment; it is just an algorithm that appeared out of the blue in an online course I'm taking, and I was wondering where it might come from. Also, when the number of users who bought item B but not item A is zero, I just skip the pair until I get more data. The same goes when the number of users who bought A is zero.
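To make the computation concrete, here is the same ratio on a small made-up 0/1 users-by-items purchase matrix in R; note that count(bought_A_and_B)/count(bought_A) is the share of A-buyers who also bought B, and the second quantity is the corresponding share among non-A-buyers:

set.seed(42)
M <- matrix(rbinom(200, 1, 0.3), nrow = 20)     # 20 users x 10 items, 0/1 purchases (made up)
A <- 1; B <- 2                                  # two arbitrary item columns

share_B_among_A    <- sum(M[, A] == 1 & M[, B] == 1) / sum(M[, A] == 1)
share_B_among_notA <- sum(M[, A] == 0 & M[, B] == 1) / sum(M[, A] == 0)
score <- share_B_among_A / share_B_among_notA
score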

+",2013-09-28 23:42:17.863 +56445,22189.0,1,,,,"Testing for significance between means, having one normal distributed sample and one non normal distributed",,CC BY-SA 3.0,"

I have the following problem:

+ +

Within an independent-groups 1-factor design I have two independent groups, with a sample size of 20 each. The data of the treatment group is not normally distributed, whereas the data for the control group is (checked with the Shapiro-Wilk normality test). Now I want to check whether the difference between the means of the two groups is significant. What is the appropriate test for this? I think it should be the Wilcoxon Rank Sum and Signed Rank Test, but I am not sure...

+ +

Could anybody please help me?
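For reference, both of the obvious candidates are one-liners in R (treatment and control standing for the two sample vectors); the rank-sum test compares the two distributions/locations rather than the means in a strict sense, while a simple permutation test targets the mean difference directly:

# Wilcoxon rank-sum test for two independent samples (the signed-rank variant is for paired data)
wilcox.test(treatment, control)

# Permutation test of the difference in means, which makes no normality assumption
obs    <- mean(treatment) - mean(control)
pooled <- c(treatment, control)
perm   <- replicate(10000, {
  idx <- sample(length(pooled), length(treatment))
  mean(pooled[idx]) - mean(pooled[-idx])
})
mean(abs(perm) >= abs(obs))            # two-sided permutation p-value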

+",2013-09-30 08:12:44.097 +56580,20190.0,1,58642.0,,,Support Vector Machine(SVM) and log transformation,,CC BY-SA 3.0,"

Why might a log (natural logarithm) transformation improve the results of SVM prediction (regression, eps-SVM)? Is SVM based on an assumption of normal distribution, or something else?

+ +

Update 1: I use the radial basis function kernel.

+",2013-10-01 18:59:27.453 +56684,,1,,,Ben,Significance test for highly skewed Bernoulli distribution,,CC BY-SA 3.0,"

I am working with two highly skewed Bernoulli distributions where 96-99+% of the samples are in the ""false"" category, and the rest are in the ""true"" category (so to speak). I am looking for a two-sided test of the difference of proportions between the two samples. I can often achieve 500+ ""trues"" and tens or hundreds of thousands of ""falses"" in a reasonable time, but I'm not sure whether the approximation to the normal distribution can withstand this extreme skewness.

+ +

I initially thought I might need something non-parametric, but here, I actually know the distribution.

+ +

I have been using a student's t-test, while paying attention to sample size estimation, but past experience has led me to be skeptical of its results. Thanks for your help.
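Since the category counts are known exactly, the comparison can be run directly on the two ""true""/""false"" counts; a minimal R sketch with made-up numbers:

# Made-up counts: c(trues, falses) for each of the two samples
counts <- matrix(c(520, 49480,
                   610, 61390), nrow = 2, byrow = TRUE)

prop.test(counts)     # two-sided test of equal proportions (normal approximation)
fisher.test(counts)   # exact test, unaffected by how extreme the proportions are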

+",2013-10-02 21:14:55.710 +56768,11490.0,1,57271.0,,,Estimating hidden transfers of market share,,CC BY-SA 4.0,"

Suppose we have yearly data representing the market share of three companies, +say A, B and C. In other words, we have observations:

+ +

$$A_t, \; B_t \;\; \text{and} \;\; C_t \;\; \text{where} \;\; A_t+B_t+C_t = 1$$ for $t = 1, \dots, T$.

+ +

Suppose that in year $t$ the market share of company A has changed by $\Delta A_t = A_t - A_{t-1}$. Is there any way of estimating how that change can be sub-divided into market share lost to or acquired from companies B and C? My actual problem includes 5 companies, but I guess that the solution shouldn't change too much.

+",2013-10-03 21:13:50.837 +56780,15280.0,1,56783.0,,,Problem with proof of Conditional expectation as best predictor,,CC BY-SA 4.0,"

I have an issue with the proof of

+
+
+

$E(Y|X) \in \arg \min_{g(X)} E\Big[\big(Y - g(X)\big)^2\Big]$

+
+
+

which very likely reveals a deeper misunderstanding of expectations and conditional expectations.

+

The proof I know goes as follows ( another version of this proof can be found here)

+

\begin{align*}
&\arg \min_{g(X)} E\Big[\big(Y - g(X)\big)^2\Big]\\
=&\arg \min_{g(X)} E \Big[ \big(Y - E(Y|X) + E(Y|X) - g(X)\big)^2\Big]\\
=&\arg \min_{g(X)} E \Big[ \big(Y - E(Y|X)\big)^2 + 2 \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big) + \big(E(Y|X) - g(X)\big)^2\Big]\\
=&\arg \min_{g(X)} E \Big[ 2 \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big) + \big(E(Y|X) - g(X)\big)^2\Big]
\end{align*}

+

The proof then typically continues with an argument showing that $2 E\Big[ \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big)\Big] = 0$, and hence

+

\begin{align*}
\arg \min_{g(X)} E\Big[\big(Y - g(X)\big)^2\Big] = \arg \min_{g(X)} E \Big[\big(E(Y|X) - g(X)\big)^2\Big]
\end{align*}

+

which can be seen to be minimized when $g(X) = E(Y|X)$.
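(For completeness, the usual argument for that cross term uses iterated expectations / taking out what is known:)

\begin{align*}
E\Big[\big(Y - E(Y|X)\big)\big(E(Y|X) - g(X)\big)\Big]
&= E\Big[E\big[\big(Y - E(Y|X)\big)\big(E(Y|X) - g(X)\big)\,\big|\,X\big]\Big]\\
&= E\Big[\big(E(Y|X) - g(X)\big)\,E\big[Y - E(Y|X)\,\big|\,X\big]\Big]\\
&= E\Big[\big(E(Y|X) - g(X)\big)\cdot 0\Big] = 0 .
\end{align*}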

+

My puzzles about the proof are the following:

+
  1. Consider

$E \Big[ 2 \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big) + \big(E(Y|X) - g(X)\big)^2\Big]$.

+
+
+

It seems to me that, independently of any argument showing that the first term is always equal to zero, one can see that setting $g(X) = E(Y|X)$ minimizes the expression as it implies $\big(E(Y|X) - g(X)\big) =0$ and hence

+
+
+

$E \Big[ 2 \big(Y - E(Y|X)\big) \big(E(Y|X) - g(X)\big) + \big(E(Y|X) - g(X)\big)^2\Big] = E( 0 + 0) = 0$.

+
+
+

But if this is true, then one might repeat the proof replacing $E(Y|X)$ by any other function of $X$, say $h(X)$, and get to the conclusion that it is $h(X)$ that minimizes the expression. So there must be something I misunderstand (right?).

+
  2. I have some doubts about the meaning of $E[(Y−g(X))^2]$ in the statement of the problem. How should the notation be interpreted? Does it mean
+
+
+

$E_X[(Y−g(X))^2]$, $E_Y[(Y−g(X))^2]$ or $E_{XY}[(Y−g(X))^2]$?

+
+
+",2013-10-04 00:24:13.043 +56784,594.0,1,57748.0,,,Impact of data-based bin boundaries on a chi-square goodness of fit test?,,CC BY-SA 3.0,"

Leaving aside the obvious issue of the low power of the chi-square in this sort of circumstance, imagine doing a chi-square goodness of test for some density with unspecified parameters, by binning the data.

+ +

For concreteness, let's say an exponential distribution with unknown mean and a sample size of say 100.

+ +

In order to get a reasonable number of expected observations per bin some account would need to be taken of the data (e.g. if we chose to put 6 bins below the mean and 4 above it, that would still be using data-based bin boundaries).

+ +

But this use of bins based on seeing the data would presumably affect the distribution of the test statistic under the null.

+ +

I have seen plenty of discussion about the fact that - if the parameters are estimated by maximum likelihood from the binned data - you lose 1 d.f. per estimated parameter (an issue dating right back to Fisher vs Karl Pearson) - but I don't recall reading anything about finding the bin boundaries themselves based on the data. (If you estimate them from the unbinned data, then with $k$ bins the distribution of the test statistic lies somewhere between a $\chi^2_{k-1}$ and a $\chi^2_{k-1-p}$.)

+ +

Does this data-based choice of bins substantively impact significance level or power? Are there some approaches that matter more than others? If there is much of an effect, is it something that goes away in large samples?
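One way to get a rough feel for the size of the effect is brute-force simulation; a sketch for the exponential example (n = 100, k = 10 equal-probability bins placed using the ML estimate of the mean, and the same data used to compute the statistic), checking how often the nominal $\chi^2_{k-2}$ test rejects under the null:

set.seed(1)
n <- 100; k <- 10; nsim <- 2000
rej <- replicate(nsim, {
  x    <- rexp(n, rate = 1)
  mu   <- mean(x)                                   # ML estimate, used both for the bins and the expected counts
  br   <- qexp(seq(0, 1, length.out = k + 1), rate = 1 / mu)
  obs  <- table(cut(x, breaks = br, include.lowest = TRUE))
  expd <- rep(n / k, k)                             # equal-probability bins under the fitted exponential
  stat <- sum((obs - expd)^2 / expd)
  stat > qchisq(0.95, df = k - 2)                   # nominal chi-square with k - 1 - 1 d.f.
})
mean(rej)                                           # empirical size at a nominal 5% level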

+ +

If it does have a substantive impact, this would seem to make the use of a chi-squared test when parameters are unknown almost useless in many cases (in spite of still being advocated in quite a few texts), unless you had a good a-priori estimate of the parameter.

+ +

Discussion of the issues or pointers to references (preferably with a mention of their conclusions) would be useful.

+ +
+ +

Edit, pretty much an aside to the main question:

+ +

It occurs to me that there are potential solutions for the specific case of the exponential* (and the uniform come to think of it), but I am still interested in the more general issue of the impact choosing bin boundaries.

+ +

* For example, for the exponential, one might use the smallest observation (say it is equal to $m$) to get a very rough idea of where to place the bins (since the smallest observation is exponential with mean $\mu/n$), and then test the remaining $n-1$ differences ($x_i - m$) for exponentiality. Of course that might yield a very poor estimate of $\mu$, and hence poor bin choices, though I suppose one might use the argument recursively in order to take the lowest two or three observations from which to choose reasonable bins and then test the differences of the remaining observations above the largest of those smallest order statistics for exponentiality)

+",2013-10-04 01:48:35.417 +56859,6805.0,1,56860.0,,,What is the difference between descriptive and inferential statistics?,,CC BY-SA 3.0,"

My understanding was that descriptive statistics quantitatively described features of a data sample, while inferential statistics made inferences about the populations from which samples were drawn.

+ +

However, the wikipedia page for statistical inference states:

+ +
+

For the most part, statistical inference makes propositions about + populations, using data drawn from the population of interest via some + form of random sampling.

+
+ +

The ""for the most part"" has made me think I perhaps don't properly understand these concepts. Are there examples of inferential statistics that don't make propositions about populations?

+",2013-10-05 04:59:21.093 +56875,947.0,1,,,,Can you develop an econometrics model for stress test purpose only focusing on 2008-2009 data?,,CC BY-SA 3.0,"

I have become aware that a group at a large corporation is developing an econometrics model to forecast sales of their product. They are using this model solely to estimate sales in specified stress test economic scenarios where they are given what the economic environment will be like, including real GDP contraction, rising unemployment rate, etc... out to 2016. Because of the nature of those scenarios, they think the most proper way to construct this model is to focus solely on the 2008-2009 period capturing the main period of the recent financial crisis. They have monthly data, so that gives them 24 monthly data points. Given that GDP's frequency is really quarterly, on this one variable it gives them only 8 true datapoints. But, they extrapolate it into 24 month observations.

+ +

For the record, if they chose to, they have good internal data going back to 2001 and up to the current period. But, as mentioned they decided to focus instead solely on the 2008-2009 period.

+ +

I will also answer this question, as I have built many such econometric models. And I invite others to debate and rebut my answer... and to post your own better answer.

+",2013-10-05 14:58:52.307 +56911,1506.0,1,57357.0,,,Detecting patterns in residual plot,,CC BY-SA 3.0,"

I wish to automatically (not by visual inspection) detect where large deviations occur in a residual plot from a regression. For example, suppose I have the residual plot below:

+ +

+ +

I want to automatically detect that the observations from about 30 to 35 deviate from a normal residual pattern. Some clues are that the magnitude is quite large and that the residuals do not appear independent in this region. How can I go about this?
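One very simple screen, by no means the only possibility: slide a window over the residuals and flag windows whose mean is too far from zero relative to what independence would allow. A sketch in R, where fit is the regression object and the window width and 3-sigma cutoff are arbitrary choices:

res  <- residuals(fit)          # fit is your fitted regression object (placeholder name)
w    <- 5                       # window width; arbitrary choice
s    <- sd(res)
roll <- sapply(seq_len(length(res) - w + 1),
               function(i) mean(res[i:(i + w - 1)]))
# Under rough independence a window mean has sd about s/sqrt(w); flag windows beyond ~3 of those
flag <- which(abs(roll) > 3 * s / sqrt(w))
flag                            # starting indices of suspicious windows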

+",2013-10-06 06:53:48.957 +56928,16046.0,1,,,,Reference for hierarchical Bayesian modelling,,CC BY-SA 3.0,"

I am currently reading ""Bayesian Data Analysis"" by Gelman et al., and my main goal was to learn about hierarchical modelling in chapter 5. I have read up to chapter 4, and the book is written terribly for the taste of a math student, as it is pretty sketchy and engineering-oriented.

+ +

I decided not to continue with this book, and I would be very grateful if somebody could suggest a reference with a more rigorous approach to the topic.

+",2013-10-06 15:21:17.703 +56955,22423.0,1,57480.0,,,How to combine data from 5 surveys from the same population spanning 10 years,,CC BY-SA 3.0,"

I have results from 5 surveys each 2 years apart and let us assume that no subjects are selected in more than one survey.

+ +

The sampling method used in these surveys is biased, and I have sampling weights calculated (with respect to the population) for each data point in each study.

+ +

The question is, how would I be able to combine the 5 datasets and have the weights recalculated so as to obtain one giant dataset for analysis on this population?

+ +

Also, what should I do if subjects appear in more than one survey?

+ +

Updates/Further Elaboration:

+ +

Thank you @user30523; here is some more information that might be useful:

+ +

Suppose I wish to find out the estimated distribution of height across the population using these 5 datasets.

+ +

In some data, younger people are oversampled because of the location where the surveys are conducted. Let's assume the weights are calculated with respect to their age.

+ +

E.g. assuming 2% of the population are 15 years old, and the survey is conducted at a mall where 15-year-olds make up 5% of all shoppers, then the sampling weight for a subject aged 15 in that survey would be calculated as 0.02 / 0.05 = 0.4. For simplicity, each person in the mall has an equal chance of being surveyed and all participants complied when asked.

+ +

Given that 5 surveys are conducted in 5 different malls and each has their set of weights calculated in the same way, how would I then be able to combine all 5 datasets and recalculate the sampling weights?

+ +

P.S: I'm new to the topic on sampling weights so do correct me if I have made errors in the way I have calculated the weights.

+",2013-10-07 02:44:46.310 +56970,22372.0,1,57858.0,,,How to test a logit model in R?,,CC BY-SA 3.0,"

I'm building a logit model using R and I'm getting 88.9% accuracy (verified using the ROC [in rattle, evaluation tab] on 30% of my 34k dataset).

+ +

What kinds of tests would be worth doing to satisfy myself that it's a good model?
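A few generic checks are easy to script in R; the sketch below uses hypothetical names (dat with a 0/1 outcome y, and model for the fitted glm). Cross-validated log-loss/accuracy and a crude calibration table usually say more than a single in-sample accuracy figure:

# 10-fold cross-validated log-loss and accuracy; dat, its 0/1 outcome y and the formula are placeholders
set.seed(1)
folds <- sample(rep(1:10, length.out = nrow(dat)))
cv <- sapply(1:10, function(k) {
  fit <- glm(y ~ ., data = dat[folds != k, ], family = binomial)
  p   <- predict(fit, newdata = dat[folds == k, ], type = ""response"")
  yk  <- dat$y[folds == k]
  c(logloss  = -mean(yk * log(p) + (1 - yk) * log(1 - p)),
    accuracy = mean((p > 0.5) == yk))
})
rowMeans(cv)

# Crude calibration check on your fitted model: observed event rate by decile of predicted probability
p <- predict(model, type = ""response"")
tapply(dat$y, cut(p, quantile(p, 0:10 / 10), include.lowest = TRUE), mean)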

+",2013-10-07 09:44:35.967 +57012,20773.0,1,,,,How to compare different sensitivity thresholds and detection limits?,,CC BY-SA 3.0,"

I have observations taken with different sensitivity thresholds and minimum detection levels, i.e. Lab A is less sensitive and has a minimum detection level of .2 and Lab B is more sensitive and has a minimum detection level of .02.

+ +

Edit 2: I have taken $N$ samples and have had them processed by two different labs (for stupid political reasons). Both labs send me the results and I discover that Lab A has a minimum detection level of .2 and Lab B has a minimum detection level of .02. See example:

+ +

Each row corresponds to a unique measurement taken by either lab:

+ +
Obs | Lab A | Lab B
+---------------------
+ 1  |  .6   |  NA
+ 2  |  0    |  NA
+ 3  |  NA   |  .53
+ 4  |  .2   |  NA
+ 5  |  NA   |  .07
+
+ +

Edit 2: I would like to be able to use and combine results from both labs, as if they were on the same scale. The problem is that the labs used to process the samples have very different thresholds for detection and have different sensitivity levels.

+ +

I think I would like something like:

+ +
Obs | LabA  | LabB  | NewLab
+----------------------------
+ 1  |  .6   |  NA   |  .64
+ 2  |  0    |  NA   |  .13
+ 3  |  NA   |  .53  |  .53
+ 4  |  .2   |  NA   |  .21
+ 5  |  NA   |  .07  |  .07
+
+ +

What techniques are available to standardize the values such that there is not a large loss of information?

+ +
  1. Obviously, I could take the values from Lab B and replace anything less than .2 with 0 and then round them, but I want to avoid throwing away information if possible.
  2. One person suggested adding random noise to the values of Lab A, but I'm not sure of the benefit of this vs. simply imputing the missing values from Lab B.
+ +

Edit 1: There are no observations for which both Lab A and Lab B values are present; one will always be missing.

+ +

Edit 2: +What can I do to get results from both labs on a similar scale?

+",2013-10-07 20:36:21.153 +57015,22454.0,1,,,,Testing statistical significance in two conditions,,CC BY-SA 3.0,"

I am measuring two unpaired variables $x$ and $y$ in two different conditions ($x$ and $y$ are magnitudes of some special magnetic signals). In the first condition, my hypothesis is that $\bar{x} > \bar{y}$ and in the second condition that $\bar{x} < \bar{y}$. Now that I have $N$ samples from both variables, how can I test whether my hypotheses are true? I am not sure if I can safely assume that $x$ and $y$ are independent of each other. Nor do I know what kind of distributions they are sampled from. The sample size I have is small. I have read several introductions to statistics over the past few days, but never saw a worked-out example for this kind of situation. All help appreciated.

+ +

Edit: Like Michael Mayer wrote, there is a binary grouping variable ""condition"". Sorry for a bit unclear question.

+",2013-10-07 20:50:53.017 +57026,22458.0,1,57333.0,,,Relationship between the kernel and the value of C in SVM's,,CC BY-SA 3.0,"

How exactly does the value of $C$ relate across different kernels that we can use for SVMs? As in, how does it vary when changing the polynomial degree of a kernel or while using a Gaussian kernel?

+",2013-10-08 01:16:28.460 +57053,22475.0,1,57055.0,,,Interpretation of descriptive statistics for dummy variable,,CC BY-SA 3.0,"

How can I describe descriptive statistics for a dummy variable (gender of worker in a shop)? Let's say this is the info that I have:

+ +
mean :         0.47
+median :       0
+max :          1
+min :          0
+std. dev :     0.4998
+skewness :     0.101
+kurtosis :     1.01
+jarque bera : 85.67
+probability :  0
+
+ +

I know that some of the information is useless since it's a dummy variable. So how do I interpret it in words?

+",2013-10-08 12:31:47.380 +57065,22477.0,1,,,,Bootstrapping - do I need to remove outliers first?,,CC BY-SA 3.0,"

We've run a split test of a new product feature and want to measure if the uplift on revenue is significant. Our observations are definitely not normally distributed (most of our users don't spend, and within those that do, it is heavily skewed towards lots of small spenders and a few very big spenders).

+ +

We've decided on using bootstrapping to compare the means, to get round the issue of the data not being normally distributed (side-question: is this a legitimate use of bootstrapping?)

+ +

My question is, do I need to trim outliers from the data set (e.g. the few very big spenders) before I run the bootstrapping, or does that not matter?
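For reference, the basic percentile-bootstrap comparison of the two means looks like this in R, with a and b standing for the per-user revenue vectors of the two arms and no trimming applied (so the big spenders stay in):

set.seed(1)
obs_diff  <- mean(a) - mean(b)
boot_diff <- replicate(10000, mean(sample(a, replace = TRUE)) - mean(sample(b, replace = TRUE)))
quantile(boot_diff, c(0.025, 0.975))   # 95% percentile interval for the difference in means
obs_diff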

+",2013-10-08 14:28:42.660 +57086,22034.0,1,57184.0,,,Ideas for outputting a prediction equation for Random Forests,,CC BY-SA 3.0,"

I've read through the following posts that answered the question I was going to ask:

+ +

Use Random Forest model to make predictions from sensor data

+ +

Decision tree for output prediction

+ +

Here's what I've done so far: I compared Logistic Regression to Random Forests and RF outperformed Logistic. Now the medical researchers I work with want to turn my RF results into a medical diagnostic tool. For example:

+ +

If you are an Asian Male between 25 and 35, have Vitamin D below xx and Blood Pressure above xx, you have a 76% chance of developing disease xxx.

+ +

However, RF doesn't lend itself to simple mathematical equations (see above links). So here's my question: what ideas do you all have for using RF to develop a diagnostic tool (without having to export hundreds of trees).

+ +

Here's a few of my ideas:

+ +
  1. Use RF for variable selection, then use Logistic (using all possible interactions) to make the diagnostic equation.
  2. Somehow aggregate the RF forest into one ""mega-tree"" that somehow averages the node splits across trees.
  3. Similar to #1 and #2, use RF to select variables (say m variables total), then build hundreds of classification trees, all of which use all m variables, then pick the best single tree.
+ +

Any other ideas? Also, doing #1 is easy, but any ideas on how to implement #2 and #3?
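For what it's worth, idea #1 from the list above might look roughly like this (randomForest plus glm; dat and the outcome name disease are placeholders, the top-10 cutoff is arbitrary, and interactions are omitted for brevity):

library(randomForest)

rf  <- randomForest(disease ~ ., data = dat, importance = TRUE)   # disease assumed to be a factor
imp <- importance(rf, type = 1)                                   # mean decrease in accuracy
top <- rownames(imp)[order(imp, decreasing = TRUE)][1:10]         # keep, say, the top 10 variables

f  <- reformulate(top, response = ""disease"")
lr <- glm(f, data = dat, family = binomial)                       # the transparent ""risk equation""
summary(lr)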

+",2013-10-08 18:41:35.193 +57110,22494.0,1,,,,How to remove seasonality from daily electricity demand,,CC BY-SA 3.0,"

I want to remove seasonality from daily electricity demand (a time series). My understanding is that there is weekly seasonality (high demand on Tue and Wed, low demand on Sat and Sun) and annual seasonality (higher demand in winter and lower in summer). I tried to build a model to forecast daily electricity demand in R, and plotted my data as shown below:

+ +

I tried to remove seasonality with the following:

+ +
demand.xts.diff<-diff(demand.xts,lag=1,difference=1)
+demand.xts.diff<-diff(demand.xts,lag=7,difference=1)
+
+ +

I also tried to use lag=365 and lag=366 (I am not sure what lag to use, due to the leap year issue), but none of them successfully removed seasonality. The ACF and PACF are shown below:

+ +

+

+ +

Any advice is appreciated.

+",2013-10-09 03:35:34.333 +57126,22503.0,1,,,,Is bootstrapping the right method for extreme distributions?,,CC BY-SA 3.0,"

I'm wondering whether anyone has an opinion on whether bootstrapping the difference in means is the right method, given that I have a situation with extreme data points. I've decided to use this as I don't think a t-test is appropriate.

+ +

I have about 30k observations per group (3 groups)

+ +

My situation is about spend, and I have extreme outliers: +the outliers aren't quite like an ""income"" distribution. That is, most users (95%+) will spend zero, a subset of users will spend 5 - 10 dollars. some will spend about 20 or 50 dollars and then a select few will spend 500+, with a couple of users spending 5000 or 10000+

+ +

I am trying to test which group brought in the most revenue per user.

+ +

Can anyone offer any advice on which statistical test is best suited?

+",2013-10-09 09:52:16.150 +57156,19822.0,1,57269.0,,,How to perform unsupervised Random Forest classification using Breiman's code?,,CC BY-SA 3.0,"

I am working with Breiman's random forest code (http://stat-www.berkeley.edu/users/breiman/RandomForests/cc_manual.htm#c2) for classification of satellite data (supervised learning). I am using a training and a test dataset, each with a sample size of 2000 and 10 variables. The data is classified into two classes, A and B. In supervised learning mode, the algorithm is performing well with a very low classification error (<2%). Now I want to try unsupervised classification with no class labels in the test data set and see how well the algorithm is able to predict the classes. Is there a way to implement unsupervised classification using Breiman's code? Will the error from this method be higher than with supervised classification? The data and run parameter settings in the algorithm are given below.

+ +

DESCRIBE DATA +1 mdim=10,ntrain=2000,nclass=2,maxcat=1, +1 ntest=2000,labelts=1,labeltr=1,

+ +

SET RUN PARAMETERS +2 mtry0=3,ndsize=1,jbt=500,look=100,lookcls=1, +2 jclasswt=0,mdim2nd=0,mselect=0,

+",2013-10-09 17:22:59.850 +57160,15764.0,1,,,,ARIMA with seasonality in Statsmodels,,CC BY-SA 3.0,"

I'd like to have a seasonal ARIMA model implemented with Statsmodels' ARIMA. Specifically, I'd like to take logs before differencing at the weekly seasonality, and then be able to make forecasts.

+ +

Perhaps an example with ARIMA's from_formula method could accomplish this. I'd also love to be able to do this with patsy.

+ +

Here's my sample code for taking logs, differencing at the weekly seasonality, and then transforming back to compare with the original time series (I've also skipped checking the validity of the model through testing stationarity and the residuals):

+ +
import pandas as pd
+import numpy as np
+from statsmodels.tsa.arima_model import ARIMA
+
+# ts is a time series
+logged_ts = np.log(ts)
+# Differencing by the week forces us to drop the first 7 values.
+diffed_logged_ts = (logged_ts - logged_ts.shift(7))[7:]
+
+p = 0
+d = 1
+q = 1
+
+arima = ARIMA(diffed_logged_ts.values, [p, d, q], exog=None, dates=diffed_logged_ts.index, freq='D', missing='none')
+diffed_logged_results = arima.fit(trend='c', disp=False)
+predicted_diffed_logged = diffed_logged_results.predict(exog=None, dynamic=False)
+predicted_diffed_logged_ts = pd.Series(predicted_diffed_logged, index=diffed_logged_ts.index[d:])
+predicted_diffed_logged_ts = np.exp(logged_ts.shift(7) + diffed_logged_ts.shift(d) + predicted_diffed_logged_ts)
+
+concatenated = pd.concat([ts, predicted_diffed_logged_ts], axis=1, keys=['original', 'predicted'])
+print concatenated[-7:]
+
+ +

What do you think of this approach? I hope there's a less error-prone way coming in a future version of Statsmodels. Could someone tag this question with ""statsmodels""? Thanks!

+",2013-10-09 19:09:36.590 +57161,20739.0,1,57197.0,,,Performing binary logistic regression with equal number of cases and non-cases,,CC BY-SA 3.0,"

The best way to ask my question is to present an example scenario:

+ +

Let's say that the outcome of interest is lung cancer (1 = lung cancer; 0 = no lung cancer) and a researcher has 200k records (where 20k patients have lung cancer (cases) and 180k patients do NOT have lung cancer (non-cases)). Since only 10% of patients (20/200k) in the sample data have lung cancer, a researcher uses a random sample of 20k from the patients that do NOT have lung cancer. By doing so, the researcher would have a sample of 20k patients with lung cancer and 20k patients without lung cancer in their sample (the sample is reduced from 200k to 40k records).

+ +

Are there any benefits to performing binary logistic regression with equal number of cases and non-cases when the actual distribution of the outcome is not equal? Or does this bias model estimates/predictive power?

+ +

Thanks in advance!

+",2013-10-09 19:25:15.053 +57164,19264.0,1,57177.0,,,Gamma vs. lognormal distributions,,CC BY-SA 3.0,"

I have an experimentally observed distribution that looks very similar to a gamma or lognormal distribution. I've read that the lognormal distribution is the maximum entropy probability distribution for a random variate $X$ for which the mean and variance of $\ln(X)$ are fixed. Does the gamma distribution have any similar properties?

+",2013-10-09 19:51:40.443 +57167,21952.0,1,,,,simultaneous equations,,CC BY-SA 3.0,"

I have the following relationships

+ +

logY ~ logX1 + logX2 + logX3 + logX4 + logX5

+ +

and

+ +

X1 ~ Z1 + Z2 + Z3 + Z4 + Z5

+ +

X2 ~ Z1 + Z2 + Z3 + Z4 + Z5

+ +

X3 ~ Z1 + Z2 + Z3 + Z4 + Z5

+ +

where Y and Z1, Z2, Z3, Z4, Z5 are endogenous (say, while the Z's play a role in determining Y, the values of the Z's are fixed depending upon the values of Y - kind of like advertising expense has an impact on sales revenue, but at the same time managers set the advertising expense based on the expected sales revenue). So all the variables are changing simultaneously. Can anyone help me with how I can estimate this relationship? I also have instruments for each of the Z's (lagged variables have been treated as instruments, and I have the previous year's data for the problem as well). Thank you for all your help and suggestions.

+",2013-10-09 20:44:49.813 +57175,20320.0,1,57193.0,,,On how to formulate and apply maximum likelihood,,CC BY-SA 3.0,"

I have just delved into the basics of maximum likelihood estimation and expectation maximization. The latter is really difficult to follow and I am having a tough time figuring out how I can apply the EM method for parameter estimation.

+",2013-10-09 22:19:42.207 +57183,22541.0,1,,,,Which test should I use to assess for the statistical signficance of changes in multiple binomial dependent variables from T1 to T2?,,CC BY-SA 3.0,"

Students have completed a test containing 20 questions at both T1 and T2, with an intervention in the interval. Scores for each question are either 0 (incorrect) or 1 (correct). I am interested in knowing whether the improvement in students' scores was significantly greater for some questions than for others. I am thinking that this may involve an extension of the McNemar test, but open to all suggestions. Thanks!

+",2013-10-09 23:52:28.193 +543,182.0,2,,541.0,,,,CC BY-SA 3.0,"

As an economist, the analysis of variance (ANOVA) is taught and usually understood in relation to linear regression (e.g. in Arthur Goldberger's A Course in Econometrics). Economists/Econometricians typically view ANOVA as uninteresting and prefer to move straight to regression models. From the perspective of linear (or even generalised linear) models, ANOVA assigns coefficients into batches, with each batch corresponding to a ""source of variation"" in ANOVA terminology.

+ +

Generally you can replicate the inferences you would obtain from ANOVA using regression but not always OLS regression. Multilevel models are needed for analysing hierarchical data structures such as ""split-plot designs,"" where between-group effects are compared to group-level errors, and within-group effects are compared to data-level errors. Gelman's paper [1] goes into great detail about this problem and effectively argues that ANOVA is an important statistical tool that should still be taught for its own sake.

+ +

In particular Gelman argues that ANOVA is a way of understanding and structuring multilevel models. Therefore ANOVA is not an alternative to regression but a tool for summarizing complex high-dimensional inferences and for exploratory data analysis.

+ +

Gelman is a well-respected statistician and some credence should be given to his view. However, almost all of the empirical work that I do would be equally well served by linear regression and so I firmly fall into the camp of viewing it as a little bit pointless. Some disciplines with complex study designs (e.g. psychology) may find ANOVA useful.

+ +

[1] Gelman, A. (2005). Analysis of variance: why it is more important than ever (with discussion). Annals of Statistics 33, 1–53. doi:10.1214/009053604000001048

+",2010-07-23 15:35:55.653 +1787,668.0,2,,1760.0,,,,CC BY-SA 2.5,"

In effect you are thinking of a model in which the true chance of rain, p, is a function of the predicted chance q: p = p(q). Each time a prediction is made, you observe one realization of a Bernoulli variate having probability p(q) of success. This is a classic logistic regression setup if you are willing to model the true chance as a linear combination of basis functions f1, f2, ..., fk; that is, the model says

+ +
+

Logit(p) = b0 + b1 f1(q) + b2 f2(q) + ... + bk fk(q) + e

+
+ +

with iid errors e. If you're agnostic about the form of the relationship (although if the weatherman is any good p(q) - q should be reasonably small), consider using a set of splines for the basis. The output, as usual, consists of estimates of the coefficients and an estimate of the variance of e. Given any future prediction q, just plug the value into the model with the estimated coefficients to obtain an answer to your question (and use the variance of e to construct a prediction interval around that answer if you like).
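As a concrete sketch of that setup in R (leaving aside the extra error term), with rain a 0/1 outcome, q the stated forecast probability, and d a hypothetical data frame:

library(splines)

# d: a hypothetical data frame with rain (0/1 outcome) and q (stated forecast probability)
fit <- glm(rain ~ ns(q, df = 4), family = binomial, data = d)

# Estimated true chance of rain when the forecast says, e.g., 30%
predict(fit, newdata = data.frame(q = 0.30), type = ""response"")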

+ +

This framework is flexible enough to include other factors, such as the possibility of changes in the quality of predictions over time. It also lets you test hypotheses, such as whether p = q (which is what the weatherman implicitly claims).

+",2010-08-19 13:21:56.153 +2169,674.0,2,,2156.0,,,,CC BY-SA 2.5,"

It may be worth looking at M.W. Berry's books:

+ +
  1. Survey of Text Mining I: Clustering, Classification, and Retrieval (2003)
  2. Survey of Text Mining II: Clustering, Classification, and Retrieval (2008)
+ +

They consist of a series of applied and review papers. The latest seems to be available as a PDF at the following address: http://bit.ly/deNeiy.

+ +

Here are a few links related to CA as applied to text mining:

+ + + +

You can also look at Latent Semantic Analysis, but see my response there: Working through a clustering problem.

+",2010-09-02 10:25:32.673 +3188,450.0,2,,143.0,,,,CC BY-SA 4.0,"

#Edit: As @Hunaphu points out (and @whuber below in his answer), the original answer I gave to the OP (below) is wrong. It is indeed quicker to first sort the initial batch and then keep updating the median up or down (depending on whether a new data point falls to the left or to the right of the current median).

+
+

It's bad form to sort an array to compute a median. Medians (and other quantiles) are typically computed using the quickselect algorithm, with $O(n)$ complexity.
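For instance, in R a single quantile can be obtained through a partial sort rather than a full sort; a small, untuned sketch:

fast_median <- function(x) {
  n <- length(x)
  if (n %% 2 == 1) {
    sort(x, partial = (n + 1) / 2)[(n + 1) / 2]
  } else {
    mean(sort(x, partial = c(n / 2, n / 2 + 1))[c(n / 2, n / 2 + 1)])
  }
}
fast_median(rnorm(1e6))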

+

You may also want to look at my answer to a recent related question here.

+",2010-10-09 19:02:09.717 +3649,674.0,2,,3646.0,,,,CC BY-SA 2.5,"

I found that Spearman correlation is mostly used in place of the usual linear correlation when working with integer-valued scores on a measurement scale, when it has a moderate number of possible scores, or when we don't want to rely on assumptions about the bivariate relationships. Compared to the Pearson coefficient, the interpretation of Kendall's tau seems to me less direct than that of Spearman's rho, in the sense that it quantifies the difference between the % of concordant and discordant pairs among all possible pairwise events. In my understanding, Kendall's tau more closely resembles the Goodman-Kruskal Gamma.

+ +

I just browsed an article from Larry Winner in the J. Statistics Educ. (2006) which discusses the use of both measures, NASCAR Winston Cup Race Results for 1975-2003.

+ +

I also found @onestop's answer about Pearson's or Spearman's correlation with non-normal data interesting in this respect.

+ +

Of note, Kendall's tau (the a version) has connection to Somers' D (and Harrell's C) used for predictive modelling (see e.g., Interpretation of Somers’ D under four simple models by RB Newson and reference 6 therein, and articles by Newson published in the Stata Journal 2006). An overview of rank-sum tests is provided in Efficient Calculation of Jackknife Confidence Intervals for Rank Statistics, that was published in the JSS (2006).

+",2010-10-24 14:26:35.747 +4714,60.0,2,,4705.0,,,,CC BY-SA 3.0,"

Reverend Thomas Bayes for discovering Bayes' theorem

+",2010-12-04 03:46:20.583 +5020,1411.0,2,,5015.0,,,,CC BY-SA 2.5,"

I think this one is tricky; as you hint, there's 'moral hazard' here: if you hadn't looked at the interaction at all, you'd be free and clear, but now that you have there is a suspicion of data-dredging if you drop it.

+ +

The key is probably a change in the meaning of your effects when you go from the main-effects-only to the interaction model. What you get for the 'main effects' depends very much on how your treatments and contrasts are coded. In R, the default is treatment contrasts with the first factor levels (the ones with the first names in alphabetical order unless you have gone out of your way to code them differently) as the baseline levels.

+ +

Say (for simplicity) that you have two levels, 'control' and 'trt', for each factor. Without the interaction, the meaning of the 'v1.trt' parameter (assuming treatment contrasts as is the default in R) is ""average difference between 'v1.control' and 'v1.trt' group""; the meaning of the 'v2.trt' parameter is ""average difference between 'v2.control' and 'v2.trt'"".

+ +

With the interaction, 'v1.trt' is the average difference between 'v1.control' and 'v1.trt' in the 'v2.control' group, and similarly 'v2.trt' is the average difference between v2 groups in the 'v1.control' group. Thus, if you have fairly small treatment effects in each of the control groups, but a large effect in the treatment groups, you could easily see what you're seeing.

+ +

The only way I can see this happening without a significant interaction term, however, is if all the effects are fairly weak (so that what you really mean by ""the effect disappeared"" is that you went from p=0.04 to p=0.06, across the magic significance line).

+ +

Another possibility is that you are 'using up too many degrees of freedom' -- that is, the parameter estimates don't actually change that much, but the residual error term is sufficiently inflated by having to estimate another 4 [ = (2-1)*(5-1)] parameters that your significant terms become non-significant. Again, I would only expect this with a small data set/relatively weak effects.

+ +

One possible solution is to move to sum contrasts, although this is also delicate -- you have to be convinced that 'average effect' is meaningful in your case. The very best thing is to plot your data and to look at the coefficients and understand what's happening in terms of the estimated parameters.

+ +

Hope that helps.

+",2010-12-14 02:25:20.237 +8699,155.0,2,,8681.0,,,,CC BY-SA 3.0,"

You may wish to read answers to this existing question on freely available datasets.

+ +

In general, I imagine that you'd want a dataset with some interesting metric variables. +In psychology research methods classes that I've taught, we've often looked at datasets with intelligence or personality test scores.

+ +

If you want a personality example, I have some personality data and metadata on github based on the IPIP, a public domain measure of the Big 5 factors of personality.

+ + +",2011-04-14 10:55:13.367 +9529,192.0,2,,9524.0,,,,CC BY-SA 3.0,"

Pi

+",2011-05-07 14:19:42.887 +10069,2666.0,2,,10008.0,,,,CC BY-SA 3.0,"

In my experience, not only is it necessary to have all lower order effects in the model when they are connected to higher order effects, but it is also important to properly model (e.g., allowing them to be nonlinear) main effects that are seemingly unrelated to the factors in the interactions of interest. That's because interactions between $x_1$ and $x_2$ can be stand-ins for main effects of $x_3$ and $x_4$. Interactions sometimes seem to be needed because they are collinear with omitted variables or omitted nonlinear (e.g., spline) terms.

+",2011-05-21 12:31:20.447 +13060,1805.0,2,,13058.0,,,,CC BY-SA 3.0,"

Check out the digitize package for R. It's designed to solve exactly this sort of problem.

+",2011-08-18 05:14:07.900 +14790,2081.0,2,,14729.0,,,,CC BY-SA 3.0,"

You seem to ask a really provoking question: how to detect, given a singular correlation (or covariance, or sum-of-squares-and-cross-product) matrix, which column is linearly dependent on which. I tentatively suppose that the sweep operation could help. Here is my probe in SPSS (not R) to illustrate.

+ +

Let's generate some data:

+ +
        v1        v2        v3         v4          v5
+    -1.64454    .35119   -.06384    -1.05188     .25192
+    -1.78520   -.21598   1.20315      .40267    1.14790
+     1.36357   -.96107   -.46651      .92889   -1.38072
+     -.31455   -.74937   1.17505     1.27623   -1.04640
+     -.31795    .85860    .10061      .00145     .39644
+     -.97010    .19129   2.43890     -.83642    -.13250
+     -.66439    .29267   1.20405      .90068   -1.78066
+      .87025   -.89018   -.99386    -1.80001     .42768
+    -1.96219   -.27535    .58754      .34556     .12587
+    -1.03638   -.24645   -.11083      .07013    -.84446
+
+ +

Let's create some linear dependancy between V2, V4 and V5:

+ +
compute V4 = .4*V2+1.2*V5.
+execute.
+
+ +

So, we modified our column V4.

+ +
matrix.
+get X. /*take the data*/
+compute M = sscp(X). /*SSCP matrix, X'X; it is singular*/
+print rank(M). /*with rank 5-1=4, because there's 1 group of interdependent columns*/
+loop i= 1 to 5. /*Start iterative sweep operation on M from column 1 to column 5*/
+-compute M = sweep(M,i).
+-print M. /*That's printout we want to trace*/
+end loop.
+end matrix.
+
+ +

The printouts of M in 5 iterations:

+ +
M
+     .06660028    -.12645565    -.54275426    -.19692972    -.12195621
+     .12645565    3.20350385    -.08946808    2.84946215    1.30671718
+     .54275426    -.08946808    7.38023317   -3.51467361   -2.89907198
+     .19692972    2.84946215   -3.51467361   13.88671851   10.62244471
+     .12195621    1.30671718   -2.89907198   10.62244471    8.41646486
+
+M
+     .07159201     .03947417    -.54628594    -.08444957    -.07037464
+     .03947417     .31215820    -.02792819     .88948298     .40790248
+     .54628594     .02792819    7.37773449   -3.43509328   -2.86257773
+     .08444957    -.88948298   -3.43509328   11.35217042    9.46014202
+     .07037464    -.40790248   -2.86257773    9.46014202    7.88345168
+
+M
+    .112041875    .041542117    .074045215   -.338801789   -.282334825
+    .041542117    .312263922    .003785470    .876479537    .397066281
+    .074045215    .003785470    .135542964   -.465602725   -.388002270
+    .338801789   -.876479537    .465602725   9.752781632   8.127318027
+    .282334825   -.397066281    .388002270   8.127318027   6.772765022
+
+M
+   .1238115070   .0110941027   .0902197842   .0347389906   .0000000000
+   .0110941027   .3910328733  -.0380581058  -.0898696977  -.3333333333
+   .0902197842  -.0380581058   .1577710733   .0477405054   .0000000000
+   .0347389906  -.0898696977   .0477405054   .1025348498   .8333333333
+   .0000000000   .3333333333   .0000000000  -.8333333333   .0000000000
+
+M
+   .1238115070   .0110941027   .0902197842   .0347389906   .0000000000
+   .0110941027   .3910328733  -.0380581058  -.0898696977   .0000000000
+   .0902197842  -.0380581058   .1577710733   .0477405054   .0000000000
+   .0347389906  -.0898696977   .0477405054   .1025348498   .0000000000
+   .0000000000   .0000000000   .0000000000   .0000000000   .0000000000
+
+ +

Notice that eventually column 5 got full of zeros. This means (as I understand it) that V5 is linearly tied with some of the preceding columns. Which columns? Look at the iteration where column 5 is last not full of zeroes - iteration 4. We see there that V5 is tied with V2 and V4 with coefficients -.3333 and .8333: V5 = -.3333*V2 + .8333*V4, which corresponds to what we've done with the data: V4 = .4*V2 + 1.2*V5.

+ +

That's how we know which column is linearly tied with which others. I didn't check how helpful the above approach is in the more general case with many groups of interdependencies in the data. In the above example it appeared helpful, though.
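If anyone wants to reproduce this outside SPSS, a rough R analogue (different mechanics - QR decomposition with column pivoting rather than sweep - but the same question) is sketched below, re-creating the toy dependency V4 = .4*V2 + 1.2*V5; in this example the flagged column should again be V5, expressed through V2 and V4 as in the sweep output above:

set.seed(1)
X <- matrix(rnorm(50), nrow = 10, ncol = 5)
X[, 4] <- 0.4 * X[, 2] + 1.2 * X[, 5]      # same kind of dependency as in the SPSS example

qx <- qr(X)
qx$rank                                     # 4, i.e. one dependency among the 5 columns
indep <- qx$pivot[seq_len(qx$rank)]         # columns treated as independent
dep   <- qx$pivot[-seq_len(qx$rank)]        # the flagged, dependent column(s)

coef(lm(X[, dep] ~ X[, indep] - 1))         # expresses the flagged column exactly through the others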

+",2011-10-03 09:34:57.197 +16212,1927.0,2,,16209.0,,,,CC BY-SA 3.0,"

Here is a possibility, very similar to that of @Roman Lustrik, but just a little bit more automatic.

+ +

Say that

+ +
+

x <- c(""a"", ""b"", ""b"", ""c"")

+
+ +

Then

+ +
   > x <- as.factor(x)
+   > levels(x) <- 1:length(levels(x))
+   > x <- as.numeric(x)
+
+ +

does the job:

+ +
   > print(x)
+   [1] 1 2 2 3
+
+",2011-11-06 11:12:09.377 +16337,2081.0,2,,16313.0,,,,CC BY-SA 4.0,"

Spearman rho vs Kendall tau. These two are so much computationally different that you cannot directly compare their magnitudes. Spearman is usually higher by 1/4 to 1/3 and this makes one incorrectly conclude that Spearman is "better" for a particular dataset. The difference between rho and tau is in their ideology, proportion-of-variance for rho and probability for tau. Rho is a usual Pearson r applied for ranked data, and like r, is more sensitive to points with large moments (that is, deviations from cloud centre) than to points with small moments. Therefore rho is quite sensitive to the shape of the cloud after the ranking done: the coefficient for an oblong rhombic cloud will be higher than the coefficient for an oblong dumbbelled cloud (because sharp edges of the first are large moments). Tau is an extension of Gamma and is equally sensitive to all the data points, so it is less sensitive to peculiarities in shape of the ranked cloud. Tau is more "general" than rho, for rho is warranted only when you believe the underlying (model, or functional in population) relationship between the variables is strictly monotonic. While Tau allows for nonmonotonic underlying curve and measures which monotonic "trend", positive or negative, prevails there overall. Rho is comparable with r in magnitude; tau is not.

+

Kendall tau as Gamma. Tau is just a standardized form of Gamma. Several related measures all have numerator $P-Q$ but differ in normalizing denominator:

+
  • Gamma: $P+Q$
  • Somers' D("x dependent"): $P+Q+T_x$
  • Somers' D("y dependent"): $P+Q+T_y$
  • Somers' D("symmetric"): arithmetic mean of the above two
  • Kendall's Tau-b corr. (most suitable for square tables): geometric mean of those two
  • Kendall's Tau-c corr.$^1$ (most suitable for rectangular tables): $N^2(k-1)/(2k)$
  • Kendall's Tau-a corr.$^2$ (makes no adjustment for ties): $N(N-1)/2 = P+Q+T_x+T_y+T_{xy}$
+

where $P$ - number of pairs of observations with "concordance", $Q$ - with "inversion"; $T_x$ - number of ties by variable X, $T_y$ - by variable Y, $T_{xy}$ – by both variables; $N$ - number of observations, $k$ - number of distinct values in that variable where this number is less.

+

Thus, tau is directly comparable in theory and magnitude with Gamma. Rho is directly comparable in theory and magnitude with Pearson $r$. Nick Stauner's nice answer here tells how it is possible to compare rho and tau indirectly.

+

See also about tau and rho.

+
+

$^1$ Tau-c of a variable with itself can be below $1$: specifically, when the distribution of $k$ distinct values is unbalanced.

+

$^2$ Tau-a of a variable with itself can be below $1$: specifically, when there are ties.

+",2011-11-09 11:41:29.170 +48133,594.0,2,,48103.0,,,,CC BY-SA 3.0,"

If you just want a good estimate of $\omega$ and don't care much about +its standard error:

+ +
ssp <- spectrum(y)  
+per <- 1/ssp$freq[ssp$spec==max(ssp$spec)]
+reslm <- lm(y ~ sin(2*pi/per*t)+cos(2*pi/per*t))
+summary(reslm)
+
+rg <- diff(range(y))
+plot(y~t,ylim=c(min(y)-0.1*rg,max(y)+0.1*rg))
+lines(fitted(reslm)~t,col=4,lty=2)   # dashed blue line is sin fit
+
+# including 2nd harmonic really improves the fit
+reslm2 <- lm(y ~ sin(2*pi/per*t)+cos(2*pi/per*t)+sin(4*pi/per*t)+cos(4*pi/per*t))
+summary(reslm2)
+lines(fitted(reslm2)~t,col=3)    # solid green line is periodic with second harmonic
+
+ +

+ +

(A better fit still would perhaps account for the outliers in that series in some way, reducing their influence.)

+ +

---

+ +

If you want some idea of the uncertainty in $\omega$, you could use profile likelihood (pdf1, pdf2 - references on getting approximate CIs or SEs from profile likelihood or its variants aren't hard to locate)

+ +

(Alternatively, you could feed these estimates into nls ... and start it already converged.)

+",2013-06-06 08:03:29.833 +16537,1831.0,2,,15542.0,,,,CC BY-SA 3.0,"

Deep learning has received a lot of attention since 2006. It's basically an approach to training deep neural networks and is leading to really impressive results on very hard datasets (like document clustering or object recognition). Some people are talking about a second neural network renaissance (e.g. in this Google talk by Schmidhuber).

+ +

If you want to be impressed you should look at this Science paper Reducing the Dimensionality of Data with Neural Networks, Hinton & Salakhutdinov.

+ +

(There is so much work going on right now in that area that there are only two upcoming books I know of that will treat it: Large Scale Machine Learning by Langford et al. and Machine Learning: A Probabilistic Perspective by Kevin Murphy.)

+ +

If you want to know more, check out what the main deep learning groups are doing: Stanford, Montreal and most importantly Toronto #1 and Toronto #2.

+",2011-11-14 20:26:28.373 +17000,65.0,2,,16998.0,,,,CC BY-SA 3.0,"

Wouldn't the WikiLeaks texts suit you?

+",2011-11-24 21:48:24.163 +18345,5448.0,2,,18335.0,,,,CC BY-SA 3.0,"

The LR (likelihood ratio) test actually is testing the hypothesis that a specified subset of the parameters equal some pre-specified values. In the case of model selection, generally (but not always) that means some of the parameters equal zero. If the models are nested, the parameters in the larger model that are not in the smaller model are the ones being tested, with values specified implicitly by their exclusion from the smaller model. If the models aren't nested, you aren't testing this any more, because BOTH models have parameters that aren't in the other model, so the LR test statistic doesn't have the asymptotic $\chi^2$ distribution that it (usually) does in the nested case.

+ +

AIC, on the other hand, is not used for formal testing. It is used for informal comparisons of models with differing numbers of parameters. The penalty term in the expression for AIC is what allows this comparison. But no assumptions are made about the functional form of the asymptotic distribution of the differences between the AIC of two non-nested models when doing the model comparison, and the difference between two AICs is not treated as a test statistic.

+ +

I'll add that there is some disagreement over the use of AIC with non-nested models, as the theory is worked out for nested models. Hence my emphasis on ""not...formal"" and ""not...test statistic."" I use it for non-nested models, but not in a hard-and-fast way, more as an important, but not the sole, input into the model building process.

+",2012-01-01 17:41:56.327 +20240,1073.0,2,,20234.0,carlosdc,,,CC BY-SA 3.0,"

I think you're mixing multiple important concepts. Let me try to clarify a couple of things:

+ +
  • There are metaheuristic methods, which are methods that iteratively try to improve a candidate solution. Examples of this are tabu search, simulated annealing, genetic algorithms, etc. Observe that while there can be many cases where these methods work nicely, there isn't any deep understanding of when these methods work and when they don't. And more importantly when they don't get to the solution, we can be arbitrarily far from it. Problems solved by metaheuristic methods tend to be discrete in nature, because there are far better tools to handle continuous problems. But every now and then you see metaheuristics for continuous problems, too.

  • There are numerical optimization methods, people in this community carefully examine the nature of the function that is to be optimized and the restrictions of the solution (into groups like convex optimization, quadratic programming, linear programming, etc) and apply algorithms that have been shown to work for that type of function, and those type of restrictions. When people in this area say ""shown to work"" they mean a proof. The situation is that these types of methods work in continuous problems. But when your problem falls in this category, this is definitely the tool to use.

  • There are discrete optimization methods, which tend to be things that in nature are connected to algorithms to well studied discrete problems: such as shortest paths, max flow, etc. People in this area also care that their algorithms really work (proofs). There are a subset of people in this group that study really hard problems for which no fast algorithm is expected to exist. They then study approximation algorithms, which are fast algorithms for which they are able to show that their solution is within a constant factor of the true optimum. This is called ""approximation algorithms"". These people also show their results as proofs.

+ +

So... to answer your question, I do not think that metaheuristics are approximation algorithms. It doesn't seem to me to be a matter of opinion; it is just fact.

+",2012-02-10 21:45:51.503 +24602,3662.0,2,,24506.0,,,,CC BY-SA 3.0,"

If you're using linear regression I would recommend the rms package in R. It is very easy to use and has lots of nice features.

+ +

Here's an example:

+ +
# Load package (remember to install.packages(""rms"") or this will fail the first time)
+library(rms)
+
+# Get a dataset to experiment with
+data(mtcars)
+mtcars$am <- factor(mtcars$am, levels=0:1, labels=c(""Automatic"", ""Manual""))
+
+# The rms package needs this to work properly
+dd <- datadist(mtcars)
+options(datadist=""dd"")
+
+# Do the regression
+f <- ols(mpg~wt, data=mtcars, x=T, y=T)
+
+# Plot regular mean confidence interval
+p <- Predict(f, wt=seq(2.5, 4, by=.001), conf.type=""mean"")
+plot(p, ylim=c(10, 30), col=""lightblue"")
+
+# Plot wide confidence interval
+p <- Predict(f, wt=seq(2.5, 4, by=.001), conf.type=""individual"")
+plot(p, ylim=c(10, 30), col=""lightblue"")
+
+ +

Gives this output:

+ +

+ +

Now usually you want to test the linearity assumption:

+ +
# Try the model with a restricted cubic spline
+f <- ols(mpg~rcs(wt, 3), data=mtcars, x=T, y=T)
+anova(f)
+
+ +

Gives this output:

+ +
> anova(f)
+                Analysis of Variance          Response: mpg 
+
+ Factor     d.f. Partial SS MS         F     P     
+ wt          2   922.04230  461.021149 65.54 <.0001
+  Nonlinear  1    74.31705   74.317047 10.56 0.0029
+ REGRESSION  2   922.04230  461.021149 65.54 <.0001
+ ERROR      29   204.00489    7.034651             
+
+ +

And if you plot the graphs with the same code as above you get this picture:

+ +

+ +

If you want to make your formula more complicated just add that variable:

+ +
f <- ols(mpg~rcs(wt, 3)+am, data=mtcars, x=T, y=T)
+p <- Predict(f, wt=seq(2.5, 4, by=.001), am=levels(mtcars$am), conf.type=""individual"")
+plot(p)
+
+ +

I don't know anything about JMP; doing this there shouldn't be too difficult, but I recommend learning R because it gives you incredible freedom.

+ +

Hope this helped.

+",2012-05-06 10:01:28.687 +25087,8363.0,2,,25072.0,,,,CC BY-SA 3.0,"

I will just address question 2. I have some doubts about how well the author knows his subject if he really said it the way you have presented it. PCA is applied to the sample just like EFA and CFA. It simply takes a list of n possibly related factors, looks at how the points (samples) scatter in n-dimensional space, and then gets the first principal component as the linear combination that explains more of the variability in the data than any other linear combination. The second then looks among directions orthogonal to the first to find the one that explains the most of the remaining variability, and so on with the 3rd and 4th. So sometimes one can take just 1-3 components to describe most of the variation in the data. That is why factor analysis and principal component analysis are described according to 1 and 2 in your statement.
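
+ +

As a quick illustration, here is a small R sketch using prcomp on a built-in dataset (my own example); the variables are standardized first, and the summary shows how much variance each successive component explains.

+ +
# PCA on the numeric mtcars variables, standardized so no single variable dominates
+data(mtcars)
+pca <- prcomp(mtcars, scale. = TRUE)
+
+# Proportion of variance explained by each component;
+# often the first 1-3 components capture most of the variability
+summary(pca)$importance
+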

+",2012-05-14 14:04:26.687 +26657,190.0,2,,26070.0,,,,CC BY-SA 3.0,"

Here is a simple toy example illustrating the effect of dimension in a discrimination problem, e.g. the problem you face when you want to say whether something is observed or only a random effect is observed (this problem is a classic in science).

+ +

Heuristic. The key issue here is that the Euclidean norm gives the same importance to every direction. This constitutes a lack of prior, and as you certainly know, in high dimension there is no free lunch (i.e. if you have no prior idea of what you are searching for, then there is no reason why some noise would not look like what you are searching for; this is a tautology ...).

+ +

I would say that for any problem there is a limit on the amount of information that is necessary to find something other than noise. This limit is somehow related to the ""size"" of the area you are trying to explore relative to the ""noise"" level (i.e. the level of uninformative content).

+ +

In high dimension, if you have the prior that your signal is sparse, then you can remove (i.e. penalize) non-sparse vectors with a metric that fills the space with sparse vectors, or by using a thresholding technique.

+ +

Framework. Assume that $\xi$ is a Gaussian vector with mean $\nu$ and diagonal covariance $\sigma^2 Id$ ($\sigma$ is known) and that you want to test the simple hypothesis

+ +

$$H_0: \;\nu=0\; \text{ vs. } \; H_{\theta}: \; \nu=\theta $$ +(for a given $\theta\in \mathbb{R}^n$); $\theta$ is not necessarily known in advance.

+ +

Test statistic with energy. The intuition you certainly have is that it is a good idea to evaluate the norm/energy $\mathcal{E}_n=\frac{1}{n}\sum_{i=1}^n\xi_i^2$ of your observation $\xi$ to build a test statistic. Actually you can construct a standardized, centered (under $H_0$) version of the energy, $T_n=\frac{\sum_i\xi_i^2-n\sigma^2}{\sqrt{2n\sigma^4}}$. That gives a critical region at level $\alpha$ of the form $\{T_n\geq v_{1-\alpha}\}$ for a well-chosen $v_{1-\alpha}$.

+ +

Power of the test and dimension. In this case it is an easy probability exercise to show the following formula for the power of your test:

+ +
+

$$P_{\theta}(T\leq v_{1-\alpha})=P\left (Z\leq \frac{v_{1-\alpha}}{\sqrt{1+2\|\theta\|_2^2/(n\sigma^2)}}-\frac{\|\theta\|^2_2}{\sqrt{2n\sigma^4+2\sigma^2\|\theta\|_2^2/(n\sigma^2)}}\right )$$ + with $Z$ a sum of $n$ iid random variables with $\mathbb{E}[Z]=0$ and $Var(Z)=1$.

+
+ +

This means that the power of your test is increased by the energy of your signal $\|\theta\|^2_2$ and decreased by $n$. Practically speaking, when you increase the size $n$ of your problem without increasing the strength of the signal at the same time, you are adding uninformative information to your observation (or you are reducing the proportion of useful information in the information you have): this is like adding noise and reduces the power of the test (i.e. it is more likely that you will say nothing is observed while there is actually something).
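
+ +

Here is a small Monte Carlo sketch of this phenomenon in R (my own illustration, not part of the argument above): the signal energy $\|\theta\|_2^2$ is held fixed while the dimension $n$ grows, and the estimated power of the energy test drops toward the level $\alpha$.

+ +
# Monte Carlo power of the energy test at level alpha = 0.05,
+# with a fixed signal energy spread over a growing dimension n
+power_energy <- function(n, energy = 25, sigma = 1, nsim = 2000, alpha = 0.05) {
+  theta <- rep(sqrt(energy / n), n)             # so that ||theta||_2^2 = energy
+  crit  <- qnorm(1 - alpha)                     # normal approximation to the null
+  stat  <- function(x) (sum(x^2) - n * sigma^2) / sqrt(2 * n * sigma^4)
+  mean(replicate(nsim, stat(rnorm(n, mean = theta, sd = sigma)) > crit))
+}
+sapply(c(10, 100, 1000, 10000), power_energy)   # power decreases as n grows
+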

+ +

Toward a test with a threshold statistic. If you do not have much energy in your signal but you know a linear transformation that concentrates this energy in a small part of your signal, then you can build a test statistic that only evaluates the energy in that small part. If you know in advance where it is concentrated (for example, you know there cannot be high frequencies in your signal) then you can obtain the power of the preceding test with $n$ replaced by a small number and $\|\theta\|^2_2$ almost the same... If you do not know it in advance, you have to estimate it; this leads to well-known thresholding tests.

+ +

Note that this argument is exactly at the root of many papers, such as

+ +
    +
  • A Antoniadis, F Abramovich, T Sapatinas, and B Vidakovic. Wavelet methods for testing +in functional analysis of variance models. International Journal on Wavelets and its +applications, 93 :1007–1021, 2004.
  • +
  • M. V. Burnashef and Begmatov. On a problem of signal detection leading to stable distribution. Theory of probability and its applications, 35(3) :556–560, 1990.
  • +
  • Y. Baraud. Non asymptotic minimax rate of testing in signal detection. Bernoulli, 8 :577–606, 2002.
  • +
  • J Fan. Test of significance based on wavelet thresholding and neyman’s truncation. JASA, +91 :674–688, 1996.
  • +
  • J. Fan and S-K Lin. Test of significance when data are curves. JASA, 93 :1007–1021, 1998.
  • +
  • V. Spokoiny. Adaptative hypothesis testing using wavelets. Annals of Statistics, 24(6) :2477–2498, december 1996.
  • +
+",2012-06-12 09:23:27.760 +69414,7189.0,2,,58876.0,,,,CC BY-SA 3.0,"

I think what people mean by support recovery is very simple. If the observed data is generated using a model of the form $$y=Xw^*+\epsilon$$ +$y\in\mathbb{R}^n, X\in \mathbb{R}^{n\times p},\epsilon\in\mathbb{R}^n$ is noise (assumed to be subGaussian), where $w^*\in\mathbb{R}^p$ is known to be sparse, under what conditions will the solution of the lasso problem

+ +

$$\hat{w}=\arg\min\limits_w \frac{1}{2n}||y-Xw||_2^2+\lambda_n||w||_1$$

+ +

have the same indices that are non-zero as the true solution $w^*$? i.e. support($\hat{w}$)=support($w^*$)
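
+ +

As a rough illustration, here is a hedged R sketch assuming the glmnet package; whether the support is actually recovered depends on the design matrix, the noise level and the choice of $\lambda_n$.

+ +
# Sketch: does the lasso recover the support of a sparse w*? (requires glmnet)
+library(glmnet)
+set.seed(1)
+n <- 200; p <- 50
+X <- matrix(rnorm(n * p), n, p)
+w_star <- c(3, -2, 1.5, rep(0, p - 3))          # true support = {1, 2, 3}
+y <- drop(X %*% w_star + rnorm(n))
+
+fit   <- cv.glmnet(X, y)                        # lambda chosen by cross-validation
+w_hat <- coef(fit, s = ""lambda.1se"")[-1]        # drop the intercept
+which(w_hat != 0)                               # compare with which(w_star != 0)
+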

+",2014-03-17 19:14:24.097 +27132,668.0,2,,27120.0,,,,CC BY-SA 3.0,"

The conceptual uses of ""square"" and ""squared"" are subtly different, although (almost) interchangeable:

+ +
    +
  • ""Squared"" refers to the past action of taking or computing the second power. E.g., $x^2$ is usually read as ""x-squared,"" not ""x-square."" (The latter is sometimes encountered but I suspect it results from speakers who are accustomed to clipping their phrases or who just haven't heard the terminal dental in ""x-squared."")

  • +
  • ""Square"" refers to the result of taking the second power. E.g., $x^2$ can be referred to as the ""square of x."" (The illocution ""squared of x"" is never used.)

  • +
+ +

These suggest that a person using a phrase like ""mean squared error"" is thinking in terms of a computation: take the errors, square them, average those. The phrase ""mean square error"" has a more conceptual feel to it: average the square errors. The user of this phrase may be thinking in terms of square errors rather than the errors themselves. I believe this shows up especially in theoretical literature where the second form, ""square,"" appears more often (I believe: I haven't systematically checked).

+ +

Obviously both are equivalent in function and safely interchangeable in practice. It is interesting, though, that some careful Google queries give substantially different hit counts. Presently,

+ +
""mean squared"" -square -root -Einstein -Relativity
+
+ +

returns about 367,000 results (notice the necessity of ruling out the phrase ""$e=m c^2$"" popularly quoted in certain contexts, which demands the use of ""squared"" instead of ""square"" when written out), while

+ +
""mean square"" -squared -root  -Einstein -Relativity
+
+ +

(maintaining analogous exclusions for comparability) returns an order of magnitude more, at 3.47 million results. This (weakly) suggests people favor ""mean square"" over ""mean squared,"" but don't take this too much to heart: ""mean squared"" is used in official SAS documentation, for instance.

+",2012-06-20 20:29:04.380 +30434,9605.0,2,,16366.0,,,,CC BY-SA 3.0,"

This is indeed something often glossed over.

+ +

Some people are doing something a bit cheeky: holding out a proportion of the words in each document, and using the predictive probabilities of these held-out words given the document-topic mixtures as well as the topic-word mixtures. This is obviously not ideal, as it doesn't evaluate performance on any held-out documents.

+ +

To do it properly with held-out documents, as suggested, you do need to ""integrate over the Dirichlet prior for all possible topic mixtures"". http://people.cs.umass.edu/~wallach/talks/evaluation.pdf reviews a few methods for tackling this slightly unpleasant integral. I'm just about to try and implement this myself in fact, so good luck!

+",2012-08-20 14:56:05.707 +30864,4890.0,2,,30862.0,,,,CC BY-SA 3.0,"

A low-rank approximation $\hat{X}$ of $X$ can be decomposed into a matrix square root as $G=U_{r}\lambda_{r}^\frac{1}{2}$, where the eigendecomposition of $X$ is $U\lambda U^T$, so that $\hat{X}=GG^T$. Note that the subscript $r$ represents the number of eigenvectors and eigenvalues used in the approximation; hence the approximation does reduce the number of features needed to represent the data, since the data can now be represented by $G$. In some examples low-rank approximations are considered as basis or latent variable (dictionary) based expansions of the original data, under special constraints like orthogonality, non-negativity (non-negative matrix factorization), etc.
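
+ +

Here is a short R sketch of this construction on a made-up positive semi-definite matrix; $G$ has only $r$ columns, yet $GG^T$ reproduces the matrix up to the discarded eigenvalues.

+ +
# Rank-r approximation of a symmetric PSD matrix via its eigendecomposition
+set.seed(1)
+A <- crossprod(matrix(rnorm(100 * 10), 100, 10))    # a 10 x 10 PSD matrix
+e <- eigen(A, symmetric = TRUE)
+
+r <- 3
+G <- e$vectors[, 1:r] %*% diag(sqrt(e$values[1:r])) # G = U_r Lambda_r^(1/2)
+A_hat <- G %*% t(G)                                 # rank-r approximation G G^T
+
+norm(A - A_hat, ""F"") / norm(A, ""F"")                 # relative Frobenius error
+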

+",2012-08-28 00:53:18.257 +30960,132.0,2,,30957.0,,,,CC BY-SA 3.0,"

You can ""fit"" the model to different data and then simulate:

+ +
m2 <- Arima(z,model=m1)
+simulate.Arima(m2,future=TRUE,bootstrap=TRUE)
+
+ +

m2 will have the same parameters as m1 (they are not re-estimated), but the residuals, etc., are computed on the new data.

+ +

However, I am concerned with your model. Seasonal models are for when the seasonality is fixed and known. With animal population data, you almost certainly have aperiodic population cycling. This is a well-known phenomenon and can easily be handled with non-seasonal ARIMA models. Look at the literature on the Canadian lynx data for discussion.

+ +

By all means, use the square root, but then I would use a non-seasonal ARIMA model. Provided the AR order is greater than 1, it is possible to have cycles. See

+ +

You can do all this in one step:

+ +
m1 <- auto.arima(y, lambda=0.5)
+
+ +

Then proceed with your simulations as above.

+",2012-08-29 17:25:54.740 +31587,7007.0,2,,31575.0,,,,CC BY-SA 4.0,"

Please, check the comments above. Here is a quick implementation in R.

+ +
x <- c(1,2,1,1,3,4,4,1,2,4,1,4,3,4,4,4,3,1,3,2,3,3,3,4,2,2,3)
+p <- matrix(nrow = 4, ncol = 4, 0)
+for (t in 1:(length(x) - 1)) p[x[t], x[t + 1]] <- p[x[t], x[t + 1]] + 1
+for (i in 1:4) p[i, ] <- p[i, ] / sum(p[i, ])
+
+ +

Results:

+ +
> p
+          [,1]      [,2]      [,3]      [,4]
+[1,] 0.1666667 0.3333333 0.3333333 0.1666667
+[2,] 0.2000000 0.2000000 0.4000000 0.2000000
+[3,] 0.1428571 0.1428571 0.2857143 0.4285714
+[4,] 0.2500000 0.1250000 0.2500000 0.3750000
+
+ +

A (probably dumb) implementation in MATLAB (which I have never used, so I don't know if this is going to work. I've just googled ""declare vector matrix MATLAB"" to get the syntax):

+ +
x = [ 1, 2, 1, 1, 3, 4, 4, 1, 2, 4, 1, 4, 3, 4, 4, 4, 3, 1, 3, 2, 3, 3, 3, 4, 2, 2, 3 ]
+n = length(x) - 1
+p = zeros(4,4)
+for t = 1:n
+  p(x(t), x(t + 1)) = p(x(t), x(t + 1)) + 1
+end
+for i = 1:4
+  p(i, :) = p(i, :) / sum(p(i, :))
+end
+
+",2012-09-11 17:05:10.090 +32053,346.0,2,,32038.0,,,,CC BY-SA 3.0,"

Short answer: Yes, you can use ID as random effect with 6 levels.

+ +

Slightly longer answer: @BenBolker's GLMM FAQ says (among other things) the following under the headline ""Should I treat factor xxx as fixed or random?"":

+ +
+

One point of particular relevance to 'modern' mixed model estimation + (rather than 'classical' method-of-moments estimation) is that, for + practical purposes, there must be a reasonable number of + random-effects levels (e.g. blocks) — more than 5 or 6 at a minimum.

+
+ +

So you are at the lower bound, but on the right side of it.

+",2012-09-20 12:35:29.537 +35160,12273.0,2,,35097.0,,,,CC BY-SA 3.0,"

The main issue is that the first experiment (Sun gone nova) is not repeatable, which makes it highly unsuitable for frequentist methodology, which interprets probability as an estimate of how frequent an event is given that we can repeat the experiment many times. In contrast, Bayesian probability is interpreted as our degree of belief given all available prior knowledge, making it suitable for common-sense reasoning about one-time events. The dice throw experiment is repeatable, but I find it very unlikely that any frequentist would intentionally ignore the influence of the first experiment and be so confident in the significance of the obtained results.

+ +

Although it seems that the author mocks frequentist reliance on repeatable experiments and their distrust of priors, given the unsuitability of the experimental setup to the frequentist methodology I would say that the real theme of this comic is not frequentist methodology but blind following of an unsuitable methodology in general. Whether it's funny or not is up to you (for me it is), but I think it misleads more than it clarifies the differences between the two approaches.

+",2012-11-12 16:27:57.197 +42517,594.0,2,,42513.0,,,,CC BY-SA 3.0,"

Of course, why not?

+ +

+ +

Here's an example (one of dozens I found with a simple google search):

+ +

+ +

(Image source is the measuring usability blog, here.)

+ +

I've seen means, means plus or minus a standard deviation, various quantiles (like median, quartiles, 10th and 90th percentiles) all displayed in various ways.

+ +

Instead of drawing a line right across the plot, you might mark information along the bottom of it - like so:

+ +

+ +

There's an example (one of many to be found) with a boxplot across the top instead of at the bottom, here.

+ +

Sometimes people mark in the data:

+ +


+(I have jittered the data locations slightly because the values were rounded to integers and you couldn't see the relative density well.)

+ +

There's an example of this kind, done in Stata, on this page (see the third one here)

+ +

Histograms are better with a little extra information - they can be misleading on their own.

+ +

You just need to take care to explain what your plot consists of! (You'd want a better title and x-axis label than I used here, for starters. Plus an explanation in a figure caption explaining what you had marked on it.)

+ +

--

+ +

One last plot:

+ +

+ +

--

+ +

My plots are generated in R.

+ +

Edit:

+ +

As @gung surmised, abline(v=mean... was used to draw the mean-line across the plot and rug was used to draw the data values (though I actually used rug(jitter(... because the data was rounded to integers).

+ +

Here's a way to do the boxplot in between the histogram and the axis:

+ +
hist(Davis2[,2],n=30)
+boxplot(Davis2[,2],
+  add=TRUE,horizontal=TRUE,at=-0.75,border=""darkred"",boxwex=1.5,outline=FALSE)
+
+ +

I'm not going to list what everything there is for, but you can check the arguments in the help (?boxplot) to find out what they're for, and play with them yourself.

+ +

However, it's not a general solution - I don't guarantee it will always work as well as it does here (note I already changed the at and boxwex options*). If you don't write an intelligent function to take care of everything, it's necessary to pay attention to what everything does to make sure it's doing what you want.

+ +

Here's how to create the data I used (I was trying to show how Theil regression was really able to handle several influential outliers). It just happened to be data I was playing with when I first answered this question.

+ +
 library(""car"")
+ add <- data.frame(sex=c(""F"",""F""),
+       weight=c(150,130),height=c(NA,NA),repwt=c(55,50),repht=c(NA,NA))
+ Davis2 <- rbind(Davis,add)
+
+ +

* -- an appropriate value for at is around -0.5 times the value of boxwex; that would be a good default if you write a function to do it; boxwex would need to be scaled in a way that relates to the y-scale (height) of the boxplot; I'd suggest 0.04 to 0.05 times the upper y-limit might often be okay.

+ +

Code for the marginal stripchart:

+ +
 hist(Davis2[,2],n=30)
+ stripchart(jitter(Davis2[,2],amount=.5),
+       method=""jitter"",jitter=.5,pch=16,cex=.05,add=TRUE,at=-.75,col='purple3')
+
+",2013-03-19 21:39:28.337 +53264,17249.0,2,,53261.0,,,,CC BY-SA 3.0,"

I can only think of this referring to $\eta^2$, computed as:

+ +

$\eta^2={SS_{effect} \over SS_{total}}$

+ +

This is the proportion of variance explained in the dependent variable by the grouping variable (in this case, a binary variable). This would be indeed the same value as the $R^2$ obtained if the difference between the two groups was estimated using simple linear regression:

+ +

$y_i=\beta_0+\beta_1group_i+\epsilon_i$
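
+ +

A quick R check of this equivalence on made-up data (the group labels and effect size are arbitrary):

+ +
# Check that eta^2 from a one-way ANOVA equals R^2 from the corresponding regression
+set.seed(1)
+group <- gl(2, 50, labels = c(""control"", ""treatment""))
+y <- rnorm(100, mean = ifelse(group == ""treatment"", 0.5, 0))
+
+a <- anova(aov(y ~ group))
+eta_sq <- a[1, ""Sum Sq""] / sum(a[, ""Sum Sq""])   # SS_effect / SS_total
+r_sq   <- summary(lm(y ~ group))$r.squared
+c(eta_sq = eta_sq, r_sq = r_sq)                  # the two values coincide
+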

+ +

I can see from the paper that the second F test is actually that of an interaction term, and since it has 1 degree of freedom, I am deducing that the second factor was also a binary variable. In this case, the $\eta^2$'s are partial $\eta^2$'s, which are the proportion of variance explained by the grouping variable (or the interaction term) controlling for the other grouping variable. In this more complex case, the partial $\eta^2$'s are the same as the partial $R^2$'s obtained from the multiple linear regression:

+ +

$y_i=\beta_0+\beta_1group_{1i}+\beta_2group_{2i}+\beta_3 \cdot group_{1i} \cdot group_{2i} + \epsilon_i$

+",2013-08-14 03:24:19.637 +53471,11155.0,2,,49879.0,,,,CC BY-SA 3.0,"

I think adaptive in this context just means re-estimation on a rolling basis. So the parameter should not change until there is a change point. Then the true parameter increases and stays constant until it decreases again because of the second change point. The estimated parameter is evaluated against the true parameter: How fast does it catch the change point? How fast does it adapt to the new environment?

+",2013-08-17 07:53:38.020 +54624,503.0,2,,54622.0,,,,CC BY-SA 3.0,"

Yes it is. I don't know the commands in R, but in SAS PROC MIXED you can have variables at either level in the MODEL statement and you can include interactions, e.g., a split-plot design:

+ +
proc mixed;
+   class A B Block;
+   model Y = A B A*B;
+   random Block A*Block;
+run;
+
+ +

where A is assigned to whole plots and B is assigned to subplots.
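
+ +

That said, a rough lme4 sketch of the same split-plot structure (untested on my side, and assuming a data frame d with factors A, B, Block and response Y) might be:

+ +
# Rough lme4 analogue of the split-plot model above
+# (assumes a data frame d with columns Y, A, B, Block)
+library(lme4)
+fit <- lmer(Y ~ A * B + (1 | Block) + (1 | Block:A), data = d)
+summary(fit)
+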

+",2013-09-03 22:08:41.237 +55182,17573.0,2,,55150.0,,,,CC BY-SA 4.0,"

You have defined causality incorrectly, yes. Probably, you have heard the saying "correlation isn't causation." You have essentially defined causality as correlation. The problem is worse than that, though. Causality is not a statistical or probabilistic concept at all, at least as those topics are normally taught. There is no statistical or probabilistic definition of causality: nothing involving conditional expectations or conditional distributions or suchlike. It is hard to pick up this fact from courses in statistics or econometrics, though.

+

Unfortunately, we tend to do a better job saying what causality isn't than what causality is. Causality always and everywhere comes from theory, from a priori reasoning, from assumptions. You mentioned econometrics. If you have been taught instrumental variables competently, then you know that causal effects can only be measured if you have an "exclusion restriction." And you know that exclusion restrictions always come from theory.

+

You said you wanted math, though. The guy you want to read is Judea Pearl. It's not easy math, and the math sometimes wanders off into philosophy, but that's because causality is a hard subject. Here is a page with more links on the subject. Here is a free online book I just came across. Finally, here is a previous question where I gave an answer you might find useful.

+",2013-09-12 13:18:37.003 +56091,2802.0,2,,55722.0,,,,CC BY-SA 3.0,"

If you are searching for proofs, I have been working for some time on a free stats textbook that collects lots of proofs of elementary and less elementary facts that are difficult to find in probability and statistics books (because they are scattered here and there). You can have a look at it at http://www.statlect.com/

+",2013-09-25 11:04:55.647 +56783,20473.0,2,,56780.0,,,,CC BY-SA 3.0,"

(This is an adaptation from Granger & Newbold(1986) ""Forecasting Economic Time Series"").

+ +

By construction, your error cost function is $\left[Y-g(X)\right]^2$. This incorporates a critical assumption (that the error cost function is symmetric around zero) -a different error cost function would not necessarily have the conditional expected value as the $\arg \min$ of its expected value. +You cannot minimize your error cost function because it contains unknown quantities. So you decide to minimize its expected value instead. Then your objective function becomes

+ +

$$E\left[Y-g(X)\right]^2 = \int_{-\infty}^{\infty}\left[y-g(X)\right]^2f_{Y|X}(y|x)dy $$

+ +

which I believe also answers your second question. It is intuitive that the expected value will be of $Y$ conditional on $X$, since we are trying to estimate/forecast $Y$ based on $X$. Decompose the square to obtain

+ +

$$E\left[Y-g(X)\right]^2 = \int_{-\infty}^{\infty}y^2f_{Y|X}(y|x)dy -2g(X)\int_{-\infty}^{\infty}yf_{Y|X}(y|x)dy \\+ \Big[g(X)\Big]^2\int_{-\infty}^{\infty}f_{Y|X}(y|x)dy$$

+ +

The first term does not contain $g(X)$ so it does not affect minimization, and it can be ignored. The integral in the second term equals the conditional expected value of $Y$ given $X$, and the integral in the last term equals unity. So

+ +

$$\arg \min_{g(x)} E\left[Y-g(X)\right]^2 = \arg \min_{g(x)} \Big\{ -2g(X)E(Y\mid X) + \Big[g(X)\Big]^2 \Big\}$$

+ +

The first derivative w.r.t $g(X)$ is $-2E(Y\mid X) + 2g(X)$ leading to the first order condition for minimization $g(X) = E(Y\mid X)$ while the second derivative is equal to $2>0$ which is sufficient for a minimum.
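
+ +

A small numerical illustration in R (my own made-up example, not from Granger & Newbold): with $Y = X^2 + \varepsilon$ and mean-zero noise, $E(Y\mid X)=X^2$, and no other candidate $g(X)$ achieves a smaller mean squared error.

+ +
# E(Y|X) = X^2 gives the smallest mean squared error among candidate predictors
+set.seed(1)
+x <- rnorm(1e5)
+y <- x^2 + rnorm(1e5)                        # so E(Y | X = x) = x^2
+
+mse <- function(g) mean((y - g(x))^2)
+c(cond_mean = mse(function(x) x^2),          # the conditional expectation
+  constant  = mse(function(x) 1 + 0 * x),    # a constant predictor (E(Y) = 1 here)
+  other     = mse(function(x) abs(x)))       # some other arbitrary g(X)
+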

+ +

ADDENDUM: The logic of the ""add and subtract"" proof approach.

+ +

The OP is puzzled by the approach stated in the question, because it seems tautological. It isn't, because while using the tactic of adding and subtracting makes a specific part of the objective function zero for an arbitrary choice of the term that is added and subtracted, it does NOT equalize the value function, namely the value of the objective function evaluated at the candidate minimizer.

+ +

For the choice $g(X) = E(Y \mid X)$ we have the value function $ V\left(E(Y\mid X)\right) = E\Big[ (Y-E(Y \mid X))^2\mid X\Big]$. +For the arbitrary choice $g(X) = h(X)$ we have the value function $ V\left(h(X)\right) = E\Big[ (Y-h(X))^2\mid X\Big]$.

+ +

I claim that

+ +

$$V\left(E(Y\mid X)\right) \le V\left(h(X)\right)$$ +$$\Rightarrow E(Y^2\mid X) -2E\Big [(YE(Y \mid X))\mid X\Big] + E\Big [(E(Y \mid X))^2\mid X\Big] \\\le E(Y^2\mid X) -2E\Big [(Yh(X))\mid X\Big] + E\Big [(h(X))^2\mid X\Big]$$

+ +

The first term of the LHS and the RHS cancel out. Also note that the outer expectation is conditional on $X$. By the properties of conditional expectations we end up with

+ +

$$...\Rightarrow -2E(Y \mid X)\cdot E\Big (Y\mid X\Big) + \Big [E(Y \mid X)\Big]^2 \le -2E(Y\mid X)h(X) + \Big [h(X)\Big]^2$$

+ +

$$\Rightarrow 0 \le \Big [E(Y \mid X)\Big]^2-2E(Y\mid X)h(X) + \Big [h(X)\Big]^2$$

+ +

$$\Rightarrow 0 \le \Big [E(Y \mid X) - h(x)\Big]^2$$ +which holds with strict inequality if $h(x) \neq E(Y \mid X)$. So $E(Y \mid X)$ is the global and unique minimizer.

+ +

But this also says that the ""add-and-subtract"" approach is not the most illuminating way of proof here.

+",2013-10-04 01:05:36.887 +56860,155.0,2,,56859.0,,,,CC BY-SA 3.0,"

Coming from a behavioural sciences background, I associate this terminology particularly with introductory statistics textbooks. In this context the distinction is that :

+ +
    +
  • Descriptive statistics are functions of the sample data that are intrinsically interesting in describing some feature of the data. Classic descriptive statistics include mean, min, max, standard deviation, median, skew, kurtosis.
  • +
  • Inferential statistics are a function of the sample data that assists you to draw an inference regarding an hypothesis about a population parameter. Classic inferential statistics include z, t, $\chi^2$, F-ratio, etc.
  • +
+ +

The important point is that any statistic, inferential or descriptive, is a function of the sample data. A parameter is a function of the population, where the term population is the same as saying the underlying data generating process.

+ +

From this perspective the status of a given function of the data as a descriptive or inferential statistic depends on the purpose for which you are using it.

+ +

That said, some statistics are clearly more useful in describing relevant features of the data, and some are well suited to aiding inference.

+ +
    +
  • Inferential statistics: For standard test statistics like t and z, under a data generating process where the null hypothesis is false, the expected value is strongly influenced by sample size. Most researchers would not see such statistics as estimating a population parameter of intrinsic interest.
  • +
  • Descriptive statistics: In contrast descriptive statistics do estimate population parameters that are typically of intrinsic interest. For example the sample mean and standard deviation provide estimates of the equivalent population parameters. Even descriptive statistics like the minimum and maximum provide information about equivalent or similar population parameters, although of course in this case, much more care is required. Furthermore, many descriptive statistics might be biased or otherwise less than ideal estimators. However, they still have some utility in estimating a population parameter of interest.
  • +
+ +

So from this perspective, the important things to understand are:

+ +
    +
  • statistic: function of the sample data
  • +
  • parameter: function of the population (data generating process)
  • +
  • estimator: function of the sample data used to provide an estimate of a parameter
  • +
  • inference: process of reaching a conclusion about a parameter
  • +
+ +

Thus, you could either define the distinction between descriptive and inferential based on the intention of the researcher using the statistic, or you could define a statistic based on how it is typically used.

+",2013-10-05 05:51:35.693 +57055,22.0,2,,57053.0,,,,CC BY-SA 3.0,"

I think only one descriptive statistic is needed: ""47% are male"" (assuming 0 encodes female and 1 encodes male). No other statistics are really helpful to describe those data. If you thought these were a randomish sample of a larger population, you could compute the confidence interval for that proportion.
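
+ +

For example, with a hypothetical sample of 100 people of whom 47 are male, one line of R gives that interval:

+ +
# Hypothetical numbers: 47 males out of 100 sampled people
+prop.test(47, 100)$conf.int
+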

+",2013-10-08 13:08:34.837 +57177,594.0,2,,57164.0,,,,CC BY-SA 4.0,"

As for qualitative differences, the lognormal and gamma are, as you say, quite similar.

+

Indeed, in practice they're often used to model the same phenomena (some people will use a gamma where others use a lognormal). They are both, for example, constant-coefficient-of-variation models (the CV for the lognormal is $\sqrt{e^{\sigma^2} -1}$, for the gamma it's $1/\sqrt \alpha$).

+

[How can it be constant if it depends on a parameter, you ask? It applies when you model the scale (location for the log scale); for the lognormal, the $\mu$ parameter acts as the log of a scale parameter, while for the gamma, the scale is the parameter that isn't the shape parameter (or its reciprocal if you use the shape-rate parameterization). I'll call the scale parameter for the gamma distribution $\beta$. Gamma GLMs model the mean ($\mu=\alpha\beta$) while holding $\alpha$ constant; in that case $\mu$ is also a scale parameter. A model with varying $\mu$ and constant $\alpha$ or $\sigma$ respectively will have constant CV.]

+

You might find it instructive to look at the density of their logs, which often shows a very clear difference.

+

The log of a lognormal random variable is ... normal. It's symmetric.

+

The log of a gamma random variable is left-skew. Depending on the value of the shape parameter, it may be quite skew or nearly symmetric.

+

Here's an example, with both lognormal and gamma having mean 1 and variance 1/4. The top plot shows the densities (gamma in green, lognormal in blue), and the lower one shows the densities of the logs:

+

+

(Plotting the log of the density of the logs is also useful. That is, taking a log-scale on the y-axis above)
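
+

If you want to reproduce a figure along these lines, here is a rough R sketch (not the exact code used for the plot above; the lognormal and gamma parameters are derived from mean 1 and variance 1/4):

+
# Lognormal and gamma, both with mean 1 and variance 1/4,
+# together with the densities of their logs (the gamma's log is left-skew)
+s2  <- log(1 + 0.25)        # lognormal sigma^2, from CV^2 = var/mean^2 = 0.25
+mu  <- -s2 / 2              # so that exp(mu + s2/2) = 1
+shp <- 4; sc <- 0.25        # gamma: mean = shp*sc = 1, var = shp*sc^2 = 0.25
+
+x <- seq(0.01, 3, length.out = 400)
+par(mfrow = c(2, 1))
+plot(x, dgamma(x, shp, scale = sc), type = "l", col = "green3", ylab = "density")
+lines(x, dlnorm(x, mu, sqrt(s2)), col = "blue")
+
+lx <- seq(-3, 1.5, length.out = 400)
+plot(lx, dgamma(exp(lx), shp, scale = sc) * exp(lx), type = "l",
+     col = "green3", ylab = "density of log")
+lines(lx, dnorm(lx, mu, sqrt(s2)), col = "blue")
+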

+

This difference implies that the gamma has more of a tail on the left, and less of a tail on the right; the far right tail of the lognormal is heavier and its left tail lighter. And indeed, if you look at the skewness of the lognormal and gamma for a given coefficient of variation, the lognormal is more right-skew ($\text{CV}^3+3\text{CV}$) than the gamma ($2\text{CV}$).

+",2013-10-09 22:43:42.927 +59199,21029.0,2,,59176.0,,,,CC BY-SA 3.0,"
    +
  1. Is $(X, X+Y)$ normal? Yes! It is a linear combination of independent univariate normal distributions.
  2. +
  3. Means: the mean of $X$ is $\mu_1$, and the mean of $X+Y$ is the sum of the means because they are independent, so $\mu_1+\mu_2$.
  4. +
  5. Variance-covariance. The variance of the sum of two independent random variables is the sum of their variance. So, the variance of $(X,X+Y)$ is $(\sigma_1^2, \sigma_1^2+\sigma_2^2)$. Now calculate the covariance:
  6. +
+ +

$$ Cov(X,X+Y) = Cov(X,X)+Cov(X,Y) = \sigma_1^2+0$$

+ +

A multidimensional normal distribution is defined by its mean and variance-covariance matrix. Therefore,

+ +

$$ (X,X+Y)\sim N \left(\left( \begin{array}{c} \mu_1 \\ \mu_1+\mu_2 \end{array}\right), +\left( \begin{array}{ccc} +\sigma_{1}^2 & \sigma_{1}^2 \\ +\sigma_{1}^2 &\sigma_{1}^2+\sigma_{2}^2 \\ + \end{array} \right) \right) +$$
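
+ +

A quick Monte Carlo sanity check of these moments in R, with made-up parameter values $\mu_1=1$, $\mu_2=-2$, $\sigma_1=2$, $\sigma_2=3$:

+ +
# Simulated check of the mean vector and covariance matrix of (X, X+Y)
+set.seed(1)
+x <- rnorm(1e6, mean = 1, sd = 2)
+y <- rnorm(1e6, mean = -2, sd = 3)
+
+colMeans(cbind(x, x + y))   # approximately (1, -1) = (mu1, mu1 + mu2)
+cov(cbind(x, x + y))        # approximately rbind(c(4, 4), c(4, 13))
+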

+",2013-11-09 08:11:27.770 +59239,1411.0,2,,59110.0,,,,CC BY-SA 3.0,"

I think you're exactly right.

+ +

Set up data like your example:

+ +
d <- expand.grid(Site=factor(1:10),rep=1:5)
+d <- transform(d,Clone=factor(LETTERS[(as.numeric(Site)+1) %/% 2]))
+library(lme4)
+## could use development version of lme4 to simulate, but will do
+## it by hand
+beta <- c(2,1,3,-2,2)  ## clone effects (intercept + differences)
+X <- model.matrix(~Clone,d)
+set.seed(1)
+u.site <- rnorm(length(levels(d$Site)),sd=1)
+    d$y <- rnorm(nrow(d),
+       mean=X %*% beta + u.site[d$Site],
+       sd=2)
+
+ +

Now analyze:

+ +
m1 <- lmer(y~Clone+(1|Site),data=d)
+round(fixef(m1),3)
+## (Intercept)      CloneB      CloneC      CloneD      CloneE 
+##       2.624      -0.034       2.504      -2.297       2.396
+
+VarCorr(m1)
+##  Groups   Name        Std.Dev.
+##  Site     (Intercept) 0.0000  
+##  Residual             1.6108
+
+ +

I don't think there's actually anything wrong, but I used a pretty big residual variance, and so in this case (probably only on a subset of replicates), lmer estimates a zero among-site variation.

+",2013-11-10 00:36:44.380 +59371,7155.0,2,,57549.0,,,,CC BY-SA 3.0,"

Replacing the Euclidean distance in kNN with another distance function is equivalent to ""kernelizing it."" A valid Mercer kernel is a continuous, symmetric function of two observations whose kernel (Gram) matrix is positive semi-definite for every finite set of points in $D$. Many interesting properties, such as stationarity, can be imbued in a kernel, which makes it an attractive option for things like time-series and geospatial statistics. There exist kernels for structured inputs that otherwise could not be represented as fixed-length vectors. There also exist kernels in the literature that are not valid Mercer kernels and still empirically perform well.
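
+ +

To make the ""kernelizing"" idea concrete, here is a toy R sketch of my own (not a library routine): the kernel-induced distance $d(x,z)^2 = k(x,x) - 2k(x,z) + k(z,z)$ simply replaces the Euclidean one inside a plain kNN vote. The polynomial kernel and the iris example are arbitrary choices.

+ +
# Toy kernel-kNN: distances are computed in the feature space induced by a kernel
+knn_kernel <- function(Xtrain, ytrain, Xtest, k = 5,
+                       kern = function(a, b) (1 + a %*% t(b))^2) {  # polynomial kernel
+  Ktt <- diag(kern(Xtest, Xtest))        # k(x, x) for the test points
+  Krr <- diag(kern(Xtrain, Xtrain))      # k(z, z) for the training points
+  Ktr <- kern(Xtest, Xtrain)             # k(x, z) cross terms
+  d2  <- outer(Ktt, Krr, ""+"") - 2 * Ktr  # kernel-induced squared distances
+  apply(d2, 1, function(row) {
+    nn <- ytrain[order(row)[1:k]]
+    names(which.max(table(nn)))          # majority vote among the k nearest
+  })
+}
+
+# Example on two classes of iris
+set.seed(1)
+d <- iris[iris$Species != ""setosa"", ]
+idx <- sample(nrow(d), 60)
+pred <- knn_kernel(as.matrix(d[idx, 1:4]), d$Species[idx], as.matrix(d[-idx, 1:4]))
+mean(pred == as.character(d$Species[-idx]))   # test accuracy
+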

+ +

If you would like to know more about kernels, I'd recommend reviewing the literature on Gaussian Processes and SVMs.

+",2013-11-11 18:29:39.543 +59372,668.0,2,,58770.0,,,,CC BY-SA 3.0,"

A formula is requested. Unfortunately, the situation is so complicated it appears that any formula will merely be a roundabout way of enumerating all the possibilities. Instead, this answer offers an algorithm which is (a) tantamount to a formula involving sums of products of binomial coefficients and (b) can be ported to many platforms.

+ +
+ +

To obtain such a formula, break down the possibilities into mutually disjoint groups in two ways: according to how many letters not in the word are selected in the rack (let this be $m$) and according to how many wildcards (blanks) are selected (let this be $w$). When there are $r=7$ tiles in the rack, $N$ available tiles, $M$ available tiles with letters not in the word, and $W=2$ blanks available, the number of possible choices given by $(m,w)$ is

+ +

$$\binom{M}{m}\binom{W}{w}\binom{N-M-W}{r-m-w}$$

+ +

because the choices of non-word letters, blanks, and word letters are independent conditional on $(m,w,r).$

+ +

This reduces the problem to finding the number of ways to spell a word when selecting only from the tiles representing the word's letters, given that $w$ blanks are available and $r-m-w$ tiles will be selected. The situation is messy and no closed formula seems available. For instance, when $w=0$ blanks and $m=3$ out-of-word letters are drawn, there will be precisely four letters left to spell ""boot"" that were drawn from the ""b"", ""o"", and ""t"" tiles. Given there are $2$ ""b""'s, $8$ ""o""'s, and $6$ ""t""'s in the Scrabble tile set, there are positive probabilities of drawing (multisets) ""bboo"", ""bbot"", ""bbtt"", ""booo"", ""boot"", ""bott"", ""bttt"", ""oooo"", ""ooot"", ""oott"", ""ottt"", and ""tttt"", but only one of these spells ""boot"". And that was the easy case! For example, supposing the rack contains five tiles chosen randomly from the ""o"", ""b"", and ""t"" tiles, together with both blanks, there are many more ways to spell ""boot""--and not to spell it. For instance, ""boot"" can be spelled from ""__boott"" and ""__bbttt"", but not from ""__ttttt"".

+ +

This counting--the heart of the problem--can be handled recursively. I will describe it with an example. Suppose we wish to count the ways of spelling ""boot"" with one blank and four more tiles from the collection of ""b"", ""o"", and ""t"" tiles (whence the remaining two tiles show non-blank letters not in {""b"", ""o"", ""t""}). Consider the first letter, ""b"":

+ +
    +
  1. A ""b"" can be drawn in $\binom{2}{1}$ ways from the two ""b"" tiles available. This reduces the problem to counting the number of ways of spelling the suffix ""oot"" using both blanks and just three more tiles from the collection of ""o"" and ""t"" tiles.

  2. +
  3. One blank can be designated as a ""b"". This reduces the problem to counting the number of ways of spelling ""oot"" using the remaining blank and just three more tiles from the collection of ""o"" and ""t"" tiles.

  4. +
+ +

In general, steps (1) and (2)--which are disjoint and therefore contribute additively to the probability calculations--can be implemented as a loop over the possible number of blanks that might be used for the first letter. The reduced problem is solved recursively. The base case occurs when there's one letter left, there is a certain number of tiles with that letter available, and there may be some blanks in the rack, too. We only have to make sure that the number of blanks in the rack plus the number of available tiles will be enough to obtain the desired quantity of that last letter.

+ +

Here is R code for the recursive step. rack usually equals $7$, word is an array of counts of the letters (such as c(b=1, o=2, t=1)), alphabet is a similar structure giving the numbers of available tiles with those letters, and wild is the number of blanks assumed to occur in the rack.

+ +
f <- function(rack, word, alphabet, wild) {
+  if (length(word) == 1) {
+    return(ifelse(word > rack+wild, 0, choose(alphabet, rack)))
+  }
+  n <- word[1]
+  if (n <= 0) return(0)
+  m <- alphabet[1]
+  x <- sapply(max(0, n-wild):min(m, rack), 
+              function(i) {
+                choose(m, i) * f(rack-i, word[-1], alphabet[-1], wild-max(0, n-i))
+              })
+  return(sum(x))
+}
+
+ +

An interface to this function specifies the standard Scrabble tiles, converts a given word into its multiset data structure, and performs the double sum over $m$ and $w$. Here is where the binomial coefficients $\binom{M}{m}$ and $\binom{W}{w}$ are computed and multiplied.

+ +
scrabble <- function(sword, n.wild=2, rack=7, 
+              alphabet=c(a=9,b=2,c=2,d=4,e=12,f=2,g=3,h=2,i=9,j=1,k=1,l=4,m=2,
+                         n=6,o=8,p=2,q=1,r=6,s=4,t=6,u=4,v=2,w=2,x=1,y=2,z=1),
+              N=sum(alphabet)+n.wild) {
+  word = sort(table(strsplit(sword, NULL))) # Sorting speeds things a little
+  a <- sapply(names(word), function(s) alphabet[s])
+  names(a) <- names(word)
+  x <- sapply(0:n.wild, function(w) {
+    sapply(sum(word):rack-w, 
+           function(i) {
+             f(i, word, a, wild=w) *
+               choose(n.wild, w) * choose(N-n.wild-sum(a), rack-w-i)
+           })
+  })
+  return(list(numerator = sum(x), denominator = choose(N, rack),
+              value=sum(x) / choose(N, rack)))
+}
+
+ +
+ +

Let's try out this solution and time it as we go. The following test uses the same inputs employed in the simulations by @Rasmus Bååth:

+ +
system.time(x <- sapply(c(""boot"", ""red"", ""axe"", ""zoology""), scrabble))
+
+ +

This machine reports $0.05$ seconds total elapsed time: reasonably quick. The results?

+ +
> x
+            boot        red         axe         zoology     
+numerator   114327888   1249373480  823897928   11840       
+denominator 16007560800 16007560800 16007560800 16007560800 
+value       0.007142118 0.07804896  0.0514693   7.396505e-07
+
+ +

The probability for ""boot"" of $114327888/16007560800$ exactly equals the value $2381831/333490850$ obtained in my other answer (which uses a similar method but couches it in a more powerful framework requiring a symbolic algebra computing platform). The probabilities for all four words are reasonably close to Bååth's simulations (which could not be expected to give an accurate value for ""zoology"" due to its low probability of $11840/16007560800,$ which is less than one in a million).

+",2013-11-11 18:34:44.430 +59515,22564.0,2,,58861.0,,,,CC BY-SA 3.0,"

If I understood correctly you are not worried about false positives. In that case the patients that do not have the complication are not of interest to you. You want to design the study to claim a certain degree of sensitivity.

+ +

There may be a way to calculate this directly, but since no one has answered you yet I will show my approach. I calculated the p-values for a one-sided proportion test using the R prop.test() function and the null hypothesis that the % correct is <0.5. This approach ignores all the patients who did not have complications. It gives the number of patients with complications that you will need.

+ +

This was done for sample sizes in the sequence of 10 to 100 by 1, and for % correct from .01 to 1 by .01. Then from these results I found the minimum % correct that yielded p<0.05 for each sample size.

+ +

The upper chart shows the relationship between the ""significant"" % correct and the sample size, while the lower one shows power functions for a few example sample sizes. For the latter, the horizontal line corresponds to p=.05.

+ +

If n=10 then the minimum observed % correct that will allow you to ""reject"" (at p<0.05) the hypothesis that your method will detect less than half the patients with complications is 76.1%. Even for n=100 (remember this is 100 patients with complications), you will need to observe at least a 58.3% success rate to claim ""significance"".

+ +

If you want to calculate the total number of patients to enroll you need to multiply the sample sizes by 1/0.15 which is ~7.

+ +

I am fairly confident about this approach but not 100%, so hopefully someone will check it.

+ +

+ +

R code:

+ +
n<-seq(10,100,by=1) # sample sizes to check
+perc.correct<-seq(.001,1,by=.001) # observed percent correct to check
+
+alpha<-0.05 # ""Significance"" Cutoff
+
+out=matrix(nrow=length(n)*length(perc.correct),ncol=3)
+cnt=1
+for(i in n){
+  for(j in perc.correct){
+    p<-prop.test(j*i,i, p=.5, alternative=""g"", correct=F)$p.value
+    out[cnt,]<-cbind(i,100*j,p)
+    cnt<-cnt+1
+  }
+}
+
+
+# get lowest % correct that yielded p<0.05 for each sample size
+out2=matrix(nrow=length(n),ncol=2)
+cnt<-1
+for(n2 in n){
+  min.perc<-head(out[which(out[,1]==n2 & out[,3]<alpha),2],1)
+  out2[cnt,]<-cbind(n2,min.perc)
+  cnt<-cnt+1
+}
+
+
+# plots
+layout(matrix(c(1,1,2,3,4,5), ncol=2, nrow=3, byrow=T))
+plot(out2, xlab=""Sample Size"", ylab=""% Correct"",
+     main=c(""% Correct to Reject Hypothesis Success <50%"",paste(""alpha="",alpha))
+)
+
+for(n2 in c(10,30,60,100)){
+  min.perc<-head(out[which(out[,1]==n2 & out[,3]<alpha),2],1)
+  plot(out[which(out[,1]==n2),2],out[which(out[,1]==n2),3], 
+       xlab=""% Correct"", ylab=""P-Value"", 
+       main=c(paste(""n="",n2),paste(""Min % Correct for p<"", alpha,""="",min.perc))
+  )
+  abline(h=alpha)
+}
+
+ +

Edit: +Here is a different perspective. In this case we take the theory of % correct prediction of complications >50% as the null hypothesis. So using the significance testing logic we would choose to study the imaging technique more if it is not significant, but not use it if the result is significant. Here we are attempting to disprove the research hypothesis rather than the opposite as was done above (although the original way using reverse logic is the common approach for some strange reason).

+ +

+ +

R code 2 (lines with minor changes marked with ""#*"" at the end):

+ +
n<-seq(10,100,by=1) # sample sizes to check
+perc.correct<-seq(.001,1,by=.001) # observed percent correct to check
+
+alpha<-0.05 # ""Significance"" Cutoff
+
+out=matrix(nrow=length(n)*length(perc.correct),ncol=3)
+cnt=1
+for(i in n){
+  for(j in perc.correct){
+    p<-prop.test(j*i,i, p=.5, alternative=""l"", correct=F)$p.value #*
+    out[cnt,]<-cbind(i,100*j,p)
+    cnt<-cnt+1
+  }
+}
+
+
+# get highest % correct that yielded p<0.05 for each sample size
+out2=matrix(nrow=length(n),ncol=2)
+cnt<-1
+for(n2 in n){
+  max.perc<-tail(out[which(out[,1]==n2 & out[,3]<alpha),2],1) #*
+  out2[cnt,]<-cbind(n2,max.perc) #*
+  cnt<-cnt+1
+}
+
+
+# plots
+layout(matrix(c(1,1,2,3,4,5), ncol=2, nrow=3, byrow=T))
+plot(out2, xlab=""Sample Size"", ylab=""% Correct"",
+     main=c(""% Correct to Reject Hypothesis Success >50%"",paste(""alpha="",alpha))
+)
+
+for(n2 in c(10,30,60,100)){
+  max.perc<-tail(out[which(out[,1]==n2 & out[,3]<alpha),2],1) #*
+  plot(out[which(out[,1]==n2),2],out[which(out[,1]==n2),3], 
+       xlab=""% Correct"", ylab=""P-Value"", 
+       main=c(paste(""n="",n2),paste(""Max % Correct for p<"", alpha,""="",max.perc)) #*
+  )
+  abline(h=alpha)
+}
+
+",2013-11-13 16:30:01.650 +59638,23776.0,2,,58292.0,,,,CC BY-SA 3.0,"

Since the ""deficient"" status in known for the moment of entry into follow-up, the data can be regarded as left-censored (not truncated, where it is unknown what the status is at entry).

+ +

Take a look at this paper, which deals with a very similar problem in medical research. R, SAS and Stata code (for example using survfit) is provided in the web appendix.

+ +

Cain et al., 2011 survival with left-censoring and left-truncating

+",2013-11-15 07:41:25.893 +59733,23085.0,2,,58330.0,,,,CC BY-SA 3.0,"

So I think I figured out a valid counter example.

+ +

$X_i \overset{iid}\sim N(\theta, 1)$. Then $\bar{X}$ is minimax. (This can be shown by using a sequence of priors.) However, $\bar{X}$, being unbiased for $\theta$, can only be Bayes with respect to a proper prior if the Bayes risk is 0.

+ +

$R(\theta,\bar{X}) =\dfrac{1}{n} \implies \int_{\Theta} R(\theta,\bar{X}) \pi(\theta) d\theta = \dfrac{1}{n} \int_{\Theta} \pi(\theta) d\theta = \dfrac{1}{n}$

+ +

Thus, as long as $\pi$ is proper, the Bayes risk of $\bar{X}$ cannot be 0. Hence $\bar{X}$ cannot be Bayes with respect to any proper prior. The same reasoning can be made for admissible estimators.

+",2013-11-16 21:11:29.867 +59802,22564.0,2,,59064.0,,,,CC BY-SA 3.0,"

I think your request for the ""overall correlation"" may be asking the wrong question. If you already know that you have varied factor1 and factor2, the correlations you want to look for are conditional on the combination of those factors. It is unlikely the independent variables have absolutely 0 effect on the dependent variables, so looking at the total correlation actually includes less information than looking at each combination individually.

+ +

+ +
  factor1 factor2      r     p
+1       A       1  -0.67 0.034
+2       B       1 -0.043 0.907
+3       A       2 -0.366 0.298
+4       B       2 -0.632  0.05
+5       A       3  0.066 0.856
+6       B       3 -0.276  0.44
+
+ +

R code:

+ +
set.seed(1154)
+
+dat <- data.frame(id=gl(10, 6),
+                   factor1=gl(2, 3, labels=c(""A"", ""B"")),
+                   factor2=gl(3, 1),
+                   DV1=rnorm(60),
+                   DV2=rnorm(60))
+
+
+
+out=matrix(nrow=6,ncol=4)
+par(mfrow=c(3,2))
+cnt<-1
+for(j in unique(dat$factor2)){
+  for(i in unique(dat$factor1)){
+    sub<-dat[which(dat$factor1==i & dat$factor2==j),]
+
+    cor.result<-cor.test(sub$DV1,sub$DV2)
+
+    p<-round(cor.result$p.value,3)
+    r<-round(cor.result$estimate,3)
+    out[cnt,]<-cbind(i,j, r, p)
+
+    plot(sub$DV1,sub$DV2, xlab=""DV1"", ylab=""DV2"",
+         main=c(paste(""Factor1:"", i),paste(""Factor2:"", j),paste(""r="",r,""p="",p)))
+    abline(lm(sub$DV2~sub$DV1))
+    cnt<-cnt+1
+  }
+}
+
+out<-as.data.frame(out)
+colnames(out)<-c(""factor1"",""factor2"",""r"",""p"")
+
+",2013-11-18 08:50:56.970 +59809,23094.0,2,,58353.0,,,,CC BY-SA 3.0,"

After much searching, I have found exactly the software I am looking for:

+ +

http://www.datplot.com/

+ +

Simple, GUI-driven software that allows you to import raw data and plot graphs, with dynamic scroll and zoom.

+",2013-11-18 12:42:37.623 +60004,15827.0,2,,57869.0,,,,CC BY-SA 3.0,"

I guess I would like to read or at least browse in that too, but only a polymath or a committee could write it, and the polymath isn't evident and committee books often don't work well. Also, many of the general books on statistics that tend to pop up from (e.g.) searches on Amazon just leave out most of the interesting technical details and/or are written by people not close to any cutting edge.

+ +

But I would recommend browsing in the Encyclopedia of Statistical Sciences if a library near you holds a copy:

+ +

http://eu.wiley.com/WileyCDA/WileyTitle/productCd-0471150444.html

+ +

and also that you look through what appears in Statistical Science, which has a good track record of readable review and discussion papers.

+ +

I would venture an assertion that most specialists in econometrics, psychometrics, machine learning, etc. would have little confidence that people outside their own field really understand what is currently central and most interesting in that field. (So, what else is new?)

+",2013-11-20 14:43:40.717 +60010,22601.0,2,,57752.0,,,,CC BY-SA 3.0,"

Here's my solution to the problem. I calculate all possible combinations of k of n items and calculate their mutual dependencies by transforming the problem into a graph-theoretical one: which complete subgraph on k nodes has the smallest edge sum (total dependency)? Here's a python script using the networkx library and one possible output. Please excuse any ambiguity in my question!

+ +

Code:

+ +
import networkx as nx
+import itertools
+import os
+
+#Create new graph
+G=nx.Graph()
+
+#Each node represents a dimension
+G.add_nodes_from([1,2,3,4,5,6,7,8,9,10,11])
+
+#For each dimension add edges and correlations as weights
+G.add_weighted_edges_from([(3,1,0.563),(3,2,0.25)])
+G.add_weighted_edges_from([(4,1,0.688),(4,3,0.438)])
+G.add_weighted_edges_from([(5,1,0.25),(5,2,0.063),(5,3,0.063),(5,4,0.063)])
+G.add_weighted_edges_from([(6,1,0.063),(6,2,0.25),(6,3,0.063),(6,4,0.063),(6,5,0.063)])
+G.add_weighted_edges_from([(7,2,0.25),(7,3,0.063),(7,5,0.125),(7,6,0.063)])
+G.add_weighted_edges_from([(8,1,0.125),(8,2,0.125),(8,3,0.5625),(8,5,0.25),(8,6,0.188),(8,7,0.125)])
+G.add_weighted_edges_from([(9,1,0.063),(9,2,0.063),(9,3,0.25),(9,6,0.438),(9,7,0.063),(9,8,0.063)])
+G.add_weighted_edges_from([(10,1,0.25),(10,2,0.25),(10,3,0.563),(10,4,0.125),(10,5,0.125),(10,6,0.125),(10,7,0.125),(10,8,0.375),(10,9,0.125)])
+G.add_weighted_edges_from([(11,1,0.125),(11,2,0.063),(11,3,0.438),(11,5,0.063),(11,6,0.1875),(11,7,0.125),(11,8,0.563),(11,9,0.125),(11,9,0.188)])
+
+nodes = set(G.nodes())
+combs = set(itertools.combinations(nodes,6))
+sumList = []
+for comb in combs:
+    S=G.subgraph(list(comb))
+    sum=0
+    for edge in S.edges(data=True):
+        sum+=edge[2]['weight']
+    sumList.append((sum,comb))
+
+sorted = sorted(sumList, key=lambda tup: tup[0])    
+
+fo = open(""dependency_ranking.txt"",""wb"")
+
+for i in range(0,len(sorted)):
+    totalWeight = sorted[i][0]
+    nodes = list(sorted[i][1])
+    nodes.sort()
+    out = str(i)+"": ""+str(totalWeight)+"",""+str(nodes)
+    fo.write(out.encode())
+    fo.write(""\n"".encode())
+
+fo.close()
+
+S=G.subgraph([1,2,3,4,6,7])
+sum = 0
+for edge in S.edges(data=True):
+        sum+=edge[2]['weight']
+print(sum)
+
+ +

Sample output:

+ +
0: 1.0659999999999998,[2, 4, 5, 7, 9, 11]
+1: 1.127,[4, 5, 7, 9, 10, 11]
+2: 1.128,[2, 4, 5, 9, 10, 11]
+3: 1.19,[2, 4, 5, 7, 8, 9]
+4: 1.2525,[4, 5, 6, 7, 10, 11]
+5: 1.377,[2, 4, 5, 7, 9, 10]
+6: 1.377,[2, 4, 7, 9, 10, 11]
+7: 1.377,[2, 4, 5, 7, 10, 11]
+
+ +

Input graph: +

+ +

Solution graph: +

+ +

For a toy example, k=4, n=6: +Input graph: +

+ +

Solution graph: +

+ +

Best,

+ +

Christian

+",2013-11-20 15:38:27.410 +60256,12900.0,2,,58248.0,,,,CC BY-SA 4.0,"

Genetic algorithms were used to lower the prime gap to 4680 in the recent Zhang twin primes proof breakthrough and the associated Polymath project. The bound has since been lowered by other methods, but this shows some potential for machine learning approaches in this or related areas. They can be used to devise/optimize effective ""combs"", basically sieves, for analyzing/screening smallest-possible prime gaps.

+ +

Together and Alone, Closing the Prime Gap (Erica Klarreich, Quanta magazine, 19 November 2013):

+ +
+

The team eventually came up with the Polymath project’s record-holder — a 632-tooth comb whose width is 4,680 — using a genetic algorithm that “mates” admissible combs with each other to produce new, potentially better combs.

+
+",2013-11-23 21:14:03.227 +60467,24121.0,2,,59063.0,,,,CC BY-SA 4.0,"

Suppose you have data $\{Y_t,X_{t-h}\}_{t=h+1}^T$, where $h \in \{1,2,\ldots\},$ and your goal is to build a model (say, $\hat f(X_{t-h})$) to predict $Y_t$ given $X_{t-h}$. For concreteness, suppose the data is daily and $T$ corresponds to today.

+

In-sample analysis means to estimate the model using all available data up to and including $T$, and then compare the model's fitted values to the actual realizations. However, this procedure is known to draw an overly optimistic picture of the model's forecasting ability, since common fitting algorithms (e.g. using squared error or likelihood criteria) tend to take pains to avoid large prediction errors, and are thus susceptible to overfitting - mistaking noise for signal in the data.

+

A true out-of-sample analysis would be to estimate the model based on data up to and including today, construct a forecast of tomorrow's value $Y_{T+1}$, wait until tomorrow, record the forecast error $e_{T+1} \equiv Y_{T+1} - \hat f(X_{T+1-h}),$ re-estimate the model, make a new forecast of $Y_{T+2}$, and so forth. At the end of this exercise, one would have a sample of forecast errors $\{e_{T+l}\}_{l=1}^L$ which would be truly out-of-sample and would give a very realistic picture of the model's performance.

+

Since this procedure is very time-consuming, people often resort to "pseudo", or "simulated", out-of-sample analysis, which means to mimic the procedure described in the last paragraph, using some historical date $T_0 < T$, rather than today's date $T$, as a starting point. The resulting forecasting errors $\{e_t\}_{t=T_0+1}^T$ are then used to get an estimate of the model's out-of-sample forecasting ability.
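
+

A compact R sketch of this pseudo-out-of-sample scheme, using a simple AR(1) forecaster on simulated data with $h=1$ (the model and the choice of $T_0$ are made up for illustration):

+
# Pseudo-out-of-sample evaluation of one-step-ahead AR(1) forecasts
+set.seed(1)
+y  <- arima.sim(list(ar = 0.6), n = 300)
+T0 <- 200                                      # start of the evaluation window
+
+err <- sapply(T0:(length(y) - 1), function(t) {
+  fit <- arima(y[1:t], order = c(1, 0, 0))     # re-estimate on data up to time t
+  fc  <- predict(fit, n.ahead = 1)$pred        # forecast the value at t + 1
+  y[t + 1] - fc                                # record the forecast error
+})
+
+sqrt(mean(err^2))                              # pseudo-out-of-sample RMSE
+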

+

Note that pseudo-out-of-sample analysis is not the only way to estimate a model's out-of-sample performance. Alternatives include cross-validation and information criteria.

+

A very good discussion of all these issues is provided in Chapter 7 of

+

[old link]

+

http://www.stanford.edu/~hastie/local.ftp/Springer/OLD/ESLII_print4.pdf

+

[new link, 11/01/2021]

+

https://web.stanford.edu/~hastie/Papers/ESLII.pdf

+",2013-11-26 17:19:29.190 +60584,22923.0,2,,54724.0,,,,CC BY-SA 3.0,"

In these 2 situations, the comparative performance of the flexible vs. the inflexible model also depends on:

+ +
    +
  • whether the true relation y=f(x) is close to linear or very non-linear;
  • +
  • whether you tune/constrain the degree of flexibility of the ""flexible"" model when fitting it.
  • +
+ +

If the relation is close to linear and you don't constrain flexibility, then the linear model should give better test error in both cases, because the flexible model is likely to overfit in both cases.

+ +

You can look at it this way:

+ +
    +
  • In both cases the data doesn't contain enough information about the true relation (in the first case the relation is high-dimensional and you don't have enough data, in the second case it is corrupted by noise), but +
      +
    • the linear model brings some external prior information about the true relation (it constrains the class of fitted relations to linear ones), and
    • +
    • that prior information turns out to be right (the true relation is close to linear).
    • +
  • +
  • The flexible model, on the other hand, doesn't contain prior information (it can fit anything), so it fits the noise.
  • +
+ +

If, however, the true relation is very non-linear, it's hard to say who will win (both will lose :)).

+ +

If you tune/constrain the degree of flexibility and do it in the right way (say by cross-validation), then the flexible model should win in all cases.
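
+ +

Here is a small R simulation in the spirit of the second situation (linear truth, heavy noise, small sample; the sample sizes, noise level and degrees of freedom are arbitrary choices of mine): an unconstrained flexible fit loses to the linear fit on test error.

+ +
# Linear truth + heavy noise, small n: unconstrained flexible fit vs linear fit
+set.seed(1)
+sim_once <- function() {
+  x  <- runif(30);  y  <- 2 * x + rnorm(30, sd = 2)      # training data
+  xt <- runif(500); yt <- 2 * xt + rnorm(500, sd = 2)    # test data
+
+  lin  <- lm(y ~ x)
+  flex <- smooth.spline(x, y, df = 20)                   # very flexible, not tuned
+
+  c(linear   = mean((yt - predict(lin, data.frame(x = xt)))^2),
+    flexible = mean((yt - predict(flex, xt)$y)^2))
+}
+rowMeans(replicate(200, sim_once()))   # flexible fit has the higher test MSE here
+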

+",2013-11-28 02:02:48.857 +69580,27332.0,2,,58069.0,,,,CC BY-SA 3.0,"

Actually it's mentioned in the Regression section of Mean squared error in Wikipedia:

+ +
+

In regression analysis, the term mean squared error is sometimes used + to refer to the unbiased estimate of error variance: the residual sum + of squares divided by the number of degrees of freedom.

+
+ +

You can also find some information here: Errors and residuals in statistics. +It says the expression mean squared error may have different meanings in different cases, which can be tricky at times.
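
To see the two usages side by side, here is a quick R illustration with the built-in cars data (just a convenient example):

    fit <- lm(dist ~ speed, data = cars)
    rss <- sum(residuals(fit)^2)
    rss / nrow(cars)          # MSE as the average squared residual
    rss / df.residual(fit)    # MSE as the unbiased estimate of the error variance (RSS / df)
    summary(fit)$sigma^2      # R's residual standard error, squared, equals the latter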

+",2014-03-19 13:05:56.903 +61071,3999.0,2,,58609.0,,,,CC BY-SA 3.0,"

A few examples from clinical research might be variables that arise after randomization - randomization doesn't protect you from those at all. A few off the top of my head, that have been raised as either possibilities or been noted:

+ +
    +
  • Changes in behavior post voluntary adult male circumcision for the prevention of HIV
  • +
  • Differential loss to follow-up between treatment and control arms of an RCT
  • +
  • A more specific example might include the recent ""Benefits of Universal Gowning and Gloving"" study looking at prevention of hospital acquired infections (blog commentary here, the paper is behind a paywall). In addition to the intervention, and potentially because of it, both hand hygiene rates and contact rates between patients and staff/visitors changed.
  • +
+ +

Randomization protects against none of those effects, because they arise post-randomization.

+",2013-12-04 17:29:38.140 +61392,2958.0,2,,59134.0,,,,CC BY-SA 3.0,"

I think the problem is that you train (or rather: optimize) using your ""test"" set. In other words, you can do this, but then you need an additional independent test set for the final validation. Or a nested validation set up from the beginning.

+ +

This is how I see the problem:

+ +

There can be combinations of training and test data (particular splits), where the model trained on that training data works well with the given test set -- regardless of how representative the test set is for the actual problem. Your strategy is actually a search strategy that tries to find such combinations. As there is no guarantee that you'll encounter a really satisfactory model before encountering one of the ""fake satisfactory"" models, there is trouble lurking.

+ +

Because you decide depending on the test set performance whether to go on for a new model or not, your testing is not independent. I think this is related to the problems with other iterative model optimization approaches, where an increase in model quality seems to occur also between equivalent models.

+ +

Here's a simulation:

+ +
    +
  • multivariate normally distributed data, sd = 1 for 25 variates, first 4 informative being 0 for one class and 1 for the other.
  • +
  • 500 cases of each class in the data set, split 80:20 randomly without replacement into train and ""test"" sets.
  • +
  • 50000 cases each class independent test set.
  • +
  • repeat until ""acceptable"" accuracy of 90% is reached according to internal test set.
  • +
+ +


+circles: internal test estimate, dots and whiskers: external independent test set with 95% ci (Agresti-Coull method), red line: cumulative maximum of internal estimate.

+ +

Your rule basically uses the cumulative maximum of the internal test set. In the example that means that within few iterations, you end up with an optimistic bias that claims 1/3 less errors than your models actually have. Note that the models here cannot be distinguished with a 200 cases test set. The order of differences between the large external test set results is the same as the confidence interval width.
+You can also nicely see what I mean with skimming variance: the internal test set estimate itself is unbiased. What causes the bias is doing (potentially large) numbers of iterations and picking the maximum.

+ +

Besides the optimization that is hidden in this procedure as well, the problem is of course the large variance of the accuracy. Other performance measures like Brier's score have lower variance and thus do not lead to such serious overfitting that fast.

+ +
+ +

The code of the simulation:

+ +
require (""binom"")
+require (""MASS"")
+
+set.seed (seed=1111)
+
+randomdata <- function (class, n, p = 25, inf = 1:4){
+  x <- matrix (rnorm (n * p), nrow = n)
+  x [, inf] <- x [, inf] + class 
+
+  data.frame (class = class, X = I (x))
+}
+
+
+data <- rbind (randomdata (class = 0, n = 500), 
+               randomdata (class = 1, n = 500)) 
+indeptest <- rbind (randomdata (class = 0, n = 5e4), 
+                    randomdata (class = 1, n = 5e4)) 
+
+internal.acc <- rep (NA, 100)
+external.acc <- rep (NA, 100)
+
+for (i in 1 : 100){
+  i.train <- sample (nrow (data), size=nrow (data) *.8, replace=FALSE)
+  train <- data [ i.train, ]
+  test  <- data [- i.train,]
+
+  model <- lda (class ~ X, train)
+
+  pred <- predict (model, test)
+  indep.pred <- predict (model, indeptest)
+
+  #table (reference = test$class, prediction = pred$class)
+  internal.acc [i] <- sum (diag (table (reference = test$class, prediction = pred$class))) / nrow (test)
+  external.acc [i] <- sum (diag (table (reference = indeptest$class, prediction = indep.pred$class))) / nrow (indeptest)
+
+  if (internal.acc [i] >= 0.9) break ;
+  cat (""."")
+  }
+
+internal.acc <- internal.acc [1 : i]
+external.acc <- external.acc [1 : i]
+
+plot (internal.acc, ylab = ""accuracy"", xlab = ""iteration"")
+points (external.acc, pch = 20)
+lines (cummax (internal.acc), col = ""red"")
+
+ci <- binom.agresti.coull (external.acc*nrow (indeptest), nrow (indeptest))
+segments (x0 = seq_along (external.acc), x1 = seq_along (external.acc), y0 = ci$lower, y1 = ci$upper)
+
+",2013-12-08 21:16:58.060 +61894,13165.0,2,,58372.0,,,,CC BY-SA 3.0,"

BP-like methods find the optimal parameters for maximizing the posterior. This is basically the parameter estimation. Inference can also be done in a similar way. Prediction of the new label could be cast as maximizing a posterior given the optimal parameters of the new model. This is a fully Bayesian approach to estimation, in contrast to directly doing EM steps. For more details see section 4 @ http://www.eecs.berkeley.edu/~wainwrig/Papers/Wainwright06_JMLR.pdf

+",2013-12-14 21:30:35.820 +62181,13037.0,2,,57487.0,,,,CC BY-SA 3.0,"

Consider $\mathbf{Y} = \mathbf{X}\mu + \epsilon$ where the matrices are defined as follows:

+ +

$\begin{pmatrix} X_1 \\ X_2 \\ \vdots\\ X_n \\ Y_1 \\ Y_2 \\ \vdots \\ Y_m \end{pmatrix} = \begin{pmatrix} 1 \\ 1\\ \vdots \\ 1 \end{pmatrix}\begin{pmatrix} \mu \end{pmatrix} + \begin{pmatrix} \epsilon_1 \\ \epsilon_2 \\ \vdots \\ \epsilon_n \\ \delta_1 \\ \delta_2 \\ \vdots \\ \delta_m\end{pmatrix}$

+ +

Now we consider this as a weighted least squares problem. Let $w_i = \rho$ for $i=1,\ldots,n$ and $w_i = 1$ for $i=n+1,\ldots,n+m$. Then the weighted least squares estimate of $\mu$ is

+ +

$\hat{\mu}_{WLS} = \left(\mathbf{X}^T\mathbf{W}\mathbf{X}\right)^{-1}\mathbf{X}^T\mathbf{W}\mathbf{Y}$ where $\operatorname{diag}(\mathbf{W}) = (w_1,\ldots,w_{n+m})=(\rho,\ldots,\rho,1,\ldots,1)$.

+ +

Solving for $\hat{\mu}$ we get

+ +

$\begin{align*} +\hat{\mu} &= \dfrac{1}{\rho n + m}\left(\rho\sum_{i=1}^n X_i + \sum_{i=1}^m Y_i\right)\\ +&= \dfrac{n\rho\bar{X} + m\bar{Y}}{n\rho + m}\\ +&= \dfrac{\rho\bar{X} + \tfrac{m}{n}\bar{Y}}{\rho +\tfrac{m}{n}}\\ +&= \dfrac{\rho\bar{X} + \theta\bar{Y}}{\rho +\theta} +\end{align*}$
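
A quick numerical sanity check of this closed form in R (the values of $\rho$, $n$, $m$ and the data are arbitrary):

    set.seed(3)
    rho <- 2; n <- 5; m <- 3
    X <- rnorm(n, mean = 10); Y <- rnorm(m, mean = 10)
    w <- c(rep(rho, n), rep(1, m))                     # weights rho,...,rho,1,...,1
    coef(lm(c(X, Y) ~ 1, weights = w))                 # weighted least squares estimate of mu
    (rho * n * mean(X) + m * mean(Y)) / (rho * n + m)  # the closed-form expression above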

+",2013-12-18 18:33:25.387 +63675,13549.0,2,,57341.0,,,,CC BY-SA 3.0,"

In the interest of closing up this question, I have received confirmation from a colleague that the correct interpretation is that permutation destroys the structure of autocorrelation.

+ +

I have bolded (in my original question) the passage in the user guide that is relevant to the answer.

+",2014-01-10 23:03:52.990 +66211,668.0,2,,58203.0,,,,CC BY-SA 3.0,"

You are correct: there are serious problems with the sampling procedures described in that reference. Its description of how systematic sampling should be carried out is plainly wrong, but the problems can be fixed.

+ +

We are talking about systematic sampling of a population of $N$ sampling units. They have been indexed with $1, 2, \ldots, N$ beforehand. Systematic sampling selects a desired number $n$ of individuals in the population by taking them at relatively regular intervals within this set of indexes.

+ +

The three methods in the reference begin by choosing the nominal sampling interval $k$ either to be $N/n$ (potentially non-integral), $\lceil N/n \rceil$ (rounding up), or $\lfloor N/n \rfloor$ (rounding down). Starting at a random integer $r$ chosen uniformly from the set $\{1, 2, \ldots, \lfloor k \rfloor\}$ they select the units at indexes $r, [r + k], [r + 2k], \ldots, [r + (n-1)k],$ where ""$[x]$"" means to round $x$ to an integer according to some definite rule if $x$ is not integral. These indexes are taken modulo $N$ if necessary, circling back to the beginning if the sequence goes beyond $N$.

+ +

To see more clearly what can happen, let's consider a small population of, say, $N=7$ from which we wish to obtain a sample of size $n=3.$

+ +
    +
  1. Using $k = N/n = 7/3$, the possible starting indexes are $r=1$ and $r=2$. In the first case the selected indexes would be $1, [1+7/3], [1+2*7/3]$ = $1, 3, 6$ and in the second case the indexes would be $2, [2+7/3], [2+2*7/3]$ = $2, 4, 7$. We can picture these two samples--both equally likely to be chosen--by drawing marks below the selected indexes thus:

    + +
    Sample   1 2 3 4 5 6 7
    +   r=1   *   *     *
    +   r=2     *   *     *
    +
  2. +
  3. Using $k = \lceil 7/3 \rceil = 3$ gives three possible samples, each with a chance of $1/3$ of being chosen:

    + +
    Sample   1 2 3 4 5 6 7 
    +   r=1   *     *     *
    +   r=2   * *     *   
    +   r=3     * *     *
    +
    + +

    Notice how in the second and third samples indexes of $8$ and $9$ were calculated and wrapped around modulo $7$ to the beginning, turning into $1$ and $2$, respectively.

  4. +
  5. Using $k = \lfloor 7/3 \rfloor = 2$ gives two possible samples, each with a chance of $1/2$ of being chosen:

    + +
    Sample   1 2 3 4 5 6 7 
    +   r=1   *   *   *
    +   r=2     *   *   *
    +
  6. +
+ +

The problems pointed out in the question are apparent:

+ +
    +
  • In case 1, index $5$ is in neither possible sample and therefore is never selected.

  • +
  • In case 2, indexes $1$ and $2$ each have a $2/3$ chance of being in the sample (because they each appear in two out of the three equally likely samples) while the other indexes have only a $1/3$ chance of being in the sample. Thus this sampling method is biased towards the first two subjects in the list.

  • +
  • In case 3, index $7$ is never selected.

  • +
+ +

In general, for larger $N$ a certain fraction of the population will either never be selected (as in cases 1 and 3) or the initial subjects in the list will have relatively high chances of being selected (case 2). In case 3, the final subjects in the list will never be selected, while in case 1 the non-selected subjects are situated fairly evenly throughout the list.

+ +

Normally, systematic sampling is used because there is a natural ordering of the subjects that is related to how they can be sampled: their indexes correspond to their time of enrollment in a human trial or are positions of possible samples in a physical medium or process (like soils in a streambed or products coming off an assembly line). Therefore the problems in cases 2 and 3 are serious ones, because they systematically omit or over-weight one end of the population, whose properties easily could differ from those of the rest of the population. Any study using these methods would thereby be seriously flawed.

+ +

There are two solutions. The simplest one is straightforward and is the correct way to conduct such sampling: choose $r$ uniformly at random from the entire list of indexes, not just the first $k$ of them. For instance, here are all the equally-likely samples that can be obtained using the first (round-as-you-go) method. The first two already appeared in (1) above:

+ +
Sample   1 2 3 4 5 6 7
+   r=1   *   *     *
+   r=2     *   *     *
+   r=3   *   *   *  
+   r=4     *   *   *
+   r=5       *   *   *
+   r=6   *     *   *
+   r=7     *     *   *
+
+ +

By looking down the columns it is easy to see that each index has $3/7$ chance of appearing in the sample: there is no longer any bias.
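
This can also be checked by brute-force enumeration; here is a small R sketch of the round-as-you-go scheme with $N=7$, $n=3$ and the starting index drawn from the whole list:

    N <- 7; n <- 3; k <- N / n
    counts <- integer(N)
    for (r in 1:N) {                        # start anywhere in 1..N, not just 1..floor(k)
      idx <- round(r + (0:(n - 1)) * k)     # round-as-you-go indexes
      idx <- ((idx - 1) %% N) + 1           # wrap around modulo N
      counts[idx] <- counts[idx] + 1
    }
    counts / N                              # inclusion probability of every index: 3/7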

+ +

It is worth reflecting on how such a subtle change--choosing the initial index randomly from $1, \ldots, N$ rather than $1, \ldots, k$--can have a profound effect on the quality of the study.

+ +

The other solution is to compensate for the sampling bias using the Horvitz-Thompson estimator. This adjusts for varying probabilities of inclusion in the sample. In method (1) it would effectively recognize that the samples are only from the subpopulation indexed by $1,2,3,4,6,7$ (skipping $5$, which is never included). This wouldn't really fix much, but it would highlight the shortcomings of the results and perhaps prevent invalid inferences.

+ +

In method (2) the H-T estimator would give any observations from indexes $1$ and $2$ a weight of just one-half the weights of the other indexes. For example, if the sample happened to be $r=3$ then subjects $2, 3,$ and $6$ would be drawn. Let the values of some quantity observed for these subjects be written $x_2, x_3,$ and $x_6,$ respectively. Because the chance of including $2$ in the sample a priori was $2/3$ and the chances of including $3$ or $6$ were only $1/3,$ the Horvitz-Thompson estimator of the population mean would be

+ +

$$\frac{\frac{1}{2/3} x_2 + \frac{1}{1/3} x_3 + \frac{1}{1/3} x_6}{\frac{1}{2/3} + \frac{1}{1/3} + \frac{1}{1/3}} =\frac{\frac{1}{2} x_2 + x_3 + x_6}{\frac{1}{2} + 1 + 1} = \frac{1}{5}x_2 + \frac{2}{5} x_3 + \frac{2}{5} x_6.$$
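
For concreteness, here is the same computation in R, with made-up values for $x_2, x_3, x_6$:

    x  <- c(x2 = 1.0, x3 = 3.0, x6 = 2.0)   # hypothetical observed values for subjects 2, 3, 6
    pi <- c(2/3, 1/3, 1/3)                  # a-priori inclusion probabilities under method (2)
    sum(x / pi) / sum(1 / pi)               # equals (1/5) x2 + (2/5) x3 + (2/5) x6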

+ +

There are comparable formulas for the sampling variance of this estimate (used to construct confidence intervals and test hypotheses, for instance).

+ +

The Horvitz-Thompson estimator comes to the fore when it is impossible to avoid over- and under-weighting some subjects in a sample. It can also be applied to rescue a study that was otherwise ruined using the procedures in the reference: after the fact, if we can compute the actual probability of each subject's inclusion in the sample, then we can apply these probabilities to obtain valid estimates and valid hypothesis tests.

+",2014-02-10 17:01:10.777 +67041,17740.0,2,,58790.0,,,,CC BY-SA 3.0,"

The hinge loss term $\sum_i\max(0,1-y_i(\mathbf{w}^\intercal \mathbf{x}_i+b))$ in soft margin SVM penalizes misclassifications. In hard margin SVM there are, by definition, no misclassifications.
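
In code this term is just a sum of positive parts; a minimal R version (with labels assumed to be coded as -1/+1):

    # X: n x p matrix of inputs, y: length-n vector of -1/+1 labels, w: length-p weights, b: scalar
    hinge_term <- function(w, b, X, y) sum(pmax(0, 1 - y * (X %*% w + b)))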

+ +

This indeed means that hard margin SVM tries to minimize $\|\mathbf{w}\|^2$. Due to the formulation of the SVM problem, the margin is $2/\|\mathbf{w}\|$. As such, minimizing the norm of $\mathbf{w}$ is geometrically equivalent to maximizing the margin. Exactly what we want!

+ +

Regularization is a technique to avoid overfitting by penalizing large coefficients in the solution vector. In hard margin SVM $\|\mathbf{w}\|^2$ is both the loss function and an $L_2$ regularizer.

+ +

In soft-margin SVM, the hinge loss term also acts like a regularizer but on the slack variables instead of $\mathbf{w}$ and in $L_1$ rather than $L_2$. $L_1$ regularization induces sparsity, which is why standard SVM is sparse in terms of support vectors (in contrast to least-squares SVM).

+",2014-02-19 15:21:47.693 +67660,28059.0,2,,49906.0,,,,CC BY-SA 3.0,"

Sorry, it's an old issue, I came across this by chance.

+ +

There is a mistake in your code for the mcnemar test. Try with:

+ +
require(""survival"") # clogit() and strata() come from the survival package
n <- 100
+do.one <- function(n) {
+  id <- rep(1:n, each=2)
+  case <- rep(0:1, times=n)
+  rs <- rbinom(n*2, 1, 0.5)
+  c(
+    'pclogit' = coef(summary(clogit(case ~ rs + strata(id))))[5],
+    'pmctest' = mcnemar.test(table(rs[case == 0], rs[case == 1]))$p.value
+  )
+}
+
+out <- replicate(1000, do.one(n))
+
+ +

+",2014-02-26 16:50:15.223 +67832,17740.0,2,,57764.0,,,,CC BY-SA 3.0,"

I disagree with the explanation given in the other answer. SVM works well for high-dimensional problems with relatively few instances because it is well regularized. In this case, I suspect the problem is not the tool but rather how it is being used.

+ +
+

For SVM I use a dot kernel and other parameters are all in their defaults ...

+
+ +

This is why your SVM results are bad. If you do not tune the SVM parameters (probably $c$ in your case), the resulting classifier will likely be poor unless you happen to get lucky with the default value.

+ +
+

when I change the parameters and try again, I get same result; Naive Bayes still outperforms SVM.

+
+ +

How do you change the parameters? What search method do you use? Do you just pick a value and hope for the best? Parameter search is important and must be done properly. Typically, optimal parameters are found through cross-validation.
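
As a sketch of what such a search can look like in R with the e1071 package (the data and the grid below are made-up illustrations, not a recommendation):

    library(e1071)
    set.seed(4)
    X <- matrix(rnorm(200 * 10), 200, 10)
    y <- factor(X[, 1] + X[, 2] + rnorm(200) > 0)                 # toy binary labels
    tuned <- tune.svm(X, y, gamma = 10^(-3:0), cost = 10^(0:2))   # grid search with 10-fold CV
    tuned$best.parameters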

+ +

Note that Naive Bayes may well be better for your particular application. Just because SVM is known to work well on this type of problem does not mean that it always does.

+",2014-02-28 10:02:29.313 +68114,23984.0,2,,58059.0,,,,CC BY-SA 3.0,"

Here is a simple Matlab sketch using adaBoost+SVM; you can probably start from here...

+ +
    N = length(X); % X training labels
+    W = 1/N * ones(N,1); %Weights initialization
+    M = 10; % Number of boosting iterations 
+
+    for m=1:M
+        C = 10; %The cost parameters of the linear SVM, you can...
+                 perform a grid search for the optimal value as well           
+
+        %Calculate the error and alpha in adaBoost with cross validation
+        cmd = ['-c ', num2str(C), ' -w ', num2str(W)]; % instance weights need the weighted-instances libsvm extension (see link below)
+        model = svmtrain(X, Y, cmd);
+        [Xout, acc, ~] = svmpredict(X, Y, model); % libsvm expects (labels, data, trained model)
+
+        err = sum(W .* (Xout ~= X)) / sum(W); % weighted error rate of this round's classifier
+        alpha = log( (1-err)/err );
+
+        % update the weight
+        W = W.*exp( - alpha.*Xout.*X );
+        W = W/norm(W);
+
+    end
+
+ +

In Wang's et al's Boosting Support Vector Machines for Imbalanced Data Sets, they applied a slightly different formula in the weight update, aiming at dealing with class imbalance.

+ +

Instructions for using weights for data instances are also given on Prof. Lin's website.

+ +

Hope it helps.

+",2014-03-03 17:53:00.610 +71920,19377.0,2,,57652.0,,,,CC BY-SA 3.0,"

The tail dependence coefficients $\lambda_U$ and $\lambda_L$ are measures of extremal dependence that quantify the dependence in the upper and lower tails of a bivariate distribution with continuous margins $F$ and $G$.

+ +

The coefficients $\lambda_U$ and $\lambda_L$ are defined in terms of quantile exceedances. +For the upper tail dependence coefficient $\lambda_U$ one looks at the probability that $Y$ exceeds the $u$-quantile $G^{-1}(u)$, given that $X$ exceeds the $u$-quantile $F^{-1}(u)$, and then considers the limit as $u$ goes to $1$, provided it exists. +Large values of $\lambda_U$ imply that joint extremes are more likely than for low values of $\lambda_U$. The interpretation for the lower tail dependence coefficient $\lambda_L$ is analogous.

+ +

If $\lambda_U = 0$, then $X$ and $Y$ are said to be asymptotically independent; if $\lambda_U \in (0, 1]$ they are asymptotically dependent. +For independent variables $\lambda_U = 0$; for perfectly dependent variables $\lambda_U = 1$. Note that $\lambda_U = 0$ does NOT imply independence; the comment/statement in the question's reference is wrong. Indeed, consider a bivariate normal distribution with correlation $\rho \notin \{0, 1\}$. Then, one can show that $\lambda_U = 0$ but the variables are dependent.

+ +

A distribution with, say, $\lambda_U = 0.5$ does not mean that there is a linear dependence between $X$ and $Y$; it means that $X$ and $Y$ are asymptotically dependent in the upper tail, and the strength of the dependence is 0.5. For example, the Gumbel copula with parameter $\theta = \log(2)/\log(1.5)$ has +a coefficient of upper tail dependence equal to $0.5$.
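
For the Gumbel copula the upper tail dependence coefficient has the known closed form $\lambda_U = 2 - 2^{1/\theta}$, so this choice of $\theta$ is easy to verify:

    theta <- log(2) / log(1.5)
    2 - 2^(1 / theta)     # upper tail dependence of the Gumbel copula: 0.5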

+ +

The following figure shows a sample of 1000 points of the Gumbel copula with parameter $\theta = \log(2)/\log(1.5)$ with uniform margins (left) and standard normal margins (right). The data were generated with the copula package in R (code is provided below).

+ +

+ +
## Parameters
+theta <- log(2)/log(1.5)
+n     <- 1000
+
+## Generate a sample
+library(copula)
+set.seed(234)
+gumbel.cop <- archmCopula(""gumbel"", theta)
+x <- rCopula(n, gumbel.cop)
+
+## Visualization
+par(mfrow = c(1, 2))
+plot(x, pch = 16, col = ""gray"", xlab = ""U"", ylab = ""V"")
+plot(qnorm(x), pch = 16, col = ""gray"", xlab = ""X"", ylab = ""Y"")
+par(mfrow = c(1, 1))
+
+",2014-04-14 20:00:46.893 +79542,24527.0,2,,57851.0,,,,CC BY-SA 3.0,"

I would have to say that it would be extremely difficult for you to estimate the relative quantities of coins in circulation through any but an exhaustive (collecting a large portion of those coins simultaneously) survey.

+ +

The reason is that most businesses (I believe) hold a reasonably large stock of coins and will only distribute the coins which most efficiently make up correct change. Thus, even if you go into the same store 100 times and collect change each time, then unless you have exhausted the stock of available coins, the coins that you receive will only be those corresponding to the least change required to fulfill your needs.

+ +

Assuming you draw change requirements uniformly between 1 cent and 499 cents this ratio is:

+ +
       200        100         25         10          5          1 
+0.13559322 0.06779661 0.25423729 0.13559322 0.06779661 0.33898305
+
+ +

If the store has no shortage of coins, then your sampling procedure will automatically return the above ratios, which bear no relation to the greater population of coins in circulation. To see how I came up with these numbers, see my blog post on the topic.

+ +

But this does not account for the oddities of prices, which tend to cluster at endings like .99, .49, or .39 (in the US at least), which will definitely contribute to a higher ratio of pennies required for many purchases than in the uniform draw of change. Purchase requirements would need to be specified so as not to cause further contamination of the data. Overall, I think it is clear that this is a pretty problematic study design.

+ +

If you were forced to do something like this, then you might be able to 1. record the change total for each purchase, 2. calculate the efficient coinage selection for each purchase via the method I propose on my blog, 3. record the coins actually returned, and 4. estimate the difference between the optimal coin quantities and those actually returned, to gauge to what degree coin stocks might be diverging from the optimal quantities. From there I am not sure what to do with it in order to estimate the total coins available in the currency.

+ +

Good luck and thanks for the interesting question!

+",2014-07-05 22:35:10.283 +85353,37033.0,2,,57665.0,,,,CC BY-SA 4.0,"

I think you pretty much nailed it in your Edit. A generative model makes a more restrictive assumption about the distribution of $x$.

+

From Minka

+

"Unlike traditional generative random fields, CRFs only +model the conditional distribution $p(t|x)$ and do not explicitly model the marginal $p(x)$. Note that the labels ${ti }$ are +globally conditioned on the whole observation $x$ in CRFs. +Thus, we do not assume that the observed data $x$ are conditionally independent as in a generative random field."

+",2014-09-08 15:28:58.277 +85707,36513.0,2,,55361.0,,,,CC BY-SA 3.0,"

I'm first outlining an approach for two companies in detail, the extension to even more companies then should be intuitive (at least for the likelihood, the prior could be more tricky).

+ +

Imagine there are two companies A and B, where A has $N_A$ locomotives and B has $N_B$ locomotives. We assume $N_A \ge N_B$ (you can always switch A and B to make this hold). The total number of locomotives for that hypothesis is $N_{tot} = N_A + N_B$.

+ +

Imagine you see a locomotive with the number $n$. There are three cases for the likelihood:

+ +
    +
  1. $N_A < n$: This can't happen, so the likelihood is zero.
  2. +
  3. $N_B < n \le N_A$: This locomotive must come from company A, so there is only one locomotive with this number. Thus the likelihood is $1/N_{tot}$
  4. +
  5. $n \le N_B$: This locomotive can be either from A or from B, so there are two locomotives with this number. The likelihood to see one of them is $2/N_{tot}$.
  6. +
+ +

As a quick sanity check: The likelihood to see any number at all is $$\sum_{i=1}^\infty L(i) = \sum_{i=1}^{N_B} \frac{2}{N_{tot}} + \sum_{i=N_B+1}^{N_A} \frac{1}{N_{tot}} \\ = \frac{2\cdot N_B}{N_{tot}} + \frac{N_A-N_B}{N_{tot}} = \frac{N_A+N_B}{N_{tot}} = 1$$.

+ +
+ +

Generally, there will be (number of companies + 1) cases, one for each interval $N_i < n \le N_{i+1}$. Luckily, we can look at the problem from a different angle and see that what we need for the likelihood are actually just two numbers: $N_{tot}$, the total number of locomotives; and $N_n$, the number of locomotives that have the number $n$. How likely are we to see one of the $N_n$ locomotive, out of $N_{tot}$ locomotives? This will happen in $\frac{N_n}{N_{tot}}$ of all cases, so this fraction is the likelihood. +In Python, you can calculate this with two sum generators (and you don't even have to order the companies by size). If Ns contains a list (or tuple) of company sizes according to your hypothesis, then this will give the likelihood for seeing a locomotive with number n:

+ +
total_number_of_locomotives = sum(N for N in Ns)
+number_of_locomotives_with_that_number = sum(1 for N in Ns if n<=N)
+likelihood = (number_of_locomotives_with_that_number / total_number_of_locomotives)
+
+ +

Note that the trivial case with one company is also handled by this code (the first sum just will be $N$, the second sum will be 0 or 1, depending on whether $n\le N$).

+ +
+ +

For the priors, Zipf's law could be a good starting point for a realistic distribution of company sizes.

+",2014-09-12 09:27:13.160 +85831,37646.0,2,,57779.0,,,,CC BY-SA 3.0,"

This is the simplest proof I've been able to find.

+ +

Just by rearranging factorials, we can rewrite the hypergeometric probability function as
+$$ \mathrm{Prob}(X=x) = \frac{1}{x!} \cdot \dfrac{M^{(x)} \, K^{(x)}}{N^{(x)}} \cdot \dfrac{(N-K)^{(M-x)}}{(N-x)^{(M-x)}}, $$ +where $a^{(b)}$ is the falling power $a(a-1)\cdots(a-b+1)$. +Since $x$ is fixed, +\begin{align*} +\dfrac{M^{(x)} \, K^{(x)}}{N^{(x)}} +&= \prod_{j=0}^{x-1} \dfrac{(M-j) \cdot (K-j)}{(N-j)} \\ +&= \prod_{j=0}^{x-1} \left( \dfrac{MK}{N} \right) \cdot \dfrac{(1-j/M) \cdot (1-j/K)}{(1-j/N)} \\ +&= \left( \dfrac{MK}{N} \right) ^x \; \prod_{j=0}^{x-1} \dfrac{(1-j/M) \cdot (1-j/K)}{(1-j/N)}, +\end{align*} +which $\to \lambda^x$ as $N$, $K$ and $M$ $\to \infty$ with $\frac{MK}{N} = \lambda$.

+ +

Let's replace $N-x$, $K-x$ and $M-x$ by new variables $n$, $k$ and $m$ for simplicity. Since $x$ is fixed, as $N,K,M \to \infty$ with $KM/N \to \lambda$, so too $n,k,m \to \infty$ with $km/n \to \lambda$. Next we write +$$ A = \dfrac{(N-K)^{(M-x)}}{(N-x)^{(M-x)}} = \dfrac{(n-k)^{(m)} }{(n)^{(m)}} = \prod_{j=0}^{m-1} \left( \dfrac{n-j-k}{n-j} \right)= \prod_{j=0}^{m-1} \left( 1 - \dfrac{k}{n-j} \right)$$ +and take logs: +$$ \ln \, A = \sum_{j=0}^{m-1} \ln \left( 1 - \dfrac{k}{n-j} \right). $$ +Since the bracketed quantity is a decreasing function of $j$ we have +$$ \sum_{j=0}^{m-1} \ln \left( 1 - \dfrac{k}{n-m+1} \right) \le \ln \, A \le \sum_{j=0}^{m-1} \ln \left( 1 - \dfrac{k}{n} \right), $$ +or +$$ m \, \ln \left( 1 - \dfrac{k}{n-m+1} \right) \le \ln \, A \le m \, \ln \left( 1 - \dfrac{k}{n} \right). $$ +But $\ln (1-x) < -x$ for $0 < x < 1$, so the upper bound gives +$$ m \, \ln \left( 1 - \dfrac{k}{n-m+1} \right) \le \ln \, A < -m \, \left( \dfrac{k}{n} \right), $$ +and dividing through by $km/n$ gives +$$ \frac{n}{k} \, \ln \left( 1 - \dfrac{k}{n-m+1} \right) \le \dfrac{\ln \, A}{km/n} < - 1. $$ +Finally, we let $k$, $m$ and $n$ tend to infinity in such a way that $km/n \to \lambda$. Since both $k/n \to 0$ and $m/n \to 0$, the left bound behaves like $- \, \dfrac{n}{n-m+1} = - \, \dfrac{1}{1-m/n+1/n} \to -1$ (this uses $\ln(1-x) \sim -x$ as $x \to 0$, which is essentially the famous calculus limit $\lim_{n \to \infty} (1-1/n)^n = e^{-1}$), while the right bound is the constant $-1$. So by the Squeeze Theorem we have $\ln \, A / (km/n) \to -1$, hence $\ln \, A \to -\lambda$, and thus $A \to e^{-\lambda}$.
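
The convergence is also easy to see numerically; a small R check with $\lambda = 2$, $x = 3$, choosing $K = M \approx \sqrt{\lambda N}$ so that $MK/N \approx \lambda$:

    lambda <- 2; x <- 3
    for (N in c(1e3, 1e5, 1e7)) {
      K <- round(sqrt(lambda * N)); M <- K    # then M K / N is close to lambda
      print(c(hypergeometric = dhyper(x, K, N - K, M), poisson = dpois(x, lambda)))
    }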

+",2014-09-14 08:16:03.980 +92891,10986.0,2,,57939.0,,,,CC BY-SA 3.0,"

I also just started to look at this question.

+ +

As mentioned before, when we use the normal distribution to calculate p-values for each test, then these p-values do not take multiple testing into account. To correct for it and control the family-wise error rate, we need some adjustments. Bonferonni, i.e. dividing the significance level or multiplying the raw p-values by the number of tests, is only one possible correction. There are a large number of other multiple testing p-value corrections that are in many cases less conservative.
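
For example, in R the raw pairwise p-values (hypothetical numbers here) can be adjusted with p.adjust, which implements Bonferroni as well as several less conservative corrections:

    p.raw <- c(0.004, 0.012, 0.030, 0.210, 0.420)   # made-up raw pairwise p-values
    p.adjust(p.raw, method = ""bonferroni"")
    p.adjust(p.raw, method = ""holm"")                # usually less conservative than Bonferroni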

+ +

These p-value corrections do not take the specific structure of the hypothesis tests into account.

+ +

I am more familiar with the pairwise comparison of the original data instead of the rank transformed data as in Kruskal-Wallis or Friedman tests. In that case, which is the Tukey HSD test, the test statistic for the multiple comparison is distributed according to the studentized range distribution, which is the distribution for all pairwise comparisons under the assumption of independent samples. It is based on probabilities of multivariate normal distribution which could be calculated by numerical integration but are usually used from tables.

+ +

My guess, since I don't know the theory, is that the studentized range distribution can be applied to the case of rank tests in a similar way as in the Tukey HSD pairwise comparisons.

+ +

So, using (2) normal distribution plus multiple testing p-value corrections and using (1) studentized range distributions are two different ways of getting an approximate distribution of the test statistics. However, if the assumptions for the use of the studentized range distribution are satisfied, then it should provide a better approximation since it is designed for the specific problem of all pairwise comparisons.

+",2014-12-03 04:05:14.547 +94362,5179.0,2,,57676.0,,,,CC BY-SA 3.0,"

The ""two uniforms"" are not absolutely necessary when generating from a mixture, but they make the simulation easy to understand. The mixture of normal distributions, +$$rf_a(x)+(1-r)f_b(x)$$ +has a probability mass of $r$ associated with the first normal and $(1-r)$ with the second normal. This means that the distribution of $X\sim f$ can be decomposed as +$$\mathbb{P}(X\in\mathcal{A})=r\mathbb{P}(X_a\in\mathcal{A})+(1-r)\mathbb{P}(X_b\in\mathcal{A})$$ +for any measurable set $\mathcal{A}$, where $X_a$ and $X_b$ are normal random variables with means $a$ and $b$ respectively. This can be reinterpreted as +$$X=\begin{cases} X_a &\text{with probability $r$}\\ +X_b &\text{with probability $1-r$}\end{cases}$$ +meaning that to generate from the mixture, one can follow the steps

+ +
    +
  1. Pick between components $a$ and $b$ by generating a uniform $U\sim\mathcal{U}(0,1)$ and, if $U<r$ take $\mu=a$ and else take $\mu=b$;
  2. +
  3. Generate $X$ as $X_a$ or $X_b$ depending on the first step result, by generating a uniform $V\sim\mathcal{U}(0,1)$ and take $X=\Phi^{-1}(V)+\mu$
  4. +
+ +

This explains the use of the two uniforms.
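
These two steps translate directly into code; a small R sketch with arbitrary values $r = 0.3$, $a = -2$, $b = 2$:

    set.seed(5)
    r <- 0.3; a <- -2; b <- 2; n <- 1e4
    U  <- runif(n)                 # step 1: pick the component
    mu <- ifelse(U < r, a, b)
    V  <- runif(n)                 # step 2: inverse-CDF draw from the selected normal
    X  <- qnorm(V) + mu
    mean(mu == a)                  # close to r = 0.3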

+",2014-12-16 17:53:22.597 +99688,45797.0,2,,57217.0,,,,CC BY-SA 3.0,"

When you say ""better results"" and provide the different errors you got, is that evaluated in-sample or out-of-sample?

+ +

SVM tuning will attempt to give you the best tuning parameters for out-of-sample prediction. But for in-sample ""performance,"" higher gamma and higher cost will always do ""better."" I put 'performance' and 'better' in quotes here, because higher values of gamma and cost will not actually give you a better estimate of the underlying mapping between independent and outcome variables, but will ""overfit"" the training data. Keep in mind, SVM is capable of arbitrary complexity, so if you let it, it will perfectly fit your data. (That's assuming there are no instances of identical X's producing different Y's, but that almost surely won't happen with continuous data.) When you fit so tightly, it actually harms predictive ability, because there is a lot of spurious complexity in the prediction function, so my guess is that gamma=0.02 will outperform out-of-sample, while gamma=0.042 will dominate in-sample. If you're interested in prediction, go with 0.02. There aren't many reasons to go with in-sample performance, but there are instances when you want to be a little liberal with the cost and gamma parameters. (For example, if you're using SVM-regression to produce a residual which has purged variation related to all of the explanatory variables, a little over-fitting can be beneficial.) But in most cases, go with the out-of-sample prediction accuracy.

+ +

However, if you evaluated this out-of-sample and found that the higher gamma did better, then I must say that I'm surprised. It's theoretically possible for the performance manifold (the 3-d surface of out-of-sample MSE, cost and gamma) to be very spike-y, which can lead to poor tuning outcomes, but looking at the plot above, it looks pretty well-behaved.

+ +

Hope that helps!

+",2015-02-14 15:31:04.593 +99851,35937.0,2,,58043.0,,,,CC BY-SA 3.0,"

I know it's a delayed response but this could help anyone who is trying to do this:

+ +

Follow these steps:

+ +

Stat -> DOE -> Factorial -> Create Factorial design

+ +

Select ""General Full factorial design"" and select 2 in ""Number of Factors""

+ +

+ +

then Click on ""Design tab"", Enter the factor names (Environment & Frequency) in the name boxes and enter ""2"" as factor as you want to have 2 levels ""H2O"" and ""Salt H2O""

+ +

+ +

Then Click ""Factors"" tab and change the type as ""Text"" from ""Numeric"" using drop down menu.

+ +

Then enter the level values for two different levels as shown in the picture.

+ +

+ +

And here is the output

+ +

+ +

You can enter the response for each combination in C7 column and proceed with further analysis. Hope this helps !

+",2015-02-16 15:03:54.463 +100174,46110.0,2,,57567.0,,,,CC BY-SA 3.0,"

Using the F-statistic formula from Hayashi:

+ +

$F=\frac{\left(Rb-r\right)^T\left[R\left(X^T X\right)^{-1}R^T\right]^{-1}\left(Rb-r\right)/ \rho}{s^2}$

+ +

(notation: we're testing the hypothesis $R\beta = r$, $R\in\mathbb{R}^{\rho\times K}$, rank($R$)=$\rho$, $s^2$ is the variance of the error term)

+ +

We can immediately see why this won't apply to the heteroskedastic case--in particular, we definitely don't expect $s^2$ to be a constant.

+ +

Intuitively, we know there's a (nontrivial) relationship between $t$ and $F$ statistics--so given that $t$ statistics are heteroskedasticity-dependent, we should expect $F$-statistics to be so as well.

+ +

See here (pp 3-4) for why the Wald test you suggested is correct & hence the waldtest function in R is appropriate.
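
A minimal sketch of such a robust Wald test in R (the data-generating process below is made up purely for illustration; lmtest and sandwich supply waldtest and vcovHC):

    library(lmtest); library(sandwich)
    set.seed(6)
    d <- data.frame(x1 = rnorm(200), x2 = rnorm(200))
    d$y <- 1 + 2 * d$x1 + rnorm(200, sd = exp(d$x1))   # heteroskedastic errors
    full  <- lm(y ~ x1 + x2, data = d)
    restr <- lm(y ~ x1, data = d)
    waldtest(restr, full, vcov = vcovHC)   # heteroskedasticity-robust Wald test of x2 = 0
    anova(restr, full)                     # classical F test, not robust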

+",2015-02-19 23:03:56.703 +101645,20410.0,2,,2509.0,,,,CC BY-SA 4.0,"

Imagine a big family dinner where everybody starts asking you about PCA. First, you explain it to your great-grandmother; then to your grandmother; then to your mother; then to your spouse; finally, to your daughter (a mathematician). Each time the next person is less of a layman. Here is how the conversation might go.

+

Great-grandmother: I heard you are studying "Pee-See-Ay". I wonder what that is...

+

You: Ah, it's just a method of summarizing some data. Look, we have some wine bottles standing here on the table. We can describe each wine by its colour, how strong it is, how old it is, and so on. + +Visualization originally found here.

+

We can compose a whole list of different characteristics of each wine in our cellar. But many of them will measure related properties and so will be redundant. If so, we should be able to summarize each wine with fewer characteristics! This is what PCA does.

+

Grandmother: This is interesting! So this PCA thing checks what characteristics are redundant and discards them?

+

You: Excellent question, granny! No, PCA is not selecting some characteristics and discarding the others. Instead, it constructs some new characteristics that turn out to summarize our list of wines well. Of course, these new characteristics are constructed using the old ones; for example, a new characteristic might be computed as wine age minus wine acidity level or some other combination (we call them linear combinations).

+

In fact, PCA finds the best possible characteristics, the ones that summarize the list of wines as well as only possible (among all conceivable linear combinations). This is why it is so useful.

+

Mother: Hmmm, this certainly sounds good, but I am not sure I understand. What do you actually mean when you say that these new PCA characteristics "summarize" the list of wines?

+

You: I guess I can give two different answers to this question. The first answer is that you are looking for some wine properties (characteristics) that strongly differ across wines. Indeed, imagine that you come up with a property that is the same for most of the wines - like the stillness of wine after being poured. This would not be very useful, would it? Wines are very different, but your new property makes them all look the same! This would certainly be a bad summary. Instead, PCA looks for properties that show as much variation across wines as possible.

+

The second answer is that you look for the properties that would allow you to predict, or "reconstruct", the original wine characteristics. Again, imagine that you come up with a property that has no relation to the original characteristics - like the shape of a wine bottle; if you use only this new property, there is no way you could reconstruct the original ones! This, again, would be a bad summary. So PCA looks for properties that allow reconstructing the original characteristics as well as possible.

+

Surprisingly, it turns out that these two aims are equivalent and so PCA can kill two birds with one stone.

+

Spouse: But darling, these two "goals" of PCA sound so different! Why would they be equivalent?

+

You: Hmmm. Perhaps I should make a little drawing (takes a napkin and starts scribbling). Let us pick two wine characteristics, perhaps wine darkness and alcohol content -- I don't know if they are correlated, but let's imagine that they are. Here is what a scatter plot of different wines could look like:

+

+

Each dot in this "wine cloud" shows one particular wine. You see that the two properties ($x$ and $y$ on this figure) are correlated. A new property can be constructed by drawing a line through the centre of this wine cloud and projecting all points onto this line. This new property will be given by a linear combination $w_1 x + w_2 y$, where each line corresponds to some particular values of $w_1$ and $w_2$.

+

Now, look here very carefully -- here is what these projections look like for different lines (red dots are projections of the blue dots):

+

+

As I said before, PCA will find the "best" line according to two different criteria of what is the "best". First, the variation of values along this line should be maximal. Pay attention to how the "spread" (we call it "variance") of the red dots changes while the line rotates; can you see when it reaches maximum? Second, if we reconstruct the original two characteristics (position of a blue dot) from the new one (position of a red dot), the reconstruction error will be given by the length of the connecting red line. Observe how the length of these red lines changes while the line rotates; can you see when the total length reaches minimum?

+

If you stare at this animation for some time, you will notice that "the maximum variance" and "the minimum error" are reached at the same time, namely when the line points to the magenta ticks I marked on both sides of the wine cloud. This line corresponds to the new wine property that will be constructed by PCA.

+

By the way, PCA stands for "principal component analysis", and this new property is called "first principal component". And instead of saying "property" or "characteristic", we usually say "feature" or "variable".

+

Daughter: Very nice, papa! I think I can see why the two goals yield the same result: it is essentially because of the Pythagoras theorem, isn't it? Anyway, I heard that PCA is somehow related to eigenvectors and eigenvalues; where are they in this picture?

+

You: Brilliant observation. Mathematically, the spread of the red dots is measured as the average squared distance from the centre of the wine cloud to each red dot; as you know, it is called the variance. On the other hand, the total reconstruction error is measured as the average squared length of the corresponding red lines. But as the angle between red lines and the black line is always $90^\circ$, the sum of these two quantities is equal to the average squared distance between the centre of the wine cloud and each blue dot; this is precisely Pythagoras theorem. Of course, this average distance does not depend on the orientation of the black line, so the higher the variance, the lower the error (because their sum is constant). This hand-wavy argument can be made precise (see here).

+

By the way, you can imagine that the black line is a solid rod, and each red line is a spring. The energy of the spring is proportional to its squared length (this is known in physics as Hooke's law), so the rod will orient itself such as to minimize the sum of these squared distances. I made a simulation of what it will look like in the presence of some viscous friction:

+

+

Regarding eigenvectors and eigenvalues. You know what a covariance matrix is; in my example it is a $2\times 2$ matrix that is given by $$\begin{pmatrix}1.07 &0.63\\0.63 & 0.64\end{pmatrix}.$$ What this means is that the variance of the $x$ variable is $1.07$, the variance of the $y$ variable is $0.64$, and the covariance between them is $0.63$. As it is a square symmetric matrix, it can be diagonalized by choosing a new orthogonal coordinate system, given by its eigenvectors (incidentally, this is called spectral theorem); corresponding eigenvalues will then be located on the diagonal. In this new coordinate system, the covariance matrix is diagonal and looks like that: $$\begin{pmatrix}1.52 &0\\0 & 0.19\end{pmatrix},$$ meaning that the correlation between points is now zero. It becomes clear that the variance of any projection will be given by a weighted average of the eigenvalues (I am only sketching the intuition here). Consequently, the maximum possible variance ($1.52$) will be achieved if we simply take the projection on the first coordinate axis. It follows that the direction of the first principal component is given by the first eigenvector of the covariance matrix. (More details here.)
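
For concreteness, the numbers quoted above can be reproduced with R's eigen() (the sign of an eigenvector is arbitrary, so it may come out flipped):

    C <- matrix(c(1.07, 0.63, 0.63, 0.64), 2, 2)
    e <- eigen(C)
    e$values         # approximately 1.52 and 0.19
    e$vectors[, 1]   # approximately (0.81, 0.58), up to sign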

+

You can see this on the rotating figure as well: there is a gray line there orthogonal to the black one; together, they form a rotating coordinate frame. Try to notice when the blue dots become uncorrelated in this rotating frame. The answer, again, is that it happens precisely when the black line points at the magenta ticks. Now I can tell you how I found them (the magenta ticks): they mark the direction of the first eigenvector of the covariance matrix, which in this case is equal to $(0.81, 0.58)$.

+
+

Per popular request, I shared the Matlab code to produce the above animations.

+",2015-03-06 00:30:06.837 +115327,15723.0,2,,40030.0,,,,CC BY-SA 4.0,"

Stratification seeks to ensure that each fold is representative of all strata of the data. Generally this is done in a supervised way for classification and aims to ensure each class is (approximately) equally represented across each test fold (which are of course combined in a complementary way to form training folds).
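
As a toy sketch of what supervised stratified fold assignment does (stratified_folds below is a hypothetical helper written only for illustration, not a standard function):

    stratified_folds <- function(y, k = 5) {
      folds <- integer(length(y))
      for (cl in unique(y)) {                      # spread each class separately
        idx <- which(y == cl)
        folds[idx] <- sample(rep_len(1:k, length(idx)))
      }
      folds
    }
    y <- rep(0:1, times = c(80, 20))               # imbalanced toy labels
    table(y, stratified_folds(y, 5))               # each class spread roughly evenly across folds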

+

The intuition behind this relates to the bias of most classification algorithms. They tend to weight each instance equally which means overrepresented classes get too much weight (e.g. optimizing F-measure, Accuracy or a complementary form of error). Stratification is not so important for an algorithm that weights each class equally (e.g. optimizing Kappa, Informedness or ROC AUC) or according to a cost matrix (e.g. that is giving a value to each class correctly weighted and/or a cost to each way of misclassifying). See, e.g. +D. M. W. Powers (2014), What the F-measure doesn't measure: Features, Flaws, Fallacies and Fixes. http://arxiv.org/pdf/1503.06410

+

One specific issue that is important across even unbiased or balanced algorithms is that they tend not to be able to learn or test a class that isn't represented at all in a fold, and furthermore even the case where only one of a class is represented in a fold doesn't allow generalization to be performed resp. evaluated. However even this consideration isn't universal and for example doesn't apply so much to one-class learning, which tries to determine what is normal for an individual class, and effectively identifies outliers as being a different class, given that cross-validation is about determining statistics not generating a specific classifier.

+

On the other hand, supervised stratification compromises the technical purity of the evaluation as the labels of the test data shouldn't affect training, but in stratification are used in the selection of the training instances. Unsupervised stratification is also possible based on spreading similar data around looking only at the attributes of the data, not the true class. See, e.g. +https://doi.org/10.1016/S0004-3702(99)00094-6 +N. A. Diamantidis, D. Karlis, E. A. Giakoumakis (1997), +Unsupervised stratification of cross-validation for accuracy estimation.

+

Stratification can also be applied to regression rather than classification, in which case like the unsupervised stratification, similarity rather than identity is used, but the supervised version uses the known true function value.

+

Further complications are rare classes and multilabel classification, where classifications are being done on multiple (independent) dimensions. Here tuples of the true labels across all dimensions can be treated as classes for the purpose of cross-validation. However, not all combinations necessarily occur, and some combinations may be rare. Rare classes and rare combinations are a problem in that a class/combination that occurs at least once but less than K times (in K-CV) cannot be represented in all test folds. In such cases, one could instead consider a form of stratified bootstrapping (sampling with replacement to generate a full-size training fold with repetitions expected and 36.8% expected unselected for testing, with one instance of each class selected initially without replacement for the test fold).

+

Another approach to multilabel stratification is to try to stratify or bootstrap each class dimension separately without seeking to ensure representative selection of combinations. With L labels and N instances and Kkl instances of class k for label l, we can randomly choose (without replacement) from the corresponding set of labeled instances Dkl approximately N/LKkl instances. This does not ensure optimal balance but rather seeks balance heuristically. This can be improved by barring selection of labels at or over quota unless there is no choice (as some combinations do not occur or are rare). Problems tend to mean either that there is too little data or that the dimensions are not independent.

+",2015-07-15 01:23:38.037 +121372,58011.0,2,,57890.0,,,,CC BY-SA 3.0,"

As mentioned in a comment, the MAP estimate is the maximum likelihood estimate when you omit $g(\theta)$ or if it is a constant. If $g(\theta)$ is not a constant, then there are of course various methods for finding the MAP estimate. Omitting the survey sampling aspect (or assuming we have a completely representative sample from a population of infinite size or assuming you have included the sampling mechanism into your likelihood):

+ +
    +
  1. Analytically (often by taking logs and finding the maximum).
  2. +
  3. In some cases conjugate priors are available that have known modes, so that you do not need to do the analytic calculation yourself. E.g. in the example you give we could use a Beta prior. You did not specify how certain you were about your prior, but let's say that in a previous survey you had 20 out of 50 for ""A"" and 30 out of 50 for ""B"" (and that there are no other options to vote for). If you are happy to use a Beta(20,30) prior, then your posterior is a Beta(20+60, 30+40) distribution. The mode is then known to be (80-1)/(150-2)=0.53 (a short numerical check follows this list). This would not be correct for a non-representative sample or one from a non-infinite population, and this option only exists for a few distributions. Additionally, just because a conjugate prior is available and convenient does not mean it is what you want to use (e.g. you may have wanted to express some doubt about the applicability of the previous survey to your new survey by using a mixture of a Beta(0.5,0.5) prior and a Beta(20,30) prior with weights of 0.2 and 0.8 to express this uncertainty). Then you can still do conjugate updating, but getting the updated posterior weights is a tiny bit harder.
  4. +
  5. Using some numeric minimization routine.
  6. +
+ +
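
A short numerical check of the conjugate Beta example above (posterior Beta(80, 70)):

    a <- 20 + 60; b <- 30 + 40     # posterior Beta(80, 70)
    (a - 1) / (a + b - 2)          # closed-form mode, about 0.53
    optimize(function(p) dbeta(p, a, b), c(0, 1), maximum = TRUE)$maximum   # numeric maximization agrees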

In a simplistic situation where surveys really sample exactly how people will really vote (nothing else happens before the election to change the mind of people, there is no issues with voter turnout differing for parties etc.), you could then for a known total size of the number of voters predict the outcome of voting using the beta-binomial distribution (the predictive distribution of the binomial distribution with a beta prior). In reality predicting an election is of course much more difficult.

+",2015-09-13 09:06:45.893 +217429,102539.0,2,,57626.0,,,,CC BY-SA 3.0,"

https://www.researchgate.net/publication/312165764_Panel_Vector_Autoregression_in_R_The_panelvar_Package

+ +

Here you will find the R-package and the link to the paper.

+",2018-01-17 12:42:00.307 +227108,128628.0,2,,58826.0,,,,CC BY-SA 3.0,"

Some recommended standards for statistical notation are presented in Halperin, Hartley and Hoel (1965) and Sanders and Pugh (1972). Most of the current notation comes from conventions that were established by the biometric statisticians in the late 19th and early 20th century (most of it was done by Pearson and Fisher and their associates). A useful list of early uses of notation is maintained by the economist John Aldrich here, and a historical account of the English biometric school is published in Aldrich (2003). (If you have further enquiries about this topic, Aldrich is probably the world's foremost living expert in the history of notation in statistics.)

+ +

Aside from this explicit work, there are a lot of books that give introductions to the field, and these are careful to define notation consistent with common conventions, defining notation as they go. There are many well-known conventions in this field that run consistently through the literature, and statisticians are well-acquainted with these through practice, even without having read the recommendations of these researchers.

+ +

Ambiguity of the distribution-centric notation: The use of the ""distribution-centric"" notation is a standard convention that is used throughout statistical literature. However, one interesting thing to point out about this notation is that there is a bit of wiggle-room as to what it actually means. The standard convention is to read the object on the right-hand-side of these statements as some kind of description of a probability measure (e.g, a distribution function, density function, etc.) and then read the $\sim$ relation with meaning ""...has distribution..."" or ""...has probability measure..."", etc. Under this interpretation the relation compares two distinct sets of things; the object on the left-hand-side is a random variable and the object on the right-hand-side is a description of a probability measure.

+ +

However, it is also equally valid to interpret the right-hand-side as a reference to a random variable (as opposed to a distribution) and read the $\sim$ relation as meaning ""...has the same distribution as..."". Under this interpretation the relation is an equivalence relation comparing random variables; the objects on the left- and right--hand-sides are both random variables and the relation is reflexive, symmetric and transitive.

+ +

This gives two possible (and equally valid) interpretations of a statement like:

+ +

$$X \sim \text{N}(\mu, \sigma^2).$$

+ +
    +
  • Distributional interpretation: ""$X$ has probability distribution $\text{N}(\mu, \sigma^2)$"". This interpretation takes the latter object to be some description of a normal probability measure (e.g., its density function, distribution function, etc.).

  • +
  • Random variable interpretation: ""$X$ has the same probability distribution as $\text{N}(\mu, \sigma^2)$"". This interpretation takes the latter object to be a normal random variable.

  • +
+ +

Each interpretation has advantages and disadvantages. The advantage of the random-variable interpretation is that it uses the standard symbol $\sim$ to refer to an equivalence relation, but its disadvantage is that it requires reference to random variables with similar notation to their distribution functions. The advantage of the distributional interpretation is that it uses similar notation for the distributions as a whole, and their functional forms with a given argument value; the disadvantage is that it uses the $\sim$ symbol in a way that is not an equivalence relation.

+ +
+ +

Aldrich, J. (2003) The Language of the English Biometric School International Statistical Review 71(1), pp. 109-131.

+ +

Halperin, M., Hartley, H.O. and Hoel, P.G. (1965) Recommended Standards for Statistical Symbols and Notation. The American Statistician 19(3), pp. 12-14.

+ +

Sanders, J.R. and Pugh, R.C. (1972) Recommendation for a Standard Set of Statistical Symbols and Notations. Educational Researcher 1(11), pp. 15-16.

+",2018-04-11 01:21:58.497 +236322,116224.0,2,,58305.0,,,,CC BY-SA 4.0,"

Differential entropy should be considered as a measure of relative (privation of) information - not absolute. In particular, note that the differential entropy responds to a change in scale (i.e. you have a logarithm of a unitful quantity, which means that it will depend on the units you measure the axis $x$ in), which is not a concept that makes sense for a discrete information source. The inability to specify an absolute information in this context should be taken as based on the intuitive idea that the amount of information required to specify a specific value in an infinite continuum is itself infinite as you must distinguish one from amongst an infinitude of possibilities to achieve such a specification.

+

To understand it more precisely, consider a uniform distribution where the differential entropy is zero. A simple example is precisely that which is one unit wide:

+

$$P(x) = \begin{cases}1,\ \mbox{if $x \in [0, 1]$} \\ 0,\ \mbox{otherwise} \end{cases}$$

+

If you compute the differential entropy, $h$, of the above distribution, you will find it is zero since $\ln(1) = 0$ and moreover in the appropriate limit $0 \ln(0)$ "equals" 0. This corresponds to the fact that you (or the agent relative to which the Bayesian probability $P(x)$ is specified) know the position (or whatever) of the object to within exactly one unit. If you make the distribution wider, say two units, so you have even less information, then the differential entropy will be $\ln(2)$ or about 0.693. This is the same as the discrete entropy for an infinite discrete set of bins, each standing in for "a unit", or if you like, the bins between marks on a ruler where you only report the measurement as a whole number of its finest ticks, now distributed uniformly among two such bins, and it means we have 0.693 nats less information now about the position of the particle up to a resolution of one unit.
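
These values are easy to verify numerically; a small R check that integrates $-f \ln f$ for uniform densities of width 1, 2 and 1/2:

    h_unif <- function(w) {                      # differential entropy of Uniform(0, w), in nats
      f <- function(x) -dunif(x, 0, w) * log(dunif(x, 0, w))
      integrate(f, 0, w)$value
    }
    h_unif(1)     #  0
    h_unif(2)     #  log(2), about  0.693
    h_unif(0.5)   # -log(2), about -0.693 (negative: more informative than one unit)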

+

Negative differential entropy then just means we go the other way - since we aren't working with discrete bins, we can know it "more precisely" than one bin, i.e. to an accuracy less than one unit, and thus the entropy (privation of information) will have to be less now as we are more informed, thus now less than zero. But if I switch to a finer scale, i.e. a smaller unit, then the entropy will once again exceed zero, as now we don't have enough to know it down to that fine scale.

+

You cannot have an absolute measure because, on a continuum, there is effectively an uncountable infinity of "bins" within any arbitrarily small interval, so even a tiny interval of uncertainty still corresponds to an effectively infinite amount of information. We therefore have to go "high into infinity" to measure the differences in entropy between realistic distributions, and that is why the "bottom" of differential entropy sits at $-\infty$. This is like probability zero in a continuous probability measure: it does not necessarily represent impossibility, but rather negligibility with respect to the infinitude of the sets we are considering.

+

Or to go the other way intuitively, suppose you were treating the continuum as a set of bins - one for each point - as you would for an ordinary discrete random variable. Then with a probability distribution $P(x) = \delta(x - a)$, i.e. a delta function at some central real number $a$, one bin is occupied and the entropy is 0. If you instead have two bins with probability 1/2 each, i.e. $P(x) = \frac{1}{2} \delta(x - a_1) + \frac{1}{2} \delta(x - a_2)$, meaning "we know the particle is at either $a_1$ exactly or $a_2$ exactly, but not which", then by the usual discrete entropy formula you get entropy $0.693$ nat (or $1$ shannon using $\lg$ instead of $\ln$). But if you continue in this way, long before you reach a truly continuous distribution the discrete entropy will "top out" (once a countably infinite number of bins has been summed), saturating at positive infinity. That is what I mean by saying that to go up into continuous distributions you have to "soar up high" - effectively the integral jams in an uncountable infinity and raises your reference point far above the true baseline, so that you can distinguish the different infinite amounts that lie "above the infinite boundary of the discrete entropy". That rise, by symmetry, puts the baseline infinitely far below you, at $-\infty$, and moreover, owing to the Archimedean nature of the real numbers, it prevents you from distinguishing the finite-bin cases any more (all of them, if you check for yourself, have differential entropy $-\infty$).

+",2018-07-02 05:22:06.023 +327191,93032.0,2,,58082.0,,,,CC BY-SA 4.0,"

For any recent visitors: there have been new developments in this area by Hitz, Davis and Samorodnitsky (arXiv:1707.05033). Taking a peaks-over-threshold approach instead of block maxima, the Discrete Generalised Pareto Distribution is derived as the $\operatorname{floor}$ of a GPD, and discrete Maximum Domains of Attraction (DMDA) are introduced by relating them to the classical MDAs. The whole thing is linked to, but different from, Zipf's Law.
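
+

To make the construction concrete, here is a minimal sketch of what taking the $\operatorname{floor}$ of a GPD gives, assuming the standard GPD cdf $F(y) = 1 - (1 + \xi y/\sigma)^{-1/\xi}$ for $y \ge 0$, $\xi \neq 0$ (the paper's own parameterisation may differ): if $Y \sim \operatorname{GPD}(\sigma, \xi)$ and $X = \operatorname{floor}(Y)$, then

+

$$\Pr(X = k) = F(k+1) - F(k) = \left(1 + \frac{\xi k}{\sigma}\right)^{-1/\xi} - \left(1 + \frac{\xi (k+1)}{\sigma}\right)^{-1/\xi}, \qquad k = 0, 1, 2, \ldots,$$

+

so the discrete distribution inherits its tail behaviour directly from the continuous GPD.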

+

In terms of the paper's terminology, the Poisson distribution is in the DMDA of a Gumbel distribution $(\xi = 0)$, as are the Negative Binomial and Geometric distributions.

+",2020-10-21 20:14:59.210 +331473,9081.0,2,,58679.0,,,,CC BY-SA 4.0,"

What is the difference between segmenting and clustering?

+

First, let us define the two terms:

+
    +
  1. Segmentation: the partitioning of some whole, some object, into parts based on similarity and contiguity. See Wikipedia, which gives as an example Segmentation (biology), the division of body plans into a series of repetitive segments, and also Oxford.

    +
  2. +
  3. Clustering: Wikipedia describes it as the task of grouping a set of objects in such a way that objects in the same group (called a cluster) are more similar (in some sense) to each other than to those in other groups (clusters).

    +
  4. +
+

The two are, in some sense, closely associated. If we consider some whole ABC as consisting of many atoms - like a market consisting of customers, or a body consisting of body parts - we can say that we segment ABC but cluster the atoms. It seems, though, that segmentation is used more when there is some notion of (spatial) contiguity of the atoms within the whole.

+

There seems to be some confusion about this usage. On this site, customer segmentation is often used; it should be market segmentation. The customers are not segmented (hopefully!); they are clustered. Wikipedia got it right.

+

Use in connection with time series: with multiple (parallel) time series, we can cluster the series into groups of similar series, while segmentation typically refers to partitioning a single series into similar, contiguous parts. See the tag timeseries-segmentation and this list of posts about time series clustering. This points to a connection with change-point-detection. See Wikipedia.

+

On this site there are many posts on image-segmentation.

+",2020-12-04 16:44:02.767 diff --git a/examples/csv_examples/users.csv b/examples/csv_examples/users.csv new file mode 100644 index 00000000..2f4368e1 --- /dev/null +++ b/examples/csv_examples/users.csv @@ -0,0 +1,3253 @@ +id,account_id,display_name,location,profile_image_url,website_url,about_me,creation_date +22552,198592,Ramchandra Apte,"Bangalore, India",,http://ramchandraapte.blogspot.com,"

#SOreadytohelp

+

random 14-yr old nerd

+",2013-10-10 07:52:12.460 +23452,3546819,Nox,,,,,2013-11-06 22:36:03.583 +22745,1679184,Rohan Sandeep,Bangalore,,,"

Product design professional, India Design Leader

+ +

Design leader with 18 years of Experience in Healthcare, Life Sciences, Manufacturing, Supply Chain Management, Procurement domains, specializing in Experience design for companies including HSBC, SAP Labs, Ariba, Hewlett Packard, General Electric and Oracle. In Cloud and On-Premise delivery models for Mobile, Desktop channels.

+ +

Deep knowledge of standard and emerging methodologies in solutioning including design thinking, business model canvas, value proposition design, service design, lean ux and standard user experience methodologies. Along with Agile and Lean methods of implementation.

+ +

Management experience solving people and process challenges with creative and analytical reasoning. Strong experience in evangelization and implementation of new methodologies organization wide.

+ +

Training in Usability, Project Management and User Research methodologies with proven track record of implementation.

+",2013-10-16 05:32:28.697 +5196,39810,Cheng,,,,,2011-11-06 08:54:44.363 +13037,424128,bdeonovic,"Des Moines, IA, USA",,,"

I am Benjamin Deonovic, a research scientist at the Corteva. My research interests include Bayesian data analysis, MCMC, computational statistics, bioinformatics, and psychometrics. +email: bdeonovic@gmail.com

+",2012-12-10 03:48:40.213 +20613,2455667,Jean V. Adams,,,,,2013-08-07 17:55:20.547 +23166,96339,yfzhu,"Shanghai, China",,,"

Rubyist, Computer Science undergraduate.

+",2013-10-30 06:08:31.967 +22893,1480960,indieman,,,,,2013-10-21 03:41:57.907 +18372,1634313,David Pfau,,,,,2013-05-28 00:11:35.177 +19996,3059495,ScatterSignalNoise,,,http://scattersignalnoise.wordpress.com,,2013-07-17 17:26:59.910 +23069,2386966,user2087739,,,,,2013-10-27 07:22:02.170 +11197,1885060,Alpha,"Vilnius, Lithuania",,,,2012-09-27 20:11:34.810 +14525,1812565,deeps,,,,,2013-02-04 07:23:52.913 +23000,348658,Larry Lo,Hong Kong,,,"

Code with passion. +https://larrywebsite.vercel.app

+",2013-10-24 04:27:54.680 +20820,3176923,Cynderella,,,,,2013-08-15 02:07:31.247 +11643,1393374,Herman Haugland,Oslo,,,"

MSc. in Political Economy Student, at BI Norwegian Business School.

+",2012-10-16 09:16:20.043 +1691,94867,celenius,"Cambridge, MA",,http://celenius.com,"

grad student. +interested in arcgis, python, R and LaTeX.

+",2011-01-04 13:55:46.863 +23539,1085543,Vrashabh Irde,"London, UK",,https://vrash.github.io/Slartibartfast/,"

https://vrash.github.io/Slartibartfast/

+",2013-11-08 21:05:59.840 +15377,1174461,Ron,,,,,2013-03-02 18:37:38.933 +22661,3414366,Talita,"Podkowa Lesna, Poland",,,"

I love to write code. I am excited to work in technologies I know and like as well as to learn new programming languages, design patterns and approaches to problems.

+ +

I care about code correctness as much as about its performance and simplicity. I write unit tests and prefer to do TDD.

+ +

I like to be a part of a team and be fully engaged in the project.

+ +

Python and ES6 are my favourite technologies at the moment by I have experience in many more and a will to learn.

+",2013-10-13 21:09:29.647 +22568,2487733,mikeLdub,,,,"

all things GIS, spatial analysis, spatial stats... and beyond

+",2013-10-10 15:11:44.167 +23535,2452876,manimino,,,,"

Computing for science.

+",2013-11-08 19:37:33.423 +21243,3113880,Louis Cialdella,"New York, NY, USA",,https://lmc2179.github.io/,"

Give me data or give me death

+",2013-08-29 19:32:40.537 +5273,22211,utdiscant,"Copenhagen, Denmark",,http://www.utdiscant.dk,,2011-11-12 12:28:45.653 +23154,3177318,Luis Torres,Mexico,,,"

I'm an actuary. I like maths (particularly statistics), linux, multithreading, data mining... +videogames... what else?

+",2013-10-29 23:12:38.590 +1831,33200,bayerj,Munich,,,"

I do research in machine learning in industry for a living . My focus is on sequence models with a strong focus on deep architectures and variational inference. I also like to ponder on decision making problems.

+",2011-01-21 15:08:24.717 +23436,1644376,Lukas Halim,"Raleigh, NC",,,,2013-11-06 15:41:28.713 +22642,1435980,arshajii,"Cambridge, MA, United States",,http://ars.me," +",2013-10-12 23:24:24.150 +17573,2721939,Bill,United States,,http://www.terry.uga.edu/directory/profile/wbvogt/,"

I am an economist with interest and expertise in antitrust, health economics, industrial organization, and applied econometrics.

+",2013-05-03 18:42:41.887 +22977,2172157,Atcold,"New York, NY",,https://atcold.github.io/,"

Musician, math lover, cook, dancer, ️‍, and an ass prof of Computer Science at New York University

+",2013-10-23 16:48:52.173 +22,504968,Harvey Motulsky,Santa Monica CA,,http://www.graphpad.com,"

I am the founder of GraphPad Software and the author of Intuitive Biostatistics.

+",2010-07-19 19:09:32.720 +23393,3540189,james,,,,,2013-11-05 18:09:29.573 +18513,2076911,goldisfine,,,,,2013-05-31 21:17:53.117 +5001,188416,H2ONaCl,,,,,2011-10-24 00:50:57.973 +14179,920001,Nathan,United States,,,,2013-01-24 00:38:06.040 +5038,998911,user7064,,,,,2011-10-26 08:35:46.157 +23234,425553,Julien Bourdon,"Paris, France",,http://www.ai.soc.i.kyoto-u.ac.jp/~julien,"

I work as a training consultant at OXiane, a consulting and traning company based in Paris Area, France.

+ +

My main interests are Java , Smlltalk and their associated frameworks. I used to work as a freelance web developer, that's why you might see a lot of PHP related questions and answers here.

+",2013-11-01 00:58:59.460 +23477,1440074,Michel Hua,"Paris, France",,https://mycaule.github.io/,"

about:blank

+",2013-11-07 14:39:06.010 +23385,2982228,Supernormal,,,,,2013-11-05 15:17:21.980 +22059,3054178,mpr,,,,,2013-09-25 16:04:08.363 +22750,1023978,Rob Sedgwick,UK,,http://www.thefishy.co.uk,"

I have been a professional developer since the beginning of the 1990s. I am versed in C#, VB.net, SQL Server, VB6, VBA, ASP.net and also dabble with MVC, JQuery, php, MySql and Perl. Once upon a time I used to write regularly in C, C++ and Clipper, but have not done so for some time.

+",2013-10-16 08:25:41.550 +22646,3431317,lisgee,,,,,2013-10-13 08:15:40.013 +23379,3424887,user2870897,,,,,2013-11-05 14:26:19.097 +8063,1364141,yotiao,,,,,2012-04-16 21:43:32.433 +22582,3421099,Jennifer,,,,,2013-10-10 20:15:15.787 +23480,3550739,gradstudent,,,,,2013-11-07 16:23:32.583 +22987,3480616,janice,,,,,2013-10-23 18:53:42.190 +23119,1804292,Dmitriy Vlasov,"Russia, St. Petersburg",,http://www.vlasovde.com,,2013-10-29 05:27:24.483 +3641,476639,Dail,Rome,,,,2011-07-15 10:25:15.677 +23528,3556124,Chinook,,,,,2013-11-08 17:08:52.000 +23199,3281182,Gnani,"Hyderabad, India",,,,2013-10-31 05:53:43.303 +23432,3544490,user32401,,,,,2013-11-06 14:10:40.533 +23984,3473749,lennon310,"New York, United States",,,"

Working on large scale distribute systems

+",2013-11-21 00:40:11.893 +4221,873320,Ricardo Bessa,,,,,2011-09-01 14:56:28.933 +23469,3400281,MGoll,,,,,2013-11-07 11:02:35.513 +16043,2523286,Sycorax,"Washington, DC, United States",,,,2013-03-20 23:59:56.610 +16703,328682,Niek,,,,,2013-04-10 10:44:19.540 +23533,333639,Zikes,"Bentonville, AR",,http://zikes.me,"

Web enthusiast

+",2013-11-08 19:32:15.367 +23287,3527400,galstar,,,,,2013-11-02 15:54:55.837 +23149,3508855,chae,,,,,2013-10-29 19:50:50.240 +2420,31379,dole doug,,,,,2011-03-23 08:33:42.433 +20304,2132983,user1893354,,,,,2013-07-27 01:56:04.663 +23305,3530155,user5866,,,,,2013-11-03 14:39:20.850 +16046,94348,Cupitor,,,,,2013-03-21 01:56:47.610 +13666,63710,Spaceghost,"Plano, TX",,http://ducktyping.blogspot.com,"

Living in Dallas, coding in Python, C++, Javascript, Java or whatever fits.. +Interested in Machine Learning, Natural Language Processing and the total elimination of spam via FUSSP.

+",2013-01-06 17:06:05.483 +23419,345809,Maroon,,,,,2013-11-06 07:37:05.067 +23468,3549174,user32445,,,,,2013-11-07 10:54:39.487 +658,140666,Xodarap,,,,,2010-08-12 20:48:53.613 +8869,450468,Paul,,,,"

Data scientist at MassMutual. We are often hiring and we are always interested in hearing from you.

+

Aspiring data scientists: we have the best internship in the industry!

+",2012-05-30 19:43:03.037 +22159,2911067,mining,,,,,2013-09-29 07:40:21.517 +20752,3167899,Mike Nute,"Urbana, IL & Cincinnati, OH",,,"

I'm a PhD student in Statistics at the University of Illinois at Urbana-Champaign.

+",2013-08-13 07:09:40.240 +22612,3426128,difrncdregression,,,,,2013-10-11 16:46:12.820 +3922,468726,StasK,"Columbia, MO",,http://stas.kolenikov.name,"

Principal Survey Scientist, Abt Associates (http://www.abtassociates.com)

+

Primary areas of expertise: survey statistics, structural equation and latent variable modeling, Stata programming, microeconometrics, resampling methods

+

Given that I spent those freaking three minutes to fill in this profile information, I think that the users with generic user0123456 lack respect for the community. My response to their questions is limited, and it is intentional.

+

The views expressed are my own, independent of my employer.

+",2011-08-08 17:31:44.413 +23465,1419612,ajaykarpur,,,https://ajaykarpur.com,,2013-11-07 06:35:48.680 +22544,3384364,Hype,,,,,2013-10-10 03:48:14.553 +18767,1382613,user1315305,,,,,2013-06-08 22:43:38.413 +18356,2822696,Jane,,,,,2013-05-27 18:56:55.533 +10494,182102,Nan,"Houghton, MI",,,"

Using R to count trees

+",2012-08-24 20:41:54.553 +23036,229137,Suresh,,,,,2013-10-25 13:18:40.717 +23541,874629,Logan,"Boulder, CO, USA",,,,2013-11-08 21:41:27.807 +22743,3443579,Alfred,,,,,2013-10-16 03:44:01.090 +22970,3478704,Emilia,,,,,2013-10-23 12:50:26.157 +23080,3362649,moof,,,,,2013-10-27 23:06:39.180 +23456,238557,Refefer,,,,,2013-11-06 23:55:36.903 +23231,3520956,Steve Loeffler,,,,,2013-10-31 22:32:16.063 +23415,2214894,vishwaovi,"Dharamshala, Himachal Pradesh, India",,,,2013-11-06 04:08:37.257 +22906,3468715,Adidul,,,,,2013-10-21 14:57:14.690 +23515,407465,Subedi Kishor,"Panchkhal, Nepal",,,"

I am a software enthusiast and enjoy working with Web Services and Embedded Systems.

+",2013-11-08 07:58:54.717 +10492,941039,user969113,,,,,2012-08-24 18:24:11.783 +23288,3397561,DaAwesomeP,,,https://perrynaseck.com/,"

Please see my website.

+",2013-11-02 16:56:15.620 +23282,2777102,Kenneth Feng,"New York, NY",,,,2013-11-02 13:39:31.037 +23010,46169,Proposition Joe,,,,,2013-10-24 10:34:41.993 +23343,3535072,MissMeijer,,,,,2013-11-04 18:03:06.073 +19120,1304524,learnerer,,,,"

I enjoy programming. Most of my projects are related to machine learning and big data.

+",2013-06-19 20:58:48.577 +14684,2341754,Michel,,,,,2013-02-08 01:12:44.613 +22577,3420971,user31348,,,,,2013-10-10 19:43:34.243 +23217,1162379,AbdullahR,,,http://@AbdullahAl_R,"

MIS student at KFUPM, Saudi Arabia

+",2013-10-31 14:38:58.080 +23070,934350,WeaselFox,,,,"

Im a software engineer currently working at IBM.

+",2013-10-27 07:58:45.940 +23102,3028328,jsh,,,,,2013-10-28 14:46:52.090 +17678,2736960,Bruno,,,http://bacalfa.com,,2013-05-07 19:48:35.567 +2352,115563,Michael Bishop,"Philadelphia, PA",,,"

I'm a computational social scientist utterly inspired by the existence of StackOverflow.

+",2011-03-16 21:48:01.583 +13889,929856,Epimetheus,,,https://en.wikipedia.org/wiki/Epimetheus,"

God of hindsight

+",2013-01-14 18:03:58.583 +19436,201416,Thomas,"Magdeburg, Germany",,http://www.thomas-kahle.de,"

I'm a mathematician working in Magdeburg, Germany.

+",2013-06-30 07:55:22.537 +10772,1823299,Christos,,,,,2012-09-08 14:30:50.407 +23158,1735500,antom,,,,,2013-10-30 01:58:19.363 +23241,1309805,Lev Levitsky,"Moscow, Russia",,https://github.com/levitsky,"

A graduate of Moscow Institute of Physics and Technology (B.S. and M.S. in applied physics and math). +Now I focus on utilizing Python for the needs of proteomics. Most of our results are available as Python packages pyteomics and pyteomics.biolccc.

+",2013-11-01 07:53:08.710 +5875,1114905,Elvis,,,,"

As one of my idols once did, I use the name Elvis Jagger Abdul-Jabbar when I am too shy to tell my real name.

+ +

I am a former algebraist who made a strategic move to applied research (successful move, now I have a position). I’m a full autodidact in statistics, still learning and struggling to enhance my level.

+ +

My portrait as Homer has been drawn in R by one of my students!

+",2011-12-18 13:06:46.540 +803,193831,Joe,,,,,2010-08-30 13:36:46.657 +22773,3408437,Kevin,Maryland,,,,2013-10-16 20:26:31.537 +23354,3010092,user2553813,,,,,2013-11-04 23:17:51.980 +21947,2483086,Mike McCoy,"Pasadena, CA",,http://users.cms.caltech.edu/~mccoy,"

I'm a postdoc working in Computational & Mathematical Sciences at the California Institute of Technology.

+",2013-09-22 22:14:02.447 +23038,2922402,MindaugasK,,,,,2013-10-25 14:09:01.223 +23272,3526266,Taimoor,,,,,2013-11-02 07:12:50.913 +16746,2515093,Anton Ashanin,,,,,2013-04-11 08:34:52.350 +7189,215533,elexhobby,,,,,2012-03-05 08:40:28.773 +22804,1510225,ANN,,,,,2013-10-17 19:44:40.933 +21840,3321341,user30438,,,,,2013-09-18 16:52:31.390 +22589,2628009,user2274644,,,,,2013-10-10 23:19:57.333 +22715,1908941,chris,,,,,2013-10-15 14:58:15.270 +5237,1028075,gung - Reinstate Monica,Kingdom of Zhao,,http://en.wikipedia.org/wiki/Gongsun_Long,"

Stack Exchange's actions have been reprehensible. Monica Cellio, a popular and well-respected volunteer network moderator, was terminated without warning and portrayed to the press as misgendering trans people by Stack Overflow, Inc. All signs suggest this was an erroneous, if not malicious, decision not founded on fact, for which the company is completely failing to take any responsibility. This has caused an enormous and ongoing uproar in the community.

+ +

Monica and the community quite literally begged for some kind of response from the company either opening up a path to reinstatement, or justifying their decision to terminate her, for more than a month now. She made many good-faith offers to talk, and others offered to mediate (and made dozens of constructive suggestions on how to resolve the situation). Members of the Lavender community have spoken up on her behalf (1, 2).

+ +

Stack Overflow, Inc. met all of this with thunderous silence. As a long-time contributor and someone always willing to see the good in the company's and its employees' actions and assume good faith, I'm still struggling to come to terms with the corporate callousness of this behavior which we have to assume now is the new normal.

+ +

Having exhausted all other means of communication, Monica began raising funds for potential legal action to clear her name. This process ended with some form of legal agreement that entailed a non-disclosure agreement. The NDA means that nothing else about the agreement is known. What can be seen from the outside is that SE offered another lawyer-written non-apology, and issued a statement out of both sides of its mouth to The Register reiterating, falsely, that she did something wrong, but that she isn't really a bad person. We can also see that she was not reinstated. This episode in the drama is over. I trust the funds left over will go donated to The Trevor Project, an organization that provides crisis-intervention services to LGBTQ+ youth.

+ +

Monica is an exemplar of the community, it's incredible how she's managed to keep a level head during this time when emotions have been very high. She should be reinstated now.

+ +
+ +

In the golden age of ancient Chinese philosophy, Gung-sun Lung Tsu was the one of the only thinkers, and certainly the most important, to discuss topics pertaining to logic and epistemology.

+",2011-11-09 04:43:15.613 +22986,951190,ipman,,,,,2013-10-23 18:46:00.670 +23050,3492176,swhusky,"Seattle, WA",,,,2013-10-25 21:05:54.560 +22543,2134267,user1894426,,,,,2013-10-10 03:33:27.480 +23511,1544691,Plomoxed,"Toronto, Canada",,,"

Fullstack Web Developer with Ruby on Rails and React.js Experience

+

Ruby/Ruby on Rails - expert

+

React.js - intermediate

+

Elixir/Phoenix - learning

+",2013-11-08 04:42:18.303 +22790,68466,Michał,"Warsaw, Poland",,https://michalbojanowski.com,,2013-10-17 10:11:13.017 +23302,3529820,Kangmin,,,,,2013-11-03 12:14:55.333 +22873,2206286,jinawee,Spain,,,"

Undergraduate student in Physics. Mostly interested in Theoretical Physics (QFT, General Relativy, String Theory, etc). Learning C++, Python and Scheme.

+",2013-10-20 13:42:37.603 +28059,159648,eusebe,,,,,2014-02-26 16:49:24.500 +23487,3551305,Tropa,,,,,2013-11-07 18:38:35.007 +6630,188507,Memming,,,http://memming.wordpress.com,"

Assistant professor at Stony Brook University.

+",2012-02-04 04:50:46.390 +23187,2135370,skauf,,,,,2013-10-30 20:13:41.447 +1602,145274,rhombidodecahedron,,,,,2010-12-22 17:43:38.063 +22328,3387673,Vijay Chakilam,,,,,2013-10-03 15:38:19.177 +21485,3272063,Stat question,,,,,2013-09-06 17:18:24.393 +7741,551019,alfa,,,,"

I am interested in

+
    +
  • artificial intelligence
  • +
  • machine learning
  • +
  • neural networks
  • +
  • reinforcement learning
  • +
  • computer vision
  • +
+",2012-03-31 18:06:55.903 +22845,132327,starflyer,,,,,2013-10-18 20:58:20.513 +20416,3117807,user28673,Germany,,,,2013-07-31 13:12:04.367 +2015,63721,David D,,,,"

David D wants to remain unknown

+",2011-02-13 20:57:01.410 +23292,2337387,Alex Lamb,,,,,2013-11-02 19:04:43.113 +22034,136853,dfife,New Jersey,,http://www.quantpsych.net,,2013-09-24 20:02:13.990 +16464,2545519,tjnel,,,,,2013-04-03 04:02:40.793 +23448,3378030,kevinykuo,,,,,2013-11-06 21:10:58.753 +5480,197213,drhanlau,"Melbourne VIC, Australia",,http://www.drhanlau.com,"

Data Scientist with a PhD degree from Queensland University of Technology (QUT), Australia. My research interest is on social media analytics and text mining. My personal interest on programming span across C#, JavaScript, Python, and HTML5, CSS3 web programming. I am also a Microsoft Certified Application Developer for .NET and Microsoft Office Expert World Champion on Microsoft Excel and National Champion for Microsoft Word.

+",2011-11-24 21:44:56.330 +23293,41791,grettke,,,http://www.wisdomandwonder.com/,,2013-11-02 19:25:53.313 +22958,3477321,Nacho Alborés,Spain,,,"

Marine researcher

+",2013-10-23 07:58:09.830 +23387,3472094,user32348,,,,,2013-11-05 16:10:30.780 +17447,2707035,SGd,,,,,2013-04-30 15:05:11.007 +23275,1891013,curious,,,,,2013-11-02 09:06:49.977 +23283,3527111,Gary T.,,,,,2013-11-02 14:01:34.637 +16441,2138743,digdeep,,,,,2013-04-02 08:00:55.800 +22809,1370532,sam,,,,,2013-10-17 22:06:13.990 +22812,3453564,Ishihara,,,,,2013-10-17 23:49:06.267 +23056,2195428,user1941239,,,,,2013-10-26 08:16:55.253 +22956,464964,Gallop,United States,,,,2013-10-23 05:51:00.487 +22727,1624396,waywardEevee,,,,,2013-10-15 19:16:44.243 +23171,3420090,pual ambagher,,,,,2013-10-30 10:33:53.877 +22993,1808072,SealCuadrado,,,,,2013-10-23 21:34:51.507 +4910,982452,Rasmus Bååth,,,http://www.sumsar.net,"

PHD student. Not in statistics. Trying to run a statistics blog anyway... Check it out at http://www.sumsar.net .

+",2011-10-18 19:09:06.503 +22685,1068883,user1068636,,,,,2013-10-14 20:27:18.850 +18865,258222,lightalchemist,Singapore,,,,2013-06-12 04:24:08.977 +14748,1719473,rottentomato56,United States,,,,2013-02-10 23:11:41.393 +22840,2540323,pagibson,,,http://pagibson.com,,2013-10-18 18:56:45.877 +22584,1702191,Markus M.,Abu Dhabi - United Arab Emirates,,,"

Remote sensing/Data Science/Python.

+",2013-10-10 20:51:33.020 +12744,2092474,ashman,,,,,2012-11-29 03:24:08.967 +22760,288507,Jack H,"England, UK",,,,2013-10-16 15:16:14.487 +23087,2597903,daknowles,,,http://cs.stanford.edu/~davidknowles/,,2013-10-28 04:12:14.143 +12358,1620508,Dave,,,,"

Studied physics,

+ +

Works at statistical data analysis,

+ +

Plays at making music.

+",2012-11-14 22:00:59.407 +23284,231907,fatih,,,,,2013-11-02 14:17:38.203 +23364,434180,Matt Faus,,,,,2013-11-05 05:41:02.197 +22488,3409948,mugen,Earth,,,"

Inside every non Bayesian there is a Bayesian struggling to get out (Dennis V. Lindley).

+",2013-10-08 19:57:09.267 +3733,349130,Tomas,,,,,2011-07-22 16:01:45.850 +17538,396401,Pepe Mandioca,,,,,2013-05-02 19:44:42.997 +23049,3491986,zara,,,,,2013-10-25 20:02:27.703 +23011,3484051,Dea Anne,,,,,2013-10-24 10:55:55.683 +22833,3456481,KingKong,,,,,2013-10-18 13:15:31.527 +20700,2961842,TeTs,,,,,2013-08-11 08:01:24.253 +2857,509937,Affine,,,,,2011-05-06 00:56:58.253 +22782,3412763,brillydev,"San Francisco, CA",,http://about.me/brillydev,"

Full-time geek, part-time nerd.

+",2013-10-17 03:35:23.293 +15766,2490267,TLJ,California,,,"

Data Scientist

+",2013-03-14 01:20:19.660 +23453,3546920,user32425,,,,,2013-11-06 23:12:17.867 +23248,1218085,0_0,,,,"

Learning

+",2013-11-01 13:48:01.590 +23007,3483575,user31878,,,,,2013-10-24 09:19:45.900 +23516,265916,vstrale,,,,,2013-11-08 08:50:41.390 +13740,1594514,grssnbchr,,,http://timogrossenbacher.ch,,2013-01-09 08:50:35.900 +23313,3124703,Xuan Liu,,,,,2013-11-03 20:18:03.820 +12273,2035193,Matija Piskorec,,,,,2012-11-12 09:54:02.463 +23301,391321,alexk,"Athens, Greece",,,"

Perl programmer, now playing with AngularJS

+",2013-11-03 12:14:54.473 +22871,3463577,tellis,,,,,2013-10-20 10:40:16.803 +22709,3440247,user31514,,,,,2013-10-15 11:46:03.933 +22576,3413685,respectPotentialEnergy,"Knoxville, TN, USA",,,,2013-10-10 19:12:14.343 +21846,1039865,Peter Verbeet,Netherlands,,,"

Researcher, mainly in the social sciences. +User of several different (statistical) softwares, well-versed in several, real expert in none ;-) +General interest in statistics and statistical programming.

+",2013-09-18 19:22:26.847 +23109,2920649,morudy,"Portland, OR",,,"

A beginner trying to learn how to crawl...

+",2013-10-28 16:43:09.680 +23478,3550282,Bianca van Keulen,,,,,2013-11-07 14:57:29.900 +22369,2773505,n8sty,,,,"

data scientist, breaker/fixer, technologist, eater, drinker, dancer, urban explorer.

+",2013-10-04 18:37:04.180 +22988,1929564,Calder,United States,,,,2013-10-23 19:05:02.933 +20740,2910176,Dirk Calloway,Tlön,,,,2013-08-12 18:31:48.117 +23509,2321957,Ammar Husain,"Pittsburgh, PA",,http://ammar.whiteant.org,"

Roboticist at Carnegie Mellon

+",2013-11-08 04:02:43.570 +5249,184675,drevicko,,,,,2011-11-10 01:51:24.040 +23394,3540257,Emily,,,,,2013-11-05 18:28:38.613 +95860,3444122,user3263600,,,,,2016-11-15 20:38:12.993 +23399,933933,stanford202,,,,,2013-11-05 20:39:50.417 +22969,2762025,lamarvannoy,,,,,2013-10-23 12:31:42.990 +19331,2966851,Sam Dickson,,,,"

I'm a statistician/bioinformatician with experience working with clinical and pre-clinical data and have done some work with statistical genetics as well.

+",2013-06-26 21:07:25.337 +15563,64670,Timothée HENRY,,,,,2013-03-08 12:41:57.657 +14728,1451406,LWZ,California,,,"

I'm learning Python.

+",2013-02-10 02:48:21.420 +23206,321962,aberaud,,,,,2013-10-31 09:55:17.167 +22677,3326559,Firhat Nawfan H.,Indonesia,,,,2013-10-14 13:15:20.993 +23164,58842,CWC,"Seattle, WA",,http://cascadecrest100.com,,2013-10-30 04:45:22.117 +23117,3397556,user2850757,,,,,2013-10-29 03:50:37.217 +7949,1397036,Erik,"Berlin, Germany",,http://www.epeter-stats.com/,"

I studied mathematics at the University of Potsdam and am now working as a statistician in a biotechnological enviroment. You can also visit my personal webpage.

+",2012-04-11 12:52:52.120 +23337,3184468,user2690457,,,,,2013-11-04 14:53:10.303 +22962,2850010,bbudescu,,,,,2013-10-23 08:57:32.710 +22952,24957,ruiyiz,"Glen Allen, Virginia United States",,,"

I build software ...

+",2013-10-23 02:00:26.387 +22900,3467774,mmmm,,,,,2013-10-21 11:27:29.240 +23020,2011239,Konie,,,,,2013-10-24 19:55:51.757 +22578,557104,Sim,,,,"

caffeine driven development!

+",2013-10-10 19:45:46.673 +23427,3543561,user32395,,,,,2013-11-06 10:51:50.160 +23228,1557669,user2019458,,,,,2013-10-31 20:41:57.533 +23325,322828,Roger Stuckey,"Newcastle, New South Wales, Australia",,http://rogerstuckey.com/,,2013-11-04 05:29:08.127 +23307,221711,Sébastien,"Antibes, France",,,"

Android and iOS developer.

+",2013-11-03 16:54:49.663 +22355,3391809,Gerko Vink,"Utrecht, Netherlands",,http://www.gerkovink.com,,2013-10-04 12:05:30.553 +186,7405,SilentGhost,,,,"
+

We are all here on earth to help others; what on earth the others are here for, I've no idea
+           -- W.H. Auden

+ +

Give man a fish and you got him fed for a night; teach him to fish and it's not your fault he died of mercury poisoning.
+           --unknown

+
+",2010-07-20 10:17:21.753 +23074,1647158,Yusuke,Japan,,,,2013-10-27 14:24:46.167 +10278,119993,pontikos,United Kingdom,,https://www-gene.cimr.cam.ac.uk/staff/nikolas/,"

A computer scientist who has moved to computational biology and genetic research. I enjoy bringing algorithms, machine learning and statistics to my research and discovering links between different branches of science.

+",2012-08-15 06:37:12.217 +23409,1917933,piggybox,"Bay Area, CA, United States",,,"

A hungry fool

+",2013-11-06 00:37:31.363 +22654,168430,black_puppydog,,,,,2013-10-13 15:52:57.587 +22602,2353933,CML,,,,"

engineering student and hobby programmer

+",2013-10-11 09:06:06.710 +2198,509747,topepo,,,,,2011-02-28 14:09:29.087 +23280,3526695,Jos,,,,,2013-11-02 11:03:54.097 +22754,3445619,Emiliano,,,,,2013-10-16 13:07:09.340 +12501,1933309,A. Donda,University of Kulahari,,,"

– I'm a physicist working in cognitive neuroscience on statistical data analysis methods.
+– I program mainly in Matlab and Python, but also occasionally in C, bash/sed/awk, Perl, Java, and JavaScript.
+– I use Debian GNU/Linux for work & leisure, Windows for games, Android when mobile.
+– I write Pandoc-flavored Markdown in Atom which is rendered by LaTeX, though sometimes MS Word can't be avoided.
+– I dabble in typography and information design.

+",2012-11-20 12:52:54.550 +750,298433,Andy W,"Raleigh, NC, USA",,https://crimede-coder.com/,"

PhD in Criminal Justice and former academic. Consulting firm at https://crimede-coder.com/.

+",2010-08-22 00:22:31.277 +5208,135746,siamii,,,,,2011-11-07 03:48:16.533 +182,509468,Graham Cookson,"London, United Kingdom",,http://grahamcookson.com,"

I’m currently a Lecturer in Economics in the Department of Management at King’s College London, which I joined in September 2007. I have an PhD in Econometrics from Imperial College London. I’m a Bayesian and my research is focused broadly in econometrics applied to public policy but with a particular interest in spatial econometrics. I use R and Python for my research and SPSS, Stata and WinBUGS for teaching methods.

+",2010-07-20 09:21:13.470 +14470,2316566,user20370,"Verona, Italy",,,"

Data scientist, author of books in the Dummies series on machine learning & AI. Kaggle master, highest rank achieved on Kaggle: 7th (worldwide). Google Developer Expert in Machine Learning.

+",2013-02-01 18:09:47.307 +23300,505126,Pierre-Yves Gaillard,"Montreal, QC, Canada",,https://sites.google.com/site/pierreyvesgaillard,"

Email: pierre.yves.gaillard at gmail.com

+

If you have any idea about this MathOverflow question, thanks for letting me know.

+",2013-11-03 08:54:37.207 +22571,3419978,anthonydn,"Moab, UT",,,"

Postdoc at USGS Canyonlands Research Station, currently studying climate change effects on desert biogeochemistry.

+",2013-10-10 16:06:44.113 +23193,3516520,Cha-am Jamal,,,,,2013-10-31 02:23:38.287 +22558,2385964,Fabio,,,http://www2.unine.ch/philippe.renard/page-28557.html,,2013-10-10 09:58:27.403 +23197,1238352,Bosma,Canada,,,,2013-10-31 04:23:41.357 +22694,3438007,user31493,,,,,2013-10-14 23:00:33.067 +21322,3247536,KhengMei,Australia,,,"

A forensic psychology scientist attempting DIY stats.

+",2013-09-02 04:48:52.793 +23237,3521479,Shusheng,,,,,2013-11-01 02:19:13.343 +22595,3422062,Naveen,,,,,2013-10-11 01:23:37.220 +10547,1520233,Druss2k,Germany,,http://www.linkedin.com/in/jan-felix-meyer,"

I am a professional (PhD level) statistician with eight years of work experience doing data analytics, forecasting, modelling, optimization, revenue management, pricing and more using R, Python, SPSS, and Stata. Currently, I work in the aviation industry, leading data science projects focusing on innovation to automatize and to maximize revenue returns (Proof to my credibility is found at my linked-in page: /in/jan-felix-meyer).

+",2012-08-27 18:44:13.957 +1959,271749,Raffael,"Hamburg, Germany",,,,2011-02-06 20:00:42.043 +22984,42361,kaliatech,,,http://www.jgstechnical.com,"

A major interest in building things. And learning. And applying what I've learned to what I'm building. And finding time to do more of all three.

+",2013-10-23 18:36:15.153 +22735,2463235,Ben,"New Orleans, LA",,https://gormanalysis.com/,"

Founder of Practice Probs - a platform for learning programming via fun challenge problems.

+",2013-10-15 22:38:23.577 +22810,2761931,coffeinjunky,"London, UK",,,"

I like coffee.

+
+

For reference purposes:

+

https://stackoverflow.com/help/mcve

+

https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example

+

https://stackoverflow.com/questions/20109391/how-to-make-good-reproducible-pandas-examples

+",2013-10-17 22:07:42.617 +22511,3413120,SooBin,,,,,2013-10-09 11:56:48.657 +19870,3041564,Aureon,,,,,2013-07-13 03:37:18.710 +668,449752,whuber,,,,"

Consultant, expert witness, and teacher.

+
+

The mathematics are not there for the joy of the analyst but because they are essential to the solution.

+
+

Karl Pearson, Notes on the History of Correlation (1920).

+
+
+

Whenever there is a simple error that most laymen fall for, there is always a slightly more sophisticated version of the same problem that experts fall for.

+
+

--Amos Tversky

+
+
+

It is largely because of lack of knowledge of what statistics is that the person untrained in it trusts himself with a tool quite as dangerous as any he may pick out from the whole armamentarium of scientific methodology.

+
+

--Edwin B. Wilson (1927), quoted in Stephen M. Stigler, The Seven Pillars of Statistical Wisdom.

+
+

[Concerning scientific knowledge] Admission to its sanctuary, and to the privileges and feelings of a votary, is only to be gained by one means—sound and sufficient knowledge of mathematics, the great instrument of all exact inquiry, without which no man can ever make such advances in this or any other of the higher departments of science as can entitle him to form an independent opinion on any subject of discussion within their range.

+
+

-- Sir John Herschel, quoted by Augustus de Morgan (1872) in A Budget of Paradoxes. Emphasis in the original.

+",2010-08-13 15:29:47.140 +22759,3446157,John,,,,,2013-10-16 15:03:08.217 +5984,1081868,N Brouwer,,,https://www.researchgate.net/profile/Nathan_Brouwer,"

Quantitative ecologist at the National Aviary, Pittsburgh, PA.

+",2011-12-26 17:07:29.903 +23380,3539101,Ilya,,,,,2013-11-05 14:29:26.380 +22761,3446275,ORStudent,"San Francisco, CA, United States",,,"

Branch and Bound.

+",2013-10-16 15:24:46.733 +22749,118559,geraint,,,,,2013-10-16 08:13:44.353 +14298,1903579,Heisenberg,,,http://people.duke.edu/~aql3/,"

PhD student in social science

+",2013-01-27 06:29:17.293 +1804,32140,cakeforcerberus,,,,"

x

+",2011-01-18 00:37:20.490 +6204,433926,David Marx,"Seattle, WA, United States",,http://dmarx.github.io/posts/,"

Data Mining, Machine Learning, Probability, Numerical Statistics, Bayesian Statistics, Regression Analysis, Simulation, Graph Analytics, Data Visualization

+

Python, R

+",2012-01-10 14:57:25.657 +5917,1120220,Shea Parkes,,,,,2011-12-20 21:29:03.420 +23519,3554548,user32507,,,,,2013-11-08 10:56:21.150 +22013,3166215,lroca,,,,,2013-09-24 10:13:33.730 +22777,3447859,Henry Indvik,,,,,2013-10-16 21:40:34.343 +11359,544535,crsh,,,http://frederikaust.com,,2012-10-04 14:11:32.393 +58011,6835077,Björn,Europe,,,"

Biostatistician working in pharmaceutical drug development with an interest in Bayesian statistics, missing data imputation, estimands, meta-analysis, observational studies, hierarchical models, multiple testing procedures, dose finding methods, count- and time-to-event data analysis, prediction modeling and Machine Learning

+",2015-08-23 14:39:03.497 +22692,1045435,Arjun,,,,,2013-10-14 22:00:48.983 +22652,2328748,MrRaymondLee,"Toronto, Canada",,,,2013-10-13 13:23:13.207 +23398,1394549,David K,"Washington, DC",,,"

I am an electronics engineer with a focus on signal processing and algorithm development. I primarily work in the development of new sensors and systems, focusing on the algorithms and overall system concept rather than the software, so I consider myself a beginner when it comes to programming and networking. Right now I primarily work in Matlab, but I have also worked in Java and C++ in the past.

+",2013-11-05 19:25:05.617 +22930,3314389,pooja,,,,,2013-10-22 09:58:26.890 +23161,1051560,yanzheng,Hangzhou,,http://yanzheng.me,,2013-10-30 03:14:31.060 +18447,2788815,Amin,,,,"

I'm a PhD student in Educational Measurement and use Mathematica and R for programming. My main area of interest is latent variable modeling.

+",2013-05-29 19:19:38.213 +23366,3537397,hahahehe,,,,"

Just an average technician.

+",2013-11-05 07:19:05.257 +22865,2234761,ankit,,,,,2013-10-19 21:50:38.173 +20831,3179323,Vincent,"London, United Kingdom",,,"

Postgraduate statistician from Oxford and working on Statistical Quality in financial industry.

+",2013-08-15 15:27:54.387 +22606,348975,Christian,,,,,2013-10-11 14:07:24.010 +23097,3501314,user31992,,,,,2013-10-28 12:18:57.117 +23402,3541205,user2958275,,,,,2013-11-05 22:23:32.920 +23285,3527168,Navid Ghanizadeh,,,,,2013-11-02 14:23:26.503 +23400,3541121,Melissa N.,,,,,2013-11-05 21:59:38.740 +23111,3503690,user32015,,,,,2013-10-28 21:14:33.927 +23247,1871804,Sam R.,In the puɐlǝʇsɐM,,,"
+

That which can be asserted without evidence, can be dismissed without evidence.

+

- Christopher Hitchens (1949-2011)

+
+

The avatar is called The Town of Thoughts by the magnificent Marija Tiurina.

+",2013-11-01 13:19:04.267 +23224,297346,Vitomir Kovanovic,"Edinburgh, United Kingdom",,http://vitomir.kovanovic.info,"

PhD Student at the Univeristy of Edinburgh, UK. +My research interests are Learning Analytics and Educational Data Mining

+",2013-10-31 18:29:57.263 +22623,2348943,El Dude,"San Diego, CA, USA",,,,2013-10-11 20:40:03.740 +9175,1569641,Budhapest,,,,,2012-06-15 01:54:20.767 +23373,3534794,chatzipr,,,,,2013-11-05 11:57:45.480 +8374,529574,user1234440,,,,,2012-05-03 00:46:21.590 +23215,112206,Yeti,"London, UK",,http://bernat.tech,"

tox and virtualenv maintainer

+",2013-10-31 13:58:11.160 +10735,1668547,alm,,,,,2012-09-06 17:06:05.777 +18198,1862068,Baz,,,,,2013-05-22 13:51:37.733 +7700,1364849,dav,"Beavercreek, Ohio",,,"

I'm a fire chief looking to use some of the huge amounts of data we collect/report to help make better decisions for service delivery.

+",2012-03-29 18:36:33.897 +20426,3119295,user28687,,,,,2013-07-31 19:04:37.733 +22609,3425540,user31389,,,,,2013-10-11 14:32:11.217 +22842,3458346,user31671,,,,,2013-10-18 20:20:37.223 +22587,3421534,Delyle,,,,,2013-10-10 22:06:59.227 +23061,3494265,nikos,,,,,2013-10-26 13:22:03.853 +22951,3476280,AndyHopkins,,,,,2013-10-23 01:11:45.343 +22586,77268,odedbd,,,,,2013-10-10 21:43:45.363 +23525,3555378,AK47,Lithuania,,,,2013-11-08 14:44:54.070 +22686,3437631,CJ Stoneking,,,,,2013-10-14 20:48:26.107 +22976,1524986,Pedro.Alonso,,,,,2013-10-23 16:30:18.123 +3728,447920,Darren Cook,,,,"

I'm data scientist, software developer, computer book author, entrepreneur. +I'm director at QQ Trends, a company that solves difficult data and software challenges for our clients. Lots of machine learning, especially NLP-related, recently. (We sometimes have freelance projects, so get in touch if interested.) +(Contact me at dc at qqtrend dot com: please mention you are coming from StackOverflow, so I know it is not spam.)

+ +

My first book was ""Data Push Apps with HTML5 SSE"", with O'Reilly, 2014 (ISBN: 978-1449371937). Old by computer standards, but the standard has been stable, so surprisingly still useful. My second book, at the end of 2016, also with O'Reilly, was Practical Machine Learning with H2O (ISBN: 978-1491964606).

+ +

I'm British, speak English and Japanese (fairly fluent, 1 kyu), with a bit of German, Chinese and Arabic. As for computer languages, I've done commercial work in most of them; but it has been mostly JavaScript, R, Python, C++ the past five years.

+ +

All my Stack Overflow and all my Stack Exchange contributions (across all sites) are dedicated to the public domain or available under the CC0 license at your choice. I don't like viral licenses.

+ +

Easy ways to irritate me on StackExchange sites (whether my own question or someone else's): 1. Downvote without a comment (N/A if someone already left a comment and you just agree with it, of course); 2. Answers in comments. Other than that I'm an easy-going and pragmatic guy :-)

+",2011-07-22 03:35:31.883 +2121,60564,Jonathan,California,,,,2011-02-20 16:54:08.953 +22905,3468450,user31752,,,,,2013-10-21 14:03:14.387 +23106,3371368,Konstantinos Giannakopoulos,Hong Kong,,,"

Data Scientist with Ph.D. In Computer Science and Engineering from the Hong Kong University of Science and Technology, with expertise in Machine Learning, Statistics, Data Mining, Text Mining and with additional working experience in Computer Vision, Deep Learning.

+

Moreover, a Data Analyst and Data Engineer with interests in Predictive Analysis, Technical Training, Object Oriented Development and Programming (OOD and OOP), DevOps, Business Intelligence (BI) and Business Analysis (BA) (PowerBI), Agile Methodologies (Scrum).

+

Spare time interests: Coffee, Reading, Writing, History, Music.

+",2013-10-28 15:55:53.903 +23147,3508727,Lou,,,,,2013-10-29 19:17:38.657 +22630,2730129,pinkpanther,"Bengaluru, India",,,"

Works at Synopsys

+ +

From: Bhimavaram, India

+ +

Lives in: Bengaluru, India

+",2013-10-12 10:21:22.203 +22565,1078704,zie1ony,,,,"

""The generation of random numbers is too important to be left to chance.""
+Robert R. Coveyou, Oak Ridge National Laboratory

+",2013-10-10 13:54:16.323 +14548,2327283,abayesed,,,,,2013-02-05 00:45:04.990 +22938,2147856,user1905080,,,,,2013-10-22 16:08:48.293 +22721,3441892,Kim J,"Philadelphia, PA",,,"

Researcher in Clinical Psychology & Psychotherapy. Currently at U Penn for an exchange year, normally Assistant Professor at Leiden University - risk models - multilevel modeling

+",2013-10-15 17:49:56.123 +22600,1418505,Paolo Bernasconi,"Geneva, Switzerland",,http://none,"

A programmer

+",2013-10-11 07:09:49.420 +23153,1759550,bogs,"Bucharest, Romania",,http://nlpforhackers.io,"

I’m a Software Engineer with a M.Sc in Artificial Intelligence.

+ +
    +
  1. Have 6+ years experience in web development
  2. +
  3. Love writing code in Python
  4. +
  5. 2 years of freelancing
  6. +
  7. Passionate about traveling and working remote
  8. +
  9. Aspiring entrepreneur
  10. +
  11. New found passion for writing
  12. +
+ +

I'm a co-founder of a few startups. Writing on nlpforhackers.io, a blog about simple and effective Natural-Language-Processing/Computational Linguistics. Microsoft Ventures Seattle Accelerator Alumni.

+",2013-10-29 21:30:54.837 +22415,2320620,Pedro Carvalho,,,,,2013-10-06 18:46:44.733 +23544,482263,Captain Whippet,United Kingdom,,http://twitter.com/captain_whippet,"

@captain_whippet

+",2013-11-08 23:07:57.260 +23033,121020,vprisivko,,,,,2013-10-25 11:41:03.747 +9007,1546824,Gilbert,,,,,2012-06-06 15:39:37.783 +17454,2707647,Tomas Olsson,,,,,2013-04-30 17:27:21.667 +23002,1462562,Jason,"Brisbane, Australia",,,,2013-10-24 05:28:04.167 +23536,3556752,user32530,,,,,2013-11-08 19:51:23.983 +22992,94317,wrongusername,"Seattle, WA",,http://amos.ng,,2013-10-23 21:25:50.253 +22991,3480800,sp1n,,,,,2013-10-23 20:55:21.390 +14597,356868,Roun,,,,,2013-02-06 06:20:21.777 +23084,3026038,Malcolm,"Renens, Switzerland",,https://malcolmmielle.wordpress.com/,"

Very much robots and computers. Very cool.

+",2013-10-28 02:12:59.713 +9605,31476,Matt,,,,,2012-07-09 19:51:22.473 +20320,3085262,Srishti M,,,,,2013-07-28 05:46:42.370 +22996,186646,hoyland,"New York, NY, United States",,,,2013-10-24 01:04:22.187 +4320,901829,alto,"Austin, TX",,,,2011-09-10 01:42:43.253 +21756,3309101,Ben,"Muenster, Germany",,http://ben.graeler.org,"

Benedikt Gräler is a research associate and consultant at 52°North, Münster, Germany. His key research interst lies where the theory of copulas meets spatial statistics. On a daily basis, he is working as a Spatial Data Scientist.

+",2013-09-16 07:19:28.733 +23359,1431178,phillbaker,,,http://phillbaker.com,,2013-11-05 01:53:49.447 +22615,3426413,asker,,,,,2013-10-11 18:00:38.057 +15430,1679888,Ethan,,,,,2013-03-04 22:07:55.287 +23345,3535395,poer,,,,,2013-11-04 19:33:36.103 +22648,2299139,usillos,"Berlin, Germany",,,,2013-10-13 09:07:12.857 +14873,2369692,Katherine Gobin,,,,,2013-02-15 06:58:46.117 +20388,3113851,honeychip,"London, United Kingdom",,http://N/A,"

Research associate in Statistics

+",2013-07-30 17:02:23.277 +22764,3197932,User,,,,,2013-10-16 16:34:59.543 +23470,3549360,Andreas,,,,,2013-11-07 11:32:54.817 +21864,1960434,MedAli,Tunisia,,http://intelligea.wordpress.com,"

Some of my answers:

+ + +",2013-09-19 13:04:45.113 +2149,509726,IrishStat,"Warminster, PA, United States",,,"

Dave (dave@autobox.com) is currently a Senior Consultant at Automatic Forecasting Systems (215-394-8897). His doctorate dissertation on automatic modeling of time series was the basis for the technological advances in time series analysis. One of the founding principals of Automatic Forecasting Systems (AFS)( http://www.autobox.com ), his innovational leadership and statistical expertise is at the core of the success of AFS. He has over 40 years of experience in statistical consulting and expert system product development. Under his leadership and vision, AFS has lead the way in championing expert time series analysis this making these tools available to all. He has held teaching positions at Penn State University and the Drexel University Graduate School. Dave is well-founded both theoretically and in the practice of Management Science. Before founding AFS he held Senior management positions at General Electric, Celanese Corporation and The American Stock Exchange. Dave holds a B.A. in Statistics from CCNY, an M.S. in Statistics from Villanova University and an A.B.D. in Applied Economics from the University of Pennsylvania.

+",2011-02-23 13:29:56.240 +22796,3445931,user2886760,,,,,2013-10-17 14:17:45.533 +22746,3443937,user31556,,,,,2013-10-16 06:12:04.197 +23274,3526433,Aras,,,,,2013-11-02 08:55:19.447 +23110,3503027,BobL,,,,,2013-10-28 18:36:54.563 +22980,370195,Tom Hebbron,,,,,2013-10-23 17:19:50.450 +1923,129863,posdef,,,,"

half interdisciplinary scientist, half code monkey...

+",2011-02-02 13:59:14.100 +9792,1672038,user12719,,,,,2012-07-19 15:07:15.353 +12503,2035987,Funkwecker,,,,,2012-11-20 13:40:43.550 +16665,2339529,Remi.b,"Lausanne, Switzerland",,,,2013-04-09 11:06:49.033 +7769,1372889,pyrole,,,http://jatinpatni.co.nr,,2012-04-02 07:39:49.027 +20286,3098620,EdM,,,,,2013-07-26 15:11:03.380 +22755,194421,Paweł Szczur,"Olsztyn, Poland",,http://dev.pawelsz.eu,"

I'm interested in solving real world problems with technology.

+ +

I am also amazed how important for projects the collaboration and communication is.

+ +

I'm poking around: C++, Go, AppEngine, Android, CUDA and others.

+",2013-10-16 13:58:29.500 +22663,2821701,James Prichard,United Kingdom,,http://jaypri.ch/,,2013-10-13 22:57:52.967 +14900,1379171,altabq,,,,,2013-02-16 13:28:48.377 +9049,1552146,usεr11852,"Manchester, UK",,,"

Empiricist - trying to be rational.

+

Would like to be Stats David Mitchell but ending up Data Science Noel Fielding.

+",2012-06-08 10:20:40.383 +23035,2335751,Will High,"New York, United States",,https://www.highonscience.com/,"

Data scientist, physicist

+",2013-10-25 12:45:47.760 +947,509540,Sympa,United States,,http://www.slideshare.net/gaetanlion,"

Quantitative professional with expertise in econometrics, statistics, risk management. For past few years have been engaged in developing and testing econometrics models related to bank's stress testing.

+",2010-09-16 21:32:57.667 +1411,64461,Ben Bolker,"Hamilton, Canada",,http://www.math.mcmaster.ca/bolker,"

Mixed models, miscellaneous R stuff, animal movement models, phylogenetic comparative methods, ...

+",2010-11-24 20:42:46.860 +23392,1414242,bstockton,,,,"

M.S. in statistics from University of Utah. I currently work as a statistician for an energy company. I use R, SAS, and Python mainly. I love data science and math.

+",2013-11-05 17:55:38.990 +23041,3490808,user31918,,,,,2013-10-25 15:15:10.080 +2872,221664,Siato,"London, UK",,,,2011-05-07 11:13:42.100 +22494,2726023,George Gao,,,,,2013-10-09 03:18:58.997 +15764,542386,David Braun,,,https://dirt.design,,2013-03-14 00:35:09.930 +11262,1885024,Applied mathematician,France,,,,2012-10-01 12:38:37.463 +22676,977393,lucacerone,"Barcelona, Spain",,http://lucacerone.net,,2013-10-14 12:35:03.267 +22751,241426,jimifiki,"Pisa, Italy",,,,2013-10-16 09:53:52.260 +23356,1047950,glaed,"Berlin, Germany",,,,2013-11-05 00:32:37.973 +22636,3429453,user2874331,,,,,2013-10-12 15:38:44.627 +22503,3412219,user31260,,,,,2013-10-09 08:34:12.797 +23184,1441191,Jens TV,Germany,,,"

mass spectrometry data analysis quantitative proteomics biochemistry

+",2013-10-30 18:04:51.207 +14715,37565,Contango,"London, United Kingdom",,https://www.linkedin.com/in/shane-tolmie-b139441/,"

Have been enjoying programming for over 20 years.

+

Experienced in C#, C/C++, Python, KDB, SQL, etc on both Linux and Windows.

+

Currently working in finance. Have built systems running at discretionary trading timescales right down to high frequency trading (HFT) timescales. All full stack, working on everything from low-level drivers to the user interface.

+

Passionate about great software architecture, and writing bug free, maintainable and performant code.

+",2013-02-09 16:53:51.730 +1209,206522,mariana soffer,Argentina,,http://singyourownlullaby.blogspot.com,"
* Artificial Intelligence
+* Alan Turing
+* Math
+* physics
+* biology
+* Neuroscience
+* DNA
+* Literature
+* Visualization of information. Multivariate statistics and datamining Dada
+* Travel everywhere
+* etc.
+
+ +

Twitter:@marianasoffer

+",2010-11-04 21:53:36.430 +1428,54702,ogrisel,"Paris, France",,http://twitter.com/ogrisel,"Average Python / C / Scala datageek with a taste for artificial intelligence, machine learning, cloud computing, OpenCL, NLP, the semantic web and braaaaains!",2010-11-26 13:15:57.300 +23294,3170665,Arif Ali,,,,,2013-11-02 20:31:47.433 +23435,1592795,Anurag Priyadarshi,India,,http://www.scientisttechnologies.com,,2013-11-06 15:36:25.477 +15658,1314281,eXpander,,,,,2013-03-11 09:41:56.130 +22632,2032555,Rolf Marvin Bøe Lindgren,"Levanger, Norway",,http://www.grendel.no,"

Psychologist. ZX Spectrum user since 1982. Mac user since 1984. LaTeX user since 1987. ACT practitioner since 2010. R user since 2012. Azure user since 2016. Xelatex user for some time. Wishes to learn ConTeXt.

+",2013-10-12 11:05:12.467 +23179,931887,user961627,,,,,2013-10-30 16:38:35.707 +22932,3473400,Gedps,,,,,2013-10-22 13:13:29.150 +22598,2192684,guwenwu,,,,,2013-10-11 02:48:57.280 +23499,3552213,Testeris,,,,,2013-11-07 22:17:18.293 +10987,1856255,Placidia,"Saint John, NB, Canada",,,"

I am a data scientist living in Saint John, Canada, with over 20 years' experience. I have yet to see a data set I did not like. My analysis stack involves R, Rstudio, Spark-Scala and the Linux command line.

+",2012-09-19 00:35:00.500 +793,82337,raegtin,,,,,2010-08-27 19:17:29.360 +19264,2297751,OSE,"Albuquerque, NM, United States",,http://jasonmonschke.com,,2013-06-24 22:06:47.730 +15330,2431472,Leo H M Arruda,,,,,2013-03-01 05:06:12.180 +10570,1331637,Stumpy Joe Pete,United States,,http://peterthenelson.com,"

I'm a software developer with a passion for data science. Major interests:

+ +
    +
  • Machine Learning
  • +
  • Natural Language Processing
  • +
  • Web Scraping
  • +
+ +

You bring the data, I'll bring the algorithms, it'll be a party.

+",2012-08-28 20:57:04.283 +19388,2146470,avi,,,,,2013-06-28 10:16:18.047 +22710,3424458,Marie Auger-Methe,Canada,,https://sites.google.com/site/marieaugermethe/,"

I am an assistant professor at the University of British Columbia in the Dept. of Statistics and in the Institute for the Oceans & Fisheries.

+",2013-10-15 13:08:54.143 +17900,2464811,timcdlucas,,,,"

I'm a PhD student at CoMPLEX, UCL studying the epidemiology of bat viruses using network theory.

+ +

twitter.com/timcdlucas

+ +

twitter.com/statsforbios

+",2013-05-14 11:45:51.000 +23183,266484,Arlen,Earth,,,"

Reloading...

+",2013-10-30 17:58:41.437 +22718,3441597,Lara,,,,,2013-10-15 16:40:46.753 +23204,72524,appanponn,"Bangalore, Karnataka, India",,,"

Developer who does not code and an Architect who does not day-dream !!

+",2013-10-31 09:30:08.437 +4735,412173,user785099,,,,,2011-10-08 22:01:25.723 +23327,184997,derPoltergeist,"Bangalore, India",,,"

Interested in running, Statistics, Ubuntu (the OS as well as the philosophy), and R (the programming language).

+",2013-11-04 08:03:44.813 +21346,3021502,user227710,"I always like walking in the rain, so no one can see me crying",,,"

...you should not fall under Mr. Market's influence, but rather you should learn to take advantage of him...Benjamin Graham

+ +

Warning: for() loops are used in R code much less often than in compiled languages. +Code that takes a ‘whole object’ view is likely to be both clearer and faster in R. +p.41

+ +

http://stackoverflow.com/help/mcve

+ +

The Best tutorial:

+ +

data.table: https://github.com/Rdatatable/data.table/wiki/Getting-started

+ +

http://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly/27840349#27840349

+ +

RSelenium: http://rpubs.com/johndharrison/12843, http://rpubs.com/johndharrison/RSelenium-headless, & http://rpubs.com/johndharrison/RSelenium-headless

+ +

Shiny: http://shiny.rstudio.com/tutorial/ & http://rstudio.github.io/shinydashboard/

+ +

nyc open data: https://nycopendata.socrata.com/

+",2013-09-02 20:57:24.950 +17459,1017741,Jack2019,New York,,,"

Data scientist + Programmer

+",2013-04-30 20:03:23.573 +21057,3209678,renrenthehamster,,,,,2013-08-23 06:29:20.690 +23424,3444710,financial theory,,,,,2013-11-06 09:22:38.300 +22638,1575257,Animesh Pandey,"Boston, MA",,http://animeshpandey.wix.com/personal,"

LinkedIn Profile : http://www.linkedin.com/in/animeshpandey

+ +

Github Profile : https://github.com/apanimesh061

+",2013-10-12 17:28:08.573 +23291,1995007,Anirudh,"Palo Alto, CA, USA",,,"

Software Developer. Interested in Probability and Statistics.

+",2013-11-02 18:57:22.960 +22954,97728,Thanh DK,"Singapore, Singapore",,http://www.thanhdk.com,"

A geek who loves learning programming languages, playing computer games and practicing martial arts.

+",2013-10-23 03:36:45.683 +1145,509561,Michael Lew,,,,,2010-10-23 06:00:07.847 +22603,3410555,user31381,,,,,2013-10-11 09:34:34.753 +22622,3426932,user31402,,,,,2013-10-11 20:30:29.607 +23065,124682,Mechanical snail,United States,,https://launchpad.net/~replicator-snail,"

. Native speaker of American English. Linux user. Familiar with several programming languages in the procedural, OO, and functional paradigms.

+ + + +

Link rot is evil. Archive everything. The keyboard is king. Correctness over performance. Canonicalize, normalize, deduplicate. Don't repeat yourself. UTF-8 > UTF-16. Use static typing: good for tooling. Re-use; don't re-invent. Correctness, then clarity, then concision and elegance. Play devil's advocate. First understand opponents' positions.

+",2013-10-26 22:50:03.137 +20363,3109455,SlowlyFailing,,,,,2013-07-29 19:04:43.263 +23538,7791,sep332,United States,,,,2013-11-08 20:06:10.797 +23439,2213467,Vlad Preda,Romania,,,"

PHP programmer, Zend and Magento certified

+",2013-11-06 17:32:00.413 +23505,3536272,Phiphy,,,,,2013-11-08 02:35:51.450 +4911,295852,Javier,"CA, United States",,,,2011-10-18 20:34:39.497 +2105,175922,ATMathew,,,,"

I'm an aspiring statistician.

+ +

I also like R, Python, fitness, books, and a few other things.

+",2011-02-18 22:32:05.610 +2916,163642,Leo5188,United States,,,,2011-05-10 18:30:49.580 +23277,3517118,Hsiu-hsian Chen,,,,,2013-11-02 09:34:11.020 +7278,1282059,Daniel Murrell,,,,,2012-03-09 13:07:38.373 +21932,1371036,clarkson,,,,,2013-09-22 03:38:42.280 +16469,1334658,papirrin,,,,,2013-04-03 08:21:06.157 +23346,1400417,user1329307,,,,,2013-11-04 19:55:38.150 +23100,3501317,SergioG,,,http://sergiograziosi.wordpress.com/,,2013-10-28 14:00:18.827 +23265,312858,AndrewNimmo,"Barcelona, Spain",,https://andrew.nimmo.dev/go/stackoverflow.com,"

Freelance Tech // Development // Integration // Monitoring // Maintenance // Systems

+",2013-11-01 22:18:37.797 +22737,3113994,Constablebrew,"Los Angeles, CA, United States",,http://ihazcode.herokuapp.com/,"

I’m a software developer with over 17 years of experience in both front end and back end development and database management. I have worked on many different applications on a wide variety of platforms, including Node.js, React, AngularJS, ASP.NET, SQL Server, Postgres, MongoDB, and Access, and with a variety of languages including JavaScript, C#, and VB.

+

Most recently, I was a Senior Software Engineer at Automation Anywhere, focusing on building React interfaces for our top notch process automation bots. Prior to that I served as Full-stack developer at Mobiquity, building health care applications with a MEAN stack.

+

I have also been fortunate to have worked with multiple other technologies and platforms, including AngularJS, BackboneJS, JQuery, PHP Zend, and both ASP and C# .NET. Despite recent years focusing on client side development, I have built some sweet scripts and queries in SQL Server, Postgres, MongoDB, and even MS Access.

+

Colleagues know me as a highly creative problem solver who works hard to get things done. I am often eager to hit the ground running, as I tend to find the best questions to ask once I have put in some time thinking about the possible solutions. I work well alone, but I’m at my best when collaborating with others.

+",2013-10-15 22:49:55.747 +18331,1088545,Val,,,http://www.google.ee/search?q=greatest+misallocation+of+resources+in+the+history,"

This site is controlled by criminals. They can convert you question into absolutely different one that you have not interest in, migrate to another site, and, in excuse, ban you. As it happened to me.

+ +

No, read Decline of Stackoverflow right away.

+",2013-05-26 11:56:53.120 +22852,1022225,Sunil Bhatia,,,,,2013-10-19 03:27:52.673 +22781,3448670,kkst31,,,,,2013-10-17 03:10:06.457 +22966,1127498,bourneli,"Shenzhen, China",,http://cnblogs.com/bourneli,,2013-10-23 11:31:31.317 +23382,3539106,strand,,,,,2013-11-05 15:06:40.327 +23376,3538840,gloomy,,,,,2013-11-05 13:33:17.720 +22739,3442987,KakasA09,,,,"

I am the pricing & revenue mgmt. lead for the largest customer account (rhymes with Ballmart) of a leading consumer products company. I love my function, am passionate about applying statistics in the business world, and for leveraging the power of insights from large data sets. I am very new to R, or programming in general; however, I learn by doing and by seeing real examples applied to data sets I am familiar with or that I can conceptualize. I have found these boards to be incredibly helpful, and am learning from all the experts here.

+",2013-10-15 23:18:36.437 +22200,3373206,cathy,,,,,2013-09-30 16:31:43.443 +15363,262611,Learner,,,,,2013-03-02 01:22:40.723 +22856,3460867,Jeff Long,,,,,2013-10-19 14:40:30.983 +16110,2531780,Oleg,,,,,2013-03-22 15:50:09.570 +22580,3421037,user31351,,,,,2013-10-10 19:59:51.410 +21434,3055883,timothyjgraham,"University of Queensland, Australia",,,"

I am a research assistant and PhD student at University of Queensland (Australia).

+ +

My research interests are digital sociology, social policy, social network analysis, critical studies of technology.

+",2013-09-05 05:05:36.643 +22832,3456332,user31657,,,,,2013-10-18 12:38:46.960 +23375,1847977,Soldalma,,,,,2013-11-05 13:06:48.647 +22825,202494,Bart,,,,,2013-10-18 09:35:43.560 +23335,156106,b00tsy,,,,,2013-11-04 13:00:14.213 +23073,2285736,Amateur Math Guy,,,,,2013-10-27 14:08:57.467 +22838,2475942,Acyclic Tau,London,,http://4a42.wordpress.com,"

By Day: Networky, DevOpsy, Cloudy Nerd +By Night: Gamerly, Fatherly Nerd

+",2013-10-18 16:17:08.603 +23526,3555882,saihtamtellim,,,,,2013-11-08 16:15:51.660 +22564,3419173,Flask,,,,,2013-10-10 13:37:39.657 +10135,1728377,Stat,Canada,,,"

I like statistical modeling and its applications and a big fan of R!

+ +

+",2012-08-07 16:33:26.800 +22621,617,caseyboardman,"Sudbury, MA, USA",,https://www.linkedin.com/in/caseyboardman/,"

Utility infielder. Looking to learn.

+",2013-10-11 20:10:44.037 +13303,2162070,user22,,,,,2012-12-19 14:01:30.950 +23474,2222373,Rumi P.,Germany,,,,2013-11-07 13:06:07.720 +17660,1509271,jmbejara,"Chicago, IL",,http://jeremybejarano.com/,"

Support economics.stackexchange.com!

+ +

+",2013-05-07 08:09:35.223 +23273,512625,Treeman,"London, United Kingdom",,,,2013-11-02 07:37:30.240 +23209,3518243,Mark.M,,,,,2013-10-31 11:14:09.233 +22778,2230845,exodus,,,,"

Machine Learning Scientist

+",2013-10-16 22:35:27.383 +23094,107919,Amr Bekhit,"İstanbul, Turkey",,https://helmpcb.com,"

Embedded System Engineer, owner of HelmPCB, an embedded systems consultancy specialising in ATEX/IECEx design, Embedded Linux, LoRaWAN and low-power IoT.

+",2013-10-28 11:08:36.673 +22936,169516,sbos,,,,,2013-10-22 15:43:31.903 +14227,381037,Moses Xu,"Sydney, Australia",,,,2013-01-25 03:23:53.557 +23443,3546070,Martin Chaia,,,,,2013-11-06 19:30:31.207 +4537,512630,Shiwen,Madrid,,,,2011-09-26 20:22:25.507 +22975,720767,miyazaki_tara,,,,,2013-10-23 16:18:52.863 +22590,1561545,mgilbert,,,http://matthewdgilbert.com,,2013-10-10 23:21:10.653 +22684,1906615,workaholic,,,,,2013-10-14 19:55:37.183 +17628,289418,Alexey Popkov,"Nizhny Novgorod, Russia",,http://linkedin.com/in/alexeypopkov,"

By training, I am a chemist with a specialization in physical chemistry, finished a postgraduate course on Physical Chemistry at the Department of Chemistry of Lomonosov Moscow State University (Moscow, Russian Federation) in 2010.

+

Active user of Wolfram Mathematica since 2006.

+

I'm the author of Mathematica packages:

+ +

Interested in Mathematica-related work.

+

E-mail: Uncompress["1:eJxTTMoPCpZkYGBIzEmtSK3US9IryC/Izi9zSM9NzMzRS87PBQCy2gtN"].

+",2013-05-06 05:52:56.343 +22934,1802587,Thomas,,,,,2013-10-22 15:23:17.120 +23438,2901355,user2487959,,,,,2013-11-06 17:21:38.667 +23367,3537410,Cristian,,,,,2013-11-05 07:22:39.053 +22876,3464488,hello,,,,,2013-10-20 16:14:18.453 +22830,3455890,Tania,,,,,2013-10-18 11:05:21.687 +17321,2691305,Neo,,,,,2013-04-26 16:04:47.153 +21652,3295470,new-TT-faculty2,,,,,2013-09-12 13:32:10.290 +22846,3458517,Nuesschen,,,,,2013-10-18 21:05:16.017 +23232,2029247,Jake Casey,,,,,2013-10-31 22:56:05.827 +16650,31124,clintonmonk,DC,,http://hardlybusiness.com,"

I'm a full-time developer with experience in Java and web programming. I'm also interested in statistics and machine learning as well as economics and finance.

+",2013-04-08 21:41:58.943 +855,37021,xan,North Carolina,,https://packedbars.com/,"

Data visualization, C++, Java, Forth, Tie-dye

+ +

I work for SAS Institute on JMP and have a few posts on the JMP Community, including blog posts. I'm pretty active with data visualization on Twitter at xangregg. +I have an irregular personal blog, Forth Go. I created the chart type called Packed Bars.

+ +

StackOverflow is intentionally useless for messaging, so here is an email address: xan@pobox.com.

+",2010-09-03 17:00:30.193 +22923,3471288,Kochede,,,,,2013-10-22 04:14:44.413 +11446,1916498,Epifunky,,,,"

Epidemiology student

+",2012-10-08 13:31:45.247 +23537,3556756,Stephen Ramsey,"Oregon State University, OR",,http://lab.saramsey.org,"

For more information about my research program and for contact info, please see my website at http://lab.saramsey.org.

+",2013-11-08 19:52:38.827 +23331,3533237,JJacquelin,,,,,2013-11-04 10:32:01.167 +22831,3456043,Kati,,,,,2013-10-18 11:39:06.263 +436,59121,nico,"Edinburgh, United Kingdom",,http://www.nicolaromano.net,"

I am a Senior Lecturer at the University of Edinburgh.

+

I am interested in understanding how heterogeneous groups of cells decode and interpret signals from their environment to generate meaningful outputs as a population. +I mostly use neuroendocrine systems (the ones that produce hormones in your body) to study this question. +I am also very interested in all that relates to data analysis and science reproducibility.

+",2010-07-29 06:38:18.290 +22524,3130616,Irene,Italy,,,"

Recent Graduate. Now young Analyst.

+",2013-10-09 15:00:37.367 +16990,1294974,Islam El-Nabarawy,,,,,2013-04-18 00:04:05.190 +22550,3411837,DevKhokhar,"Bangalore, India",,,,2013-10-10 07:25:09.580 +12756,224511,HelloWorld1,,,,,2012-11-29 15:37:10.390 +13045,2130273,user17670,,,,,2012-12-10 10:01:39.357 +23350,3485196,rkreddy8080,"Los Angeles, CA, United States",,,"

Hi, I am Rakesh

+",2013-11-04 22:18:24.357 +20144,2541467,bernatguillen,,,,,2013-07-22 12:32:49.330 +23221,456877,Dan Mazzini,"London, UK",,http://danmaz74.me,"

I created hashtagify.me, the top free hashtag search engine, and later turned it into a startup for which I'm CEO & CTO. After 5 years of the ups and downs of a small, completely remote startup, I'm ready to move on.

+",2013-10-31 16:24:52.677 +23370,3431282,randomatlabuser,,,,,2013-11-05 09:54:57.330 +23032,2893055,Vard,,,,,2013-10-25 10:55:34.690 +19545,82438,AatG,CA,,,"

applied mathematician and machine learning researcher

+",2013-07-03 16:52:41.087 +15717,2484658,John Redner,,,,,2013-03-13 00:39:31.833 +102539,10049021,Michael Sigmund,,,,,2017-01-17 08:43:12.820 +9446,1614370,RioRaider,,,,"

I am a wildlife ecologist with interests in distance sampling, population ecology and dynamics, wildlife demography, Galiform ecology and management, wildlife survey/monitoring techniques, avian ecology and management, and effects of wind energy development on avian populations.

+",2012-06-30 02:33:58.100 +7421,1312527,Oeufcoque Penteano,,,,,2012-03-16 00:32:16.327 +22666,3433726,user31458,,,,,2013-10-14 00:49:15.803 +23137,271,Larsenal,"Ventura, CA",,,"

I enjoy running my data processing boxes at a 5-minute load average of 54.

+",2013-10-29 16:08:07.203 +8414,1221927,Patrick S. Forscher,"Madison, WI",,,"

I'm a researcher studying social psychological interventions and quantitative methods, currently at the University of Arkansas Department of Psychology.

+",2012-05-04 16:25:25.133 +22789,346910,Nic,"London, United Kingdom",,http://NA,"

Got a 2:1 in astrophysics at Sussex Uni in 2010. Now studying for a PhD in astronomical instrumentation at UCL.

+",2013-10-17 09:56:50.223 +23276,872594,martini,"Berlin, Germany",,http://jungenschaft-hohenstaufen.de,,2013-11-02 09:26:30.293 +22957,3473572,NK1,,,,,2013-10-23 07:52:05.950 +22961,3477644,Gino,,,,,2013-10-23 08:39:29.317 +23269,1610001,eycheu,,,,,2013-11-02 03:49:59.577 +13396,540273,wsw,,,,,2012-12-24 21:27:17.477 +22924,3096471,Sjobeek,,,,,2013-10-22 05:04:34.663 +22770,3447436,tensorrank,,,,,2013-10-16 19:54:10.257 +23344,894782,Gaurav Agarwal,India,,,"

I programmed in Java
+Android
+Linux - Ubuntu
+Java Servlet
+Ansible
+AWS

+

Good discussion on forming a startup.

+

--

+",2013-11-04 18:57:59.493 +23304,945875,Nikos Alexandris,"Cadrezzate, VA, Italy",,http://nikosalexandris.net/xpress/en,"

Who? osgeo's user-wiki page | personal website | LinkedIn public profile

+ +

What? Contributions | It is OK to accept own answer(s) in (TeX.)SE! | image sharing via pixelfed.social

+",2013-11-03 13:16:30.713 +22660,86412,Jan,"London, United Kingdom",,http://www.janvsmachine.net,,2013-10-13 21:01:00.703 +21991,3343304,arjsgh21,,,,,2013-09-23 23:22:25.137 +23253,390363,DaveFar,"Karlsruhe, Germany",,,"

I'm the CEO of QPR Technologies and always want to improve my coding skills.

+",2013-11-01 14:17:07.993 +3048,42175,Thomas Browne,,,https://sites.google.com/crvm.io/scendance/home,"

Transport, factorization, visualization of high dimensional real time streaming data across disciplines, with focus on finance and cryptocurrency APIs. Ex emerging markets bond trader, PM, strategist, with comprehensive at-the-coalface knowledge of all cash, option, swap, and FX markets, and now crypto!

+

Also: full-stack data engineer from Linux through Postgres, Kafka, messaging protocols, API expert, comprehensive Python / R / Numpy, visualization libraries, Elixir, soon...Rust GPU programming. Get in touch!

+",2011-05-22 17:19:17.440 +2164,509734,Tom Reilly,"Philadelphia, PA",,http://www.autobox.com,"

Contact me at tomreilly@autobox.com. Autobox was launched in 1975 as the first to market forecasting software using Box-Jenkins methodology.

+",2011-02-24 18:21:45.440 +23493,3551984,apt1002,"Cambridge, United Kingdom",,http://www.minworks.co.uk/,,2013-11-07 21:18:58.427 +13459,1160550,user1140126,,,,,2012-12-28 20:57:38.447 +21586,3286870,Skullduggery,,,,,2013-09-10 18:22:46.410 +22897,3430347,Drusilla,,,,,2013-10-21 09:53:54.903 +22805,3452789,Michael Brown,,,,,2013-10-17 19:49:01.727 +20739,3165544,Matt Reichenbach,"Akron, OH",,https://www.linkedin.com/in/matt-reichenbach-2569b051,"

I am a business intelligence developer in the banking industry with a skillset in predictive analytics and data visualization; however, I have a background in marketing, consumer credit, biostatistics, nursing, and psychology. As such, I have a wide array of interest and experience across many areas of study.

+

My primary research interests are in time series forecasting, logistic regression, and natural language processing.

+",2013-08-12 17:16:09.660 +10546,1718145,RobertF,Connecticut,,,"

Statistician currently working for Landmark Health/Optum.

+",2012-08-27 16:41:46.940 +23506,3369426,user2829076,,,,,2013-11-08 03:05:21.663 +22802,1060862,alexhli,"Houston, TX",,,,2013-10-17 18:41:29.127 +23146,511277,Logan M,USA,,,,2013-10-29 18:44:45.303 +21746,2309631,Oleg Shirokikh,,,http://solver.com,,2013-09-16 01:43:54.063 +22933,3473523,cjyetman,,,,,2013-10-22 13:36:16.093 +23182,3514680,Frank,,,,,2013-10-30 17:27:52.937 +13165,1191616,Daniel,,,,,2012-12-13 20:32:04.740 +23504,57352,bshanks,,,http://bayleshanks.com,,2013-11-08 02:11:01.983 +22593,3421931,JulienW,,,,,2013-10-11 00:34:33.700 +23058,3493797,luigi martini,,,,,2013-10-26 10:25:30.220 +23403,3531651,user105181,,,,,2013-11-05 22:28:46.870 +22419,3245994,Siddharth Gopi,United Kingdom,,,"

PhD in Applied Statistics

+ +

.

+ +

..

+ +

...

+ +

.... jk, undergraduate struggling to graduate

+",2013-10-07 00:13:35.340 +22339,3389451,Newb,127.0.0.1,,,"

The best way to learn is to ask and answer lots of questions.

+",2013-10-03 22:57:38.703 +22477,3408418,user31228,,,,,2013-10-08 14:14:31.190 +23134,75124,psihodelia,United States,,,"

Software Engineer

+",2013-10-29 14:26:58.563 +23091,1765925,Saravanan K,Portugal,,,,2013-10-28 08:28:45.337 +9384,415263,tan,,,,,2012-06-27 14:18:29.710 +15972,2135506,Silverfish,,,,"

Some of my more interesing answers, not necessary most upvoted:

+ +

Geometric interpretation of multiple correlation coefficient $R$ and coefficient of determination $R^2$

+ +

What is the distribution of R2 in linear regression under the null hypothesis? My answer includes a comprehensive graphic of the distribution of $R^2$ for multiple regression on a small sample.

+ +

Ink to data ratio and plot backgrounds - a fascinating question about the recent trend for plots to have grey backgrounds.

+ +

Uniform random variable as sum of two random variables - if $U$ is a uniform variable, why can't we find i.i.d. variables $X$ and $Y$ such that $U=X+Y$? A nice solution by thinking about limits on kurtosis.

+ +

Random walk on the edges of a cube - if a spider walks at random along the edges, what is the expected number of steps to reach an ant trapped on the opposite vertex? Parity considerations give an elementary pen-and-paper method.

+ +

What does an Added Variable Plot (Partial Regression Plot) explain in a multiple regression?

+ +

Correlation between a nominal (IV) and a continuous (DV) variable - sometimes we have to explain that something can't be done, but suggest alternatives or analogues.

+ +

Finding an unbiased estimator with the smallest variance - answered with Lagrange multipliers.

+ +

Prove the equivalence of the following two formulas for Spearman correlation - using only high school methods. This is a bit clunky, but the topic sometimes comes up at high school level (at least in my country), so it's nice not to have to rely on higher level maths.

+ +

Expectation of Quotient of Sums of IID Random Variables (Cambridge University Worksheet) - an interesting question that can be answered by (a)symmetry considerations.

+ +

Transformation Chi-squared to Normal distribution - including a trick using a Rademacher variable, and some very pretty graphs to show deciles (and R code).

+ +

Addressing some elementary misconceptions about summary statistics, with nice counterexamples: Does mean = median imply that a unimodal distribution is symmetric? and Will two distributions with identical 5-number summaries always have the same shape?

+",2013-03-19 14:13:29.520 +23104,2463908,Nathan Gould,,,,"

Education: Yale Applied Mathematics. Professional: alternative energy, data, tech. Personal: Running, standup comedy, dancing, coding, beer.

+",2013-10-28 15:01:13.327 +22588,1711518,kylelyk02,,,,,2013-10-10 22:54:16.227 +16992,2650477,Stefan Wager,,,http://web.stanford.edu/~swager/,"

I'm a PhD student in statistics at Stanford. I'm interested among other things in non-parametric statistics, uses of sampling in data analysis, and empirical Bayes techniques. My favorite part about doing statistics is when I manage to help people get new insights about their problems by framing them in terms of statistical concepts.

+",2013-04-18 06:54:58.430 +23298,2739691,Vitaly Isaev,"Moscow, Russia",,,"

Distributed storage developer (Go/Python/C++). +I work here: https://mailion.ru/

+",2013-11-03 00:37:17.277 +22890,3465896,Joseph,,,,,2013-10-21 01:01:58.073 +22572,3316946,ano,"Seattle, WA",,,"

health data analysis, urologic oncology fellow, currently @ UW

+",2013-10-10 16:09:14.030 +18382,2824937,Daniel Big,,,,,2013-05-28 08:54:55.873 +23243,3522923,Lrhod,,,,,2013-11-01 11:19:57.813 +22941,2462742,Sean S,"Nyc, NY",,,,2013-10-22 18:10:04.033 +23498,3453733,Jay,,,,,2013-11-07 22:11:26.900 +22607,3425490,tom cunningham,,,,,2013-10-11 14:20:05.910 +23096,185145,Michael Burrows,,,,,2013-10-28 12:10:52.457 +23401,2812983,Jose Garmilla,,,,,2013-11-05 22:06:15.713 +22399,3397761,prop,,,,,2013-10-06 02:34:42.493 +22579,2531600,Jeff Keller,,,,,2013-10-10 19:56:21.420 +22658,3432846,AMM,,,,,2013-10-13 18:26:48.740 +17038,2656690,socksxbirkenstocks,,,,,2013-04-19 16:18:18.507 +23529,3556184,Onawa,,,,,2013-11-08 17:22:49.013 +21630,1385426,Christian,,,,,2013-09-12 01:03:31.830 +3580,510097,guy,,,,,2011-07-09 15:57:23.153 +23381,269527,Alex Benke,New York,,,"

I'm a Product Manager at Betterment who likes rolling up his sleeves.

+",2013-11-05 15:00:21.330 +22708,3294353,user2771940,,,,,2013-10-15 10:37:51.700 +4318,66802,Royi,,,,"

Electrical Engineer (Student)

+",2011-09-09 18:21:48.913 +546,59464,gmatt,,,,,2010-08-04 16:51:36.150 +22559,2463414,user2147028,,,,,2013-10-10 10:00:46.897 +22645,1810900,Mindmaster Flash,,,,,2013-10-13 04:44:34.267 +20062,2046785,Ladislav Naďo,Slovakia,,https://scholar.google.sk/citations?user=OBXww2UAAAAJ&hl=en&authuser=1,"

Research field: Biological Sciences - Zoology and Animal Science

+ +

Interested in:

+ +
    +
  • evolution
  • +
  • animal behaviour
  • +
  • tree-dwelling bats
  • +
  • statistics
  • +
  • R
  • +
+",2013-07-19 14:22:50.913 +12314,1142427,Jase,,,,"

statsticsssss

+",2012-11-13 15:37:02.773 +22625,3427463,Bi Act,,,,,2013-10-12 00:19:52.913 +23207,3484838,user2916044,,,,,2013-10-31 10:33:33.600 +22341,3389589,Saraya,,,,,2013-10-03 23:58:12.773 +22898,339794,Rosie,,,,,2013-10-21 10:26:05.787 +23414,3541829,charles,,,,,2013-11-06 02:18:19.680 +21762,3310698,Michael M,Switzerland,,,"
    +
  • PhD in statistics & probability
  • +
  • Classic statistics
  • +
  • Machine learning, deep learning
  • +
  • Some teaching at university level
  • +
+",2013-09-16 14:15:13.057 +14553,2066183,Kloser Cheung,,,,,2013-02-05 04:31:19.223 +23170,3511655,Greg McDonald,,,,,2013-10-30 10:13:35.177 +22665,3433718,Fred L.,,,,,2013-10-14 00:46:15.680 +23194,166796,merlin2011,"Bellevue, WA, United States",,https://hq6.me,"

I am a student. A perpetual student.

+

My most recent contribution to the open source community is RpcExplorer, a curses tool that allows the user to search for service methods defined in protobuf and interactively build and issue requests.

+

I released my primary doctoral project Arachne, a lightning-fast cooperative threading library. Please give it a whirl and create an issue if you see any problems.

+

I have also written a few simple tools, such as one for tmux automation, an improved version of the venerable Unix column, and a tool for adding color to text in the terminal based on user-specified patterns.

+",2013-10-31 02:42:25.380 +1406,48549,mpiktas,"Vilnius, Lithuania",,http://vzemlys.wordpress.com,"

* denotes convolution, $\cdot$ denotes multiplication

+ +

I am the developer of the midasr R package:

+ + + +

and you cand find me on

+ + +",2010-11-24 09:52:34.957 +1741,307018,Simone,"Helsinki, Finland",,https://simoneromano.com,,2011-01-10 09:54:02.783 +23314,1223385,yumyum,,,,,2013-11-03 20:26:31.923 +23390,415293,user790018,,,,,2013-11-05 17:08:58.653 +23358,143306,Raja Pasupuleti,,,,,2013-11-05 00:42:30.757 +22855,2755437,Umesh Kumar,,,,,2013-10-19 14:33:17.123 +22701,245819,jxstanford,"Seattle, WA, United States",,https://www.amazon.jobs/en/search?base_query=adinteljobs,"

I have a broad range of experience going back almost 20 years. This experience covers architecting widgets, designing widgets, implementing widgets, and supporting widgets. I have even tried to make widgets look more appealing than they actually are, and confidentially told people they really don't want a particular widget.

+ +

Widgets have included: +* unix systems +* network hardware +* massive storage systems +* databases +* middleware +* ISPs +* enterprise backup/recovery +* systems management frameworks +* automation frameworks +* event correlation engines +* configuration management systems +* security software +* custom software +* cloud software

+ +

Past tools of the trade: +* Scala +* Akka +* Spray +* Django +* Golang +* Elasticsearch +* MongoDB +* D3.js

+ +

Current everyday carry: +* Python +* Pandas +* Scikit-learn +* Scala +* SparkML

+",2013-10-15 05:16:54.993 +15860,1446592,Dr. Beeblebrox,,,,,2013-03-16 17:07:28.447 +11200,1885484,Pascal,"Utrecht, Netherlands",,http://pascal.schulthess.io,,2012-09-27 22:43:39.477 +22864,3461871,Cheryl,,,,,2013-10-19 20:29:44.800 +22637,3272796,JohnK,,,,"
+

Logic is barren, unless fertilized by intution.

+ +

H. Poincarè

+
+ +

I am currently studying for my Master in Statistics where I hope to contribute to the knowledge in the not-so-distant future. My interests are primarily concentrated around likelihood inference and robust methods.

+ +

Here are some of my most notable Q&A:

+ +

Questions

+ +

How to interpret a QQ plot

+ +

A generalization of the Law of Iterated Expectations

+ +

Functions of Independent Random Variables

+ +

Answers

+ +

Using Regression to project outside of the data range ok? never ok? sometimes ok?

+ +

An example of a consistent and biased estimator?

+ +

How incorrect is a regression model when assumptions are not met?

+ +

Distribution of a quadratic form, non-central chi-squared distribution

+",2013-10-12 16:48:18.213 +23042,3490997,florian,,,,,2013-10-25 15:56:13.647 +13385,383118,nomen,,,,,2012-12-23 23:58:19.773 +723,24765,Paul Murray,"Canberra, Australia",,http://paulmurray.id.au,,2010-08-19 04:45:52.747 +17249,2664437,Patrick Coulombe,,,,,2013-04-25 03:55:58.393 +23254,1359328,Sebastialonso,"Santiago, Chile",,,,2013-11-01 14:24:35.293 +23384,3539335,user32344,,,,,2013-11-05 15:16:51.700 +14803,2359201,user20780,,,,,2013-02-12 21:25:38.150 +155,60581,Jeromy Anglim,"Melbourne, Australia",,http://jeromyanglim.blogspot.com/,"

I am a Senior Lecturer in the School of Psychology at Deakin University bridging I/O psychology and statistics.

+

I'm quite active on the Cognitive Sciences and Statistics Stack Exchanges.

+

You can find me also on:

+ +",2010-07-20 02:56:34.160 +5898,1117509,Ram Ahluwalia,New York,,http://www.peeriq.com,"

Ram Ahuwalia

+

CEO of Lumida We love research analysts, CFAs, quants and hire the best of the best! Send your resume to info@lumida.co.

+

Previous roles: Fixed income credit portfolio decisioning at a major bank/broker-dealer, Management Consulting in Financial Services, Columbia Economics, and Machine Learning. Live and work in NYC.

+

All posts and comments represent my views and not that of my employer.

+

My favorite answers:

+

How do you mix quantitative asset allocation with qualitative views?

+

Empirical or theoretical insights that have shaped your thinking

+

Why is the first principal component a proxy for the market portfolio?

+

How do I graphically represent the evolution of a covariance matrix over time?

+

Which approach dominates? Mathematical modelling or data mining?

+",2011-12-19 18:46:57.437 +22687,21891,Crashworks,"Burbank, CA",,http://assemblyrequired.crashworks.org,"I program video games. That's kind of like embedded systems programming, only with more artists, and each function has a budget in microseconds.",2013-10-14 21:03:16.133 +22601,905263,Chris,,,,,2013-10-11 08:38:30.907 +23433,184688,Cedric H.,"Geneva, Switzerland",,http://www.cedrich.net,"

#STOPSHELL

+",2013-11-06 14:48:10.763 +22092,3149776,questionhang,,,,,2013-09-26 14:21:21.627 +11831,1910951,Junkyu Lee,,,,,2012-10-24 02:37:50.237 +23037,2765451,5xum,Slovenia,,,,2013-10-25 13:41:41.853 +23316,2268045,amauboussin,,,,,2013-11-03 22:06:59.877 +22614,25031,Josiah,"Brisbane, Australia",,,,2013-10-11 17:58:28.387 +4890,980204,hearse,"Anchorage, AK, USA",,,,2011-10-17 23:02:48.443 +22605,2284128,mirams,"Nottingham, United Kingdom",,https://www.maths.nottingham.ac.uk/personal/pmzgm/,"

I'm an applied mathematician working in mathematical biology, in particular cardiac electrophysiology simulations.

+ +

Please see my homepage for more details.

+",2013-10-11 10:34:45.277 +23330,12936,zendar,,,,,2013-11-04 08:57:01.997 +23003,1224578,Sandipan Bhattacharyya,,,,,2013-10-24 06:12:34.050 +23326,427016,Vyacheslav Trushkin,,,,,2013-11-04 06:45:34.937 +3693,454152,Michael,,,,,2011-07-19 07:34:22.517 +23488,3551449,Jose,,,,,2013-11-07 18:56:59.717 +22697,1451914,Tushar,Canada,,http://www.tushardhoot.com,"

I am a student at the University of Waterloo studying Computer Science and Business. I have experience working with many languages, mostly on a self-taught basis. I enjoy working with C, C++, PHP, and Android (Java).

+",2013-10-15 01:50:12.237 +21804,2581617,jank,"Berlin, Germany",,,"
    +
  • Scientific programmer developing software for experimental behavioral research
  • +
  • Using Asp.net MVC, Html5, Matlab, Delphi
  • +
  • Social science background and target application domain
  • +
  • Saved by ""StackOverflow"" countless times in the past years
  • +
+",2013-09-17 16:54:21.560 +14888,2043610,ESS,,,,,2013-02-15 22:48:27.060 +2490,103814,paul,,,,,2011-04-01 04:45:32.297 +22800,3451807,SlutskyFan,,,,,2013-10-17 16:11:49.973 +13846,1451348,Alex,,,,,2013-01-12 23:13:32.527 +22814,1917972,Vesnog,,,,,2013-10-17 23:55:40.540 +23426,3543296,qkn101,,,,,2013-11-06 09:56:02.893 +8819,1521894,,,,http://www.stats.stackexchange.com,"

I am a professional with interests in data mining, machine learning and statistical inference

+",2012-05-28 08:53:10.213 +503,80800,Peter Flom,"New York, NY",,http://www.statisticalanalysisconsulting.com/,"

I'm a statistical consultant to graduate students and researchers in fields including the behavioral and health sciences. I've assisted with review of articles, and with preparation of grants, dissertations and papers.

+",2010-08-03 19:42:40.907 +23236,3521378,MLE,,,,,2013-11-01 01:33:29.127 +5661,49098,Aaron McDaid,"Dublin, Ireland",,https://aaronmcdaid.github.io/,"

Particularly interested in having fun with constexpr in C++. I'm working on two projects related to fixing some limitations in C++14, 'cambda' and 'CONSTEXPR_LAMBDA'.

+ +

Looking for a C++ job anywhere in Europe, ideally with a computational/mathematical angle. Changing career direction away from academia after a PhD and four years of a postdoc in statistics and computer science. In Ireland now, ready to start working again after a good break lasting a few months :-)

+",2011-12-05 22:44:10.557 +22624,342874,sharataka,"Seattle, WA",,http://my5or5.com,,2013-10-11 22:54:58.860 +8363,1454110,Michael R. Chernick,"Holland, Pennsylvania, United States",,,"

Michael Ross Chernick --- 01-01-2021
+Obituary: Michael Ross Chernick
+Obituaries

+
+

I am a currently retired biostatistician and last worked at the Lankenau Institute for Medical Research where I worked on lab experiments, clinical trials and other medical research. I have a PhD in Statistics from Stanford University. I have published books on bootstrap and biostatistics and have written or coauthored many articles in statistics, mathematics and medical journals. I am an ASA Fellow and have also been a member of ENAR, the IMS, the Bernoulli Society and the Royal Statistical Society. I like teaching and mentoring and doing puzzles (crossword, cryptograms and Sudoku) I used to play chess with my son Daniel but I am getting rusty at it as I age and he started beating me too often. I have been retired since 2012. I was very active at CrossValidated in 2011, but have not contributed from 2012-2016. I resumed activity in November 2016 and am hoping to finish a few books I had abandoned and I hope to get back to doing some part time consulting at some point in the future.

+",2012-05-02 14:04:04.697 +3185,351545,Ziyuan,"Helsinki, Finland",,,"

+",2011-06-03 10:27:04.040 +22592,1231145,Tobias,,,,,2013-10-11 00:04:36.627 +22734,3442797,Vincent M.,,,,,2013-10-15 22:01:34.797 +22639,3236986,Quanquan Liu,,,,"

I'm currently a masters student studying theoretical computer science at MIT.

+",2013-10-12 18:00:06.023 +23245,1086824,FooBar,,,,,2013-11-01 12:48:27.523 +22664,3342950,user31456,,,,,2013-10-13 23:58:31.360 +22634,66756,vstrien,"Amsterdam, Netherlands",,http://techgineer.blogspot.com,"

Business Intelligence developer at Hot ITem, with a background in Computer Science. Love to discover new things in code, platforms and more.

+ +

When I'm not working, I like playing the piano (& hammond organs - I LOVE those), spending time with friends (going out, having some beers, discussing Deep Thoughts about Life And The Universe) and travelling.

+",2013-10-12 12:49:05.660 +5203,1017882,Matt Krause,United States,,,"

I'm a research scientist at the Montreal Neurological Institute and occasionally enjoy helping other people make sense of their data instead of wondering what's going on in my own.

+

My current research focuses on how the brain represents information, particularly visual information, and how we can enhance these representations with electrical stimulation.

+
+

If you've got an interesting project involving biological data, pretty pictures, or lots of words, send me a message!

+",2011-11-06 17:26:45.683 +23051,3263775,jhermann,Germany,,http://jhermann.github.io/,,2013-10-25 21:49:45.753 +22613,1839767,butch,"Seattle, WA",,,,2013-10-11 16:51:33.360 +22964,3412065,perevales,"Valencia, Spain",,,,2013-10-23 10:24:51.593 +1927,285928,ocram,Belgium,,,,2011-02-02 18:31:38.997 +23500,3552309,maverick,,,,,2013-11-07 22:49:32.913 +22082,3356162,Suneyna,,,,,2013-09-26 10:25:49.247 +9522,1626214,Laura,,,,"

PhD in Genetics

+",2012-07-04 13:36:19.217 +22994,3452296,Meenu Anish,,,,,2013-10-23 22:32:01.830 +22381,3395439,ankc,,,,,2013-10-05 10:15:56.870 +23405,3541346,user32370,,,,,2013-11-05 23:00:21.960 +12683,2085422,Scortchi - Reinstate Monica,England,,,"

For anyone wondering why I've changed my user-name: see Reinstate Monica & links. If Stack Exchange think the summary dismissal of an elected moderator pour encourager les autres is going to work, they can think again.

+ +

Not only did SE sack Monica, but they smeared her in the press. To support her legal action against SE's defamation go here.

+",2012-11-27 12:24:58.070 +20463,3126518,Analyst,Finland,,,"

I am working as an consultant for different customers in different industries.

+ +

Most of my work deals with data management.

+",2013-08-02 09:02:39.177 +5045,999705,dimitriy,"Oakland, CA, USA",,,"

Into data before it was big. Keeping the world safe from folks armed with pivot tables and people trained to detect cats at scale. A great man for things that do not yet exist. Contact me if you are working on interesting problems and want to talk to an economist. Shallow learning for the win.

+
+

Please do not e-mail me with questions unless we drank at least one beer together. I am happy to help, but I won't do it over e-mail, LinkedIn, Facebook, Twitter, etc., as I much prefer to have my answer be a public good, accessible to everyone in the future. Moreover, if I am wrong, someone will usually come along to correct me, which is much better for everyone involved. Create a simpler reproducible example on one of the toy data sets that demonstrates the problem you are having, put that up on SO, and then tag me. I will take a look when I get the chance.

+",2011-10-26 15:10:36.547 +23017,299281,faken,"Lisbon, Portugal",,,,2013-10-24 15:58:04.853 +22990,125744,tshepang,Johannesburg,,,"

I do software development for a living and as a hobby. My favorite language is Rust, and I've used Python much in the past. My OS of choice is Debian.

+",2013-10-23 20:14:02.063 +8351,1055943,Alby,,,,,2012-05-01 20:31:23.877 +23501,3247181,omargrojas,,,,,2013-11-07 22:59:48.500 +23172,3511887,eski,Germany,,,,2013-10-30 11:01:36.227 +23246,3523267,kunfu,,,,,2013-11-01 13:01:27.717 +22948,3431775,Gisela,Stockholm,,http://www.giselajonsson.se,"

PhD student in work and organizational psychology, trying to learn some code, mainly Python and R.

+",2013-10-22 21:38:28.457 +22662,3433206,user2876987,,,,,2013-10-13 22:51:18.897 +19043,2910935,Mina,"Golden, CO, United States",,,"

I'm an ecologist with some formal training in statistics, but no resources within my organization to lean on when faced with novel statistical situations. I appreciate all the help I can get.

+",2013-06-17 22:47:44.993 +14402,2307150,user20281,,,,,2013-01-30 16:38:47.730 +22860,3461197,Jamal,,,,,2013-10-19 16:30:46.430 +6384,1175965,lakshmen,America,,http://www.quantgreeks.com,"

Loves Programming and very keen in learning new things...

+
NSMutableArray *skills = @[
+    @"C++",
+    @"Python",
+    @"Objective-C",
+    @"Matlab",
+    @"VBA",
+    @"R",
+];
+
+

Any questions regarding the codes, do feel free to contact me. My Email address can be obtained using this code in MATLAB:

+
s = char(double([99 110 110 108 97 107 115 104 109 101 110 95 50 48 48 48 64 121 97 104 111 111 46 99 111 109]))
+clc; disp(s); disp(' ');
+
+

You can connect with me in Linkedin

+",2012-01-21 19:13:35.057 +23251,1026046,Johannes,,,,"

Stream Ecologist...

+",2013-11-01 13:59:17.820 +23290,151367,Ozair Kafray,"Karachi, Pakistan",,http://www.linkedin.com/in/ozair,"

I am a technology agnostic software systems development professional with more than 15 years of experience in software development - spanning over different programming languages, operating systems, version control systems, geographic locations as part of geographically distributed and culturally diverse teams.

+",2013-11-02 18:19:31.560 +22866,559554,America,"Denver, CO, USA",,,"

Technical Debtor

+",2013-10-20 02:06:17.643 +22918,128038,misterte,Chile,,http://andresbucchi.com,"Regular stuff: +two legs and two arms... +still.",2013-10-22 00:12:09.437 +1693,509642,rolando2,Eastern Massachusetts,,http://www.integrativestatistics.com,"

Statistical and research consultant for clients in a variety of fields: elementary, secondary, and higher education; health care; psychology; business; law; and archaeology.

+ +

Email: Roland@IntegrativeStatistics.com. Websites: IntegrativeStatistics.com, with a more frivolous site at YellowBrickStats.com.

+ +

Hobbies: woodland and swamp photography (rolly.smugmug.com), chess, target archery.

+",2011-01-04 17:43:06.527 +22551,3417506,user31319,,,,,2013-10-10 07:30:16.633 +23169,451822,tutak,,,,"

merge keep

+",2013-10-30 09:32:23.277 +7714,1351766,sksw,,,,,2012-03-30 03:12:20.367 +116224,511865,The_Sympathizer,,,,,2017-04-30 04:45:38.203 +20498,2142798,John Smith,,,,,2013-08-03 14:35:29.180 +22722,1297737,dudu,,,,,2013-10-15 18:05:12.743 +22729,1367178,Baumann,,,,"

R&D professional, software and electrical engineer, [Python | Java | C++ | R | Bash] experienced developer, [Django | JS | DC.js ] learner, [Linux | vim | sed | awk] heavy user.

+ +

Works with metaheuristic optimization algorithms, statistical modeling, multicriteria analysis etc.

+",2013-10-15 19:46:31.140 +132,48509,Rob Hyndman,"Melbourne, Australia",,http://robjhyndman.com,"

Professor of Statistics, Monash University, Australia. I initiated the formation of crossvalidated.com, and was a moderator for the first six months. I still drop by most days and monitor any forecasting or time series questions. More details at robjhyndman.com.

+",2010-07-19 23:05:39.653 +93032,9492346,Maximilian Aigner,Switzerland,,,"

PhD student in statistics.

+",2016-10-22 09:50:01.600 +23005,3169798,Quicker,,,,,2013-10-24 07:46:36.047 +22904,3468221,SamD,,,,,2013-10-21 13:20:33.820 +22527,3414755,Brett Phinney,Davis CA,,http://proteomics.ucdavis.edu,"

Scientist

+",2013-10-09 17:34:17.050 +23445,1762327,scls,"Poitiers, France",,http://www.celles.net/,"

BY DAY: Professeur agrégé (PRAG / Assigned Associate professor) of Applied physics and Computer Sc. at University of Poitiers

+

BY NIGHT: writing some code (Python, Julia, C, C++, VB, C#, Go ...), contribute and or manage some open source libs...

+

FOR FUN: pilot (go-karts, aircrafts, gliders)

+",2013-11-06 20:05:35.927 +23512,3467137,user45022,,,,,2013-11-08 06:42:30.237 +23198,2853200,rdittmer,"Reno, NV",,,,2013-10-31 05:29:05.607 +23523,7840,MattK,United States,,http://www.morelightmorelight.com,I discuss technology with rich people for a living and program for poor people as a hobby.,2013-11-08 13:59:56.963 +20972,1887671,Jamie Bull,"London, UK",,,"

Manager at Baringa Partners LLP, working on climate change transition risk modelling.

+

Also and just for the love of it, I develop websites and bots using Django, Flask, Heroku, AWS and more.

+",2013-08-20 22:52:52.050 +15473,2264085,Randel,,,,,2013-03-06 05:02:37.703 +22730,189441,Mitch,,,,,2013-10-15 20:18:35.927 +45797,5788420,Matthew Bloomfield,,,,,2015-02-14 15:25:04.920 +114,32277,miku,Germany,,,"
+

Each working data pipeline is designed like a log; each broken data pipeline is broken in its own way.

+
+",2010-07-19 21:29:03.000 +60,78549,Neil McGuigan,"Vancouver, Canada",,https://database-patterns.blogspot.com/,"

I'm a Data Scientist (formerly Data Engineer) at a national Canadian bank

+

I have the following certifications:

+
    +
  • AWS Certified Solutions Architect Professional
  • +
  • AWS Certified DevOps Engineer Professional
  • +
  • AWS Certified Machine Learning Specialist
  • +
  • AWS Certified Data Analytics Specialist
  • +
+

Previously, I was a Data Scientist at Ritchie Bros. (market cap $5B)

+

I'm also one of the authors of the book "RapidMiner: Data Mining Use Cases"

+

I taught Data Mining for Management at UBC's graduate business school for a few years, as well as upper-level Economics at Royal Roads University

+

Software Expertise:

+

Statistics & Data Mining: Deep Learning, Python, SageMaker, Tableau

+

Databases: PostgreSQL, MS SQL Server, Spark, Hive. See my blog at https://database-patterns.blogspot.com/

+

I'm good at Java, Python, Bash and SQL

+",2010-07-19 19:25:03.697 +16174,2459654,Andre Silva,"Rio Branco, AC, Brazil",,,"

Interested in database, GIS, programming and statistics.

+
+

+",2013-03-25 00:28:13.523 +2873,190756,Greg Snow,"Utah, United States",,,,2011-05-07 13:44:25.593 +16205,2533377,Manas,,,,,2013-03-25 22:29:05.103 +7341,96512,Aerik,California,,http://aerik.com,"

father, software engineer, surfer

+",2012-03-12 17:31:18.703 +20179,3083475,user28363,,,,,2013-07-23 15:05:57.530 +65,71856,adamo,"Athens, Greece",,http://adamo.wordpress.com,"

I am a DevOps guy living in Greece. Once upon a time I used to run sendmail a lot.

+ +

http://gr.linkedin.com/in/yiorgos

+",2010-07-19 19:27:02.663 +21823,1797669,Arun Jose,"Bangalore, India",,http://www.analyticstraining.in,"

Early stage data scientist working at a product start up focusing on the Internet of things.

+ +

My skills would be 40% statistics, 20% programming, 40% business need estimation and implementation.

+",2013-09-18 03:45:36.353 +17123,46309,Steve,,,,,2013-04-22 06:16:28.820 +23225,368231,Barry Devereux,,,,,2013-10-31 18:40:38.143 +22971,3280768,Alex,"Strasbourg, France",,,,2013-10-23 13:49:39.697 +23388,3509938,Newbie,,,,,2013-11-05 16:42:53.193 +22797,342606,Ruslan,Mauritius,,,,2013-10-17 14:54:44.980 +21382,3258927,wisc88,,,,"

I study animal behavior.

+",2013-09-04 01:06:26.173 +19325,412477,xiaoyao,"Urbana, IL",,http://xiaoyao.im,"

College student

+",2013-06-26 17:13:29.137 +23489,3551510,user32469,,,,,2013-11-07 19:13:51.203 +23421,61964,Tjorriemorrie,South Africa,,,"

I am a self-taught developer: started out with php and switched over to python.

+",2013-11-06 07:55:40.567 +22963,1527879,gogoolplex,LYR,,,,2013-10-23 09:09:58.830 +23279,3526646,SikyS,,,,,2013-11-02 10:37:55.540 +306,509479,htrahdis,,,,,2010-07-27 02:10:19.603 +9095,1557239,D L Dahly,"Cork, Ireland",,http://dantalus.github.io,"

Epidemiologist and Statistician. Public health nutrition and food security. Supporting open science.

+ +

Blog

+ +

On Twitter

+",2012-06-11 00:16:45.320 +22628,3427844,user31409,,,,,2013-10-12 03:48:42.717 +20953,3197172,Vincent Guillemot,"Paris, France",,http://NA,,2013-08-20 13:36:55.400 +10469,453661,sds,United States,,http://sds.podval.org,"

Math, Data Science, History...

+",2012-08-23 18:25:00.690 +22689,111073,Thieme Hennis,"Amsterdam, The Netherlands",,http://hennis.nl,"

Social entrepreneur from Amsterdam, working on space farming and circular neighborhoods.

+ +

Previously: PhD student @ TU Delft (the Netherlands)

+",2013-10-14 21:31:04.133 +1809,282850,SabreWolfy,,,,,2011-01-18 11:01:51.150 +23410,3541624,user32375,,,,,2013-11-06 00:52:44.453 +23545,260404,ekangas,Virginia,,http://www.linkedin.com/in/ekangas,"

I have worked with Java, Python, Linux/Bash scripting, Hadoop, WMS, automation, query language filters/autocomplete, web services and Tomcat.

+ +

Currently learning C++ via programming problems. on http://uva.onlinejudge.org +Here's my record on there: http://uhunt.felix-halim.net/id/27788

+",2013-11-08 23:19:19.877 +23143,80153,Doug Paul,"Baltimore, MD",,,,2013-10-29 17:44:07.003 +22671,3068082,user 31466,,,,,2013-10-14 07:07:18.920 +23263,3525126,user32197,,,,,2013-11-01 21:16:59.367 +19750,1921338,Josh,,,,"

Engineering manager at a mid-sized tech company

+",2013-07-09 22:06:15.570 +22793,1463984,AdrienNK,,,,,2013-10-17 10:49:49.320 +22771,3446978,Ben Elizabeth Ward,,,,,2013-10-16 20:08:50.197 +20927,3193385,user29309,,,,,2013-08-19 16:35:15.363 +11772,365440,Emile,,,,,2012-10-22 12:12:33.970 +22604,2649623,Bob,,,,,2013-10-11 10:31:17.360 +8671,276909,user570593,,,,,2012-05-18 16:18:31.163 +23458,3547377,user32430,,,,,2013-11-07 01:54:07.043 +8719,1473300,Eekhoorn,Germany,,,"

I would like to have git for my life.

+",2012-05-21 19:16:27.983 +20667,1106092,Martin Drozdik,"Vienna, Austria",,http://www.martindrozdik.com,"

Programmer, mathematician.

+",2013-08-09 15:48:34.627 +19463,2992361,terauser,,,,,2013-07-01 16:04:28.050 +9886,294576,repied2,,,,,2012-07-25 09:18:57.253 +23159,1893836,MatriXanger,,,,,2013-10-30 02:17:30.940 +23152,1092093,Rodrigo,Brazil,,,"

Programmer and biologist, I'd love to leave the world a little better than I've found it.

+",2013-10-29 21:00:45.317 +23136,3507361,transfemer,,,,,2013-10-29 14:38:39.240 +22841,3458084,Joana,,,,,2013-10-18 19:12:28.563 +11383,511570,Davide Giraudo,"Strasbourg, France",,https://irma.math.unistra.fr/~giraudo/home.html,,2012-10-05 11:28:31.760 +15044,2393508,curious_cat,"Dordrecht, Netherlands",,,,2013-02-20 20:03:35.797 +23048,3491813,easy54123f,,,,,2013-10-25 19:19:09.190 +2958,393422,cbeleites unhappy with SX,Germany,,,"

Analytical chemist, vibrational spectroscopist and chemometrician.

+ +

I do professional chemometrics consulting and training as well as chemometric method development and coding:

+ +

Claudia Beleites
+Chemometrix GmbH
+Södeler Weg 19
+61200 Wölfersheim
+Germany

+ +

phone: +49 (15 23) 1 83 74 18
+USt-ID: DE306376280 +email: claudia (dot) beleites (at) chemometrix (dot) gmbh

+ +
+ +

I maintain the R package hyperSpec for handling spectroscopic data sets in R.

+",2011-05-13 10:02:56.203 +23068,3049222,Zhiwei,,,,,2013-10-27 07:18:57.573 +22611,3425926,Larry,,,,,2013-10-11 16:00:23.980 +23025,199535,jlim,California,,http://stackoverflow.com/users/444918/jlim,,2013-10-25 04:57:56.030 +23086,2431235,fede_luppi,,,,,2013-10-28 03:02:53.320 +22919,3470785,Gabby,,,,,2013-10-22 00:32:26.990 +22892,3466120,mickles,,,,,2013-10-21 02:45:40.427 +232,88882,Aniko,"Milwaukee, WI",,,,2010-07-26 17:02:09.363 +7155,1189721,Jessica Collins,"Seattle, WA, USA",,,"

Everyone needs a hobby.

+",2012-03-02 18:18:29.770 +23140,3508188,Louise,,,,,2013-10-29 17:21:38.730 +22772,3447535,David Anderson,,,,,2013-10-16 20:18:28.967 +19089,357438,owensmartin,"San Francisco, CA",,,"

All theory and no action, but working on it.

+",2013-06-19 07:21:38.273 +23027,2030413,Frida,,,,,2013-10-25 06:49:18.120 +22911,3469440,Durjar,,,,,2013-10-21 17:36:12.943 +7739,1369736,Julie,,,,,2012-03-31 17:15:11.023 +23460,2524281,reviewer3,,,,,2013-11-07 02:12:37.020 +23250,3523455,Samuel,,,,,2013-11-01 13:57:37.697 +6136,1153254,user1134516,,,,,2012-01-06 15:39:48.433 +22738,2695830,opt,,,,,2013-10-15 23:17:56.780 +22798,3451434,Monica Perez,,,,,2013-10-17 14:57:33.620 +21958,3340278,Erosennin,,,,"

humble fan of statistics and statistical programming searching for new knowledge

+",2013-09-23 09:59:12.650 +20603,3142953,lejlot,"London, United Kingdom",,http://wojciechczarnecki.com,"

Machine learning scientist, mostly tracking the maching-learning tag, proud first holder of .

+

For further details and contact information please visit my personal webpage.

+",2013-08-07 14:31:40.773 +22619,1333502,Ellis Whitehead,Germany,,,,2013-10-11 19:52:46.210 +23391,464115,MrPatterns,,,,,2013-11-05 17:55:22.897 +23195,3516682,younglau,,,,,2013-10-31 03:26:50.223 +23043,3339580,user2806134,,,,,2013-10-25 16:19:26.560 +22901,1559438,AbeeCrombie,"Miami, FL, United States",,,,2013-10-21 11:49:59.567 +5179,844516,Xi'an,"Paris, France",,http://xianblog.wordpress.com,"

I am a professor of Statistics at Université Paris Dauphine (PSL), Paris, France, since 2000, and at University of Warwick, Coventry, United Kingdom, since 2013, a researcher at CREST, Paris-Saclay University, since 1992, a senior member of the Institut Universitaire de France (IUF), between 2010 and 2021, and a pr[AI]rie chair since 2019. My research is focusing on Bayesian statistics, numerical probability applied to simulation methods, and statistical inference with an interest mostly in genetics and astronomy applications. I maintain a blog on academic and un-academic topics since 2008.

+

0 credit answers:

+ +

Warning: As an Amazon Associate I earn from qualifying purchases made though the links I make to books on that platform.

+",2011-11-05 07:56:15.903 +1298,235669,James Howard,"Columbia, Maryland",,http://jameshoward.us,,2010-11-10 01:57:37.960 +17326,528247,joshlk,"London, United Kingdom",,,"

Im currently a Data Scientist working with social media and network data

+",2013-04-26 17:26:15.883 +22144,2124884,SZS,,,,,2013-09-28 05:21:47.640 +22959,3477549,Bigbang,,,,,2013-10-23 08:14:12.760 +22949,3475894,Lams,,,,,2013-10-22 22:30:13.947 +22698,1429723,user1352399,,,,,2013-10-15 01:55:03.623 +23165,3510464,Gowri,,,,,2013-10-30 05:05:39.567 +18268,1512502,ivan-k,"Calgary, Canada",,,,2013-05-24 04:26:37.917 +22799,1882894,Adam,"Amsterdam, Netherlands",,,"

I’m new to everything!

+",2013-10-17 16:08:26.443 +9408,1609799,Blain Waan,The Earth,,http://stats.stackexchange.com/users/12603/blain-waan,"

Learner.

+",2012-06-28 15:59:45.413 +22940,349820,Sudh,,,,,2013-10-22 17:27:55.973 +23107,3369828,Doubting,,,,,2013-10-28 16:03:29.247 +23223,3520000,Martin R.,,,,,2013-10-31 17:58:29.983 +21886,3325010,user95691,,,,,2013-09-19 23:02:37.077 +23208,3518153,CSands,,,,,2013-10-31 10:51:28.807 +22567,75211,cschmidt,,,http://predictobot.com,,2013-10-10 15:07:14.570 +22927,2799946,Leo Zhang,,,,,2013-10-22 07:51:55.863 +23418,3542524,user32384,,,,,2013-11-06 06:33:28.320 +10448,1249453,broccoli,,,http://bayesianthink.blogspot.com/2012/12/the-best-books-to-learn-probability.html,"

Statistician,Machine Learning,Data Mining,R,Perl. +I maintain a blog listing out probability puzzles with the intention of spreading a simplistic understanding of the science of probability.

+",2012-08-22 21:55:33.107 +23203,3517688,Emanuele Raineri,,,,,2013-10-31 09:02:59.283 +23475,152619,HypersonicNinja,UK,,,"

I like cake! ;-D

+",2013-11-07 13:37:33.847 +22641,3429941,CaperKen,,,,,2013-10-12 18:47:43.087 +23125,1674890,enricoferrero,United Kingdom,,https://uk.linkedin.com/in/enricoferrero,,2013-10-29 08:58:14.480 +23259,2498805,M Silva,Lisbon,,,,2013-11-01 17:33:56.290 +14811,1686321,Rob Donnelly,,,,,2013-02-13 04:32:47.853 +15293,2427360,Sweetbabyjesus,,,,"

Mediocre for life.

+",2013-02-28 09:38:56.763 +23015,2589267,Luis Alexandre Rodrigues,,,,,2013-10-24 13:51:32.777 +22925,3471816,KF Harlock,,,,,2013-10-22 06:51:41.413 +1790,106330,Amelio Vazquez-Reina,,,https://www.linkedin.com/in/amelio/,"

I'm passionate about people, technology and research.

+

Some of my favorite quotes:

+
    +
  • "Far better an approximate answer to the right question than an exact answer to the wrong question" -- J. Tukey, 1962.
  • +
  • "Your title makes you a manager, your people make you a leader" -- Donna Dubinsky, quoted in "Trillion Dollar Coach", 2019.
  • +
+",2011-01-16 02:13:53.420 +23482,2840495,CArnold,,,,,2013-11-07 16:57:11.747 +23360,3536634,rrruss,,,,,2013-11-05 02:23:54.413 +19455,2353432,Leeor,,,http://il.linkedin.com/pub/leeor-langer/50/615/14b,,2013-07-01 08:32:10.657 +22553,504700,adeandrade,Toronto,,,,2013-10-10 08:09:00.713 +23321,1722947,gpc,,,,,2013-11-04 03:41:59.057 +5213,85655,Ruggero Turra,Italy,,http://turra.web.cern.ch,,2011-11-07 10:14:35.143 +23028,3489242,ConfEco,,,,,2013-10-25 09:28:08.600 +23417,1322642,Siddharth,Singapore,,https://sidd.io,,2013-11-06 05:17:26.550 +23309,3530909,user32257,,,,,2013-11-03 18:56:00.920 +22706,3439402,Felix Bach,,,,,2013-10-15 08:17:27.980 +22547,2651451,Andy Clifton,Germany,,http://nope,"

Happy hacker in Matlab, R, and LaTeX. Deeply interested in the challenges of renewable energy R&D, knowledge management, and technology transfer.

+",2013-10-10 05:13:41.783 +23085,3499397,Greenparker,"Kanpur, Uttar Pradesh, India",,https://dvats.github.io/,"

Generally interested most in problems on MCMC and Bayesian computation. Also interested in all other statistics.

+",2013-10-28 02:13:22.950 +19377,2933974,QuantIbex,,,,,2013-06-27 22:37:39.953 +20120,3074805,jona,,,,,2013-07-21 15:34:45.770 +13403,1461726,user1375871,,,,,2012-12-25 16:22:37.303 +12900,1159530,vzn,U.S.,,http://vzn1.wordpress.com/,"
+

The algorithm designer who does not run experiments risks becoming lost in abstraction. —Sedgewick

+
+ + +",2012-12-05 05:56:23.580 +5234,412767,Poik,,,,"

In the Spring of 2014, I graduated with a B.S. in Computer Engineering and a B.S. in Mathematics. I am trying to get into grad school for neuroscience with an application in Brain-Computer Interfacing. Job experience includes cryptanalysis, malware reverse engineering, and neural nets. I'll edit this as I think of something more interesting to put.

+ +

SOreadytohelp

+",2011-11-09 01:34:35.567 +23160,1455119,JuJoDi,"Detroit, MI, USA",,https://nl.longpressed.com,"

I'm a noob

+",2013-10-30 02:28:39.930 +23502,281416,Ivan Xiao,"Santa Clara, CA, United States",,http://ivanxiao.com,"

Humble Coder.

+",2013-11-07 23:01:25.087 +23336,3534223,user32289,,,,,2013-11-04 14:44:30.250 +19265,2939936,Felix,,,,,2013-06-24 22:18:20.480 +15663,2247995,Lucas Morin,France,,,"

Data & Model Audit @ SGCIB

+

Kaggle GM (Notebooks)

+

Mostly working in Python. Also decent in R/VBA.

+

LinkedIn, Github, Kaggle

+",2013-03-11 10:40:53.923 +22967,2281989,RonRich,,,,,2013-10-23 11:31:34.183 +22163,1979877,Cookie Monster,"Norrkoping, Sweden",,http://fredrik.bonander@facebook.se,"

Doctor in political science

+",2013-09-29 08:35:06.407 +36513,4910845,dobiwan,,,,,2014-08-19 11:47:35.347 +21624,3112279,ChangeMyName,,,,,2013-09-11 22:25:09.907 +23503,2746829,Markus Shepherd,World,,https://recommend.games/,"

It is Easier to Ask for Forgiveness than Permission

+",2013-11-07 23:29:19.207 +22985,1670427,NoDataDumpNoContribution,Germany,,,,2013-10-23 18:45:10.130 +23121,127970,Kartik,Canada,,http://kartikt.com,,2013-10-29 05:47:14.130 +10450,1774237,pat,,,,,2012-08-22 23:27:45.250 +22726,3442210,kjenrkenrj,,,,,2013-10-15 19:10:55.417 +23286,1332906,Christopher Moore,,,http://blog.lib.umn.edu/moor0554/canoemoore/,,2013-11-02 15:21:55.360 +12808,1359034,Zoran,DE,,,,2012-12-01 19:14:53.863 +23378,3539035,Ilya,,,,,2013-11-05 14:15:25.160 +23055,2234377,Ryan Walker,United States,,https://rwalk.xyz,,2013-10-26 02:32:39.970 +16588,2410610,ndoogan,,,,"

no longer a post-doc

+",2013-04-06 18:54:58.670 +6404,1152914,HCAI,,,,,2012-01-23 10:08:01.530 +22794,379318,Jack Kelly,"London, United Kingdom",,http://jack-kelly.com,"

Software developer interested in energy efficiency & green tech

+",2013-10-17 11:23:30.407 +23059,458112,Dudus,"Nairobi, Kenya",,,"

I tinker around Java,C, C++, Scripting languages(Python, Php, Javascript, Css), Computer Networks and Network Devices,

+",2013-10-26 13:00:13.083 +22944,3475168,Sue,,,,,2013-10-22 19:18:45.377 +23190,118800,Davis King,,,http://dlib.net,,2013-10-30 20:48:35.280 +22875,3464246,Pat W.,U.S.,,,"

+ +

My research interests are economics and law—especially security and corporate competition. I typically use r for empirical work.

+",2013-10-20 14:57:23.973 +22945,3353093,user2816711,"Paris, France",,,,2013-10-22 19:26:34.633 +23057,3493663,I.M.,"Lund, Sweden",,,,2013-10-26 09:27:59.537 +14110,1694363,pitfall,,,,,2013-01-22 14:29:11.993 +2069,509706,Behacad,,,,,2011-02-15 18:39:47.523 +23323,3532146,indu mann,,,,"

PhD (Economics) and Central Banker

+",2013-11-04 03:56:42.793 +22690,400389,Trein,Canada,,https://github.com/trein,"

#SOreadytohelp

+",2013-10-14 21:44:46.273 +23012,1493593,Mohamed Adel Elareef,,,,,2013-10-24 12:18:09.043 +16644,2608372,soakley,,,,"

Practitioner

+",2013-04-08 19:25:21.483 +23518,57767,Dzmitry Lahoda,,,http://lahoda.pro,"

http://asd-and-rizzo.github.io/Dzmitry_Lahoda.html

+",2013-11-08 09:27:13.313 +18040,1092887,emhart,"Cupertino, CA",,http://emhart.info,"

I'm a data scientist at a fruit vendor in Cupertino

+",2013-05-17 19:21:25.107 +22891,3465952,Agus Dwikarna,,,,,2013-10-21 01:29:52.313 +23530,440425,Jonatas Eduardo,,,,,2013-11-08 18:37:29.527 +10579,1785716,Sven Hohenstein,"Leipzig, Germany",,,"

R user since 2005.

+ +

Interested in statistics, data analysis, and visualization.

+ +

Author of the remef package for R.

+ +
+ +

+ +

+",2012-08-29 09:54:12.887 +23120,3505009,user2930721,,,,,2013-10-29 05:41:00.153 +22995,2716594,user2343837,,,,,2013-10-23 23:04:28.010 +20249,553005,bitcyber,"Nice, France",,http://www.lagrangianpoint.net,"

Primarily a C++ and Python developer, intrigued by digital signal processing, particularly for Software Defined Radio (SDR), an active embedded electronics designer, interested in statistics for clinical studies and lastly into website (drupal) development.

+",2013-07-25 10:42:47.613 +22863,3171908,David Alisha,,,,,2013-10-19 19:49:46.927 +22808,2376671,Collin,"Seattle, WA, USA",,,,2013-10-17 21:59:48.017 +22953,1962547,gator,"Niagara Falls, Canada",,,,2013-10-23 02:42:58.693 +5987,510887,Brian Borchers,"Socorro, NM",,http://euler.nmt.edu/~brian/,"

I'm a mathematics professor at New Mexico Tech.

+",2011-12-26 19:07:30.063 +23395,2906613,tony_tiger,,,,,2013-11-05 19:08:44.637 +64247,3110891,Bernie2436,,,,,2015-11-20 03:34:23.990 +16159,2538419,EngrStudent,Midwest USA,,http://mathbaby.blogspot.com/,"

Christian.

+

Dad of daughters (14yr, 10yr, 4yr, 3yr)

+

Husband. (20.7 years) Married to my best friend and an amazing person.

+

Nerd. Former president of Math club. Presented at MathFEST. +Mechanical Engineer. Scientist.

+

Grew up in the land of rednecks, cactus, and tarantulas. Now I live in the land of soybeans, corn, and car-racing.

+

Nearly no free time, so posting is likely to have intermittency.

+",2013-03-24 16:01:45.153 +190,146757,robin girard,France,,http://www.mines-paristech.fr/Services/Annuaire/&?id=8828,"

Reasearcher in applications of mathematics and statistics in the field of renewable energy.
+Also on Mathoverflow

+

Web site : https://www.energy-alternatives.eu/

+",2010-07-20 12:52:54.140 +15539,1137454,user,,,,,2013-03-07 16:38:08.097 +23353,3292099,Betterthan Kwora,"Seattle, WA, USA",,,"

“People say nothing is impossible, but I do nothing every day.” - Winnie the Pooh

+",2013-11-04 23:15:13.420 +23238,3149307,Ray Xiao,California,,,,2013-11-01 03:56:39.810 +9456,1615191,user34790,,,,,2012-06-30 14:25:34.830 +23490,3332303,jxn,San Francisco Bay Area,,,,2013-11-07 19:52:48.533 +23368,3279320,Senti Bachcha,,,,,2013-11-05 08:21:10.873 +23072,3497397,user31960,,,,,2013-10-27 13:40:53.287 +23412,3541648,Moose,,,,,2013-11-06 01:01:02.880 +23431,3310722,Tarek ,,,,,2013-11-06 13:29:17.870 +10964,1202011,user1172468,,,,,2012-09-18 01:29:02.363 +7860,1162200,Gabriel,,,,,2012-04-05 22:15:28.070 +23219,1535785,lp2437,,,,,2013-10-31 15:26:28.863 +23016,20447,miles82,Croatia,,,,2013-10-24 14:59:38.657 +22879,3312030,F.F.,,,,,2013-10-20 17:15:38.597 +23322,3532170,benjamin,,,,,2013-11-04 03:44:35.610 +11886,184332,san8055,,,,,2012-10-26 06:20:37.067 +14874,2369713,PeGre,,,,,2013-02-15 07:09:28.007 +23124,2963929,shvahabi,Earth,,,"

A Scala learner and blockchain enthusiast

+",2013-10-29 08:30:10.770 +22545,3416713,Frank,,,,,2013-10-10 04:00:27.487 +23332,3533419,user32284,"New Delhi, India",,,"

I am an enthusiastic & passionate professional of Business Analytics & Data Mining. I love to play with data & to bring useful insight out of it. I love reading all the current trends & innovation in Business Analytics & Data Mining.

+",2013-11-04 11:20:43.213 +15321,391958,Ankit,"Vancouver, Canada",,http://ankit-gupta.com,"

Interests: Distributed Systems, Visual Analytics, Information Visualization and HCI

+

https://github.com/gupta-ankit

+",2013-02-28 23:33:11.137 +2075,108788,user3269,,,,,2011-02-16 04:32:37.117 +22668,3283593,fcorowe,,,,,2013-10-14 04:37:33.117 +20367,1661631,hatch22,,,,,2013-07-30 00:22:08.517 +22620,3426841,SniperBro2000,,,,,2013-10-11 19:58:49.263 +22853,3460276,teddypicker,,,,"

a candidate of a statistician

+",2013-10-19 10:57:00.187 +23063,3494891,Edgar Ferrer,,,,"

Political science grad student.

+",2013-10-26 17:18:39.920 +128628,11521300,Ben,"Canberra, Australia",,,"

SE: Please reinstate and apologise to Monica Cellio

+
+

Research Fellow - Australian National University (ANU)

+

Honourary Lecturer - University of New South Wales (UNSW)

+

Data Scientist and Statistical Consultant

+
+

I enjoy theoretical and applied statistics, data science, machine learning and game theory. My PhD was in Bayesian statistics but I have done a lot of teaching in classical methods. I enjoy programming and analysis in R, and love making models and pretty graphs. I also have broader expertise in economics, philosophy and law.

+",2017-08-10 03:27:26.793 +23062,2654893,Jer227,,,,,2013-10-26 14:13:47.077 +21728,1328628,JackReacher,,,,"

Mathematics student from Connecticut. Thanks to everyone sharing their knowledge with me to help me learn.

+",2013-09-15 10:41:42.113 +22997,3481971,Remy,,,,,2013-10-24 01:25:36.453 +5906,1119174,Roji,,,,"

Phd candidate in Industrial engineering

+",2011-12-20 12:53:08.483 +23357,3481143,user2913161,,,,,2013-11-05 00:35:09.893 +22713,3441044,HMLAZIO,,,,,2013-10-15 14:44:13.583 +21778,3312378,Eric C,,,,,2013-09-16 22:19:43.457 +22740,1703550,B.Mr.W.,"Denver, CO",,http://datafireball.com,"

SOreadytohelp

+ +

I am a business data analyst who use R and Python.

+ +

Started recently learning Apache Spark.

+ +

I am a firm believer of open source software.

+",2013-10-15 23:40:55.433 +10060,1643640,Penguin_Knight,,,,"

Hello! When I'm not catching fishies I love to hack into the Antarctic research base's wifi, checking sexy profiles on Hatch.com and occasionally come here and answer questions.

+",2012-08-02 18:48:06.957 +22947,2506351,user2179977,,,,,2013-10-22 21:25:36.867 +16452,1986689,alco,,,,,2013-04-02 14:35:09.623 +23135,54941,thaddeusmt,"Salt Lake City, UT, United States",,http://www.chilipepperdesign.com,"

I am a web developer and outdoor enthusiast. I mostly do HTML/CSS/JavaScript and PHP/MySql/MongoDB (on Apache/Nginx), but have dabbled in Java, .NET, Ruby, et al. I've done a lot of work with social APIs (Facebook, Twitter, Instagram), Drupal, WordPress, Magento, and the Yii framework, and also do server operations and administration. The phrase ""full stack"" comes to mind.

+ +

I am currently the Chief Software Architect at http://splashlabsocial.com (built on Yii), where we build custom social media applications on top of our data platform.

+",2013-10-29 14:36:53.977 +22683,2043113,appVenture,,,,,2013-10-14 18:55:35.420 +3868,470441,wvguy8258,,,,,2011-08-04 02:24:22.913 +22670,3224218,maycca,Bavaria,,,"

I am interested how forest dynamic changes under different forest management practices, and how it affects disturbances risk.

+",2013-10-14 07:04:23.090 +22849,3369528,IMBA,,,,,2013-10-18 22:59:02.413 +22767,3446430,Narj,,,,,2013-10-16 18:46:05.790 +261,12710,John D. Cook,Houston,,https://www.johndcook.com,"

I'm an applied mathematician and software developer working as an independent consultant.

+ +

You can find all my contact info at https://www.johndcook.com/blog/contact/

+",2010-07-26 20:00:03.430 +7927,1393887,wcampbell,"Dallas, TX, United States",,http://practicalrvideos.blogspot.com/,"

I work in finance in Dallas, TX. Contact me for more details.

+ +

I have an R video blog: practicalrvideos.blogspot.com/. Check it out!

+",2012-04-10 12:42:59.353 +22562,3419035,pin_gu,,,,,2013-10-10 13:11:58.220 +22816,3453738,Karan Malhotra,,,,,2013-10-18 01:17:48.760 +22667,2257935,user1988705,,,,,2013-10-14 01:17:13.023 +23174,2036099,Marta Karas,"Baltimore, MD, United States",,http://www.linkedin.com/in/martakaras,"

I am a graduate student in Biostatistics at JH SPH. +R and Python user.

+",2013-10-30 14:53:31.883 +23191,3515858,atwell17,,,,,2013-10-30 22:12:17.233 +2915,163635,golobor,,,,,2011-05-10 18:23:20.823 +11656,1529456,user67275,"Seoul, Republic of Korea",,,"

Statistician.

+",2012-10-16 18:28:01.537 +22454,3404906,user31197,,,,,2013-10-07 20:50:53.017 +4499,346500,zx8754,"London, United Kingdom",,http://www.icr.ac.uk/,"

Analysis - Data - Analysis - Data - Volleyball

+

r

+ + + +",2011-09-23 12:00:43.570 +23180,3514459,Giancarlo,,,,,2013-10-30 16:41:10.440 +22868,3462702,user31703,,,,,2013-10-20 03:31:09.140 +12140,526714,OutputLogic,California,,http://outputlogic.com,OutputLogic.com : tools that improve productivity,2012-11-06 18:38:35.947 +22920,319353,AlbeyAmakiir,Australia,,http://www.albeyonline.com,"

Blue-haired, enbie developer. I make sure to wear odd-socks every day, and my pronouns are they/them.

+",2013-10-22 01:53:50.270 +22505,3412676,abc,,,,,2013-10-09 10:06:51.963 +21842,3321502,user30440,,,,,2013-09-18 17:29:07.947 +21638,3293801,M. Berk,,,,,2013-09-12 06:50:32.417 +22747,3444015,user2885229,,,,,2013-10-16 06:34:31.443 +23351,2930101,notwhereuareat,,,,,2013-11-04 22:21:45.583 +8888,166892,Sentry,Germany,,,,2012-05-31 13:03:00.317 +22340,3389272,Bob Hopez ,,,,,2013-10-03 23:33:51.193 +23361,3217923,Ish Yadav,,,,,2013-11-05 03:45:59.013 +22449,3404340,user31189,,,,,2013-10-07 18:18:21.910 +5374,1050593,Luigi,"Naples, Italy",,,,2011-11-18 21:45:53.290 +22546,3416854,JTH,USA,,,,2013-10-10 04:49:08.283 +23115,3303354,guillefix,,,,"

I'm currently studying physics at Oxford University, and also doing a lot of self-study!

+",2013-10-29 00:00:44.980 +22806,3452913,Yousef,,,,,2013-10-17 20:22:25.147 +728,52354,Tim,,,,"

Elitists are oppressive, anti-intellectual, ultra-conservative, and cancerous to the society, environment, and humanity. Please help make Stack Exchange a better place. Expose elite supremacy, elitist brutality, and moderation injustice to https://stackoverflow.com/contact (complicit community managers), in comments, to meta, outside Stack Exchange, and by legal actions. Push back and don't let them normalize their behaviors. Changes always happen from the bottom up. Thank you very much!

+

Just a curious self learner. Almost always upvote replies. Thanks for enlightenment! Meanwhile,

+
    +
  • Corruption and abuses have been rampantly coming from elitists.
  • +
  • Supportive comments have been removed and attacks are kept to control the direction of discourse. Outright vicious comments have been removed only to conceal atrocities.
  • +
  • Systematic discrimination has been made into policies.
  • +
  • Countless users have been harassed, persecuted, and suffocated.
  • +
+

Q&A sites are for everyone to learn and grow, not for elitists to indulge abusive oppression, and cover up for each other.

+ +",2010-08-19 15:31:09.537 +22884,2415694,Chris,Korea,,,"

A grad student from Austria - Physics

+",2013-10-20 21:10:43.787 +22955,3280104,Mudit,"Kolkata, India",,http://aankra.com,"

Just started using R. Lots of help needed from R community :)

+",2013-10-23 04:34:16.640 +1150,129510,Indolering,"Seattle, WA",,http://www.indolering.com,,2010-10-24 05:14:51.273 +22742,46841,IlliakaillI,"Vancouver, BC, Canada",,,"

Dedicated Senior Software Architect with 13 years of full development life cycle experience from architecture through test and deployment. Result-oriented team leader ready to successfully deliver software projects on time and on budget. Effective problem-solver able to provide superb customer support and connect the business and technical sides. Experienced in mentoring and motivating a distributed team to excellent performance.

+",2013-10-16 02:51:31.503 +23144,3302591,user94759,,,,,2013-10-29 17:49:00.377 +21108,1058549,NotGaeL,"Enschede, Netherlands",,http://github.com/elcodedocle,"

None of the opinions expressed on the content I post here necessarily reflect the opinions of my employers (or my current ones). The dumb ones are probably sarcastic.

+

https://abstrusegoose.com/249

+",2013-08-25 07:52:34.017 +21576,3284824,Edward Tong,"New York, United States",,http://uk.linkedin.com/in/edtong,"

https://www.linkedin.com/in/edtong

+

https://scholar.google.co.uk/citations?user=tJEOjOQAAAAJ

+",2013-09-10 10:48:24.190 +22724,3442012,john,,,,,2013-10-15 18:23:29.187 +23324,1203691,Adam R. Grey,"CT, United States",,,"

I believe that the stack exchange network is, or at least can and should be, the one glorious place in all the universe where The Rule is ""Answer. The. Question."" But you should also try and set people on the right path, too.

+",2013-11-04 04:23:11.983 +1501,66096,Uri Cohen,,,https://uricohen.github.io/,"

Working on problems in the theory of computing using neural networks. +A retired software engineer.

+

A post-doctorate scholar at the Computational and Biological Learning lab at Cambridge University, UK. Previously, did a PhD in Computational Neuroscience at the Hebrew University of Jerusalem, Israel.

+",2010-12-08 06:43:20.017 +23404,3535727,Jules,,,,,2013-11-05 22:38:20.477 +22850,3458891,Michael W.,,,,,2013-10-18 23:23:26.417 +23521,2624532,fvfaleiro,,,,,2013-11-08 13:03:40.430 +23242,2485578,jack,,,,,2013-11-01 08:20:59.267 +18085,2079278,DUWUDA,,,,,2013-05-19 13:31:29.883 +15209,2374553,Hugo,,,,,2013-02-26 07:04:37.093 +14850,1923271,sqrt,,,,,2013-02-14 10:51:20.600 +22574,2671570,user2308869,,,,,2013-10-10 17:35:19.073 +2071,7637,Dima,,,,"

All comments and opinions expressed on StackOverflow and other StackExchange sites are mine alone and do not necessarily reflect those of my employers, past or present.

+",2011-02-15 23:38:39.183 +2690,328885,Learner,,,,,2011-04-22 16:31:32.630 +23118,349350,Kotaro,Singapore,,,,2013-10-29 04:30:24.413 +22998,2223862,feik,,,,,2013-10-24 02:07:24.653 +22847,481218,defvol,Mexico,,https://sector-f.net/,"

A.K.A. [rod\w+]; working on deep emojis and sci-fi algorithms on caffeine; guitar for The Neon Synthwave Post-metal Band. Go Nyancats.

+",2013-10-18 21:15:22.643 +23455,3547017,Charlie,,,,,2013-11-06 23:43:23.503 +23308,3530660,user32256,,,,,2013-11-03 17:24:26.470 +11155,1867724,user1690846,,,,,2012-09-26 08:36:22.010 +23113,3503935,Luke,,,,,2013-10-28 22:34:32.070 +23040,47213,Rick,"Toledo, OH",,,"

Have experience with large SQL tables. Major focus with T-SQL, .Net, c#, MVC, data warehousing and SSAS.

+ +

I am an Apple fanboy and love technology.

+",2013-10-25 14:48:38.917 +23437,174465,Druska,"Toronto, Canada",,http://davedruska.com,,2013-11-06 16:30:29.063 +22815,3453665,Jdodds,,,,,2013-10-18 00:37:48.237 +633,35314,Neil G,,,https://mila.quebec/en/person/neil-girdhar/,"

Interested in machine learning and Python.

+",2010-08-10 22:28:36.470 +22426,3401068,Neodyme,,,,,2013-10-07 03:40:04.763 +22813,1469051,Max Radin,"Cambridge, MA, USA",,http://www.maxradin.com,"

Quantum software engineer at Zapata Computing with a background in first-principles and mesoscale modeling of battery materials.

+",2013-10-17 23:51:13.920 +23466,3548481,user32442,,,,,2013-11-07 08:19:01.863 +4,54503,Shane,"New York, NY",,http://www.statalgo.com,"

Quantitative researcher focusing on statistics and machine learning methods in finance. Primarily use R, C++, Python, various databases (including OneTick and KDB), and LaTeX on a daily basis.

+ + +",2010-07-19 19:03:57.227 +23157,3216128,Eric,,,,,2013-10-30 00:54:14.277 +2765,381939,NRH,"Copenhagen, Denmark",,,"

Researcher, broadly interested in probability theory, mathematical, computational and applied statistics.

+",2011-04-28 17:36:28.327 +9483,169656,Franck Dernoncourt,,,http://www.francky.me,"

+",2012-07-02 22:18:36.513 +22669,3434351,thigger,,,,,2013-10-14 05:54:45.600 +23342,155841,zebediah49,,,,,2013-11-04 18:02:11.130 +22803,2842085,ShikharDua,,,,"

I am professional in Data Science field with almost 3 years of experience in Business Analytics

+",2013-10-17 19:16:23.737 +22851,3137654,saccades,,,,"

saccades saccade

+",2013-10-19 02:42:40.443 +23496,3552120,Stefan,Canada,,,"

Biologist and researcher focusing on applied research questions in forestry, forest reclamation and ecophysiology.

+",2013-11-07 21:50:35.707 +23240,3473872,Yoam,Singapore,,,,2013-11-01 05:11:33.423 +23264,3525187,Sleepingsheep,,,,,2013-11-01 21:36:46.550 +14172,2280002,Nikita Kuznetsov,,,,,2013-01-23 20:37:50.500 +8386,1140730,Adam Bailey,"London, United Kingdom",,http://www.economicdroplets.com,"

An accountant by profession, I have a broad interest in finance and economics, and in maths and statistics relevant to those areas. I have degrees in economics, philosophy, business administration and (most recently) environmental economics. My website is a blog on environmental and natural resource economics.

+ +

As a hobby, I am also interested in number theory and geometry.

+",2012-05-03 12:01:03.713 +22757,3402323,Gred,,,,,2013-10-16 14:45:57.437 +18690,1241088,Snowflake,,,,,2013-06-06 10:40:26.117 +9716,430906,Fernando,,,,,2012-07-15 04:58:18.477 +23377,3539017,Heidegger,,,,,2013-11-05 14:10:31.143 +22733,1042302,Vikas Kawadia,"San Francisco, CA, United States",,https://www.linkedin.com/in/vkawadia,,2013-10-15 20:56:43.840 +22882,3465108,Paka,,,,,2013-10-20 19:37:55.377 +23252,485674,entropiece,,,,,2013-11-01 14:15:43.717 +23175,3513935,Benincasa,,,,,2013-10-30 15:07:26.137 +21161,1199698,Eddie Xie,"San Francisco, CA, USA",,,"

I write Java Python etc and participate open source projects. I really enjoy developing server side technologies including online machine learning, search and ads systems.

+",2013-08-27 15:22:02.050 +12884,2111553,Mimshot,,,,,2012-12-04 18:49:56.643 +22880,2318773,user2035158,,,,,2013-10-20 19:27:29.717 +11884,1978585,user1775772,,,,,2012-10-26 00:40:15.937 +22821,1218067,Arya,United States,,https://sites.google.com/site/airanmehr/,,2013-10-18 08:56:41.460 +20838,2164266,Fred S,,,,,2013-08-15 21:24:58.987 +23211,3518389,user32134,,,,,2013-10-31 11:53:30.363 +23192,1168674,user1146372,,,,"

I am PhD candidate in biomedical engineering at JHU. My research focus is bioinformatics with emphasis on NGS applications.

+",2013-10-30 22:25:32.417 +22189,3371294,Magnus Metz,,,,,2013-09-30 08:12:44.097 +23517,2132875,TheChymera,"Cambridge, MA, USA",,http://chymera.eu,,2013-11-08 09:18:08.280 +23095,1178892,dariosalvi,"Malmo, Sweden",,,"

I am an associate professor at Malmo University, Sweden. I work mainly in the field of digital health.

+

Favourite programming languages: C, Java, JavaScript, R

+",2013-10-28 11:49:47.023 +17730,2410706,mrsoltys,,,http://www.mikesoltys.com,,2013-05-09 00:36:49.643 +3446,510067,Michael Hardy,"Minneapolis, MN",,,"

I have a Ph.D. with a minor in mathematics and a major in statistics.

+ +

At meta (dot) math (dot) stackexchange (dot) com you may seek information or even make suggestions provided you stay away from any topic that might cause it to be suspected that you will suggest changes in customary ways of doing things. That is rigidly forbidden by an unstated customary rule with which I disagree.

+",2011-06-25 20:42:45.623 +22978,2475245,Hyunwoong Ji,,,,,2013-10-23 16:52:39.800 +22877,3464659,Malai,,,,,2013-10-20 17:09:25.137 +22126,3361618,masumeh,,,,,2013-09-27 11:50:54.613 +23320,2756432,Al Nejati,"Auckland, New Zealand",,,"

Ph.D. in Engineering Science by day, rogue freelance scientist by night.

+ +

In science, insanity is doing the same thing over and over and expecting the same result.

+",2013-11-04 02:56:34.667 +22714,3441105,Jos,,,,,2013-10-15 14:55:36.683 +13895,2245177,JeffJo,,,,"

Retired Engineer (Applied Math, Physics) who enjoys discovering why people think the way they do.

+",2013-01-14 22:53:44.703 +23060,3489633,Nakord,,,,,2013-10-26 13:16:59.620 +22983,3197040,Ben John,,,,,2013-10-23 18:00:35.860 +23362,132942,Andrey Shabalin,,,http://shabal.in,,2013-11-05 05:00:47.567 +21464,3072520,learner,,,,,2013-09-06 05:16:19.097 +22699,478930,theharshest,"Minneapolis, MN",,http://harshtechtalk.com,"

Machine Learning and Data Mining enthusiast.

+ +

Interested in Python, C++, Data Structures and Algorithms.

+ +

Metalhead!

+ +

SOreadytohelp

+",2013-10-15 04:14:32.587 +23270,1949203,Lowerison,"Cochrane, AB, Canada",,,,2013-11-02 05:01:03.930 +18914,2594817,fidadoma,Czech Republic,,http://www.ms.mff.cuni.cz/~dechf7am/,"

I'm PhD student of cumputer science. I study human vision and I try to create mathematical models of eye movements in MOT task.

+",2013-06-13 08:44:31.203 +6708,1220490,Marco,,,,"

A poor ignorant who humbly seeks to learn.

+",2012-02-08 12:23:57.240 +166,56451,russellpierce,"Dallas, TX, United States",,,"

Engineering Leader, Applied Scientist, Researcher, Data Scientist, Data Engineer

+",2010-07-20 05:57:22.890 +23178,3514376,Ioannis,"Stanford, CA, United States",,,,2013-10-30 16:37:43.330 +23216,3519028,user32139,,,,,2013-10-31 14:18:26.747 +22854,3460596,qwertzlcoatl,Germany,,,,2013-10-19 13:28:07.987 +22707,2427221,user2118903,,,,,2013-10-15 09:24:36.763 +22878,3464664,user31717,,,,,2013-10-20 17:10:06.830 +8208,144312,mariosangiorgio,"London, United Kingdom",,http://mariosangiorgio.github.io,"

Software Engineer

+",2012-04-24 12:24:00.010 +22776,1949079,Riaan,,,,,2013-10-16 21:26:26.100 +22741,3443078,marwa elnady,,,,,2013-10-15 23:55:26.273 +22631,2143607,Sohaib I,"Munich, Germany",,http://in.linkedin.com/in/sohaibiftikhar,,2013-10-12 10:23:19.300 +22581,200610,Cory Klein,"Pleasant Grove, UT",,http://www.coryklein.com,"

Computer science enthusiast, open source advocate, frugality expert

+",2013-10-10 20:14:40.933 +22818,303079,Don,,,,,2013-10-18 07:02:52.453 +21952,3339169,oblixram,,,,,2013-09-23 04:51:54.213 +22310,3384665,kiyoshi sasaki,,,,,2013-10-03 02:03:37.980 +22591,3421756,Big Data Lover,,,,,2013-10-10 23:22:18.470 +22843,2112551,Person,,,,,2013-10-18 20:21:59.413 +23495,1034745,Gayot Fow,"London, United Kingdom",,,"

Just a random person on the net. Would-be Samurai. Orange juice drinker. Bane of Smaug. Polecat hunter. Failed gardener. Dolt basher. General Zod's Aide de Camp. Hounder of Ming the Merciless. Skunk skinner. Scandalous Exegete. Lifter of Mummy Curses. Denizen of the Vomit Comet. Deckard's confidant. Uncoupler of cause and effect. Seer of Wormsign. Wanton slayer of Daleks, Shapeshifters, and Gozerians.

+ +
+ +

Active as an independent consultant specialising in bespoke applications for individuals in the high net worth sector. All platforms, including desktop, mobile, and tablet. Also cloud applications, privacy/encryption, browser customisation, remote storage. Private clients only. No agencies please. Money talks, nobody walks.

+ +
+ +

Hobby: I like to fly in private aircraft (passenger or supernumerary crew) and I do it several times a year sometimes by arrangement and sometimes by ""hitchhiking"" at private fields. By arrangement I do stuff like baby sitting, pet delivery, chaperoning, translating, mock landing interviews, and gift delivery. The furthest east I have been is London -> Samarkand (single engine Cherokee). The furthest west I have been is London -> Los Angeles (many different aircraft, including a G650). The most stressful are the landings at Nuuk to refuel. The easiest airport to ""hitch"" a ride is in Texas USA.

+ +
+ +

+",2013-11-07 21:49:29.467 +23450,3546686,Luis Gonzaga Sarmiento,,,,,2013-11-06 21:54:18.113 +22693,3437837,Megan,,,,,2013-10-14 22:01:18.457 +23271,3526263,user32207,,,,,2013-11-02 07:12:05.807 +22672,3424291,Inkognito,,,,,2013-10-14 09:38:37.463 +17180,2204873,WAF,Belgium,,,,2013-04-23 15:23:14.887 +651,509511,Dikran Marsupial,,,,"

(my about me is currently blank)

+",2010-08-12 07:45:05.553 +17448,2707062,Bere,,,,"

Postdoctoral researcher at University of São Paulo in Molecular Genetics of Plants.

+",2013-04-30 15:11:10.523 +7246,380114,Felix,"New York, United States",,,,2012-03-07 21:33:23.937 +22779,3448305,fimawa,,,,,2013-10-17 00:23:22.743 +22597,3422261,Aleksandr Blekh,"Atlanta, GA",,,"

Management and IT Consultant, information systems researcher, data scientist, aspiring educator and entrepreneur.

+

Newly minted Ph.D. in Information Systems - recently defended my dissertation on open source software success factors. I am interested in product management, software engineering, open source software ecosystem (obviously!), open data, open (reproducible) research, data science, statistics (especially SEM), R, artificial intelligence, startups and venture capital, knowledge sharing, management science, information architecture, education and medicine, among other topics.

+

Additionally, I have earned M.S. in Computer Information Systems and B.S. in EE (microelectronics and semiconductor devices). I have 20+ years of IT experience, mostly as a software developer and consultant. I speak Russian, English and a little Romanian.

+",2013-10-11 02:36:16.763 +22788,3450056,ryk,,,,,2013-10-17 09:53:49.753 +5211,268353,xiaohan2012,"Wuhan, China",,,,2011-11-07 08:59:29.953 +23352,2414843,Steve Weston,"New Haven, CT",,http://www.oreillynet.com/pub/au/4980,"

HPC Specialist at Yale University and co-author of "Parallel R" from O'Reilly.

+

Original author of the foreach, iterators, doMC, and doSNOW packages for R. Author and maintainer of the doMPI and itertools packages.

+

A founder of Revolution Computing, now named Revolution Analytics. Left the company when the founders were bought out in 2009.

+",2013-11-04 22:36:11.403 +22839,3457411,ΓLB,,,,,2013-10-18 16:21:15.407 +20949,1532745,Count Zero,,,,,2013-08-20 12:30:59.073 +23267,1640458,daenris,,,,,2013-11-02 02:32:05.863 +16325,2560892,user23658,,,,,2013-03-29 00:42:35.520 +22594,3422020,Aorati,,,,,2013-10-11 01:07:54.850 +22935,178585,Pandoro,"Aachen, Germany",,,"

I'm a computer science student who hates computer science :D

+",2013-10-22 15:41:36.700 +23009,2334767,BiA,,,,"

Physicist passionate about software engineering

+",2013-10-24 10:16:36.677 +22616,388148,Rain,,,,,2013-10-11 18:56:04.673 +12544,980225,Jeremy Miles,"Los Angeles, CA",,http://jeremymiles.co.uk,"

Opinions and errors stated here are mine, not my employer's.

+

Sometimes I know what I'm talking about. Often I don't.

+",2012-11-21 21:55:31.517 +8958,1484466,JEquihua,Mexico,,,,2012-06-04 17:06:25.333 +23233,3521205,Junip,,,,,2013-11-01 00:08:38.230 +23411,898810,J. Miller,,,,,2013-11-06 00:57:08.763 +16551,2319761,Manko,Israel,,,,2013-04-05 12:00:42.010 +9755,973970,Patrick McCarthy,,,,,2012-07-17 16:59:50.757 +22674,1848437,Timespace,,,,,2013-10-14 11:11:46.830 +23089,3499883,aRBOVIRUS,,,,,2013-10-28 05:34:54.517 +22902,3177162,user2684719,,,,,2013-10-21 12:16:36.657 +17179,2371003,Rup Mitra,,,,,2013-04-23 15:19:39.463 +23092,103840,rajatkhanduja,,,,,2013-10-28 08:47:50.147 +17740,2465458,Marc Claesen,Belgium,,http://www.marc-claesen.name,"

I obtained my PhD in machine learning at KU Leuven (Summa Cum Laude). My research emphasized predictive modelling with uncertainty and automation of the learning process. I am not married to a particular software platform or language, though my day-to-day weapon of choice is Python.

+ +

I'm the main author of two open-source machine learning packages: Optunity and EnsembleSVM. Most of my research articles are posted on the arXiv.

+ +

I can be contacted at $(echo oxmqeqzy?symux#oay | tr ?a-z# @o-za-n.)

+",2013-05-09 09:38:54.573 +21497,1887117,Ankur Chakravarthy,,,,"

Cancer researcher with an interest in bioinformatic analysis of large scale datasets.

+",2013-09-07 01:59:23.310 +15991,2517310,Guest,,,,,2013-03-19 21:23:49.697 +22700,1535321,Rotfuchs,,,,,2013-10-15 04:47:14.350 +22557,3418076,manstat,,,,,2013-10-10 09:48:14.343 +23081,3499096,eager_student,,,,,2013-10-27 23:36:55.623 +22682,3250756,Spy_Lord,,,,,2013-10-14 18:15:29.137 +23365,393401,Richard Blackman,Perth Australia,,,,2013-11-05 06:43:46.340 +22423,3400940,stats_newb,,,,,2013-10-07 02:44:46.310 +23278,3526569,Ross Ahmed,,,,,2013-11-02 09:56:37.087 +19752,3026677,Max S.,,,,,2013-07-10 00:13:00.183 +23098,3369877,Helen,,,,,2013-10-28 12:36:42.800 +23014,3484835,abida yousaf,,,,,2013-10-24 13:43:49.420 +20410,3117327,amoeba,St. Petersburg ⇨ Freiburg ⇨ London ⇨ Lisboa ⇨ Tübingen,,,"

Born but to die, and reas'ning but to err.

+",2013-07-31 11:19:00.203 +22775,3447740,anna,,,,,2013-10-16 21:06:10.343 +1889,341544,Henry,,,,,2011-01-28 07:59:59.930 +22190,3140100,Monika Maurya,,,,,2013-09-30 09:07:29.253 +23205,3219771,CarlBrunius,,,,"

Researcher in nutritional metabolomics. Most of my time goes into multivariate data analysis and other quality aspects of the data pipeline:

+ +
    +
  • Pre-analytical sample management
  • +
  • Within/between batch correction of LCMS data
  • +
  • Choice and design of multivariate modelling technique for: + +
      +
    • Different types of data (NMR, LCMS, OTU)
    • +
    • Complex experimental designs (time series, crossover, etc)
    • +
  • +
  • Validation of multivariate statistical modelling + +
      +
    • Nested crossvalidation designs
    • +
    • Unbiased variable selection
    • +
  • +
+",2013-10-31 09:48:00.750 +23479,3536683,Roninio,,,,,2013-11-07 15:45:26.070 +23222,26155,Dmitry Brant,"Boston, MA, USA",,https://dmitrybrant.com,"

I'm a Lead Software Engineer at the Wikimedia Foundation, responsible for the Wikipedia Android app.

+

Also check out DiskDigger, or some of my other software!

+",2013-10-31 17:29:27.253 +23186,3320936,zecaurubu,"London, UK",,http://mateusborges.com,,2013-10-30 19:32:42.140 +1506,509601,Glen,,,,,2010-12-09 01:48:37.523 +450,63945,user603,"Leuven, Belgium",,https://www.researchgate.net/profile/Kaveh_Vakili/?ev=hdr_xprf,"
+

[...] from this point of view, when you contemplate what to do about + outliers, you’re like Marlow, in “Heart of Darkness,” when he is + travelling up-river to find Kurtz. “Watching a coast as it slips by + the ship,” Conrad writes,

+ +
+

is like thinking about an enigma. There it is before you—smiling, + frowning, inviting, grand, mean, insipid, or savage, and always mute + with an air of whispering, ‘Come and find out.’

+
+
+",2010-07-29 15:38:09.663 +22895,429360,Bogdan Vancea,,,,"

Carpe Diem

+",2013-10-21 09:08:35.123 +23053,3492655,user31934,,,,,2013-10-26 00:26:12.477 +22659,1661661,hadsed,,,,"

I'm a physics student, doing research in both physics and computer science. I'm here to learn.

+",2013-10-13 20:54:09.243 +22635,1847699,askming,,,,"

I'm a phd student in biostatistics.

+",2013-10-12 14:54:25.543 +22859,2291619,user2014241,,,,,2013-10-19 16:03:58.910 +23213,3518798,michalis,"Helsinki, Finland",,,,2013-10-31 13:33:47.333 +192,218520,babelproofreader,,,https://dekalogblog.blogspot.com/,,2010-07-20 13:24:56.240 +22647,2659906,DDW,,,,"

Big Data scientist with a couple of years experience working with Hadoop. Currently working/looking for applications of Big Data technologies in Telco, Bioinformatics, Biomedics, eLearning and manufacturing.

+",2013-10-13 08:50:44.190 +12522,2060789,Orlando Mezquita,Puerto Rico,,http://www.MasterDataAnalysis.com,,2012-11-21 00:41:33.083 +15839,2499915,user22062,,,,,2013-03-15 19:26:31.567 +22566,3419538,abc,,,,,2013-10-10 14:44:17.203 +1575,261715,Chris Taylor,"London, United Kingdom",,http://www.linkedin.com/in/crntaylor,"

I'm broadly interested in very applied math. I try to apply ideas from mathematics, statistics, machine learning, formal systems and computer science to solve real-world problems. Mostly in applied finance/quantitative trading, but in other areas if the mood takes me.

+",2010-12-17 17:38:19.603 +23372,3538395,user32330,,,,,2013-11-05 11:48:03.927 +22679,3436400,user31478,,,,,2013-10-14 15:22:25.487 +23150,2219907,triomphe,,,,"

Faites-vous des amis prompts à vous censurer.

+",2013-10-29 20:10:31.960 +22655,3373885,Bo P,,,,,2013-10-13 17:05:44.267 +22560,3418349,bharti,,,,,2013-10-10 10:46:06.893 +19557,1835210,Richard D,"Dallas, TX, United States",,http://www.datapolitan.com,"

Strategy consultant with 10+ years experience bringing innovative, data-driven approaches to critical challenges in the public and non-profit sectors. A trained and experienced data scientist and project leader able to command detail while staying focused on the larger goals. A battle-tested combat leader and creative collaborator adept at challenging diverse teams of professionals to achieve common goals.

+",2013-07-03 21:45:14.293 +22596,2833983,ionick,,,,,2013-10-11 02:05:57.810 +23138,1739091,Brian,,,,,2013-10-29 16:18:25.410 +15280,1983041,Martin Van der Linden,"Nashville, TN",,http://martinvdlinden.wordpress.com/,"

+ +

Comment templates

+",2013-02-28 00:25:58.083 +22653,3432047,Paul,,,,,2013-10-13 13:49:31.680 +23407,435440,Bruno Calza,Brazil,,,,2013-11-06 00:14:37.973 +18905,2892341,dian,,,,,2013-06-13 04:46:26.887 +22922,2729027,WilBur,,,,"

I am working on becoming a developer in my spare time. At this point, I know Java well, and I have a pretty good knowledge Python. I'm doing a bunch with robotics and the web, and I'm on GitHub here.

+",2013-10-22 03:43:19.977 +23494,302409,Andrew,,,,"

I do data at a charter school network in New Orleans. I help teachers and principals use assessments to be better instructors, and try to get the right information in front of people making decisions.

+ +

I also founded and coded the first two versions of Whetstone Education, an online teacher observation tracking tool.

+",2013-11-07 21:36:00.973 +10957,510056,alittleboy,USA,,,"

Hello! My concentration is statistical genetics. I also like statistical computing (mainly) using R and other open-source software packages.

+",2012-09-17 19:42:04.117 +20457,3124851,diwgan32,,,,,2013-08-01 22:24:09.677 +23105,1178909,ptpatil,,,,,2013-10-28 15:03:10.840 +23185,946667,JohnTortugo,"Seattle, WA, USA",,https://johntortugo.atlassian.net/,"

https://johntortugo.atlassian.net/wiki/home

+",2013-10-30 19:30:11.373 +1359,276079,B_Miner,,,,"

Work in analytical CRM / database marketing.

+",2010-11-17 23:45:24.067 +23508,76405,Raghav Bali,India,,,"

My mantra for life is Code.Click.Travel

+",2013-11-08 03:38:40.580 +22914,1329862,Ziofil,,,,"

Senior quantum architecture scientist @ xanadu.ai

+",2013-10-21 19:32:19.937 +22704,3439237,George,,,,,2013-10-15 07:29:18.930 +18845,463430,andrewtinka,"Berkeley, CA",,http://www.andrewtinka.com,"

I'm a graduate student in Electrical Engineering at UC Berkeley. My focus is on multivehicle control and robotics for environmental sensing.

+ +

I'm a LaTeX and TikZ user. I expect to be asking more questions than I answer.

+",2013-06-11 17:45:15.713 +674,184175,chl,"Paris, France",,https://aliquote.org,"

Bohemian scientist.

+",2010-08-13 20:50:47.397 +22657,38195,konr,Brazil,,http://konr.mobi,"

(def love (promise))

+",2013-10-13 17:56:45.167 +22703,3434551,Vani,India,,http://www.vanithewriter.blogspot.in,"

A person with a plenty of questions and not many answers. That should describe me :)

+",2013-10-15 06:33:15.263 +22302,3384187,hax13,,,,,2013-10-02 22:31:26.477 +23543,2100235,user1868064,,,,,2013-11-08 22:42:12.317 +23093,3500963,Bal,,,,,2013-10-28 10:44:50.733 +23459,1648493,nTransform,"Halifax, NS, Canada",,http://abrahamnunes.com,"

Psychiatry resident in Eastern Canada. Research interests include the following:

+ +
    +
  • Computational psychiatry
  • +
  • Deep reinforcement learning
  • +
  • Representation learning
  • +
+",2013-11-07 02:10:00.087 +23425,201697,nowaycomputer,,,,,2013-11-06 09:51:21.543 +17994,2777839,IvLi,,,,,2013-05-16 17:22:04.950 +2806,301392,PhD,,,,,2011-05-01 16:36:16.633 +23155,1095833,Alagappan Ramu,"San Diego, CA, United States",,http://alagappanr.com,"

Software Developer!

+

Amateur photographer!

+

Runner!

+

Traveler!

+

NIT Trichy & SUNY Buffalo Alum!

+

Delhi - Chennai - New York - San Diego

+",2013-10-29 23:54:34.437 +23429,3339857,user2806363,,,,,2013-11-06 12:29:32.600 +22819,3454913,user31643,,,,,2013-10-18 07:34:19.257 +22728,2439271,Dan Schwartz,,,,,2013-10-15 19:22:28.427 +22656,3322178,Alex,,,,,2013-10-13 17:07:48.863 +22999,490740,Tom,,,,,2013-10-24 03:04:18.457 +23444,3494919,Daniel Roy,Toronto,,http://danroy.org,,2013-11-06 19:48:27.203 +17328,1369909,wolfies,,,,"

I'm the Rose part of:

+ +
Rose, C. and Smith, M.D. (2002-2013)
+Mathematical Statistics with Mathematica
+Springer-Verlag / mathStatica
+
+ +

and one of the original developers of the mathStatica software add-on for Mathematica. I'm a long-time Mma fan since v2, ... a past Visiting Scholar at Wolfram Research, ... a recent guest editor of The Mathematica Journal, ... and a current editor of the Journal of Statistical Software.

+",2013-04-26 18:24:29.347 +9554,1630661,means-to-meaning,,,,"

Each day I learn something interesting on this site. Most days I learn how little I know.

+",2012-07-05 22:00:48.013 +22979,3480166,Fabian,,,,,2013-10-23 17:14:15.497 +20991,3200875,Lea,,,,,2013-08-21 09:17:35.257 +22835,397798,Overclocked,,,,,2013-10-18 14:54:37.000 +23123,35664,Jorge Israel Peña,"Los Angeles, CA, USA",,https://jip.dev,"

I'm a practical developer who's happy to work at any level of the stack using a variety of different technologies and languages, always eager to learn the best practices of each to produce idiomatic and correct solutions.

+",2013-10-29 07:56:36.480 +22744,1760770,erraticfl,,,,,2013-10-16 04:34:13.973 +10147,1729607,user13154,,,,,2012-08-08 02:59:38.550 +22883,2414091,Tamrerk,"Osaka, 日本",,,,2013-10-20 20:54:30.903 +6608,140559,Tianyang Li,,,http://li-tianyang.com,,2012-02-03 08:33:36.793 +23255,2527870,Christopher Louden,"San Antonio, TX",,http://www.loudenanalytics.com,"

I am a data scientist who enjoys working at the interface of statistics and computer science.

+",2013-11-01 14:33:53.363 +23168,3511325,Ben Gillespie,"Leeds, United Kingdom",,,,2013-10-30 09:03:02.313 +22143,2792935,Theja Tulabandhula,"Chicago, IL, USA",,http://theja.org,"

Can estimate, optimize and build things.

+",2013-09-28 04:09:07.907 +23022,3486817,Will,,,,,2013-10-24 20:35:52.277 +22629,354648,Abhilash,,,,,2013-10-12 05:40:35.943 +23260,945577,QFDev,"London, United Kingdom",,http://www.QuickFile.co.uk,"

Founder of QuickFile.co.uk cloud accounting software.

+",2013-11-01 17:35:40.213 +22787,3449772,vskho2,,,,,2013-10-17 08:47:32.760 +23001,3482460,jxstb,,,,,2013-10-24 04:36:10.370 +22896,512405,linello,Italy,,,"

Currently I'm scientific software developer with proficiency in C/C++ with their related technologies Boost, STL, Qt, Python, computer graphics, OpenGL, Mathematica, MatLab, Bash scripting, NI Labview, LATEX, CMake, CUDA.

+",2013-10-21 09:11:46.400 +1926,111592,zubinmehta,,,,,2011-02-02 18:20:09.513 +22541,3416163,Joe,,,,,2013-10-09 23:52:28.193 +20190,3079254,luckyi,,,,,2013-07-23 18:18:53.173 +18296,2682975,user2317915,,,,,2013-05-25 01:56:35.613 +23148,3508794,user32059,,,,,2013-10-29 19:34:44.903 +23145,405642,kevinAlbs,,,http://www.kevinalbs.com,"

My name is Kevin

+",2013-10-29 18:29:42.450 +7333,1317388,FrankD,,,,,2012-03-12 11:16:41.337 +23181,3514586,JavOs,,,,,2013-10-30 17:09:56.460 +22031,3347404,Eudora,,,,,2013-09-24 18:05:31.213 +22758,1113672,GertVdE,Belgium,,http://www.sckcen.be,"

Nuclear engineering + computational science = modeling of nuclear reactors.

+ +

Active in research & teaching

+",2013-10-16 15:02:06.320 +22942,2369860,Cenderze,Sweden,,,"

Coding enthusiast trying to improve

+",2013-10-22 18:30:00.580 +22556,97325,smwikipedia,,,,"

"A good question is half the answer." --- Anonymous

+

"All problems in computer science can be solved by another level of indirection, except of course for the problem of too many levels of indirection." --- David Wheeler

+

"If I were given one hour to save the planet, I would spend 59 minutes defining the problem and one minute resolving it." --- Albert Einstein

+",2013-10-10 09:25:58.687 +23514,3516666,Kirk Hadley,"Raleigh, NC, USA",,https://www.linkedin.com/in/kirk-hadley-33416951,,2013-11-08 06:58:18.923 +22886,3443439,Chip Koziara,Ann Arbor,,http://chipkoziara.com,,2013-10-20 22:54:35.910 +23139,2723892,gmeroni,,,http://www.ethnologies.it,"

I’m 24 and I’m studying industrial engineering. I have three big passions: climbing, traveling and photography. My great desire is to go all over the word to see the smiles and look in people's eyes. For this reason my journeys are self-made and in my photograph I want to show my experiences and my feelings: one shot, one story.

+",2013-10-29 17:19:42.580 +23201,912643,Krista K,"Lithonia, GA, USA",,http://ca-cycleworks.com,"

Before you downvote and flag a post, how about instead considering an answer to help the newbie find their way towards an answer? Please help me not think that the sole purpose of a diamond moderator is to run good people off from the SE sites.

+

The SE network is such a great resource to learn from, however, once you ask "how do I do this?" and they think you're a newb?

+

Why does it seem the normal response Stack Exchange has are its diamond mods and senior members do a drive by: vote down, close, and move on without any effort to help the person seeking help.

+
+

Ex Navy, motorcycle enthusiast, got a BSEE magna cum laude (with distinction in my degree), embedded Software Engineer, turned entrepreneur. I am IT. Not I-T like information technologies (although I'm I-T, too). Added the bit about having a degree since people who habitually downvote answers seem to all have them, too. I hope that doesn't rub off onto me...

+

No, IT, like "ha ha you're it"

+

Running a business has turned me into a modern day DaVinci through necessity. CAD? Yup. Linux sysadmin? Uh-huh. Machinist, Electrician, Php, mysql, postgres, graphics design PS AI ID, non-consensual visual basic, and back to some great embedded C/C++.

+",2013-10-31 06:33:32.353 +0,-1,Community,on the server farm,,http://meta.stackexchange.com/,"

Hi, I'm not really a person.

+ +

I'm a background process that helps keep this site clean!

+ +

I do things like

+ +
    +
  • Randomly poke old unanswered questions every hour so they get some attention
  • +
  • Own community questions and answers so nobody gets unnecessary reputation from them
  • +
  • Own downvotes on spam/evil posts that get permanently deleted
  • +
  • Own suggested edits from anonymous users
  • +
  • Remove abandoned questions
  • +
+",2010-07-19 06:55:26.860 +23162,173703,Todd Schiller,"New York, NY",,https://toddschiller.com,,2013-10-30 03:14:38.610 +22829,74565,Ben Hutchison,,,,,2013-10-18 11:04:47.647 +23481,233608,John Dvorak,,,,"

Jan-Dvorak @ Miaou
+Honnza @ gmail.com

+ +
+

""Haven't you heard? Crowdsourcing to stackOverflow is the new trend in basic debugging.""

+
+ +

-- John Dvorak

+ +
+

""There are only two hard things in Computer Science: cache invalidation, naming things and off-by-one errors""

+
+ +

-- variation of Phil Karlton's quote

+",2013-11-07 16:26:32.217 +22828,1151308,Danièle,,,,,2013-10-18 11:01:08.527 +2615,167776,JorgeG,,,,,2011-04-14 15:38:22.410 +23383,2891351,Kristof Tak,,,,,2013-11-05 15:09:00.933 +2085,58892,Enrique,"Oslo, Norway",,,"

Hi

+",2011-02-17 05:07:05.447 +8926,1081114,Akavall,,,,"

I like programming, machine learning, statistics, all kinds of problem solving, and I play chess.

+ +

My github

+",2012-06-01 21:06:30.023 +22573,442470,Steffen,,,,,2013-10-10 17:27:51.820 +22570,2684787,sebastian,Germany,,,,2013-10-10 16:02:20.493 +13549,2164300,HFBrowning,,,,"

h.f.browning@gmail.com

+",2013-01-02 21:51:31.140 +9047,1509604,Roland,Germany,,,"

I am a soil ecologist and R user.

+ +
+ +

Note that comments can be edited by moderators. This is not visible and we don't get notified if they do this. Please keep this in mind when reading my comments.

+",2012-06-08 08:24:26.433 +40516,3007493,user2551700,,,,,2014-11-10 21:04:46.053 +34640,2035959,coolC,,,,,2014-07-13 03:17:43.193 +20473,3083068,Alecos Papadopoulos,,,http://alecospapadopoulos.wordpress.com/,"

PhD in Economics with a heavy dose of, and a not-so-secret love for, Econometrics and Statistics. The PhD thesis is a monograph on the Two-tier Stochastic Frontier model/framework, "The Two-tier Stochastic Frontier (2TSF) framework: Theory and Applications, Models and Tools".

+

https://alecospapadopoulos.wordpress.com/

+",2013-08-02 14:24:21.923 +23083,1292675,Sean Mackesey,"Reston, VA",,,,2013-10-28 01:39:34.797 +1930,107172,bit-question,,,,,2011-02-03 03:42:01.077 +22973,3183986,Anna Gordon,,,,,2013-10-23 15:17:27.453 +22768,3447362,user31584,,,,,2013-10-16 19:36:51.927 +23447,3546485,asdf,,,,,2013-11-06 21:05:05.393 +287,58413,Mike Lawrence,"Halifax, Canada",,,"

Cognitive scientist, statistician, programmer, tinkerer

+",2010-07-26 22:38:04.857 +23520,3554656,user32509,,,,,2013-11-08 11:23:00.013 +23408,3181070,BajaBob,,,,,2013-11-06 00:22:17.697 +23126,3505839,Hans Landsheer,,,http://www.linkedin.com/pub/hans-landsheer/33/936/197,,2013-10-29 09:17:28.283 +22756,3445983,baffled,,,,,2013-10-16 14:27:26.077 +5199,158448,Emer,,,http://linkedin.com/in/emerrf,,2011-11-06 15:07:47.937 +17056,2661467,AstrOne,,,,"

Trying to understand existence...

+",2013-04-20 02:21:30.430 +22483,2075714,Vik Keshri,,,,,2013-10-08 15:29:39.523 +5671,1058083,Has QUIT--Anony-Mousse,disappeared into oblivion,,https://stats.meta.stackexchange.com/questions/5783/reinstate-monica,"

I have decided to finally QUIT StackOverflow at the end of 2019.

+ +
    +
  1. Key issues such as duplicates are still not resolved. Measures like not allowing to post duplicates are not implemented; nor a recommender that helps users choose the most appropriate site, or ask better questions. This puts the burden on the frequent users and moderators to keep the site useful. At the same time, duplicates and low-quality questions do cause rude comments and answers in the first place. Solving this properly would likely make the tone much nicer.
  2. +
  3. Migration of questions is a complicated procedure, rather than simply making the different sites ""views"" on the same underlying data. There is a random set of migrations that can be proposed (such as SO to stats.SE), but no way to propose others (e.g., to DS.SE) They are making our lives harder than necessary.
  4. +
  5. We can't flag duplicates of questions on other SE sites, even if there is a perfect duplicate.
  6. +
  7. The company wastes tons of money on stupid things such as WinterBash, and on projects doomed to fail such as ""SO Documentation"" and ""SO for Teams"". I understand that you need to generate more revenue, but you should listen to your users (in particular to avoid costly failures such as documentation).
  8. +
  9. The company does not listen to their users. A recent example is the Monica incident. This lawyeresque statement shows what is wrong: they fight their best users. But there are many more examples. ""User surveys"" that sound like they come from the sales department, not from user support; that avoid any of the big issues such as duplicates, low quality questions, and the resulting sometimes rude handling of such questions; and many more.
  10. +
  11. I do want to emphasize that I am in general in favor of a ""code of conduct"", and I do strongly believe we need to fight for diversity. I am all in favor of using stated pronouns, but I know that I would get them wrong often. I have not seen any of the transcripts, so I have no idea what Monica did, but we also all have heard of false accusations, mistakes, and so on. I do not think the situation was handled well by SO staff, and it only got worse and worse. A code of conduct that are community standards should actually come from the community, and be handled by the community, not by some higher entity like the company.
  12. +
  13. Firing mods and forced relicensing: is Stack Exchange still interested in cooperating with the community?
  14. +
  15. Firing Community Managers: Stack Exchange is not interested in cooperating with the community, is it?
  16. +
+ +

Sorry, there is no way to further reach me.

+",2011-12-06 08:56:19.207 +23318,1286153,ntaggart,,,,,2013-11-03 22:42:43.417 +12980,2123514,Fred,United States,,,"

Not a statistician :)

+",2012-12-07 18:18:35.173 +22640,3429926,john,,,,,2013-10-12 18:36:45.027 +20870,51919,Kevin Wright,United States,,http://none,"

Statistician. Expertise in mixed models, visualization. Emacs user.

+",2013-08-17 03:11:27.560 +23019,3485779,MLS,,,,,2013-10-24 16:40:53.947 +23047,2536071,Salil Kalghatgi,,,,,2013-10-25 19:03:58.810 +23349,3535936,blarg,,,,,2013-11-04 21:59:58.647 +23196,3516757,Zakaria Al-Jammal,,,,,2013-10-31 03:58:18.593 +11250,428580,tskuzzy,United States,,,,2012-10-01 00:49:25.730 +23039,1071065,JLp,"Cambridge, MA",,,"

Every now and then, I ran into problems I cannot solve.

+ +

Comments:

+ +

1) I am not a mathematician.

+ +

2) Not a native english speaker.

+",2013-10-25 14:21:18.590 +22643,1669760,Amw 5G,,,,,2013-10-13 01:07:18.193 +23034,1426108,Daniel PP Cabral,,,,"

Senior Software Developer

+",2013-10-25 12:21:14.437 +23131,3507112,user32038,,,,,2013-10-29 13:50:47.020 +20742,117234,stackoverflowuser2010,,,,,2013-08-12 19:31:16.087 +23338,1503143,ceys,Beijing,,,"

Software Engineer in JD.com.

+",2013-11-04 16:08:33.117 +22908,71014,knguyen,,,,,2013-10-21 15:28:17.570 +9074,1554733,abaumann,,,,"

Civil servant at the City of Copenhagen.

+",2012-06-09 13:06:49.613 +20538,2606414,neverKnowsBest,,,,,2013-08-05 10:52:44.257 +2217,1475,Konrad Rudolph,"Basel, Switzerland",,http://klmr.me,"

I’m a scientist & software developer working in biology & life sciences. I hold a PhD from the University of Cambridge.

+

I’ve dabbled in everything from biology & data analysis to software development, both on the frontend and the backend, using a wide variety of programming languages and technologies. [he/him]

+

Sponsor me on GitHub if you would like to support what I’m doing here.

+",2011-03-02 16:26:54.810 +1073,61493,carlosdc,"Washington, DC",,http://www.umiacs.umd.edu/~carlos/,,2010-10-10 06:07:19.617 +23268,3525895,Billie G,,,,,2013-11-02 03:22:13.817 +23463,3528039,Tay Shin,,,,"

I am an undergraduate student from the University of California, Berkeley with a major in Statistics.

+",2013-11-07 05:23:03.553 +23440,2503,Hoffmann,Brazil,,,,2013-11-06 18:08:39.943 +22748,3444250,Nrich Rich,,,,,2013-10-16 07:42:23.120 +23008,3459649,zachwarner,,,,,2013-10-24 09:46:24.203 +22561,3418645,abc,,,,,2013-10-10 12:00:04.637 +23341,990336,Adrian,"Toronto, Canada",,,"

Software Engineer
+My unicorn: http://unicornify.appspot.com/avatar/e208066867d3d9c32a0a8ec27dbb90c7?s=128

+ +

Excited by graph problems, efficient algorithms, optimizations, parallel programming, distributed systems, large scale systems, NO SQL solutions. Learning Scala et al. (akka, lift, spray etc)

+",2013-11-04 17:11:00.743 +23114,2600028,Brian Ford,"London, UK",,,,2013-10-28 22:56:19.780 +23210,3518362,mahesh,,,,,2013-10-31 11:46:35.757 +23249,3523425,GregF,"Burlington, VT",,,"

Work for energy efficiency non-profit.

+",2013-11-01 13:49:15.813 +3662,177883,Max Gordon,Sweden,,http://gforge.se,"

I'm an orthopaedic surgeon at Danderyds sjukhus in Stockholm, Sweden, and a researcher (PhD) at the Karolinska Institute.

+ +

I'm generally interested in computer science, machine learning and statistics. I also do a lot of R-programming and I've developed a few R-packages.

+",2011-07-16 20:52:45.283 +2666,509895,Frank Harrell,"Nashville, TN",,http://hbiostat.org/fh,"

I am Professor of Biostatistics and founding Chair of the Department of Biostatistics at Vanderbilt University School of Medicine, Nashville TN USA, where I was chairman from 2003-2017. I was Expert Statistical Advisor to the Office of Biostatistics, FDA CDER 2016-2020. I am Associate Editor of Statistics in Medicine. I am a Fellow of the American Statistical Association. I am author of Regression Modeling Strategies (2nd Edition, Springer, 2015). My specialties are development and validation of predictive models, clinical trials, observational clinical research, cardiovascular research, technology evaluation, clinical epidemiology, medical diagnostic accuracy, biomarker research, pharmaceutical safety, Bayesian methods, quantifying predictive accuracy, missing data imputation, and statistical graphics and reporting. I am a long-time user of R and became a member of the R Foundation by invitation in September, 2015.

+

In August 2014 I was given the WJ Dixon Award for Excellence in Statistical Consulting by the American Statistical Association. Among many other things, Dr Dixon was the lead developer of the first general-purpose statistical software package, BMD.

+

My blog Statistical Thinking is here.

+",2011-04-20 12:59:07.093 +1412,58121,DWin,"Alameda, CA",,,"

Frustrated about the SO environment. Suggestions for change on Meta are met with hostile response. Taking a ""time out"".

+",2010-11-24 23:59:25.473 +22887,3465661,Theresa,,,,,2013-10-20 23:10:34.503 +22921,273235,dino,"Chicago, IL",,http://twitter.com/deanmalmgren,,2013-10-22 01:57:27.253 +15624,1225164,John,South Dakota,,,,2013-03-10 02:07:53.280 +22262,3283317,user2763361,,,,,2013-10-02 01:33:22.423 +6728,1159395,Paul,149.6 million km from the sun,,,"

Senior Aerospace Engineer

+ +

Specialties:
+Multiphysics Modeling and Simulation
+Heterogeneous Multiscale Methods
+Computational Orbital Mechanics
+Computational Electromagnetics
+Computational Solid Mechanics
+Computational Fluid Dynamics
+Computational Heat Transfer
+Finite Difference Methods
+Computational Robotics
+Finite Element Methods
+Finite Volume Methods
+Numerical Analysis
+Calculus Jokes
+Analogies

+ +

Education:
+PhD. Computational Science
+M.S. Computational Science
+M.S. Applied Mathematics
+B.S. Mathematics and Spanish

+ +

Past:
+Computational Scientist
+Thermal Engineer
+NASA Intern
+NSF Fellow
+Lecturer.

+",2012-02-09 02:44:07.803 +4582,942727,Giorgio Spedicato,"Milan, Metropolitan City of Milan, Italy",,,,2011-09-29 06:34:53.627 +17196,1123190,astroboy,,,,,2013-04-24 01:40:16.147 +7483,139304,Danica,"Vancouver, BC, Canada",,https://djsutherland.ml,"

Machine learning faculty at UBC Computer Science.

+",2012-03-19 19:49:40.507 +23079,557222,Benjy Kessler,Israel,,http://none,"

C++ programmer

+",2013-10-27 22:36:55.950 +22943,3472142,user31801,,,,,2013-10-22 19:15:36.897 +23428,26834,Remy,"New York, NY",,http://www.remyoukaour.com,"

My GitHub: https://github.com/roukaour/

+",2013-11-06 10:59:14.740 +23510,1081265,charmoniumQ,/home/bedroom/bed,,https://samgrayson.me,,2013-11-08 04:30:52.200 +4779,268005,JKP,"Santa Fe, New Mexico, United States",,,"

Long time SPSS designer, developer, statistician and strategy person, now retired but still active. Python fan. R user. Serious recreational cyclist

+",2011-10-11 23:55:07.313 +23257,2415655,Ana,"New York, NY, United States",,http://enemygatedown.com/,"

Former Stack Exchange Community Manager.

+

You may know me from projects like:

+ +

I've also poked at a lot of my team's back office-y systems and spent a fair amount of time hanging around the jobs tag on MSO.

+

Beyond that...

+

I'm a hobbyist programmer who loves thinking about how systems talk to each other. I've dabbled with technologies like JavaScript/Node.js, C, Python, Rails, and SQL and have developed a real love for UNIX environments. In the near future, I want to learn more about networking. I'm an avid consumer of science fiction, with a special soft spot for all things cyberpunk. Oh, and I just recently began reading comics. Better late than never, right?

+",2013-11-01 15:39:22.770 +1040,212165,drury,,,,,2010-10-02 22:26:17.753 +23167,463810,jrd1,,,,,2013-10-30 06:30:44.287 +23348,3528635,Nathan Calverley,"Madison, WI",,http://www.badgerstats.com/r-programming/,"

My name is Nathan Calverley, and I run the Wisconsin-based consulting firm BadgerStats. We provide expert training in R programming skills at competitive hourly rates. We also provide expert statistical consulting for businesses, government agencies, and college students. Visit our page to learn more!

+",2013-11-04 21:04:23.047 +628,65909,claws,,,,,2010-08-10 06:40:12.533 +22766,2410198,user2105469,,,,,2013-10-16 18:44:55.080 +22869,3463422,Begga,,,,,2013-10-20 09:43:47.800 +14436,1772922,Dzung Nguyen,,,,"

Love hacking!!

+",2013-01-31 17:42:43.177 +22844,3453740,user2892710,,,,,2013-10-18 20:28:07.700 +20939,2536272,Shawn Wang,Virginia,,https://sites.google.com/a/virginia.edu/yw5aj/,"

I am a researcher in haptics / biomechanics / computational neuroscience.

+",2013-08-20 02:03:14.837 +9804,1517600,CharlesM,,,,,2012-07-20 05:34:32.613 +23229,3281630,Housen,,,,,2013-10-31 20:59:20.817 +22827,1407730,mesut,Istanbul,,http://mesuttalebi.blogspot.com,,2013-10-18 10:26:37.993 +3993,479776,Jake Westfall,"Austin, TX",,https://jakewestfall.org,,2011-08-14 21:52:19.277 +22960,3477586,xing,,,,,2013-10-23 08:25:37.997 +9129,1454482,daniellopez46,California,,,"

I have several years experience in business and analysis, using tools such as Excel, Access, SQL, as well as BI tools. I'm learning predictive modeling and how to use R. I begin about a year ago with both. My goals are to more accurately predict who will retire or voluntarily resign from my organization. I am interested in both accuracy at the aggrregate level (total number of retirements or resignations by various demographic attrributes such as department and job type) and at the individual level (probability that a person will retire or resign).

+",2012-06-12 15:24:37.730 +23101,2392963,Gianluca,"New York, USA",,https://iamgianluca.github.io/blog,,2013-10-28 14:07:31.090 +22811,956350,p.s.,,,,,2013-10-17 23:26:39.263 +22982,3480338,A.Pentz,,,,,2013-10-23 17:52:00.727 +23315,2251287,Mike J,,,,,2013-11-03 21:25:56.293 +35937,4857521,Learner,,,,"

In God we trust. All others must bring data ! - Edwards Deming

+",2014-08-07 11:11:14.270 +23031,3489411,Stijn Vermeeren,,,,,2013-10-25 10:05:55.513 +6805,1243097,user1205901 - Слава Україні,,,,,2012-02-13 02:09:08.377 +20981,1244148,david25272,,,,,2013-08-21 05:48:47.167 +14253,1958030,LF12,,,,,2013-01-25 20:29:03.740 +3999,465049,Fomite,United States,,http://www.confounding.net,"

Infectious disease epidemiologist specializing in the intersection between mathematical models of disease transmission and observational methods. Crafter of artisanal simulation models for the discerning scientist. Oddly fond of enteric pathogens.

+ +

A fair hand at SAS, Python and R

+ +

@GermsAndNumbers

+",2011-08-15 09:24:34.677 +23328,3136905,user89073,,,,,2013-11-04 08:26:40.967 +13202,207107,user,,,,,2012-12-15 12:10:55.210 +22752,2012620,rbm,,,,,2013-10-16 10:35:32.913 +23129,2390545,Nick Stauner,"Cleveland, OH",,http://www.linkedin.com/in/nickstauner,"

Personality & social psychologist

+ +

Research interests:

+ +
    +
  • Existential psychology (meaning/purpose in life, uncertainty, existential problems and attitudes)
  • +
  • Positive psychology (psychological well-being, life satisfaction, affect)
  • +
  • Motivation (goals, values, motives, intrinsic & extrinsic motivation, attainment, support & conflict)
  • +
  • Spirituality (afterlife belief, religion, universalism)
  • +
  • Traits (the ""Big Five"", spirituality, intelligence, needs for cognition or closure, conservatism)
  • +
  • Survey methods (construction, validation, bias identification & reduction)
  • +
  • Statistics

    + +
      +
    1. Latent factor structure
    2. +
    3. Structural equation modeling
    4. +
    5. Effect size estimation
    6. +
    7. Longitudinal change
    8. +
    9. Causality
    10. +
    11. Nonparametric & robust analysis
    12. +
    13. Resampling methods
    14. +
  • +
  • Programming (R, function writing, simulation testing)
  • +
+ +

Publications:

+ + + +

Posters and presentations (PowerPoints available on my SlideShare page)

+ +
    +
  • Me & Ozer, D. J. (2012). Matching goals to values: Correlations follow semantic similarities. 92nd WPA, San Francisco.
  • +
  • Me. (2012). Existential and psychological health as products of intrinsic goal attainment. Presentation, April 19, UCR.
  • +
  • Me, Selvam, T., Cheong, R., & Ozer. (2011). Religious differences in the value systems of meaningful (and meaningless) lives. 2nd ARP, Riverside.
  • +
  • Me & Ozer. (2011). Joint factors of spirituality and religiousness. 91st WPA, Los Angeles.
  • +
  • Me & Ozer. (2011). Spiritual predictors of the search for meaning in life. 12th SPSP, San Antonio.
  • +
  • Me. (2010). Current research in existential psychology. Presentation, November 4, UCR.
  • +
  • Me, Boudreaux, M. J., & Ozer. (2010). Factor structure of the Values Q-Set. 118th APA, San Diego.
  • +
  • Me & Ozer. (2010). The motive content of meaningful (and meaningless) lives. 15th EAPP, Brno, Czech Republic.
  • +
  • Me, Stimson, T. S., & Boudreaux. (2010). The curve of the quest for a more meaningful life. 11th SPSP, Las Vegas.
  • +
  • Me. (2010). The Values Q-Set. Presentation, January 21, UCR.
  • +
  • Me, Stimson, Boudreaux, & Ozer. (2009). When do personality traits predict personal goals? 1st ARP, Evanston.
  • +
  • Me. (2009). The factor structure of personal goals. Presentation, June 4, UCR.
  • +
  • Me, Stimson, & Ozer. (2009). The factor structure of personal goals in an undergraduate population. 10th SPSP, Tampa.
  • +
+",2013-10-29 12:42:40.810 +1805,139823,Zach,"Boston, MA, United States",,https://www.datacamp.com/courses/advanced-deep-learning-with-keras-in-python,"

Interested in Data Science?? I currently teach 2 online classes through DataCamp.

+ +

Check them out to learn more:
+Advanced Deep Learning with Keras in Python
+The Machine Learning Toolbox - R

+",2011-01-18 02:04:11.150 +22680,1520767,xealits,,,,,2013-10-14 16:39:06.857 +23141,3508261,michec,,,,,2013-10-29 17:33:11.913 +169,225647,Chris Beeley,"Nottingham, United Kingdom",,http://chrisbeeley.net,"

Twitter @ChrisBeeley

+",2010-07-20 06:31:18.270 +23374,2439912,ltux,China,,,,2013-11-05 13:00:17.977 +23235,3521370,user32165,,,,,2013-11-01 01:27:22.320 +22372,3394232,carlosedubarreto,Brazil,,,,2013-10-04 22:38:56.543 +23396,2592699,Nick Allen,,,,,2013-11-05 19:19:39.893 +22837,3222172,jclouse,"Portland, OR, USA",,http://NA,"

Data science leader and practitioner working in healthcare, retail, and non-profit sectors.

+",2013-10-18 15:56:57.403 +23030,3419292,Bart,,,,,2013-10-25 09:46:56.070 +22888,919719,Pavel Miron,"Barcelona, Spain",,,,2013-10-20 23:48:17.110 +22732,3442508,Cromulus,,,,,2013-10-15 20:32:27.470 +22972,3479255,Martin,,,,,2013-10-23 14:10:58.907 +23472,1072112,alexwlchan,"Cambridge, United Kingdom",,https://alexwlchan.net,"

Dormant user.

+ +

I used to write a lot of answers on SFF.SE in the harry-potter tag, but I don’t write much new content these days.

+",2013-11-07 12:33:22.890 +23227,2379815,Mike John,,,,,2013-10-31 19:14:29.873 +21896,2658040,Vincent,,,,"

PhD in Representation theory of Lie groups (2009), current job in medical statistics.

+",2013-09-20 11:14:54.407 +15870,2504039,Jingjings,,,,,2013-03-17 00:57:19.023 +23064,2371425,user62423,,,,,2013-10-26 20:05:52.107 +19125,1462786,Jens Kouros,Germany,,,"

I got my first degree as a musician (jazz guitar) and the returned to school as a psychology major, where I developed a strong interest in statistics and programming. Currently I am a Ph.D. student in computational sociology. In my thesis I am exploring probabilistic programming to develop simulation models in social science.

+",2013-06-20 07:34:47.003 +22720,171584,MARK,"Seattle, WA",,,"

Computer Software Engineer

+",2013-10-15 17:10:46.237 +11072,326883,erogol,"Berlin, Germany",,http://www.erogol.com,"

Mozilla TTS - https://github.com/mozilla/TTS

+",2012-09-22 18:20:30.147 +23297,3311833,puretppc,Canada,,https://www.facebook.com/puretppc,"

I'm learning how to program C# right now. I have done programming for 3 years before and this is currently my 4th year and language.

+ +

Languages I've tried but didn't do well before:

+ +
    +
  • Turing
  • +
  • Python
  • +
  • Java
  • +
+ +

My Roles on Stack Overflow:

+ +
    +
  • Helping new users with site features
  • +
  • Editing posts
  • +
  • Asking questions regarding my code problems
  • +
  • Answering questions (rarely)
  • +
  • Reviewing (*new)
  • +
+ +

Cool Achievements

+ + +",2013-11-02 22:16:54.107 +22910,3469299,BonScott,,,,,2013-10-21 17:02:30.750 +10756,1355012,abhinavkulkarni,San Francisco,,,,2012-09-07 18:51:51.533 +9245,147680,alex,,,,,2012-06-19 23:04:53.973 +22610,2208326,Brian Levey,"Miami Beach, FL, USA",,,,2013-10-11 14:56:11.510 +23441,548388,Michael Greinecker,"Paris, France",,https://www.michaelgreinecker.com/,,2013-11-06 18:47:36.233 +15782,2492453,user21988,,,,,2013-03-14 11:50:51.803 +23311,1865149,Spartan,,,,,2013-11-03 19:28:04.490 +2802,509924,user4422,Earth,,,"

Economist, statistician, technology enthusiast.

+",2011-05-01 12:59:37.310 +227,5734,Justin Bozonier,United States,,http://justinbozonier.posterous.com,"I've been a software engineer for a few years. I love what I do and I'm always trying to learn more. Advice and critical discussions are welcome! +
+Currently studying discrete math, multivariable calculus, algorithm design and analysis, and data structures (mainly graphs). I am also researching what I think my future field will be. I'm leaning towards soft computing/machine learning currently. Right now it's academia, but most things start out that way. I think it will be huge when enough people completely grok it.",2010-07-23 15:29:27.743 +22696,3438366,stophammertime,,,,,2013-10-15 01:45:34.960 +23289,2415162,Bran,,,,,2013-11-02 17:56:18.913 +23078,3498586,Kelly,,,,,2013-10-27 20:16:54.170 +22563,1689766,user1552372,,,,,2013-10-10 13:28:02.073 +22507,3413008,user31264,,,,,2013-10-09 11:32:54.857 +23261,3524536,user32194,,,,,2013-11-01 18:34:22.570 +22929,2923612,analyzethat,Netherlands,,http://www.analyzethat.nl,,2013-10-22 09:04:57.570 +19298,2206449,sk8asd123,Chicago,,,,2013-06-25 20:36:59.393 +17635,2730325,Carlo Lazzaro,Italy,,,"

I am a health economist interested in statistics related to my research field (economic evaluation of health care programmes).

+",2013-05-06 10:20:37.820 +6162,1109341,Stéphane Laurent,,,https://laustep.github.io/stlahblog/,,2012-01-08 10:04:43.493 +14806,2234756,Shakesbeery,,,,"

I am a graduate student in linguistics focusing on phonetics and possibly computational linguistics in the future.

+",2013-02-12 22:05:11.960 +22688,3437671,cla4study,"São Paulo - State of São Paulo, Brazil",,,"

Ph.D. student in Economics at Insper Instituto de Ensino e Pesquisa (Teaching and Research Institute): love to study, love to research, this is all I do now!! :-)

+",2013-10-14 21:10:00.537 +18358,1305683,Vincent Labatut,"Avignon, France",,,,2013-05-27 19:16:57.693 +1085,223480,RockScience,,,,,2010-10-13 06:37:12.123 +23082,3301407,teaLeef,,,,,2013-10-28 01:21:38.540 +22425,2035078,Daniel Wonglee,,,,,2013-10-07 03:36:32.333 +21905,3331748,user163,,,,,2013-09-20 19:43:04.530 +2161,301955,trev,,,,,2011-02-24 15:00:49.200 +23029,1318252,Luca Bertinetto,Torino,,,"

Computer Vision researcher | Curious about machines, humans and everything in between.

+",2013-10-25 09:39:06.843 +22650,3409387,wesley,,,,,2013-10-13 11:45:50.007 +46110,511595,MichaelChirico,"San Francisco, CA, USA",,http://michaelchirico.github.io/,"

R, Python, Spark, Presto, Julia, QGIS, SAGA, STATA, MATLAB neophyte.

+

http://michaelchirico.github.io/

+",2015-02-19 22:47:35.533 +5448,1058987,jbowman,"Berkeley, CA, USA",,http://N/A,"

I am currently a Director of Data Science at Walmart Labs, working on supply chain problems such as demand forecasting and inventory replenishment. Prior to this, I worked for the now-defunct solar power company SolFocus, which built large photovoltaic arrays in various remote places for electricity generation. My main responsibility was forecasting how much energy we'd get out of a particular site. And reliability work too, of course. Before that I worked for a number of years in Hewlett-Packard's Strategic Planning and Modeling (SPaM) group, an industry award-winning internal, and occasionally external, consulting organization, and as an independent consultant.

+",2011-11-23 02:34:06.680 +9030,1521312,Jane Wayne,,,,,2012-06-07 18:10:47.537 +23151,3508950,user32062,,,,,2013-10-29 20:17:18.353 +22857,52667,kenn,,,https://dumper.io/,,2013-10-19 15:44:48.373 +19395,2982238,asdir,"Paris, France",,,"

Environment/development/energy/trade economist turned applied econometrician.

+",2013-06-28 13:08:03.663 +1322,144321,Jeff,,,,,2010-11-14 02:49:02.200 +23066,3470312,mrphippen,,,,,2013-10-27 04:17:29.270 +23423,450777,Kash,India,,,"

Programmer by profession.

+",2013-11-06 09:08:44.397 +22712,158604,kobejohn,"Kobe, Japan",,,,2013-10-15 13:49:10.597 +11775,1586762,wh0,,,,,2012-10-22 13:17:16.410 +1985,112999,user3125,,,,,2011-02-09 20:18:53.797 +21833,3319915,Pravesh Parekh,"Bangalore, India",,http://requiem-for-a-lost-soul.blogspot.com,"

Working in the field of computational neuroscience, a biotechnologist by training, I am interested in literature, into fiction writing, a theatre enthusiast, particularly interested in direction and script writing, an avid gamer, love "playing around" on my laptop, and a photographer.

+",2013-09-18 12:05:41.463 +13427,2182734,Stef van Buuren,Netherlands,,http://www.stefvanbuuren.nl,"

Over 25 years of experience in data analysis, development of statistical methods on missing data and child growth.

+",2012-12-27 10:33:34.907 +11489,1754821,January,Europe,,http://logfc.wordpress.com,"

My first computer

+ +

My first Linux

+",2012-10-09 21:17:46.280 +22858,2421118,user2114036,,,,,2013-10-19 15:45:33.163 +22765,3447161,A. Reza Khosravi,Texas,,,,2013-10-16 18:44:27.323 +7229,1304337,nadya,,,,"

Nadiia Basos. MSc in Geomatics. BSc in Environmental sciences. Interested in web map development, interpolations and modelling (from wildlife species distribution to hydrodynamics).

+",2012-03-07 01:40:48.383 +23467,431693,rnjai,Mexico,,https://rnjai.com,,2013-11-07 10:31:14.077 +13918,2248598,learnOR,,,,,2013-01-15 18:58:28.887 +23258,3524039,Jason,"Houston, TX",,http://dataprediction.blogspot.com/,"

I'm a data analyst with a PhD in Economics currently working for the consulting firm AlixPartners.

+",2013-11-01 16:33:15.700 +11117,1873485,beuhbbb,,,,"

Happy to learn, happy to help

+",2012-09-24 17:50:37.940 +23108,3502362,lovekesh,"New Delhi, India",,,,2013-10-28 16:08:53.440 +22626,22022,Blaisorblade,"Berlin, Germany",,http://www.informatik.uni-marburg.de/~pgiarrusso,"

Researcher in functional (and object-oriented) programming languages and their theory, especially Scala. Currently working in industry on machine-checked proofs of functional correctness for concurrent imperative programs.

+

Ex Linux kernel programmer, Scala/Java/Haskell/C/C++/Java developer.

+",2013-10-12 01:31:43.297 +22762,2885599,Rom,,,,,2013-10-16 15:49:20.717 +16039,1604330,GeorgeWilson,Singapore,,,,2013-03-20 21:14:06.027 +1124,115118,Alex Holcombe,"Sydney, Australia",,http://alexholcombe.wordpress.com/,"

@ceptional

linkedin

+",2010-10-20 09:50:40.767 +23389,3539809,Bart,,,,,2013-11-05 16:51:39.300 +22899,3467730,Kirsty,,,,,2013-10-21 11:16:10.237 +22848,2811229,Gilles San Martin,Belgium,,,,2013-10-18 22:25:43.353 +23532,3413362,sdevlin,"Montreal, Canada",,,"

Undergraduate student in Math and Computer Science at McGill University in Montreal, Quebec.

+",2013-11-08 19:23:37.687 +1717,209234,r_31415,,,,,2011-01-06 19:24:44.620 +23534,3556682,user32528,,,,,2013-11-08 19:34:53.347 +22651,3431950,Lawrence,,,,,2013-10-13 13:09:09.757 +21918,2305450,avocado,,,,,2013-09-21 10:01:34.170 +22731,113178,Valentin H,,,,,2013-10-15 20:21:08.253 +23099,63218,xkrz,Canada,,,"

A Stackflow-er :)

+",2013-10-28 13:56:36.277 +22723,3442007,thomeroqir,,,,,2013-10-15 18:20:33.817 +18416,2712076,user75402,,,,,2013-05-28 22:49:47.037 +23177,1187953,Maxim Khesin,,,,,2013-10-30 16:22:13.387 +8361,1453919,BGreene,"Dublin, Ireland",,,"

Industrial researcher with expertise in biomedical signal processing and pattern recognition.

+",2012-05-02 13:06:11.257 +23052,1869930,Dmytro Savochkin,"Lviv, Lviv Oblast, Ukraine",,,,2013-10-26 00:01:06.250 +23046,3491484,texastoast7,,,,,2013-10-25 17:58:52.327 +786,178774,Vass,USA,,,"

This is the third phase of my StackOverflow/Stackexchange experience.

+ +

Maths/Stats and the IT means to do it! (does AI fit into it or does it revolve around it? or... do they revolve around I?)

+",2010-08-26 17:19:15.257 +17670,303118,tmakino,,,,,2013-05-07 14:20:48.663 +22475,3407924,sara,,,,,2013-10-08 12:31:47.380 +22549,3417257,user99980,,,,,2013-10-10 06:33:41.573 +22939,3474345,toolpool,,,,,2013-10-22 16:13:48.050 +1945,275327,Josh,,,,,2011-02-04 21:33:28.620 +7275,1311272,snape,,,,"

I am not a statistician but I am interested in learning Statistics.

+",2012-03-09 11:55:54.627 +23420,2411131,Shaowei Ling,"Nanjing, China",,,"

I am a postgraduate of Southeast University in Nanjing, China.

+",2013-11-06 07:52:31.957 +21119,2631368,sachinruk,"Sydney NSW, Australia",,https://sachinruk.github.io,"
    +
  • PhD in Bayesian Machine Learning.
  • +
  • Obsessed with DL.
  • +
  • Currently dipping toes in Reinforcement Learning.
  • +
+",2013-08-26 01:41:57.983 +20828,3151620,Mayou,,,,"

Quant at an investment management firm.

+",2013-08-15 13:23:28.443 +23497,3552125,Vivian,,,,,2013-11-07 21:51:39.410 +18998,1680541,Xander,,,,,2013-06-16 14:27:38.900 +17580,2722398,acbart,,,,,2013-05-03 21:05:29.093 +9063,942536,Cam.Davidson.Pilon,"Waterloo, ON, Canada",,http://www.dataorigami.net,"

Blog at DataOrigami and ControlledMold. CEO at Pioreactor.

+",2012-06-08 18:05:01.960 +19822,3035457,Bijoy,,,,,2013-07-11 20:57:03.617 +22618,29283,taserian,South Carolina,,http://im-strange.blogspot.com/,"Mathematics, logic and outright outlandishness.",2013-10-11 19:21:41.103 +37646,3412739,chri5,,,,,2014-09-14 08:15:02.660 +21182,3207579,Avraham,New York Metropolitan Area,,http://www.avrahamadler.com,"

US-based non-life actuary and R programmer. I maintain some packages on CRAN including Delaporte, lamW, Pade, revss, and minimaxApprox.

+

$argon2id$v=19$m=64,t=512,p=2$UHLafHyFcVlL31U8ykB66A$fbQ/Uj9TR5qHtliLnvsBlA

+",2013-08-28 06:01:41.090 +12495,1979338,Rafael,"Hobart, Australia",,,"

I'm an eternal stat-R apprentice.

+",2012-11-20 04:55:17.293 +23122,1253573,user1213492,,,,,2013-10-29 06:44:10.490 +22633,2195866,Gilles Pilon,"Naperville, IL, USA",,,"

Professional engineer. Data scientist. Python developer. Lean Six Sigma Master Black Belt.

+",2013-10-12 12:31:22.350 +1542,509609,Jen,,,,,2010-12-13 23:39:26.947 +23363,3537094,Ben,,,,,2013-11-05 05:37:44.340 +23476,3550060,user32454,,,,,2013-11-07 14:15:45.333 +23176,3372585,Slyron,,,,,2013-10-30 15:37:51.767 +23075,3497904,user2925487,,,,,2013-10-27 17:00:57.470 +21885,2998457,Julian Karch,"Berlin, Germany",,,,2013-09-19 22:12:29.957 +15183,2412204,fredrikhs,Southern Norway,,,"

MSc. in Economics. Interested in programming and data analysis, particulary time series analysis.

+",2013-02-25 11:17:06.007 +22801,3451915,user31621,,,,,2013-10-17 16:35:03.233 +23006,1731307,oveoyas,,,,,2013-10-24 08:00:21.000 +22791,2321257,statusfailed,,,,,2013-10-17 10:12:44.497 +11440,1705277,sashkello,"Sydney, Australia",,,"

Laß die Zeit an dir ablaufen wie Wasser

+",2012-10-08 07:23:31.760 +22774,78417,Garret Smith,,,,,2013-10-16 20:31:45.560 +23434,2771810,Glob Cropper,,,,,2013-11-06 15:10:33.623 +23371,3418978,Dr. Stevil,,,,,2013-11-05 11:42:42.050 +5086,948131,Tyler Rinker,"Buffalo, NY",,http://trinkerrstuff.wordpress.com/,"

I’m a husband and father of 2 girls. I lead data science @ Kangarootime, #Python & #rstats enthusiast, #dataviz geek, and #nlp buff.

+",2011-10-30 13:20:23.900 +22916,3470047,Dr.Sheldon Cooper,"Pasadena, California",,,"
    +
  • Theoretical physicist
  • +
+",2013-10-21 20:15:05.410 +23473,3549654,Yuri,"Kyiv, місто Київ, Україна",,https://www.yburger.com,"

Head of AI Department at ZoralLabs. Key interests are: artificial intelligence, machine learning, data mining, distributed computing, big data... My personal website: https://www.yburger.com

+",2013-11-07 12:41:24.940 +22824,3455388,student,,,,,2013-10-18 09:21:02.257 +1636,103559,leonbloy,"Buenos Aires, Argentina",,,"

Hernan J. Gonzalez

+

Buenos Aires, Argentina

+

PNGJ: https://github.com/leonbloy/pngj/

+",2010-12-28 14:55:22.660 +22867,3462564,mixtureModel,,,,,2013-10-20 02:11:50.450 +23513,142194,Quester,,,,,2013-11-08 06:50:14.660 +23484,3551139,Clementina,,,,,2013-11-07 17:44:32.547 +23386,3539376,Paul,Vienna,,,"

aquila non capit muscam!

+",2013-11-05 15:24:41.793 +22783,3449012,phantason,,,,,2013-10-17 05:20:18.950 +23422,1765444,vvasch,,,,,2013-11-06 08:41:23.063 +22678,1033471,mojovski,,,,,2013-10-14 14:34:18.813 +19547,2931115,SKY,,,,,2013-07-03 17:49:46.973 +19559,2576067,Sol,,,,"

Psycholinguistics researcher

+",2013-07-04 01:55:07.513 +8074,1235662,Marc in the box,,,http://menugget.blogspot.com/,,2012-04-17 14:34:53.120 +22946,2473468,AlbertusW,,,,,2013-10-22 21:06:21.953 +20795,3171977,forecaster,,,,,2013-08-14 01:31:10.303 +23446,2288031,working4coins,,,,,2013-11-06 20:06:00.680 +12282,2036221,Pat,,,,,2012-11-12 15:30:48.933 +4854,974824,DartPrivateer,,,,,2011-10-15 02:49:01.160 +11353,60844,Assad Ebrahim,"London, England United Kingdom",,http://www.mathscitech.org/articles,,2012-10-04 10:41:03.333 +23397,3540471,user32359,,,,,2013-11-05 19:20:36.173 +23132,3507188,user32039,,,,,2013-10-29 14:06:15.503 +3894,510162,hr0nix,,,,,2011-08-05 09:31:14.967 +22874,3464199,user31712,,,,,2013-10-20 14:37:54.647 +14799,2358692,Ray Koopman,,,,,2013-02-12 19:05:13.730 +23244,257657,BigChief,,,,,2013-11-01 11:22:32.527 +23026,1555884,Yu Yang,,,,,2013-10-25 05:11:24.337 +22617,3263034,Tom Palmer,,,,,2013-10-11 19:12:00.953 +22907,3468722,hans,,,,,2013-10-21 14:58:19.417 +20097,3071766,user2602256,,,,,2013-07-20 13:57:31.517 +16504,485269,xbsd,,,,,2013-04-04 01:48:09.553 +23130,473684,Vincent,"Laboratoire d'Annecy de Physique des Particules, France",,https://www.linkedin.com/in/vincent-reverdy,"

Researcher, astrophysicist, computer scientist, programming language expert, software architect and C++ standardization committee member.

+

LinkedIn: https://www.linkedin.com/in/vincent-reverdy

+",2013-10-29 12:59:01.220 +15025,511627,Lepidopterist,,,,,2013-02-20 03:58:39.913 +24121,3625299,Fabian,,,,,2013-11-24 23:03:15.827 +22769,3447384,Rob F,,,,,2013-10-16 19:42:42.730 +22695,3438289,user31497,,,,,2013-10-15 01:09:29.977 +2081,465047,ttnphns,"Moscow, Russia",,https://www.spsstools.net/en/KO-spssmacros,"

A SPSS user (and developer, not affiliated with), data analyst.

+",2011-02-16 19:33:34.657 +20773,3123897,Ellis Valentiner,,,,,2013-08-13 16:12:25.523 +23127,3506042,sign,,,,,2013-10-29 10:03:02.447 +22836,3160162,EPSILONsdfsdfdsf,,,,"

Please delete my account.

+",2013-10-18 15:05:22.563 +23188,3443278,user2884661,,,,,2013-10-30 20:39:55.270 +5643,161488,Robert Kubrick,,,,,2011-12-04 20:39:43.713 +23076,3085199,Ping Jin,,,,,2013-10-27 18:49:47.733 +22981,3001934,user2547279,,,,,2013-10-23 17:31:11.980 +22872,3463784,CuriousCat,,,,"

I always struggled with Statistics in school.

+ +

I'm here to see if I can do better now - fifteen years later.

+ +

Some of my questions might seem lame.

+",2013-10-20 12:05:51.743 +23442,3546054,oren,,,,,2013-11-06 19:25:42.607 +27332,1143891,whenov,,,,,2014-02-12 05:37:18.303 +23540,160509,Jakub Roztocil,,,https://httpie.io/,"

Software engineer. Creator of HTTPie & co.

+",2013-11-08 21:07:47.077 +22675,934262,vvavepacket,"Miami, FL",,,"

so curious with number theory

+",2013-10-14 11:17:25.780 +22862,3461349,user31696,,,,,2013-10-19 17:34:26.353 +23524,3555271,Sandra,,,,,2013-11-08 14:00:25.287 +23163,2511051,datddd,,,,,2013-10-30 04:25:38.927 +5821,435185,AdamO,Nakoja Abad,,,"

"The trappings of luxury cannot save you from the nail-biting boredom of repetitive brain injury."

+",2011-12-14 21:46:36.197 +22817,3453959,user31641,,,,,2013-10-18 02:39:46.243 +23239,390042,nvcnvn,"Ho Chi Minh City, Vietnam",,http://nguyen.open-vn.org,"

I call myself an old-style software engineer, always trying to solve the problem simple as possible. Building strong backend for performance and scalability, adapting computer science in everyday tasks is what I looking for in career path.

+",2013-11-01 04:03:15.543 +15120,155034,SKM,"Singapore, Singapore",,,,2013-02-22 17:12:53.983 +22608,1912945,Girishkumar,"Mumbai, India",,http://www.isical.ac.in/~mtc1113/,,2013-10-11 14:28:01.963 +7007,1273790,Zen,"São Paulo, Brazil",,,"

Paulo C. Marques F.

+

BA in Physics. PhD in Mathematical Statistics.

+

Interested in Machine Learning, Causal Inference, Bayesian Statistics, Statistical Theory, and stochastic composition of string quartets.

+

Teaches Statistics, Machine Learning, Predictive Modeling, Python, R and Data Science at Insper.

+
+

"Artificial Intelligence is no match for Natural Stupidity." (Anonymous)

+",2012-02-23 21:59:40.227 +20312,2278287,Barb90,,,,,2013-07-27 12:53:36.160 +23021,3486685,user31894,,,,,2013-10-24 20:05:57.813 +23004,211135,Alberto Fernández,"Barcelona, Spain",,http://albertofem.com,"

Programmer.

+",2013-10-24 07:24:30.847 +4831,255542,jseabold,,,,"

Statsmodels developer
+Github
+Twitter

+",2011-10-13 16:49:15.740 +22784,3403506,Shaxi Liver,,,,,2013-10-17 08:02:34.503 +23220,3519482,user32145,,,,,2013-10-31 15:49:57.877 +22913,2402243,Tom,,,,,2013-10-21 17:59:10.943 +4656,900727,Dilip Sarwate,"Urbana, IL",,,"

I am a retired professor of electrical and computer engineering with a lifetime of experience teaching probability and statistics to reluctant engineering undergraduates.

+",2011-10-03 16:59:55.347 +23542,1687146,LostPhysx,Germany,,https://areinhardt.eu/,,2013-11-08 22:17:21.377 +9749,1663503,Robert Jones,,,,"

merge keep - done on 21Jan

+",2012-07-17 09:53:51.480 +14919,2376283,Fuca26,,,,"

Doctorate in economics, first-gen. Love traveling, sports, chess, readings, and languages. +My research focuses on labor economics; it also embraces different aspects of applied microeconomics: housing, sport, behavioral, and education economics.

+

When I do not do research I am a middle/long-distance runner. +I recently converted to doggies worshipping.

+

I was a waiter, then a statistician wannabe, and finally a prof in economics.

+",2013-02-17 00:35:13.563 +399,509491,Thylacoleo,,,,,2010-07-28 00:56:47.640 +22716,3441318,thaq,,,,,2013-10-15 15:40:03.783 +22909,1718644,kotoll,,,,,2013-10-21 15:42:32.237 +22548,3417191,user31315,,,,,2013-10-10 06:20:04.270 +22823,1401141,bognick,"Athens, Greece",,,,2013-10-18 09:16:24.410 +23202,242979,campeterson,"Utah, United States",,http://campeterson.com,,2013-10-31 06:53:45.650 +22691,3435348,ali nouri,,,,,2013-10-14 21:49:07.817 +11283,973679,zima,,,,,2012-10-02 10:45:07.113 +9081,215207,kjetil b halvorsen,Tellus,,,"

Mathematical statistician (the image is my grandfather in 1927, just returned to Tromsø after wintering at Jan Mayen)

+

For contact, my email:

+
+
<first name><My public seed> at gmail dot com
+
+
+

Below a list of some questions I have responded to, where I consider my response interesting:

+

how-does-saddlepoint-approximation-work

+

what is the intuition behind svd

+

goodness of fit and which model to choose linear regression or poisson

+

Estimating parameters for Binomial with both $n$ and $p$ unknown

+

what is the difference between finite and infinite variance

+

general sum of gamma distributions

+

why should we use t errors instead of normal errors

+

maximum likelihood estimation mle in layman terms

+

difference of two iid lognormal random variables

+

non transitivity of correlation correlations between gender and brain size

+

Intuition on the Kullback-Leibler divergence

+

Taleb and the black swan

+

Correlations between continuous and categorical nominal variables

+

Why does logistic regression become unstable when classes are well separated

+",2012-06-09 22:52:37.473 +22555,3382596,AsymLabs,Great Britain,,http://asymlabs.org,"

AsymLabs (TM) is a brand name of Applied Numerics Ltd of Great Britain that is also under license to Resource CUA of Amsterdam. AsymLabs provides numerical analysis tools for civil engineering, construction and related fields that can be used to optimize and accelerate the infrastructure construction process in the emerging economies of the developing world - particularly hot and tropical environments. This leads to lowered construction cost, accelerated construction, and better service performance.

+ +

In the course of our work, we sometimes develop more general software tools or libraries that we donate to the open source community under MIT or GNU GPLv3 licence terms. Most, if not all of these, are for Unix based operating systems, but from time to time they are made compatible with Microsoft Windows. They can be found at our repositories at GitHub.com and BitBucket.org.

+ +

These repositories are being expanded monthly and we invite you to visit them. The primary languages of our work are Bash, C++ and Lisp.

+",2013-10-10 09:03:56.503 +22931,2359715,smrati katiyar,,,,,2013-10-22 12:36:52.613 +22912,3469477,Sabina,,,,,2013-10-21 17:45:28.917 +346,109249,Henrik,"University Of Warwick, Gibbet Hill Road, Coventry, Vereinigtes Königreich",,http://singmann.org,"

Assistent professor of psychology at the University of Warwick, UK.

+ +

My primary programming language is R.
+I am maintainer of R packages afex, MPTinR, and acss.

+ +

Besides R, I use Python (using PsychoPy) and JavaScript for running experiments and occasionally other languages.

+ +

A list of my publications can be found on my homepage (usually with possibility to download the papers, data, and analysis scripts).

+",2010-07-27 10:01:37.337 +23088,3144687,Jie Wei,Huazhong University of Science and Technology,,,"

Assistant Professor, School of Economics, Huazhong University of Science and Technology

+",2013-10-28 04:30:44.510 +16474,2583847,Maarten Buis,"Konstanz, Germany",,http://www.maartenbuis.nl,"

Sociologist and Stata user. I have mainly worked on variations on logistic regression, interactions and mediation. Substantively, I worked mostly on differences in educational attainment between children with different parental backgrounds.

+",2013-04-03 10:24:26.900 +23464,3183849,shad0w_wa1k3r,Dubai - United Arab Emirates,,https://www.ashishnitinpatil.com,"

Husband | Pythonista | Engineer | Co-founder @ MySyara | Certified BlockChain Dev | Dota2 fanatic
+Love problem solving & programming for the same.

+

CryptoThanks - ETH : 0x464512ACAfaf0BEbA4BBDE4B6305E5B9E57d87Fc

+",2013-11-07 05:45:01.453 +22719,1133928,SuppaiKamo,Denmark,,,,2013-10-15 17:02:02.373 +23256,2627386,Jens Jensen,,,,,2013-11-01 15:04:28.380 +23212,1494685,EzzatA,"Graz, Austria",,https://plus.google.com/116823604753279239388,"

TU Graz Computer Science MSc student. Python, Django, and Javascript ninja

+",2013-10-31 13:18:42.803 +22965,140373,Daniel Winterstein,"Edinburgh, United Kingdom",,https://good-loop.com,"

I am co-founder and CTO for Good-Loop.

+

I also have a +personal homepage.

+",2013-10-23 10:59:13.510 +23230,3520791,Matthew C,,,,,2013-10-31 21:33:02.790 +23491,139200,G-wizard,"Montreal, Canada",,,,2013-11-07 20:29:40.943 +23776,3588691,Thijs van der Vaart,,,,,2013-11-15 06:58:00.907 +22974,3479712,John,,,,,2013-10-23 15:38:02.280 +22894,3325382,user2795569,,,,,2013-10-21 04:34:02.463 +23339,3534783,thatnewguy,,,,,2013-11-04 16:48:47.160 +7615,1353035,purple51,,,,,2012-03-26 01:46:00.170 +23306,119676,Agent1891,"Aachen, Germany",,http://behery.github.io,"

I'm just here trying to get things done.

+ +

SOreadytohelp

+ +

Twitter -- @Agent1891

+",2013-11-03 14:58:41.637 +23486,2854695,Manoj Kumar,India,,,,2013-11-07 18:02:57.597 +22725,3442096,Gumby,,,,,2013-10-15 18:43:28.630 +23546,1241901,rajohnson90,Wisconsin,,,"

I'm a graduate student of computer science with an undergraduate in Japanese language.

+",2013-11-08 23:41:24.353 +23226,1073022,user1071847,,,,,2013-10-31 18:55:04.490 +23054,144469,pankaj28843,"Copenhagen, Denmark",,http://psjinx.com,,2013-10-26 01:10:14.443 +23067,2178793,Pattisahusiwa,Indonesia,,,,2013-10-27 06:05:18.207 +23189,3221594,Greg,,,,,2013-10-30 20:46:20.517 +19359,2945859,Bill Bradley,,,,"

I'm a mathematician by training, but currently co-run Mirabolic Consulting, which focuses on machine learning and data science problems.

+",2013-06-27 15:23:33.807 +23461,3547467,JConway,,,,,2013-11-07 02:27:04.507 +15827,2038743,Nick Cox,"Durham, UK",,,"

I work mostly with environmental data, secondarily with social science data. My interests include statistical graphics, exploratory data analysis, generalised linear models, distributions, transformations and directional data analysis. I am an active Stata user, have both contributed to Stata itself and published many additional Stata programs, and have written widely on the use of Stata, especially in the Stata Journal. I have a strong side-interest in the history of statistics.

+",2013-03-15 13:57:54.467 +23013,218648,Tacio Medeiros,,,,,2013-10-24 13:10:35.237 +22542,3416394,Thomas,,,,,2013-10-10 01:51:45.710 +23024,3240947,Robson,"Recife, Brazil",,https://sites.google.com/site/robsondtigre/,,2013-10-25 04:06:52.627 +10684,512335,Flounderer,,,,,2012-09-03 23:44:58.677 +11506,1924428,Samo Jerom,,,,,2012-10-10 14:24:43.837 +23319,3531574,Nathan M,"Alma, Wisconsin, USA",,,,2013-11-03 23:17:55.293 +23295,1481103,Oleg Melnikov,"Bellevue, WA",,http://www.linkedin.com/in/olegmelnikov,"

http://oleg.rice.edu +http://www.linkedin.com/in/olegmelnikov

+",2013-11-02 20:44:48.917 +22575,2620197,Kim,,,,"

Code User

+",2013-10-10 18:05:08.563 +21029,2259709,Drew75,,,,,2013-08-22 14:20:37.097 +23303,3529923,Amy A,,,,,2013-11-03 13:02:01.453 +10594,1796034,tiantianchen,,,,,2012-08-30 08:19:16.923 +23090,3499927,Man Ray,,,,,2013-10-28 05:47:22.693 +22926,3471893,user31777,,,,,2013-10-22 07:08:23.553 +22989,1559121,Iron Savior,"Seattle, WA",,https://github.com/IronSavior,"

DEMACIA!

+",2013-10-23 19:29:55.120 +11210,1886970,Quartz,,,,"

Monte Carlo, risk, QMC, statistical efficiency, high dimensional approximation...

+",2012-09-28 10:29:04.207 +82,64496,jebyrnes,,,,,2010-07-19 19:48:20.350 +19681,2167500,zkurtz,Pittsburgh,,,"

Statistician and consultant.

+",2013-07-08 17:35:17.070 +10372,1369402,HelloL,,,,,2012-08-19 12:00:00.683 +20222,2233497,Steve,,,,,2013-07-24 14:49:52.873 +22795,3450768,cristis,,,,,2013-10-17 12:51:21.873 +22049,2734621,TangoStar,Germany,,,,2013-09-25 11:25:10.927 +22861,26725,Ankur,"Melbourne, Australia",,,"

A junior BA have some experience in the financial services industry. I do programming for my own personal projects hence the questions might sound trivial.

+",2013-10-19 16:59:35.120 +22458,323365,Achint,,,,,2013-10-08 01:07:56.313 +23018,3485578,user31889,,,,,2013-10-24 16:00:26.397 +23218,3519146,Matthew,Singapore,,,,2013-10-31 14:39:35.600 +23413,3541779,user32378,,,,,2013-11-06 02:00:54.243 +23406,2424498,geofflittle,Seattle,,,,2013-11-05 23:05:14.573 +23077,337671,KeithSmith,United States,,http://fernleaf07.blogspot.com/,"

Embedded Software Engineer.

+ +

Eclipse, C, C++, WinCE, eCos, FreeRTOS, Inkjet Printers, ICP-EOS and IR spectrometers

+ +

Learning about Python and JSON

+ +

Familiar with NXP Cortex MCUs

+ +

Amateur astronomer and pteridologist

+",2013-10-27 18:56:55.847 +23462,926622,jrc1000,,,,,2013-11-07 04:03:34.003 +5556,1016273,Innuo,,,http://mlstat.wordpress.com,,2011-11-29 19:33:58.173 +22649,3431547,user31436,,,,,2013-10-13 10:04:32.353 +22968,282375,James,"Sheffield, United Kingdom",,,,2013-10-23 12:31:26.027 +19492,2995702,Copuleros,,,,,2013-07-02 10:58:26.740 +17076,2664142,Patrick,,,,,2013-04-20 21:00:03.237 +1895,329305,cardinal,Not from around here,,http://none,,2011-01-29 21:42:27.863 +15583,1021083,0x90,,,,"

echo \[q\]sa\[ln0=aln256%Pln256/snlbx\]sb3135071790101768542287578439snlbxq|dc

+",2013-03-08 20:27:24.483 +21523,3127210,Raz_Lobo,"El Ejido, Spain",,http://www.mucharuina.com,,2013-09-08 13:22:10.560 +22950,3172304,bravetang8,,,,,2013-10-23 01:03:32.633 +23023,3487363,user103065,,,,,2013-10-24 23:19:38.397 +23333,3533294,Fldm,,,,,2013-11-04 11:28:40.793 +23281,3508040,Daniel Gotthardt,,,,,2013-11-02 12:24:29.447 +20456,792968,Andrea,,,,,2013-08-01 19:10:55.053 +22480,1796529,Gerome Bochmann,,,,"

Junior dev, mostly javascript and angular in particular. I have two years experience in R and a Master in Cognitive Sciene; {LaTeX} enthusiast, mindful meditationist and computer nerd of the third generation.

+",2013-10-08 14:43:02.427 +12152,967618,power,,,,,2012-11-07 07:36:08.563 +14965,2381666,Cagdas Ozgenc,,,https://www.linkedin.com/in/cagdas-ozgenc-b709929/,"

Investor, statistics hobbyist

+",2013-02-18 14:12:19.150 +22822,3455354,Jos,,,,,2013-10-18 09:14:44.847 +22807,3123733,ievgenii,,,,,2013-10-17 21:51:11.407 +22937,1962493,damienfrancois,Belgium,,http://www.damienfrancois.be,,2013-10-22 15:57:46.907 +22763,275509,mikepk,"Boston, MA",,http://mikepk.com,,2013-10-16 16:30:02.500 +23142,3508281,user32053,,,,,2013-10-29 17:36:43.217 +13051,94831,Gabra,,,,,2012-12-10 14:14:41.870 +24527,2761061,Francis Smart,"Maryland, USA",,http://www.EconometricsBySimulation.com,,2013-12-06 04:28:04.883 +23266,2779111,junkaholik,"Vancouver, BC, Canada",,https://spzfolio.wordpress.com/,"

Data scientist, data visualization enthusiasts, avid user of Python (and Jupyter). +I dream of doing more D3. Currently using data science to assess and improve science education.

+ +

Interests: ultimate frisbee, climbing mountains, martial arts, photography, and underwater sign language.

+",2013-11-01 22:31:56.647 +23173,2214814,Sulli,,,,,2013-10-30 14:05:10.450 +14360,2301670,mrz,"Oxford, United Kingdom",,,,2013-01-29 13:34:31.547 +23454,3527998,user2948524,,,,,2013-11-06 23:40:25.140 +22753,3352915,Paul,,,,,2013-10-16 12:41:11.517 +22479,3408572,user31230,,,,,2013-10-08 14:42:31.953 +22410,435249,bernie2436,,,,,2013-10-06 15:37:55.093 +22569,1445214,Varun Jain,,,,,2013-10-10 15:23:06.633 +3731,184039,Nathan VanHoudnos,"Pittsburgh, PA",,http://edustatistics.org/nathanvan/,"

I'm a PhD student in the joint Statistics and Public Policy program at Carnegie Mellon University.

+",2011-07-22 13:32:10.580 +23340,3534834,Fred,,,,,2013-11-04 17:03:13.133 +22711,3039268,CMP,Argentina,,,,2013-10-15 13:17:39.517 +20470,964416,Zhubarb,"London, United Kingdom",,,"

When you stare long into the thesys, the thesys stares back into you.

+",2013-08-02 12:27:30.800 +23116,2125820,George Fisher,"Boston, MA, United States",,http://www.georgefisher.com,"

George Fisher

+

Retired Morgan Stanley Managing Director with MIT Masters degree actively seeking career in cutting-edge technology. Perfect fit for a Finance Vertical: interface between a technology firm and Finance-Industry C-Suite executives & technologists.

+

Excellent programmer, data scientist; Kaggle Expert, GitHub, StackExchange and scikit-learn contributor; Stanford University training. Cloud: Amazon Web Services Architect and Linux certified. Successful large-group manager, collaborative individual contributor, extensive finance-industry experience. MIT Master’s degree.

+
• Kaggle Silver for stacked regression analysis of financial data
+• Kaggle Bronze for convolutional neural network / OpenCV analysis of aerial photographs  
+• Did all of the start-up and data-analysis work for Health-Care Twitter Analysis  which tracked progression of diseases via Twitter posts.
+• Numerous GitHub stars for MNIST algorithm analysis 
+
+• Programming
+    ◦ PHP, SQL, JavaScript
+
+• Data Science, Analysis
+    ◦ R, Python, Spark
+    ◦ Stanford University Data Mining and Applications 2016
+    ◦ Kaggle Expert
+
+• Linux (certified sysadmin), Docker, Git
+
+• Cloud: AWS (certified Architect)
+• Finance
+    ◦ MIT Master of Finance 2012
+    ◦ 30-year Wall Street Career
+        ▪ Real-Estate Finance 2007 - current
+        ▪ Investment Manager 2005 - 2011
+        ▪ Prudential CAO, Board Member 2002 - 2004
+        ▪ Fidelity CIO, EVP 1998 - 2002
+        ▪ Morgan Stanley Managing Director 1981 - 1998
+    ◦ BA in Economics 1976
+
+

For more details please see https://georgefisher.com/resume/resume.pdf

+",2013-10-29 00:13:57.977 +21398,3260554,Jan Modus,,,,,2013-09-04 09:26:47.983 +22885,3465421,Anton,,,,,2013-10-20 21:37:32.620 +3183,229913,David J. Harris,"Florida, United States",,http://davharris.github.io,,2011-06-03 04:30:18.257 +37033,4963444,greenall,,,,,2014-08-30 22:54:35.713 +22792,511732,Ufuk Can Bicici,,,,,2013-10-17 10:35:07.370 +23451,2980224,mlo,,,,,2013-11-06 22:14:45.610 +22928,2805215,frank,,,,,2013-10-22 08:19:37.080 +22681,55154,George Powell,,,,"

Software Engineer. Interested in web technologies, web security, web scale. Enjoys a challenge and learning new stuff.

+",2013-10-14 17:12:17.350 +22627,2993324,Michael,San Jose,,,,2013-10-12 02:02:09.157 +22583,3421213,geo,,,,,2013-10-10 20:48:01.520 +22826,3455544,bob hope,,,,,2013-10-18 09:50:21.567 +23312,3531019,ukituki,,,,,2013-11-03 19:37:15.580 +22889,3465843,user31733,,,,,2013-10-21 00:38:08.623 +7016,1274641,Al-Ahmadgaid Asaad,"Iligan City, Philippines",,http://www.alstatr.blogspot.com,"

Student

+",2012-02-24 06:48:05.733 +18403,2809410,Michael,,,,,2013-05-28 16:09:37.057 +13537,2198217,Davide,,,,"

M.S Student in Biostatistics @ University of Bologna

+",2013-01-02 16:04:05.927 +23416,1736244,Joe Li,,,,,2013-11-06 05:14:45.850 +21985,1811490,Michael,,,,,2013-09-23 21:38:39.343 +15723,2474148,David M W Powers,"Flinders University, South Australia",,http://david.wardpowers.info,"

I'm a professor/researcher in artificial intelligence and cognitive science with a special interest in language learning - computational cognitive psycholinguistics. That is I am particularly interested in building computationally and cognitively plausible models of how we learn and communicate about the world, and how to get computers/robots to learn and communicate about the world in a similar way, as well as how better to teach people languages and about language.

+ +

From the perspective of learning, my focus is on unsupervised learning - children learn to speak without a teacher, without being taught what nouns and verbs are. From the perspective of language, I like Tagmemics and Cognitive Linguistics - consistent with my focus on unsupervised learning, language and learning boil down to what we think is similar.

+ +

Formally this gives rise to similarity or distance measures used in learning, and we also need to account chance levels or prevalences. It also gives rise to metaphor, metonymy, grammar and related linguistic constructs, which define Cognitive Linguistics, as well as the basic ideas of Phonemic and Tagmemic analysis, Contrast in Analogous Environments and Complementary Distribution. It similarly explains the richness of polysemy and why there is really no such thing as synonymy, as when anything is different we look for other things to be different to learn from, using the part that is similar as context or paradigm, a substrate for the learning. We are continuously developing and modifying the range of meaning of language with our personal, social and cultural development, as we press old words or morphemes into new roles.

+ +

I also edit the new Springer journal, Computational Cognitive Science that aims to bridge the gap between Computational Intelligence and Cognitive Science.

+",2013-03-13 05:19:39.237 +11808,1718514,PascalVKooten,Netherlands,,http://www.linkedin.com/profile/view?id=190745232,"

Software enthusiast. Blockchain, Cognitive apps, machine learning & AI; to name a few.

+ +

Here's my LinkedIn and here is my GitHub.

+",2012-10-23 12:50:50.990 +22736,233335,davidjhp,,,,,2013-10-15 22:47:07.093 +21204,3230813,elsonidoq,"Buenos Aires, Argentina",,http://pablozivic.com.ar,"

I’m curious and enthusiastic.

+

I specialize in building computational models to explain data - data science - with solid knowledge in statistics, machine learning and programming. I’m always thirsty for new data sets to taste new state of the art techniques.

+

Academically I apply my knowledge of massive data analysis to music corpora - computer musicology. My thesis explores the problem of how to build computer models that compose music based on the music psychology theories of the last three decades. Right now, at my PhD I’m analyzing how the very notion of style necessarily impacts on certain statistics of the music belonging to it.

+

Industrially developed algorithms for online audience targeting and optimization of online campaigns, market reaserch on twitter, information extraction systems, content classification, keyword extraction, etc.

+

As a hobby I like to develop innovative interfaces for music creation, use the standard ones (guitars, keyboards, drums, etc.) and also juggling.

+

I like to be in creative spaces and do interdisciplinary work. But above all, complex problems that allows me to keep learning.

+",2013-08-28 15:56:16.150 +23355,2302841,Sean C. Rife,"Stow, OH",,http://www.seanrife.com,"

Ph.D. in Social Psychology. New Assistant Professor. My research deals with relationships and technology.

+",2013-11-04 23:18:22.737 +22915,3394778,girolamous,,,,,2013-10-21 19:42:15.257 +23483,3428827,Roberto,"Palermo, Italy",,,"

PhD in Statistics, my field of interest is mainly survey sampling and inference on finite populations. +I also enjoy programming in R.

+",2013-11-07 17:04:36.007 +23103,3501996,DonutPhil,,,,,2013-10-28 14:55:58.223 +16737,1132433,ramgorur,,,,"

Interested in all sorts of stuffs.

+",2013-04-11 02:30:54.253 +22881,1704481,Eric Chen,,,,,2013-10-20 19:28:45.027 +22359,3148491,visoft,,,,,2013-10-04 13:25:01.307 +8629,51051,andreas-h,,,,,2012-05-16 08:42:47.713 +22780,503639,dpollitt,Minneapolis,,http://www.properspective.com,"

Photographer, tech head, and extreme sport enthusiast.

+ +

I'm an amateur who likes to shoot everything from portraits to landscapes. My favorite venues are half way up a 14k mountain, Ontario in Fall, and the Mediterranean at sunset.

+ +

I love talking technology, new equipment, and Canon lenses.

+ +

+

+",2013-10-17 02:37:35.930 +23310,3530949,asharma,"Toronto, ON, Canada",,,,2013-11-03 19:14:05.977 +22785,1772269,Stephen_B,,,,,2013-10-17 08:33:16.957 +11013,1849501,Chris,,,,,2012-09-20 01:23:27.523 +221,102298,steffen,Frankfurt Area (Germany),,,"

Data Scientist

+",2010-07-22 05:56:36.677 +23449,3546638,adamToplz,,,,,2013-11-06 21:40:38.437 +22702,209030,Yaniv Aknin,"London, United Kingdom",,http://tech.blog.aknin.name,"

I like bits and humans. SRE at Google. Opinions are my own.

+",2013-10-15 05:42:22.333 +22554,3417872,andrea,,,,,2013-10-10 08:58:21.843 +19374,2234051,marcellt,,,,,2013-06-27 21:18:41.817 +4871,287086,d_a_c321,"New York, NY",,,,2011-10-16 23:19:28.467 +23299,96938,rold2007,"Sapporo, Hokkaido Prefecture, Japan",,,"

Freelance software developer. Working in image processing (computer vision). Doing custom machine learning as a hobby.

+",2013-11-03 08:02:04.637 +10986,132790,Josef,"Montreal, Quebec",,,"

statsmodels maintainer and developer, semi-retired from scipy.stats maintainance

+",2012-09-19 00:02:29.913 +23369,1920948,Pavel 'Strajk' Dolecek,"Brno, Czech Republic",,https://strajk.me/resume/,"

I work mostly with React & ecosystem, Vue, Node, TypeScript & Flow, GraphQL, and Cypress.
In the past, I had worked extensively with Angular.js, Meteor, Ruby on Rails, PHP, and various databases.

+

I have experience with full-stack development, leading teams, project management, and organizing workshops.

+

I'm passionate about organizing information, health & fitness, design, and education.

+

I live in Brno, Czechia and I like it here, but I don't mind relocating for awesome opportunities.

+

I enjoy running, cycling, lifting, photo & video, traveling, reading, playing video games (Doom & AoE are my favorites), and spending too much time on Reddit & HackerNews ¯\_(ツ)_/¯

+",2013-11-05 08:45:32.303 +594,131001,Glen_b,I'm right here,,,"

I was elected Moderator 31 March 2015 and stepped down early October 2019.

+

While I am now quite sure nothing will come of it (hence the change in avatar after several +years), I still support the unconditional reinstatement of Monica.
+https://rentry.co/44bxc (also see here)

+

Remember that all models are wrong; the practical question is how wrong do they have to be to not be useful 
--   George Box & Norman R. Draper, Empirical Model-Building and Response Surfaces

+Far better an approximate answer to the right question, which is often vague, than an exact answer to the wrong question, which can always be made precise.
  --   John Tukey, The future of data analysis. Ann. Math. Stat. 33 (1), (1962)

+It is no more appropriate to tailor a course to the desires of the student than it is to modify the driving test according to the wishes of learner drivers.
  --   Stephen Senn, On thinking and learning. RSS News
, December 2007

+

https://mobile.twitter.com/stephensenn/status/538017638111531009

+



+

+

+

+

Hard to find data queries: bountiful

+

incl Ansbody, excl tags title & Qbody - http://data.stackexchange.com/stats/query/669089

+

toask - av mean/median qu

+

-0.359364, 0.1085635

+",2010-08-07 08:40:07.287 +23156,3509713,Marquis Randell,,,,,2013-10-30 00:36:07.867 +22870,1477007,Diego Jimeno,,,https://github.com/diegojimeno,"

An analyst in process, statistics, machine learning and BigData and a bit of game theory :-)

+ +

Just started using GitHub https://github.com/diegojimeno

+",2013-10-20 10:21:46.343 +5479,82247,Dimitar Vouldjeff,"Sofia, Bulgaria",,http://vouldjeff.freewildart.com,,2011-11-24 21:16:43.767 +23471,3549374,007,,,,,2013-11-07 11:35:59.940 +22585,3421416,Jeff Shane,,,,,2013-10-10 21:34:01.167 +22834,3065897,mauna,,,,,2013-10-18 13:33:29.413 +23492,3551870,bison2178,,,,,2013-11-07 20:46:18.193 +23329,19930,Ankur Sethi,"Bangalore, Karnataka, India",,http://ankursethi.in,"

Freelance JavaScript engineer from Bangalore.

+ +

Email: contact@ankursethi.in

+ +

Website: https://ankursethi.in

+ +

Twitter: https://twitter.com/ankurs3thi

+",2013-11-04 08:29:52.920 +22673,879549,Heather Turner,United Kingdom,,http://www.heatherturner.net,"

I am a full-time freelance consultant providing support in statistics and R programming. I am also an Associate Fellow of the Statistics Department at the University of Warwick.

+ +

I spend most of my time working in R, but also use markup languages lots quite a bit.

+",2013-10-14 10:30:29.340 +23522,3555173,Sofea,,,,,2013-11-08 13:47:01.210 +23485,1617686,asdf,,,,,2013-11-07 17:57:25.767 +21884,2831124,Kian,"London, United Kingdom",,,"

Interested in stats, computing and quant finance

+",2013-09-19 21:52:07.043 +23133,3417716,r_alb,,,,,2013-10-29 14:14:57.847 +23044,1489032,Lamia,,,,"

Research Specialist

+",2013-10-25 16:57:06.070 +22599,1179283,Jeff Iacono,San Francisco,,,"

Product @trycaviar / @square. pretendgineer. data analyst. vermonster.

+",2013-10-11 05:52:34.533 +6813,1245417,user9171,,,,,2012-02-13 14:24:15.577 +22917,3148547,Dustin Stevens,,,,,2013-10-21 22:54:47.960 +23457,3547250,charles,,,,,2013-11-07 01:05:08.400 +23214,3518887,cmiller01,,,,,2013-10-31 13:52:08.423 +568,57318,Etienne Racine,"Montreal, QC, Canada",,,,2010-08-05 17:28:05.270 +13526,1831307,Stefano Lombardi,,,,,2013-01-02 10:38:44.693 +371,45700,Richie Cotton,"Union City, NJ, USA",,,"

I'm a Data Evangelist at DataCamp. I wrote a Learning R and Testing R Code.

+

My github and Bitbucket repos, LinkedIn and twitter accounts.

+",2010-07-27 15:21:03.487 +23317,1131390,Kapoios,Greece,,,"

!

+",2013-11-03 22:17:28.047 +22717,1170427,khikho,,,,,2013-10-15 15:53:38.830 +23531,1986060,sonicboom,"Oslo, Norway",,,"

Computer Science PhD graduate student in UiO, Oslo with a strong interest in math.

+",2013-11-08 19:08:05.300 +23200,3517073,user32123,,,,,2013-10-31 06:02:07.147 +23296,3528248,Karnage2015,,,,,2013-11-02 21:21:48.783 +20622,3149735,Taal,,,,,2013-08-08 06:54:26.143 +15806,2495808,Jimj,,,,,2013-03-15 02:06:10.923 +22644,2372872,Roberto Ferrer,,,,,2013-10-13 02:05:50.643 +23262,3524678,Adrien.M,,,,,2013-11-01 19:11:44.703 +23527,917727,Ken Russell,,,,,2013-11-08 16:22:34.470 +5637,1084076,vinux,India,,http://about.me/vinuct,"

Statistician,

+",2011-12-04 16:11:53.740 +22820,1534566,FraNut,,,http://www.irea.cnr.it/index.php?option=com_comprofiler&task=userprofile&user=144&Itemid=100,,2013-10-18 08:42:55.297 +21362,3212276,zhifff,,,,,2013-09-03 13:46:27.047 +23347,1346433,Reed Sandberg,,,http://reed-sandberg.blogspot.com/,"

Machine whisperer

+",2013-11-04 20:57:47.220 +11490,1920491,Matteo Fasiolo,"Bristol, United Kingdom",,,"

Postdoc in Statistics at the University of Bristol. Previously working on statistical inference for dynamical processes. Lately I am working on generalized additive models.

+",2012-10-09 21:22:33.060 +23334,41225,lol,Aruba,,http://lol.com,what can i say; lol!,2013-11-04 12:19:20.483 +10409,7049886,Bradford,,,,,2012-08-21 14:43:10.063 +23071,2464957,Alexey Krikunov,,,,,2013-10-27 11:17:25.160 +18514,2842712,gregory_britten,"Woods Hole, MA, USA",,http://pemlab.whoi.edu,"

I am a PI at the Woods Hole Oceanographic Institution.

+",2013-05-31 21:20:04.397 +21476,3271296,Giancarlo,,,,,2013-09-06 13:51:40.493 +23430,3380629,satyarth sharma,,,,,2013-11-06 13:18:11.033 +23045,3491239,Kevin,,,,,2013-10-25 16:59:53.883 +18841,2115671,C.Colden,,,,,2013-06-11 16:11:21.680 +5911,973400,shn,,,,,2011-12-20 17:47:35.700 +23128,976979,daemonk,,,,,2013-10-29 10:56:01.507 +16144,1722110,Vincent,"32° 20'N, 64° 45'W",,,"

I am a unix fanatic who uses OSX and Ubuntu at home and at the office.

+",2013-03-24 01:04:33.750 +211,93886,Tal Galili,Israel,,http://www.r-statistics.com,"Statistics, blogging, and the hope for a happy long life.",2010-07-21 07:53:50.990 +22786,3449771,Bella Fadida,,,,,2013-10-17 08:46:33.937 +22903,429289,Skippy le Grand Gourou,"Clermont-Ferrand, France",,https://lachrysomele.fr,"

The world is full of ignorant people, smug people and arrogant people. SE is no exception — why would it be ?

+ +

Like art ? Check La Chrysomèle, Art Gallery.

+",2013-10-21 12:58:09.987 +20604,3023629,phg,,,,,2013-08-07 14:56:49.377 +22705,1421348,Learnerbeaver,Mars,,,"

Python explorer for text mining, modelling etc..

+",2013-10-15 07:43:36.180 +449,125690,John,"Halifax, Canada",,,"

I teach honours statistics in a psychology department primarily using R and simulation.

+",2010-07-29 14:29:12.917 +21168,1557825,NG_21,"Bangalore, India",,http://www.bosch.com/worldsite_startpage/en/default.aspx,"

A statistician by qualification, Machine Learning practitioner, aspiring Data Scientist

+",2013-08-27 18:30:54.943 +13370,2172823,Jessica,,,,,2012-12-23 00:21:39.787 +23507,3553094,user32491,,,,,2013-11-08 03:28:34.330 +12787,2099187,generic_user,,,,,2012-11-30 19:02:58.453 +18848,2885929,GK89,,,,"

Data Scientist working in the oil industry

+",2013-06-11 18:55:25.513 +23112,3503839,Pradyumna,,,,,2013-10-28 22:02:30.007 +20130,535336,Anton Tarasenko,,,https://antontarasenko.com,,2013-07-22 05:46:44.290 diff --git a/examples/csv_examples/votes.csv b/examples/csv_examples/votes.csv new file mode 100644 index 00000000..4d72f4cc --- /dev/null +++ b/examples/csv_examples/votes.csv @@ -0,0 +1,2001 @@ +id,user_id,post_id,vote_type_id,creation_date +177,,143,2,2010-07-19 +246,,28,2,2010-07-19 +362,,28,2,2010-07-19 +138,,28,2,2010-07-19 +258,,28,2,2010-07-19 +202,,28,2,2010-07-19 +284,,28,2,2010-07-19 +254,,28,2,2010-07-19 +397,,143,3,2010-07-20 +566,,143,2,2010-07-20 +673,,143,2,2010-07-20 +1103,,143,2,2010-07-21 +852,,414,2,2010-07-21 +857,,414,2,2010-07-21 +868,,414,2,2010-07-21 +871,,414,2,2010-07-21 +1057,,28,2,2010-07-21 +1055,,414,2,2010-07-21 +886,,414,2,2010-07-21 +899,,414,2,2010-07-21 +988,,356,2,2010-07-21 +1087,,356,2,2010-07-21 +1054,,414,3,2010-07-21 +892,,28,2,2010-07-21 +1234,,414,2,2010-07-22 +1191,,414,2,2010-07-22 +1113,,28,2,2010-07-22 +1131,,356,2,2010-07-22 +1127,,28,2,2010-07-22 +1126,,412,2,2010-07-22 +1431,,543,2,2010-07-23 +1428,,543,2,2010-07-23 +1324,,414,3,2010-07-23 +1426,,541,2,2010-07-23 +1310,,541,2,2010-07-23 +1425,,414,2,2010-07-23 +1362,,356,2,2010-07-23 +1408,,414,2,2010-07-23 +1337,,414,2,2010-07-23 +1356,,412,2,2010-07-23 +1530,,412,2,2010-07-24 +1538,,541,2,2010-07-24 +1527,,541,2,2010-07-24 +1526,,543,2,2010-07-24 +1498,,543,2,2010-07-24 +1534,,543,2,2010-07-24 +1580,,541,2,2010-07-25 +1572,,28,2,2010-07-25 +1581,,543,2,2010-07-25 +1611,,28,2,2010-07-25 +1547,,28,2,2010-07-25 +1699,,28,2,2010-07-26 +1809,,414,2,2010-07-26 +1779,,412,2,2010-07-26 +2041,,414,2,2010-07-27 +2406,,543,2,2010-07-28 +2549,,356,2,2010-07-29 +2611,,412,2,2010-07-29 +2689,,412,2,2010-07-30 +2638,,356,2,2010-07-30 +2918,,143,2,2010-08-02 +2906,,28,2,2010-08-02 +2905,,543,2,2010-08-02 +3079,,143,2,2010-08-03 +3288,,541,2,2010-08-05 +3402,,1248,2,2010-08-06 +3486,,1248,2,2010-08-06 +3467,,1248,2,2010-08-06 +3441,,28,2,2010-08-06 +4943,,28,2,2010-08-19 +4971,,1760,2,2010-08-19 +4923,,1760,2,2010-08-19 +5013,,1760,2,2010-08-19 +4976,,1760,2,2010-08-19 +5188,,1760,2,2010-08-21 +5816,,543,2,2010-08-27 +5778,,541,2,2010-08-27 +5971,,414,2,2010-08-30 +5944,,414,2,2010-08-30 +6061,,414,2,2010-08-31 +6099,,414,2,2010-08-31 +6368,,2156,2,2010-09-02 +6342,,2156,2,2010-09-02 +6536,,2169,2,2010-09-03 +6916,,414,2,2010-09-07 +7206,,541,2,2010-09-09 +7205,,543,2,2010-09-09 +7327,,412,2,2010-09-10 +7933,,2509,2,2010-09-15 +7927,,2509,2,2010-09-15 +7926,,2509,2,2010-09-15 +7928,,2509,2,2010-09-15 +7854,,2509,2,2010-09-15 +7851,,2509,2,2010-09-15 +8015,,2509,2,2010-09-16 +8050,,2509,2,2010-09-16 +7981,,2509,2,2010-09-16 +7982,,2509,2,2010-09-16 +8040,,2509,2,2010-09-16 +7975,,2509,2,2010-09-16 +8042,,2509,2,2010-09-16 +8019,,2509,2,2010-09-16 +8047,,2509,2,2010-09-16 +8049,,2509,2,2010-09-16 +8067,,2509,2,2010-09-16 +8264,,2509,2,2010-09-17 +8565,,2509,2,2010-09-20 +8935,,1787,2,2010-09-23 +8966,,1760,2,2010-09-23 +8962,,1787,2,2010-09-23 +8889,,1760,2,2010-09-23 +8957,,1760,2,2010-09-23 +8951,,1787,2,2010-09-23 +9038,,1787,2,2010-09-24 +9057,,1787,2,2010-09-24 +9058,,1760,2,2010-09-24 +9188,,414,2,2010-09-25 +9290,,1760,2,2010-09-26 +9549,,2169,2,2010-09-29 +9668,,2509,2,2010-09-30 +10028,,414,2,2010-10-02 +10268,,2156,2,2010-10-04 +10251,,2156,2,2010-10-04 +10238,,2169,2,2010-10-04 +10210,,28,2,2010-10-04 +10187,,28,2,2010-10-04 +10158,,2169,2,2010-10-04 +10411,,2156,2,2010-10-06 +10692,,143,2,2010-10-08 +10682,,356,2,2010-10-08 
+10741,,3188,2,2010-10-09 +10845,,3188,2,2010-10-10 +10834,,412,2,2010-10-10 +11177,,3188,2,2010-10-12 +11455,,541,2,2010-10-13 +11456,,543,2,2010-10-13 +11340,,541,2,2010-10-13 +11426,,543,2,2010-10-13 +11424,,541,2,2010-10-13 +11338,,414,2,2010-10-13 +11683,,541,2,2010-10-14 +11526,,1248,2,2010-10-14 +11626,,1248,2,2010-10-14 +11941,,3188,2,2010-10-17 +12202,,414,2,2010-10-19 +13006,,2509,2,2010-10-22 +13128,,3646,2,2010-10-24 +13127,,3646,2,2010-10-24 +13159,,3649,2,2010-10-24 +13187,,3649,2,2010-10-24 +13354,,3649,2,2010-10-25 +13284,,3646,2,2010-10-25 +13355,,3646,2,2010-10-25 +13774,,28,2,2010-10-28 +13909,,543,2,2010-10-29 +13908,,541,2,2010-10-29 +13969,,3649,2,2010-10-30 +14040,,2509,2,2010-10-30 +14065,,541,2,2010-10-30 +14143,,3649,1,2010-10-31 +14418,,3649,2,2010-11-02 +14417,,3646,2,2010-11-02 +14611,,414,2,2010-11-04 +14850,,414,2,2010-11-05 +15272,,3188,2,2010-11-08 +15271,,143,2,2010-11-08 +15576,,28,2,2010-11-10 +16249,,4187,2,2010-11-15 +16250,,4187,2,2010-11-15 +16247,,4187,2,2010-11-15 +16243,,4187,2,2010-11-15 +16383,,4187,2,2010-11-16 +16426,,4187,2,2010-11-16 +16343,,4187,2,2010-11-16 +16391,,4187,2,2010-11-16 +16397,,4187,2,2010-11-16 +16325,,4187,2,2010-11-16 +16273,,4187,2,2010-11-16 +16351,,4187,2,2010-11-16 +16381,,4187,2,2010-11-16 +16476,,4187,2,2010-11-17 +16483,,4187,2,2010-11-17 +16513,,4187,2,2010-11-17 +16575,,4187,2,2010-11-18 +16699,,4187,2,2010-11-19 +16852,,28,2,2010-11-19 +17097,,2509,2,2010-11-23 +17509,,4187,2,2010-11-26 +17589,,414,2,2010-11-27 +17872,,4187,2,2010-11-30 +18015,,543,2,2010-12-01 +18014,,541,2,2010-12-01 +18108,,4187,2,2010-12-02 +18416,,2509,2,2010-12-04 +18309,,4705,2,2010-12-04 +18327,,4705,2,2010-12-04 +18402,,4714,2,2010-12-04 +18311,,4705,2,2010-12-04 +18379,,4714,2,2010-12-04 +18405,,4714,2,2010-12-04 +18334,,4705,2,2010-12-04 +18424,,4705,2,2010-12-04 +18273,,4705,2,2010-12-04 +18378,,4714,2,2010-12-04 +18313,,4714,2,2010-12-04 +18434,,4705,2,2010-12-05 +18472,,4714,2,2010-12-05 +18498,,4714,2,2010-12-06 +18509,,4714,2,2010-12-06 +18581,,4714,2,2010-12-06 +18519,,4714,2,2010-12-06 +18662,,4714,2,2010-12-07 +18666,,4714,2,2010-12-07 +19487,,4714,2,2010-12-13 +19492,,4714,2,2010-12-13 +19398,,4714,2,2010-12-13 +19417,,4705,2,2010-12-13 +19615,,4714,2,2010-12-14 +19631,,5015,2,2010-12-14 +19702,,5020,2,2010-12-14 +19679,,5020,2,2010-12-14 +19704,,5020,2,2010-12-14 +19743,,5020,2,2010-12-14 +19703,,5015,2,2010-12-14 +19732,,5020,2,2010-12-14 +19560,,5015,2,2010-12-14 +19569,,5015,2,2010-12-14 +19603,,5020,2,2010-12-14 +19612,,5020,2,2010-12-14 +19549,,5020,2,2010-12-14 +19666,,5020,1,2010-12-14 +19632,,5020,2,2010-12-14 +19730,,5015,2,2010-12-14 +19566,,5020,2,2010-12-14 +19902,,4705,2,2010-12-15 +19922,,4187,2,2010-12-16 +20547,,3649,2,2010-12-22 +20582,,1760,2,2010-12-23 +20639,,4714,1,2010-12-24 +20660,,1760,2,2010-12-25 +20825,,414,2,2010-12-28 +21127,,2509,2,2011-01-02 +21120,,4187,2,2011-01-02 +21768,,2509,2,2011-01-06 +21950,,4187,2,2011-01-09 +22148,,1760,2,2011-01-11 +22422,,5015,2,2011-01-13 +22642,,2509,2,2011-01-16 +22774,,2509,2,2011-01-17 +22759,,4187,2,2011-01-17 +22755,,2509,2,2011-01-17 +22727,,5020,2,2011-01-17 +22719,,5015,2,2011-01-17 +23961,,5015,2,2011-01-27 +24899,,4187,2,2011-02-04 +25097,,4187,2,2011-02-06 +25076,,541,2,2011-02-06 +25267,,4187,2,2011-02-07 +25527,,412,2,2011-02-09 +25583,,1787,2,2011-02-10 +26157,,4187,2,2011-02-14 +26305,,414,2,2011-02-15 +26558,,1787,1,2011-02-16 +26514,,414,2,2011-02-16 +27070,,6788,2,2011-02-20 +27054,,6788,2,2011-02-20 +27557,,2509,2,2011-02-24 
+27634,,414,2,2011-02-24 +27764,,2509,2,2011-02-25 +27968,,412,2,2011-02-27 +29095,,4714,2,2011-03-07 +29092,,4705,2,2011-03-07 +29812,,541,2,2011-03-10 +29858,,28,2,2011-03-10 +29865,,414,2,2011-03-10 +29862,,4187,2,2011-03-10 +30423,,543,2,2011-03-14 +30768,,2509,2,2011-03-16 +30765,,28,2,2011-03-16 +31820,,7965,2,2011-03-24 +31979,,7965,2,2011-03-25 +32228,,414,2,2011-03-28 +32582,,28,2,2011-03-30 +32850,,412,2,2011-04-01 +32963,,4714,2,2011-04-01 +33537,,414,2,2011-04-06 +33643,,414,2,2011-04-06 +33963,,8529,2,2011-04-08 +33961,,4187,2,2011-04-08 +33867,,4705,2,2011-04-08 +33982,,8529,2,2011-04-08 +33970,,8529,2,2011-04-08 +33965,,8529,2,2011-04-08 +34051,,8529,2,2011-04-09 +34061,,8529,2,2011-04-09 +34012,,8529,2,2011-04-09 +34083,,8529,2,2011-04-09 +34160,,8529,2,2011-04-10 +34122,,8529,2,2011-04-10 +34322,,8529,2,2011-04-11 +34651,,8681,2,2011-04-14 +34647,,8699,2,2011-04-14 +34690,,8681,2,2011-04-14 +34725,,8699,2,2011-04-14 +34686,,8681,2,2011-04-14 +34998,,8699,1,2011-04-17 +35081,,28,2,2011-04-18 +35257,,143,2,2011-04-19 +36875,,412,2,2011-05-02 +37112,,412,2,2011-05-04 +37293,,414,2,2011-05-05 +37398,,28,2,2011-05-05 +37397,,4187,2,2011-05-05 +37396,,2509,2,2011-05-05 +37395,,414,2,2011-05-05 +37389,,412,2,2011-05-05 +37298,,414,2,2011-05-05 +37295,,414,2,2011-05-05 +37537,,414,2,2011-05-06 +37482,,414,2,2011-05-06 +37459,,414,2,2011-05-06 +37714,,9524,2,2011-05-07 +37710,,9524,2,2011-05-07 +37688,,9524,2,2011-05-07 +37762,,9524,2,2011-05-07 +37814,,9524,2,2011-05-07 +37813,,9524,2,2011-05-07 +37812,,9529,2,2011-05-07 +37810,,9524,2,2011-05-07 +37805,,9529,2,2011-05-07 +37809,,9529,2,2011-05-07 +37802,,9529,2,2011-05-07 +37801,,9524,2,2011-05-07 +37769,,9529,2,2011-05-07 +37775,,9529,2,2011-05-07 +37763,,9524,2,2011-05-07 +37767,,9524,2,2011-05-07 +37883,,1760,2,2011-05-08 +37987,,9524,2,2011-05-08 +38006,,9524,2,2011-05-08 +38521,,28,2,2011-05-11 +38926,,9524,15,2011-05-13 +38933,,9529,2,2011-05-13 +39169,,412,2,2011-05-15 +39168,,1760,2,2011-05-15 +39382,,9524,2,2011-05-17 +39665,,2509,2,2011-05-18 +39949,,10008,2,2011-05-20 +39941,,10008,2,2011-05-20 +39919,,10008,2,2011-05-20 +40016,,10008,2,2011-05-20 +40027,,10008,2,2011-05-20 +39979,,10008,2,2011-05-20 +40071,,10069,2,2011-05-21 +40122,,10069,2,2011-05-21 +40064,,10008,2,2011-05-21 +40134,,10069,2,2011-05-21 +40333,,10008,2,2011-05-22 +40351,,4705,2,2011-05-23 +40349,,4714,2,2011-05-23 +40922,,28,2,2011-05-26 +41089,,4187,2,2011-05-27 +41419,,10008,2,2011-05-31 +41704,,412,2,2011-06-01 +43108,,4187,2,2011-06-12 +43144,,2509,2,2011-06-12 +43557,,10911,2,2011-06-16 +43750,,10911,2,2011-06-17 +43756,,10069,1,2011-06-18 +43915,,412,2,2011-06-19 +44608,,1248,2,2011-06-23 +44620,,1248,2,2011-06-23 +44698,,1248,2,2011-06-24 +44886,,2509,2,2011-06-27 +44967,,1248,2,2011-06-27 +45667,,28,2,2011-07-05 +45738,,28,2,2011-07-05 +45657,,28,2,2011-07-05 +45821,,1248,2,2011-07-06 +46037,,1248,2,2011-07-07 +46162,,1248,2,2011-07-08 +46158,,1248,2,2011-07-08 +46893,,412,2,2011-07-14 +47779,,2509,2,2011-07-20 +48489,,2509,2,2011-07-26 +48717,,2509,2,2011-07-27 +48801,,2509,2,2011-07-28 +48985,,2509,2,2011-07-29 +49243,,4187,2,2011-08-01 +49185,,4187,2,2011-08-01 +49500,,4187,2,2011-08-03 +49793,,4705,2,2011-08-05 +49870,,414,2,2011-08-05 +49879,,414,2,2011-08-05 +49750,,414,2,2011-08-05 +49791,,4714,2,2011-08-05 +49936,,414,2,2011-08-06 +50279,,4714,2,2011-08-09 +50418,,4714,2,2011-08-09 +50222,,4714,2,2011-08-09 +50427,,4705,2,2011-08-09 +50204,,4714,2,2011-08-09 +50452,,4187,2,2011-08-10 +50760,,2509,2,2011-08-11 
+50723,,9524,2,2011-08-11 +51049,,4187,2,2011-08-14 +51360,,414,2,2011-08-16 +51911,,13060,2,2011-08-18 +51938,,13058,2,2011-08-18 +51745,,13058,2,2011-08-18 +51979,,13060,2,2011-08-18 +51942,,13060,2,2011-08-18 +51748,,13060,2,2011-08-18 +51880,,13058,2,2011-08-18 +51849,,13060,2,2011-08-18 +51986,,13060,2,2011-08-18 +51946,,13058,2,2011-08-18 +51913,,13058,2,2011-08-18 +51884,,13060,2,2011-08-18 +51881,,13060,2,2011-08-18 +51741,,13058,2,2011-08-18 +51842,,13060,1,2011-08-18 +51837,,13060,2,2011-08-18 +51821,,13060,2,2011-08-18 +51829,,13058,2,2011-08-18 +51883,,13058,2,2011-08-18 +51768,,13060,2,2011-08-18 +52054,,13058,2,2011-08-19 +52119,,13058,2,2011-08-19 +52136,,28,2,2011-08-19 +52109,,13058,2,2011-08-19 +52142,,414,2,2011-08-20 +52283,,414,2,2011-08-21 +52957,,28,2,2011-08-25 +53076,,10911,2,2011-08-26 +53018,,13058,2,2011-08-26 +52987,,414,2,2011-08-26 +53420,,2169,1,2011-08-30 +53419,,3188,1,2011-08-30 +53469,,356,2,2011-08-30 +53826,,13631,2,2011-09-01 +53972,,13631,2,2011-09-02 +54022,,13631,2,2011-09-02 +53945,,13631,2,2011-09-02 +53919,,13631,2,2011-09-02 +54219,,2509,2,2011-09-05 +54657,,2509,2,2011-09-09 +55397,,414,2,2011-09-15 +56081,,1248,2,2011-09-21 +56223,,4714,2,2011-09-22 +56261,,414,2,2011-09-22 +56351,,414,2,2011-09-22 +56538,,4187,2,2011-09-23 +56468,,4187,2,2011-09-23 +56520,,4714,2,2011-09-23 +56532,,4714,2,2011-09-23 +56660,,1248,2,2011-09-24 +56975,,4187,2,2011-09-27 +57078,,10008,2,2011-09-28 +57642,,28,2,2011-10-02 +57748,,14729,2,2011-10-03 +57802,,14729,2,2011-10-03 +57823,,14790,2,2011-10-03 +58340,,28,2,2011-10-06 +58369,,28,2,2011-10-06 +58441,,28,2,2011-10-06 +58920,,28,2,2011-10-11 +59052,,14790,1,2011-10-11 +59078,,14790,2,2011-10-11 +58926,,4714,2,2011-10-11 +59169,,13058,2,2011-10-12 +59206,,13060,2,2011-10-12 +59418,,13060,2,2011-10-13 +59470,,414,3,2011-10-13 +59445,,15281,2,2011-10-13 +59927,,412,2,2011-10-17 +60208,,15542,2,2011-10-18 +60161,,15542,3,2011-10-18 +60575,,4187,2,2011-10-19 +60578,,414,2,2011-10-19 +60553,,15542,2,2011-10-19 +60411,,2509,2,2011-10-19 +60626,,15542,2,2011-10-19 +60579,,28,2,2011-10-19 +60435,,9529,2,2011-10-19 +60576,,2509,2,2011-10-19 +60755,,414,2,2011-10-20 +60784,,10911,2,2011-10-20 +61296,,356,2,2011-10-23 +61581,,13060,2,2011-10-25 +61802,,1248,2,2011-10-26 +62064,,28,2,2011-10-28 +62228,,541,2,2011-10-29 +62229,,543,2,2011-10-29 +62269,,412,2,2011-10-29 +62528,,9529,2,2011-11-01 +62527,,9524,2,2011-11-01 +62755,,1248,2,2011-11-02 +62945,,412,2,2011-11-03 +62901,,1248,2,2011-11-03 +62978,,4705,2,2011-11-03 +63230,,4705,2,2011-11-05 +63282,,16212,1,2011-11-06 +63283,,16212,2,2011-11-06 +63626,,4705,2,2011-11-08 +63573,,16212,2,2011-11-08 +63727,,16313,2,2011-11-09 +63674,,16337,2,2011-11-09 +63665,,16313,2,2011-11-09 +63657,,16313,2,2011-11-09 +63728,,16337,2,2011-11-09 +63656,,16337,2,2011-11-09 +63912,,16366,2,2011-11-10 +64350,,5015,2,2011-11-13 +64401,,16537,2,2011-11-14 +64400,,15542,2,2011-11-14 +64910,,28,2,2011-11-16 +64914,,15542,2,2011-11-16 +65278,,28,2,2011-11-18 +65187,,2509,2,2011-11-18 +65402,,16537,1,2011-11-19 +65380,,16537,2,2011-11-19 +65617,,2509,2,2011-11-21 +65603,,28,2,2011-11-21 +65820,,1248,2,2011-11-22 +65890,,414,2,2011-11-22 +66013,,16537,2,2011-11-23 +66095,,17000,2,2011-11-24 +66088,,16998,2,2011-11-24 +66218,,414,2,2011-11-24 +66190,,4187,2,2011-11-24 +66362,,414,2,2011-11-25 +66387,,16998,2,2011-11-25 +67271,,4714,2,2011-12-01 +67270,,4705,2,2011-12-01 +68106,,28,2,2011-12-07 +68870,,4705,2,2011-12-13 +68917,,4714,2,2011-12-13 +68967,,4705,2,2011-12-13 
+69022,,13058,2,2011-12-13 +69021,,13060,2,2011-12-13 +68976,,4714,2,2011-12-13 +69138,,2509,2,2011-12-14 +69249,,28,2,2011-12-15 +69405,,10069,2,2011-12-16 +69392,,10008,2,2011-12-16 +69409,,10069,2,2011-12-16 +69484,,10069,2,2011-12-16 +69472,,10069,2,2011-12-16 +69419,,10069,2,2011-12-16 +69502,,10008,2,2011-12-16 +70305,,3646,2,2011-12-22 +70317,,16313,2,2011-12-22 +70306,,3649,2,2011-12-22 +70346,,16313,2,2011-12-22 +70345,,16337,2,2011-12-22 +70332,,16337,2,2011-12-22 +70484,,28,2,2011-12-25 +70635,,2509,2,2011-12-26 +70668,,1248,2,2011-12-27 +70758,,16998,2,2011-12-28 +71106,,18345,2,2012-01-01 +71107,,18335,2,2012-01-01 +71118,,18345,2,2012-01-01 +71195,,18345,2,2012-01-02 +71173,,18345,2,2012-01-02 +71200,,18345,2,2012-01-02 +71196,,18335,2,2012-01-02 +71489,,18345,1,2012-01-03 +71434,,541,2,2012-01-03 +71421,,1760,2,2012-01-03 +72042,,4714,3,2012-01-07 +72300,,10008,2,2012-01-09 +72757,,28,2,2012-01-12 +72869,,28,2,2012-01-13 +72999,,16537,2,2012-01-14 +73105,,1248,2,2012-01-15 +73818,,543,2,2012-01-19 +74204,,4187,2,2012-01-21 +74310,,1248,3,2012-01-22 +74311,,414,3,2012-01-22 +74334,,2509,2,2012-01-22 +74719,,414,2,2012-01-24 +74962,,2509,2,2012-01-26 +74933,,1248,2,2012-01-26 +75253,,1248,2,2012-01-28 +75572,,2509,2,2012-01-30 +75841,,2509,2,2012-01-31 +76344,,28,2,2012-02-02 +77041,,2509,2,2012-02-07 +77038,,28,2,2012-02-07 +77771,,20240,2,2012-02-10 +77769,,20234,2,2012-02-10 +77777,,20234,2,2012-02-10 +77772,,20240,1,2012-02-10 +78156,,20240,2,2012-02-11 +78173,,20240,2,2012-02-11 +78153,,20234,2,2012-02-11 +78152,,20234,2,2012-02-11 +78174,,20240,2,2012-02-11 +78137,,20234,2,2012-02-11 +78024,,20240,2,2012-02-11 +78165,,20234,2,2012-02-11 +78265,,20240,2,2012-02-12 +78403,,20240,2,2012-02-13 +78561,,20234,2,2012-02-13 +78822,,1248,2,2012-02-14 +78674,,20240,2,2012-02-14 +78929,,412,2,2012-02-14 +78900,,412,2,2012-02-14 +78852,,2509,2,2012-02-14 +79530,,20234,2,2012-02-16 +79834,,28,2,2012-02-17 +80054,,20240,2,2012-02-19 +80302,,28,2,2012-02-20 +80236,,20561,2,2012-02-20 +80627,,20667,2,2012-02-21 +80816,,20667,2,2012-02-22 +81096,,20240,2,2012-02-23 +81104,,20240,2,2012-02-23 +81094,,20234,2,2012-02-23 +81047,,20234,2,2012-02-23 +81097,,20234,2,2012-02-23 +81046,,20240,2,2012-02-23 +81340,,20234,2,2012-02-24 +81373,,28,2,2012-02-24 +81434,,4187,2,2012-02-25 +81905,,2509,2,2012-02-28 +82214,,16337,2,2012-02-29 +82215,,16337,1,2012-02-29 +82640,,28,2,2012-03-02 +83264,,28,2,2012-03-06 +83240,,412,2,2012-03-06 +83325,,28,2,2012-03-07 +83290,,1248,2,2012-03-07 +83652,,4187,2,2012-03-09 +83674,,4187,2,2012-03-09 +83800,,414,2,2012-03-09 +83713,,4187,2,2012-03-09 +83879,,1248,3,2012-03-10 +83878,,412,2,2012-03-10 +84016,,9524,2,2012-03-11 +84241,,13058,2,2012-03-12 +84475,,10008,2,2012-03-13 +84557,,13058,2,2012-03-13 +85131,,414,2,2012-03-16 +85260,,28,2,2012-03-17 +85701,,28,2,2012-03-20 +85711,,1248,2,2012-03-20 +85719,,15542,2,2012-03-20 +85718,,20240,2,2012-03-20 +85767,,541,2,2012-03-20 +85766,,543,2,2012-03-20 +85720,,16537,2,2012-03-20 +86031,,20561,2,2012-03-21 +86224,,4714,2,2012-03-21 +86223,,4705,2,2012-03-21 +86174,,20561,2,2012-03-21 +86621,,28,2,2012-03-23 +87341,,414,2,2012-03-28 +88055,,4187,2,2012-04-01 +88566,,28,2,2012-04-04 +88860,,2509,2,2012-04-05 +88858,,4187,2,2012-04-05 +89110,,23019,2,2012-04-06 +89107,,23019,2,2012-04-06 +89047,,23019,2,2012-04-06 +89206,,23087,2,2012-04-07 +89288,,23087,3,2012-04-07 +89394,,4187,2,2012-04-08 +89336,,414,2,2012-04-08 +89747,,4187,2,2012-04-10 +89701,,22797,2,2012-04-10 +90510,,4187,2,2012-04-13 
+90805,,414,2,2012-04-15 +91397,,4187,2,2012-04-18 +91548,,17000,2,2012-04-19 +91667,,9524,2,2012-04-20 +91955,,4187,2,2012-04-22 +92745,,4187,2,2012-04-26 +92892,,28,2,2012-04-27 +94046,,2509,2,2012-05-02 +94424,,2509,2,2012-05-03 +94556,,4187,2,2012-05-04 +94662,,4187,2,2012-05-04 +94603,,414,2,2012-05-04 +94669,,28,2,2012-05-04 +94592,,10008,2,2012-05-04 +94683,,2509,2,2012-05-04 +94668,,4187,2,2012-05-04 +95013,,10069,2,2012-05-05 +94929,,28,2,2012-05-05 +95132,,24602,2,2012-05-06 +95355,,24602,2,2012-05-07 +95314,,414,2,2012-05-07 +95449,,414,2,2012-05-08 +95444,,24602,1,2012-05-08 +95859,,2509,2,2012-05-09 +95817,,414,2,2012-05-09 +96027,,414,2,2012-05-10 +96365,,414,2,2012-05-11 +96535,,414,15,2012-05-12 +96437,,4714,2,2012-05-12 +96601,,4187,2,2012-05-13 +96677,,25072,2,2012-05-14 +96676,,25087,2,2012-05-14 +96682,,25087,2,2012-05-14 +97574,,10008,2,2012-05-18 +97538,,10069,2,2012-05-18 +97604,,10008,2,2012-05-18 +97613,,10069,2,2012-05-18 +97577,,10069,2,2012-05-18 +97799,,10069,2,2012-05-19 +97837,,2509,2,2012-05-20 +98036,,4187,2,2012-05-21 +98001,,28,2,2012-05-21 +98619,,2156,2,2012-05-23 +98803,,20667,2,2012-05-24 +99937,,20240,2,2012-05-29 +100257,,4714,2,2012-05-31 +100197,,28,2,2012-05-31 +100320,,4187,2,2012-05-31 +100259,,4705,2,2012-05-31 +100465,,26070,2,2012-06-01 +100469,,26070,2,2012-06-01 +100579,,26070,2,2012-06-01 +100879,,26070,2,2012-06-02 +101365,,28,2,2012-06-05 +102827,,26070,2,2012-06-11 +102989,,26070,2,2012-06-12 +103004,,28,2,2012-06-12 +103051,,26657,2,2012-06-12 +102944,,26070,2,2012-06-12 +103057,,10008,2,2012-06-12 +103048,,26070,2,2012-06-12 +103038,,26070,2,2012-06-12 +103018,,26657,2,2012-06-12 +103690,,412,2,2012-06-15 +103640,,412,2,2012-06-15 +103612,,412,15,2012-06-15 +103720,,412,2,2012-06-15 +103710,,26657,2,2012-06-15 +103707,,26070,2,2012-06-15 +103672,,412,15,2012-06-15 +103955,,2509,2,2012-06-16 +104405,,28,2,2012-06-18 +104867,,27132,2,2012-06-20 +104794,,27120,2,2012-06-20 +104871,,27132,2,2012-06-20 +104860,,27120,2,2012-06-20 +104859,,27132,2,2012-06-20 +104870,,27132,1,2012-06-20 +105110,,27120,2,2012-06-21 +105289,,27132,2,2012-06-21 +105112,,27132,2,2012-06-21 +105306,,27132,2,2012-06-21 +105305,,27120,2,2012-06-21 +105653,,27194,2,2012-06-22 +105577,,27194,2,2012-06-22 +105569,,27194,2,2012-06-22 +105529,,27194,2,2012-06-22 +105494,,10008,2,2012-06-22 +107306,,10069,2,2012-06-30 +108055,,10008,2,2012-07-04 +108331,,414,2,2012-07-05 +109456,,412,2,2012-07-10 +109599,,412,2,2012-07-11 +111204,,26070,2,2012-07-17 +111487,,23087,2,2012-07-18 +111811,,4187,2,2012-07-20 +112512,,1248,2,2012-07-23 +112597,,4187,2,2012-07-24 +113036,,414,2,2012-07-25 +113035,,28,2,2012-07-25 +113275,,356,2,2012-07-26 +113657,,2509,2,2012-07-27 +113936,,412,2,2012-07-28 +114453,,1248,2,2012-07-31 +114655,,9529,2,2012-08-01 +114848,,4187,2,2012-08-02 +115237,,412,2,2012-08-03 +115375,,2509,2,2012-08-04 +115311,,2509,2,2012-08-04 +115295,,414,2,2012-08-04 +115414,,2509,2,2012-08-05 +115439,,356,2,2012-08-05 +117744,,4187,2,2012-08-14 +118166,,10008,2,2012-08-15 +118617,,15542,2,2012-08-16 +119072,,543,2,2012-08-18 +118947,,541,2,2012-08-18 +118945,,541,2,2012-08-18 +119071,,541,2,2012-08-18 +119034,,543,2,2012-08-18 +119009,,541,2,2012-08-18 +118978,,541,2,2012-08-18 +118975,,541,2,2012-08-18 +119200,,414,2,2012-08-19 +119187,,541,2,2012-08-19 +119508,,541,2,2012-08-20 +119339,,30434,2,2012-08-20 +119374,,10069,2,2012-08-20 +119804,,10008,2,2012-08-21 +119810,,10069,2,2012-08-21 +120020,,30434,2,2012-08-22 +120041,,30434,1,2012-08-22 
+120288,,414,2,2012-08-23 +121184,,30862,2,2012-08-28 +121314,,414,2,2012-08-28 +121434,,30862,2,2012-08-28 +121399,,30862,2,2012-08-28 +121395,,30864,2,2012-08-28 +121353,,30864,2,2012-08-28 +121382,,30864,2,2012-08-28 +121398,,30864,1,2012-08-28 +121476,,30862,2,2012-08-28 +121400,,30862,2,2012-08-28 +121494,,30862,2,2012-08-28 +121501,,30864,2,2012-08-28 +121396,,30864,2,2012-08-28 +121480,,30862,2,2012-08-28 +121192,,30864,2,2012-08-28 +121257,,30862,2,2012-08-28 +121247,,30864,2,2012-08-28 +121575,,30960,2,2012-08-29 +121537,,30957,2,2012-08-29 +121598,,30960,2,2012-08-29 +121596,,30957,2,2012-08-29 +121654,,412,2,2012-08-29 +121576,,30960,2,2012-08-29 +121992,,30960,1,2012-08-30 +121824,,4705,2,2012-08-30 +121764,,30862,2,2012-08-30 +121757,,30864,2,2012-08-30 +121947,,30960,2,2012-08-30 +121895,,2509,2,2012-08-30 +123028,,30862,2,2012-09-04 +123012,,30862,2,2012-09-04 +123029,,30864,2,2012-09-04 +123011,,30864,2,2012-09-04 +123242,,30862,2,2012-09-05 +123356,,10008,2,2012-09-06 +123537,,9529,2,2012-09-06 +124357,,414,2,2012-09-10 +124533,,31587,2,2012-09-11 +124769,,2509,2,2012-09-11 +124612,,31587,1,2012-09-11 +124603,,31575,2,2012-09-11 +124566,,2509,2,2012-09-11 +125053,,412,2,2012-09-12 +124980,,10911,2,2012-09-12 +125080,,31587,2,2012-09-13 +125599,,31587,16,2012-09-15 +126271,,4187,2,2012-09-19 +126446,,32053,1,2012-09-20 +126447,,32053,2,2012-09-20 +126651,,32038,2,2012-09-20 +126636,,32038,2,2012-09-20 +126614,,32038,2,2012-09-20 +126583,,32038,2,2012-09-20 +126486,,32038,2,2012-09-20 +126729,,2509,2,2012-09-21 +126850,,32053,2,2012-09-21 +126858,,4187,2,2012-09-21 +126719,,30960,2,2012-09-21 +126695,,2509,2,2012-09-21 +127590,,32317,2,2012-09-25 +127649,,412,2,2012-09-25 +127909,,32317,2,2012-09-26 +128379,,28,2,2012-09-28 +128301,,28,2,2012-09-28 +128527,,414,2,2012-09-29 +128890,,414,2,2012-09-30 +129022,,32038,2,2012-10-01 +129256,,26070,2,2012-10-02 +129412,,414,2,2012-10-02 +129825,,28,2,2012-10-03 +129724,,4187,2,2012-10-03 +129706,,28,2,2012-10-03 +130889,,15542,2,2012-10-07 +131100,,30864,2,2012-10-08 +131276,,5015,2,2012-10-08 +131132,,4187,2,2012-10-08 +131154,,414,2,2012-10-08 +131581,,28,2,2012-10-09 +131574,,412,2,2012-10-09 +131695,,4705,2,2012-10-09 +131730,,28,2,2012-10-10 +132537,,14729,2,2012-10-13 +132903,,28,2,2012-10-15 +133193,,33598,2,2012-10-16 +133180,,33598,2,2012-10-16 +133426,,3646,2,2012-10-18 +133427,,3649,2,2012-10-18 +134089,,4187,2,2012-10-21 +134353,,1248,2,2012-10-23 +134680,,1248,2,2012-10-24 +134724,,1248,2,2012-10-24 +134874,,34166,2,2012-10-25 +134859,,34166,2,2012-10-25 +134858,,34166,2,2012-10-25 +134851,,34166,2,2012-10-25 +134840,,34166,2,2012-10-25 +134835,,34166,2,2012-10-25 +134861,,34166,2,2012-10-25 +135041,,34166,2,2012-10-26 +135085,,34166,2,2012-10-26 +134988,,34166,2,2012-10-26 +135104,,34166,2,2012-10-26 +135023,,34166,2,2012-10-26 +135149,,34166,2,2012-10-26 +135004,,34166,2,2012-10-26 +135131,,34166,2,2012-10-26 +135094,,34166,2,2012-10-26 +135349,,34166,2,2012-10-27 +135323,,34166,2,2012-10-27 +135809,,34166,2,2012-10-30 +135755,,34166,2,2012-10-30 +135962,,34166,2,2012-10-31 +137142,,28,2,2012-11-05 +137084,,16366,2,2012-11-05 +137082,,4187,2,2012-11-05 +137850,,28,2,2012-11-08 +137914,,414,2,2012-11-09 +137887,,414,2,2012-11-09 +138250,,35097,2,2012-11-11 +138294,,35097,2,2012-11-11 +138295,,35097,2,2012-11-11 +138275,,35097,2,2012-11-11 +138266,,35097,2,2012-11-11 +138274,,35097,2,2012-11-11 +138287,,35097,2,2012-11-11 +138277,,35097,2,2012-11-11 +138302,,35097,2,2012-11-11 +138344,,35097,2,2012-11-11 
+138297,,35097,2,2012-11-11 +138304,,412,2,2012-11-11 +138358,,35097,2,2012-11-11 +138270,,35097,2,2012-11-11 +138249,,35097,2,2012-11-11 +138242,,35097,2,2012-11-11 +138262,,35097,2,2012-11-11 +138483,,35160,2,2012-11-12 +138486,,35097,2,2012-11-12 +138586,,35097,2,2012-11-12 +138400,,35160,2,2012-11-12 +138394,,35097,2,2012-11-12 +138393,,35097,2,2012-11-12 +138533,,35097,2,2012-11-12 +138524,,35097,2,2012-11-12 +138505,,35097,2,2012-11-12 +138481,,35097,2,2012-11-12 +138465,,35160,2,2012-11-12 +138428,,35097,2,2012-11-12 +138412,,35097,2,2012-11-12 +138429,,35097,2,2012-11-12 +138466,,35160,2,2012-11-12 +138534,,35097,2,2012-11-12 +138472,,35097,2,2012-11-12 +138411,,35097,2,2012-11-12 +138787,,35160,2,2012-11-13 +138709,,35160,2,2012-11-13 +138852,,35097,2,2012-11-13 +138811,,35160,2,2012-11-13 +138790,,35097,2,2012-11-13 +138720,,35097,2,2012-11-13 +138781,,35160,2,2012-11-13 +138759,,34166,2,2012-11-13 +138761,,35097,2,2012-11-13 +138694,,35249,2,2012-11-13 +138680,,35160,3,2012-11-13 +138628,,35097,2,2012-11-13 +138729,,35097,2,2012-11-13 +138815,,35160,2,2012-11-13 +138721,,35160,2,2012-11-13 +139015,,35097,2,2012-11-14 +139046,,35097,2,2012-11-14 +139043,,414,2,2012-11-14 +139016,,35160,2,2012-11-14 +139009,,35097,2,2012-11-14 +139021,,35160,2,2012-11-14 +139008,,35160,2,2012-11-14 +139264,,414,2,2012-11-15 +139192,,35160,2,2012-11-15 +139236,,35097,2,2012-11-15 +139161,,35160,2,2012-11-15 +139314,,35097,2,2012-11-15 +139410,,35160,2,2012-11-16 +139405,,35160,2,2012-11-16 +139456,,35097,2,2012-11-16 +139457,,35160,2,2012-11-16 +139413,,35097,2,2012-11-16 +139785,,10069,2,2012-11-18 +139784,,10008,2,2012-11-18 +139786,,10008,2,2012-11-18 +139797,,1248,2,2012-11-18 +140248,,541,2,2012-11-21 +140937,,13058,2,2012-11-24 +141238,,412,2,2012-11-26 +141872,,414,2,2012-11-28 +142036,,414,2,2012-11-29 +142682,,414,2,2012-12-01 +142662,,13058,2,2012-12-01 +142719,,25087,1,2012-12-01 +142777,,28,2,2012-12-02 +143062,,414,2,2012-12-03 +142947,,10069,2,2012-12-03 +142929,,10008,2,2012-12-03 +143421,,414,2,2012-12-04 +143389,,28,2,2012-12-04 +143438,,35097,2,2012-12-04 +143444,,35160,2,2012-12-04 +143626,,2509,2,2012-12-05 +143689,,541,2,2012-12-05 +143926,,20240,2,2012-12-06 +143912,,20234,2,2012-12-06 +143820,,4187,2,2012-12-06 +143817,,2509,2,2012-12-06 +143816,,28,2,2012-12-06 +144014,,4187,2,2012-12-07 +144430,,2509,2,2012-12-09 +144365,,28,2,2012-12-09 +144353,,35160,2,2012-12-09 +144714,,2509,2,2012-12-10 +145102,,34166,2,2012-12-12 +145618,,37182,2,2012-12-14 +145734,,34166,2,2012-12-14 +145727,,414,2,2012-12-14 +145907,,414,2,2012-12-15 +146184,,28,2,2012-12-16 +146108,,37182,2,2012-12-16 +146074,,25072,2,2012-12-16 +146228,,13058,2,2012-12-17 +146427,,143,2,2012-12-18 +147545,,20667,2,2012-12-22 +147571,,20667,3,2012-12-22 +147803,,30434,2,2012-12-23 +147810,,16366,2,2012-12-23 +147911,,414,2,2012-12-24 +147983,,414,2,2012-12-25 +148321,,37748,15,2012-12-27 +148524,,33598,2,2012-12-30 +148577,,33598,2,2012-12-30 +148593,,13058,2,2012-12-30 +148592,,13060,2,2012-12-30 +148895,,34166,2,2013-01-02 +148969,,37981,2,2013-01-03 +148983,,37981,2,2013-01-03 +148996,,37981,2,2013-01-03 +149248,,1248,2,2013-01-03 +149191,,1248,2,2013-01-03 +149157,,37981,2,2013-01-03 +149051,,37981,2,2013-01-03 +149399,,28,2,2013-01-04 +149388,,3188,2,2013-01-04 +149339,,37182,2,2013-01-04 +149680,,34166,2,2013-01-05 +150106,,1248,2,2013-01-07 +150463,,28,2,2013-01-09 +151021,,4187,2,2013-01-11 +151375,,30960,2,2013-01-13 +151366,,30957,2,2013-01-13 +151516,,34166,2,2013-01-14 +151874,,10008,2,2013-01-15 
+152020,,34166,2,2013-01-16 +151973,,2509,2,2013-01-16 +152043,,2509,2,2013-01-16 +152395,,35160,3,2013-01-17 +152503,,414,2,2013-01-17 +152556,,414,2,2013-01-17 +152398,,13058,2,2013-01-17 +152397,,13060,2,2013-01-17 +152552,,414,2,2013-01-17 +152622,,34166,2,2013-01-18 +152923,,16537,2,2013-01-19 +152970,,28,2,2013-01-20 +153180,,4187,2,2013-01-21 +153295,,4187,2,2013-01-22 +153594,,414,2,2013-01-23 +153434,,412,2,2013-01-23 +153725,,414,2,2013-01-24 +153657,,37981,2,2013-01-24 +153863,,26070,2,2013-01-25 +154648,,33598,2,2013-01-29 +155187,,13060,2,2013-02-01 +155442,,13058,2,2013-02-02 +155687,,13060,2,2013-02-03 +155686,,13058,2,2013-02-03 +155771,,414,2,2013-02-04 +155973,,28,2,2013-02-05 +156688,,28,2,2013-02-08 +156795,,412,2,2013-02-08 +156942,,40121,2,2013-02-10 +157000,,40104,2,2013-02-10 +157130,,40104,2,2013-02-11 +159038,,34166,2,2013-02-19 +158945,,2509,2,2013-02-19 +159303,,28,2,2013-02-20 +159502,,541,2,2013-02-21 +159991,,40859,2,2013-02-22 +159990,,40859,2,2013-02-22 +160014,,40859,2,2013-02-22 +159840,,412,2,2013-02-22 +160261,,2509,2,2013-02-24 +160340,,4187,2,2013-02-24 +160759,,10911,2,2013-02-26 +161344,,41244,2,2013-03-01 +161346,,414,2,2013-03-01 +161436,,41244,3,2013-03-01 +161826,,34166,2,2013-03-04 +162394,,28,2,2013-03-06 +162639,,2509,2,2013-03-07 +162829,,1760,2,2013-03-08 +163165,,37981,2,2013-03-10 +163401,,2509,2,2013-03-11 +163569,,2509,2,2013-03-11 +165071,,42513,2,2013-03-19 +165070,,42517,2,2013-03-19 +165043,,28,2,2013-03-19 +165086,,42517,2,2013-03-19 +165081,,42517,2,2013-03-19 +165080,,42513,2,2013-03-19 +165078,,42517,2,2013-03-19 +165344,,42513,2,2013-03-20 +165346,,42517,2,2013-03-20 +165334,,2509,2,2013-03-20 +165489,,42513,2,2013-03-20 +165355,,42517,1,2013-03-20 +165287,,42517,2,2013-03-20 +165345,,42517,2,2013-03-20 +165478,,42517,2,2013-03-20 +165474,,42517,2,2013-03-20 +165463,,42517,15,2013-03-20 +165457,,42517,2,2013-03-20 +165448,,42517,2,2013-03-20 +165447,,42513,2,2013-03-20 +165667,,2509,2,2013-03-21 +165635,,42513,2,2013-03-21 +165685,,42517,2,2013-03-21 +165744,,13631,2,2013-03-22 +165819,,42517,2,2013-03-22 +165793,,2509,2,2013-03-22 +166166,,26070,2,2013-03-24 +166305,,42517,2,2013-03-25 +166409,,42517,2,2013-03-26 +166894,,2509,2,2013-03-28 +167006,,3649,2,2013-03-28 +167005,,3646,2,2013-03-28 +167002,,31575,2,2013-03-28 +167266,,28,2,2013-03-29 +167189,,34166,2,2013-03-29 +167448,,2509,2,2013-03-30 +167821,,9524,2,2013-04-01 +167686,,9529,2,2013-04-01 +167869,,43458,2,2013-04-02 +168035,,42517,2,2013-04-02 +168056,,543,2,2013-04-02 +168531,,2509,2,2013-04-04 +168433,,28,2,2013-04-04 +168877,,30434,2,2013-04-06 +168996,,414,2,2013-04-07 +169220,,34166,2,2013-04-08 +169965,,42517,2,2013-04-10 +169966,,42513,2,2013-04-10 +170288,,2509,2,2013-04-11 +170137,,34166,2,2013-04-11 +170404,,1248,2,2013-04-12 +170567,,28,2,2013-04-12 +170597,,2509,2,2013-04-12 +170592,,4187,2,2013-04-12 +171117,,414,2,2013-04-14 +171592,,1248,2,2013-04-16 +172113,,412,2,2013-04-18 +172310,,42517,2,2013-04-19 +172493,,44772,16,2013-04-20 +172732,,16537,16,2013-04-21 +173012,,44772,2,2013-04-22 +173325,,1248,2,2013-04-24 +173341,,414,2,2013-04-24 +173356,,34166,2,2013-04-24 +173573,,10069,2,2013-04-24 +174185,,414,2,2013-04-26 +174045,,45279,2,2013-04-26 +174285,,37748,2,2013-04-27 +174456,,10008,2,2013-04-28 +174687,,10069,2,2013-04-29 +174638,,45457,16,2013-04-29 +174898,,45536,1,2013-04-30 +174951,,45543,2,2013-04-30 +174939,,45543,2,2013-04-30 +175149,,28,2,2013-05-01 +175327,,45279,2,2013-05-02 +175437,,28,2,2013-05-02 +175349,,45279,2,2013-05-02 
+175830,,10069,2,2013-05-04 +175934,,37981,2,2013-05-04 +176312,,4187,2,2013-05-06 +176240,,4187,2,2013-05-06 +176751,,46070,2,2013-05-08 +176897,,46070,2,2013-05-08 +176792,,20240,2,2013-05-08 +176775,,46070,2,2013-05-08 +176953,,414,2,2013-05-08 +176929,,28,2,2013-05-08 +177003,,28,2,2013-05-09 +177328,,9524,2,2013-05-10 +177407,,13058,2,2013-05-11 +177346,,4705,2,2013-05-11 +177658,,2509,2,2013-05-12 +177588,,4187,2,2013-05-12 +177697,,28,2,2013-05-13 +178671,,44370,2,2013-05-16 +178766,,16537,2,2013-05-16 +178984,,28,2,2013-05-17 +178982,,414,2,2013-05-17 +179268,,1248,2,2013-05-19 +179278,,414,2,2013-05-19 +179566,,2509,2,2013-05-20 +179774,,4187,2,2013-05-21 +179842,,28,2,2013-05-21 +180289,,3649,2,2013-05-22 +180187,,9529,1,2013-05-22 +180290,,3646,2,2013-05-22 +180478,,414,2,2013-05-23 +180676,,2509,2,2013-05-24 +180977,,28,2,2013-05-25 +181610,,4187,2,2013-05-28 +181824,,47497,16,2013-05-28 +181900,,45279,2,2013-05-29 +182259,,37981,2,2013-05-30 +182245,,2509,2,2013-05-30 +182809,,47846,2,2013-06-02 +182807,,47846,2,2013-06-02 +182925,,47846,2,2013-06-02 +183233,,10008,2,2013-06-03 +183235,,5015,2,2013-06-03 +183162,,2509,2,2013-06-03 +183240,,10069,2,2013-06-03 +183086,,45543,2,2013-06-03 +183234,,5020,2,2013-06-03 +183314,,2509,2,2013-06-04 +183508,,47981,2,2013-06-04 +183398,,10069,2,2013-06-04 +183685,,48103,2,2013-06-05 +183823,,4187,2,2013-06-05 +183974,,16537,2,2013-06-06 +184084,,48133,2,2013-06-06 +183973,,15542,2,2013-06-06 +184154,,48133,2,2013-06-06 +184098,,48125,2,2013-06-06 +183952,,48133,2,2013-06-06 +184133,,48125,2,2013-06-06 +184130,,48103,2,2013-06-06 +184113,,48125,2,2013-06-06 +183950,,48103,2,2013-06-06 +184012,,48133,2,2013-06-06 +184185,,48133,2,2013-06-06 +184397,,28,2,2013-06-07 +184382,,414,2,2013-06-07 +184967,,26070,2,2013-06-10 +185006,,1248,2,2013-06-10 +185549,,4187,2,2013-06-12 +185945,,4187,2,2013-06-13 +186254,,37981,2,2013-06-14 +186449,,4187,2,2013-06-15 +186779,,4187,2,2013-06-16 +186735,,2509,2,2013-06-16 +187240,,28,2,2013-06-18 +187504,,2509,2,2013-06-19 +187501,,2509,2,2013-06-19 +187390,,34166,2,2013-06-19 +187989,,1248,2,2013-06-20 +187913,,414,2,2013-06-20 +187905,,2509,2,2013-06-20 +187892,,2509,2,2013-06-20 +188166,,2509,2,2013-06-21 +188109,,28,2,2013-06-21 +188186,,414,2,2013-06-21 +188183,,2509,2,2013-06-21 +188565,,4714,2,2013-06-23 +189119,,45280,6,2013-06-25 +189088,,45280,6,2013-06-25 +189248,,1248,2,2013-06-25 +189477,,2509,2,2013-06-26 +189865,,1248,2,2013-06-27 +189725,,1248,2,2013-06-27 +189621,,1248,2,2013-06-27 +190050,,3649,2,2013-06-28 +190201,,1248,2,2013-06-28 +190051,,3646,2,2013-06-28 +190273,,5015,2,2013-06-29 +190329,,48658,2,2013-06-29 +190424,,1248,2,2013-06-30 +190679,,10069,2,2013-07-01 +191301,,18335,2,2013-07-03 +191312,,3649,2,2013-07-03 +191299,,18345,2,2013-07-03 +191519,,49879,2,2013-07-03 +191325,,49906,2,2013-07-03 +191320,,49906,2,2013-07-03 +191257,,49906,2,2013-07-03 +191407,,2509,2,2013-07-03 +191400,,49906,2,2013-07-03 +191311,,3646,2,2013-07-03 +191791,,49906,2,2013-07-04 +191650,,28,2,2013-07-04 +192561,,49879,2,2013-07-07 +193011,,48658,2,2013-07-09 +193792,,26070,2,2013-07-11 +194293,,414,2,2013-07-12 +194491,,2509,2,2013-07-14 +194508,,50739,2,2013-07-14 +194568,,414,2,2013-07-14 +194537,,2509,2,2013-07-14 +195177,,2509,2,2013-07-16 +195415,,2509,2,2013-07-17 +195357,,2509,2,2013-07-17 +195498,,2509,2,2013-07-17 +195354,,4187,2,2013-07-17 +195726,,35160,2,2013-07-18 +195733,,35097,2,2013-07-18 +195945,,51047,2,2013-07-18 +195865,,49906,2,2013-07-18 +196000,,16209,2,2013-07-18 
+195999,,16212,2,2013-07-18 +196440,,3646,2,2013-07-20 +196392,,28,2,2013-07-20 +196619,,3646,16,2013-07-20 +196516,1790.0,50982,8,2013-07-20 +196511,,50982,2,2013-07-20 +196843,,35097,2,2013-07-21 +196880,,50982,2,2013-07-21 +197125,,23019,2,2013-07-22 +197122,,50982,2,2013-07-22 +197052,,34166,2,2013-07-22 +196955,,34166,2,2013-07-22 +197540,,1248,2,2013-07-23 +197526,,414,2,2013-07-23 +197656,,51644,16,2013-07-24 +197746,,1248,2,2013-07-24 +197817,,51644,2,2013-07-24 +198718,,50982,2,2013-07-26 +198854,,4187,2,2013-07-27 +199066,,50982,9,2013-07-28 +199407,,52099,16,2013-07-29 +199383,,28,2,2013-07-29 +199524,,2509,2,2013-07-29 +199780,,52126,16,2013-07-30 +199792,,52126,2,2013-07-30 +200128,,2509,2,2013-07-31 +200395,,2509,2,2013-08-01 +200462,,52449,2,2013-08-02 +200459,,52449,2,2013-08-02 +201184,,52567,2,2013-08-05 +201138,,52567,3,2013-08-05 +201523,,35097,2,2013-08-07 +201521,,1248,2,2013-08-07 +201592,,414,2,2013-08-07 +201591,,28,2,2013-08-07 +201593,,2509,2,2013-08-07 +201589,,4187,2,2013-08-07 +201531,,412,2,2013-08-07 +201812,,10069,2,2013-08-08 +201988,,42517,2,2013-08-08 +201811,,10008,2,2013-08-08 +201989,,42513,2,2013-08-08 +201964,,42517,2,2013-08-08 +202165,,52910,2,2013-08-09 +202160,,2509,2,2013-08-09 +202448,,2509,2,2013-08-10 +203432,,52449,2,2013-08-14 +203419,,53261,2,2013-08-14 +203464,,53264,2,2013-08-14 +203557,,53261,2,2013-08-14 +203556,,53261,2,2013-08-14 +203558,,53264,2,2013-08-14 +203527,,53261,2,2013-08-14 +203559,,53261,2,2013-08-14 +203465,,53264,1,2013-08-14 +203739,,53384,2,2013-08-15 +203828,,2509,2,2013-08-15 +204099,,53384,2,2013-08-16 +204223,,53391,2,2013-08-16 +204195,,53404,2,2013-08-16 +204181,,2509,2,2013-08-16 +204133,,53384,2,2013-08-16 +204358,,49879,2,2013-08-17 +204372,,2509,2,2013-08-17 +204743,,53384,2,2013-08-19 +204773,,53384,2,2013-08-19 +205012,,2509,2,2013-08-20 +205319,,414,2,2013-08-21 +205674,,44772,2,2013-08-22 +206107,,31575,2,2013-08-23 +206046,,31587,2,2013-08-23 +206303,,28,2,2013-08-24 +206338,,4187,2,2013-08-24 +206299,,414,2,2013-08-24 +206306,,4187,2,2013-08-24 +206420,,42517,2,2013-08-25 +206916,,2509,2,2013-08-27 +206915,,414,2,2013-08-27 +207310,,412,2,2013-08-28 +207653,,50739,16,2013-08-29 +207957,,48103,2,2013-08-30 +208451,,54506,2,2013-09-02 +208536,,2156,2,2013-09-03 +208615,,54624,1,2013-09-03 +208786,,54574,16,2013-09-03 +208781,,49906,16,2013-09-03 +208620,,54624,2,2013-09-03 +208825,,49906,2,2013-09-03 +208809,,54574,2,2013-09-03 +209042,,54637,16,2013-09-04 +208921,,54724,16,2013-09-04 +208886,,17000,1,2013-09-04 +209567,,4187,2,2013-09-06 +209563,,414,2,2013-09-06 +209590,,2509,2,2013-09-06 +209867,,47846,2,2013-09-08 +209832,,54915,3,2013-09-08 +209871,,54915,2,2013-09-08 +209794,,54915,2,2013-09-08 +210191,,55043,2,2013-09-10 +210469,,28,2,2013-09-10 +210397,,16313,2,2013-09-10 +210350,,16337,2,2013-09-10 +210471,,16337,2,2013-09-10 +210611,,3649,2,2013-09-11 +210610,,3646,2,2013-09-11 +210724,,55182,2,2013-09-12 +210733,,55182,2,2013-09-12 +210743,,55182,2,2013-09-12 +210866,,55150,2,2013-09-12 +210865,,55150,2,2013-09-12 +210766,,55182,2,2013-09-12 +210754,,55150,16,2013-09-12 +210978,,8529,2,2013-09-13 +211693,,55361,2,2013-09-16 +211506,,28,2,2013-09-16 +211912,,1248,2,2013-09-17 +211911,,55436,2,2013-09-17 +212117,,55617,2,2013-09-18 +212067,,55617,2,2013-09-18 +212205,,55576,15,2013-09-18 +212251,,48658,2,2013-09-18 +212394,,55722,2,2013-09-19 +212374,,55722,2,2013-09-19 +212673,,55576,2,2013-09-20 +212775,,55722,3,2013-09-20 +213654,,27132,2,2013-09-24 +213655,,27120,2,2013-09-24 
+214075,,541,2,2013-09-25 +213991,,4705,2,2013-09-25 +213990,,4714,2,2013-09-25 +214885,,56273,2,2013-09-27 +214944,,56273,16,2013-09-27 +215327,,543,16,2013-09-30 +215398,,541,2,2013-09-30 +215293,,543,2,2013-09-30 +215518,,56445,2,2013-09-30 +215399,,543,2,2013-09-30 +215336,,541,2,2013-09-30 +215633,,543,2,2013-10-01 +216063,,56580,16,2013-10-02 +216270,,56768,2,2013-10-03 +216368,,28,2,2013-10-03 +216360,,2509,2,2013-10-03 +216635,,56780,2,2013-10-04 +216626,,56783,2,2013-10-04 +216499,,56783,1,2013-10-04 +216755,,56875,3,2013-10-05 +216737,,56684,2,2013-10-05 +216828,,56859,2,2013-10-05 +216891,,56860,2,2013-10-05 +216827,,56860,2,2013-10-05 +216912,,56860,2,2013-10-05 +216908,,56860,2,2013-10-05 +216893,,56859,2,2013-10-05 +216826,,56859,2,2013-10-05 +216880,,56859,2,2013-10-05 +216809,,56875,2,2013-10-05 +216841,,56860,2,2013-10-05 +217009,,56928,2,2013-10-06 +216952,,56911,2,2013-10-06 +216938,11490.0,56768,8,2013-10-06 +217090,,414,2,2013-10-06 +217082,,56911,2,2013-10-06 +217123,,16998,2,2013-10-06 +217060,,56859,2,2013-10-06 +217059,,56860,2,2013-10-06 +217328,,56955,2,2013-10-07 +217324,,56955,2,2013-10-07 +217345,,56860,2,2013-10-07 +217282,,2509,2,2013-10-07 +217186,,56684,2,2013-10-07 +217151,,56860,2,2013-10-07 +217450,,57065,2,2013-10-08 +217458,,57065,2,2013-10-08 +217465,,57065,2,2013-10-08 +217464,,57065,2,2013-10-08 +217452,,57065,2,2013-10-08 +217671,,57055,1,2013-10-08 +217673,,57055,2,2013-10-08 +217630,,57055,2,2013-10-08 +217647,,57055,2,2013-10-08 +217643,,57065,2,2013-10-08 +217636,,57065,2,2013-10-08 +217497,,57065,2,2013-10-08 +217592,,57065,2,2013-10-08 +217585,,57026,2,2013-10-08 +217576,,4714,2,2013-10-08 +217539,,57065,2,2013-10-08 +217537,,57086,2,2013-10-08 +217502,,57012,2,2013-10-08 +217489,,57065,2,2013-10-08 +217483,,57065,2,2013-10-08 +217674,,57053,2,2013-10-08 +217955,,57128,2,2013-10-09 +217717,,57086,2,2013-10-09 +217938,,57110,2,2013-10-09 +217917,,57126,2,2013-10-09 +217910,,57086,2,2013-10-09 +217906,,57126,2,2013-10-09 +217898,,56860,1,2013-10-09 +217895,,3646,2,2013-10-09 +217832,,57110,16,2013-10-09 +217827,,57110,2,2013-10-09 +217815,,57164,2,2013-10-09 +217790,,57110,3,2013-10-09 +217788,,57160,2,2013-10-09 +217777,,28,2,2013-10-09 +217763,,57177,2,2013-10-09 +217952,,57128,2,2013-10-09 +217758,,57167,2,2013-10-09 +217757,,57164,2,2013-10-09 +217969,,57241,2,2013-10-10 +217968,,57230,2,2013-10-10 +217971,,57238,2,2013-10-10 +218219,,412,2,2013-10-10 +218217,,57212,2,2013-10-10 +218214,,57212,2,2013-10-10 +218223,,57194,2,2013-10-10 +218222,,57190,2,2013-10-10 +218221,,57196,2,2013-10-10 +218220,,57126,2,2013-10-10 +218211,,57206,2,2013-10-10 +218208,,57185,2,2013-10-10 +218207,,57191,2,2013-10-10 +218205,,57202,2,2013-10-10 +218202,,57196,2,2013-10-10 +218201,,57196,2,2013-10-10 +218194,,57202,2,2013-10-10 +218193,,57198,2,2013-10-10 +218189,,57196,2,2013-10-10 +218187,,57223,2,2013-10-10 +218185,,57177,1,2013-10-10 +218182,,57203,2,2013-10-10 +218180,,57224,1,2013-10-10 +218178,,57211,2,2013-10-10 +218177,,57210,2,2013-10-10 +218176,,57188,2,2013-10-10 +218174,,57184,2,2013-10-10 +218173,,57220,2,2013-10-10 +218172,,57195,2,2013-10-10 +218167,,57207,2,2013-10-10 +218164,,57207,2,2013-10-10 +218163,,57211,2,2013-10-10 +218162,,57215,2,2013-10-10 +218161,,57205,2,2013-10-10 +218160,,57177,2,2013-10-10 +218158,,57192,2,2013-10-10 +218150,,57186,2,2013-10-10 +218142,,57185,2,2013-10-10 +218140,,57086,2,2013-10-10 +218135,,57177,2,2013-10-10 +218131,,57183,2,2013-10-10 +218125,,5015,2,2013-10-10 +218121,,57015,2,2013-10-10 
+218118,,57177,2,2013-10-10 +218115,,57190,2,2013-10-10 +218114,,57196,2,2013-10-10 +218113,,57195,2,2013-10-10 +218107,,57198,2,2013-10-10 +218106,,57196,2,2013-10-10 +218104,,57195,2,2013-10-10 +218103,,57196,2,2013-10-10 +218101,,57195,2,2013-10-10 +218099,,57192,2,2013-10-10 +218098,,57192,2,2013-10-10 +218094,,57128,2,2013-10-10 +218090,,57256,2,2013-10-10 +218089,,57249,2,2013-10-10 +218088,,57249,2,2013-10-10 +218085,,57215,1,2013-10-10 +218082,,57230,3,2013-10-10 +218081,,57251,2,2013-10-10 +218079,,57250,2,2013-10-10 +218072,,414,2,2013-10-10 +218071,,57252,2,2013-10-10 +218070,,57231,2,2013-10-10 +218069,,57245,2,2013-10-10 +218068,,57012,2,2013-10-10 +218063,,57245,2,2013-10-10 +218061,,57193,1,2013-10-10 +218093,,57222,2,2013-10-10 +218060,,57249,2,2013-10-10 +218059,,57256,2,2013-10-10 +218058,,57086,2,2013-10-10 +218100,,57195,2,2013-10-10 +218054,,57265,2,2013-10-10 +218053,,57262,2,2013-10-10 +218052,,57262,2,2013-10-10 +218105,,57192,2,2013-10-10 +218051,,57264,2,2013-10-10 +218050,,57262,2,2013-10-10 +218049,,57249,2,2013-10-10 +218048,,57264,2,2013-10-10 +218047,,57261,2,2013-10-10 +218046,,57251,2,2013-10-10 +218044,,57256,2,2013-10-10 +218123,,5020,2,2013-10-10 +218043,,57249,2,2013-10-10 +218042,,57253,2,2013-10-10 +218041,,56955,2,2013-10-10 +218139,,57183,2,2013-10-10 +218040,,57065,2,2013-10-10 +218036,,57259,2,2013-10-10 +218035,,57258,2,2013-10-10 +218153,,57183,2,2013-10-10 +218034,,57065,2,2013-10-10 +218033,,57237,2,2013-10-10 +218032,,57238,2,2013-10-10 +218031,,57251,2,2013-10-10 +218030,,57256,2,2013-10-10 +218028,,57250,2,2013-10-10 +218027,,57233,2,2013-10-10 +218171,,57217,2,2013-10-10 +218026,,57236,2,2013-10-10 +218025,,57238,16,2013-10-10 +218021,,57238,2,2013-10-10 +218175,,57184,1,2013-10-10 +218018,,57237,2,2013-10-10 +218016,,57237,2,2013-10-10 +218015,,57236,2,2013-10-10 +218179,,57223,2,2013-10-10 +218014,,52871,16,2013-10-10 +218013,,57128,2,2013-10-10 +218012,,57220,2,2013-10-10 +218011,,57233,2,2013-10-10 +218010,,57222,2,2013-10-10 +218008,,57164,2,2013-10-10 +218006,,57229,2,2013-10-10 +218196,,57203,2,2013-10-10 +218005,,57231,16,2013-10-10 +218003,,57194,2,2013-10-10 +218001,,57177,2,2013-10-10 +218206,,57164,2,2013-10-10 +218000,,57203,2,2013-10-10 +217999,,57223,2,2013-10-10 +217996,,57211,2,2013-10-10 +218212,,57209,2,2013-10-10 +217995,,57238,2,2013-10-10 +217992,,57177,2,2013-10-10 +217987,,57243,2,2013-10-10 +217983,,57225,1,2013-10-10 +217980,,57229,2,2013-10-10 +217979,,57229,2,2013-10-10 +217973,,57236,2,2013-10-10 +217972,,57239,2,2013-10-10 +218225,,57306,2,2013-10-11 +218435,,57280,2,2013-10-11 +218226,,57304,2,2013-10-11 +218444,,57278,2,2013-10-11 +218455,,57288,2,2013-10-11 +218442,,57286,2,2013-10-11 +218450,,57294,1,2013-10-11 +218449,,57294,2,2013-10-11 +218441,,48597,2,2013-10-11 +218447,,57293,2,2013-10-11 +218446,,57290,16,2013-10-11 +218445,,57283,2,2013-10-11 +218440,,57288,2,2013-10-11 +218448,,57294,11,2013-10-11 +218431,,57287,2,2013-10-11 +218430,,57065,2,2013-10-11 +218427,,57299,2,2013-10-11 +218424,,57287,2,2013-10-11 +218423,,57288,2,2013-10-11 +218421,,57288,2,2013-10-11 +218418,,57297,2,2013-10-11 +218417,,57301,2,2013-10-11 +218415,,57285,1,2013-10-11 +218414,,57285,2,2013-10-11 +218413,,57242,2,2013-10-11 +218411,,37182,2,2013-10-11 +218409,,57271,2,2013-10-11 +218407,,57252,2,2013-10-11 +218406,,57268,2,2013-10-11 +218405,,57276,2,2013-10-11 +218403,,57279,2,2013-10-11 +218402,,57297,2,2013-10-11 +218401,,56784,2,2013-10-11 +218396,,57273,2,2013-10-11 +218393,,57273,2,2013-10-11 
+218392,,57271,2,2013-10-11 +218390,,57276,2,2013-10-11 +218389,,56784,2,2013-10-11 +218388,594.0,56784,8,2013-10-11 +218387,,57275,2,2013-10-11 +218385,,57276,2,2013-10-11 +218384,,57239,2,2013-10-11 +218383,,57233,1,2013-10-11 +218381,,57275,2,2013-10-11 +218378,,57272,2,2013-10-11 +218376,,57270,2,2013-10-11 +218372,,57267,2,2013-10-11 +218371,,57156,2,2013-10-11 +218368,,57277,2,2013-10-11 +218367,,56784,2,2013-10-11 +218365,,57258,2,2013-10-11 +218364,,56784,2,2013-10-11 +218363,,57271,2,2013-10-11 +218361,,57243,2,2013-10-11 +218360,,57220,16,2013-10-11 +218359,,57282,16,2013-10-11 +218357,,57271,2,2013-10-11 +218355,,57283,1,2013-10-11 +218354,,57266,2,2013-10-11 +218352,,57256,2,2013-10-11 +218350,,57271,2,2013-10-11 +218347,,57256,2,2013-10-11 +218346,,57278,2,2013-10-11 +218345,,57280,2,2013-10-11 +218344,,57265,2,2013-10-11 +218335,,57316,2,2013-10-11 +218334,,57315,2,2013-10-11 +218358,,56372,2,2013-10-11 +218332,,57325,2,2013-10-11 +218326,,57324,2,2013-10-11 +218325,,57323,2,2013-10-11 +218362,,56768,2,2013-10-11 +218324,,57321,2,2013-10-11 +218323,,57324,2,2013-10-11 +218321,,57324,2,2013-10-11 +218317,,37182,2,2013-10-11 +218316,,57320,2,2013-10-11 +218314,,57317,2,2013-10-11 +218311,,57314,2,2013-10-11 +218373,,57270,16,2013-10-11 +218309,,57317,2,2013-10-11 +218308,,57323,2,2013-10-11 +218306,,57334,2,2013-10-11 +218382,,57272,2,2013-10-11 +218302,,57337,2,2013-10-11 +218301,,57336,2,2013-10-11 +218300,,57337,1,2013-10-11 +218386,,57274,2,2013-10-11 +218299,,57336,2,2013-10-11 +218297,,47981,2,2013-10-11 +218294,,57255,2,2013-10-11 +218293,,57328,2,2013-10-11 +218291,,57255,2,2013-10-11 +218285,,57309,2,2013-10-11 +218284,,57317,2,2013-10-11 +218398,,57257,2,2013-10-11 +218283,,57325,2,2013-10-11 +218279,,57266,2,2013-10-11 +218278,,57307,2,2013-10-11 +218404,,35249,2,2013-10-11 +218276,,57254,2,2013-10-11 +218275,,57307,2,2013-10-11 +218272,,57293,2,2013-10-11 +218408,,56784,2,2013-10-11 +218270,,57307,3,2013-10-11 +218269,,57306,1,2013-10-11 +218268,,57307,3,2013-10-11 +218267,,57306,2,2013-10-11 +218266,,57281,2,2013-10-11 +218265,,57305,2,2013-10-11 +218263,,57279,2,2013-10-11 +218420,,57293,2,2013-10-11 +218258,,57306,2,2013-10-11 +218257,,57306,2,2013-10-11 +218255,,57293,2,2013-10-11 +218426,,57256,1,2013-10-11 +218254,,57281,1,2013-10-11 +218252,,57310,2,2013-10-11 +218250,,57309,2,2013-10-11 +218434,,57287,2,2013-10-11 +218249,,57302,2,2013-10-11 +218247,,57310,2,2013-10-11 +218244,,57311,2,2013-10-11 +218240,,57315,2,2013-10-11 +218238,,57314,2,2013-10-11 +218237,,57314,2,2013-10-11 +218235,,57312,2,2013-10-11 +218234,,57313,2,2013-10-11 +218232,,57313,2,2013-10-11 +218231,,57313,2,2013-10-11 +218230,,57297,2,2013-10-11 +218451,,57294,2,2013-10-11 +218229,,57278,2,2013-10-11 +218228,,57309,2,2013-10-11 +218572,,57323,2,2013-10-12 +218458,,56784,2,2013-10-12 +218461,,57352,2,2013-10-12 +218617,,57348,2,2013-10-12 +218460,,10911,2,2013-10-12 +218649,,57345,2,2013-10-12 +218648,,57271,2,2013-10-12 +218459,,57219,2,2013-10-12 +218636,,47981,2,2013-10-12 +218634,,57341,2,2013-10-12 +218631,,57308,2,2013-10-12 +218629,,57345,2,2013-10-12 +218628,,47981,2,2013-10-12 +218626,,57320,1,2013-10-12 +218623,,47981,2,2013-10-12 +218622,,57353,2,2013-10-12 +218621,,57352,2,2013-10-12 +218620,,57351,2,2013-10-12 +218619,,57352,1,2013-10-12 +218618,,57320,2,2013-10-12 +218614,,57324,2,2013-10-12 +218613,,57350,2,2013-10-12 +218611,,57341,2,2013-10-12 +218608,,57349,2,2013-10-12 +218607,,57256,2,2013-10-12 +218603,,9524,2,2013-10-12 +218601,,56784,2,2013-10-12 
+218599,,57337,2,2013-10-12 +218598,,57241,2,2013-10-12 +218597,,57245,2,2013-10-12 +218596,,57258,2,2013-10-12 +218595,,57271,2,2013-10-12 +218594,,57206,2,2013-10-12 +218592,,57195,2,2013-10-12 +218591,,57329,2,2013-10-12 +218589,,57342,2,2013-10-12 +218586,,57347,2,2013-10-12 +218585,,10911,2,2013-10-12 +218584,,57273,2,2013-10-12 +218582,,57323,2,2013-10-12 +218581,,57340,2,2013-10-12 +218580,,57301,1,2013-10-12 +218577,,57323,2,2013-10-12 +218576,,57302,2,2013-10-12 +218575,,57326,2,2013-10-12 +218571,,57340,2,2013-10-12 +218570,,57249,2,2013-10-12 +218569,,57256,2,2013-10-12 +218563,,57337,2,2013-10-12 +218559,,57233,2,2013-10-12 +218556,,57212,2,2013-10-12 +218555,,57286,2,2013-10-12 +218552,,57246,2,2013-10-12 +218551,,57353,1,2013-10-12 +218545,,57334,2,2013-10-12 +218544,,47981,2,2013-10-12 +218542,,57347,2,2013-10-12 +218583,,57026,2,2013-10-12 +218538,,47981,2,2013-10-12 +218537,,57346,2,2013-10-12 +218527,,57334,2,2013-10-12 +218587,,57343,2,2013-10-12 +218526,,47981,2,2013-10-12 +218525,,57365,2,2013-10-12 +218524,,57342,2,2013-10-12 +218593,,57194,2,2013-10-12 +218522,,57367,2,2013-10-12 +218517,,56911,2,2013-10-12 +218516,,57357,2,2013-10-12 +218514,,15542,2,2013-10-12 +218513,,57372,3,2013-10-12 +218512,,47981,2,2013-10-12 +218511,,57334,2,2013-10-12 +218602,,57334,2,2013-10-12 +218508,,57372,2,2013-10-12 +218507,,57346,2,2013-10-12 +218506,,57372,3,2013-10-12 +218610,,57343,2,2013-10-12 +218505,,57372,3,2013-10-12 +218502,,57334,2,2013-10-12 +218501,,57370,2,2013-10-12 +218616,,57318,2,2013-10-12 +218500,,57334,2,2013-10-12 +218498,,57356,2,2013-10-12 +218493,,57342,2,2013-10-12 +218491,,57346,2,2013-10-12 +218487,,57353,2,2013-10-12 +218486,,57351,2,2013-10-12 +218481,,57359,2,2013-10-12 +218625,,57185,3,2013-10-12 +218477,,57351,2,2013-10-12 +218475,,57351,2,2013-10-12 +218473,,57350,2,2013-10-12 +218630,,57332,2,2013-10-12 +218470,,57359,2,2013-10-12 +218469,,57310,2,2013-10-12 +218468,,57356,2,2013-10-12 +218646,,56768,2,2013-10-12 +218466,,57310,1,2013-10-12 +218463,,56768,2,2013-10-12 +218462,,57317,2,2013-10-12 +218653,,57350,2,2013-10-13 +218651,,57195,2,2013-10-13 +218650,,57195,2,2013-10-13 +218807,,57376,2,2013-10-13 +218804,,57379,16,2013-10-13 +218814,,57378,2,2013-10-13 +218802,,57357,2,2013-10-13 +218669,,57380,2,2013-10-13 +218797,,57351,2,2013-10-13 +218796,,57350,2,2013-10-13 +218793,,57386,2,2013-10-13 +218787,,57368,2,2013-10-13 +218785,,57359,2,2013-10-13 +218783,,57380,2,2013-10-13 +218781,,57388,2,2013-10-13 +218780,,57389,2,2013-10-13 +218779,,57388,2,2013-10-13 +218778,,57387,2,2013-10-13 +218777,,57384,3,2013-10-13 +218776,,57386,2,2013-10-13 +218770,,57321,2,2013-10-13 +218768,,57353,2,2013-10-13 +218765,,57369,2,2013-10-13 +218764,,57363,2,2013-10-13 +218759,,57357,2,2013-10-13 +218757,,57364,2,2013-10-13 +218756,,57359,2,2013-10-13 +218754,,57370,2,2013-10-13 +218752,,57293,2,2013-10-13 +218751,,57351,2,2013-10-13 +218750,,57350,2,2013-10-13 +218749,,57356,2,2013-10-13 +218737,,57367,2,2013-10-13 +218721,,57271,1,2013-10-13 +218719,,49906,2,2013-10-13 +218717,,57394,2,2013-10-13 +218716,,57324,2,2013-10-13 +218715,,57350,2,2013-10-13 +218710,,57405,2,2013-10-13 +218709,,57392,2,2013-10-13 +218708,,57373,2,2013-10-13 +218706,,57392,2,2013-10-13 +218705,,57381,2,2013-10-13 +218703,,20234,2,2013-10-13 +218720,,57399,2,2013-10-13 +218702,,57402,2,2013-10-13 +218701,,57408,2,2013-10-13 +218700,,57405,2,2013-10-13 +218699,,57407,2,2013-10-13 +218698,,57401,2,2013-10-13 +218697,,57407,2,2013-10-13 +218694,,57407,2,2013-10-13 
+218755,,57334,1,2013-10-13 +218693,,57405,2,2013-10-13 +218692,,57380,2,2013-10-13 +218689,,57228,2,2013-10-13 +218762,,57354,2,2013-10-13 +218687,,57395,2,2013-10-13 +218686,,57393,2,2013-10-13 +218685,,57397,2,2013-10-13 +218769,,57344,2,2013-10-13 +218683,,57184,2,2013-10-13 +218682,,57396,2,2013-10-13 +218681,,57396,2,2013-10-13 +218680,,57395,2,2013-10-13 +218677,,57350,2,2013-10-13 +218672,,57384,3,2013-10-13 +218671,,57400,2,2013-10-13 +218782,,57389,2,2013-10-13 +218667,,57351,2,2013-10-13 +218666,,57391,2,2013-10-13 +218663,,57195,2,2013-10-13 +218790,,57373,2,2013-10-13 +218662,,57396,2,2013-10-13 +218661,,57401,2,2013-10-13 +218659,,49906,2,2013-10-13 +218800,,56911,2,2013-10-13 +218658,,57398,2,2013-10-13 +218657,,57398,2,2013-10-13 +218656,,57185,2,2013-10-13 +218655,,57388,2,2013-10-13 +218654,,57351,2,2013-10-13 +218842,,57452,2,2013-10-14 +218839,,57441,2,2013-10-14 +218838,,57351,2,2013-10-14 +218834,,57446,2,2013-10-14 +218831,,57350,2,2013-10-14 +218830,,57443,2,2013-10-14 +218829,,57271,9,2013-10-14 +218828,,57454,1,2013-10-14 +218827,,57448,2,2013-10-14 +218823,,57433,2,2013-10-14 +218840,,57442,2,2013-10-14 +218819,,57452,2,2013-10-14 +218817,,57451,2,2013-10-14 diff --git a/examples/house_prices.py b/examples/house_prices.py index e721eaaf..5c99095c 100644 --- a/examples/house_prices.py +++ b/examples/house_prices.py @@ -16,6 +16,7 @@ # NOTE: you must download the dataset from Kaggle for this example to work from datetime import datetime +import os import pandas as pd import plexe @@ -28,19 +29,37 @@ # Note: for conciseness we leave the input schema empty and let plexe infer it model = ModelBuilder( provider=ProviderConfig( - default_provider="openai/gpt-4o", - orchestrator_provider="anthropic/claude-sonnet-4-20250514", - research_provider="openai/gpt-4o", - engineer_provider="anthropic/claude-3-7-sonnet-20250219", - ops_provider="anthropic/claude-3-7-sonnet-20250219", - tool_provider="openai/gpt-4o", + # default_provider="openai/gpt-4o", + # orchestrator_provider="anthropic/claude-sonnet-4-20250514", + # research_provider="openai/gpt-4o", + # engineer_provider="anthropic/claude-3-7-sonnet-20250219", + # ops_provider="anthropic/claude-3-7-sonnet-20250219", + # tool_provider="openai/gpt-4o", + default_provider="gemini/gemini-2.5-flash", + orchestrator_provider="gemini/gemini-2.5-flash", + research_provider="gemini/gemini-2.5-flash", + engineer_provider="gemini/gemini-2.5-flash", + ops_provider="gemini/gemini-2.5-flash", + tool_provider="gemini/gemini-2.5-flash", ), verbose=False, ) + # Step 2: Build the model using the training dataset # 2B: Build the model with the dataset # NOTE: In order to run this example, you will need to download the dataset from Kaggle +# Make MLflow callback optional. Set ENABLE_MLFLOW=1 in the environment to enable MLflow integration. 
+callbacks_list = [] +if os.environ.get("ENABLE_MLFLOW", "0") == "1": + callbacks_list = [ + MLFlowCallback( + tracking_uri="http://127.0.0.1:8080", + experiment_name=f"house-prices-{datetime.now().strftime('%Y%m%d-%H%M%S')}", + ) + ] + + m = model.build( datasets=[pd.read_csv("examples/datasets/house-prices-train.csv")], intent=( @@ -54,24 +73,44 @@ max_iterations=2, timeout=1800, # 30 minute timeout run_timeout=180, - callbacks=[ - MLFlowCallback( - tracking_uri="http://127.0.0.1:8080", - experiment_name=f"house-prices-{datetime.now().strftime('%Y%m%d-%H%M%S') }", - ) - ], + callbacks=callbacks_list, ) -# Step 3: Save the model +# Step 3: Save the model and handle test data transformation plexe.save_model(m, "house-prices.tar.gz") -# Step 4: Run a prediction on the built model +# Step 4: Load and prepare test data +print("\nPreparing test data for predictions...") test_df = pd.read_csv("examples/datasets/house-prices-test.csv").sample(10) -predictions = pd.DataFrame.from_records([m.predict(x) for x in test_df.to_dict(orient="records")]) -# Step 5: print a sample of predictions +# Get the predictor's transform method if available +if hasattr(m.predictor, "transform"): + print("Found transform method in predictor, applying transformations...") + try: + # Use the predictor's built-in transform method + transformed_test = m.predictor.transform(test_df) + predictions = pd.DataFrame.from_records([m.predict(x) for x in transformed_test.to_dict(orient="records")]) + except Exception as e: + print(f"Error during transformation: {str(e)}") + print("Falling back to raw predictions (warning: may be incorrect)") + predictions = pd.DataFrame.from_records([m.predict(x) for x in test_df.to_dict(orient="records")]) +else: + print("No transformation method found, using raw test data (warning: may cause incorrect predictions)") + predictions = pd.DataFrame.from_records([m.predict(x) for x in test_df.to_dict(orient="records")]) + +# Step 5: Print a sample of predictions +print("\nPredictions:") print(predictions) # Step 6: Print model description +print("\nModel description:") description = m.describe() print(description.as_text()) + +# Optional: Print available model artifacts and generated code +print("\nGenerated artifacts and code:") +print(f"Model artifacts: {m.artifacts}") +print(f"Training code available: {bool(m.trainer_source)}") +print(f"Feature transformer code available: {bool(m.feature_transformer_source)}") +print(f"Testing code available: {bool(m.testing_source)}") +print(f"Dataset splitter code available: {bool(m.dataset_splitter_source)}") diff --git a/examples/rel-f1-driver-dnf.txt b/examples/rel-f1-driver-dnf.txt new file mode 100644 index 00000000..0f345d50 --- /dev/null +++ b/examples/rel-f1-driver-dnf.txt @@ -0,0 +1,6 @@ +I am working with data loaded into the `f1` database (connection string: `postgresql+psycopg2://mlflow:mlflow@postgres:5432/f1`). +I want to define a predictive task on this relational database. +- Database Description: The F1 database tracks all-time Formula 1 racing data and statistics. It provides detailed information for various stakeholders including drivers, constructors, engine manufacturers, and tyre manufacturers. Highlights include data on all circuits (e.g. geographical details), and full historical data from every season. This includes overall standings, race results, and more specific data like practice sessions, qualifying positions, sprints, and pit stops.
+- Task definition: For each driver, predict if they will DNF (did not finish) a race in the next 1 month. +- Evaluation metric: Area Under the Receiver Operating Characteristic curve (AUROC). +I don't have any specific requirements; if you need any, you can set sensible defaults automatically. \ No newline at end of file diff --git a/examples/rel-f1-driver-position.txt b/examples/rel-f1-driver-position.txt new file mode 100644 index 00000000..458f5def --- /dev/null +++ b/examples/rel-f1-driver-position.txt @@ -0,0 +1,6 @@ +I am working with data loaded into the `f1` database (connection string: `postgresql+psycopg2://mlflow:mlflow@postgres:5432/f1`). +I want to define a predictive task on this relational database. +- Database Description: The F1 database tracks all-time Formula 1 racing data and statistics. It provides detailed information for various stakeholders including drivers, constructors, engine manufacturers, and tyre manufacturers. Highlights include data on all circuits (e.g. geographical details), and full historical data from every season. This includes overall standings, race results, and more specific data like practice sessions, qualifying positions, sprints, and pit stops. +- Task definition: Predict the average finishing position of each driver across all races in the next 1 month. +- Evaluation metric: Mean Absolute Error (MAE). +I don't have any specific requirements; if you need any, you can set sensible defaults automatically. \ No newline at end of file diff --git a/examples/rel-f1-driver-top3.txt b/examples/rel-f1-driver-top3.txt new file mode 100644 index 00000000..22cdaa0f --- /dev/null +++ b/examples/rel-f1-driver-top3.txt @@ -0,0 +1,6 @@ +I am working with data loaded into the `f1` database (connection string: `postgresql+psycopg2://mlflow:mlflow@postgres:5432/f1`). +I want to define a predictive task on this relational database. +- Database Description: The F1 database tracks all-time Formula 1 racing data and statistics. It provides detailed information for various stakeholders including drivers, constructors, engine manufacturers, and tyre manufacturers. Highlights include data on all circuits (e.g. geographical details), and full historical data from every season. This includes overall standings, race results, and more specific data like practice sessions, qualifying positions, sprints, and pit stops. +- Task definition: For each driver, predict if they will qualify in the top-3 for a race in the next 15 days. +- Evaluation metric: Area Under the Receiver Operating Characteristic curve (AUROC). +I don't have any specific requirements; if you need any, you can set sensible defaults automatically. \ No newline at end of file diff --git a/examples/rel-f1-result-position.txt b/examples/rel-f1-result-position.txt new file mode 100644 index 00000000..f5a3be48 --- /dev/null +++ b/examples/rel-f1-result-position.txt @@ -0,0 +1,6 @@ +I am working with data loaded into the `f1` database (connection string: `postgresql+psycopg2://mlflow:mlflow@postgres:5432/f1`). +I want to define a predictive task on this relational database. +- Database Description: The F1 database tracks all-time Formula 1 racing data and statistics. It provides detailed information for various stakeholders including drivers, constructors, engine manufacturers, and tyre manufacturers. Highlights include data on all circuits (e.g. geographical details), and full historical data from every season.
This includes overall standings, race results, and more specific data like practice sessions, qualifying positions, sprints, and pit stops. +- Task definition: For each race result, predict the finishing position. +- Evaluation metric: Mean Absolute Error (MAE). +I don't have any additional requirements; if you need any, you can choose sensible defaults automatically. \ No newline at end of file diff --git a/examples/rel-hm-item-sales.txt b/examples/rel-hm-item-sales.txt new file mode 100644 index 00000000..07a89d27 --- /dev/null +++ b/examples/rel-hm-item-sales.txt @@ -0,0 +1,6 @@ +I am working with data that is loaded into the `hm` database (connection string: `postgresql+psycopg2://mlflow:mlflow@postgres:5432/hm`). +I want to define a predictive task on this relational database. +- Database Description: The H&M relational database hosts extensive customer and product data for online shopping experiences across its wide network of brands and stores. This database includes detailed customer purchase histories and a rich set of metadata, encompassing everything from basic demographic information to extensive details about each product available. +- Task definition: Predict the total sales for an article (the sum of prices of the associated transactions) in the next week. +- Evaluation metric: Mean Absolute Error (MAE). +I don't have any additional requirements; if you need any, you can choose sensible defaults automatically. \ No newline at end of file diff --git a/examples/rel-hm-user-churn.txt b/examples/rel-hm-user-churn.txt new file mode 100644 index 00000000..73012ddf --- /dev/null +++ b/examples/rel-hm-user-churn.txt @@ -0,0 +1,6 @@ +I am working with data that is loaded into the `hm` database (connection string: `postgresql+psycopg2://mlflow:mlflow@postgres:5432/hm`). +I want to define a predictive task on this relational database. +- Database Description: The H&M relational database hosts extensive customer and product data for online shopping experiences across its wide network of brands and stores. This database includes detailed customer purchase histories and a rich set of metadata, encompassing everything from basic demographic information to extensive details about each product available. +- Task definition: Predict churn for a customer (i.e. no transactions) in the next week. +- Evaluation metric: Area Under the Receiver Operating Characteristic curve (AUROC). +I don't have any additional requirements; if you need any, you can choose sensible defaults automatically. \ No newline at end of file diff --git a/examples/rel-stack-engage.txt b/examples/rel-stack-engage.txt new file mode 100644 index 00000000..64842bc3 --- /dev/null +++ b/examples/rel-stack-engage.txt @@ -0,0 +1,12 @@ +I am working with data that is loaded into the `stack-2000` database (connection string: `postgresql+psycopg2://mlflow:mlflow@postgres:5432/stack-2000`). +I want to define a predictive task on this relational database. +Task definition: Predict if the user will make any contribution, defined as a vote, comment, or post, to the site in the next 2 days. +Entity filtering: We filter on active users, defined as users that have made at least one comment/post/vote before the timestamp. +Task significance: By accurately forecasting the levels of user contribution, website administrators can effectively gauge +and oversee user activity. This insight allows for well-informed choices across various business aspects. For instance, +it aids in preempting and mitigating user attrition, as well as in enhancing strategies to foster increased user interaction +and involvement.
This predictive task serves as a crucial tool in optimizing user experience and sustaining a dynamic and +engaged user base. +Machine learning task: Binary classification. The label is 1 when a user contributes to the site and 0 otherwise. +Evaluation metric: Average Precision (AP). +I don't have any additional requirements; if you need any, you can choose sensible defaults automatically. \ No newline at end of file diff --git a/examples/relational_ques.txt b/examples/relational_ques.txt new file mode 100644 index 00000000..d95d91f0 --- /dev/null +++ b/examples/relational_ques.txt @@ -0,0 +1,32 @@ +I am working with data that is loaded into the `stack_200` database (connection string: `postgresql+psycopg2://mlflow:mlflow@postgres:5432/stack_200`). + +I want to define a predictive task on this relational database using the Relational Deep Learning agents. + +**Task Definition:** +- **Target Entity:** `Posts` (specifically questions). +- **Prediction Goal:** Predict if a newly posted question will receive any Upvotes (`VoteTypeId = 2` in `Votes` table) within the next 1 month (horizon = 1 month). +- **Task Type:** Binary Classification (Label 1 if upvotes > 0, else 0). + +**Input & Graph Construction:** +- Leverage the full relational schema (Users, Posts, Votes, Comments, Badges, etc.) as a Heterogeneous Graph. +- **Node Features:** + - `Posts`: Use `Title` and `Body` (Text features), `CreationDate` (Timestamp). + - `Users`: Use `Reputation` (Numerical), `DisplayName` (Text). +- **Edges:** Automatically infer from Foreign Keys. + +**Temporal Splitting:** +- Use `CreationDate` in the `Posts` table to perform time-based splitting (Train/Val/Test) to prevent data leakage. + + +------------------------------------------ +Here are my specifications to proceed with the Relational Deep Learning pipeline: +Output Schema: Represent the prediction label as an Integer (0 or 1). +Context: This corresponds to the target column $y$ in the Training Table defined by the Temporal Task Supervisor. + +Input Schema & Graph Construction: +Do not manually define the schema. Instead, please activate the Relational Graph Architect Agent. +Ensure Time is treated as a first-class citizen by extracting timestamps from the Posts table (CreationDate) to assign $t_v$ to nodes. +Model Solutions: +I request 1 optimized Relational GNN solution (e.g., Heterogeneous Graph Transformer or R-GCN). +Please assign the Relational GNN Specialist to handle the training loop using Time-Consistent Neighbor Sampling to prevent temporal leakage. +Please proceed. \ No newline at end of file diff --git a/examples/test_saved_model.py b/examples/test_saved_model.py new file mode 100644 index 00000000..084a6236 --- /dev/null +++ b/examples/test_saved_model.py @@ -0,0 +1,115 @@ +""" +Script to test a saved Plexe model by loading it and running predictions.
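+ +Usage sketch (assumes `house-prices.tar.gz` produced by the house-prices example and +`examples/datasets/house-prices-test.csv` are available in the working directory): + +    python examples/test_saved_model.py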
+""" + +import pandas as pd +import numpy as np +from sklearn.preprocessing import LabelEncoder +import plexe + +# Load the saved model +print("Loading saved model from house-prices.tar.gz...") +model = plexe.load_model("house-prices.tar.gz") +print("\nModel loaded successfully!") + +# Print and save transformer code if available +if model.feature_transformer_source: + print("\nFeature transformer code found:") + print("-" * 80) + print(model.feature_transformer_source) + print("-" * 80) + + # Save transformer code for inspection + with open("feature_transformer_code.py", "w") as f: + f.write(model.feature_transformer_source) + print("\nSaved transformer code to 'feature_transformer_code.py'") + +# Load and prepare test data +print("\nLoading test data...") +test_df = pd.read_csv("examples/datasets/house-prices-test.csv").sample(10) +original_test = test_df.copy() # Keep original for comparison +print(f"Test data shape: {test_df.shape}") + +# Basic preprocessing for categorical columns +print("\nPreprocessing categorical columns...") +categorical_columns = test_df.select_dtypes(include=["object"]).columns +for col in categorical_columns: + # First try ordinal encoding + le = LabelEncoder() + try: + test_df[col] = le.fit_transform(test_df[col].astype(str)) + except Exception: + # If fails, try one-hot encoding + print(f"Using one-hot encoding for {col}") + dummies = pd.get_dummies(test_df[col], prefix=col) + test_df = pd.concat([test_df, dummies], axis=1) + test_df.drop(col, axis=1, inplace=True) + +# Fill missing values +print("Handling missing values...") +numeric_columns = test_df.select_dtypes(include=[np.number]).columns +test_df[numeric_columns] = test_df[numeric_columns].fillna(test_df[numeric_columns].mean()) + +# Try to transform test data if possible +print("\nPreparing test data for predictions...") +try: + if hasattr(model.predictor, "transform"): + print("Found transform method, applying model's transformations...") + transformed_test = model.predictor.transform(original_test) # Try with original data first + print("Data transformed successfully by model's transformer!") + predictions = pd.DataFrame.from_records([model.predict(x) for x in transformed_test.to_dict(orient="records")]) + else: + print("Using manually preprocessed data...") + predictions = pd.DataFrame.from_records([model.predict(x) for x in test_df.to_dict(orient="records")]) +except Exception as e: + print(f"\nError with model's transformer: {str(e)}") + print("Attempting prediction with manually preprocessed data...") + try: + predictions = pd.DataFrame.from_records([model.predict(x) for x in test_df.to_dict(orient="records")]) + except Exception as e: + print(f"Error during prediction: {str(e)}") + raise + +# Print results +print("\nPredictions:") +print(predictions) + +print("\nModel description:") +description = model.describe() +print(description.as_text()) + +# Print feature importance if available in model metadata +if hasattr(model, "metadata") and "feature_importance" in model.metadata: + print("\nFeature Importance:") + print(model.metadata["feature_importance"]) + +# Print available components +print("\nModel components available:") +print(f"Training code available: {bool(model.trainer_source)}") +print(f"Feature transformer code available: {bool(model.feature_transformer_source)}") +print(f"Testing code available: {bool(model.testing_source)}") +print(f"Dataset splitter code available: {bool(model.dataset_splitter_source)}") + +# If we have the original test data with SalePrice, show comparison +if 
"SalePrice" in original_test.columns: + print("\nSample comparison (actual vs predicted):") + comparison = pd.DataFrame( + { + "Actual": original_test["SalePrice"], + "Predicted": predictions["SalePrice"] if "SalePrice" in predictions.columns else predictions.iloc[:, 0], + } + ) + print(comparison) + + # Calculate error metrics + errors = comparison["Actual"] - comparison["Predicted"] + metrics = { + "Mean Absolute Error": abs(errors).mean(), + "Mean Squared Error": (errors**2).mean(), + "Root Mean Squared Error": (errors**2).mean() ** 0.5, + "Mean Absolute Percentage Error": (abs(errors) / comparison["Actual"]).mean() * 100, + } + + print("\nError Metrics:") + for metric_name, value in metrics.items(): + print(f"{metric_name}: {value:.2f}") diff --git a/litellm-settings.yaml b/litellm-settings.yaml new file mode 100644 index 00000000..e1e8c149 --- /dev/null +++ b/litellm-settings.yaml @@ -0,0 +1,15 @@ +# litellm settings +# https://docs.litellm.ai/docs/simple_proxy#litellm-settings-config-file + +# litellm-settings.yaml +litellm_settings: + set_verbose: True + cache: False + drop_params: True + num_retries: 2 +model_list: + - model_name: gpt-oss-20b + litellm_params: + model: openai/gpt-oss-20b + api_base: https://qllm.iselab.info/v1 + api_key: none # does os.getenv("AZURE_API_KEY_EU") \ No newline at end of file diff --git a/mcp_config.json b/mcp_config.json new file mode 100644 index 00000000..99de27ee --- /dev/null +++ b/mcp_config.json @@ -0,0 +1,41 @@ +{ + "mcpServers": { + "hpo-search": { + "command": "python", + "args": [ + "plexe/langgraph/mcp_servers/hpo_server.py" + ] + }, + "google-scholar": { + "command": "python", + "args": [ + "plexe/langgraph/mcp_servers/scholar_server.py" + ] + }, + "kaggle": { + "command": "python", + "args": [ + "plexe/langgraph/mcp_servers/kaggle_server.py" + ], + "env": { + "KAGGLE_USERNAME": "${KAGGLE_USERNAME}", + "KAGGLE_KEY": "${KAGGLE_KEY}" + } + }, + "semantic-scholar": { + "command": "python", + "args": [ + "plexe/langgraph/mcp_servers/semantic_scholar_server.py" + ], + "env": { + "SEMANTIC_SCHOLAR_API_KEY": "${SEMANTIC_SCHOLAR_API_KEY}" + } + }, + "arxiv": { + "command": "python", + "args": [ + "plexe/langgraph/mcp_servers/arxiv_server.py" + ] + } + } +} \ No newline at end of file diff --git a/mlflow.db b/mlflow.db new file mode 100644 index 00000000..a179de85 Binary files /dev/null and b/mlflow.db differ diff --git a/paper/AutoML_Agent.pdf b/paper/AutoML_Agent.pdf new file mode 100644 index 00000000..cab7769c Binary files /dev/null and b/paper/AutoML_Agent.pdf differ diff --git a/paper/Graph.pdf b/paper/Graph.pdf new file mode 100644 index 00000000..51ac2192 Binary files /dev/null and b/paper/Graph.pdf differ diff --git a/paper/Relbench.pdf b/paper/Relbench.pdf new file mode 100644 index 00000000..38b9f6f6 Binary files /dev/null and b/paper/Relbench.pdf differ diff --git a/plexe/__init__.py b/plexe/__init__.py index 8817bfdb..3451de05 100644 --- a/plexe/__init__.py +++ b/plexe/__init__.py @@ -1,14 +1,11 @@ -from .models import Model as Model -from .model_builder import ModelBuilder as ModelBuilder -from .datasets import DatasetGenerator as DatasetGenerator -from .fileio import ( - load_model as load_model, - save_model as save_model, - save_checkpoint as save_checkpoint, - load_checkpoint as load_checkpoint, - list_checkpoints as list_checkpoints, - clear_checkpoints as clear_checkpoints, +from .langgraph import ( + PlexeOrchestrator, + AgentConfig, + PipelineState, + ConversationalAgent, + EDAAgent, + DatasetBuilderAgent, + TaskBuilderAgent, 
+ RelationalGNNSpecialistAgent, + OperationAgent, ) -from .callbacks import Callback as Callback -from .callbacks import MLFlowCallback as MLFlowCallback -from .callbacks import ModelCheckpointCallback as ModelCheckpointCallback diff --git a/plexe/agents/__init__.py b/plexe/agents/__init__.py deleted file mode 100644 index d3079610..00000000 --- a/plexe/agents/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -""" -Agents for the Plexe ML platform. - -This package contains agent implementations for various tasks in the Plexe platform. -""" diff --git a/plexe/agents/agents.py b/plexe/agents/agents.py deleted file mode 100644 index d414fc12..00000000 --- a/plexe/agents/agents.py +++ /dev/null @@ -1,307 +0,0 @@ -""" -This module defines a multi-agent ML engineering system for building machine learning models. -""" - -import json -import logging -import types -from dataclasses import dataclass, field -from typing import List, Dict, Optional, Callable - -from smolagents import CodeAgent, LiteLLMModel, AgentText - -from plexe.agents.dataset_analyser import EdaAgent -from plexe.agents.dataset_splitter import DatasetSplitterAgent -from plexe.agents.feature_engineer import FeatureEngineeringAgent -from plexe.agents.model_packager import ModelPackagerAgent -from plexe.agents.model_planner import ModelPlannerAgent -from plexe.agents.model_tester import ModelTesterAgent -from plexe.agents.model_trainer import ModelTrainerAgent -from plexe.agents.schema_resolver import SchemaResolverAgent -from plexe.config import config -from plexe.core.object_registry import ObjectRegistry -from plexe.internal.models.entities.artifact import Artifact -from plexe.internal.models.entities.code import Code -from plexe.internal.models.entities.metric import Metric -from plexe.internal.models.entities.metric import MetricComparator, ComparisonMethod -from plexe.core.entities.solution import Solution -from plexe.core.interfaces.predictor import Predictor -from plexe.tools.datasets import get_latest_datasets -from plexe.tools.evaluation import get_review_finalised_model, get_solution_performances -from plexe.tools.metrics import get_select_target_metric -from plexe.tools.response_formatting import ( - format_final_orchestrator_agent_response, -) -from plexe.tools.training import register_best_solution - -logger = logging.getLogger(__name__) - - -@dataclass -class ModelGenerationResult: - training_source_code: str - inference_source_code: str - feature_transformer_source_code: str - dataset_split_code: str - predictor: Predictor - model_artifacts: List[Artifact] - performance: Metric # Validation performance - test_performance: Metric = None # Test set performance - testing_source_code: str = None # Testing code from model tester agent - evaluation_report: Dict = None # Evaluation report from model tester agent - metadata: Dict[str, str] = field(default_factory=dict) # Model metadata - - -class PlexeAgent: - """ - Multi-agent ML engineering system for building machine learning models. - - This class creates and manages a system of specialized agents that work together - to analyze data, plan solutions, train models, and generate inference code. 
- """ - - def __init__( - self, - orchestrator_model_id: str = "anthropic/claude-3-7-sonnet-20250219", - ml_researcher_model_id: str = "openai/gpt-4o", - ml_engineer_model_id: str = "anthropic/claude-3-7-sonnet-20250219", - ml_ops_engineer_model_id: str = "anthropic/claude-3-7-sonnet-20250219", - tool_model_id: str = "openai/gpt-4o", - verbose: bool = False, - max_steps: int = 50, - distributed: bool = False, - chain_of_thought_callable: Optional[Callable] = None, - max_solutions: int = 1, - ): - """ - Initialize the multi-agent ML engineering system. - - Args: - orchestrator_model_id: Model ID for the orchestrator agent - ml_researcher_model_id: Model ID for the ML researcher agent - ml_engineer_model_id: Model ID for the ML engineer agent - ml_ops_engineer_model_id: Model ID for the ML ops engineer agent - tool_model_id: Model ID for the model used inside tool calls - verbose: Whether to display detailed agent logs - max_steps: Maximum number of steps for the orchestrator agent - distributed: Whether to run the agents in a distributed environment - chain_of_thought_callable: Optional callable for chain of thought logging - """ - self.orchestrator_model_id = orchestrator_model_id - self.ml_researcher_model_id = ml_researcher_model_id - self.ml_engineer_model_id = ml_engineer_model_id - self.ml_ops_engineer_model_id = ml_ops_engineer_model_id - self.tool_model_id = tool_model_id - self.verbose = verbose - self.max_steps = max_steps - self.distributed = distributed - self.chain_of_thought_callable = chain_of_thought_callable - - # Set verbosity levels - self.orchestrator_verbosity = 1 if verbose else 0 - self.specialist_verbosity = 1 if verbose else 0 - - # Create solution planner agent - plans ML approaches - self.ml_research_agent = ModelPlannerAgent( - model_id=self.ml_researcher_model_id, - verbose=verbose, - chain_of_thought_callable=chain_of_thought_callable, - max_solutions=max_solutions, - ).agent - - # Create and run the schema resolver agent - self.schema_resolver_agent = SchemaResolverAgent( - model_id=self.orchestrator_model_id, - verbose=verbose, - chain_of_thought_callable=chain_of_thought_callable, - ).agent - - # Create the EDA agent to analyze the dataset - self.eda_agent = EdaAgent( - model_id=self.orchestrator_model_id, - verbose=verbose, - chain_of_thought_callable=chain_of_thought_callable, - ).agent - - # Create feature engineering agent - transforms raw datasets for better model performance - self.feature_engineering_agent = FeatureEngineeringAgent( - model_id=self.ml_engineer_model_id, - verbose=verbose, - chain_of_thought_callable=self.chain_of_thought_callable, - ).agent - - # Create dataset splitter agent - intelligently splits datasets - self.dataset_splitter_agent = DatasetSplitterAgent( - model_id=self.orchestrator_model_id, - verbose=verbose, - chain_of_thought_callable=self.chain_of_thought_callable, - ).agent - - # Create model trainer agent - implements training code - self.mle_agent = ModelTrainerAgent( - ml_engineer_model_id=self.ml_engineer_model_id, - tool_model_id=self.tool_model_id, - distributed=self.distributed, - verbose=verbose, - chain_of_thought_callable=self.chain_of_thought_callable, - schema_resolver_agent=self.schema_resolver_agent, - ).agent - - # Create predictor builder agent - creates inference code - self.mlops_engineer = ModelPackagerAgent( - model_id=self.ml_ops_engineer_model_id, - tool_model_id=self.tool_model_id, - verbose=verbose, - chain_of_thought_callable=self.chain_of_thought_callable, - 
schema_resolver_agent=self.schema_resolver_agent, - ).agent - - # Create model tester agent - tests and evaluates the finalized model - self.model_tester_agent = ModelTesterAgent( - model_id=self.ml_engineer_model_id, - verbose=verbose, - chain_of_thought_callable=self.chain_of_thought_callable, - ).agent - - # Create orchestrator agent - coordinates the workflow - self.manager_agent = CodeAgent( - name="Orchestrator", - model=LiteLLMModel(model_id=self.orchestrator_model_id), - tools=[ - get_select_target_metric(self.tool_model_id), - get_review_finalised_model(self.tool_model_id), - get_latest_datasets, - get_solution_performances, - register_best_solution, - format_final_orchestrator_agent_response, - ], - managed_agents=[ - self.eda_agent, - self.schema_resolver_agent, - self.feature_engineering_agent, - self.ml_research_agent, - self.dataset_splitter_agent, - self.mle_agent, - self.mlops_engineer, - self.model_tester_agent, - ], - add_base_tools=False, - verbosity_level=self.orchestrator_verbosity, - additional_authorized_imports=config.code_generation.authorized_agent_imports, - max_steps=self.max_steps, - planning_interval=7, - step_callbacks=[self.chain_of_thought_callable], - ) - - def run(self, task, additional_args: dict) -> ModelGenerationResult: - """ - Run the orchestrator agent to generate a machine learning model. - - Returns: - ModelGenerationResult: The result of the model generation process. - """ - object_registry = ObjectRegistry() - result = self.manager_agent.run(task=task, additional_args=additional_args) - - print(f"Registry contents:\n\n" f"{json.dumps(sorted(object_registry.list()), indent=4)}" f"\n\n") - - try: - # Only log the full result when in verbose mode - if logger.isEnabledFor(logging.DEBUG): - logger.debug("Agent result: %s", result) - - if isinstance(result, AgentText): - result = json.loads(str(result)) - - # Extract data from the agent result - best_solution = object_registry.get(Solution, "best_performing_solution") - training_code = best_solution.training_code - inference_code = best_solution.inference_code - - # Extract performance metrics - if "performance" in result and isinstance(result["performance"], dict): - metrics = result["performance"] - else: - metrics = {} - - metric_name = metrics.get("name", "unknown") - metric_value = metrics.get("value", 0.0) - comparison_str = metrics.get("comparison_method", "") - comparison_method_map = { - "HIGHER_IS_BETTER": ComparisonMethod.HIGHER_IS_BETTER, - "LOWER_IS_BETTER": ComparisonMethod.LOWER_IS_BETTER, - "TARGET_IS_BETTER": ComparisonMethod.TARGET_IS_BETTER, - } - comparison_method = ComparisonMethod.HIGHER_IS_BETTER # Default to higher is better - for key, method in comparison_method_map.items(): - if key in comparison_str: - comparison_method = method - - comparator = MetricComparator(comparison_method) - performance = Metric( - name=metric_name, - value=metric_value, - comparator=comparator, - ) - - # Model metadata - metadata = result.get("metadata", {"model_type": "unknown", "framework": "unknown"}) - - # Compile the inference code into a module - inference_module: types.ModuleType = types.ModuleType("predictor") - exec(inference_code, inference_module.__dict__) - # Instantiate the predictor class from the loaded module - predictor_class = getattr(inference_module, "PredictorImplementation") - predictor = predictor_class(best_solution.model_artifacts) - - # Get feature transformer code if available - feature_transformer_code = None - try: - feature_code = object_registry.get(Code, 
"feature_transformations") - if feature_code: - feature_transformer_code = feature_code.code - except KeyError: - # No feature transformations code found, that's ok - pass - - # Get dataset split code if available - dataset_split_code = None - try: - dataset_split_code = object_registry.get(Code, "dataset_splitting_code") - if dataset_split_code: - dataset_split_code = dataset_split_code.code - except KeyError: - # No dataset split code found, that's ok - pass - - # Get testing code if available - testing_code = None - try: - testing_code = best_solution.testing_code - except Exception: - # No testing code found, that's ok - pass - - # Get evaluation report if available - evaluation_report = None - try: - evaluation_report = best_solution.model_evaluation_report - except Exception: - # No evaluation report found, that's ok - pass - - return ModelGenerationResult( - training_source_code=training_code, - inference_source_code=inference_code, - feature_transformer_source_code=feature_transformer_code, - dataset_split_code=dataset_split_code, - predictor=predictor, - model_artifacts=best_solution.model_artifacts, - performance=performance, - test_performance=performance, # Using the same performance for now - testing_source_code=testing_code, - evaluation_report=evaluation_report, - metadata=metadata, - ) - except Exception as e: - raise RuntimeError(f"❌ Failed to process agent result: {str(e)}") from e diff --git a/plexe/agents/conversational.py b/plexe/agents/conversational.py deleted file mode 100644 index 683cf432..00000000 --- a/plexe/agents/conversational.py +++ /dev/null @@ -1,69 +0,0 @@ -""" -Conversational Agent for guiding users through ML model definition and initiation. - -This module defines a ConversationalAgent that helps users define their ML requirements -through natural conversation, validates their inputs, and initiates model building -when all necessary information has been gathered. -""" - -import logging - -from smolagents import ToolCallingAgent, LiteLLMModel - -from plexe.internal.common.utils.agents import get_prompt_templates -from plexe.tools.datasets import get_dataset_preview -from plexe.tools.conversation import validate_dataset_files, initiate_model_build - -logger = logging.getLogger(__name__) - - -class ConversationalAgent: - """ - Agent for conversational model definition and build initiation. - - This agent guides users through defining their ML requirements via natural - conversation, helps clarify the problem, validates dataset availability, - and initiates the model building process when all requirements are met. - """ - - def __init__( - self, - model_id: str = "anthropic/claude-sonnet-4-20250514", - verbose: bool = False, - ): - """ - Initialize the conversational agent. - - Args: - model_id: Model ID for the LLM to use for conversation - verbose: Whether to display detailed agent logs - """ - self.model_id = model_id - self.verbose = verbose - - # Set verbosity level - self.verbosity = 1 if verbose else 0 - - # Create the conversational agent with necessary tools - self.agent = ToolCallingAgent( - name="ModelDefinitionAssistant", - description=( - "Expert ML consultant that helps users define their machine learning requirements " - "through conversational guidance. Specializes in clarifying problem definitions, " - "understanding data requirements, and initiating model builds when ready. " - "Maintains a friendly, helpful conversation while ensuring all technical " - "requirements are properly defined before proceeding with model creation." 
- ), - model=LiteLLMModel(model_id=self.model_id), - tools=[ - get_dataset_preview, - validate_dataset_files, - initiate_model_build, - ], - add_base_tools=False, - verbosity_level=self.verbosity, - prompt_templates=get_prompt_templates( - base_template_name="toolcalling_agent.yaml", - override_template_name="conversational_prompt_templates.yaml", - ), - ) diff --git a/plexe/agents/dataset_analyser.py b/plexe/agents/dataset_analyser.py deleted file mode 100644 index c6291ba1..00000000 --- a/plexe/agents/dataset_analyser.py +++ /dev/null @@ -1,115 +0,0 @@ -""" -Exploratory Data Analysis (EDA) Agent for data analysis and insights in ML models. - -This module defines an EdaAgent that analyzes datasets to generate comprehensive -exploratory data analysis reports before model building begins. -""" - -import logging -from typing import List, Callable - -from smolagents import LiteLLMModel, CodeAgent - -from plexe.config import config, prompt_templates -from plexe.internal.common.utils.agents import get_prompt_templates -from plexe.tools.datasets import register_eda_report, drop_null_columns, get_latest_datasets -from plexe.tools.schemas import get_dataset_schema - -logger = logging.getLogger(__name__) - - -class EdaAgent: - """ - Agent for performing exploratory data analysis on datasets. - - This agent analyzes the available datasets to produce a comprehensive EDA report - containing data overview, feature analysis, relationships, data quality issues, - key insights, and recommendations for modeling. - """ - - def __init__( - self, - model_id: str = "openai/gpt-4o", - verbose: bool = False, - chain_of_thought_callable: Callable = None, - ): - """ - Initialize the EDA agent. - - Args: - model_id: Model ID for the LLM to use for data analysis - verbose: Whether to display detailed agent logs - chain_of_thought_callable: Optional callable for chain of thought logging - """ - self.model_id = model_id - self.verbose = verbose - - # Set verbosity level - self.verbosity = 1 if verbose else 0 - - # Create the EDA agent with the necessary tools - self.agent = CodeAgent( - name="DatasetAnalyser", - description=( - "Expert data analyst that performs exploratory data analysis on datasets to generate insights " - "and recommendations for ML modeling. Will analyse existing datasets, not create new ones.\n" - "To work effectively, as part of the 'task' prompt the agent STRICTLY requires:\n" - "- the ML task definition (i.e. 'intent')\n" - "- the name of the dataset to be analysed" - ), - model=LiteLLMModel(model_id=self.model_id), - tools=[drop_null_columns, register_eda_report, get_dataset_schema, get_latest_datasets], - add_base_tools=False, - verbosity_level=self.verbosity, - # planning_interval=3, - max_steps=30, - step_callbacks=[chain_of_thought_callable], - additional_authorized_imports=[ - "pandas", - "pandas.*", - "numpy", - "numpy.*", - "plexe", - "plexe.*", - "scipy", - "scipy.*", - "sklearn", - "sklearn.*", - "statsmodels", - "statsmodels.*", - ] - + config.code_generation.authorized_agent_imports, - prompt_templates=get_prompt_templates("code_agent.yaml", "eda_prompt_templates.yaml"), - ) - - def run( - self, - intent: str, - dataset_names: List[str], - ) -> bool: - """ - Run the EDA agent to analyze datasets and create EDA reports. 
- - Args: - intent: Natural language description of the model's purpose - dataset_names: List of dataset registry names available for analysis - - Returns: - Dictionary containing: - - eda_report_names: List of registered EDA report names in the Object Registry - - dataset_names: List of datasets that were analyzed - - summary: Brief summary of key findings - """ - # Use the template system to create the prompt - datasets_str = ", ".join(dataset_names) - - # Generate the prompt using the template system - task_description = prompt_templates.eda_agent_prompt( - intent=intent, - datasets=datasets_str, - ) - - # Run the agent to get analysis - self.agent.run(task_description) - - return True diff --git a/plexe/agents/dataset_splitter.py b/plexe/agents/dataset_splitter.py deleted file mode 100644 index d469b772..00000000 --- a/plexe/agents/dataset_splitter.py +++ /dev/null @@ -1,89 +0,0 @@ -""" -Dataset Splitter Agent for partitioning datasets into training, validation, and test sets. - -This module defines a DatasetSplitterAgent that handles the critical task of properly -splitting datasets for machine learning model development, supporting both random and -time-series-aware splitting strategies. -""" - -import logging -from typing import Optional, Callable - -from smolagents import CodeAgent, LiteLLMModel - -from plexe.config import config -from plexe.internal.common.utils.agents import get_prompt_templates -from plexe.tools.datasets import get_dataset_preview, register_split_datasets, get_latest_datasets, get_dataset_reports - -logger = logging.getLogger(__name__) - - -class DatasetSplitterAgent: - """ - Agent for intelligently splitting datasets into train, validation, and test sets. - - This agent analyzes datasets and performs appropriate splits based on data characteristics - and the specific ML task at hand, handling both standard random splits and specialized - cases like time-series, imbalanced classification, and small datasets. - """ - - def __init__( - self, - model_id: str, - verbose: bool = False, - chain_of_thought_callable: Optional[Callable] = None, - ): - """ - Initialize the dataset splitter agent. - - Args: - model_id: Model ID for the LLM to use for split decision-making - verbose: Whether to display detailed agent logs - chain_of_thought_callable: Optional callable for chain of thought logging - """ - # Set verbosity level - self.verbosity = 1 if verbose else 0 - - # Create the dataset splitter agent with the necessary tools - self.agent = CodeAgent( - name="DatasetSplitter", - description=( - "Expert data engineer that intelligently splits datasets for machine learning tasks. " - "To work effectively, as part of the 'task' prompt the agent STRICTLY requires:" - "- the ML task definition (i.e. 
'intent')" - "- the registered NAME of the dataset to split" - "- the split ratios (train_ratio, val_ratio, test_ratio)" - "- any helpful information or specific requirements for the split" - ), - model=LiteLLMModel(model_id=model_id), - tools=[ - get_dataset_preview, - register_split_datasets, - get_latest_datasets, - get_dataset_reports, - ], - planning_interval=5, - add_base_tools=False, - verbosity_level=self.verbosity, - additional_authorized_imports=[ - "pandas", - "pandas.*", - "numpy", - "numpy.*", - "plexe", - "plexe.*", - "sklearn", - "sklearn.*", - "sklearn.model_selection", - "sklearn.model_selection.*", - "scipy", - "scipy.*", - "datetime", - "datetime.*", - ] - + config.code_generation.authorized_agent_imports, - prompt_templates=get_prompt_templates( - base_template_name="code_agent.yaml", override_template_name="dataset_splitter_templates.yaml" - ), - step_callbacks=[chain_of_thought_callable], - ) diff --git a/plexe/agents/feature_engineer.py b/plexe/agents/feature_engineer.py deleted file mode 100644 index 09d5db08..00000000 --- a/plexe/agents/feature_engineer.py +++ /dev/null @@ -1,100 +0,0 @@ -""" -Feature Engineering Agent for transforming raw datasets into optimized features for ML models. - -This agent analyzes datasets based on EDA reports, generates feature transformation code, -validates it, and executes it to create enhanced datasets for model training. -""" - -import logging -from typing import Optional, Callable - -from smolagents import CodeAgent, LiteLLMModel - -from plexe.config import config -from plexe.internal.common.utils.agents import get_prompt_templates -from plexe.tools.datasets import ( - get_dataset_preview, - get_dataset_reports, - get_latest_datasets, - register_feature_engineering_report, -) -from plexe.tools.execution import apply_feature_transformer -from plexe.tools.validation import validate_feature_transformations -from plexe.tools.schemas import get_global_schemas - -logger = logging.getLogger(__name__) - - -class FeatureEngineeringAgent: - """ - Agent for creating optimized features from raw datasets for ML models. - - This agent analyzes datasets, generates transformation code based on EDA insights, - and applies these transformations to create enhanced datasets for model training. - The agent ensures data integrity through validation and provides transformed - datasets through the object registry. - """ - - def __init__( - self, - model_id: str, - verbose: bool = False, - chain_of_thought_callable: Optional[Callable] = None, - ): - """ - Initialize the feature engineering agent. - - Args: - model_id: Model ID for the LLM to use for feature engineering - verbose: Whether to display detailed agent logs - chain_of_thought_callable: Optional callback for chain-of-thought logging - """ - # Set verbosity level - self.verbosity = 1 if verbose else 0 - - # Create feature engineering agent - self.agent = CodeAgent( - name="FeatureEngineer", - description=( - "Expert data scientist that transforms raw datasets into optimized features for ML models. " - "To work effectively, as part of the 'task' prompt the agent STRICTLY requires:" - "- the ML task definition (i.e. 
'intent')" - "- the name of the dataset to transform" - ), - model=LiteLLMModel(model_id=model_id), - tools=[ - get_dataset_preview, - validate_feature_transformations, - apply_feature_transformer, - get_latest_datasets, - get_dataset_reports, - get_global_schemas, - register_feature_engineering_report, - ], - add_base_tools=False, - additional_authorized_imports=config.code_generation.authorized_agent_imports - + [ - "plexe", - "plexe.*", - "pandas", - "pandas.*", - "numpy", - "numpy.*", - "scikit-learn", - "scikit-learn.*", - "scikit_learn", - "scikit_learn.*", - "sklearn", - "sklearn.*", - "scipy", - "scipy.*", - "statsmodels", - "statsmodels.*", - ], - verbosity_level=self.verbosity, - prompt_templates=get_prompt_templates( - base_template_name="code_agent.yaml", override_template_name="feature_engineer_prompt_templates.yaml" - ), - planning_interval=5, - step_callbacks=[chain_of_thought_callable], - ) diff --git a/plexe/agents/model_packager.py b/plexe/agents/model_packager.py deleted file mode 100644 index 5921b6e5..00000000 --- a/plexe/agents/model_packager.py +++ /dev/null @@ -1,74 +0,0 @@ -""" -Model Packager Agent for creating production-ready inference code for ML models. - -This agent analyzes training code and generates high-quality inference code that can be -used in production environments, ensuring proper encapsulation of model functionality. -""" - -import logging -from typing import Optional, Callable - -from smolagents import CodeAgent, LiteLLMModel - -from plexe.config import config -from plexe.internal.common.utils.agents import get_prompt_templates -from plexe.tools.context import get_inference_context_tool -from plexe.tools.validation import validate_inference_code -from plexe.tools.solutions import list_solutions - -logger = logging.getLogger(__name__) - - -class ModelPackagerAgent: - """ - Agent for creating production-ready inference code for ML models. - - This agent analyzes the training code produced by the ModelTrainerAgent and creates - high-quality inference code that properly encapsulates the trained model for deployment - in production environments. - """ - - def __init__( - self, - model_id: str, - tool_model_id: str, - verbose: bool = False, - chain_of_thought_callable: Optional[Callable] = None, - schema_resolver_agent=None, - ): - """ - Initialize the model packager agent. - - Args: - model_id: Model ID for the LLM to use for inference code generation - tool_model_id: Model ID for the LLM to use for tool operations - verbose: Whether to display detailed agent logs - chain_of_thought_callable: Optional callback for chain-of-thought logging - """ - # Set verbosity level - self.verbosity = 1 if verbose else 0 - - # Create predictor builder agent - creates inference code - self.agent = CodeAgent( - name="MLOperationsEngineer", - description=( - "Expert ML operations engineer that analyzes training code and creates high-quality production-ready " - "inference code for ML models. This agent STRICTLY requires the training code of the best solution to have " - "been registered in the object registry." 
- ), - model=LiteLLMModel(model_id=model_id), - tools=[ - get_inference_context_tool(tool_model_id), - validate_inference_code, - list_solutions, - ], - managed_agents=[schema_resolver_agent] if schema_resolver_agent else [], - add_base_tools=False, - verbosity_level=self.verbosity, - additional_authorized_imports=config.code_generation.authorized_agent_imports + ["plexe", "plexe.*"], - prompt_templates=get_prompt_templates( - base_template_name="code_agent.yaml", override_template_name="mlops_prompt_templates.yaml" - ), - planning_interval=8, - step_callbacks=[chain_of_thought_callable], - ) diff --git a/plexe/agents/model_planner.py b/plexe/agents/model_planner.py deleted file mode 100644 index dce8b155..00000000 --- a/plexe/agents/model_planner.py +++ /dev/null @@ -1,69 +0,0 @@ -import logging - -from smolagents import ToolCallingAgent, LiteLLMModel - -from plexe.internal.common.utils.agents import get_prompt_templates -from plexe.tools.datasets import get_dataset_preview, get_latest_datasets, get_dataset_reports -from plexe.tools.schemas import get_global_schemas -from plexe.tools.solutions import get_solution_creation_tool - -logger = logging.getLogger(__name__) - - -class ModelPlannerAgent: - """ - Agent responsible for planning ML model solutions based on provided requirements. - - This agent acts as an ML research scientist that develops detailed solution ideas - and plans for ML use cases. It analyzes the dataset and requirements to propose - appropriate modeling approaches. - - Attributes: - verbosity (int): The verbosity level for agent output (0 for quiet, 1 for verbose) - agent (ToolCallingAgent): The underlying tool-calling agent implementation - """ - - def __init__( - self, - model_id: str, - verbose: bool = False, - chain_of_thought_callable: callable = None, - max_solutions: int = 1, - ): - """ - Initialize the ModelPlannerAgent. - - Args: - model_id (str): The identifier for the language model to use - verbose (bool): Whether to enable verbose output from the agent - chain_of_thought_callable (callable, optional): Callback function for - intercepting and processing chain-of-thought outputs - """ - # Set verbosity level - self.verbosity = 1 if verbose else 0 - - # Create solution planner agent - plans ML approaches - self.agent = ToolCallingAgent( - name="MLResearchScientist", - description=( - "Expert ML researcher that develops detailed solution ideas and plans for ML use cases. " - "To work effectively, as part of the 'task' prompt the agent STRICTLY requires:" - "- the ML task definition (i.e. 'intent')" - "- input schema for the model" - "- output schema for the model" - "- the name and comparison method of the metric to optimise" - "- the name of the dataset to use for training" - ), - model=LiteLLMModel(model_id=model_id), - tools=[ - get_dataset_preview, - get_latest_datasets, - get_dataset_reports, - get_global_schemas, - get_solution_creation_tool(max_solutions), - ], - add_base_tools=False, - verbosity_level=self.verbosity, - prompt_templates=get_prompt_templates("toolcalling_agent.yaml", "mls_prompt_templates.yaml"), - step_callbacks=[chain_of_thought_callable], - ) diff --git a/plexe/agents/model_tester.py b/plexe/agents/model_tester.py deleted file mode 100644 index 3abefd61..00000000 --- a/plexe/agents/model_tester.py +++ /dev/null @@ -1,93 +0,0 @@ -""" -Model Tester Agent for comprehensive testing and evaluation of finalized ML models. 
- -This module defines a ModelTesterAgent that evaluates model performance on test data, -performs quality analysis, and generates comprehensive evaluation reports. -""" - -import logging -from typing import Optional, Callable - -from smolagents import CodeAgent, LiteLLMModel - -from plexe.config import config -from plexe.internal.common.utils.agents import get_prompt_templates -from plexe.tools.testing import register_testing_code, register_evaluation_report -from plexe.tools.datasets import get_test_dataset -from plexe.tools.schemas import get_solution_schemas -from plexe.tools.code_analysis import get_feature_transformer_code -from plexe.tools.solutions import list_solutions - -logger = logging.getLogger(__name__) - - -class ModelTesterAgent: - """ - Agent for comprehensive testing and evaluation of finalized ML models. - - This agent retrieves the predictor and test datasets directly from the object - registry, performs thorough evaluation through direct code execution, and - produces detailed performance reports. - """ - - def __init__( - self, - model_id: str, - verbose: bool = False, - chain_of_thought_callable: Optional[Callable] = None, - ): - """ - Initialize the model tester agent. - - Args: - model_id: Model ID for the LLM to use for model testing - verbose: Whether to display detailed agent logs - chain_of_thought_callable: Optional callable for chain of thought logging - """ - self.model_id = model_id - self.verbose = verbose - - # Set verbosity level - self.verbosity = 1 if verbose else 0 - - # Create the model tester agent with the necessary tools - self.agent = CodeAgent( - name="ModelTester", - description=( - "Expert ML model testing specialist that evaluates finalized models for performance, " - "quality, and production readiness. To work effectively, as part of the 'task' prompt " - "the agent STRICTLY requires:\n" - "- the test dataset name for evaluation\n" - "- task definition and target metric information\n" - "- the expected input schema and output schema of the model\n" - "The predictor must already have been created by the ml operations engineer.\n" - ), - model=LiteLLMModel(model_id=self.model_id), - tools=[ - register_testing_code, - register_evaluation_report, - get_test_dataset, - get_solution_schemas, - get_feature_transformer_code, - list_solutions, - ], - add_base_tools=False, - verbosity_level=self.verbosity, - additional_authorized_imports=config.code_generation.authorized_agent_imports - + [ - "plexe", - "plexe.*", - "pandas", - "pandas.*", - "numpy", - "numpy.*", - "sklearn", - "sklearn.*", - "scipy", - "scipy.*", - ], - prompt_templates=get_prompt_templates( - base_template_name="code_agent.yaml", override_template_name="model_tester_prompt_templates.yaml" - ), - step_callbacks=[chain_of_thought_callable], - ) diff --git a/plexe/agents/model_trainer.py b/plexe/agents/model_trainer.py deleted file mode 100644 index fffcce96..00000000 --- a/plexe/agents/model_trainer.py +++ /dev/null @@ -1,83 +0,0 @@ -""" -Model Trainer Agent for training ML models based on provided plans. - -This agent implements the training code, validates it, and executes the training code. 
-""" - -import logging - -from smolagents import CodeAgent, LiteLLMModel - -from plexe.config import config -from plexe.internal.common.utils.agents import get_prompt_templates -from plexe.tools.execution import get_executor_tool -from plexe.tools.response_formatting import format_final_mle_agent_response -from plexe.tools.schemas import get_dataset_schema, get_solution_schemas -from plexe.tools.training import get_training_code_generation_tool, get_training_code_fixing_tool -from plexe.tools.validation import validate_training_code -from plexe.tools.datasets import get_training_datasets -from plexe.tools.code_analysis import get_feature_transformer_code -from plexe.tools.solutions import list_solutions, get_solution_plan_by_id - -logger = logging.getLogger(__name__) - - -class ModelTrainerAgent: - """ - Agent for training ML models based on provided plans. - - This agent implements the training code, validates it, and executes the training code. - """ - - def __init__( - self, - ml_engineer_model_id: str, - tool_model_id: str, - distributed: bool = False, - verbose: bool = False, - chain_of_thought_callable: callable = None, - schema_resolver_agent=None, - ): - # Set verbosity level - self.verbosity = 1 if verbose else 0 - - # Create model trainer agent - implements training code - self.agent = CodeAgent( - name="MLEngineer", - description=( - "Expert ML engineer that implements, trains and validates ML models based on provided plans. " - "To work effectively, as part of the 'task' prompt the agent STRICTLY requires:" - "- the ML task definition (i.e. 'intent')" - "- the name and comparison method of the metric to optimise" - "- the Solution ID to implement (from ML Research Scientist)" - "- the split train/validation dataset names" - "- the working directory to use for model execution" - ), - model=LiteLLMModel(model_id=ml_engineer_model_id), - max_steps=15, - tools=[ - get_training_code_generation_tool(tool_model_id), - validate_training_code, - get_dataset_schema, - get_training_code_fixing_tool(tool_model_id), - get_executor_tool(distributed), - format_final_mle_agent_response, - get_training_datasets, - get_solution_schemas, - get_feature_transformer_code, - get_solution_plan_by_id, - list_solutions, - ], - managed_agents=[schema_resolver_agent] if schema_resolver_agent else [], - add_base_tools=False, - additional_authorized_imports=[ - "plexe", - "plexe.*", - ] - + config.code_generation.authorized_agent_imports, - verbosity_level=self.verbosity, - prompt_templates=get_prompt_templates( - base_template_name="code_agent.yaml", override_template_name="mle_prompt_templates.yaml" - ), - step_callbacks=[chain_of_thought_callable], - ) diff --git a/plexe/agents/schema_resolver.py b/plexe/agents/schema_resolver.py deleted file mode 100644 index 063d2821..00000000 --- a/plexe/agents/schema_resolver.py +++ /dev/null @@ -1,82 +0,0 @@ -""" -Schema Resolver Agent for inferring input and output schemas for ML models. - -This module defines a SchemaResolverAgent that determines the appropriate -input and output schemas for a machine learning model based on its intent -and available datasets. 
-""" - -import logging -from typing import Callable - -from smolagents import LiteLLMModel, CodeAgent - -from plexe.internal.common.utils.agents import get_prompt_templates -from plexe.tools.datasets import get_dataset_preview, get_dataset_reports, get_latest_datasets -from plexe.tools.schemas import ( - register_global_schemas, - get_global_schemas, - register_solution_schemas, - get_solution_schemas, -) -from plexe.tools.solutions import list_solutions - -logger = logging.getLogger(__name__) - - -class SchemaResolverAgent: - """ - Agent for resolving input and output schemas for ML models. - - This agent analyzes the model intent and available datasets to determine - the appropriate input and output schemas, handling both schema inference - and validation scenarios. - """ - - def __init__( - self, - model_id: str, - verbose: bool = False, - chain_of_thought_callable: Callable = None, - ): - """ - Initialize the schema resolver agent. - - Args: - model_id: Model ID for the LLM to use for schema resolution - verbose: Whether to display detailed agent logs - """ - self.model_id = model_id - self.verbose = verbose - - # Set verbosity level - self.verbosity = 1 if verbose else 0 - - # Create the schema resolver agent with the necessary tools - self.agent = CodeAgent( - name="SchemaResolver", - description=( - "Expert schema resolver that determines appropriate input and output schemas for ML models. " - "To work effectively, as part of the 'task' prompt the agent STRICTLY requires:\n" - "- the ML task definition (i.e. 'intent')\n" - "- the name of the feature-engineered dataset that will be used for training" - "Important: the agent requires the feature-engineered dataset to have been created" - ), - model=LiteLLMModel(model_id=self.model_id), - tools=[ - get_dataset_preview, - get_global_schemas, - register_global_schemas, - register_solution_schemas, - get_solution_schemas, - get_latest_datasets, - get_dataset_reports, - list_solutions, - ], - add_base_tools=False, - verbosity_level=self.verbosity, - step_callbacks=[chain_of_thought_callable], - prompt_templates=get_prompt_templates( - base_template_name="code_agent.yaml", override_template_name="schema_resolver_prompt_templates.yaml" - ), - ) diff --git a/plexe/api/__init__.py b/plexe/api/__init__.py new file mode 100644 index 00000000..10a6f1c4 --- /dev/null +++ b/plexe/api/__init__.py @@ -0,0 +1,7 @@ +""" +API module exports +""" + +from .datasets import router as datasets_router + +__all__ = ["datasets_router"] diff --git a/plexe/api/datasets.py b/plexe/api/datasets.py new file mode 100644 index 00000000..235eb195 --- /dev/null +++ b/plexe/api/datasets.py @@ -0,0 +1,323 @@ +import logging +from pathlib import Path +from typing import List + +from fastapi import APIRouter, File, UploadFile, HTTPException +from fastapi.responses import FileResponse +from pydantic import BaseModel +import pandas as pd +import psycopg2 + +# from plexe.agents.feature_generator import FeatureGeneratorAgent + +# Configure logging +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +router = APIRouter(prefix="/api", tags=["datasets"]) +UPLOADS_DIR = Path("./data/uploads") +UPLOADS_DIR.mkdir(parents=True, exist_ok=True) +DATABASE_DIR = Path("./data/database") +DATABASE_DIR.mkdir(parents=True, exist_ok=True) + + +class PostgresConnection(BaseModel): + """PostgreSQL connection configuration""" + + host: str + port: int + database: str + username: str + password: str + + +class CombineDatasetsRequest(BaseModel): + tables: List[str] + 
relationships: List[dict] + connection: PostgresConnection + + +@router.post("/upload") +async def upload_files(files: List[UploadFile] = File(...)): + """ + Upload one or more dataset files + + Supported formats: CSV, XLSX, JSON, Parquet + """ + if not files: + raise HTTPException(status_code=400, detail="No files provided") + + uploaded_files = [] + errors = [] + + # Allowed file extensions + allowed_extensions = {".csv", ".xlsx", ".xls", ".json", ".parquet", ".pq"} + + for file in files: + try: + # Validate file extension + file_ext = Path(file.filename).suffix.lower() + if file_ext not in allowed_extensions: + errors.append(f"{file.filename}: Unsupported file format") + continue + + # Save file + file_path = UPLOADS_DIR / file.filename + + # Read and save file content + contents = await file.read() + with open(file_path, "wb") as f: + f.write(contents) + + uploaded_files.append({"filename": file.filename, "size": len(contents), "path": str(file_path)}) + + except Exception as e: + errors.append(f"{file.filename}: {str(e)}") + + if not uploaded_files and errors: + raise HTTPException(status_code=400, detail="; ".join(errors)) + + return {"uploaded": uploaded_files, "errors": errors, "total": len(uploaded_files), "failed": len(errors)} + + +@router.post("/postgres/test") +async def test_postgres_connection(config: PostgresConnection): + """ + Test PostgreSQL connection without saving + """ + try: + import psycopg2 + + # Build connection string + conn_str = ( + f"dbname={config.database} " + f"user={config.username} " + f"password={config.password} " + f"host={config.host} " + f"port={config.port}" + ) + + # Test connection + conn = psycopg2.connect(conn_str) + conn.close() + + return { + "success": True, + "message": "Connection successful", + "host": config.host, + "port": config.port, + "database": config.database, + } + + except ImportError: + raise HTTPException(status_code=500, detail="psycopg2 not installed. 
Install with: pip install psycopg2-binary") + except Exception as e: + raise HTTPException(status_code=400, detail=f"Connection failed: {str(e)}") + + +@router.post("/postgres/execute") +async def execute_postgres_query(config: PostgresConnection): + """ + Execute a query on a PostgreSQL database + TODO: Store in database or secure config file + """ + try: + import psycopg2 + + # Build connection string + conn_str = ( + f"dbname={config.database} " + f"user={config.username} " + f"password={config.password} " + f"host={config.host} " + f"port={config.port}" + ) + + # Test connection first + conn = psycopg2.connect(conn_str) + + # Fetch tables from the public schema + tables = [] + with conn.cursor() as cursor: + cursor.execute( + """ + SELECT table_name + FROM information_schema.tables + WHERE table_schema = 'public' + """ + ) + for row in cursor.fetchall(): + tables.append(row[0]) + + # Fetch relationships from the public schema + relationships = [] + with conn.cursor() as cursor: + cursor.execute( + """ + SELECT + tc.table_name, + kcu.column_name, + ccu.table_name AS foreign_table_name, + ccu.column_name AS foreign_column_name + FROM + information_schema.table_constraints AS tc + JOIN information_schema.key_column_usage AS kcu + ON tc.constraint_name = kcu.constraint_name + JOIN information_schema.constraint_column_usage AS ccu + ON ccu.constraint_name = tc.constraint_name + WHERE constraint_type = 'FOREIGN KEY' AND tc.table_schema = 'public'; + """ + ) + for row in cursor.fetchall(): + relationships.append( + { + "table_name": row[0], + "column_name": row[1], + "foreign_table_name": row[2], + "foreign_column_name": row[3], + } + ) + + conn.close() + + # TODO: Save to secure config file or database + # For now, just return success and the list of tables + return { + "success": True, + "message": "Query executed successfully", + "host": config.host, + "port": config.port, + "database": config.database, + "tables": tables, + "relationships": relationships, + } + + except ImportError: + raise HTTPException(status_code=500, detail="psycopg2 not installed. 
+
+
+@router.get("/datasets")
+async def get_datasets():
+    """
+    Get the list of uploaded datasets.
+    """
+    try:
+        datasets = []
+
+        if UPLOADS_DIR.exists():
+            for file_path in UPLOADS_DIR.iterdir():
+                if file_path.is_file():
+                    datasets.append(
+                        {
+                            "id": file_path.stem,
+                            "filename": file_path.name,
+                            "size": file_path.stat().st_size,
+                            "created_at": file_path.stat().st_ctime,
+                            "path": str(file_path),
+                        }
+                    )
+
+        return {"datasets": datasets, "total": len(datasets)}
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Failed to get datasets: {str(e)}")
+
+
+@router.delete("/datasets/{dataset_id}")
+async def delete_dataset(dataset_id: str):
+    """
+    Delete a dataset by ID (filename without extension).
+    """
+    try:
+        # Find and delete the file
+        for file_path in UPLOADS_DIR.iterdir():
+            if file_path.stem == dataset_id and file_path.is_file():
+                file_path.unlink()
+                return {"success": True, "message": f"Dataset '{dataset_id}' deleted successfully"}
+
+        raise HTTPException(status_code=404, detail=f"Dataset '{dataset_id}' not found")
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Failed to delete dataset: {str(e)}")
+
+
+@router.get("/datasets/{dataset_id}/download")
+async def download_dataset(dataset_id: str):
+    """
+    Download a dataset by ID (filename without extension).
+    """
+    try:
+        for file_path in UPLOADS_DIR.iterdir():
+            if file_path.stem == dataset_id and file_path.is_file():
+                return FileResponse(path=file_path, filename=file_path.name, media_type="application/octet-stream")
+        raise HTTPException(status_code=404, detail=f"Dataset '{dataset_id}' not found")
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Failed to download dataset: {str(e)}")
+
+
+@router.post("/datasets/combine")
+async def combine_datasets_endpoint(data: CombineDatasetsRequest):
+    """
+    Export the selected PostgreSQL tables to CSV in preparation for combining them
+    into a single dataset. The feature-generation step itself is being migrated to
+    a new multi-agent system.
+    """
+    logger.info(f"Received request to combine datasets with data: {data}")
+    try:
+        # Connect to Postgres and export the selected tables to CSV
+        conn_str = (
+            f"dbname={data.connection.database} "
+            f"user={data.connection.username} "
+            f"password={data.connection.password} "
+            f"host={data.connection.host} "
+            f"port={data.connection.port}"
+        )
+        conn = psycopg2.connect(conn_str)
+
+        db_dir = DATABASE_DIR / data.connection.database
+        db_dir.mkdir(parents=True, exist_ok=True)
+
+        table_paths = {}
+        for table_name in data.tables:
+            df = pd.read_sql_query(f"SELECT * FROM {table_name}", conn)
+            file_path = db_dir / f"{table_name}.csv"
+            df.to_csv(file_path, index=False)
+            table_paths[table_name] = str(file_path.resolve())
+
+        conn.close()
+
+        # session_id = str(uuid.uuid4())
+        # agent = FeatureGeneratorAgent(
+        #     session_id=session_id,
+        #     tables=data.tables,
+        #     relationships=data.relationships,
+        # )
+        # task = f"""Generate features from the provided tables and relationships.
+        # The data for the tables is available as CSV files.
+        # Here is the mapping from table name to file path:
+        # {table_paths}
+        #
+        # You must use the `read_file` tool to read the data for each table before processing it.
+        # """
+        # result = agent.run(task)
+
+        # if isinstance(result, pd.DataFrame):
+        #     destination_dir = db_dir
+        #     destination_path = destination_dir / "final_dataset.csv"
+        #     result.to_csv(destination_path, index=False)
+        #     message = "Dataset combination completed successfully."
+        #     logger.info(f"Saved final dataset to {destination_path}")
+        # else:
+        #     message = "Dataset combination finished, but the agent did not return a dataframe."
+        #     logger.error(f"Agent returned type {type(result)} instead of pandas.DataFrame.")
+
+        message = "Feature generation is being migrated to a new multi-agent system."
+
+        logger.info(message)
+        return {"success": True, "message": message}
+
+    except Exception as e:
+        logger.error(f"Failed to combine datasets: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=f"Failed to combine datasets: {str(e)}")
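Similarly, a minimal sketch of the dataset-management flow. The base URL and router prefix are again assumptions, and the `tables`, `relationships` and `connection` fields simply mirror how `combine_datasets_endpoint` reads the `CombineDatasetsRequest` body above; all concrete table and column names are invented for illustration.

```python
import requests

BASE_URL = "http://localhost:8000"  # assumed backend address and router mount point

# List uploaded datasets and download the first one by its id (filename without extension)
datasets = requests.get(f"{BASE_URL}/datasets").json()["datasets"]
if datasets:
    dataset = datasets[0]
    content = requests.get(f"{BASE_URL}/datasets/{dataset['id']}/download").content
    with open(dataset["filename"], "wb") as f:
        f.write(content)

# Ask the backend to export two related tables; currently this only dumps them to CSV
# and reports that feature generation is being migrated to the new multi-agent system.
payload = {
    "tables": ["customers", "orders"],
    "relationships": [
        {
            "table_name": "orders",
            "column_name": "customer_id",
            "foreign_table_name": "customers",
            "foreign_column_name": "id",
        }
    ],
    "connection": {
        "host": "localhost",
        "port": 5432,
        "database": "mydb",
        "username": "postgres",
        "password": "postgres",
    },
}
print(requests.post(f"{BASE_URL}/datasets/combine", json=payload).json())
```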
diff --git a/plexe/callbacks.py b/plexe/callbacks.py deleted file mode 100644 index ba848589..00000000 --- a/plexe/callbacks.py +++ /dev/null @@ -1,130 +0,0 @@ -""" -Callbacks for model building process in Plexe. - -This module defines callback interfaces that let users hook into various stages -of the model building process, allowing for custom logging, tracking, visualization, -or other operations to be performed at key points. -""" - -import logging -from abc import ABC -from dataclasses import dataclass -from typing import Optional, Type, Dict, Any - -from pydantic import BaseModel - -from plexe.internal.common.datasets.interface import TabularConvertible -from plexe.core.entities.solution import Solution - -logger = logging.getLogger(__name__) - - -@dataclass -class BuildStateInfo: - """ - Consolidated information about model build state at any point in the process. - - This class combines all information available during different stages of the model building - process (start, end, iteration start, iteration end) into a single structure.
- """ - - # Common identification fields - intent: str - """The natural language description of the model's intent.""" - - provider: str - """The provider (LLM) used for generating the model.""" - - # Schema fields - input_schema: Optional[Type[BaseModel]] = None - """The input schema for the model.""" - - output_schema: Optional[Type[BaseModel]] = None - """The output schema for the model.""" - - run_timeout: Optional[int] = None - """Maximum time in seconds for each individual training run.""" - - max_iterations: Optional[int] = None - """Maximum number of iterations for the model building process.""" - - timeout: Optional[int] = None - """Maximum total time in seconds for the entire model building process.""" - - # Iteration fields - iteration: int = 0 - """Current iteration number (0-indexed).""" - - # Dataset fields - datasets: Optional[Dict[str, TabularConvertible]] = None - - # Current node being evaluated - node: Optional[Solution] = None - """The solution node being evaluated in the current iteration.""" - - # Model information fields (replacing direct model reference) - model_identifier: Optional[str] = None - """Model unique identifier.""" - - model_state: Optional[str] = None - """Current model state (BUILDING/READY/ERROR).""" - - # Final model artifacts (only available at build end) - final_metric: Optional[Any] = None - """Final performance metric.""" - - final_artifacts: Optional[list] = None - """Model artifacts list.""" - - trainer_source: Optional[str] = None - """Training source code.""" - - predictor_source: Optional[str] = None - """Predictor source code.""" - - -class Callback(ABC): - """ - Abstract base class for callbacks during model building. - - Callbacks allow running custom code at various stages of the model building process. - Subclass this and implement the methods you need for your specific use case. - """ - - def on_build_start(self, info: BuildStateInfo) -> None: - """ - Called when the model building process starts. - """ - pass - - def on_build_end(self, info: BuildStateInfo) -> None: - """ - Called when the model building process ends. - """ - pass - - def on_iteration_start(self, info: BuildStateInfo) -> None: - """ - Called at the start of each model building iteration. - """ - pass - - def on_iteration_end(self, info: BuildStateInfo) -> None: - """ - Called at the end of each model building iteration. - """ - pass - - -# Import at the end to avoid circular dependencies -from plexe.internal.models.callbacks.mlflow import MLFlowCallback -from plexe.internal.models.callbacks.chain_of_thought import ChainOfThoughtModelCallback -from plexe.internal.models.callbacks.checkpoint import ModelCheckpointCallback - -__all__ = [ - "Callback", - "BuildStateInfo", - "MLFlowCallback", - "ChainOfThoughtModelCallback", - "ModelCheckpointCallback", -] diff --git a/plexe/config.py b/plexe/config.py deleted file mode 100644 index 948a1396..00000000 --- a/plexe/config.py +++ /dev/null @@ -1,358 +0,0 @@ -""" -Configuration for the plexe library. 
-""" - -import importlib -import logging -import warnings -from dataclasses import dataclass, field -from importlib.resources import files -from typing import List -from functools import cached_property -from jinja2 import Environment, FileSystemLoader -import sys -from pathlib import Path - -from plexe import templates as template_module - - -TEMPLATE_DIR = files("plexe").joinpath("templates/prompts") - - -# configure warnings -warnings.filterwarnings("ignore", category=FutureWarning) -warnings.filterwarnings("ignore", category=DeprecationWarning) - - -def is_package_available(package_name: str) -> bool: - """Check if a Python package is available/installed.""" - try: - importlib.import_module(package_name) - return True - except ImportError: - return False - - -@dataclass(frozen=True) -class _Config: - @dataclass(frozen=True) - class _FileStorageConfig: - cache_dir: str = field(default=".plexecache/") - model_dir: str = field(default="model_files/") - checkpoint_dir: str = field(default="checkpoints/") - delete_checkpoints_on_success: bool = field(default=False) - keep_checkpoints: int = field(default=3) - - @dataclass(frozen=True) - class _LoggingConfig: - level: str = field(default="INFO") - format: str = field(default="[%(asctime)s - %(name)s - %(levelname)s - (%(threadName)-10s)]: - %(message)s") - - @dataclass(frozen=True) - class _ModelSearchConfig: - initial_nodes: int = field(default=3) - max_nodes: int = field(default=15) - max_fixing_attempts_train: int = field(default=3) - max_fixing_attempts_predict: int = field(default=10) - max_time_elapsed: int = field(default=600) - - @dataclass(frozen=True) - class _ExecutionConfig: - runfile_name: str = field(default="execution_script.py") - - @dataclass(frozen=True) - class _CodeGenerationConfig: - # Base ML packages that are always available - _base_packages: List[str] = field( - default_factory=lambda: [ - "pandas", - "numpy", - "scikit-learn", - "sklearn", - "joblib", - "mlxtend", - "xgboost", - "pyarrow", - "statsmodels", - ] - ) - - # Deep learning packages that are optional - _deep_learning_packages: List[str] = field( - default_factory=lambda: [ - "tensorflow-cpu", - "torch", - "transformers", - "tokenizers", - "accelerate", - "safetensors", - ] - ) - - # Additional standard library modules for agent execution - _standard_lib_modules: List[str] = field( - default_factory=lambda: [ - "pathlib", - "typing", - "dataclasses", - "json", - "io", - "time", - "datetime", - "os", - "sys", - "math", - "random", - "itertools", - "collections", - "functools", - "operator", - "re", - "copy", - "warnings", - "logging", - "importlib", - "types", - "plexe", - ] - ) - - @property - def allowed_packages(self) -> List[str]: - """Dynamically determine which packages are available and can be used.""" - available_packages = self._base_packages.copy() - - # Check if deep learning packages are installed and add them if they are - for package in self._deep_learning_packages: - if is_package_available(package): - available_packages.append(package) - - return available_packages - - @property - def authorized_agent_imports(self) -> List[str]: - """Return the combined list of allowed packages and standard library modules for agent execution.""" - # Start with allowed packages - imports = self.allowed_packages.copy() - - # Add standard library modules - imports.extend(self._standard_lib_modules) - - return imports - - @property - def deep_learning_available(self) -> bool: - """Check if deep learning packages are available.""" - return 
any(is_package_available(pkg) for pkg in self._deep_learning_packages) - - k_fold_validation: int = field(default=5) - - @dataclass(frozen=True) - class _DataGenerationConfig: - pass # todo: implement - - @dataclass(frozen=True) - class _RayConfig: - address: str = field(default=None) # None for local, Ray address for remote - num_cpus: int = field(default=None) # None for auto-detect - num_gpus: int = field(default=None) # None for auto-detect - - # configuration objects - file_storage: _FileStorageConfig = field(default_factory=_FileStorageConfig) - logging: _LoggingConfig = field(default_factory=_LoggingConfig) - model_search: _ModelSearchConfig = field(default_factory=_ModelSearchConfig) - code_generation: _CodeGenerationConfig = field(default_factory=_CodeGenerationConfig) - execution: _ExecutionConfig = field(default_factory=_ExecutionConfig) - data_generation: _DataGenerationConfig = field(default_factory=_DataGenerationConfig) - ray: _RayConfig = field(default_factory=_RayConfig) - - -@dataclass(frozen=True) -class _CodeTemplates: - predictor_interface: str = field( - default=Path(importlib.import_module("plexe.core.interfaces.predictor").__file__).read_text() - ) - predictor_template: str = field( - default=files(template_module).joinpath("models").joinpath("predictor.tmpl.py").read_text() - ) - feature_transformer_interface: str = field( - default=Path(importlib.import_module("plexe.core.interfaces.feature_transformer").__file__).read_text() - ) - feature_transformer_template: str = field( - default=files(template_module).joinpath("models").joinpath("feature_transformer.tmpl.py").read_text() - ) - - -@dataclass(frozen=True) -class _PromptTemplates: - template_dir: str = field(default=TEMPLATE_DIR) - - @cached_property - def env(self) -> Environment: - return Environment(loader=FileSystemLoader(str(self.template_dir))) - - def _render(self, template_name: str, **kwargs) -> str: - template = self.env.get_template(template_name) - return template.render(**kwargs) - - def planning_system(self) -> str: - return self._render("planning/system_prompt.jinja") - - def planning_select_metric(self, problem_statement) -> str: - return self._render("planning/select_metric.jinja", problem_statement=problem_statement) - - def schema_base(self) -> str: - return self._render("schemas/base.jinja") - - def schema_identify_target(self, columns, intent) -> str: - return self._render("schemas/identify_target.jinja", columns=columns, intent=intent) - - def schema_generate_from_intent(self, intent, input_schema="input_schema", output_schema="output_schema") -> str: - return self._render( - "schemas/generate_from_intent.jinja", intent=intent, input_schema=input_schema, output_schema=output_schema - ) - - def schema_resolver_prompt( - self, intent, datasets, input_schema=None, output_schema=None, has_input_schema=False, has_output_schema=False - ) -> str: - return self._render( - "agent/schema_resolver_prompt.jinja", - intent=intent, - datasets=datasets, - input_schema=input_schema, - output_schema=output_schema, - has_input_schema=has_input_schema, - has_output_schema=has_output_schema, - ) - - def eda_agent_prompt(self, intent, datasets) -> str: - return self._render( - "agent/agent_data_analyser_prompt.jinja", - intent=intent, - datasets=datasets, - ) - - def training_system(self) -> str: - return self._render("training/system_prompt.jinja") - - def training_generate( - self, problem_statement, plan, history, allowed_packages, training_data_files, validation_data_files - ) -> str: - return self._render( 
- "training/generate.jinja", - problem_statement=problem_statement, - plan=plan, - history=history, - allowed_packages=allowed_packages, - training_data_files=training_data_files, - validation_data_files=validation_data_files, - use_validation_files=len(validation_data_files) > 0, - ) - - def training_fix( - self, training_code, plan, review, problems, allowed_packages, training_data_files, validation_data_files - ) -> str: - return self._render( - "training/fix.jinja", - training_code=training_code, - plan=plan, - review=review, - problems=problems, - allowed_packages=allowed_packages, - training_data_files=training_data_files, - validation_data_files=validation_data_files, - use_validation_files=len(validation_data_files) > 0, - ) - - def training_review(self, problem_statement, plan, training_code, problems, allowed_packages) -> str: - return self._render( - "training/review.jinja", - problem_statement=problem_statement, - plan=plan, - training_code=training_code, - problems=problems, - allowed_packages=allowed_packages, - ) - - def review_system(self) -> str: - return self._render("review/system_prompt.jinja") - - def review_model( - self, - intent: str, - input_schema: str, - output_schema: str, - solution_plan: str, - training_code: str, - inference_code: str, - ) -> str: - return self._render( - "review/model.jinja", - intent=intent, - input_schema=input_schema, - output_schema=output_schema, - solution_plan=solution_plan, - training_code=training_code, - inference_code=inference_code, - ) - - def cot_system(self) -> str: - return self._render("utils/system_prompt.jinja") - - def cot_summarize(self, context: str) -> str: - return self._render("utils/cot_summarize.jinja", context=context) - - def agent_builder_prompt( - self, - intent: str, - input_schema: str, - output_schema: str, - datasets: List[str], - working_dir: str, - max_iterations: int = None, - resume: bool = False, - ) -> str: - return self._render( - "agent/agent_manager_prompt.jinja", - intent=intent, - input_schema=input_schema, - output_schema=output_schema, - datasets=datasets, - working_dir=working_dir, - max_iterations=max_iterations, - resume=resume, - ) - - -# Instantiate configuration and templates -config: _Config = _Config() -code_templates: _CodeTemplates = _CodeTemplates() -prompt_templates: _PromptTemplates = _PromptTemplates() - - -# Default logging configuration -def configure_logging(level: str | int = logging.INFO, file: str = None) -> None: - # Configure the library's root logger - sm_root_logger = logging.getLogger("plexe") - sm_root_logger.setLevel(level) - - # Clear existing handlers to avoid duplicate logs - sm_root_logger.handlers = [] - - # Define a common formatter - formatter = logging.Formatter(config.logging.format) - - stream_handler = logging.StreamHandler() - # Only apply reconfigure if the stream supports it - if hasattr(sys.stdout, "reconfigure"): - sys.stdout.reconfigure(encoding="utf-8") - stream_handler.setFormatter(formatter) - sm_root_logger.addHandler(stream_handler) - - if file: - file_handler = logging.FileHandler(file, encoding="utf-8") - file_handler.setFormatter(formatter) - sm_root_logger.addHandler(file_handler) - - -configure_logging(level=config.logging.level) diff --git a/plexe/core/__init__.py b/plexe/core/__init__.py deleted file mode 100644 index 5070f9de..00000000 --- a/plexe/core/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -""" -Core types and functions used across Plexe. - -This package contains fundamental types and functions used throughout the Plexe codebase. 
-It's designed to avoid circular dependencies by providing core functionality that can -be imported by multiple modules. -""" diff --git a/plexe/core/entities/solution.py b/plexe/core/entities/solution.py deleted file mode 100644 index 00f55f87..00000000 --- a/plexe/core/entities/solution.py +++ /dev/null @@ -1,72 +0,0 @@ -""" -This module defines the `Solution` class used to represent complete ML pipelines. - -A Solution encapsulates all artifacts produced throughout the ML workflow for a single -approach, including the initial plan, feature transformations, model training code, -inference code, performance metrics, and deployment artifacts. Each Solution represents -one experimental path from data processing through model deployment. -""" - -import time -import uuid -from dataclasses import dataclass, field -from typing import List, Dict - -from plexe.internal.models.entities.artifact import Artifact -from plexe.internal.models.entities.metric import Metric - - -@dataclass(eq=False) -class Solution: - """ - Represents a complete ML solution from planning through deployment. - - A Solution is a container for all artifacts related to a single ML approach, - allowing different experiments to maintain their own schemas, transformations, - and implementations. Solutions persist throughout the workflow and contain - the final deployable model components. - - Attributes: - id (str): A unique identifier for the solution. - created_time (float): The UNIX timestamp when the solution was created. - plan (str): The ML approach description and strategy for this solution. - training_code (str): The model training implementation code. - inference_code (str): The production inference/prediction code. - input_schema (Dict[str, str]): Schema for the input data expected by the model. - output_schema (Dict[str, str]): Schema for the output data produced by the model. - performance (Metric): Validation set performance metrics. - test_performance (Metric): Test set performance metrics. - execution_time (float): Time taken to train the model. - execution_stdout (list[str]): Training execution logs and output. - exception_was_raised (bool): Whether training failed with an exception. - exception (Exception): The exception raised during training, if any. - model_artifacts (List[Path]): Paths to serialized model files and other artifacts. - analysis (str): Summary analysis of the solution's effectiveness. - """ - - # General attributes - id: str = field(default_factory=lambda: uuid.uuid4().hex, kw_only=True) - created_time: float = field(default_factory=lambda: time.time(), kw_only=True) - - # Core solution attributes - plan: str = field(default=None, hash=True, kw_only=True) - training_code: str = field(default=None, hash=True, kw_only=True) - inference_code: str = field(default=None, hash=True, kw_only=True) - testing_code: str = field(default=None, hash=True, kw_only=True) - input_schema: Dict[str, str] = field(default=None, kw_only=True) - output_schema: Dict[str, str] = field(default=None, kw_only=True) - schema_reasoning: str = field(default=None, kw_only=True) # Explanation of schema design - - # Post-execution results: model performance, execution time, exceptions, etc. 
- performance: Metric = field(default=None, kw_only=True) # Validation performance - test_performance: Metric = field(default=None, kw_only=True) # Test set performance - execution_time: float = field(default=None, kw_only=True) - execution_stdout: list[str] = field(default_factory=list, kw_only=True) - exception_was_raised: bool = field(default=False, kw_only=True) - exception: Exception = field(default=None, kw_only=True) - model_artifacts: List[Artifact] = field(default_factory=list, kw_only=True) - - # Evaluations and analyses - analysis: str = field(default=None, kw_only=True) - review: Dict[str, str] = field(default=None, kw_only=True) - model_evaluation_report: Dict[str, any] = field(default_factory=dict, kw_only=True) diff --git a/plexe/core/interfaces/__init__.py b/plexe/core/interfaces/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/plexe/core/interfaces/feature_transformer.py b/plexe/core/interfaces/feature_transformer.py deleted file mode 100644 index efeb48bb..00000000 --- a/plexe/core/interfaces/feature_transformer.py +++ /dev/null @@ -1,20 +0,0 @@ -""" -This module defines the FeatureTransformer interface, which all generated feature transformers must implement. -""" - -from abc import ABC, abstractmethod - -import pandas as pd - - -class FeatureTransformer(ABC): - """ - Abstract base class for all dynamically generated feature transformers. - - Every implementation of feature transformer must provide a mechanism for transforming the input data - into the format required by the model. - """ - - @abstractmethod - def transform(self, inputs: pd.DataFrame) -> pd.DataFrame: - pass diff --git a/plexe/core/interfaces/predictor.py b/plexe/core/interfaces/predictor.py deleted file mode 100644 index 0f60c77d..00000000 --- a/plexe/core/interfaces/predictor.py +++ /dev/null @@ -1,24 +0,0 @@ -""" -This module defines the Predictor interface, which all dynamically generated inference codes must implement. -""" - -from abc import ABC, abstractmethod -from typing import List -from plexe.internal.models.entities.artifact import Artifact - - -class Predictor(ABC): - """ - Abstract base class for all dynamically generated inference code. - - Every implementation of predictor must provide a mechanism for instantiating the underlying model(s) - by reading the binary or text data from the `ModelArtifact` class. - """ - - @abstractmethod - def __init__(self, artifacts: List[Artifact]): - pass - - @abstractmethod - def predict(self, inputs: dict) -> dict: - pass diff --git a/plexe/core/object_registry.py b/plexe/core/object_registry.py deleted file mode 100644 index 821babf9..00000000 --- a/plexe/core/object_registry.py +++ /dev/null @@ -1,197 +0,0 @@ -""" -This module provides a generic Registry pattern implementation for storing and retrieving objects by name or prefix. -""" - -import logging -import dataclasses -import copy -from typing import Dict, List, Type, TypeVar, Any - - -T = TypeVar("T") -logger = logging.getLogger(__name__) - - -@dataclasses.dataclass -class Item: - item: T - immutable: bool = False - - -class ObjectRegistry: - """ - Registry for storing and retrieving objects by name. - - This class implements the Singleton pattern so that registry instances are shared - across the application. It provides methods for registering, retrieving, and - managing objects in a type-safe manner. 
- """ - - _instance = None - - def __new__(cls): - if cls._instance is None: - cls._instance = super(ObjectRegistry, cls).__new__(cls) - cls._instance._items = {} - return cls._instance - - @staticmethod - def _get_uri(t: Type[T], name: str) -> str: - return f"{str(t)}://{name}" - - def register(self, t: Type[T], name: str, item: T, overwrite: bool = False, immutable: bool = False) -> None: - """ - Register an item with a given name. - - :param t: type prefix for the item - :param name: identifier for the item - must be unique within the prefix - :param item: the item to register - :param overwrite: whether to overwrite an existing item with the same name - :param immutable: whether the item should be treated as immutable (not modifiable) - """ - uri = self._get_uri(t, name) - was_overwrite = overwrite and uri in self._items - - if not overwrite and uri in self._items: - raise ValueError(f"Item '{uri}' already registered, use a different name") - - self._items[uri] = Item(item, immutable=immutable) - - # Enhanced logging with context - action = "overwrote" if was_overwrite else "registered" - logger.info(f"Registry: {action} {uri} (immutable={immutable}, total: {len(self._items)} items)") - - def register_multiple( - self, t: Type[T], items: Dict[str, T], overwrite: bool = False, immutable: bool = False - ) -> None: - """ - Register multiple items with a given prefix. - - :param t: type prefix for the items - :param overwrite: whether to overwrite existing items with the same names - :param immutable: whether the items should be treated as immutable (not modifiable) - :param items: dictionary of item names and their corresponding objects - """ - for name, item in items.items(): - self.register(t, name, item, overwrite=overwrite, immutable=immutable) - - def get(self, t: Type[T], name: str) -> T: - """ - Retrieve an item by name. - - :param t: type prefix for the item - :param name: the name of the item to retrieve - :return: The registered item - :raises KeyError: If the item is not found in the registry - """ - uri = self._get_uri(t, name) - if uri not in self._items: - logger.warning(f"⚠️ Item '{uri}' not found in registry") - raise KeyError(f"Item '{uri}' not found in registry") - logger.info(f"Registry: Retrieved {uri} (immutable={self._items[uri].immutable})") - return self._items[uri].item if not self._items[uri].immutable else copy.deepcopy(self._items[uri].item) - - def get_multiple(self, t: Type[T], names: List[str]) -> Dict[str, T]: - """ - Retrieve multiple items by name. - - :param t: type prefix for the items - :param names: List of item names to retrieve - :return: Dictionary mapping item names to items - :raises KeyError: If any item is not found in the registry - """ - return {name: self.get(t, name) for name in names} - - def get_all(self, t: Type[T]) -> Dict[str, T]: - """ - Retrieve all items for a given prefix. - - :param t: type prefix for the items - :return: Dictionary mapping item names to items - """ - return {name: item.item for name, item in self._items.items() if name.startswith(str(t))} - - def delete(self, t: Type[T], name: str) -> None: - """ - Delete an item by name. - - :param t: type prefix for the item - :param name: the name of the item to delete - """ - uri = self._get_uri(t, name) - if uri in self._items: - del self._items[uri] - else: - raise KeyError(f"Item '{uri}' not found in registry") - - def clear(self) -> None: - """ - Clear all registered items. 
- """ - self._items.clear() - - def list(self) -> List[str]: - """ - List all registered item names. - - :return: List of item names in the registry - """ - return list(self._items.keys()) - - def list_by_type(self, t: Type[T]) -> List[str]: - """ - List all registered names for a specific type. - - :param t: type prefix for the items - :return: List of item names (without the type prefix) for the given type - """ - prefix = str(t) - return [uri.split("://")[1] for uri in self._items.keys() if uri.startswith(prefix)] - - # TODO: unclear if this is needed, consider deleting - def get_all_solutions(self) -> List[Dict[str, Any]]: - """ - Get all solutions tracked during model building. - - This method extracts solution information from the registry, focusing on - code, performance metrics, and other solution-specific data for checkpointing. - - :return: List of solution data dictionaries - """ - solutions = [] - - # Extract training code and their results - from plexe.internal.models.entities.code import Code - from plexe.core.entities.solution import Solution - - # Get all code objects - code_items = self.get_all(Code) - node_items = self.get_all(Solution) - - # Build solution data from code and node objects - for uri, code_obj in code_items.items(): - if isinstance(code_obj, Code): - # Extract code ID and try to find associated node - code_id = uri.split("://")[1] - solution_data = { - "code_id": code_id, - "code": code_obj.code, - "iteration": getattr(code_obj, "iteration", 0), - } - - # Look for associated node to get performance metrics - for node_uri, node in node_items.items(): - if isinstance(node, Solution) and node.training_code == code_obj.code: - if node.performance: - solution_data["performance"] = { - "name": node.performance.name, - "value": node.performance.value, - "comparison_method": getattr( - node.performance.comparator, "comparison_method", "HIGHER_IS_BETTER" - ), - } - break - - solutions.append(solution_data) - - return solutions diff --git a/plexe/core/state.py b/plexe/core/state.py deleted file mode 100644 index 1cc8a713..00000000 --- a/plexe/core/state.py +++ /dev/null @@ -1,23 +0,0 @@ -""" -Model state definitions for Plexe. - -This module defines the possible states a model can be in during its lifecycle. -""" - -from enum import Enum - - -class ModelState(Enum): - """States a model can be in during its lifecycle.""" - - DRAFT = "draft" - """Model is in draft state, not yet built.""" - - BUILDING = "building" - """Model is currently being built.""" - - READY = "ready" - """Model is built and ready to use.""" - - ERROR = "error" - """Model encountered an error during building.""" diff --git a/plexe/core/storage.py b/plexe/core/storage.py deleted file mode 100644 index 2674026c..00000000 --- a/plexe/core/storage.py +++ /dev/null @@ -1,588 +0,0 @@ -""" -Core storage functions for model and checkpoint persistence. - -This module contains implementation logic for saving and loading models and checkpoints, -without direct dependencies on the Model class. It provides a foundation for the -fileio module, breaking the circular dependency between models.py and fileio.py. 
-""" - -import io -import json -import yaml -import logging -import tarfile -import datetime -from pathlib import Path -from typing import Any, Dict, List, Optional, TypeVar, Type -import numpy as np - -from plexe.core.state import ModelState -from plexe.config import config -from plexe.internal.common.utils.pydantic_utils import map_to_basemodel - -logger = logging.getLogger(__name__) - -# Type variable for generic model type -M = TypeVar("M") - - -class FallbackNoneLoader(yaml.SafeLoader): - pass - - -def fallback_to_none(loader, tag_suffix, node): - return None - - -FallbackNoneLoader.add_multi_constructor("", fallback_to_none) - - -def _convert_to_native_types(obj): - """Recursively convert numpy types and other non-native types to Python native types. - - Falls back to string representation for any type that can't be converted. - """ - try: - # Handle NumPy types specifically - if isinstance(obj, np.generic): - return _convert_to_native_types(obj.item()) - elif isinstance(obj, np.ndarray): - return _convert_to_native_types(obj.tolist()) - # Handle common collections - elif isinstance(obj, dict): - return {k: _convert_to_native_types(v) for k, v in obj.items()} - elif isinstance(obj, (list, tuple)): - return type(obj)(_convert_to_native_types(item) for item in obj) - # Native types are already safe - elif isinstance(obj, (str, int, float, bool, type(None))): - return obj - # For other types, raise error to trigger fallback - else: - raise TypeError(f"Unsupported type: {type(obj).__name__}") - except Exception as e: - # Fallback: if anything goes wrong, use JSON serialization with sanitization - logger.warning( - f"Failed to convert object of type {type(obj).__name__} for serialization: {e}. Using string representation." - ) - return json.loads(json.dumps(obj, skipkeys=True, default=str)) - - -def _load_yaml_or_json_from_tar(tar, yaml_path: str, json_path: str): - """Load from YAML if available, fallback to JSON for backward compatibility.""" - members = [m.name for m in tar.getmembers()] - if yaml_path in members: - content = tar.extractfile(yaml_path).read().decode("utf-8") - return yaml.load(content, Loader=FallbackNoneLoader) - elif json_path in members: - content = tar.extractfile(json_path).read().decode("utf-8") - return json.loads(content) - else: - raise FileNotFoundError(f"Neither {yaml_path} nor {json_path} found in archive") - - -def _save_model_to_tar(model: Any, path: str | Path) -> str: - """ - Core implementation of saving a model to a tar archive. 
- - Args: - model: The model to save (any object with required attributes) - path: Path where to save the model - - Returns: - Path where the model was saved - """ - # Ensure .tar.gz extension - if not str(path).endswith(".tar.gz"): - raise ValueError("Path must end with .tar.gz") - - # Ensure parent directory exists - Path(path).parent.mkdir(parents=True, exist_ok=True) - - try: - with tarfile.open(path, "w:gz") as tar: - # Get metrics data if available - metrics_data = {} - if hasattr(model, "metric") and model.metric: - metrics_data = { - "name": model.metric.name, - "value": _convert_to_native_types(model.metric.value), - "comparison_method": model.metric.comparator.comparison_method.value, - "target": _convert_to_native_types(model.metric.comparator.target), - } - - # Gather metadata - metadata = { - "intent": model.intent, - "state": model.state.value, - "metrics": metrics_data, - "metadata": model.metadata, - "identifier": model.identifier, - } - - # Save each metadata item separately - for key, value in metadata.items(): - if key in ["metrics", "metadata"]: - info = tarfile.TarInfo(f"metadata/{key}.yaml") - # Ensure all values are serializable - safe_value = _convert_to_native_types(value) - content = yaml.safe_dump(safe_value, default_flow_style=False).encode("utf-8") - else: - info = tarfile.TarInfo(f"metadata/{key}.txt") - content = str(value).encode("utf-8") - info.size = len(content) - tar.addfile(info, io.BytesIO(content)) - - # Save schemas - for name, schema in [("input_schema", model.input_schema), ("output_schema", model.output_schema)]: - schema_dict = {name: field.annotation.__name__ for name, field in schema.model_fields.items()} - info = tarfile.TarInfo(f"schemas/{name}.yaml") - content = yaml.safe_dump(schema_dict, default_flow_style=False).encode("utf-8") - info.size = len(content) - tar.addfile(info, io.BytesIO(content)) - - # Save trainer source if available - if hasattr(model, "trainer_source") and model.trainer_source: - info = tarfile.TarInfo("code/trainer.py") - content = model.trainer_source.encode("utf-8") - info.size = len(content) - tar.addfile(info, io.BytesIO(content)) - - # Save predictor source if available - if hasattr(model, "predictor_source") and model.predictor_source: - info = tarfile.TarInfo("code/predictor.py") - content = model.predictor_source.encode("utf-8") - info.size = len(content) - tar.addfile(info, io.BytesIO(content)) - - # Save feature transformer source if available - if hasattr(model, "feature_transformer_source") and model.feature_transformer_source: - info = tarfile.TarInfo("code/feature_transformer.py") - content = model.feature_transformer_source.encode("utf-8") - info.size = len(content) - tar.addfile(info, io.BytesIO(content)) - - # Save dataset splitter source if available - if hasattr(model, "dataset_splitter_source") and model.dataset_splitter_source: - info = tarfile.TarInfo("code/dataset_splitter.py") - content = model.dataset_splitter_source.encode("utf-8") - info.size = len(content) - tar.addfile(info, io.BytesIO(content)) - - # Save testing source if available - if hasattr(model, "testing_source") and model.testing_source: - info = tarfile.TarInfo("code/testing.py") - content = model.testing_source.encode("utf-8") - info.size = len(content) - tar.addfile(info, io.BytesIO(content)) - - # Save evaluation report if available - if hasattr(model, "evaluation_report") and model.evaluation_report: - info = tarfile.TarInfo("metadata/evaluation_report.yaml") - # Convert numpy types to native Python types before serialization - 
evaluation_report_native = _convert_to_native_types(model.evaluation_report) - content = yaml.safe_dump(evaluation_report_native, default_flow_style=False).encode("utf-8") - info.size = len(content) - tar.addfile(info, io.BytesIO(content)) - - # Save artifacts - if hasattr(model, "artifacts"): - for artifact in model.artifacts: - arc_name = f"artifacts/{Path(artifact.name).as_posix()}" - info = tarfile.TarInfo(arc_name) - - if artifact.is_path(): - with open(artifact.path, "rb") as f: - content = f.read() - elif artifact.is_handle(): - content = artifact.handle.read() - else: - content = artifact.data - - info.size = len(content) - tar.addfile(info, io.BytesIO(content)) - - # Save EDA markdown reports if available - if ( - hasattr(model, "metadata") - and "eda_markdown_reports" in model.metadata - and model.metadata["eda_markdown_reports"] - ): - for dataset_name, report_markdown in model.metadata["eda_markdown_reports"].items(): - info = tarfile.TarInfo(f"metadata/eda_report_{dataset_name}.md") - content = report_markdown.encode("utf-8") - info.size = len(content) - tar.addfile(info, io.BytesIO(content)) - - except Exception as e: - logger.error(f"Error saving model: {e}") - if Path(path).exists(): - Path(path).unlink() - raise - - logger.info(f"Model saved to {path}") - return str(path) - - -def _load_model_data_from_tar(path: str | Path) -> Dict[str, Any]: - """ - Core implementation of loading model data from a tar archive. - - Args: - path: Path to the model archive - - Returns: - Dictionary with model data to reconstruct a Model instance - """ - if not Path(path).exists(): - raise ValueError(f"Model not found: {path}") - - try: - with tarfile.open(path, "r:gz") as tar: - # Extract metadata - intent = tar.extractfile("metadata/intent.txt").read().decode("utf-8") - state = ModelState(tar.extractfile("metadata/state.txt").read().decode("utf-8")) - metrics_data = _load_yaml_or_json_from_tar(tar, "metadata/metrics.yaml", "metadata/metrics.json") - metadata = _load_yaml_or_json_from_tar(tar, "metadata/metadata.yaml", "metadata/metadata.json") - identifier = tar.extractfile("metadata/identifier.txt").read().decode("utf-8") - - # Extract schema information - input_schema_dict = _load_yaml_or_json_from_tar( - tar, "schemas/input_schema.yaml", "schemas/input_schema.json" - ) - output_schema_dict = _load_yaml_or_json_from_tar( - tar, "schemas/output_schema.yaml", "schemas/output_schema.json" - ) - - # Process schemas into Pydantic models - input_schema = _process_schema_dict(input_schema_dict) - output_schema = _process_schema_dict(output_schema_dict) - - # Extract code if available - trainer_source = None - if "code/trainer.py" in [m.name for m in tar.getmembers()]: - trainer_source = tar.extractfile("code/trainer.py").read().decode("utf-8") - - predictor_source = None - if "code/predictor.py" in [m.name for m in tar.getmembers()]: - predictor_source = tar.extractfile("code/predictor.py").read().decode("utf-8") - # FIXME: this is a hack required to ensure backwards compatibility with old models - predictor_source = predictor_source.replace("plexe.internal.models.interfaces", "plexe.core.interfaces") - - feature_transformer_source = None - if "code/feature_transformer.py" in [m.name for m in tar.getmembers()]: - feature_transformer_source = tar.extractfile("code/feature_transformer.py").read().decode("utf-8") - - dataset_splitter_source = None - if "code/dataset_splitter.py" in [m.name for m in tar.getmembers()]: - dataset_splitter_source = 
tar.extractfile("code/dataset_splitter.py").read().decode("utf-8") - - testing_source = None - if "code/testing.py" in [m.name for m in tar.getmembers()]: - testing_source = tar.extractfile("code/testing.py").read().decode("utf-8") - - evaluation_report = None - try: - evaluation_report = _load_yaml_or_json_from_tar( - tar, "metadata/evaluation_report.yaml", "metadata/evaluation_report.json" - ) - except FileNotFoundError: - pass - - # Load EDA markdown reports if available - eda_markdown_reports = {} - for member in tar.getmembers(): - if member.name.startswith("metadata/eda_report_") and member.name.endswith(".md"): - dataset_name = member.name.replace("metadata/eda_report_", "").replace(".md", "") - report_content = tar.extractfile(member).read().decode("utf-8") - eda_markdown_reports[dataset_name] = report_content - - # Collect artifact data - artifact_data = [] - for member in tar.getmembers(): - if member.name.startswith("artifacts/") and not member.isdir(): - file_data = tar.extractfile(member) - if file_data: - artifact_data.append({"name": Path(member.name).name, "data": file_data.read()}) - - # Prepare result dictionary with both raw schema dicts and processed schemas - model_data = { - "intent": intent, - "state": state, - "metadata": metadata, - "identifier": identifier, - "trainer_source": trainer_source, - "predictor_source": predictor_source, - "feature_transformer_source": feature_transformer_source, - "dataset_splitter_source": dataset_splitter_source, - "testing_source": testing_source, - "evaluation_report": evaluation_report, - "artifact_data": artifact_data, - "input_schema_dict": input_schema_dict, - "output_schema_dict": output_schema_dict, - "input_schema": input_schema, - "output_schema": output_schema, - "metrics_data": metrics_data, - } - - # Add EDA reports if found - if eda_markdown_reports: - model_data["eda_markdown_reports"] = eda_markdown_reports - - logger.debug(f"Model data successfully loaded from {path}") - return model_data - - except Exception as e: - logger.error(f"Error loading model: {e}") - raise - - -def _save_checkpoint_to_tar(model: Any, iteration: int, path: Optional[str | Path] = None) -> str: - """ - Core implementation of saving a checkpoint to a tar archive. 
- - Args: - model: The model to checkpoint (any object with required attributes) - iteration: Current iteration number - path: Optional custom path - - Returns: - Path where the checkpoint was saved - """ - # Generate default path if not provided - if path is None: - timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - checkpoint_dir = Path(config.file_storage.cache_dir) / config.file_storage.checkpoint_dir - checkpoint_dir.mkdir(parents=True, exist_ok=True) - path = checkpoint_dir / f"{model.identifier}_{timestamp}.checkpoint.tar.gz" - - # Ensure .tar.gz extension - if not str(path).endswith(".tar.gz"): - raise ValueError("Path must end with .tar.gz") - - # Ensure parent directory exists - Path(path).parent.mkdir(parents=True, exist_ok=True) - - try: - with tarfile.open(path, "w:gz") as tar: - # Add checkpoint marker - info = tarfile.TarInfo("checkpoint.marker") - content = b"1" - info.size = len(content) - tar.addfile(info, io.BytesIO(content)) - - # Add current iteration - info = tarfile.TarInfo("metadata/iteration.txt") - content = str(iteration).encode("utf-8") - info.size = len(content) - tar.addfile(info, io.BytesIO(content)) - - # Add model metadata - metadata = { - "intent": model.intent, - "state": model.state.value, - "metadata": model.metadata, - "identifier": model.identifier, - } - - # Save each metadata item separately - for key, value in metadata.items(): - if key in ["metadata"]: - info = tarfile.TarInfo(f"metadata/{key}.yaml") - safe_value = _convert_to_native_types(value) - content = yaml.safe_dump(safe_value, default_flow_style=False).encode("utf-8") - else: - info = tarfile.TarInfo(f"metadata/{key}.txt") - content = str(value).encode("utf-8") - info.size = len(content) - tar.addfile(info, io.BytesIO(content)) - - # Save schemas - for name, schema in [("input_schema", model.input_schema), ("output_schema", model.output_schema)]: - schema_dict = {name: field.annotation.__name__ for name, field in schema.model_fields.items()} - info = tarfile.TarInfo(f"schemas/{name}.yaml") - content = yaml.safe_dump(schema_dict, default_flow_style=False).encode("utf-8") - info.size = len(content) - tar.addfile(info, io.BytesIO(content)) - - # Save previously tried solutions from the ObjectRegistry if available - solutions = [] - if hasattr(model, "object_registry") and hasattr(model.object_registry, "get_all_solutions"): - solutions = model.object_registry.get_all_solutions() - - if solutions: - solutions_data = json.dumps(solutions, default=str).encode("utf-8") - info = tarfile.TarInfo("solutions/solutions.json") - info.size = len(solutions_data) - tar.addfile(info, io.BytesIO(solutions_data)) - - except Exception as e: - logger.error(f"Error saving checkpoint: {e}") - if Path(path).exists(): - Path(path).unlink() - raise - - logger.info(f"Checkpoint saved to {path}") - return str(path) - - -def _load_checkpoint_data_from_tar(path: str | Path) -> Dict[str, Any]: - """ - Core implementation of loading checkpoint data from a tar archive. 
- - Args: - path: Path to the checkpoint archive - - Returns: - Dictionary with checkpoint data - """ - if not Path(path).exists(): - raise ValueError(f"Checkpoint not found: {path}") - - try: - with tarfile.open(path, "r:gz") as tar: - # Verify this is a checkpoint archive - if "checkpoint.marker" not in [m.name for m in tar.getmembers()]: - raise ValueError(f"Archive at {path} is not a valid checkpoint") - - # Extract metadata - intent = tar.extractfile("metadata/intent.txt").read().decode("utf-8") - state = ModelState(tar.extractfile("metadata/state.txt").read().decode("utf-8")) - metadata = _load_yaml_or_json_from_tar(tar, "metadata/metadata.yaml", "metadata/metadata.json") - identifier = tar.extractfile("metadata/identifier.txt").read().decode("utf-8") - iteration = int(tar.extractfile("metadata/iteration.txt").read().decode("utf-8")) - - # Extract schema information - input_schema_dict = _load_yaml_or_json_from_tar( - tar, "schemas/input_schema.yaml", "schemas/input_schema.json" - ) - output_schema_dict = _load_yaml_or_json_from_tar( - tar, "schemas/output_schema.yaml", "schemas/output_schema.json" - ) - - # Process schemas into Pydantic models - input_schema = _process_schema_dict(input_schema_dict) - output_schema = _process_schema_dict(output_schema_dict) - - # Extract previous solutions if available - solutions = [] - if "solutions/solutions.json" in [m.name for m in tar.getmembers()]: - solutions_json = tar.extractfile("solutions/solutions.json").read().decode("utf-8") - solutions = json.loads(solutions_json) - - # Prepare result dictionary with both raw schema dicts and processed schemas - checkpoint_data = { - "intent": intent, - "identifier": identifier, - "input_schema_dict": input_schema_dict, - "output_schema_dict": output_schema_dict, - "input_schema": input_schema, - "output_schema": output_schema, - "state": state, - "metadata": metadata, - "iteration": iteration, - "solutions": solutions, - } - - logger.debug(f"Checkpoint successfully loaded from {path}") - return checkpoint_data - - except Exception as e: - logger.error(f"Error loading checkpoint: {e}") - raise - - -def _list_checkpoint_files(model_id: Optional[str] = None) -> List[str]: - """ - Core implementation of listing checkpoint files. - - Args: - model_id: Optional model identifier to filter checkpoints - - Returns: - List of checkpoint paths - """ - checkpoint_dir = Path(config.file_storage.cache_dir) / config.file_storage.checkpoint_dir - if not checkpoint_dir.exists(): - return [] - - checkpoints = list(checkpoint_dir.glob("*.checkpoint.tar.gz")) - - if model_id: - checkpoints = [cp for cp in checkpoints if model_id in cp.stem] - - return [str(cp) for cp in checkpoints] - - -def _delete_checkpoint_file(path: str | Path) -> bool: - """ - Core implementation of deleting a checkpoint file. - - Args: - path: Path to the checkpoint to delete - - Returns: - True if deletion was successful, False otherwise - """ - try: - Path(path).unlink(missing_ok=True) - return True - except Exception as e: - logger.error(f"Error deleting checkpoint: {e}") - return False - - -def _clear_checkpoint_files(model_id: Optional[str] = None, older_than_days: Optional[int] = None) -> int: - """ - Core implementation of clearing checkpoints based on criteria. 
- - Args: - model_id: Optional model identifier to filter checkpoints - older_than_days: Optional age in days to filter checkpoints - - Returns: - Number of checkpoints deleted - """ - checkpoints = _list_checkpoint_files(model_id) - deleted_count = 0 - - # Apply age filter if provided - if older_than_days is not None: - cutoff_time = datetime.datetime.now() - datetime.timedelta(days=older_than_days) - checkpoints = [cp for cp in checkpoints if Path(cp).stat().st_mtime < cutoff_time.timestamp()] - - # Delete matching checkpoints - for cp in checkpoints: - if _delete_checkpoint_file(cp): - deleted_count += 1 - - return deleted_count - - -def _process_schema_dict(schema_dict: Dict[str, str]) -> Type: - """ - Process a schema dictionary to create a Pydantic model. - - Args: - schema_dict: Dictionary mapping field names to type names - - Returns: - A Pydantic model class - """ - - def type_from_name(type_name: str) -> type: - # Map string type names to actual Python types - type_map = { - "str": str, - "int": int, - "float": float, - "bool": bool, - "List[int]": List[int], - "List[float]": List[float], - "List[str]": List[str], - "List[bool]": List[bool], - } - return type_map[type_name] - - # Create a Pydantic model from the schema dictionary - schema_name = "Schema" # This will be overridden by map_to_basemodel - return map_to_basemodel(schema_name, {name: type_from_name(type_name) for name, type_name in schema_dict.items()}) diff --git a/plexe/datasets.py b/plexe/datasets.py deleted file mode 100644 index 714fc42f..00000000 --- a/plexe/datasets.py +++ /dev/null @@ -1,191 +0,0 @@ -""" -This module provides the Dataset class, which represents a collection of data that can be real, -synthetic, or a combination of both. - -The Dataset class offers functionalities for: -- Wrapping existing datasets (e.g. pandas DataFrames). -- Generating synthetic data based on a schema. -- Augmenting real datasets with additional synthetic samples. -- Iterating and accessing data samples conveniently. - -Users can either pass raw datasets directly to models or leverage this class for dataset management and augmentation. -""" - -from typing import Iterator, Type, Dict, Optional -import logging -import pandas as pd -from pydantic import BaseModel - -logger = logging.getLogger(__name__) - -from plexe.internal.common.datasets.interface import TabularConvertible -from plexe.internal.common.provider import Provider -from plexe.internal.common.datasets.adapter import DatasetAdapter -from plexe.internal.common.utils.pydantic_utils import merge_models, map_to_basemodel -from plexe.internal.schemas.resolver import SchemaResolver -from plexe.internal.datasets.generator import DatasetGenerator as DataGenerator - - -class DatasetGenerator: - """ - Represents a dataset, which can contain real data, synthetic data, or both. - - This class provides a structured way to manage data, allowing users to: - - Wrap real datasets (pandas etc.). - - Generate synthetic data from scratch. - - Augment existing datasets with synthetic samples. - - Add new columns to existing datasets using an extended schema. 
- - Example: - >>> synthetic_dataset = DatasetGenerator( - >>> description="Synthetic reviews", - >>> provider="openai/gpt-4o", - >>> schema=MovieReviewSchema, - >>> ) - >>> synthetic_dataset.generate(100) # Generate 100 samples - >>> model.build(datasets={"train": synthetic_dataset}) - """ - - def __init__( - self, - description: str, - provider: str, - schema: Type[BaseModel] | Dict[str, type] = None, - data: pd.DataFrame = None, - ) -> None: - """ - Initialize a new DatasetGenerator. - - :param description: A human-readable description of the dataset - :param provider: LLM provider used for synthetic data generation - :param schema: The schema the data should match, if any - :param data: A dataset of real data on which to base the generation, if available - """ - # Core attributes required for dataset generation - self.description = description - self.provider = Provider(provider) - - # Internal attributes for data management - self._data: Optional[pd.DataFrame] = None - self._index = 0 - self.schema = None - - # Process schema and data inputs - if schema is not None: - # Convert schema to Pydantic BaseModel if it's a dictionary - self.schema = map_to_basemodel("data", schema) - - if data is not None: - # Convert and validate input data - data_wrapper = DatasetAdapter.coerce(data) - if isinstance(data_wrapper, TabularConvertible): - self._data = data_wrapper.to_pandas() - else: - raise ValueError("Dataset must be convertible to pandas DataFrame.") - - # If schema is provided, validate data against schema - # but only validate existing columns, not new ones being added - if schema is not None: - self._validate_schema(self._data, allow_new_columns=True) - # If no schema provided, infer it from data - else: - schemas = SchemaResolver(self.provider, self.description).resolve({"data": self._data}) - self.schema = merge_models("data", list(schemas)) - - # Initialize data generator - self.data_generator = DataGenerator(self.provider, self.description, self.schema) - - def generate(self, num_samples: int): - """ - Generate synthetic data samples or augment existing data. - - If num_samples is 0 and existing data is provided with a new schema, - this will transform the existing data to match the new schema (adding columns). - - :param num_samples: Number of new samples to generate - """ - generated_data = self.data_generator.generate(num_samples, self._data) - - if self._data is None: - self._data = generated_data - elif num_samples == 0: - # When num_samples is 0, we're just adding columns to existing data - # SimpleLLMDataGenerator.generate already handles this correctly by returning - # the existing data with new columns added, so we just replace _data directly - self._data = generated_data - else: - # When adding new rows, concatenate them with existing data - self._data = pd.concat([self._data, generated_data], ignore_index=True) - - def _validate_schema(self, data: pd.DataFrame, allow_new_columns: bool = False): - """ - Ensure data matches the schema by checking column presence. 
- - :param data: DataFrame to validate against the schema - :param allow_new_columns: If True, allow schema to have columns that don't exist in data yet - :raises ValueError: If required columns from schema are missing and not allowed - """ - for key in self.schema.model_fields.keys(): - if key not in data.columns: - if not allow_new_columns: - raise ValueError(f"Dataset does not match schema, missing column in dataset: {key}") - else: - # When augmenting with new columns, we'll skip validation for those columns - logger.debug(f"Allowing new column that will be added through augmentation: {key}") - - @property - def data(self) -> pd.DataFrame: - """ - Get the dataset as a pandas DataFrame. - - :return: The dataset as a DataFrame - :raises ValueError: If no data has been set or generated - """ - if self._data is None: - raise ValueError("No data has been set or generated. Call generate() first.") - return self._data - - def __len__(self) -> int: - """ - Get the number of samples in the dataset. - - :return: Number of rows in the dataset, or 0 if no data - """ - if self._data is not None: - return len(self._data) - return 0 - - def __iter__(self) -> Iterator: - """ - Get an iterator over the dataset rows. - - :return: Self as iterator - """ - self._index = 0 - return self - - def __next__(self): - """ - Get the next item when iterating over the dataset. - - :return: Dictionary representing the next row - :raises StopIteration: When all rows have been processed - """ - if self._data is None or self._index >= len(self): - raise StopIteration - - row = self._data.iloc[self._index].to_dict() - self._index += 1 - return row - - def __getitem__(self, index: int): - """ - Get a dataset item by index. - - :param index: Row index to retrieve - :return: Dictionary representing the row at the given index - :raises IndexError: If dataset is empty - """ - if self._data is None: - raise IndexError("Dataset is empty.") - return self._data.iloc[index].to_dict() diff --git a/plexe/fileio.py b/plexe/fileio.py deleted file mode 100644 index bee8787c..00000000 --- a/plexe/fileio.py +++ /dev/null @@ -1,277 +0,0 @@ -""" -This module provides file I/O utilities for saving and loading models to and from archive files. - -It serves as a user-facing API that delegates implementation details to core.storage. -""" - -import logging -from pathlib import Path -from typing import Any, List, Optional - -# Import core implementations -from plexe.core.storage import ( - _save_model_to_tar, - _load_model_data_from_tar, - _save_checkpoint_to_tar, - _load_checkpoint_data_from_tar, - _list_checkpoint_files, - _delete_checkpoint_file, - _clear_checkpoint_files, -) - -# Import other required types -from plexe.internal.models.entities.metric import Metric, MetricComparator, ComparisonMethod -from plexe.internal.models.entities.artifact import Artifact - -logger = logging.getLogger(__name__) - - -def save_model(model: Any, path: str | Path) -> str: - """ - Save a model to a tar archive. - - Args: - model: The model to save - path: Path where to save the model - - Returns: - Path where the model was saved - - Raises: - ValueError: If the model cannot be saved or the path is invalid - """ - try: - return _save_model_to_tar(model, path) - except Exception as e: - logger.error(f"Error saving model to {path}: {e}") - raise ValueError(f"Failed to save model: {str(e)}") from e - - -def load_model(path: str | Path): - """ - Load a model from a tar archive. 
- - Args: - path: Path to the model archive - - Returns: - A Model instance - - Raises: - ValueError: If the model file doesn't exist or is invalid - """ - # Import here to avoid circular imports - from plexe.models import Model - - try: - # Load model data using the core implementation - model_data = _load_model_data_from_tar(path) - - # Create model instance with schemas already processed by core - model = Model( - intent=model_data["intent"], - input_schema=model_data["input_schema"], - output_schema=model_data["output_schema"], - ) - - # Set model state and properties - model.state = model_data["state"] - model.metadata = model_data["metadata"] - model.identifier = model_data["identifier"] - model.trainer_source = model_data["trainer_source"] - model.predictor_source = model_data["predictor_source"] - # Set additional properties if available; these are optional for backward compatibility - model.feature_transformer_source = model_data.get("feature_transformer_source", None) - model.dataset_splitter_source = model_data.get("dataset_splitter_source", None) - model.testing_source = model_data.get("testing_source", None) - model.evaluation_report = model_data.get("evaluation_report", None) - - # Process metrics data if available - metrics_data = model_data["metrics_data"] - if metrics_data: - comparator = MetricComparator( - comparison_method=ComparisonMethod(metrics_data["comparison_method"]), target=metrics_data["target"] - ) - model.metric = Metric(name=metrics_data["name"], value=metrics_data["value"], comparator=comparator) - - # Add EDA reports to metadata if found - if "eda_markdown_reports" in model_data: - model.metadata["eda_markdown_reports"] = model_data["eda_markdown_reports"] - - # Process artifacts - artifact_handles = [] - for artifact_item in model_data["artifact_data"]: - artifact_handles.append(Artifact.from_data(artifact_item["name"], artifact_item["data"])) - - model.artifacts = artifact_handles - - # Load predictor if source code is available - if model.predictor_source: - import types - - predictor_module = types.ModuleType("predictor") - exec(model.predictor_source, predictor_module.__dict__) - model.predictor = predictor_module.PredictorImplementation(artifact_handles) - - logger.debug(f"Model successfully loaded from {path}") - return model - - except Exception as e: - logger.error(f"Error loading model from {path}: {e}") - raise ValueError(f"Failed to load model: {str(e)}") from e - - -def save_checkpoint(model: Any, iteration: int, path: Optional[str | Path] = None) -> str: - """ - Save a model checkpoint to a tar archive. - - Args: - model: The model to checkpoint - iteration: Current iteration number - path: Optional custom path - - Returns: - Path where the checkpoint was saved - - Raises: - ValueError: If the checkpoint cannot be saved or the path is invalid - """ - try: - return _save_checkpoint_to_tar(model, iteration, path) - except Exception as e: - logger.error(f"Error saving checkpoint (iteration {iteration}): {e}") - raise ValueError(f"Failed to save checkpoint: {str(e)}") from e - - -def load_checkpoint( - checkpoint_path: Optional[str | Path] = None, model_id: Optional[str] = None, latest: bool = False -) -> Any: - """ - Load a model from a checkpoint. - - This function loads a model from a checkpoint file, which can then be used - to resume a previously interrupted build process. 
- - Args: - checkpoint_path: Direct path to a checkpoint file - model_id: Model identifier to find checkpoints for - latest: If True and model_id is provided, loads the latest checkpoint for that model - - Returns: - Model instance initialized from the checkpoint - - Raises: - ValueError: If no checkpoint could be found or if the parameters are invalid - """ - # Import here to avoid circular imports - from plexe.models import Model - - try: - # Parameter validation - if checkpoint_path is None and model_id is None: - raise ValueError("Either checkpoint_path or model_id must be provided") - - if checkpoint_path is not None and model_id is not None: - raise ValueError("Only one of checkpoint_path or model_id should be provided") - - # If model_id is provided, find relevant checkpoints - if model_id is not None: - checkpoints = list_checkpoints(model_id) - if not checkpoints: - raise ValueError(f"No checkpoints found for model_id '{model_id}'") - - if latest: - # Sort by timestamp (descending) and take the first one - checkpoint_path = sorted(checkpoints, reverse=True)[0] - else: - # If not latest, we need an exact path - raise ValueError("When using model_id without latest=True, please specify a specific checkpoint_path") - - # Load the checkpoint data from tar archive (schemas are already processed) - checkpoint_data = _load_checkpoint_data_from_tar(checkpoint_path) - - # Create a new model with the basic information from the checkpoint - model = Model( - intent=checkpoint_data["intent"], - input_schema=checkpoint_data["input_schema"], - output_schema=checkpoint_data["output_schema"], - ) - - # Update model with checkpoint data - model.identifier = checkpoint_data["identifier"] - model.state = checkpoint_data["state"] - model.metadata.update(checkpoint_data["metadata"]) - - # Store the checkpoint data for resumption - model._checkpoint_data = checkpoint_data - - logger.info(f"Model loaded from checkpoint {checkpoint_path}") - return model - - except Exception as e: - if isinstance(e, ValueError): - # Preserve specific ValueError messages - raise - logger.error(f"Error loading checkpoint from {checkpoint_path}: {e}") - raise ValueError(f"Failed to load checkpoint: {str(e)}") from e - - -def list_checkpoints(model_id: Optional[str] = None) -> List[str]: - """ - List available checkpoints. - - Args: - model_id: Optional model identifier to filter checkpoints - - Returns: - List of checkpoint paths - - Raises: - ValueError: If there's an issue accessing the checkpoint directory - """ - try: - return _list_checkpoint_files(model_id) - except Exception as e: - logger.error(f"Error listing checkpoints: {e}") - raise ValueError(f"Failed to list checkpoints: {str(e)}") from e - - -def delete_checkpoint(path: str | Path) -> bool: - """ - Delete a specific checkpoint. - - Args: - path: Path to the checkpoint to delete - - Returns: - True if deletion was successful, False otherwise - - Raises: - ValueError: If there's an issue accessing the file - """ - try: - return _delete_checkpoint_file(path) - except Exception as e: - logger.error(f"Error deleting checkpoint {path}: {e}") - raise ValueError(f"Failed to delete checkpoint: {str(e)}") from e - - -def clear_checkpoints(model_id: Optional[str] = None, older_than_days: Optional[int] = None) -> int: - """ - Clear checkpoints based on filter criteria. 
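# Sketch of the checkpoint helpers above; "model-1234" is a hypothetical identifier
# and at least one checkpoint is assumed to exist for it.
from plexe.fileio import list_checkpoints, load_checkpoint, clear_checkpoints

if list_checkpoints(model_id="model-1234"):
    resumed = load_checkpoint(model_id="model-1234", latest=True)
    # ...resume the interrupted build with `resumed`...

# Housekeeping: drop checkpoints older than a week for this model
clear_checkpoints(model_id="model-1234", older_than_days=7)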
- - Args: - model_id: Optional model identifier to filter checkpoints - older_than_days: Optional age in days to filter checkpoints - - Returns: - Number of checkpoints deleted - - Raises: - ValueError: If there's an issue accessing or deleting the files - """ - try: - return _clear_checkpoint_files(model_id, older_than_days) - except Exception as e: - logger.error(f"Error clearing checkpoints: {e}") - raise ValueError(f"Failed to clear checkpoints: {str(e)}") from e diff --git a/plexe/internal/__init__.py b/plexe/internal/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/plexe/internal/common/__init__.py b/plexe/internal/common/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/plexe/internal/common/datasets/__init__.py b/plexe/internal/common/datasets/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/plexe/internal/common/datasets/adapter.py b/plexe/internal/common/datasets/adapter.py deleted file mode 100644 index 97799e08..00000000 --- a/plexe/internal/common/datasets/adapter.py +++ /dev/null @@ -1,95 +0,0 @@ -""" -This module provides the DatasetAdapter class, which converts various dataset formats into standardized Dataset -objects. This enables the library to accept multiple dataset types as inputs, while ensuring consistency and -interoperability. -""" - -import logging -from typing import Any, Dict, List, Optional - -import pandas as pd - -from plexe.internal.common.datasets.interface import Dataset -from plexe.internal.common.datasets.tabular import TabularDataset - - -logger = logging.getLogger(__name__) - - -# This dictionary defines the mapping of dataset types to their respective wrapper classes. -# Note: this could be replaced with a separate DatasetRegistry class in the future if we need dynamic registration -# TODO: Add more dataset types and their corresponding classes -DATASET_REGISTRY_MAP = { - "tabular": TabularDataset, -} - - -class DatasetAdapter: - """ - A utility class for converting different dataset formats into standardized Dataset objects. - - This class provides methods for handling structured datasets, ensuring compatibility with downstream - processing steps in the plexe library. - """ - - @staticmethod - def coerce(dataset: Any) -> Dataset: - """ - Converts a dataset to a standardized format. - - This method attempts to convert the input dataset to a Dataset implementation if a suitable one is - available. If dataset_type is None, it tries to auto-detect the appropriate type. For backward compatibility, - it falls back to returning a pandas DataFrame if no appropriate Dataset is available. 
- - :param dataset: The dataset to convert - :returns: A Dataset implementation or pandas DataFrame - :raises ValueError: If the dataset type is unsupported - """ - # If dataset is already a Dataset, return it directly - if isinstance(dataset, Dataset): - return dataset - - # Determine the dataset type - dataset_type = DatasetAdapter.auto_detect(dataset) - - # If we have a suitable Dataset implementation, use it - if dataset_type is not None: - try: - return DATASET_REGISTRY_MAP[dataset_type](dataset) - except (ValueError, ImportError) as e: - # Log the error but continue with the fallback - logger.error(f"Failed to create {dataset_type} dataset: {str(e)}") - raise ValueError(f"Failed to convert dataset of type {type(dataset)} to {dataset_type}: {str(e)}") - else: - raise ValueError(f"Unsupported dataset type: {type(dataset)}") - - @classmethod - def auto_detect(cls, data: Any) -> Optional[str]: - """ - Auto-detect the appropriate dataset type for the given data. - - :param data: The data to detect the appropriate dataset type for - :returns: Name of the detected dataset implementation, or None if no appropriate type was found - """ - if isinstance(data, pd.DataFrame): - return "tabular" - - # TODO: Add more auto-detection logic for other data types - return None - - @staticmethod - def features(datasets: Dict[str, Dataset]) -> List[str]: - """ - Extracts a flat list of feature names from the given datasets. - - This method is useful for gathering meaningful names for all features available across multiple datasets, - which can be passed to a downstream LLM call or other processing steps. - - :param datasets: A dictionary of dataset names and their corresponding datasets - :returns: A list of feature names - :raises ValueError: If the dataset type is unsupported - """ - features = [] - for name, dataset in datasets.items(): - features.extend(f"{name}.{feature}" for feature in dataset.structure.features) - return features diff --git a/plexe/internal/common/datasets/interface.py b/plexe/internal/common/datasets/interface.py deleted file mode 100644 index 382d9cda..00000000 --- a/plexe/internal/common/datasets/interface.py +++ /dev/null @@ -1,201 +0,0 @@ -""" -This module defines the core interfaces for dataset handling in plexe. - -The interfaces provide a consistent API for working with different types of datasets, regardless of their -underlying implementation. They define operations like splitting, sampling, and serialization that are common -across all dataset types. -""" - -from abc import ABC, abstractmethod -from dataclasses import dataclass, field -from typing import Tuple, Type, TypeVar, Any, Literal, Dict, List - -import numpy as np -import pandas as pd - -# Type variable for the dataset interface -T = TypeVar("T", bound="Dataset") - - -@dataclass -class DatasetStructure: - """ - Descriptor for the dataset structure. - - This descriptor provides metadata about the dataset's structure. The 'modality' field indicates the broad - type of underlying data structure (e.g., tabular, tensor, etc.) in a framework-agnostic way (that is, it - does not distinguish between PyTorch tensors and TensorFlow tensors, for example). The 'details' field - contains the description of the dataset's structure, which can vary depending on the type of dataset. 
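# Minimal sketch of DatasetAdapter: coercing a pandas DataFrame and listing feature
# names across datasets; the DataFrame contents are illustrative.
import pandas as pd
from plexe.internal.common.datasets.adapter import DatasetAdapter

df = pd.DataFrame({"age": [25, 32], "income": [40_000, 52_000]})
dataset = DatasetAdapter.coerce(df)                 # auto-detected as "tabular"
print(DatasetAdapter.features({"train": dataset}))  # ['train.age', 'train.income']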
- """ - - # todo: expand this when more dataset types are added - modality: Literal["table", "tensor", "other"] = field() - features: List[str] = field() - details: Dict[str, Any] = field() - - -class Dataset(ABC): - """ - Base interface for all dataset implementations with universal operations. - - This interface defines operations that all dataset implementations must support, regardless of the underlying - data format. These include splitting datasets into train/validation/test sets, sampling data, and serialization. - """ - - @abstractmethod - def split( - self, - train_ratio: float = 0.7, - val_ratio: float = 0.15, - test_ratio: float = 0.15, - stratify_column: str = None, - random_state: int = None, - ) -> Tuple[T, T, T]: - """ - Split dataset into train, validation and test sets. - - :param train_ratio: Proportion of data to use for training - :param val_ratio: Proportion of data to use for validation - :param test_ratio: Proportion of data to use for testing - :param stratify_column: Column to use for stratified splitting - :param random_state: Random seed for reproducibility - :returns: A tuple of (train_dataset, val_dataset, test_dataset) - """ - pass - - @abstractmethod - def sample(self, n: int = None, frac: float = None, replace: bool = False, random_state: int = None) -> T: - """ - Sample records from dataset. - - :param n: Number of samples to take - :param frac: Fraction of dataset to sample - :param replace: Whether to sample with replacement - :param random_state: Random seed for reproducibility - :returns: A new dataset containing the sampled data - """ - pass - - @abstractmethod - def to_bytes(self) -> bytes: - """ - Serialize dataset to bytes. - - :returns: Serialized dataset as bytes - """ - pass - - @classmethod - @abstractmethod - def from_bytes(cls: Type[T], data: bytes) -> T: - """ - Deserialize dataset from bytes. - - :param data: Serialized dataset as bytes - :returns: Deserialized dataset - """ - pass - - @property - @abstractmethod - def structure(self) -> DatasetStructure: - """ - Return a descriptor of the dataset's structure. - - The structure descriptor has different details depending on the type of underlying dataset. This - method is used to provide human-readable and LLM-readable information about the dataset's structure. - - :returns: Schema definition for the dataset - """ - pass - - @abstractmethod - def __len__(self) -> int: - """ - Return the number of items in the dataset. - - :returns: Number of items in the dataset - """ - pass - - @abstractmethod - def __getitem__(self, index: int) -> Any: - """ - Get an item from the dataset by index. - - :param index: Index of the item to retrieve - :returns: The item at the specified index - """ - pass - - -class TabularConvertible(ABC): - """ - Interface for datasets that can be converted to tabular formats. - - This interface defines methods for converting a dataset to common tabular - data formats like pandas DataFrames and numpy arrays. - """ - - @abstractmethod - def to_pandas(self) -> pd.DataFrame: - """ - Convert to pandas DataFrame. - - :returns: Dataset as pandas DataFrame - """ - pass - - @abstractmethod - def to_numpy(self) -> np.ndarray: - """ - Convert to numpy array. - - :returns: Dataset as numpy array - """ - pass - - -class TorchConvertible(ABC): - """ - Interface for datasets that can be converted to PyTorch formats. - - This interface defines methods for converting a dataset to PyTorch-specific - data structures like Datasets and Tensors. 
- """ - - @abstractmethod - def to_torch_dataset(self): - """ - Convert to PyTorch Dataset. - - :returns: Dataset as PyTorch Dataset - """ - pass - - @abstractmethod - def to_torch_tensor(self): - """ - Convert to PyTorch Tensor. - - :returns: Dataset as PyTorch Tensor - """ - pass - - -class TensorflowConvertible(ABC): - """ - Interface for datasets that can be converted to TensorFlow formats. - - This interface defines methods for converting a dataset to TensorFlow-specific - data structures like tf.data.Dataset. - """ - - @abstractmethod - def to_tf_dataset(self): - """ - Convert to TensorFlow Dataset. - - :returns: Dataset as TensorFlow Dataset - """ - pass diff --git a/plexe/internal/common/datasets/tabular.py b/plexe/internal/common/datasets/tabular.py deleted file mode 100644 index ebedc9fc..00000000 --- a/plexe/internal/common/datasets/tabular.py +++ /dev/null @@ -1,219 +0,0 @@ -""" -This module provides the TabularDataset implementation, which handles tabular data like pandas DataFrames -and numpy arrays. It implements the DatasetInterface and provides methods for splitting, sampling, and -serialization of tabular data. -""" - -import io -import pandas as pd -from typing import Tuple, Optional, Any - -import numpy as np -from plexe.internal.common.datasets.interface import Dataset, TabularConvertible, DatasetStructure - - -class TabularDataset(Dataset, TabularConvertible): - """ - Dataset implementation for tabular data. - - TabularDataset wraps pandas DataFrames and provides methods for common dataset operations like splitting, - sampling, and serialization. It also implements the TabularConvertible interface to allow conversion to - pandas and numpy formats. - """ - - def __init__(self, data: pd.DataFrame): - self._data = self._validate(data) - - @staticmethod - def _validate(data: Any) -> pd.DataFrame: - """Ensure that the input is a pandas DataFrame.""" - if isinstance(data, pd.DataFrame): - return data.copy() - raise ValueError(f"TabularDataset only supports pandas DataFrame input, got {type(data)}") - - def split( - self, - train_ratio: float = 0.7, - val_ratio: float = 0.15, - test_ratio: float = 0.15, - stratify_column: Optional[str] = None, - random_state: Optional[int] = None, - is_time_series: bool = False, - time_index_column: Optional[str] = None, - ) -> Tuple["TabularDataset", "TabularDataset", "TabularDataset"]: - """ - Split dataset into train, validation and test sets. 
- - :param train_ratio: Proportion of data to use for training - :param val_ratio: Proportion of data to use for validation - :param test_ratio: Proportion of data to use for testing - :param stratify_column: Column to use for stratified splitting (not used for time series) - :param random_state: Random seed for reproducibility (not used for time series) - :param is_time_series: Whether the data is chronological time series data - :param time_index_column: Column name that represents the time index, required if is_time_series=True - :returns: A tuple of (train_dataset, val_dataset, test_dataset) - :raises ValueError: If ratios don't sum to approximately 1.0 or if time_index_column is missing for time series - """ - if abs(train_ratio + val_ratio + test_ratio - 1.0) > 1e-10: - raise ValueError("Split ratios must sum to 1.0") - - # Handle time series data - if is_time_series: - if not time_index_column: - raise ValueError("time_index_column must be provided when is_time_series=True") - - if time_index_column not in self._data.columns: - raise ValueError(f"time_index_column '{time_index_column}' not found in dataset columns") - - # Sort by time index - sorted_data = self._data.sort_values(by=time_index_column).reset_index(drop=True) - - # Calculate split indices - n_samples = len(sorted_data) - train_end = int(n_samples * train_ratio) - val_end = train_end + int(n_samples * val_ratio) - - # Split the data sequentially - train_data = sorted_data.iloc[:train_end] - val_data = sorted_data.iloc[train_end:val_end] - test_data = sorted_data.iloc[val_end:] - - # Handle edge cases for empty splits - empty_df = pd.DataFrame(columns=self._data.columns) - if val_ratio < 1e-10: - val_data = empty_df - if test_ratio < 1e-10: - test_data = empty_df - - return TabularDataset(train_data), TabularDataset(val_data), TabularDataset(test_data) - - # Regular random splitting for non-time series data - from sklearn.model_selection import train_test_split - - # Handle all-data-to-train edge case - if val_ratio < 1e-10 and test_ratio < 1e-10: - return ( - TabularDataset(self._data), - TabularDataset(pd.DataFrame(columns=self._data.columns)), - TabularDataset(pd.DataFrame(columns=self._data.columns)), - ) - elif val_ratio < 1e-10: - train_data, test_data = train_test_split( - self._data, - test_size=test_ratio / (train_ratio + test_ratio), - stratify=self._data[stratify_column] if stratify_column else None, - random_state=random_state, - ) - return ( - TabularDataset(train_data), - TabularDataset(pd.DataFrame(columns=self._data.columns)), - TabularDataset(test_data), - ) - elif test_ratio < 1e-10: - train_data, val_data = train_test_split( - self._data, - test_size=val_ratio / (train_ratio + val_ratio), - stratify=self._data[stratify_column] if stratify_column else None, - random_state=random_state, - ) - return ( - TabularDataset(train_data), - TabularDataset(val_data), - TabularDataset(pd.DataFrame(columns=self._data.columns)), - ) - - # Standard 3-way split - train_data, temp_data = train_test_split( - self._data, - test_size=(val_ratio + test_ratio), - stratify=self._data[stratify_column] if stratify_column else None, - random_state=random_state, - ) - val_ratio_adjusted = val_ratio / (val_ratio + test_ratio) - val_data, test_data = train_test_split( - temp_data, - test_size=(1 - val_ratio_adjusted), - stratify=temp_data[stratify_column] if stratify_column else None, - random_state=random_state, - ) - return TabularDataset(train_data), TabularDataset(val_data), TabularDataset(test_data) - - def sample( - self, n: 
int = None, frac: float = None, replace: bool = False, random_state: int = None - ) -> "TabularDataset": - """ - Sample records from dataset. - - :param n: number of samples to take - :param frac: fraction of dataset to sample - :param replace: whether to sample with replacement - :param random_state: random seed for reproducibility - :return: a new dataset containing the sampled data - """ - return TabularDataset(self._data.sample(n=n, frac=frac, replace=replace, random_state=random_state)) - - def to_bytes(self) -> bytes: - """ - Serialize the dataset to bytes using Parquet format. - - :return: byte representation of the dataset - """ - try: - import pyarrow as pa - import pyarrow.parquet as pq - - buffer = io.BytesIO() - table = pa.Table.from_pandas(self._data) - pq.write_table(table, buffer) - return buffer.getvalue() - except Exception as e: - raise RuntimeError("Failed to serialize DataFrame to bytes") from e - - @classmethod - def from_bytes(cls, data: bytes) -> "TabularDataset": - """ - Deserialize bytes back into a TabularDataset. - - :param data: byte representation of dataset - :return: TabularDataset instance - """ - try: - import pyarrow.parquet as pq - - buffer = io.BytesIO(data) - table = pq.read_table(buffer) - return cls(table.to_pandas()) - except Exception as e: - raise RuntimeError("Failed to deserialize bytes to DataFrame") from e - - @property - def structure(self) -> DatasetStructure: - """ - Return structural metadata for the dataset. - - :return: DatasetStructure object describing features and shape - """ - return DatasetStructure( - modality="table", - features=list(self._data.columns), - details={ - "num_rows": len(self._data), - "num_columns": self._data.shape[1], - "column_names": list(self._data.columns), - "column_types": self._data.dtypes.astype(str).to_dict(), - }, - ) - - def to_pandas(self) -> pd.DataFrame: - """Return a copy of the dataset as a pandas DataFrame.""" - return self._data - - def to_numpy(self) -> np.ndarray: - """Convert the dataset to a NumPy array.""" - return self._data.to_numpy() - - def __len__(self) -> int: - return len(self._data) - - def __getitem__(self, item: Any) -> Any: - return self._data.iloc[item] diff --git a/plexe/internal/common/provider.py b/plexe/internal/common/provider.py deleted file mode 100644 index 5c1030db..00000000 --- a/plexe/internal/common/provider.py +++ /dev/null @@ -1,175 +0,0 @@ -""" -This module defines the base class for LLM providers and includes -logging and retry mechanisms for querying the providers. -""" - -import logging -import textwrap -from typing import Type, Optional - -import litellm -from litellm import completion, supports_response_schema -from litellm.exceptions import RateLimitError, ServiceUnavailableError -from pydantic import BaseModel -from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type - -logger = logging.getLogger(__name__) - - -class ProviderConfig: - """ - Configuration class for specifying different LLM providers for various agent roles. - - This allows for granular control of which providers/models are used for different - parts of the multi-agent system. 
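# Sketch of TabularDataset splitting and serialization as defined above; assumes
# pyarrow is installed for the Parquet-based byte round trip.
import pandas as pd
from plexe.internal.common.datasets.tabular import TabularDataset

df = pd.DataFrame({"ts": pd.date_range("2024-01-01", periods=100, freq="D"), "y": range(100)})
ds = TabularDataset(df)

# Chronological split for time series data
train, val, test = ds.split(0.7, 0.15, 0.15, is_time_series=True, time_index_column="ts")

# Byte-level round trip
restored = TabularDataset.from_bytes(ds.to_bytes())
assert len(restored) == len(ds)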
- - Attributes: - default_provider: The default provider to use when specific ones aren't set - orchestrator_provider: Provider for the orchestrator/manager agent - research_provider: Provider for the ML Research Scientist agent - engineer_provider: Provider for the ML Engineer agent - ops_provider: Provider for the ML Ops Engineer agent - tool_provider: Provider for tool operations - """ - - def __init__( - self, - default_provider: str = "openai/gpt-4o-mini", - orchestrator_provider: Optional[str] = None, - research_provider: Optional[str] = None, - engineer_provider: Optional[str] = None, - ops_provider: Optional[str] = None, - tool_provider: Optional[str] = None, - ): - # Default provider is used when specific ones aren't set - self.default_provider = default_provider - - # Agent-specific providers - self.orchestrator_provider = orchestrator_provider or default_provider - self.research_provider = research_provider or default_provider - self.engineer_provider = engineer_provider or default_provider - self.ops_provider = ops_provider or default_provider - - # Provider for tool operations - self.tool_provider = tool_provider or default_provider - - def __repr__(self) -> str: - return ( - f"ProviderConfig(default={self.default_provider}, " - f"orchestrator={self.orchestrator_provider}, " - f"research={self.research_provider}, " - f"engineer={self.engineer_provider}, " - f"ops={self.ops_provider}, " - f"tool={self.tool_provider})" - ) - - -class Provider: - """ - Base class for LiteLLM provider. - """ - - def __init__(self, model: str = None): - default_model = "openai/gpt-4o-mini" - self.model = model or default_model - if "/" not in self.model: - self.model = default_model - logger.warning(f"Model name should be in the format 'provider/model', using default model: {default_model}") - # Check if the model supports json mode - if "response_format" not in litellm.get_supported_openai_params(model=self.model): - raise ValueError(f"Model {self.model} does not support passing response_format") - if not supports_response_schema(model=self.model): - raise ValueError(f"Model {self.model} does not support response schema") - - def _make_completion_call(self, messages, response_format): - """Helper method to make the actual API call with built-in retries for rate limits""" - response = completion(model=self.model, messages=messages, response_format=response_format) - - if not response.choices[0].message.content: - raise ValueError("Empty response from provider") - - return response.choices[0].message.content - - def query( - self, - system_message: str, - user_message: str, - response_format: Type[BaseModel] = None, - retries: int = 3, - backoff: bool = True, - ) -> str: - """ - Method to query the provider using litellm.completion. - - :param [str] system_message: The system message to send to the provider. - :param [str] user_message: The user message to send to the provider. - :param [Type[BaseModel]] response_format: A pydantic BaseModel class representing the response format. - :param [int] retries: The number of times to retry the request. Defaults to 3. - :param [bool] backoff: Whether to use exponential backoff when retrying. Defaults to True. - :return [str]: The response from the provider. 
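# Sketch of ProviderConfig: per-role overrides fall back to the default provider;
# the model identifiers are examples only.
from plexe.internal.common.provider import ProviderConfig

cfg = ProviderConfig(
    default_provider="openai/gpt-4o-mini",
    engineer_provider="anthropic/claude-3-5-sonnet-20241022",
)
print(cfg.orchestrator_provider)  # "openai/gpt-4o-mini" (fallback)
print(cfg.engineer_provider)      # "anthropic/claude-3-5-sonnet-20241022"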
- """ - self._log_request(system_message, user_message, self.__class__.__name__) - - messages = [{"role": "system", "content": system_message}, {"role": "user", "content": user_message}] - - try: - # Handle general errors with standard retries - if backoff: - - @retry(stop=stop_after_attempt(retries), wait=wait_exponential(multiplier=2)) - def call_with_backoff_retry_all_errors(): - @retry( - stop=stop_after_attempt(5), - wait=wait_exponential(multiplier=2, min=4), - retry=retry_if_exception_type((RateLimitError, ServiceUnavailableError)), - ) - def call_with_backoff_retry_service_errors(): - return self._make_completion_call(messages, response_format) - - return call_with_backoff_retry_service_errors() - - r = call_with_backoff_retry_all_errors() - else: - r = self._make_completion_call(messages, response_format) - - self._log_response(r, self.__class__.__name__) - return r - except Exception as e: - self._log_error(e) - raise e - - @staticmethod - def _log_request(system_message: str, user_message: str, model): - """ - Logs the request to the provider. - - :param [str] system_message: The system message to send to the provider. - :param [str] user_message: The user message to send to the provider. - """ - logger.debug( - ( - # String interpolation because Python <3.12 does not support backslashes inside f-strings curly braces - f"Requesting chat completion from {model} with messages: " - + textwrap.shorten(system_message.replace("\n", " "), 30) - + ", " - + textwrap.shorten(user_message.replace("\n", " "), 30) - ) - ) - - @staticmethod - def _log_response(response, model): - """ - Logs the response from the provider. - - :param [str] response: The response from the provider. - """ - logger.debug(f"Received completion from {model}: {textwrap.shorten(response, 30)}") - - @staticmethod - def _log_error(error): - """ - Logs the error from the provider. - - :param [str] error: The error from the provider. - """ - logger.error(f"Error querying provider: {error}") diff --git a/plexe/internal/common/utils/__init__.py b/plexe/internal/common/utils/__init__.py deleted file mode 100644 index e1b25cfb..00000000 --- a/plexe/internal/common/utils/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -""" -Common utility functions for the plexe library. -""" diff --git a/plexe/internal/common/utils/agents.py b/plexe/internal/common/utils/agents.py deleted file mode 100644 index 26f1b9a4..00000000 --- a/plexe/internal/common/utils/agents.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -This module provides utilities for working with agents defined using the smolagents library. -""" - -import yaml -import importlib -from plexe.config import config - - -def get_prompt_templates(base_template_name: str, override_template_name: str) -> dict: - """ - Given the name of a smolagents prompt template (the 'base template') and a plexe prompt template - (the 'overriding template'), this function loads both templates and returns a merged template in which - all keys from the overriding template overwrite the matching keys in the base template. 
- """ - base_template: dict = yaml.safe_load( - importlib.resources.files("smolagents.prompts").joinpath(base_template_name).read_text() - ) - override_template: dict = yaml.safe_load( - str( - importlib.resources.files("plexe") - .joinpath("templates/prompts/agent") - .joinpath(override_template_name) - .read_text() - ).replace("{{allowed_packages}}", str(config.code_generation.allowed_packages)) - ) - - # Recursively merge two dictionaries to ensure deep merging - def merge_dicts(base: dict, override: dict) -> dict: - for key, value in override.items(): - if isinstance(value, dict) and key in base and isinstance(base[key], dict): - base[key] = merge_dicts(base[key], value) - else: - base[key] = value - return base - - return merge_dicts(base_template, override_template) diff --git a/plexe/internal/common/utils/chain_of_thought/__init__.py b/plexe/internal/common/utils/chain_of_thought/__init__.py deleted file mode 100644 index c8e4c5fa..00000000 --- a/plexe/internal/common/utils/chain_of_thought/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -""" -Chain of thought capturing and logging for agent systems. - -This package provides a framework-agnostic way to capture, format, and display -the chain of thought reasoning from different agent frameworks. -""" - -from plexe.internal.common.utils.chain_of_thought.protocol import StepSummary, ToolCall -from plexe.internal.common.utils.chain_of_thought.adapters import extract_step_summary_from_smolagents -from plexe.internal.common.utils.chain_of_thought.callable import ChainOfThoughtCallable -from plexe.internal.common.utils.chain_of_thought.emitters import ( - ChainOfThoughtEmitter, - ConsoleEmitter, - LoggingEmitter, - MultiEmitter, -) - -__all__ = [ - "StepSummary", - "ToolCall", - "extract_step_summary_from_smolagents", - "ChainOfThoughtCallable", - "ChainOfThoughtEmitter", - "ConsoleEmitter", - "LoggingEmitter", - "MultiEmitter", -] diff --git a/plexe/internal/common/utils/chain_of_thought/adapters.py b/plexe/internal/common/utils/chain_of_thought/adapters.py deleted file mode 100644 index b3e23e08..00000000 --- a/plexe/internal/common/utils/chain_of_thought/adapters.py +++ /dev/null @@ -1,66 +0,0 @@ -""" -This module provides adapters for extracting step information from different agent frameworks. - -The functions in this module are designed to take as input the step outputs provided by a particular -agent framework, and convert them into a framework-agnostic representation that is used throughout Plexe. -This module should be extended as new agent frameworks are added to Plexe, or modified if the agent -frameworks change their output formats. -""" - -from typing import Any - -from .protocol import StepSummary, ToolCall - - -def extract_step_summary_from_smolagents(step: Any, agent: Any) -> StepSummary: - """ - Extract step summary from a SmoLAgents step object. 
- - Args: - step: A SmoLAgents step object - agent: The agent that performed the step - - Returns: - A framework-agnostic StepSummary object - """ - # Get agent name - agent_name = getattr(agent, "name", agent.__class__.__name__) - - # Extract common properties - step_number = getattr(step, "step_number", None) - step_type = step.__class__.__name__ - error = str(getattr(step, "error", "")) or None - - # Extract model output from various step types - model_output = None - if hasattr(step, "model_output_message") and step.model_output_message: - model_output = getattr(step.model_output_message, "content", None) - - # Extract tool calls - tool_calls = [] - if hasattr(step, "tool_calls") and step.tool_calls: - tool_calls = [ToolCall(name=call.name, args=call.arguments) for call in step.tool_calls] - - # Extract observations and results based on step type - observations = getattr(step, "observations", None) - result = getattr(step, "action_output", None) - - # Handle specific step types - if hasattr(step, "code_block") and getattr(step, "code_block", None): - # Extract thought and code from CodeActionStep - code_block = getattr(step, "code_block", None) - if code_block: - # Don't show full code in chain of thought, just indicate it's there - observations = f"[Code block executed: {len(code_block)} characters]" - - # Create and return the step summary - return StepSummary( - step_number=step_number, - step_type=step_type, - agent_name=agent_name, - model_output=model_output, - tool_calls=tool_calls, - observations=observations, - result=result, - error=error, - ) diff --git a/plexe/internal/common/utils/chain_of_thought/callable.py b/plexe/internal/common/utils/chain_of_thought/callable.py deleted file mode 100644 index 6b1442ed..00000000 --- a/plexe/internal/common/utils/chain_of_thought/callable.py +++ /dev/null @@ -1,209 +0,0 @@ -""" -This module defines Callables for capturing and formatting agent chain of thought. - -The classes in this module are designed to be used as "plug-ins" to agent frameworks, enabling the production -of a user-friendly output of the agent's reasoning process. The output can be used for debugging, logging, or -user feedback during agent execution. -""" - -import json -import logging -from typing import Any, List, Optional, Tuple - -from pydantic import BaseModel, Field - -from ...provider import Provider -from plexe.config import prompt_templates -from .protocol import StepExtractor, StepSummary -from .adapters import extract_step_summary_from_smolagents -from .emitters import ChainOfThoughtEmitter, ConsoleEmitter - -logger = logging.getLogger(__name__) - - -class ChainOfThoughtCallable: - """ - Callable that captures and formats agent chain of thought. - - This callable can be attached to agent frameworks to capture - each step of the agent's reasoning process and format it for - user-friendly output. - """ - - def __init__( - self, - emitter: Optional[ChainOfThoughtEmitter] = None, - extractor: StepExtractor = extract_step_summary_from_smolagents, - ): - """ - Initialize the chain of thought callable. - - Args: - emitter: The emitter to use for outputting chain of thought - extractor: Function that extracts step information from the agent framework - """ - self.emitter = emitter or ConsoleEmitter() - self.extractor = extractor - self.steps: List[StepSummary] = [] - - def __call__(self, step: Any, agent: Any = None) -> None: - """ - Process a step from an agent. 
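# Sketch of wiring the chain-of-thought callable into an agent run; the smolagents
# wiring shown in the comment is schematic and its exact API may differ.
from plexe.internal.common.utils.chain_of_thought import ChainOfThoughtCallable, LoggingEmitter

cot = ChainOfThoughtCallable(emitter=LoggingEmitter())
# e.g. agent = smolagents.CodeAgent(..., step_callbacks=[cot])  # hypothetical wiring
# ...after the agent run, inspect the captured steps:
for step in cot.get_full_chain_of_thought():
    print(step.agent_name, step.step_type, step.friendly_title)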
- - Args: - step: The step object from the agent framework - agent: The agent that performed the step - """ - try: - # Extract step summary - summary = self.extractor(step, agent) - - # Generate friendly title and summary if not already present - if summary.friendly_title is None or summary.friendly_summary is None: - friendly_title, friendly_summary = _generate_friendly_summary(summary) - summary.friendly_title = friendly_title - summary.friendly_summary = friendly_summary - - # Store the step for later retrieval - self.steps.append(summary) - - # Emit the step information - self._emit_step(summary) - - except Exception as e: - # Log full stack trace at debug level - import traceback - - logger.debug(f"Error processing agent step: {str(e)}\n{traceback.format_exc()}") - - # Log a shorter message at warning level - logger.warning(f"Error processing agent step: {str(e)[:50]}") - - def _emit_step(self, summary: StepSummary) -> None: - """ - Emit a step to the configured emitter. - - Args: - summary: The step summary to emit - """ - # If we have friendly title and summary, emit those in a consolidated message - if summary.friendly_title and summary.friendly_summary: - # Emit friendly step header with title and summary together - # This ensures they appear as a single node in the tree - self.emitter.emit_thought(summary.agent_name, f"💡 {summary.friendly_title}\n💭 {summary.friendly_summary}") - return - - # Fall back to verbose output if friendly version not available - - # Emit step header - step_header = f"🧠 {summary.step_type}" - if summary.step_number is not None: - step_header += f" #{summary.step_number}" - - self.emitter.emit_thought(summary.agent_name, step_header) - - # Emit model output separately if available - if summary.model_output: - thought_text = summary.model_output[:500] - if len(summary.model_output) > 500: - thought_text += "..." - self.emitter.emit_thought(summary.agent_name, f"💭 Thought: {thought_text}") - - # Emit tool calls one by one for better visualization - for call in summary.tool_calls: - self.emitter.emit_thought(summary.agent_name, f"🔧 Tool: {call.name}({call.args})") - - # Emit observations - if summary.observations: - observation_text = summary.observations[:500] - if len(summary.observations) > 500: - observation_text += "..." - self.emitter.emit_thought(summary.agent_name, f"📡 Observed: {observation_text}") - - # Emit result - if summary.result: - result_text = str(summary.result)[:500] - if len(str(summary.result)) > 500: - result_text += "..." - self.emitter.emit_thought(summary.agent_name, f"📦 Result: {result_text}") - - # Emit error if any - if summary.error: - self.emitter.emit_thought(summary.agent_name, f"❌ Error: {summary.error}") - - def get_full_chain_of_thought(self) -> List[StepSummary]: - """ - Get the full chain of thought captured so far. - - Returns: - The list of step summaries - """ - return self.steps - - def clear(self) -> None: - """Clear all captured steps.""" - self.steps = [] - - -def _generate_friendly_summary(summary: StepSummary) -> Tuple[str, str]: - """ - Generate a user-friendly title and summary for a step using LLM. 
- - Args: - summary: The step summary to generate a friendly summary for - - Returns: - A tuple of (friendly_title, friendly_summary) - """ - - class FriendlySummaryResponse(BaseModel): - """Response format for generating friendly step summaries.""" - - title: str = Field(description="A short, friendly title (3-7 words) that captures the essence of what happened") - summary: str = Field(description="A concise summary (1-2 sentences) that explains the step in plain language") - - # Create a context string that summarizes the step for the LLM - context_parts = [f"Step Type: {summary.step_type}"] - - if summary.model_output: - context_parts.append(f"Thought: {summary.model_output}") - - if summary.tool_calls: - for call in summary.tool_calls: - context_parts.append(f"Tool: {call.name}({json.dumps(call.args)})") - - if summary.observations: - context_parts.append(f"Observation: {summary.observations}") - - if summary.result: - context_parts.append(f"Result: {str(summary.result)}") - - if summary.error: - context_parts.append(f"Error: {summary.error}") - - context = "\n".join(context_parts) - - # Get the prompt template - system_message = prompt_templates.cot_system() - user_message = prompt_templates.cot_summarize(context) - - try: - # Use the Provider to get a structured response - provider = Provider() - response = provider.query( - system_message=system_message, user_message=user_message, response_format=FriendlySummaryResponse - ) - - # Parse the response to get JSON - response_data = json.loads(response) - return response_data["title"], response_data["summary"] - except Exception as e: - # Log full stack trace at debug level - import traceback - - logger.debug(f"Error generating friendly summary: {str(e)}\n{traceback.format_exc()}") - - # Log shorter message at warning level - logger.warning(f"Error generating friendly summary: {str(e)[:50]}") - - return f"{summary.step_type}", f"Step {summary.step_number or 0} of type {summary.step_type}" diff --git a/plexe/internal/common/utils/chain_of_thought/emitters.py b/plexe/internal/common/utils/chain_of_thought/emitters.py deleted file mode 100644 index 3cb882b6..00000000 --- a/plexe/internal/common/utils/chain_of_thought/emitters.py +++ /dev/null @@ -1,207 +0,0 @@ -""" -This module defines Emitters for outputting chain of thought information. - -The emitters are responsible for formatting and outputting the chain of thought of the agents to output -locations such as the console or a logging system. The emitters can be used in various contexts, such as -logging agent actions, debugging, or providing user feedback during agent execution. -""" - -import logging -import sys -from abc import ABC, abstractmethod -from datetime import datetime -from typing import List, TextIO - -logger = logging.getLogger(__name__) - - -class ChainOfThoughtEmitter(ABC): - """ - Abstract base class for chain of thought emitters. - - Emitters are responsible for outputting chain of thought - information in a user-friendly format. - """ - - @abstractmethod - def emit_thought(self, agent_name: str, message: str) -> None: - """ - Emit a thought from an agent. - - Args: - agent_name: The name of the agent emitting the thought - message: The thought message - """ - pass - - -class ConsoleEmitter(ChainOfThoughtEmitter): - """ - Emitter that outputs chain of thought to the console with rich formatting. - """ - - def __init__(self, output: TextIO = sys.stdout): - """ - Initialize the console emitter with Rich support. 
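# Sketch of a custom emitter built on the abstract base class above; it simply
# collects thoughts in memory for later inspection (illustrative only).
class InMemoryEmitter(ChainOfThoughtEmitter):
    def __init__(self):
        self.records = []

    def emit_thought(self, agent_name: str, message: str) -> None:
        self.records.append((agent_name, message))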
- - Args: - output: The text IO to write to - """ - self.output = output - - # Initialize Rich components - try: - from rich.console import Console - - # Force colorization even in environments like WSL where auto-detection might fail - self.console = Console(file=output, force_terminal=True, color_system="auto", highlight=False) - self.step_count = 0 - self.has_rich = True - except ImportError: - # Fall back to basic output if Rich isn't available - self.has_rich = False - - def emit_thought(self, agent_name: str, message: str) -> None: - """ - Emit a thought to the console using Rich tree visualization. - - Args: - agent_name: The name of the agent emitting the thought - message: The thought message - """ - if not self.has_rich: - # Fall back to basic output - self.output.write(f"[{agent_name}] {message}\n") - self.output.flush() - return - - try: - # Import Rich components for type annotations - - # Track step count for timeline - self.step_count += 1 - timestamp = datetime.now().strftime("%H:%M:%S") - - # Create a separate tree for each output item, but track agent names - # This simplifies the display and avoids issues with nested trees - agent_color = self._get_agent_color(agent_name) - - # Add agent and step information in a cleaner format - # Use plain print with styling - make it more compact - step_info = f"Step {self.step_count}" - - # Add a blank line before new agent name for better readability - # but only if this isn't the first step - if self.step_count > 1: - self.console.print("") - - self.console.print( - f"[bold {agent_color}]{agent_name}[/bold {agent_color}] · [dim]{step_info} · {timestamp}[/dim]" - ) - - # Process the message based on format - if message.startswith("💡"): - # Friendly format with title and summary - parts = message.split("\n", 1) - title = parts[0] - content = parts[1] if len(parts) > 1 else "" - - # Extract emoji if present - emoji = "" - if title.startswith("💡"): - emoji = "💡 " - title = title[2:].strip() - - # Display title and content with proper formatting - self.console.print(f"[bold green]{emoji}{title}[/bold green]") - if content: - self.console.print(f"[dim]{content}[/dim]") - else: - # Standard format - print directly - self.console.print(f"{message}") - - except Exception as e: - # Log the error at debug level - import logging - - logger = logging.getLogger(__name__) - logger.debug(f"Error in Rich formatting: {str(e)}") - - # Fall back to plain output - self.output.write(f"[{agent_name}] {message}\n") - self.output.flush() - - @staticmethod - def _get_agent_color(agent_name: str) -> str: - """Get the color for an agent based on its role.""" - agent_colors = { - "System": "bright_blue", - "MLResearchScientist": "green", - "MLEngineer": "yellow", - "MLOperationsEngineer": "magenta", - "Orchestrator": "cyan", - "DatasetAnalyser": "red", - "SchemaResolver": "orange", - "DatasetSplitter": "purple", - # Default color - "default": "blue", - } - - # Match partial agent names (e.g. "Engineer" should match "ML Engineer") - for role, color in agent_colors.items(): - if role in agent_name: - return color - - return agent_colors["default"] - - -class LoggingEmitter(ChainOfThoughtEmitter): - """ - Emitter that outputs chain of thought to the logging system. - """ - - def __init__(self, level: int = logging.INFO): - """ - Initialize the logging emitter. 
- - Args: - level: The logging level to use - """ - self.logger = logging.getLogger("plexe.chain_of_thought") - self.level = level - - def emit_thought(self, agent_name: str, message: str) -> None: - """ - Emit a thought to the logger. - - Args: - agent_name: The name of the agent emitting the thought - message: The thought message - """ - self.logger.log(self.level, f"[{agent_name}] {message}") - - -class MultiEmitter(ChainOfThoughtEmitter): - """ - Emitter that outputs chain of thought to multiple emitters. - """ - - def __init__(self, emitters: List[ChainOfThoughtEmitter]): - """ - Initialize the multi emitter. - - Args: - emitters: The emitters to output to - """ - self.emitters = emitters - - def emit_thought(self, agent_name: str, message: str) -> None: - """ - Emit a thought to all configured emitters. - - Args: - agent_name: The name of the agent emitting the thought - message: The thought message - """ - for emitter in self.emitters: - emitter.emit_thought(agent_name, message) diff --git a/plexe/internal/common/utils/chain_of_thought/protocol.py b/plexe/internal/common/utils/chain_of_thought/protocol.py deleted file mode 100644 index cd1f76ea..00000000 --- a/plexe/internal/common/utils/chain_of_thought/protocol.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Defines protocols and data classes for capturing agent reasoning steps. -""" - -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Protocol - - -@dataclass -class ToolCall: - """Information about a tool called by an agent.""" - - name: str - args: Dict[str, Any] - - -@dataclass -class StepSummary: - """ - Framework-agnostic representation of an agent's reasoning step. - - This class represents a single step in an agent's chain of thought, - regardless of the underlying agent framework used. - """ - - step_number: Optional[int] = None - step_type: str = "Unknown" - agent_name: str = "Agent" - model_output: Optional[str] = None - tool_calls: List[ToolCall] = field(default_factory=list) - observations: Optional[str] = None - result: Any = None - error: Optional[str] = None - friendly_title: Optional[str] = None - friendly_summary: Optional[str] = None - - -class StepExtractor(Protocol): - """ - Protocol for extracting step information from agent frameworks. - """ - - def __call__(self, step: Any, agent: Any) -> StepSummary: - """ - Extract step summary from framework-specific step object. - - Args: - step: A step object from a specific agent framework - agent: The agent that performed the step - - Returns: - A framework-agnostic StepSummary object - """ - ... diff --git a/plexe/internal/common/utils/dataset_storage.py b/plexe/internal/common/utils/dataset_storage.py deleted file mode 100644 index 85d32c3b..00000000 --- a/plexe/internal/common/utils/dataset_storage.py +++ /dev/null @@ -1,104 +0,0 @@ -""" -This module provides utilities for dataset storage and transfer across processes. - -These utilities work with any class implementing the DatasetInterface and provide -functions for storing datasets to files, reading datasets from files, and using -shared memory for cross-process dataset sharing. -""" - -from typing import Type, TypeVar, Optional -import logging - -from plexe.internal.common.datasets.interface import Dataset - -logger = logging.getLogger(__name__) - -T = TypeVar("T", bound=Dataset) - - -def write_dataset_to_file(dataset: Dataset, path: str) -> None: - """ - Write dataset to a file. 
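# Sketch combining emitters so chain-of-thought output reaches both the console and
# the standard logging system, using the classes defined above.
from plexe.internal.common.utils.chain_of_thought import ConsoleEmitter, LoggingEmitter, MultiEmitter

emitter = MultiEmitter([ConsoleEmitter(), LoggingEmitter()])
emitter.emit_thought("Orchestrator", "Planning the next build step")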
- - :param dataset: The dataset to write to a file - :param path: Path to write the dataset to - """ - with open(path, "wb") as f: - f.write(dataset.to_bytes()) - - -def read_dataset_from_file(dataset_class: Type[T], path: str) -> T: - """ - Read dataset from a file. - - :param dataset_class: The dataset class to instantiate - :param path: Path to read the dataset from - :returns: Instantiated dataset of the specified class - """ - with open(path, "rb") as f: - return dataset_class.from_bytes(f.read()) - - -def dataset_to_shared_memory(dataset: Dataset, name: str) -> None: - """ - Place dataset in shared memory for cross-process access. - - This function serializes a dataset and places it in shared memory - with the given name, allowing other processes to access it. - - :param dataset: The dataset to place in shared memory - :param name: Name of the shared memory segment - :raises ImportError: If shared memory is not available - """ - try: - from multiprocessing import shared_memory - - # Serialize the dataset - data = dataset.to_bytes() - - # Create shared memory segment - shm = shared_memory.SharedMemory(name=name, create=True, size=len(data)) - - # Copy dataset bytes to shared memory - shm.buf[: len(data)] = data - - return shm - except ImportError: - raise ImportError("Shared memory requires Python 3.8+ and the multiprocessing module") - - -def dataset_from_shared_memory(dataset_class: Type[T], name: str, size: Optional[int] = None) -> T: - """ - Retrieve dataset from shared memory. - - This function retrieves a serialized dataset from shared memory - and deserializes it into an instance of the specified class. - - :param dataset_class: The dataset class to instantiate - :param name: Name of the shared memory segment - :param size: Size of the dataset in bytes (optional) - :returns: Instantiated dataset of the specified class - """ - try: - from multiprocessing import shared_memory - - # Access the shared memory segment - shm = shared_memory.SharedMemory(name=name) - - # Determine the size of the data - if size is None: - # Find the null terminator if size is not specified - # This assumes the shared memory was filled with zeros initially - # and data is not binary (contains no null bytes) - for i in range(shm.size): - if shm.buf[i] == 0: - size = i - break - else: - size = shm.size - - # Deserialize the dataset - data = bytes(shm.buf[:size]) - return dataset_class.from_bytes(data) - except ImportError: - raise ImportError("Shared memory requires Python 3.8+ and the multiprocessing module") diff --git a/plexe/internal/common/utils/dependency_utils.py b/plexe/internal/common/utils/dependency_utils.py deleted file mode 100644 index 32bc73e3..00000000 --- a/plexe/internal/common/utils/dependency_utils.py +++ /dev/null @@ -1,50 +0,0 @@ -""" -Utilities for handling optional dependencies. -""" - -import functools -import logging -from typing import Callable, TypeVar, cast, Any - -from plexe.config import is_package_available - -logger = logging.getLogger(__name__) - -# Type variable for the callable return type -T = TypeVar("T") - - -def requires_package(package_name: str, error_message: str = None) -> Callable[[Callable[..., T]], Callable[..., T]]: - """ - Decorator that checks if a required package is installed before executing a function. - If the package is not available, logs a warning and returns None. - - Example: - @requires_package('torch') - def train_neural_network(data): - # This function will only run if torch is installed - import torch - # ...training code... 
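# Sketch of the dataset storage helpers above: file round trip plus shared-memory
# hand-off; the path and segment name are illustrative. Note that despite its None
# annotation, dataset_to_shared_memory returns the SharedMemory segment it creates.
import pandas as pd
from plexe.internal.common.datasets.tabular import TabularDataset
from plexe.internal.common.utils.dataset_storage import (
    write_dataset_to_file,
    read_dataset_from_file,
    dataset_to_shared_memory,
    dataset_from_shared_memory,
)

ds = TabularDataset(pd.DataFrame({"x": [1, 2, 3]}))

write_dataset_to_file(ds, "/tmp/train.parquet")
same_ds = read_dataset_from_file(TabularDataset, "/tmp/train.parquet")

shm = dataset_to_shared_memory(ds, name="plexe_train")
copy = dataset_from_shared_memory(TabularDataset, "plexe_train", size=len(ds.to_bytes()))
shm.close()
shm.unlink()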
- - :param package_name: Name of the package that needs to be available - :param error_message: Custom error message to display if the package is not available - :return: Decorator function - """ - - def decorator(func: Callable[..., T]) -> Callable[..., T]: - @functools.wraps(func) - def wrapper(*args: Any, **kwargs: Any) -> T: - if is_package_available(package_name): - return func(*args, **kwargs) - else: - default_message = ( - f"The '{package_name}' package is required for this functionality but is not installed. " - f"Install it with 'pip install plexe[all]' or 'pip install {package_name}'." - ) - message = error_message or default_message - logger.warning(message) - return cast(T, None) - - return wrapper - - return decorator diff --git a/plexe/internal/common/utils/markdown_utils.py b/plexe/internal/common/utils/markdown_utils.py deleted file mode 100644 index 48636de8..00000000 --- a/plexe/internal/common/utils/markdown_utils.py +++ /dev/null @@ -1,116 +0,0 @@ -"""Utilities for markdown formatting of reports and data.""" - -from typing import Dict, List, Any, Union - - -def format_eda_report_markdown(eda_report: Dict[Any, Any]) -> str: - """ - Convert an EDA report dictionary to a well-formatted markdown document. - - Args: - eda_report: Dictionary containing the EDA report data - - Returns: - Formatted markdown string - """ - if not eda_report: - return "" - - markdown = [f"# ML-Focused Data Analysis Report: {eda_report.get('dataset_name', 'Dataset')}\n"] - markdown.append(f"Generated: {eda_report.get('timestamp', '')}\n") - - # Dataset Overview - if overview := eda_report.get("overview", {}): - markdown.append("## Essential Dataset Overview\n") - for key, value in overview.items(): - if isinstance(value, dict): - markdown.append(f"### {str(key).replace('_', ' ').title()}\n") - markdown.append(_dict_to_markdown(value, level=3)) - else: - markdown.append(f"**{str(key).replace('_', ' ').title()}**: {value}\n") - markdown.append("\n") - - # Feature Engineering Opportunities - if feature_engineering := eda_report.get("feature_engineering_opportunities", {}): - markdown.append("## Feature Engineering Opportunities\n") - markdown.append(_dict_to_markdown(feature_engineering, level=2)) - markdown.append("\n") - - # Data Quality Challenges - if data_quality := eda_report.get("data_quality_challenges", {}): - markdown.append("## Data Quality Challenges\n") - markdown.append(_dict_to_markdown(data_quality, level=2)) - markdown.append("\n") - - # Data Preprocessing Requirements - if preprocessing := eda_report.get("data_preprocessing_requirements", {}): - markdown.append("## Data Preprocessing Requirements\n") - markdown.append(_dict_to_markdown(preprocessing, level=2)) - markdown.append("\n") - - # Feature Importance - if importance := eda_report.get("feature_importance", {}): - markdown.append("## Feature Importance Analysis\n") - markdown.append(_dict_to_markdown(importance, level=2)) - markdown.append("\n") - - # Key Insights - if insights := eda_report.get("insights", []): - markdown.append("## Key ML Insights\n") - for insight in insights: - markdown.append(f"- {insight}\n") - markdown.append("\n") - - # Recommendations - if recommendations := eda_report.get("recommendations", []): - markdown.append("## Actionable Recommendations\n") - for recommendation in recommendations: - markdown.append(f"- {recommendation}\n") - - return "\n".join(markdown) - - -def _dict_to_markdown(data: Union[Dict, List, Any], level: int = 0) -> str: - """ - Helper function to convert nested dictionaries and 
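# Sketch of rendering a toy EDA report dictionary with the formatter above; the
# report keys mirror those the function looks for, and the values are illustrative.
from plexe.internal.common.utils.markdown_utils import format_eda_report_markdown

report = {
    "dataset_name": "churn",
    "timestamp": "2024-01-01T00:00:00",
    "overview": {"num_rows": 7043, "num_columns": 21},
    "insights": ["Tenure and contract type dominate churn risk."],
    "recommendations": ["Encode 'contract' as an ordinal feature."],
}
print(format_eda_report_markdown(report))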
lists to markdown. - - Args: - data: Data to convert (dict, list, or scalar value) - level: Header level for section titles - - Returns: - Markdown string representation - """ - if not data: - return "" - - result = [] - - if isinstance(data, dict): - for key, value in data.items(): - header_prefix = "#" * (level + 1) - key_formatted = str(key).replace("_", " ").title() - - if isinstance(value, dict): - result.append(f"{header_prefix} {key_formatted}\n") - result.append(_dict_to_markdown(value, level + 1)) - elif isinstance(value, list): - result.append(f"{header_prefix} {key_formatted}\n") - for item in value: - if isinstance(item, dict): - result.append(_dict_to_markdown(item, level + 1)) - else: - result.append(f"- {item}\n") - else: - result.append(f"**{key_formatted}**: {value}\n") - elif isinstance(data, list): - for item in data: - if isinstance(item, dict): - result.append(_dict_to_markdown(item, level)) - else: - result.append(f"- {item}\n") - else: - # Handle scalar value - result.append(str(data)) - - return "\n".join(result) diff --git a/plexe/internal/common/utils/model_state.py b/plexe/internal/common/utils/model_state.py deleted file mode 100644 index 1cc8a713..00000000 --- a/plexe/internal/common/utils/model_state.py +++ /dev/null @@ -1,23 +0,0 @@ -""" -Model state definitions for Plexe. - -This module defines the possible states a model can be in during its lifecycle. -""" - -from enum import Enum - - -class ModelState(Enum): - """States a model can be in during its lifecycle.""" - - DRAFT = "draft" - """Model is in draft state, not yet built.""" - - BUILDING = "building" - """Model is currently being built.""" - - READY = "ready" - """Model is built and ready to use.""" - - ERROR = "error" - """Model encountered an error during building.""" diff --git a/plexe/internal/common/utils/model_utils.py b/plexe/internal/common/utils/model_utils.py deleted file mode 100644 index 40740e21..00000000 --- a/plexe/internal/common/utils/model_utils.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -This module provides utility functions for working with model descriptions and metadata. -""" - -from typing import Optional - - -def calculate_model_size(artifacts: list) -> Optional[int]: - """ - Calculate the total size of the model artifacts in bytes. - - :param artifacts: List of artifacts with path attributes - :return: The size in bytes or None if no artifacts exist - """ - if not artifacts: - return None - - total_size = 0 - for artifact in artifacts: - if artifact.path and artifact.path.exists(): - total_size += artifact.path.stat().st_size - - return total_size if total_size > 0 else None - - -def format_code_snippet(code: Optional[str]) -> Optional[str]: - """ - Format a code snippet for display, truncating if necessary. - - :param code: The source code as a string - :return: A formatted code snippet or None if code doesn't exist - """ - if not code: - return None - - # Limit the size of code displayed, possibly add line numbers, etc. - lines = code.splitlines() - if len(lines) > 20: - # Return first 10 and last 10 lines with a note in the middle - return "\n".join(lines[:10] + ["# ... 
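An illustrative call to the removed format_eda_report_markdown helper with a toy report dictionary; the field values are made up.

from plexe.internal.common.utils.markdown_utils import format_eda_report_markdown

report = {
    "dataset_name": "transactions",
    "timestamp": "2024-01-01T00:00:00Z",
    "overview": {"n_rows": 1000, "n_columns": 12},
    "insights": ["Roughly 2% positive labels, so the classes are imbalanced"],
    "recommendations": ["Use stratified train/test splits"],
}
print(format_eda_report_markdown(report))  # renders the headings, key/value lines and bullet lists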
additional lines omitted ..."] + lines[-10:]) - return code diff --git a/plexe/internal/common/utils/pandas_utils.py b/plexe/internal/common/utils/pandas_utils.py deleted file mode 100644 index 477002ec..00000000 --- a/plexe/internal/common/utils/pandas_utils.py +++ /dev/null @@ -1,37 +0,0 @@ -import pandas as pd - - -def convert_dtype_to_python(dtype, sample_values=None) -> str: - """ - Convert a Pandas dtype to a Python type. - - :param dtype: The Pandas dtype to convert. - :param sample_values: Optional sample values to help detect list types in object columns. - :return: The corresponding Python type as string. - """ - if pd.api.types.is_bool_dtype(dtype): - return "bool" - elif pd.api.types.is_numeric_dtype(dtype): - if pd.api.types.is_integer_dtype(dtype): - return "int" - else: - return "float" - elif dtype == "object" and sample_values is not None: - # Check if object column contains lists by sampling values - for val in sample_values: - if pd.notna(val) and isinstance(val, list): - # Detect element type from list contents using pandas-safe methods - if not val: # Empty list - continue - first_elem = val[0] - if pd.api.types.is_bool_dtype(type(first_elem)) or isinstance(first_elem, (bool, pd.BooleanDtype)): - return "List[bool]" - elif pd.api.types.is_integer_dtype(type(first_elem)): - return "List[int]" - elif pd.api.types.is_float_dtype(type(first_elem)): - return "List[float]" - else: - return "List[str]" - return "str" - else: - return "str" diff --git a/plexe/internal/common/utils/prompt_utils.py b/plexe/internal/common/utils/prompt_utils.py deleted file mode 100644 index 1fc2b1fa..00000000 --- a/plexe/internal/common/utils/prompt_utils.py +++ /dev/null @@ -1,27 +0,0 @@ -import json -import logging -from typing import Type - -from pydantic import BaseModel - -from plexe.internal.common.utils.pydantic_utils import convert_schema_to_type_dict - -logger = logging.getLogger(__name__) - - -def join_task_statement(intent: str, input_schema: Type[BaseModel], output_schema: Type[BaseModel]) -> str: - """Join the problem statement into a single string.""" - problem_statement: str = ( - "# Problem Statement" - "\n\n" - f"{intent}" - "\n\n" - "# Expected Model Input Schema" - "\n\n" - f"{json.dumps(convert_schema_to_type_dict(input_schema), indent=4, default=str)}" - "\n\n" - "# Expected Model Output Schema" - "\n\n" - f"{json.dumps(convert_schema_to_type_dict(output_schema), indent=4, default=str)}" - ) - return problem_statement diff --git a/plexe/internal/common/utils/pydantic_utils.py b/plexe/internal/common/utils/pydantic_utils.py deleted file mode 100644 index f542cdf3..00000000 --- a/plexe/internal/common/utils/pydantic_utils.py +++ /dev/null @@ -1,147 +0,0 @@ -""" -This module provides utility functions for manipulating Pydantic models. -""" - -from pydantic import BaseModel, create_model -from typing import Type, List, Dict, get_type_hints - - -def _validate_schema_types(schema: Type[BaseModel]) -> None: - """Validate that a BaseModel schema only contains allowed types.""" - allowed_types = {int, float, str, bool, List[int], List[float], List[str], List[bool]} - - for field_name, field_info in schema.model_fields.items(): - field_type = field_info.annotation - if field_type not in allowed_types: - raise ValueError( - f"Field '{field_name}' has unsupported type '{field_type}'. 
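A brief sketch of how the removed convert_dtype_to_python helper was typically applied across a DataFrame's columns to build a schema-style type map; the sample DataFrame is illustrative. Object columns holding lists are mapped to List[...] types only when sample_values are passed in.

import pandas as pd
from plexe.internal.common.utils.pandas_utils import convert_dtype_to_python

df = pd.DataFrame({"age": [25, 31], "score": [0.7, 0.93], "city": ["Rome", "Lima"]})

type_map = {col: convert_dtype_to_python(df[col].dtype) for col in df.columns}
# type_map == {"age": "int", "score": "float", "city": "str"}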
" - f"Allowed types: int, float, str, bool, List[int], List[float], List[str], List[bool]" - ) - - -def merge_models(model_name: str, models: List[Type[BaseModel]]) -> Type[BaseModel]: - """ - Merge multiple Pydantic models into a single model. The ordering of the list determines - the overriding precedence of the models; the last model in the list will override any fields - with the same name in the preceding models. - - :param model_name: The name of the new model to create. - :param models: A list of Pydantic models to merge. - :return: A new Pydantic model that combines the input models. - """ - fields = dict() - for model in models: - for name, properties in model.model_fields.items(): - fields[name] = (properties.annotation, ... if properties.is_required() else properties.default) - return create_model(model_name, **fields) - - -def create_model_from_fields(model_name: str, model_fields: dict) -> Type[BaseModel]: - """ - Create a Pydantic model from a dictionary of fields. - - :param model_name: The name of the model to create. - :param model_fields: A dictionary of field names to field properties. - """ - for name, properties in model_fields.items(): - model_fields[name] = (properties.annotation, ... if properties.is_required() else properties.default) - return create_model(model_name, **model_fields) - - -def map_to_basemodel(name: str, schema: dict | Type[BaseModel]) -> Type[BaseModel]: - """ - Ensure that the schema is a Pydantic model or a dictionary, and return the model. - - :param [str] name: the name to be given to the model class - :param [dict] schema: the schema to be converted to a Pydantic model - :return: the Pydantic model - """ - # Pydantic model: validate and return - if isinstance(schema, type) and issubclass(schema, BaseModel): - _validate_schema_types(schema) - return schema - - # Dictionary: convert to Pydantic model, if possible - if isinstance(schema, dict): - try: - # Handle both Dict[str, type] and Dict[str, str] formats - annotated_schema = {} - - for k, v in schema.items(): - # If v is a string like "int", convert it to the actual type - if isinstance(v, str): - type_mapping = { - "int": int, - "float": float, - "str": str, - "bool": bool, - "List[int]": List[int], - "List[float]": List[float], - "List[str]": List[str], - "List[bool]": List[bool], - } - if v in type_mapping: - annotated_schema[k] = (type_mapping[v], ...) - else: - raise ValueError(f"Invalid type specification: {v} for field {k}") - # If v is already a type or one of our allowed typing generics, use it directly - elif isinstance(v, type) or v in {List[int], List[float], List[str], List[bool]}: - # Validate that it's one of our allowed types - allowed_types = {int, float, str, bool, List[int], List[float], List[str], List[bool]} - if v not in allowed_types: - raise ValueError(f"Unsupported type '{v}' for field '{k}'. Allowed types: {allowed_types}") - annotated_schema[k] = (v, ...) - else: - raise ValueError(f"Invalid field specification for {k}: {v}") - - return create_model(name, **annotated_schema) - except Exception as e: - raise ValueError(f"Invalid schema definition: {e}") - - # All other schema types are invalid - raise TypeError("Schema must be a Pydantic model or a dictionary of field names to types.") - - -def format_schema(schema: Type[BaseModel]) -> Dict[str, str]: - """ - Format a schema model into a dictionary representation of field names and types. 
- - :param schema: A pydantic model defining a schema - :return: A dictionary representing the schema structure with field names as keys and types as values - """ - if not schema: - return {} - - result = {} - # Use model_fields which is the recommended approach in newer Pydantic versions - for field_name, field_info in schema.model_fields.items(): - field_type = getattr(field_info.annotation, "__name__", str(field_info.annotation)) - result[field_name] = field_type - - return result - - -def convert_schema_to_type_dict(schema: Type[BaseModel]) -> Dict[str, type]: - """ - Convert a Pydantic model to a dictionary mapping field names to their Python types. - - This is useful for tools that require type information without the full Pydantic field metadata. - - :param schema: A Pydantic model to convert - :return: A dictionary with field names as keys and Python types as values - """ - if not schema or not issubclass(schema, BaseModel): - raise TypeError("Schema must be a Pydantic BaseModel") - - result = {} - - # Get the actual type annotations, which will be Python types - type_hints = get_type_hints(schema) - - # Extract annotations from model fields - for field_name, field_info in schema.model_fields.items(): - # Use the type hint if available, otherwise fall back to the field annotation - field_type = type_hints.get(field_name, field_info.annotation) - result[field_name] = field_type - - return result diff --git a/plexe/internal/common/utils/response.py b/plexe/internal/common/utils/response.py deleted file mode 100644 index e93a42a2..00000000 --- a/plexe/internal/common/utils/response.py +++ /dev/null @@ -1,204 +0,0 @@ -import json -import re -import logging -import pandas as pd - -import black - -logging.getLogger("blib2to3.pgen2.driver").setLevel(logging.WARNING) -logger = logging.getLogger(__name__) - - -def wrap_code(code: str, lang="python") -> str: - """Wraps code with three backticks.""" - return f"```{lang}\n{code}\n```" - - -def is_valid_python_script(script): - """Check if a script is a valid Python script.""" - try: - compile(script, "", "exec") - return True - except SyntaxError: - return False - - -def extract_jsons(text): - """Extract all JSON objects from the text. Caveat: This function cannot handle nested JSON objects.""" - json_objects = [] - matches = re.findall(r"\{.*?\}", text, re.DOTALL) - for match in matches: - try: - json_obj = json.loads(match) - json_objects.append(json_obj) - except json.JSONDecodeError: - pass - - # Sometimes chatgpt-turbo forget the last curly bracket, so we try to add it back when no json is found - if len(json_objects) == 0 and not text.endswith("}"): - json_objects = extract_jsons(text + "}") - if len(json_objects) > 0: - return json_objects - - return json_objects - - -def trim_long_string(string, threshold=5100, k=2500): - # Check if the length of the string is longer than the threshold - if len(string) > threshold: - # Output the first k and last k characters - first_k_chars = string[:k] - last_k_chars = string[-k:] - - truncated_len = len(string) - 2 * k - - return f"{first_k_chars}\n ... [{truncated_len} characters truncated] ... 
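For context on the removed response-parsing utilities, a small sketch of extract_jsons pulling flat JSON objects out of free-form LLM text; the sample text is made up.

from plexe.internal.common.utils.response import extract_jsons

text = 'Chosen config: {"algorithm": "xgboost", "max_depth": 6} with CV {"cv_folds": 5}'
print(extract_jsons(text))
# [{'algorithm': 'xgboost', 'max_depth': 6}, {'cv_folds': 5}]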
\n{last_k_chars}" - else: - return string - - -def extract_code(text): - """Extract python code blocks from the text.""" - parsed_codes = [] - - # When code is in a text or python block - matches = re.findall(r"```(python)?\n*(.*?)\n*```", text, re.DOTALL) - for match in matches: - code_block = match[1] - parsed_codes.append(code_block) - - # When the entire text is code or backticks of the code block is missing - if len(parsed_codes) == 0: - matches = re.findall(r"^(```(python)?)?\n?(.*?)\n?(```)?$", text, re.DOTALL) - if matches: - code_block = matches[0][2] - parsed_codes.append(code_block) - - # validate the parsed codes - valid_code_blocks = [format_code(c) for c in parsed_codes if is_valid_python_script(c)] - return format_code("\n\n".join(valid_code_blocks)) - - -def extract_text_up_to_code(s): - """Extract (presumed) natural language text up to the start of the first code block.""" - if "```" not in s: - return "" - return s[: s.find("```")].strip() - - -def format_code(code) -> str: - """Format Python code using Black.""" - try: - return black.format_str(code, mode=black.FileMode()) - except black.parsing.InvalidInput: # type: ignore - return code - - -def extract_performance(output: str) -> float | None: - """Extract the performance metric from the output.""" - try: - last_line = output.strip().split("\n")[-1] - - # Looking for format "MetricName: value" - if ":" not in last_line: - raise RuntimeError("No colon found in last line") - - value_str = last_line.split(":")[-1].strip() - - try: - value = float(value_str) - logger.debug(f"Successfully parsed value: {value}") - return value - except ValueError as e: - raise RuntimeError(f"Could not convert '{value_str}' to float: {e}") from e - - except Exception as e: - raise RuntimeError(f"Error extracting run performance: {e}") from e - - -def extract_json_array(text: str) -> str: - """ - Extract a JSON array from an LLM response, handling common formatting issues. - - This function cleans up LLM responses that may contain JSON arrays embedded in - markdown code blocks, additional explanatory text, or other formatting artifacts. - - Args: - text: The raw text output from an LLM - - Returns: - A cleaned JSON array string ready for parsing - """ - cleaned_text = text - - # Remove code block markers if present - if "```" in cleaned_text: - json_match = re.search(r"```(?:json)?\s*(.*?)\s*```", cleaned_text, re.DOTALL) - if json_match: - cleaned_text = json_match.group(1) - - # Remove any language tags or backticks - cleaned_text = cleaned_text.replace("json", "").replace("`", "").strip() - - # Find the actual JSON array if needed - start_idx = cleaned_text.find("[") - end_idx = cleaned_text.rfind("]") - if start_idx >= 0 and end_idx >= 0: - cleaned_text = cleaned_text[start_idx : end_idx + 1] - - return cleaned_text - - -def json_to_dataframe(text: str) -> "pd.DataFrame": - """ - Convert LLM-generated JSON text to a pandas DataFrame. - - This function handles common errors in LLM responses when creating DataFrames: - 1. Extracts and cleans the JSON array from text - 2. Validates it's a proper array structure - 3. 
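A small sketch of how the removed extract_code and extract_performance helpers were used on raw LLM and run output; the sample strings are illustrative.

from plexe.internal.common.utils.response import extract_code, extract_performance

llm_response = "Here is the script:\n```python\ndef train():\n    return 0.87\n```"
script = extract_code(llm_response)  # returns the Black-formatted body of the code block

run_output = "Training finished.\nAccuracy: 0.87"
score = extract_performance(run_output)  # parses the trailing 'Metric: value' line -> 0.87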
Creates a DataFrame with appropriate error handling - - Args: - text: Raw text output from an LLM that should contain a JSON array - - Returns: - pandas DataFrame created from the JSON data - - Raises: - ValueError: If the response can't be parsed as a valid JSON array - """ - import pandas as pd - - # Extract the JSON array text - json_text = extract_json_array(text) - - try: - # Parse the JSON - data = json.loads(json_text) - - # Handle both single object and array of objects - if not isinstance(data, list): - # If it's a single object, convert it to a list with one item - if isinstance(data, dict): - logger.warning("JSON is a single object, converting to list") - data = [data] - else: - raise ValueError(f"JSON is not an array or object: {json_text[:100]}...") - - # Check if it's empty - if len(data) == 0: - logger.warning("JSON array is empty") - # Return empty DataFrame - return pd.DataFrame() - - # Create DataFrame from the parsed JSON array - return pd.DataFrame(data) - - except json.JSONDecodeError as e: - # Log details about the error - logger.error(f"Failed to parse JSON: {str(e)}") - logger.debug(f"Attempted to parse: {json_text[:200]}...") - raise ValueError(f"Invalid JSON format: {str(e)}") - except Exception as e: - logger.error(f"Error converting JSON to DataFrame: {str(e)}") - raise ValueError(f"Error converting JSON to DataFrame: {str(e)}") diff --git a/plexe/internal/datasets/__init__.py b/plexe/internal/datasets/__init__.py deleted file mode 100644 index bafd51a5..00000000 --- a/plexe/internal/datasets/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -""" -Application entry point for the data generation service. - -The data generation service is an internal API that generates synthetic data that is meant to capture a particular -data distribution, either with data or without data (low-data regime). The service also exposes functionality for -validating the synthetic data against real data, if available. -""" - -from .config import config as config diff --git a/plexe/internal/datasets/config.py b/plexe/internal/datasets/config.py deleted file mode 100644 index cd18c2b0..00000000 --- a/plexe/internal/datasets/config.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -This module provides configuration for the data generation service. -""" - -from dataclasses import dataclass -from typing import Literal - - -@dataclass(frozen=True) -class Config: - """ - Configuration class for the dataset generation functionality. - Contains settings for generators, validators, and prompt instructions. - """ - - # global configuration - GENERATOR: Literal["simple"] = "simple" - VALIDATOR: Literal["eda"] = "eda" - - # logging configuration - LEVEL: str = "INFO" - FORMAT: str = "[%(asctime)s - %(name)s - %(levelname)s - (%(threadName)-10s)]: - %(message)s" - - # generator configuration - BATCH_SIZE: int = 50 - MAX_N_LLM_SAMPLES: int = 1000 - BASE_INSTRUCTION: str = ( - "Provide expert-level data science assistance. Communicate concisely and directly. " - "When returning data or code, provide only the raw output without explanations. " - "Keep the problem domain in mind. " - ) - GENERATOR_INSTRUCTION: str = ( - "Generate exactly the requested number of samples for a machine learning problem, " - "adhering to the schema and representing the real-world data distribution. " - "Output only JSON-formatted text without additional text. 
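A minimal sketch of the removed json_to_dataframe helper handling a typical markdown-wrapped LLM response; the payload is made up.

from plexe.internal.common.utils.response import json_to_dataframe

raw = 'Samples below:\n```\n[{"age": 34, "income": 52000}, {"age": 29, "income": 48000}]\n```'
df = json_to_dataframe(raw)
print(df.shape)  # (2, 2)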
" - ) - - -config = Config() diff --git a/plexe/internal/datasets/core/__init__.py b/plexe/internal/datasets/core/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/plexe/internal/datasets/core/generation/__init__.py b/plexe/internal/datasets/core/generation/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/plexe/internal/datasets/core/generation/base.py b/plexe/internal/datasets/core/generation/base.py deleted file mode 100644 index ba90d4f1..00000000 --- a/plexe/internal/datasets/core/generation/base.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -This module defines the base class for data generators used in the project. - -Classes: - BaseDataGenerator: Abstract base class for generating data samples in a given schema. -""" - -from typing import Type, Optional - -import pandas as pd -from pydantic import BaseModel -from abc import ABC, abstractmethod - - -class BaseDataGenerator(ABC): - """ - Abstract base class for an object that generates data samples in a given schema. - - The BaseDataGenerator interface defines the contract for data generation operations: - - Creating new datasets from scratch with a specified schema - - Adding rows to existing datasets - - Transforming existing datasets to include new columns from an extended schema - """ - - @abstractmethod - def generate( - self, intent: str, n_generate: int, schema: Type[BaseModel], existing_data: Optional[pd.DataFrame] = None - ) -> pd.DataFrame: - """ - Generate synthetic data for a given problem description. - - This method supports three key use cases: - 1. Create new dataset from scratch: n_generate > 0, existing_data=None - 2. Add rows to existing dataset: n_generate > 0, existing_data provided - 3. Add columns to existing dataset: n_generate=0, existing_data provided - (transforms existing data to match new schema with additional columns) - - :param intent: Natural language description of the problem/dataset - :param n_generate: Number of records to generate (0 for column-only generation) - :param schema: The schema definition for the data to generate - :param existing_data: Optional existing data to augment with new rows or columns - :return: A pandas DataFrame containing the generated or augmented data - """ - pass diff --git a/plexe/internal/datasets/core/generation/simple_llm.py b/plexe/internal/datasets/core/generation/simple_llm.py deleted file mode 100644 index d1283e76..00000000 --- a/plexe/internal/datasets/core/generation/simple_llm.py +++ /dev/null @@ -1,271 +0,0 @@ -import asyncio -import math -import logging -from typing import Type, List, Tuple, Optional -import traceback - -import pandas as pd -from pydantic import BaseModel -from tqdm import tqdm - -from plexe.internal.common.provider import Provider -from plexe.internal.common.utils.response import json_to_dataframe -from .base import BaseDataGenerator - -logger = logging.getLogger(__name__) - - -class SimpleLLMDataGenerator(BaseDataGenerator): - """ - Implementation of BaseDataGenerator that uses a straightforward LLM prompting mechanism to generate - synthetic data. The generator relies on LLM inference to create or augment datasets. - """ - - def __init__(self, provider: Provider = None): - """ - Initialize the SimpleLLMDataGenerator. - - :param provider: The provider to use for LLM queries. 
- """ - from ...config import Config - - self.llm = provider - config = Config() - self.system_instruction = config.BASE_INSTRUCTION + config.GENERATOR_INSTRUCTION - self.batch_size = config.BATCH_SIZE - self.max_retries = 3 - - def generate( - self, intent: str, n_generate: int, schema: Type[BaseModel], existing_data: Optional[pd.DataFrame] = None - ) -> pd.DataFrame: - """ - Generate synthetic data based on the given intent, schema, and optionally existing data. - - :param intent: Description of the data to generate - :param n_generate: Number of samples to generate - :param schema: Pydantic schema defining data structure - :param existing_data: Optional existing data to augment - :return: DataFrame containing generated data - """ - if n_generate == 0 and (existing_data is None or len(existing_data) == 0): - logger.warning("No samples to generate and no existing data provided.") - return pd.DataFrame(columns=schema.model_fields.keys()) - - # Handle column generation case - if n_generate == 0 and existing_data is not None and len(existing_data) > 0: - # Find missing columns (in schema but not in existing data) - missing_columns = [field for field in schema.model_fields.keys() if field not in existing_data.columns] - - if not missing_columns: - logger.info("No new columns to generate, returning existing data.") - return existing_data - - # Use quieter logging - will be captured in the progress bar - - # Use existing batch processing for new column generation - # but with a modified prompt focused on the missing columns - batch_size = min(self.batch_size, len(existing_data)) - prompts = [] - - # Create base prompt with problem specification - base_prompt = ( - f"Give me a dataset of samples for the following ML problem:\n\n" - f"PROBLEM DESCRIPTION:\n{intent}\n\n" - f"SCHEMA:\n{schema.model_fields}\n\n" - ) - - # Store batch indices for later matching - batches = [] - - for i in range(0, len(existing_data), batch_size): - batch_data = existing_data.iloc[i : i + batch_size] - prompt = ( - f"{base_prompt}" - f"EXISTING DATA SAMPLE (generate recommendations in this exact order):\n{batch_data.to_string()}\n\n" - f"Generate ONLY the following new columns for each record: {missing_columns}\n" - f"I already have the other columns, so only output values for these specific columns.\n" - f"Generate values for exactly {len(batch_data)} records in the same order as shown above.\n" - f"Make sure each record's recommendation is personalized based on its product.\n" - f"Return a JSON array of objects, where each object has ONLY the requested columns.\n" - ) - prompts.append((prompt, len(batch_data))) - batches.append(batch_data) - - # Process the batches to get column values - df_new_columns = pd.DataFrame() - batch_data = asyncio.run(self._process_generation_batches(prompts, schema, df_new_columns)) - - # Create a copy of the existing data to add columns to - result = existing_data.copy() - - # Add the new columns with the generated values - for col in missing_columns: - if col in batch_data.columns: - # Need to reindex batch_data to match the original index to properly align values - aligned_data = batch_data.reset_index(drop=True) - # Add the column with proper values - result[col] = aligned_data[col].values[: len(result)] - - # Ensure all rows have values - if len(batch_data) < len(result): - logger.warning("Generated fewer records than expected. 
Adding NaN for missing values.") - - # Success is implicit from the progress bar completion - return result - - # Create base dataframe to store results - for new row generation - # Only include schema fields that aren't already in existing_data - if existing_data is not None: - # For new row generation, use all columns from schema - columns = schema.model_fields.keys() - else: - # For creating a dataset from scratch - columns = schema.model_fields.keys() - - df_generated = pd.DataFrame(columns=columns) - - # Create base prompt with problem specification - base_prompt = ( - f"Give me a dataset of samples for the following ML problem:\n\n" - f"PROBLEM DESCRIPTION:\n{intent}\n\n" - f"SCHEMA:\n{schema.model_fields}\n\n" - ) - - # Prepare prompts for generating new data in batches - prompts = self._prepare_generation_prompts(base_prompt, n_generate, existing_data) - - # Use asyncio to process all batches - df_generated = asyncio.run(self._process_generation_batches(prompts, schema, df_generated)) - - return df_generated - - def _prepare_generation_prompts( - self, base_prompt: str, n_generate: int, existing_data: Optional[pd.DataFrame] - ) -> List[Tuple[str, int]]: - """ - Prepare prompts for batch generation. - - :param base_prompt: Base prompt with problem description and schema - :param n_generate: Total number of samples to generate - :param existing_data: Optional existing data to use as examples - :return: List of (prompt, batch_size) tuples - """ - prompts = [] - records_left = n_generate - num_batches = math.ceil(n_generate / self.batch_size) - - for _ in range(num_batches): - n_generate_this_iteration = min(records_left, self.batch_size) - records_left -= n_generate_this_iteration - - # Add sample data to the prompt if available - sample_str = "" - if existing_data is not None and len(existing_data) > 0: - num_samples = min(5, len(existing_data)) - sample_str = existing_data.sample(num_samples).to_string() - - prompt = ( - f"{base_prompt}" - f"SAMPLE DATA:{sample_str}\n\n" - f"Please give me samples that match the schema and are relevant to solving the problem. " - f"The data should have an appropriate amount of variance and be representative of the problem. " - f"The data should be distributed in a way that is consistent with the problem domain. " - f"Make absolutely sure to give me EXACTLY {n_generate_this_iteration} records. " - f"You must give me no fewer than and no more than {n_generate_this_iteration} records. " - f"FORMAT: Return a JSON ARRAY of objects. Each object represents one record. " - f"The JSON should be wrapped in square brackets [ ] as an array, not a single object. " - f"In your response, only include the dataset as a JSON array, no other text. " - f"The output must be a raw JSON string with no formatting characters. " - f"Do not give me any code, any descriptions, any explanations, or any other text of any kind. " - f"Only give me a raw JSON string with the data in array format, and no other information whatsoever." - ) - prompts.append((prompt, n_generate_this_iteration)) - - return prompts - - async def _process_generation_batches( - self, prompts: List[Tuple[str, int]], schema: Type[BaseModel], df_generated: pd.DataFrame - ) -> pd.DataFrame: - """ - Process all generation batches asynchronously with retry logic. 
- - :param prompts: List of (prompt, batch_size) tuples - :param schema: Pydantic schema for validation - :param df_generated: DataFrame to store results - :return: DataFrame with all generated data - """ - pending_prompts = prompts.copy() - retry_count = 0 - - while pending_prompts and retry_count < self.max_retries: - # Use tqdm for progress display - with tqdm( - total=len(pending_prompts), desc=f"Generating data (attempt {retry_count + 1}/{self.max_retries})" - ) as pbar: - # Process all pending prompts in parallel - tasks = [self._generate_batch(prompt, schema) for prompt, _ in pending_prompts] - batch_results = await asyncio.gather(*tasks) - - # Process results and collect failed prompts for retry - failed_prompts = [] - for result, (prompt, n_expected) in zip(batch_results, pending_prompts): - if result is not None: - df_generated = pd.concat([df_generated, result], ignore_index=True) - logger.debug(f"Successfully generated {len(result)} samples") - pbar.update(1) - else: - failed_prompts.append((prompt, n_expected)) - # Don't update progress bar for failed batches - - # Update for next iteration - pending_prompts = failed_prompts - if failed_prompts: - logger.warning(f"Retrying {len(failed_prompts)} failed batches...") - retry_count += 1 - - if pending_prompts: - logger.error( - f"Failed to generate {sum(n for _, n in pending_prompts)} samples after {self.max_retries} attempts" - ) - - return df_generated - - async def _generate_batch(self, prompt: str, schema: Type[BaseModel]) -> Optional[pd.DataFrame]: - """ - Generate a single batch of data asynchronously. - - :param prompt: The generation prompt - :param schema: Pydantic schema for validation - :return: DataFrame with generated data or None if failed - """ - try: - - class ResponseSchema(BaseModel): - records: List[schema] - - response = await asyncio.to_thread(self.llm.query, self.system_instruction, prompt, ResponseSchema) - - if response is None: - logger.error("Received None response from LLM") - return None - - # Use the utility function to convert JSON to DataFrame - try: - df_batch = json_to_dataframe(response) - - # Validate that all required schema fields are present - missing_fields = [field for field in schema.model_fields.keys() if field not in df_batch.columns] - if missing_fields: - logger.error(f"Generated data missing required fields: {missing_fields}") - return None - - except ValueError as e: - logger.error(f"Failed to parse LLM response: {str(e)}") - return None - - return df_batch - - except Exception as e: - logger.error(f"Error during batch generation: {str(e)}") - logger.debug(traceback.format_exc()) - return None diff --git a/plexe/internal/datasets/core/generation/utils/__init__.py b/plexe/internal/datasets/core/generation/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/plexe/internal/datasets/core/validation/__init__.py b/plexe/internal/datasets/core/validation/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/plexe/internal/datasets/core/validation/base.py b/plexe/internal/datasets/core/validation/base.py deleted file mode 100644 index 6ecccf8c..00000000 --- a/plexe/internal/datasets/core/validation/base.py +++ /dev/null @@ -1,22 +0,0 @@ -from abc import ABC, abstractmethod - - -class BaseDataValidator(ABC): - @abstractmethod - def validate( - self, - report_output_path: str, - synthetic_data_path: str, - reference_data_path: str = None, - data_schema: dict = None, - ) -> str: - """ - Validate synthetic data against reference data and schema. 
The validation results are saved to a report - jupyter notebook. - :param report_output_path: path to save the validation report - :param synthetic_data_path: path to the synthetic data to validate - :param reference_data_path: if provided, path to the reference data to compare against - :param data_schema: if provided, schema of the data to validate against - :return: path to the validation report - """ - pass diff --git a/plexe/internal/datasets/core/validation/eda.py b/plexe/internal/datasets/core/validation/eda.py deleted file mode 100644 index 501cbf28..00000000 --- a/plexe/internal/datasets/core/validation/eda.py +++ /dev/null @@ -1,57 +0,0 @@ -import os -import subprocess - -import nbformat as nbf - -from .base import BaseDataValidator - - -class EdaDataValidator(BaseDataValidator): - def validate( - self, - report_output_path: str, - synthetic_data_path: str, - reference_data_path: str = None, - data_schema: dict = None, - ) -> str: - """ - Validates the synthetic data by comparing it to reference data (if available) and producing an - EDA report as a Jupyter notebook. Includes distribution plots, null checks, and statistical comparisons. - - :param report_output_path: path to save the generated report - :param synthetic_data_path: path to the synthetic data - :param reference_data_path: path to the reference data - :param data_schema: - :return: path to the generated report - """ - - # load existing notebook from template path - with open("resources/templates/eda-notebook-tabular.ipynb") as f: - nb = nbf.read(f, as_version=4) - - # todo currently schema is not used, but it should be part of the validation report - - # string interpolation to fill the data paths into the template notebook - for cell in nb["cells"]: - cell["source"] = ( - str(cell["source"]) - .replace("{{$synthetic_data_path}}", os.path.basename(synthetic_data_path)) - .replace("{{$real_data_path}}", f"./../{reference_data_path}") - ) - - # save the notebook to the output path - with open(report_output_path, "w") as f: - nbf.write(nb, f) - - # attempt to execute the notebook and convert it to pdf - try: - # execute the notebook - subprocess.run( - ["jupyter", "nbconvert", "--to", "notebook", "--execute", "--inplace", report_output_path], check=True - ) - # convert the notebook to pdf - subprocess.run(["jupyter", "nbconvert", "--to", "pdf", report_output_path], check=True) - except subprocess.CalledProcessError as e: - print(f"Error executing the notebook or converting it to pdf: {e}") - - return report_output_path diff --git a/plexe/internal/datasets/generator.py b/plexe/internal/datasets/generator.py deleted file mode 100644 index 3a499c46..00000000 --- a/plexe/internal/datasets/generator.py +++ /dev/null @@ -1,41 +0,0 @@ -from pydantic import BaseModel -from typing import Type - -import pandas as pd - -from plexe.internal.common.provider import Provider -from plexe.internal.datasets.core.generation.simple_llm import SimpleLLMDataGenerator - - -class DatasetGenerator: - """Generate synthetic data based on request parameters.""" - - def __init__(self, provider: Provider, description: str, schema: Type[BaseModel]): - """ - Initialize the DatasetGenerator with a provider. - - :param [Provider] provider: The provider to use for data generation. - :param [str] description: The description of the data to generate. - :param [Type[BaseModel]] schema: The schema for the data to generate. 
- """ - - self.provider = provider - self.description = description - self.schema = schema - self.generator = SimpleLLMDataGenerator(provider=self.provider) - - def generate(self, n_samples: int, existing_data: pd.DataFrame = None) -> pd.DataFrame: - """ - Generate synthetic data based on request parameters. - - :param [int] n_samples: The number of samples to generate. - :param [SupportedDatasets] existing_data: The existing data to augment. - :return [SupportedDatasets]: The generated synthetic data. - """ - - return self.generator.generate( - intent=self.description, - n_generate=n_samples, - schema=self.schema, - existing_data=existing_data, - ) diff --git a/plexe/internal/datasets/resources/templates/eda-notebook-tabular.ipynb b/plexe/internal/datasets/resources/templates/eda-notebook-tabular.ipynb deleted file mode 100644 index ac547e43..00000000 --- a/plexe/internal/datasets/resources/templates/eda-notebook-tabular.ipynb +++ /dev/null @@ -1,455 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "566c61d6a0c2cdb6", - "metadata": {}, - "source": [ - "# Data Generation Report\n", - "This notebook contains an exploratory data analysis (EDA) report for the synthetic dataset {{$synthetic_data_path}}." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7001a63d4d4c77a4", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import warnings\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "import seaborn as sns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c00ab39ca74e5eb8", - "metadata": {}, - "outputs": [], - "source": [ - "# configure the notebook\n", - "warnings.filterwarnings(\"ignore\", module=\"seaborn*\")\n", - "warnings.filterwarnings(\"ignore\", category=DeprecationWarning)\n", - "\n", - "sns.set_theme(rc={'figure.figsize': (25, 8)})\n", - "\n", - "# print for debugging purposes\n", - "os.getcwd()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5d44e8f2bb5f2e3f", - "metadata": {}, - "outputs": [], - "source": [ - "# path placeholders replaced by actual paths during notebook generation\n", - "SYNTHETIC_DATA_PATH = \"{{$synthetic_data_path}}\"\n", - "REAL_DATA_PATH = \"{{$real_data_path}}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c404ba5d2a6735a3", - "metadata": {}, - "outputs": [], - "source": [ - "# load the datasets\n", - "df_synth = pd.read_csv(SYNTHETIC_DATA_PATH)\n", - "\n", - "df_real = None\n", - "\n", - "try:\n", - " df_real = pd.read_csv(REAL_DATA_PATH)\n", - " # only keep columns that are also in df_synth\n", - " df_real = df_real[df_real.columns.intersection(df_synth.columns)]\n", - "except FileNotFoundError:\n", - " pass\n", - "except ValueError:\n", - " pass" - ] - }, - { - "cell_type": "markdown", - "id": "ab499105539c444c", - "metadata": {}, - "source": "## Dataset Description\n" - }, - { - "cell_type": "markdown", - "id": "57266f57e9f06625", - "metadata": {}, - "source": "### Dataset Information" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7042af0cea455a69", - "metadata": {}, - "outputs": [], - "source": [ - "df_synth.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ed183e13d6a9c8b", - "metadata": {}, - "outputs": [], - "source": [ - "if df_real is not None:\n", - " df_real.info()" - ] - }, - { - "cell_type": "markdown", - "id": "e3e1271a61500c44", - "metadata": {}, - "source": "### Dataset Statistics" - }, - { - 
"cell_type": "code", - "execution_count": null, - "id": "c2c88e8361016bba", - "metadata": {}, - "outputs": [], - "source": [ - "# synthetic data statistics\n", - "df_synth.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "48286b4bdf8ff695", - "metadata": {}, - "outputs": [], - "source": [ - "# real data statistics, if available\n", - "if df_real is not None:\n", - " df_real.describe()" - ] - }, - { - "cell_type": "markdown", - "id": "a1594a122f1255cc", - "metadata": {}, - "source": "### Dataset Examples" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a2011d54b988f4", - "metadata": {}, - "outputs": [], - "source": [ - "# synthetic data examples\n", - "pd.concat([df_synth.head(5), df_synth.tail(5)], axis=0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3f7550c2dbbeed9f", - "metadata": {}, - "outputs": [], - "source": [ - "# real data examples, if available\n", - "if df_real is not None:\n", - " pd.concat([df_real.head(5), df_real.tail(5)], axis=0)" - ] - }, - { - "cell_type": "markdown", - "id": "c2cc8891d1cc324d", - "metadata": {}, - "source": "## Data Quality" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9fdcabecd400601", - "metadata": {}, - "outputs": [], - "source": [ - "df_synth.isnull().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8df777a44c759365", - "metadata": {}, - "outputs": [], - "source": [ - "# drop any column with a name that starts with Unnamed as it is likely an index\n", - "df_synth = df_synth.loc[:, ~df_synth.columns.str.contains('^Unnamed')]\n", - "\n", - "if df_real is not None:\n", - " df_real = df_real.loc[:, ~df_real.columns.str.contains('^Unnamed')]" - ] - }, - { - "cell_type": "markdown", - "id": "3ee7a83fd1bcc3c8", - "metadata": {}, - "source": "## Data Visualisations" - }, - { - "cell_type": "markdown", - "id": "d85b663179a1b1ea", - "metadata": {}, - "source": "### Distributions (Individual Variables)" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "410c22c6ab53ae09", - "metadata": {}, - "outputs": [], - "source": [ - "for column in df_synth.select_dtypes(include=[np.number]).columns:\n", - " plt.figure()\n", - "\n", - " # add density plots, reference data only included if available\n", - " sns.histplot(df_synth[column], color='red', label='Synthetic', kde=True, stat=\"density\", linewidth=0)\n", - " if df_real is not None:\n", - " sns.histplot(df_real[column], color='blue', label='Reference', kde=True, stat=\"density\", linewidth=0)\n", - "\n", - " # plot formatting\n", - " plt.title(f'Distribution of {column}')\n", - " plt.xlabel(column)\n", - " plt.ylabel('Density')\n", - " plt.legend()\n", - " plt.grid(True)\n", - " plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "3e5c51fce2f87739", - "metadata": {}, - "source": "### Correlations" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f50412538692074c", - "metadata": {}, - "outputs": [], - "source": [ - "# Correlation heatmaps for both datasets\n", - "plt.figure()\n", - "\n", - "\n", - "# for correlations, we need to map string variables to integer indices\n", - "def indexing_map(x):\n", - " return x.map({val: idx for idx, val in enumerate(x.unique())}) if x.dtype == 'O' else x\n", - "\n", - "\n", - "plt.subplot(1, 2, 1)\n", - "sns.heatmap(df_synth.apply(indexing_map).corr(), annot=True, cmap='coolwarm', center=0)\n", - "plt.title('Synthetic Data Correlation Matrix')\n", - "\n", - "if df_real is not None:\n", - " plt.subplot(1, 
2, 2)\n", - " sns.heatmap(df_real.apply(indexing_map).corr(), annot=True, cmap='coolwarm', center=0)\n", - " plt.title('Reference Data Correlation Matrix')\n", - "\n", - "plt.tight_layout()\n", - "plt.show()\n" - ] - }, - { - "cell_type": "markdown", - "id": "55022633aa724623", - "metadata": {}, - "source": "### Principal Component Analysis" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ca0d917e15652de5", - "metadata": {}, - "outputs": [], - "source": [ - "# project the data into 2D space using PCA\n", - "from sklearn.decomposition import PCA\n", - "\n", - "plt.figure()\n", - "\n", - "# project the synthetic data and plot it\n", - "pca_synth = PCA(n_components=2)\n", - "pca_synth.fit(df_synth.apply(indexing_map))\n", - "\n", - "plt.subplot(1, 2, 1)\n", - "plt.scatter(pca_synth.transform(df_synth.apply(indexing_map))[:, 0],\n", - " pca_synth.transform(df_synth.apply(indexing_map))[:, 1])\n", - "plt.title('Synthetic Data PCA')\n", - "\n", - "# project the real data, if available, and plot it\n", - "# note that we use the same PCA model to ensure the same projection so that the data can be compared\n", - "if df_real is not None:\n", - " plt.subplot(1, 2, 2)\n", - " plt.scatter(pca_synth.transform(df_real.apply(indexing_map))[:, 0],\n", - " pca_synth.transform(df_real.apply(indexing_map))[:, 1])\n", - " plt.title('Reference Data PCA')\n", - "\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "4b7a5a8f038679b7", - "metadata": {}, - "source": "## Similarity to Reference Data" - }, - { - "cell_type": "markdown", - "id": "19208b4657f1331", - "metadata": {}, - "source": [ - "### Kolmogorov-Smirnov Test\n", - "We perform the Kolmogorov-Smirnov test on each individual continuous variable as a similarity measure for the individual\n", - "variables. The output of the test is the p-value, which indicates the probability that the two samples are drawn from the\n", - "same distribution. A low p-value indicates that the two samples are significantly different." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "33c73869f69ae6bc", - "metadata": {}, - "outputs": [], - "source": [ - "# perform the Kolmogorov-Smirnov test on each continuous variable\n", - "# for column in df_synth.select_dtypes(include=[np.floating]).columns:\n", - "# res = stats.kstest(df_synth[column], df_real[column])\n", - "# print(f'{column}: {res[\"pvalue\"]}')" - ] - }, - { - "cell_type": "markdown", - "id": "36a5c64364412f46", - "metadata": {}, - "source": [ - "### Chi-Square Test\n", - "We perform the Chi-Square test on each individual categorical variable as a similarity measure for the individual\n", - "variables. The output of the test is the p-value, which indicates the probability that the two samples are drawn from the\n", - "same distribution. A low p-value indicates that the two samples are significantly different." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "29140e5115b04e0f", - "metadata": {}, - "outputs": [], - "source": [ - "# perform the Chi-Square test on each categorical variable\n", - "# for column in df_synth.select_dtypes(exclude=[np.floating]).columns:\n", - "# res = stats.chisquare(df_synth[column].value_counts(), df_real[column].value_counts())\n", - "# print(f'{column}: {res.pvalue}')" - ] - }, - { - "cell_type": "markdown", - "id": "d75c8585b49ecfa0", - "metadata": {}, - "source": [ - "### Correlation Matrix Comparison\n", - "We compare the correlation matrices of the synthetic and reference datasets using the Jensen-Shannon divergence.\n", - "The Jensen-Shannon divergence is a measure of the similarity between two probability distributions.\n", - "In this case, it is used to compare the correlation matrices of the synthetic and reference datasets, where the mutual\n", - "information matrices are treated as probability distributions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d29045501f0ad1ea", - "metadata": {}, - "outputs": [], - "source": [ - "# # compute the correlation matrices\n", - "# corr_synth = df_synth.apply(indexing_map).corr()\n", - "# corr_real = df_real.apply(indexing_map).corr()\n", - "# \n", - "# # compute the norms of each individual matrix, and the difference between them\n", - "# norm_synth = np.linalg.norm(corr_synth)\n", - "# norm_real = np.linalg.norm(corr_real)\n", - "# norm_diff = np.linalg.norm(corr_synth - corr_real)\n", - "# \n", - "# print(f\"synth: {norm_synth}, real: {norm_real}, diff: {norm_diff}\")" - ] - }, - { - "cell_type": "markdown", - "id": "c13dd2f82b29a94a", - "metadata": {}, - "source": [ - "### Variable-Wise Jensen-Shannon Distance\n", - "We calculate the Jensen-Shannon distance between the distributions of each variable in the synthetic and reference datasets." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1aeb22213a0d7193", - "metadata": {}, - "outputs": [], - "source": [ - "# # for each variable in the dataset, compute the Jensen-Shannon distance\n", - "# distances = {}\n", - "# \n", - "# for column in df_synth.columns:\n", - "# if df_synth[column].dtype == np.number:\n", - "# column_pdf_synth = np.histogram(df_synth[column], bins=100, density=True)[0]\n", - "# column_pdf_real = np.histogram(df_real[column], bins=100, density=True)[0]\n", - "# else:\n", - "# column_pdf_synth = df_synth[column].value_counts(normalize=True)\n", - "# column_pdf_real = df_real[column].value_counts(normalize=True)\n", - "# \n", - "# # normalise the historigrams to make them valid probability distributions\n", - "# column_pdf_synth /= np.sum(column_pdf_synth)\n", - "# column_pdf_real /= np.sum(column_pdf_real)\n", - "# \n", - "# # compute the Jensen-Shannon distance\n", - "# jsd = distance.jensenshannon(column_pdf_synth, column_pdf_real)\n", - "# distances[column] = jsd\n", - " \n", - " " - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/plexe/internal/models/__init__.py b/plexe/internal/models/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/plexe/internal/models/callbacks/__init__.py b/plexe/internal/models/callbacks/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/plexe/internal/models/callbacks/chain_of_thought.py b/plexe/internal/models/callbacks/chain_of_thought.py deleted file mode 100644 index 377671c2..00000000 --- a/plexe/internal/models/callbacks/chain_of_thought.py +++ /dev/null @@ -1,82 +0,0 @@ -""" -Chain of Thought model callback for emitting chain of thought information - -This module provides a callback class that captures and formats the chain of thought of the agents. It is -useful for understanding at a glance the steps taken by the model during the building process. -""" - -from typing import List - -from plexe.callbacks import Callback, BuildStateInfo -from plexe.internal.common.utils.chain_of_thought.callable import ChainOfThoughtCallable -from plexe.internal.common.utils.chain_of_thought.emitters import ConsoleEmitter - - -class ChainOfThoughtModelCallback(Callback): - """ - Callback that captures and formats the chain of thought for model building. - - This callback bridges between the Plexe callback system and the - chain of thought callback system. - """ - - def __init__(self, emitter=None): - """ - Initialize the chain of thought model callback. - - Args: - emitter: The emitter to use for chain of thought output - """ - - self.cot_callable = ChainOfThoughtCallable(emitter=emitter or ConsoleEmitter()) - - def on_build_start(self, info: BuildStateInfo) -> None: - """ - Reset the chain of thought at the beginning of the build process. - """ - self.cot_callable.clear() - self.cot_callable.emitter.emit_thought("System", f"🚀 Starting model build for intent: {info.intent[:40]}...") - - def on_build_end(self, info: BuildStateInfo) -> None: - """ - Emit completion message at the end of the build process. 
- """ - self.cot_callable.emitter.emit_thought("System", "✅ Model build completed") - - def on_iteration_start(self, info: BuildStateInfo) -> None: - """ - Emit iteration start message. - """ - self.cot_callable.emitter.emit_thought("System", f"📊 Starting iteration {info.iteration + 1}") - - def on_iteration_end(self, info: BuildStateInfo) -> None: - """ - Emit iteration end message with performance metrics. - """ - if info.node and info.node.performance: - self.cot_callable.emitter.emit_thought( - "System", - f"📋 Iteration {info.iteration + 1} completed: {info.node.performance.name}={info.node.performance.value}", - ) - else: - self.cot_callable.emitter.emit_thought( - "System", f"📋 Iteration {info.iteration + 1} failed: No performance metrics available" - ) - - def get_chain_of_thought_callable(self): - """ - Get the underlying chain of thought callable. - - Returns: - The chain of thought callable used by this model callback - """ - return self.cot_callable - - def get_full_chain_of_thought(self) -> List: - """ - Get the full chain of thought captured during model building. - - Returns: - The list of steps in the chain of thought - """ - return self.cot_callable.get_full_chain_of_thought() diff --git a/plexe/internal/models/callbacks/checkpoint.py b/plexe/internal/models/callbacks/checkpoint.py deleted file mode 100644 index 9130b7fe..00000000 --- a/plexe/internal/models/callbacks/checkpoint.py +++ /dev/null @@ -1,129 +0,0 @@ -""" -Checkpoint callback for model building process in Plexe. - -This module provides a checkpoint callback implementation that saves model state -at regular intervals during the build process. -""" - -import logging -from typing import Optional - -from plexe.callbacks import Callback, BuildStateInfo -import plexe.fileio as fileio # Import the module, not individual functions - -logger = logging.getLogger(__name__) - - -class ModelCheckpointCallback(Callback): - """ - Callback that saves model state checkpoints during the build process. - - This callback periodically saves the model state after each iteration to allow - resuming a build from a checkpoint if the process is interrupted. - """ - - def __init__( - self, - keep_n_latest: Optional[int] = None, - checkpoint_dir: Optional[str] = None, - delete_on_success: Optional[bool] = None, - ): - """ - Initialize the model checkpoint callback. - - Args: - keep_n_latest: Number of most recent checkpoints to keep for this model - checkpoint_dir: Optional custom directory for checkpoints - delete_on_success: Whether to delete checkpoints when build completes successfully - """ - from plexe.config import config - - # Use provided values or defaults from config - self.keep_n_latest = keep_n_latest if keep_n_latest is not None else config.file_storage.keep_checkpoints - self.checkpoint_dir = checkpoint_dir - self.delete_on_success = ( - delete_on_success if delete_on_success is not None else config.file_storage.delete_checkpoints_on_success - ) - self.model = None - self.checkpoints = [] - - def on_build_start(self, info: BuildStateInfo) -> None: - """ - Store reference to the model on build start. - - Args: - info: Information about the model building process start - """ - # We need access to the model object to create checkpoints - # This is not directly accessible via BuildStateInfo, so we'll - # need to get a reference to it when the callback is registered with the model - pass - - def on_iteration_end(self, info: BuildStateInfo) -> None: - """ - Create a checkpoint after each iteration. 
- - Args: - info: Information about the iteration end - """ - if not hasattr(info, "model"): - logger.warning("Cannot create checkpoint: no model reference available") - return - - model = info.model - try: - # Save checkpoint with current iteration number - checkpoint_path = fileio.save_checkpoint(model, info.iteration, self.checkpoint_dir) - self.checkpoints.append(checkpoint_path) - logger.info(f"Created checkpoint at {checkpoint_path}") - - # Manage checkpoint retention - self._manage_checkpoints(model.identifier) - except Exception as e: - logger.error(f"Error creating checkpoint: {e}") - - def on_build_end(self, info: BuildStateInfo) -> None: - """ - Optionally clean up checkpoints when build completes successfully. - - Args: - info: Information about the model building process end - """ - if not self.delete_on_success: - return - - if not hasattr(info, "model"): - return - - model = info.model - - if model.state.name == "READY": - # Build completed successfully, clean up checkpoints if configured - for checkpoint in self.checkpoints: - try: - fileio.delete_checkpoint(checkpoint) - logger.info(f"Deleted checkpoint {checkpoint} after successful build") - except Exception as e: - logger.error(f"Error deleting checkpoint {checkpoint}: {e}") - - def _manage_checkpoints(self, model_id: str) -> None: - """ - Manage checkpoint retention policy. - - Args: - model_id: Model identifier to filter checkpoints - """ - # Get all checkpoints for this model - all_checkpoints = fileio.list_checkpoints(model_id) - - # Sort by modification time (newest first) - all_checkpoints.sort(key=lambda p: str(p), reverse=True) - - # Delete older checkpoints beyond our retention limit - if len(all_checkpoints) > self.keep_n_latest: - for checkpoint in all_checkpoints[self.keep_n_latest :]: - try: - fileio.delete_checkpoint(checkpoint) - logger.info(f"Deleted old checkpoint {checkpoint} (retention policy)") - except Exception as e: - logger.error(f"Error deleting old checkpoint {checkpoint}: {e}") diff --git a/plexe/internal/models/callbacks/mlflow.py b/plexe/internal/models/callbacks/mlflow.py deleted file mode 100644 index b4eebce1..00000000 --- a/plexe/internal/models/callbacks/mlflow.py +++ /dev/null @@ -1,417 +0,0 @@ -""" -MLFlow callback for tracking model building process. - -This module provides a callback implementation that logs model building -metrics, parameters, and artifacts to MLFlow. -""" - -import os -import re -import tempfile -import datetime -from pathlib import Path -from typing import Dict, Any, Optional, List - -import mlflow -import logging -import warnings - -from plexe.callbacks import Callback, BuildStateInfo -from plexe.internal.models.entities.metric import Metric - -logger = logging.getLogger(__name__) -warnings.filterwarnings("ignore", category=UserWarning, module="mlflow") - - -class MLFlowCallback(Callback): - """ - Callback that logs the model building process to MLFlow with hierarchical run organization. - - Implements nested runs with parent/child relationship: - - Parent run: Overall model building process, common parameters - - Child runs: Individual iterations with iteration-specific metrics - """ - - def __init__(self, tracking_uri: str, experiment_name: str, connect_timeout: int = 10): - """ - Initialize MLFlow callback. - - Args: - tracking_uri: MLFlow tracking server URI. - experiment_name: Name for the MLFlow experiment. - connect_timeout: Timeout in seconds for MLFlow server connection. 
- """ - self.tracking_uri = tracking_uri - self.experiment_name = experiment_name - self.experiment_id = None - self.connect_timeout = connect_timeout - self.parent_run_id = None - self._setup_mlflow() - - def _setup_mlflow(self) -> None: - """Configure MLFlow tracking and clean up any active runs.""" - try: - # End any active runs from previous sessions - if mlflow.active_run(): - mlflow.end_run() - - # Configure MLFlow environment - os.environ["MLFLOW_HTTP_REQUEST_TIMEOUT"] = str(self.connect_timeout) - mlflow.set_tracking_uri(self.tracking_uri) - - # Explicitly set the experiment first to avoid default experiment use for traces - experiment = mlflow.get_experiment_by_name(self.experiment_name) - if experiment: - self.experiment_id = experiment.experiment_id - mlflow.set_experiment(experiment_name=self.experiment_name) - else: - self.experiment_id = mlflow.create_experiment(self.experiment_name) - mlflow.set_experiment(experiment_id=self.experiment_id) - - # Enable autologging for smolagents AFTER setting experiment - try: - mlflow.smolagents.autolog() - except ModuleNotFoundError: - pass - - except Exception as e: - logger.error(f"Failed to setup MLFlow: {e}") - raise RuntimeError(f"Failed to setup MLFlow: {e}") from e - - @staticmethod - def _timestamp() -> str: - """Get formatted timestamp for runs and logs.""" - return datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - @staticmethod - def _safe_get(obj: Any, attrs: List[str], default: Any = None): - """Safely access nested attributes.""" - if obj is None: - return default - - current = obj - for attr in attrs: - if not hasattr(current, attr): - return default - current = getattr(current, attr) - return current - - def _get_or_create_experiment(self) -> str: - """ - Get or create the MLFlow experiment and return its ID. - Reuses experiment_id if already set during initialization. - """ - # If we already have an experiment ID, ensure it's active and return it - if self.experiment_id: - mlflow.set_experiment(experiment_id=self.experiment_id) - return self.experiment_id - - # Otherwise, look up by name or create - experiment = mlflow.get_experiment_by_name(self.experiment_name) - if experiment: - self.experiment_id = experiment.experiment_id - else: - # Create if not exists - self.experiment_id = mlflow.create_experiment(self.experiment_name) - - # Set the experiment as active and notify user - mlflow.set_experiment(experiment_id=self.experiment_id) - print(f"✅ MLFlow: tracking URI '{self.tracking_uri}', experiment '{self.experiment_name}'") - return self.experiment_id - - def _ensure_parent_run_active(self) -> bool: - """Ensure the parent run is active, activating it if needed.""" - if not self.parent_run_id or not self.experiment_id: - return False - - active_run = mlflow.active_run() - - # If already active and it's the parent run, we're good - if active_run and active_run.info.run_id == self.parent_run_id: - return True - - # End any active run and start the parent run - if active_run: - mlflow.end_run() - - # Ensure experiment is active before starting the run - mlflow.set_experiment(experiment_id=self.experiment_id) - - try: - mlflow.start_run(run_id=self.parent_run_id) - return True - except Exception as e: - logger.warning(f"Could not activate parent run: {e}") - return False - - @staticmethod - def _safe_log_artifact(content: str, filename: str) -> None: - """ - Safely log an artifact by writing to a temporary file first. 
- - Args: - content: Content to write to the file - filename: Name of the file in MLFlow - """ - if not mlflow.active_run() or not content: - return - - # Create unique temp file - with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=Path(filename).suffix) as tmp: - tmp_path = Path(tmp.name) - - try: - # Write content and log - with open(tmp_path, "w") as f: - f.write(content) - - mlflow.log_artifact(str(tmp_path)) - - except Exception as e: - logger.warning(f"Failed to log artifact '{filename}': {e}") - finally: - # Clean up temp file - if tmp_path.exists(): - tmp_path.unlink() - - def _extract_model_context(self, info: BuildStateInfo) -> Dict[str, Any]: - """Extract essential model context for logging.""" - context = {"intent": info.intent, "provider": str(info.provider)} - - # Add timing and iteration information - if info.timeout: - context["timeout_seconds"] = info.timeout - if info.run_timeout: - context["run_timeout_seconds"] = info.run_timeout - if info.max_iterations: - context["max_iterations"] = info.max_iterations - - # Add model ID if available - if info.model_identifier: - context["model_id"] = info.model_identifier - - # Add basic schema and dataset info - if info.input_schema: - context["input_schema_fields"] = len(info.input_schema.model_fields) - if info.output_schema: - context["output_schema_fields"] = len(info.output_schema.model_fields) - if info.datasets: - context["dataset_count"] = len(info.datasets) - - return context - - def _log_metric(self, metric: Metric, prefix: str = "", step: Optional[int] = None) -> None: - """Safely log a Plexe Metric object to MLFlow.""" - if not mlflow.active_run() or not metric: - return - - metric_name = self._safe_get(metric, ["name"]) - metric_value = self._safe_get(metric, ["value"]) - - if not metric_name or metric_value is None: - return - - # Clean metric name and convert value - clean_name = re.sub(r"[^a-zA-Z0-9_]", "", f"{prefix}{metric_name}") - - try: - # Try to log as numeric - value = float(metric_value) - if step is not None: - mlflow.log_metric(clean_name, value, step=step) - else: - mlflow.log_metric(clean_name, value) - except (ValueError, TypeError): - # If not numeric, log as tag - mlflow.set_tag(f"metric_{clean_name}", str(metric_value)) - mlflow.set_tag("non_numeric_metrics", "true") - - def on_build_start(self, info: BuildStateInfo) -> None: - """Start MLFlow parent run and log initial parameters.""" - try: - # Ensure experiment is set and active - self.experiment_id = self._get_or_create_experiment() - - # Get model info and timestamp - model_id = (info.model_identifier or "unknown")[0:12] + "..." 
- timestamp = self._timestamp() - - # End any active run before starting parent - if mlflow.active_run(): - mlflow.end_run() - - # Ensure the experiment is active before starting the run - mlflow.set_experiment(experiment_id=self.experiment_id) - - # Start parent run - parent_run = mlflow.start_run( - run_name=f"{model_id}-{timestamp}", - experiment_id=self.experiment_id, - description=f"Model building: {info.intent[:100]}...", - ) - self.parent_run_id = parent_run.info.run_id - logger.info(f"Started parent run '{parent_run.info.run_id}' in experiment '{self.experiment_name}'") - - # Log common parameters and tags - mlflow.log_params(self._extract_model_context(info)) - mlflow.set_tags({"provider": str(info.provider), "run_type": "parent", "build_timestamp": timestamp}) - - # Log intent - if info.intent: - self._safe_log_artifact(content=info.intent, filename="intent.txt") - - except Exception as e: - logger.error(f"Error starting build in MLFlow: {e}") - - def on_iteration_start(self, info: BuildStateInfo) -> None: - """Start a new nested child run for this iteration.""" - if not self.parent_run_id: - return - - try: - # Ensure experiment is active - mlflow.set_experiment(experiment_id=self.experiment_id) - - # Create nested run under the parent, letting MLflow decide the name - mlflow.start_run( - experiment_id=self.experiment_id, - nested=True, - description=f"Iteration {info.iteration}", - ) - - # Log iteration parameters - mlflow.log_params({"iteration": info.iteration}) - mlflow.set_tags( - {"run_type": "iteration", "iteration": str(info.iteration), "parent_run_id": self.parent_run_id} - ) - - # Log datasets if available - if info.datasets: - for name, data in info.datasets.items(): - try: - mlflow.log_input(mlflow.data.from_pandas(data.to_pandas(), name=name), context="training") - except Exception as e: - logger.warning(f"Could not log dataset '{name}': {e}") - - except Exception as e: - logger.error(f"Error starting iteration in MLFlow: {e}") - - def on_iteration_end(self, info: BuildStateInfo) -> None: - """Log metrics for this iteration and end the child run.""" - if not mlflow.active_run(): - return - - try: - # Process node data if available - node = info.node - if node: - # Log training code - training_code = self._safe_get(node, ["training_code"]) - if training_code: - self._safe_log_artifact( - content=training_code, filename=f"trainer_source_iteration_{info.iteration}.py" - ) - - # Log performance metrics - performance = self._safe_get(node, ["performance"]) - if performance: - self._log_metric(performance) - - # Log execution time - execution_time = self._safe_get(node, ["execution_time"]) - if execution_time: - mlflow.log_metric("execution_time", execution_time) - - # Log exception information - exception_raised = self._safe_get(node, ["exception_was_raised"], False) - if exception_raised: - exception_obj = self._safe_get(node, ["exception"]) - exception_type = type(exception_obj).__name__ if exception_obj else "unknown" - - mlflow.set_tags({"exception_raised": "true", "exception_type": exception_type}) - - # Log exception details - if exception_obj: - self._safe_log_artifact( - content=str(exception_obj), filename=f"exception-iteration-{info.iteration}.txt" - ) - - # Log model artifacts - artifacts = self._safe_get(node, ["model_artifacts"], []) - for artifact in artifacts: - if artifact.is_path() and Path(artifact.path).exists(): - try: - mlflow.log_artifact(str(artifact)) - except Exception: - pass - - # Determine run status - status = "FINISHED" - performance = 
self._safe_get(node, ["performance"]) - if ( - self._safe_get(node, ["exception_was_raised"], False) - or performance is None - or (hasattr(performance, "is_worst") and performance.is_worst) - ): - status = "FAILED" - - # End the child run - mlflow.end_run(status=status) - - except Exception as e: - logger.error(f"Error ending iteration in MLFlow: {e}") - try: - mlflow.end_run(status="FAILED") - except Exception: - pass - - def on_build_end(self, info: BuildStateInfo) -> None: - """Log final model details and end MLFlow parent run.""" - try: - # End any active child run first - active_run = mlflow.active_run() - if active_run and active_run.info.run_id != self.parent_run_id: - mlflow.end_run() - - # Ensure parent run is active - if not self._ensure_parent_run_active(): - return - - # Log EDA reports - node_metadata = self._safe_get(info.node, ["metadata"], {}) - if node_metadata and "eda_markdown_reports" in node_metadata: - for dataset_name, report_markdown in node_metadata["eda_markdown_reports"].items(): - self._safe_log_artifact(content=report_markdown, filename=f"eda_report_{dataset_name}.md") - - # Log model information - if info.final_metric and hasattr(info.final_metric, "name") and hasattr(info.final_metric, "value"): - mlflow.log_metric(f"best_{info.final_metric.name}", float(info.final_metric.value)) - - # Log model artifacts and status - mlflow.set_tag("best_iteration", str(info.iteration)) - - # Log artifact names - if info.final_artifacts: - artifact_names = [a.name for a in info.final_artifacts] - mlflow.set_tag("model_artifacts", ", ".join(artifact_names)) - - # Log model state - if info.model_state: - mlflow.set_tag("final_model_state", str(info.model_state)) - - # Log final model code - if info.trainer_source: - self._safe_log_artifact(content=info.trainer_source, filename="final_trainer.py") - - if info.predictor_source: - self._safe_log_artifact(content=info.predictor_source, filename="final_predictor.py") - - # End the parent run - mlflow.end_run() - - except Exception as e: - logger.error(f"Error finalizing build in MLFlow: {e}") - try: - mlflow.end_run() - except Exception: - pass diff --git a/plexe/internal/models/entities/__init__.py b/plexe/internal/models/entities/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/plexe/internal/models/entities/artifact.py b/plexe/internal/models/entities/artifact.py deleted file mode 100644 index d6ff1f08..00000000 --- a/plexe/internal/models/entities/artifact.py +++ /dev/null @@ -1,77 +0,0 @@ -""" -This module defines the "Artifact" dataclass, a simple representation of an external artifact. - -An "external artifact" is a text or binary entity that is used by a model. The canonical example is -a blob containing the weights of a neural network. The Artifact class can be used either to point to -a file on disk by its path, or to hold the raw text or binary data in memory itself. -""" - -import io -from typing import BinaryIO, Union -from pathlib import Path - - -class Artifact: - """ - Represents a model artifact, which can either be a file path or raw text/binary data. - - When the artifact holds a path pointing to a file, the 'is_path' property is true and the path can - be accessed via the 'path' attribute. When the artifact holds raw data, the 'is_data' property is true - and the data can be accessed via the 'data' attribute. In either case, the 'type' property indicates - whether the data (in memory or in the file) is text or binary. 
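Editor's note: the MLFlowCallback deleted above organizes tracking as one parent run for the whole build plus a nested child run per iteration. Below is a minimal sketch of that pattern against the plain mlflow API; the tracking URI, experiment name, parameters, and metric values are illustrative.

import mlflow

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("plexe-builds")

# Parent run covers the whole build; each iteration gets its own nested child run.
with mlflow.start_run(run_name="model-abc-20250101"):
    mlflow.log_params({"intent": "predict churn", "max_iterations": 3})
    for i in range(3):
        with mlflow.start_run(nested=True, description=f"Iteration {i}"):
            mlflow.log_metric("accuracy", 0.80 + 0.05 * i, step=i)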
- """ - - def __init__(self, name: str, path: Path = None, handle: BinaryIO = None, data: bytes = None): - self.name: str = name - self.path: Path = path - self.handle: BinaryIO = handle - self.data: bytes = data - - if sum([path is not None, handle is not None, data is not None]) != 1: - raise ValueError("Exactly one of 'handle', 'path', or 'data' must be provided.") - - def is_path(self) -> bool: - """ - True if the artifact is a file path. - """ - return self.path is not None - - def is_handle(self) -> bool: - """ - True if the artifact is file path or file-like object. - """ - return self.handle is not None - - def is_data(self) -> bool: - """ - True if the artifact is a string or bytes object loaded in memory. - """ - return self.data is not None - - def get_as_handle(self) -> BinaryIO: - """ - Get the artifact as a file-like object. - """ - if self.is_handle(): - return self.handle - elif self.is_path(): - return open(self.path, "rb") - elif self.is_data(): - return io.BytesIO(self.data) - else: - raise ValueError("Artifact does not have a valid handle, path, or data.") - - @staticmethod - def from_path(path: Union[str, Path]): - """ - Create an Artifact instance from a file path. - """ - path = Path(path) - return Artifact(name=path.name, path=path) - - @staticmethod - def from_data(name: str, data: bytes): - """ - Create an Artifact instance from an in-memory sequence of bytes. - """ - return Artifact(name=name, data=data) diff --git a/plexe/internal/models/entities/code.py b/plexe/internal/models/entities/code.py deleted file mode 100644 index f9d6b602..00000000 --- a/plexe/internal/models/entities/code.py +++ /dev/null @@ -1,13 +0,0 @@ -""" -This module defines a Code dataclass for representing code objects passed around by agents. -""" - -from dataclasses import dataclass, field - - -@dataclass -class Code: - """Represents a code object.""" - - code: str = field() - performance: float = field(default=None) diff --git a/plexe/internal/models/entities/description.py b/plexe/internal/models/entities/description.py deleted file mode 100644 index d0885c9d..00000000 --- a/plexe/internal/models/entities/description.py +++ /dev/null @@ -1,188 +0,0 @@ -""" -This module defines dataclasses for structured model descriptions. - -These classes provide a comprehensive representation of a model's -characteristics, including schemas, implementation details, performance -metrics, and source code, organized in a structured format suitable -for various output formats and visualization purposes. 
-""" - -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional - -from dataclasses_json import DataClassJsonMixin - - -@dataclass -class SchemaInfo(DataClassJsonMixin): - """Information about the model's input and output schemas.""" - - input: Dict[str, Any] - output: Dict[str, Any] - - -@dataclass -class ImplementationInfo(DataClassJsonMixin): - """Technical information about the model implementation.""" - - framework: Optional[str] = None - model_type: Optional[str] = None - artifacts: List[str] = field(default_factory=list) - size: Optional[int] = None - - -@dataclass -class PerformanceInfo(DataClassJsonMixin): - """Performance metrics and training data information.""" - - metrics: Dict[str, Any] = field(default_factory=dict) - training_data_info: Dict[str, Dict[str, Any]] = field(default_factory=dict) - - -@dataclass -class CodeInfo(DataClassJsonMixin): - """Information about the model's source code.""" - - training: Optional[str] = None - prediction: Optional[str] = None - feature_transformations: Optional[str] = None - - -@dataclass -class ModelDescription(DataClassJsonMixin): - """A comprehensive description of a model.""" - - id: str - state: str - intent: str - schemas: SchemaInfo - implementation: ImplementationInfo - performance: PerformanceInfo - code: CodeInfo - training_date: Optional[str] = None - rationale: Optional[str] = None - provider: Optional[str] = None - task_type: Optional[str] = None - domain: Optional[str] = None - behavior: Optional[str] = None - preprocessing_summary: Optional[str] = None - architecture_summary: Optional[str] = None - training_procedure: Optional[str] = None - evaluation_metric: Optional[str] = None - inference_behavior: Optional[str] = None - strengths: Optional[str] = None - limitations: Optional[str] = None - - def as_text(self) -> str: - """Convert the model description to a formatted text string.""" - # Simple text representation - lines = [ - f"Model: {self.id}", - f"State: {self.state}", - f"Intent: {self.intent}", - f"Training Date: {self.training_date or 'Not available'}", - f"Built with: {self.provider or 'Unknown provider'}", - "", - "Input Schema:", - "\n".join(f" - {k}: {v}" for k, v in self.schemas.input.items()), - "", - "Output Schema:", - "\n".join(f" - {k}: {v}" for k, v in self.schemas.output.items()), - "", - "Implementation:", - f" - Framework: {self.implementation.framework or 'Not specified'}", - f" - Model Type: {self.implementation.model_type or 'Not specified'}", - f" - Size: {self.implementation.size or 'Unknown'} bytes", - "", - "Task Information:", - f" - Task Type: {self.task_type or 'Not specified'}", - f" - Domain: {self.domain or 'Not specified'}", - f" - Behavior: {self.behavior or 'Not specified'}", - "", - "Technical Details:", - f" - Preprocessing: {self.preprocessing_summary or 'Not available'}", - f" - Architecture: {self.architecture_summary or 'Not available'}", - f" - Training Procedure: {self.training_procedure or 'Not available'}", - f" - Evaluation Metric: {self.evaluation_metric or 'Not available'}", - f" - Inference Behavior: {self.inference_behavior or 'Not available'}", - "", - "Analysis:", - f" - Strengths: {self.strengths or 'Not available'}", - f" - Limitations: {self.limitations or 'Not available'}", - "", - "Performance Metrics:", - "\n".join(f" - {k}: {v}" for k, v in self.performance.metrics.items()), - "", - "Model Code:", - " - Training Code:", - f" ```python\n{self.code.training or '# No training code available'}\n```", - " - Prediction Code:", - f" 
```python\n{self.code.prediction or '# No prediction code available'}\n```", - " - Feature Transformation Code:", - f" ```python\n{self.code.feature_transformations or '# No feature transformation code available'}\n```", - "", - "Rationale:", - self.rationale or "Not available", - ] - return "\n".join(lines) - - def as_markdown(self) -> str: - """Convert the model description to a markdown string.""" - # Markdown representation with formatting - lines = [ - f"# Model: {self.id}", - "", - f"**State:** {self.state}", - f"**Intent:** {self.intent}", - f"**Training Date:** {self.training_date or 'Not available'}", - f"**Built with:** {self.provider or 'Unknown provider'}", - "", - "## Input Schema", - "\n".join(f"- `{k}`: {v}" for k, v in self.schemas.input.items()), - "", - "## Output Schema", - "\n".join(f"- `{k}`: {v}" for k, v in self.schemas.output.items()), - "", - "## Implementation", - f"- **Framework:** {self.implementation.framework or 'Not specified'}", - f"- **Model Type:** {self.implementation.model_type or 'Not specified'}", - f"- **Size:** {self.implementation.size or 'Unknown'} bytes", - f"- **Artifacts:** {', '.join(self.implementation.artifacts) or 'None'}", - "", - "## Task Information", - f"- **Task Type:** {self.task_type or 'Not specified'}", - f"- **Domain:** {self.domain or 'Not specified'}", - f"- **Behavior:** {self.behavior or 'Not specified'}", - "", - "## Technical Details", - f"- **Preprocessing:** {self.preprocessing_summary or 'Not available'}", - f"- **Architecture:** {self.architecture_summary or 'Not available'}", - f"- **Training Procedure:** {self.training_procedure or 'Not available'}", - f"- **Evaluation Metric:** {self.evaluation_metric or 'Not available'}", - f"- **Inference Behavior:** {self.inference_behavior or 'Not available'}", - "", - "## Analysis", - f"- **Strengths:** {self.strengths or 'Not available'}", - f"- **Limitations:** {self.limitations or 'Not available'}", - "", - "## Performance Metrics", - "\n".join(f"- **{k}:** {v}" for k, v in self.performance.metrics.items()), - "", - "## Model Code", - "### Training Code", - "```python", - self.code.training or "# No training code available", - "```", - "### Prediction Code", - "```python", - self.code.prediction or "# No prediction code available", - "```", - "### Feature Transformation Code", - "```python", - self.code.feature_transformations or "# No feature transformation code available", - "```", - "", - "## Rationale", - self.rationale or "_Not available_", - ] - return "\n".join(lines) diff --git a/plexe/internal/models/entities/metric.py b/plexe/internal/models/entities/metric.py deleted file mode 100644 index f5355984..00000000 --- a/plexe/internal/models/entities/metric.py +++ /dev/null @@ -1,214 +0,0 @@ -""" -Module: plexe/internal/common/dataclasses/metric - -This module defines classes for handling and comparing metrics in a flexible and extensible way. - -Classes: - - ComparisonMethod: Enum defining methods for comparing metrics. - - MetricComparator: Encapsulates comparison logic for metrics, including methods like higher-is-better, - lower-is-better, and target-is-better. - - Metric: Represents a specific metric with a name, value, and comparator, allowing metrics to be compared - and evaluated. 
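Editor's note: a short construction sketch for the ModelDescription dataclasses removed above, showing how the text/markdown renderers are driven; all field values are illustrative.

from plexe.internal.models.entities.description import (
    CodeInfo,
    ImplementationInfo,
    ModelDescription,
    PerformanceInfo,
    SchemaInfo,
)

desc = ModelDescription(
    id="model-123",
    state="READY",
    intent="predict house prices from basic property features",
    schemas=SchemaInfo(input={"sqft": "int", "bedrooms": "int"}, output={"price": "float"}),
    implementation=ImplementationInfo(framework="scikit-learn", model_type="RandomForest"),
    performance=PerformanceInfo(metrics={"rmse": 12500.0}),
    code=CodeInfo(training="# training code omitted", prediction="# prediction code omitted"),
)

print(desc.as_markdown())   # renders the markdown model card defined above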
- -Example Usage: - from metric_class import Metric, MetricComparator, ComparisonMethod - - comparator = MetricComparator(ComparisonMethod.HIGHER_IS_BETTER) - metric1 = Metric(name="accuracy", value=0.8, comparator=comparator) - metric2 = Metric(name="accuracy", value=0.9, comparator=comparator) - - print(metric1 < metric2) # True -""" - -from enum import Enum -from functools import total_ordering - - -class ComparisonMethod(Enum): - """ - Defines methods for comparing metrics. - - Attributes: - HIGHER_IS_BETTER: Indicates that higher values are better. - LOWER_IS_BETTER: Indicates that lower values are better. - TARGET_IS_BETTER: Indicates that values closer to a target are better. - """ - - HIGHER_IS_BETTER = "higher_is_better" - LOWER_IS_BETTER = "lower_is_better" - TARGET_IS_BETTER = "target_is_better" - - -class MetricComparator: - """ - Encapsulates comparison logic for metrics. - - Attributes: - comparison_method (ComparisonMethod): The method used to compare metrics. - target (float, optional): The target value for TARGET_IS_BETTER comparisons. - """ - - def __init__(self, comparison_method: ComparisonMethod, target: float = None, epsilon: float = 1e-9): - """ - Initializes the MetricComparator. - - :param comparison_method: The method to compare metric values. - :param target: The target value for TARGET_IS_BETTER comparisons (optional). - :param epsilon: The tolerance for floating-point error in TARGET_IS_BETTER comparisons (default: 1e-9). - :raises ValueError: If TARGET_IS_BETTER is used without a target value. - """ - self.comparison_method = comparison_method - self.target = target if comparison_method == ComparisonMethod.TARGET_IS_BETTER else None - self.epsilon = epsilon - - if self.comparison_method == ComparisonMethod.TARGET_IS_BETTER and self.target is None: - raise ValueError("'TARGET_IS_BETTER' comparison requires a target value.") - if self.comparison_method == ComparisonMethod.TARGET_IS_BETTER and not isinstance(self.target, (float, int)): - raise ValueError("'TARGET_IS_BETTER' requires a numeric target value.") - - def compare(self, value1: float, value2: float) -> int: - """ - Compare two metric values based on the defined comparison method. - - :param value1: The first metric value. - :param value2: The second metric value. - :return: -1 if value1 is better, 1 if value2 is better, 0 if they are equal. - :raises ValueError: If an invalid comparison method is used. - """ - if value1 is None and value2 is None: - return 0 - elif value1 is None: - return 1 - elif value2 is None: - return -1 - elif self.comparison_method == ComparisonMethod.HIGHER_IS_BETTER: - return (value2 > value1 + self.epsilon) - (value1 > value2 + self.epsilon) - elif self.comparison_method == ComparisonMethod.LOWER_IS_BETTER: - return (value1 > value2 + self.epsilon) - (value2 > value1 + self.epsilon) - elif self.comparison_method == ComparisonMethod.TARGET_IS_BETTER: - dist1 = abs(value1 - self.target) - dist2 = abs(value2 - self.target) - if dist1 > dist2 + self.epsilon: - return 1 - elif dist2 > dist1 + self.epsilon: - return -1 - else: - return 0 - else: - raise ValueError("Invalid comparison method.") - - -# todo: this class is a mess as it mixes concerns of a metric and a metric value; needs refactoring -@total_ordering -class Metric: - """ - Represents a metric with a name, a value, and a comparator for determining which metric is better. - - Attributes: - name (str): The name of the metric (e.g., 'accuracy', 'loss'). - value (float): The numeric value of the metric. 
- comparator (MetricComparator): The comparison logic for the metric. - """ - - def __init__(self, name: str, value: float = None, comparator: MetricComparator = None, is_worst: bool = False): - """ - Initializes a Metric object. - - :param name: The name of the metric. - :param value: The numeric value of the metric. - :param comparator: An instance of MetricComparator for comparison logic. - :param is_worst: Indicates if the metric value is the worst possible value. - """ - self.name = name - self.value = value - self.comparator = comparator - self.is_worst = is_worst or value is None - - def __gt__(self, other) -> bool: - """ - Determine if this metric is better than another metric. - - :param other: Another Metric object to compare against. - :return: True if this metric is better, False otherwise. - :raises ValueError: If the metrics have different names or comparison methods. - """ - if not isinstance(other, Metric): - return NotImplemented - - if self.is_worst or (self.is_worst and other.is_worst): - return False - - if other.is_worst: - return True - - if self.name != other.name: - raise ValueError("Cannot compare metrics with different names.") - - if self.comparator.comparison_method != other.comparator.comparison_method: - raise ValueError("Cannot compare metrics with different comparison methods.") - - if ( - self.comparator.comparison_method == ComparisonMethod.TARGET_IS_BETTER - and self.comparator.target != other.comparator.target - ): - raise ValueError("Cannot compare 'TARGET_IS_BETTER' metrics with different target values.") - - return self.comparator.compare(self.value, other.value) < 0 - - def __eq__(self, other) -> bool: - """ - Check if this metric is equal to another metric. - - :param other: Another Metric object to compare against. - :return: True if the metrics are equal, False otherwise. - """ - if not isinstance(other, Metric): - return NotImplemented - - if self.is_worst and other.is_worst: - return True - - if self.is_worst or other.is_worst: - return False - - return ( - self.name == other.name - and self.comparator.comparison_method == other.comparator.comparison_method - and self.comparator.compare(self.value, other.value) == 0 - ) - - def __repr__(self) -> str: - """ - Return a string representation of the Metric object. - - :return: A string representation of the Metric. - """ - target_str = ( - f", target={self.comparator.target}" - if self.comparator.comparison_method == ComparisonMethod.TARGET_IS_BETTER - else "" - ) - return f"Metric(name={self.name!r}, value={self.value}, comparison={self.comparator.comparison_method.name}{target_str})" - - def __str__(self) -> str: - """ - Return a user-friendly string representation of the Metric. - - :return: A string describing the Metric. - """ - comparison_symbols = { - ComparisonMethod.HIGHER_IS_BETTER: "↑", - ComparisonMethod.LOWER_IS_BETTER: "↓", - ComparisonMethod.TARGET_IS_BETTER: "≈", - } - symbol = comparison_symbols.get(self.comparator.comparison_method, "?") - return f"Metric {self.name} {symbol} {self.value}" - - @property - def is_valid(self) -> bool: - """ - Check if the metric value is valid (i.e., not None or NaN). - - :return: True if the metric value is valid, False otherwise. 
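Editor's note: a short sketch of the comparison semantics implemented by Metric and MetricComparator above; metric names and values are illustrative.

from plexe.internal.models.entities.metric import ComparisonMethod, Metric, MetricComparator

# Lower-is-better: the smaller RMSE wins the comparison.
lower = MetricComparator(ComparisonMethod.LOWER_IS_BETTER)
assert Metric("rmse", 0.37, lower) > Metric("rmse", 0.42, lower)

# Target-is-better: the value closest to the target wins; a numeric target is required.
target = MetricComparator(ComparisonMethod.TARGET_IS_BETTER, target=1.0)
assert Metric("ratio", 1.05, target) > Metric("ratio", 0.80, target)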
- """ - return self.value is not None and not (self.value != self.value) # NaN check diff --git a/plexe/internal/models/execution/__init__.py b/plexe/internal/models/execution/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/plexe/internal/models/execution/docker_executor.py b/plexe/internal/models/execution/docker_executor.py deleted file mode 100644 index bbd408b3..00000000 --- a/plexe/internal/models/execution/docker_executor.py +++ /dev/null @@ -1,22 +0,0 @@ -from typing import Any - -from plexe.internal.models.execution.executor import Executor, ExecutionResult - - -class DockerExecutor(Executor): - """ - Execute Python code snippets in an isolated Docker container. - - The `DockerExecutor` class implements the `Executor` interface, allowing Python code - snippets to be executed in an isolated Docker container with strict isolation, output capture, - and timeout enforcement. - """ - - def __init__(self, code: str, timeout: int = 3600, **kwargs: Any) -> None: - raise NotImplementedError("DockerExecutor is not yet implemented") - - def run(self) -> ExecutionResult: - raise NotImplementedError("DockerExecutor is not yet implemented") - - def cleanup(self) -> None: - raise NotImplementedError("DockerExecutor is not yet implemented") diff --git a/plexe/internal/models/execution/executor.py b/plexe/internal/models/execution/executor.py deleted file mode 100644 index f3811271..00000000 --- a/plexe/internal/models/execution/executor.py +++ /dev/null @@ -1,63 +0,0 @@ -from abc import ABC, abstractmethod -from dataclasses import dataclass, field -from typing import Any, Optional, List -from pathlib import Path - - -@dataclass -class ExecutionResult: - """ - Result of executing code in an environment. - - Attributes: - term_out (list[str]): The terminal output from the execution. - exec_time (float): The time taken to execute the code. - """ - - term_out: list[str] - exec_time: float - model_artifact_paths: List[Path | str] = field(default_factory=list) - exception: Exception = field(default=None) - performance: Optional[float] = field(default=None) - - def is_valid_performance(self) -> bool: - """Validate if performance metric is usable.""" - return ( - self.performance is not None - and isinstance(self.performance, (int, float)) - and self.performance not in [float("inf"), float("-inf")] - ) - - -class Executor(ABC): - """ - Abstract base class for code execution environments. - """ - - @abstractmethod - def __init__(self, code: str, timeout: int = 3600, **kwargs: Any) -> None: - """ - Initialise the executor. - - :param: [str] code: The code to execute. - :param: [int] timeout: Maximum execution time in seconds. Defaults to 3600. - :param: [Any] kwargs: Additional parameters specific to the implementation. - """ - self.code = code - self.timeout = timeout - - @abstractmethod - def run(self) -> ExecutionResult: - """ - Execute the code in the defined environment. - - :return: [ExecutionResult] The results of execution, including output and errors. - """ - pass - - @abstractmethod - def cleanup(self) -> None: - """ - Perform any necessary cleanup (e.g., terminate processes, remove temporary files). 
- """ - pass diff --git a/plexe/internal/models/execution/process_executor.py b/plexe/internal/models/execution/process_executor.py deleted file mode 100644 index 3f841e6e..00000000 --- a/plexe/internal/models/execution/process_executor.py +++ /dev/null @@ -1,202 +0,0 @@ -""" -Module: ProcessExecutor for Isolated Python Code Execution - -This module provides an implementation of the `Executor` interface for executing Python code snippets -in an isolated process. It captures stdout, stderr, exceptions, and stack traces, and enforces -timeout limits on execution. - -Classes: - - RedirectQueue: A helper class to redirect stdout and stderr to a multiprocessing Queue. - - ProcessExecutor: A class to execute Python code snippets in an isolated process. - -Usage: - Create an instance of `ProcessExecutor`, providing the Python code, working directory, and timeout. - Call the `run` method to execute the code and return the results in an `ExecutionResult` object. - -Exceptions: - - Raises `RuntimeError` if the child process fails unexpectedly. - -""" - -import logging -import subprocess -import sys -import time -import pyarrow.parquet as pq -import pyarrow as pa -from pathlib import Path -from typing import Dict - -from plexe.internal.common.datasets.interface import TabularConvertible -from plexe.internal.common.utils.response import extract_performance -from plexe.internal.models.execution.executor import ExecutionResult, Executor -from plexe.config import config - -logger = logging.getLogger(__name__) - - -class ProcessExecutor(Executor): - """ - Execute Python code snippets in an isolated process. - - The `ProcessExecutor` class implements the `Executor` interface, allowing Python code - snippets to be executed with strict isolation, output capture, and timeout enforcement. - """ - - def __init__( - self, - execution_id: str, - code: str, - working_dir: Path | str, - datasets: Dict[str, TabularConvertible], - timeout: int, - code_execution_file_name: str = config.execution.runfile_name, - ): - """ - Initialize the ProcessExecutor. - - Args: - execution_id (str): Unique identifier for this execution. - code (str): The Python code to execute. - working_dir (Path | str): The working directory for execution. - datasets (Dict[str, TabularConvertible]): Datasets to be used for execution. - timeout (int): The maximum allowed execution time in seconds. - code_execution_file_name (str): The filename to use for the executed script. 
- """ - super().__init__(code, timeout) - # Create a unique working directory for this execution - self.working_dir = Path(working_dir).resolve() / execution_id - self.working_dir.mkdir(parents=True, exist_ok=True) - # Set the file names for the code and training data - self.code_file_name = code_execution_file_name - self.datasets = datasets - # Keep track of resources for cleanup - self.dataset_files = [] - self.code_file = None - self.process = None - - def run(self) -> ExecutionResult: - """Execute code in a subprocess and return results.""" - logger.debug(f"ProcessExecutor is executing code with working directory: {self.working_dir}") - start_time = time.time() - - try: - # Write code to file with module environment setup - self.code_file = self.working_dir / self.code_file_name - module_setup = "import os\n" "import sys\n" "from pathlib import Path\n\n" - with open(self.code_file, "w", encoding="utf-8") as f: - f.write(module_setup + self.code) - - # Write datasets to files - self.dataset_files = [] - for dataset_name, dataset in self.datasets.items(): - dataset_file: Path = self.working_dir / f"{dataset_name}.parquet" - pq.write_table(pa.Table.from_pandas(df=dataset.to_pandas()), dataset_file) - self.dataset_files.append(dataset_file) - - # Execute the code in a subprocess - self.process = subprocess.Popen( - [sys.executable, str(self.code_file)], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - cwd=str(self.working_dir), - text=True, - ) - - stdout, stderr = self.process.communicate(timeout=self.timeout) - exec_time = time.time() - start_time - - # Collect all model artifacts created by the execution - not code or datasets - model_artifacts = [] - model_dir = self.working_dir / "model_files" - if model_dir.exists() and model_dir.is_dir(): - model_artifacts.append(str(model_dir)) - else: - # If model_files directory doesn't exist, collect individual files - for file in self.working_dir.iterdir(): - if file != self.code_file and file not in self.dataset_files: - model_artifacts.append(str(file)) - - if self.process.returncode != 0: - return ExecutionResult( - term_out=[stdout], - exec_time=exec_time, - exception=RuntimeError(f"Process exited with code {self.process.returncode}: {stderr}"), - model_artifact_paths=model_artifacts, - ) - - # Extract performance and create result - return ExecutionResult( - term_out=[stdout], - exec_time=exec_time, - model_artifact_paths=model_artifacts, - performance=extract_performance(stdout), - ) - - except subprocess.TimeoutExpired: - if self.process: - self.process.kill() - - return ExecutionResult( - term_out=[], - exec_time=self.timeout, - exception=TimeoutError( - f"Execution exceeded {self.timeout}s timeout - individual run timeout limit reached" - ), - ) - except Exception as e: - stdout, stderr = "", "" - - if self.process: - # Try to collect any output that was produced before the exception - try: - if hasattr(self.process, "stdout") and self.process.stdout: - stdout = self.process.stdout.read() or "" - except Exception: - pass # Best effort to get output - - self.process.kill() - - return ExecutionResult( - term_out=[stdout or f"Process failed with exception: {str(e)}"], - exec_time=time.time() - start_time, - exception=e, - ) - finally: - # Always clean up resources regardless of execution path - self.cleanup() - - def cleanup(self): - """ - Clean up resources after execution while preserving model artifacts. 
- """ - logger.debug(f"Cleaning up resources for execution in {self.working_dir}") - - try: - # Clean up dataset files - for dataset_file in self.dataset_files: - dataset_file.unlink(missing_ok=True) - - # Clean up code file - if self.code_file: - try: - self.code_file.unlink(missing_ok=True) - except AttributeError: - # Python 3.7 compatibility - missing_ok not available - if self.code_file.exists(): - self.code_file.unlink() - - # Terminate process if still running - if self.process and self.process.poll() is None: - self.process.kill() - - except Exception as e: - logger.warning(f"Error during resource cleanup: {str(e)}") - - def __del__(self): - """Ensure cleanup happens when the object is garbage collected.""" - try: - self.cleanup() - except Exception: - # Silent failure during garbage collection - detailed logging already done in cleanup() - pass diff --git a/plexe/internal/models/execution/ray_executor.py b/plexe/internal/models/execution/ray_executor.py deleted file mode 100644 index e682c483..00000000 --- a/plexe/internal/models/execution/ray_executor.py +++ /dev/null @@ -1,182 +0,0 @@ -"""Module: RayExecutor for Distributed Python Code Execution - -This module provides an implementation of the `Executor` interface for executing Python code snippets -in a distributed Ray cluster. It leverages Ray to run code remotely, with support for parallel execution. - -Classes: - - RayExecutor: A class to execute Python code snippets in a Ray cluster. - -Usage: - Create an instance of `RayExecutor`, providing the Python code, working directory, and timeout. - Call the `run` method to execute the code on the Ray cluster and return the results in an - `ExecutionResult` object. -""" - -import logging -import time -import ray -import pyarrow.parquet as pq -import pyarrow as pa -from pathlib import Path -from typing import Dict, List - -from plexe.internal.common.datasets.interface import TabularConvertible -from plexe.internal.common.utils.response import extract_performance -from plexe.internal.models.execution.executor import ExecutionResult, Executor -from plexe.config import config - -logger = logging.getLogger(__name__) - - -@ray.remote -def _run_code(code: str, working_dir: str, dataset_files: List[str], timeout: int) -> dict: - """Ray remote function that executes the code.""" - import subprocess - import sys - from pathlib import Path - - working_dir = Path(working_dir) - code_file = working_dir / "run.py" - - # Write code to file - with open(code_file, "w", encoding="utf-8") as f: - f.write("import os\nimport sys\nfrom pathlib import Path\n\n" + code) - - start_time = time.time() - process = subprocess.Popen( - [sys.executable, str(code_file)], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - cwd=str(working_dir), - text=True, - ) - - try: - stdout, stderr = process.communicate(timeout=timeout) - exec_time = time.time() - start_time - - # Collect model artifacts - model_artifacts = [] - model_dir = working_dir / "model_files" - if model_dir.exists() and model_dir.is_dir(): - model_artifacts.append(str(model_dir)) - else: - for file in working_dir.iterdir(): - if file != code_file and str(file) not in dataset_files: - model_artifacts.append(str(file)) - - return { - "stdout": stdout, - "stderr": stderr, - "returncode": process.returncode, - "exec_time": exec_time, - "model_artifacts": model_artifacts, - } - except subprocess.TimeoutExpired: - process.kill() - return { - "stdout": "", - "stderr": f"Execution exceeded {timeout}s timeout", - "returncode": -1, - "exec_time": timeout, - 
"model_artifacts": [], - } - - -class RayExecutor(Executor): - """Execute Python code snippets on a Ray cluster.""" - - _ray_was_used = False - - def __init__( - self, - execution_id: str, - code: str, - working_dir: Path | str, - datasets: Dict[str, TabularConvertible], - timeout: int, - code_execution_file_name: str = config.execution.runfile_name, - ): - """Initialize the RayExecutor. - - Args: - execution_id (str): Unique ID for this execution. - code (str): The Python code to execute. - working_dir (Path | str): The working directory for execution. - datasets (Dict[str, TabularConvertible]): The datasets to be used. - timeout (int): The maximum allowed execution time in seconds. - code_execution_file_name (str): The filename to use for the executed script. - """ - RayExecutor._ray_was_used = True - super().__init__(code, timeout) - self.working_dir = Path(working_dir).resolve() / execution_id - self.working_dir.mkdir(parents=True, exist_ok=True) - self.code_file_name = code_execution_file_name - self.dataset = datasets - - # Initialize Ray if not already done - if not ray.is_initialized(): - from plexe.config import config - - ray.init( - address=getattr(config.ray, "address", None) if hasattr(config, "ray") else None, - num_cpus=getattr(config.ray, "num_cpus", None) if hasattr(config, "ray") else None, - num_gpus=getattr(config.ray, "num_gpus", None) if hasattr(config, "ray") else None, - ignore_reinit_error=True, - ) - - def run(self) -> ExecutionResult: - """Execute code using Ray and return results.""" - logger.debug(f"RayExecutor is executing code with working directory: {self.working_dir}") - - # Write datasets to files - dataset_files = [] - for dataset_name, dataset in self.dataset.items(): - dataset_file = self.working_dir / f"{dataset_name}.parquet" - pq.write_table(pa.Table.from_pandas(df=dataset.to_pandas()), dataset_file) - dataset_files.append(str(dataset_file)) - - try: - # Execute the code using Ray - result_ref = _run_code.remote(self.code, str(self.working_dir), dataset_files, self.timeout) - # Wait for result with timeout - ready_refs, remaining_refs = ray.wait([result_ref], timeout=self.timeout) - - # If no ready refs, we hit a timeout - if not ready_refs: - ray.cancel(result_ref, force=True) - return ExecutionResult( - term_out=[], - exec_time=self.timeout, - exception=TimeoutError(f"Execution exceeded {self.timeout}s timeout - Ray timeout reached"), - ) - - # Get the result from the completed task - result = ray.get(ready_refs[0]) - - if result["returncode"] != 0: - return ExecutionResult( - term_out=[result["stdout"]], - exec_time=result["exec_time"], - exception=RuntimeError(result["stderr"]), - model_artifact_paths=result["model_artifacts"], - ) - - return ExecutionResult( - term_out=[result["stdout"]], - exec_time=result["exec_time"], - model_artifact_paths=result["model_artifacts"], - performance=extract_performance(result["stdout"]), - ) - - except ray.exceptions.GetTimeoutError: - ray.cancel(result_ref, force=True) - return ExecutionResult( - term_out=[], - exec_time=self.timeout, - exception=TimeoutError(f"Execution exceeded {self.timeout}s timeout - Ray timeout reached"), - ) - - def cleanup(self) -> None: - """Clean up Ray resources if needed.""" - pass diff --git a/plexe/internal/models/generation/__init__.py b/plexe/internal/models/generation/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/plexe/internal/models/generation/planning.py b/plexe/internal/models/generation/planning.py deleted file mode 100644 index 
c68d49b1..00000000 --- a/plexe/internal/models/generation/planning.py +++ /dev/null @@ -1,62 +0,0 @@ -# plexe/internal/models/generation/planning.py - -""" -This module provides functions and classes for generating and planning solutions for machine learning problems. -""" - -import json -import logging - -from pydantic import BaseModel - -from plexe.config import prompt_templates -from plexe.internal.common.provider import Provider -from plexe.internal.models.entities.metric import Metric, MetricComparator, ComparisonMethod - -logger = logging.getLogger(__name__) - - -class SolutionPlanGenerator: - """ - A class to generate solution plans for given problem statements. - """ - - def __init__(self, provider: Provider): - """ - Initializes the SolutionPlanGenerator with an empty context. - """ - self.provider: Provider = provider - - def select_target_metric(self, problem_statement: str) -> Metric: - """ - Selects the metric to optimise for the given problem statement and dataset. - - :param problem_statement: definition of the problem - :return: the metric to optimise - """ - - class MetricResponse(BaseModel): - name: str - comparison_method: ComparisonMethod - comparison_target: float = None - - response: MetricResponse = MetricResponse( - **json.loads( - self.provider.query( - system_message=prompt_templates.planning_system(), - user_message=prompt_templates.planning_select_metric( - problem_statement=problem_statement, - ), - response_format=MetricResponse, - ) - ) - ) - - try: - return Metric( - name=response.name, - value=float("inf") if response.comparison_method == ComparisonMethod.LOWER_IS_BETTER else -float("inf"), - comparator=MetricComparator(response.comparison_method, response.comparison_target), - ) - except Exception as e: - raise ValueError(f"Could not determine optimization metric from problem statement: {response}") from e diff --git a/plexe/internal/models/generation/review.py b/plexe/internal/models/generation/review.py deleted file mode 100644 index 89b51a42..00000000 --- a/plexe/internal/models/generation/review.py +++ /dev/null @@ -1,134 +0,0 @@ -""" -This module provides functionality for reviewing and analyzing generated models. - -It examines the solution plan, training code, and inference code to extract -metadata about the model, such as the framework used, model type, and provides -explanations about how the model works and why it's appropriate for the task. -""" - -import json -import logging -from datetime import datetime -from typing import Dict - -from pydantic import BaseModel - -from plexe.config import prompt_templates -from plexe.internal.common.provider import Provider - -logger = logging.getLogger(__name__) - - -class ModelReviewResponse(BaseModel): - """ - Response model for the model review operation. - """ - - framework: str # e.g. PyTorch, TensorFlow, Scikit-Learn - model_type: str # e.g. CNN, Transformer, RandomForest - - task_type: str # e.g. classification, regression, generation - domain: str # e.g. NLP, computer vision, tabular, multimodal - - behavior: str # what the model 'does', what relationship is it really learning - - preprocessing_summary: str # high-level view of input processing pipeline - architecture_summary: str # concise summary of model architecture - training_procedure: str # brief description of optimizer, loss, epochs, etc. 
- evaluation_metric: str # list of metrics used for evaluation (e.g., accuracy, F1) - - inference_behavior: str # how inference is performed, assumptions, outputs - strengths: str # where the model is expected to perform well - limitations: str # known weaknesses, assumptions, or risks - - selection_rationale: str # why this model was selected for the given intent - - -class ModelReviewer: - """ - A class for analyzing and reviewing generated models. - """ - - def __init__(self, provider: Provider): - """ - Initialize the model reviewer with a provider. - - :param provider: The provider to use for generating model reviews - """ - self.provider = provider - - def review_model( - self, - intent: str, - input_schema: Dict[str, str], - output_schema: Dict[str, str], - solution_plan: str, - training_code: str, - inference_code: str, - ) -> Dict[str, str]: - """ - Review a generated model to extract metadata, explanations and insights about the trained model. - - :param intent: The original model intent - :param input_schema: The input schema for the model, for example {"feat_1": "int", "feat_2": "str"} - :param output_schema: The output schema for the model, for example {"output": "float"} - :param solution_plan: The solution plan used to generate the model - :param training_code: The generated training code - :param inference_code: The generated inference code - :return: A dictionary containing framework, model_type, creation_date and various explanations - """ - try: - response = self.provider.query( - system_message=prompt_templates.review_system(), - user_message=prompt_templates.review_model( - intent=intent, - input_schema=json.dumps(input_schema, indent=2), - output_schema=json.dumps(output_schema, indent=2), - solution_plan=solution_plan, - training_code=training_code, - inference_code=inference_code, - ), - response_format=ModelReviewResponse, - ) - - # Parse the response and create metadata dictionary - review_data = json.loads(response) - - # Create metadata dictionary with review results and creation date - metadata = { - "framework": review_data["framework"], - "model_type": review_data["model_type"], - "task_type": review_data["task_type"], - "domain": review_data["domain"], - "behavior": review_data["behavior"], - "preprocessing_summary": review_data["preprocessing_summary"], - "architecture_summary": review_data["architecture_summary"], - "training_procedure": review_data["training_procedure"], - "evaluation_metric": review_data["evaluation_metric"], - "inference_behavior": review_data["inference_behavior"], - "strengths": review_data["strengths"], - "limitations": review_data["limitations"], - "selection_rationale": review_data["selection_rationale"], - "creation_date": datetime.now().isoformat(), - } - return metadata - - except Exception as e: - logger.warning(f"Error during model review: {str(e)}") - # Return default values if there was an error - return { - "framework": "Unknown", - "model_type": "Unknown", - "task_type": "Unknown", - "domain": "Unknown", - "behavior": "Unknown", - "preprocessing_summary": "Unknown", - "architecture_summary": "Unknown", - "training_procedure": "Unknown", - "evaluation_metric": "Unknown", - "inference_behavior": "Unknown", - "strengths": "Unknown", - "limitations": "Unknown", - "selection_rationale": "Could not determine model details due to an error.", - "creation_date": datetime.now().isoformat(), - } diff --git a/plexe/internal/models/generation/training.py b/plexe/internal/models/generation/training.py deleted file mode 100644 index 
99bfa22f..00000000 --- a/plexe/internal/models/generation/training.py +++ /dev/null @@ -1,148 +0,0 @@ -""" -This module provides functions and classes for generating, fixing, and reviewing machine learning model training code. - -Functions: - generate_training_code: Generates machine learning model training code based on a problem statement and solution plan. - generate_training_tests: Generates tests for the machine learning model training code. - fix_training_code: Fixes the machine learning model training code based on review and identified problems. - fix_training_tests: Fixes the tests for the machine learning model training code based on review and identified problems. - review_training_code: Reviews the machine learning model training code to identify improvements and fix issues. - review_training_tests: Reviews the tests for the machine learning model training code to identify improvements and fix issues. - -Classes: - TrainingCodeGenerator: A class to generate, fix, and review machine learning model training code. -""" - -import json -import logging -from typing import List, Dict -from pathlib import Path - -from pydantic import BaseModel - -from plexe.config import config, prompt_templates -from plexe.internal.common.provider import Provider -from plexe.internal.common.utils.response import extract_code - -logger = logging.getLogger(__name__) - - -class TrainingCodeGenerator: - """ - A class to generate, fix, and review machine learning model training code. - """ - - def __init__(self, provider: Provider): - """ - Initializes the TrainingCodeGenerator with an empty history. - - :param Provider provider: The provider to use for querying. - """ - self.provider = provider - self.history: List[Dict[str, str]] = [] - - def generate_training_code( - self, - problem_statement: str, - plan: str, - train_dataset_names: list[str], - validation_dataset_names: list[str] = None, - ) -> str: - """ - Generates machine learning model training code based on the given problem statement and solution plan. - - :param [str] problem_statement: The description of the problem to be solved. - :param [str] plan: The proposed solution plan. - :param [str] train_dataset_names: The names of the datasets to use for training. - :param [str] validation_dataset_names: The names of the datasets to use for validation. - :return str: The generated training code. - """ - validation_dataset_names = validation_dataset_names or [] - - return extract_code( - self.provider.query( - system_message=prompt_templates.training_system(), - user_message=prompt_templates.training_generate( - problem_statement=problem_statement, - plan=plan, - history=self.history, - allowed_packages=config.code_generation.allowed_packages, - training_data_files=[Path(f"{file}.parquet").as_posix() for file in train_dataset_names], - validation_data_files=[Path(f"{file}.parquet").as_posix() for file in validation_dataset_names], - ), - ) - ) - - def fix_training_code( - self, - training_code: str, - plan: str, - review: str, - train_dataset_names: list[str], - validation_dataset_names: list[str] = None, - problems: str = None, - ) -> str: - """ - Fixes the machine learning model training code based on the review and identified problems. - - :param [str] training_code: The previously generated training code. - :param [str] plan: The proposed solution plan. - :param [str] review: The review of the previous solution. - :param [str] train_dataset_names: The names of the datasets to use for training. 
- :param [str] validation_dataset_names: The names of the datasets to use for validation. - :param [str] problems: Specific errors or bugs identified. - :return str: The fixed training code. - """ - - class FixResponse(BaseModel): - plan: str - code: str - - response: FixResponse = FixResponse( - **json.loads( - self.provider.query( - system_message=prompt_templates.training_system(), - user_message=prompt_templates.training_fix( - plan=plan, - training_code=training_code, - review=review, - problems=problems, - training_data_files=[Path(f"{file}.parquet").as_posix() for file in train_dataset_names], - validation_data_files=[Path(f"{file}.parquet").as_posix() for file in validation_dataset_names], - allowed_packages=config.code_generation.allowed_packages, - ), - response_format=FixResponse, - ) - ) - ) - return extract_code(response.code) - - def review_training_code(self, training_code: str, problem_statement: str, plan: str, problems: str = None) -> str: - """ - Reviews the machine learning model training code to identify improvements and fix issues. - - :param [str] training_code: The previously generated training code. - :param [str] problem_statement: The description of the problem to be solved. - :param [str] plan: The proposed solution plan. - :param [str] problems: Specific errors or bugs identified. - :return str: The review of the training code with suggestions for improvements. - """ - return self.provider.query( - system_message=prompt_templates.training_system(), - user_message=prompt_templates.training_review( - problem_statement=problem_statement, - plan=plan, - training_code=training_code, - problems=problems, - allowed_packages=config.code_generation.allowed_packages, - ), - ) - - def generate_training_tests(self, problem_statement: str, plan: str, training_code: str) -> str: - raise NotImplementedError("Generation of the training tests is not yet implemented.") - - def fix_training_tests(self, training_tests: str, training_code: str, review: str, problems: str = None) -> str: - raise NotImplementedError("Fixing of the training tests is not yet implemented.") - - def review_training_tests(self, training_tests: str, training_code: str, problem_statement: str, plan: str) -> str: - raise NotImplementedError("Review of the training tests is not yet implemented.") diff --git a/plexe/internal/models/validation/__init__.py b/plexe/internal/models/validation/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/plexe/internal/models/validation/composite.py b/plexe/internal/models/validation/composite.py deleted file mode 100644 index d8fbef00..00000000 --- a/plexe/internal/models/validation/composite.py +++ /dev/null @@ -1,43 +0,0 @@ -# plexe/internal/models/validation/composite.py - -""" -This module defines the `CompositeValidator` class, which chains multiple validators together in a workflow. - -Classes: - - CompositeValidator: A validator that chains multiple validators together in a workflow. -""" - -from plexe.internal.models.validation.validator import Validator, ValidationResult - - -class CompositeValidator(Validator): - """ - A validator that chains multiple validators together in a workflow. - - Attributes: - validators (list[Validator]): The validators to run in the workflow. - """ - - def __init__(self, name: str, validators: list[Validator]): - """ - Initializes the validator pipeline with a name and a list of validators. - - :param [str] name: The name of the validator pipeline. 
- :param [list[Validator]] validators: The validators to run in the pipeline. - """ - super().__init__(name) - self.validators = validators - - def validate(self, code: str, **kwargs) -> ValidationResult: - """ - Validates the given code by running it through each validator in the pipeline. - - :param [str] code: The code to validate. - :return: [ValidationResult] The result of the validation. - """ - for validator in self.validators: - result = validator.validate(code, **kwargs) - if not result.passed: - return result - - return ValidationResult(self.name, True, "All validators passed.") diff --git a/plexe/internal/models/validation/composites/__init__.py b/plexe/internal/models/validation/composites/__init__.py deleted file mode 100644 index 8e1c585e..00000000 --- a/plexe/internal/models/validation/composites/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from plexe.internal.models.validation.composites.training import TrainingCodeValidator as TrainingCodeValidator -from plexe.internal.models.validation.composites.inference import InferenceCodeValidator as InferenceCodeValidator diff --git a/plexe/internal/models/validation/composites/inference.py b/plexe/internal/models/validation/composites/inference.py deleted file mode 100644 index 03048113..00000000 --- a/plexe/internal/models/validation/composites/inference.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -This module defines a composite validator for validating the correctness of prediction code. - -Classes: - - InferenceCodeValidator: A validator class that validates the correctness of prediction code. -""" - -from typing import Type, List, Dict, Any - -from pydantic import BaseModel - -from plexe.internal.models.validation.composite import CompositeValidator -from plexe.internal.models.validation.primitives.predict import PredictorValidator -from plexe.internal.models.validation.primitives.syntax import SyntaxValidator - - -class InferenceCodeValidator(CompositeValidator): - """ - A validator class that validates the correctness of prediction code. - """ - - def __init__( - self, - input_schema: Type[BaseModel], - output_schema: Type[BaseModel], - input_sample: List[Dict[str, Any]], - ): - """ - Initialize the InferenceCodeValidator with the name 'prediction'. - - Args: - input_schema: The input schema for the model - output_schema: The output schema for the model - input_sample: List of sample input dictionaries to test the predictor - """ - super().__init__( - "prediction", - [ - SyntaxValidator(), - PredictorValidator(input_schema, output_schema, input_sample), - ], - ) diff --git a/plexe/internal/models/validation/composites/training.py b/plexe/internal/models/validation/composites/training.py deleted file mode 100644 index 1767e4be..00000000 --- a/plexe/internal/models/validation/composites/training.py +++ /dev/null @@ -1,23 +0,0 @@ -# internal/models/validation/pipelines.py - -""" -This module defines a composite validator for validating the correctness of training code. - -Classes: - - TrainingCodeValidator: A validator class that validates the correctness of training code. -""" - -from plexe.internal.models.validation.primitives.syntax import SyntaxValidator -from plexe.internal.models.validation.composite import CompositeValidator - - -class TrainingCodeValidator(CompositeValidator): - """ - A validator class that validates the correctness of training code. - """ - - def __init__(self): - """ - Initialize the TrainingValidator with the name 'training'. 
- """ - super().__init__("training", [SyntaxValidator()]) diff --git a/plexe/internal/models/validation/primitives/__init__.py b/plexe/internal/models/validation/primitives/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/plexe/internal/models/validation/primitives/predict.py b/plexe/internal/models/validation/primitives/predict.py deleted file mode 100644 index 15c2bfdd..00000000 --- a/plexe/internal/models/validation/primitives/predict.py +++ /dev/null @@ -1,142 +0,0 @@ -""" -This module defines the `PredictorValidator` class, which validates that a predictor behaves as expected. - -Classes: - - PredictorValidator: A validator class that checks the behavior of a predictor. -""" - -import types -import warnings -from typing import Type, List, Dict, Any - -from pydantic import BaseModel - -from plexe.internal.models.validation.validator import Validator, ValidationResult -from plexe.core.interfaces.predictor import Predictor - - -class PredictorValidator(Validator): - """ - A validator class that checks that a predictor behaves as expected. - """ - - def __init__( - self, - input_schema: Type[BaseModel], - output_schema: Type[BaseModel], - sample: List[Dict[str, Any]], - ) -> None: - """ - Initialize the PredictorValidator with the name 'predictor'. - - :param input_schema: The input schema of the predictor. - :param output_schema: The output schema of the predictor. - :param sample: List of sample input dictionaries to test the predictor. - """ - super().__init__("predictor") - self.input_schema: Type[BaseModel] = input_schema - self.output_schema: Type[BaseModel] = output_schema - self.input_sample: List[Dict[str, Any]] = sample - - def validate(self, code: str, model_artifacts=None) -> ValidationResult: - """ - Validates that the given code for a predictor behaves as expected. - :param code: prediction code to be validated - :param model_artifacts: model artifacts to be used for validation - :return: True if valid, False otherwise - """ - - def validation_error(stage, e): - """Helper to create validation error results""" - return ValidationResult( - self.name, - False, - message=f"Failed at {stage} stage: {str(e)}", - exception=e, - error_stage=stage, - error_type=type(e).__name__, - error_details=str(e), - ) - - # Stage 1: Load module - try: - predictor_module = self._load_module(code) - except Exception as e: - return validation_error("loading", e) - - # Stage 2: Check class definition - try: - predictor_class = getattr(predictor_module, "PredictorImplementation") - self._is_subclass(predictor_class) - except Exception as e: - return validation_error("class_definition", e) - - # Stage 3: Initialize predictor - try: - predictor = predictor_class(model_artifacts) - except Exception as e: - return validation_error("initialization", e) - - # Stage 4: Test prediction - try: - self._returns_output_when_called(predictor) - except Exception as e: - return validation_error("prediction", e) - - # All validation steps passed - return ValidationResult(self.name, True, "Prediction code is valid.") - - @staticmethod - def _load_module(code: str) -> types.ModuleType: - """ - Compiles and loads the predictor module from the given code. 
- """ - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - module = types.ModuleType("test_predictor") - try: - exec(code, module.__dict__) - except Exception as e: - raise RuntimeError(f"Failed to load predictor: {str(e)}") - return module - - @staticmethod - def _is_subclass(predictor) -> None: - if not issubclass(predictor, Predictor): - raise TypeError("The predictor class is not a subclass of Predictor.") - - def _returns_output_when_called(self, predictor) -> None: - """ - Tests the `predict` function by calling it with sample inputs and validates outputs. - """ - total_tests = len(self.input_sample) - issues = [] - - for i, sample in enumerate(self.input_sample): - try: - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - # Test prediction execution - output = predictor.predict(sample) - - # Validate output against schema - try: - self.output_schema.model_validate(output) - except Exception as schema_err: - # Include truncated sample and output for context - truncated_output = str(output)[:100] + "..." if len(str(output)) > 100 else output - issues.append( - { - "error": f"Output schema validation error: {str(schema_err)}", - "output": truncated_output, - "index": i, - } - ) - except Exception as e: - # Include truncated sample for context - sample_str = str(sample) - truncated_sample = sample_str[:100] + "..." if len(sample_str) > 100 else sample_str - issues.append({"error": str(e), "sample": truncated_sample, "index": i}) - - if len(issues) > 0: - raise RuntimeError(f"{len(issues)}/{total_tests} calls to 'predict' failed. Issues: {issues}") diff --git a/plexe/internal/models/validation/primitives/security.py b/plexe/internal/models/validation/primitives/security.py deleted file mode 100644 index 3311b756..00000000 --- a/plexe/internal/models/validation/primitives/security.py +++ /dev/null @@ -1,35 +0,0 @@ -# plexe/internal/models/validation/security.py - -""" -This module defines the SecurityValidator class, which is responsible for validating the security -of Python code using the Bandit tool. - -Classes: - - SecurityValidator: A validator class that checks the security of Python code. -""" - -from plexe.internal.models.validation.validator import Validator, ValidationResult - - -class SecurityValidator(Validator): - """ - A validator class that checks the security of Python code using the Bandit tool. - """ - - def __init__(self): - """ - Initialize the SecurityValidator with the name 'security'. - """ - super().__init__("security") - - def validate(self, code: str, **kwargs) -> ValidationResult: - """ - Validate the generated code for security vulnerabilities using the Bandit tool. - - :param code: The Python code to be validated. - :return: The result of the validation, indicating whether any security vulnerabilities were found. - """ - # todo: implement properly by invoking bandit, see https://bandit.readthedocs.io/en/latest/start.html - return ValidationResult( - name=self.name, passed=True, message="No security vulnerabilities found.", exception=None - ) diff --git a/plexe/internal/models/validation/primitives/syntax.py b/plexe/internal/models/validation/primitives/syntax.py deleted file mode 100644 index bf1e6db2..00000000 --- a/plexe/internal/models/validation/primitives/syntax.py +++ /dev/null @@ -1,46 +0,0 @@ -# plexe/internal/models/validation/syntax.py - -""" -This module defines the SyntaxValidator class, which is responsible for validating the syntax -of Python code using the AST module. 
- -Classes: - - SyntaxValidator: A validator class that checks the syntax of Python code. -""" - -import ast - -from plexe.internal.models.validation.validator import Validator, ValidationResult - - -class SyntaxValidator(Validator): - """ - A validator class that checks the syntax of Python code using the AST module. - """ - - def __init__(self): - """ - Initialize the SyntaxValidator with the name 'syntax'. - """ - super().__init__("syntax") - - def validate(self, code: str, **kwargs) -> ValidationResult: - """ - Validate Python code using AST. - - :param code: Python code to validate. - :return: Validation result indicating syntax validity. - """ - try: - ast.parse(code) - return ValidationResult(self.name, passed=True, message="Syntax is valid.") - except SyntaxError as e: - return ValidationResult( - self.name, - False, - message=f"Syntax is not valid: {e.msg} at line {e.lineno}, column {e.offset}.", - exception=e, - error_stage="syntax", - error_type="SyntaxError", - error_details=f"{e.msg} at line {e.lineno}, column {e.offset}", - ) diff --git a/plexe/internal/models/validation/validator.py b/plexe/internal/models/validation/validator.py deleted file mode 100644 index 9d1733c1..00000000 --- a/plexe/internal/models/validation/validator.py +++ /dev/null @@ -1,64 +0,0 @@ -# plexe/internal/models/validation/validator.py - -""" -This module defines the `Validator` abstract base class and the `ValidationResult` data class. - -The `Validator` class provides a framework for implementing various code validators, while the -`ValidationResult` class encapsulates the results of a validation, including whether it passed, -any messages, and exceptions raised during validation. -""" - -import abc -from dataclasses import dataclass - - -@dataclass -class ValidationResult: - """ - Represents the result of a validation. - - Attributes: - name (str): The name of the validation. - passed (bool): Whether the validation passed or not. - message (str, optional): A message providing details about the validation result. - exception (Exception, optional): An exception that was raised during validation, if any. - error_stage (str, optional): The stage at which validation failed (e.g., "syntax", "loading", "prediction"). - error_type (str, optional): The type of error that occurred (e.g., "SyntaxError", "TypeError"). - error_details (str, optional): Detailed information about the error. - """ - - name: str - passed: bool - message: str = None - exception: Exception | None = None - error_stage: str = None - error_type: str = None - error_details: str = None - - -class Validator(abc.ABC): - """ - Abstract base class for validators. - - Attributes: - name (str): The name of the validator. - """ - - @abc.abstractmethod - def __init__(self, name: str): - """ - Initializes the validator with a name. - - :param [str] name: The name of the validator. - """ - self.name = name - - @abc.abstractmethod - def validate(self, code: str, **kwargs) -> ValidationResult: - """ - Validates the given code. - - :param [str] code: The code to validate. - :return: [ValidationResult] The result of the validation. - """ - pass diff --git a/plexe/internal/schemas/__init__.py b/plexe/internal/schemas/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/plexe/internal/schemas/resolver.py b/plexe/internal/schemas/resolver.py deleted file mode 100644 index 2d92b208..00000000 --- a/plexe/internal/schemas/resolver.py +++ /dev/null @@ -1,139 +0,0 @@ -""" -Module for schema generation and handling. 
-""" - -import json -import logging -from enum import Enum -from typing import Tuple, Dict, Type - -from pydantic import BaseModel, create_model - -from plexe.config import prompt_templates -from plexe.internal.common.datasets.interface import TabularConvertible -from plexe.internal.common.provider import Provider -from plexe.internal.common.datasets.adapter import DatasetAdapter -from plexe.internal.common.utils.pandas_utils import convert_dtype_to_python -from plexe.internal.common.utils.pydantic_utils import map_to_basemodel - -logger = logging.getLogger(__name__) - - -class SchemaResolver: - """ - A utility class for resolving input and output schemas for a given intent and dataset. - """ - - def __init__( - self, - provider: Provider, - intent: str, - input_schema: Type[BaseModel] = None, - output_schema: Type[BaseModel] = None, - ): - self.provider: Provider = provider - self.intent: str = intent - self.input_schema: Type[BaseModel] | None = input_schema - self.output_schema: Type[BaseModel] | None = output_schema - - # TODO: support Dataset interface instead of just TabularConvertible - def resolve(self, datasets: Dict[str, TabularConvertible] = None) -> Tuple[Type[BaseModel], Type[BaseModel]]: - """ - Resolve the input and output schemas for a given intent and dataset. - - :param datasets: A dictionary of dataset names and their corresponding data. - :return: A tuple containing the input and output schemas. - """ - if datasets: - return self._resolve_from_datasets(datasets) - else: - return self._resolve_from_intent() - - # TODO: support Dataset interface instead of just TabularConvertible - def _resolve_from_datasets( - self, datasets: Dict[str, TabularConvertible] - ) -> Tuple[Type[BaseModel], Type[BaseModel]]: - """ - Generate a schema from a dataset. 
- :param datasets: - :return: - """ - - try: - feature_names = DatasetAdapter.features(datasets) - - # Infer output column - class OutputSchema(BaseModel): - output: Enum("Features", {feat: feat for feat in feature_names}) - - # Use LLM to decide what the output should be - output_col = json.loads( - self.provider.query( - system_message=prompt_templates.schema_base(), - user_message=prompt_templates.schema_identify_target( - columns="\n".join(f"- {feat}" for feat in feature_names), intent=self.intent - ), - response_format=OutputSchema, - ) - )["output"] - - # Verify output column exists - if output_col not in feature_names: - raise RuntimeError(f"LLM suggested non-existent feature {output_col} as target.") - - # Infer input schema - types = {} - for feature in feature_names: - match feature.split("."): - case [dataset, column]: - if isinstance(datasets[dataset], TabularConvertible): - df = datasets[dataset].to_pandas() - # Pass sample values to help detect list types in object columns - sample_values = df[column].dropna().head(10).tolist() if len(df) > 0 else None - types[column] = convert_dtype_to_python(df[column].dtype, sample_values) - else: - raise ValueError(f"Dataset {dataset} has unsupported type: '{type(datasets[dataset])}'") - case [dataset]: - raise ValueError(f"Dataset {dataset} has unsupported type: '{type(datasets[dataset])}'") - case _: - raise ValueError(f"Feature name '{feature}' is not in the expected format.") - - output_col = output_col.split(".")[-1] - - # Split into input and output schemas - input_schema = {col: types[col] for col in types if col != output_col} - output_schema = {output_col: types[output_col]} - - return map_to_basemodel("InputSchema", input_schema), map_to_basemodel("OutputSchema", output_schema) - - except Exception as e: - logger.error(f"Error inferring schema from data: {e}") - raise - - def _resolve_from_intent(self) -> Tuple[Type[BaseModel], Type[BaseModel]]: - """ - Generate a schema from an intent using the LLM. - :return: input and output schemas - """ - try: - - class SchemaResponse(BaseModel): - input_schema: Dict[str, str] - output_schema: Dict[str, str] - - response = SchemaResponse( - **json.loads( - self.provider.query( - system_message=prompt_templates.schema_base(), - user_message=prompt_templates.schema_generate_from_intent(intent=self.intent), - response_format=SchemaResponse, - ) - ) - ) - return ( - create_model("InputSchema", **response.input_schema), - create_model("OutputSchema", **response.output_schema), - ) - except Exception as e: - logger.error(f"Error generating schema from intent: {e}") - raise diff --git a/plexe/langgraph/__init__.py b/plexe/langgraph/__init__.py new file mode 100644 index 00000000..f69a000c --- /dev/null +++ b/plexe/langgraph/__init__.py @@ -0,0 +1,30 @@ +""" +LangGraph-based multi-agent system for ML model generation. + +This module provides a refactored agent architecture using LangGraph +for orchestrating multiple specialized agents in the ML pipeline. 
+""" + +from plexe.langgraph.agents import ( + ConversationalAgent, + EDAAgent, + DatasetBuilderAgent, + TaskBuilderAgent, + RelationalGNNSpecialistAgent, + OperationAgent, +) +from plexe.langgraph.orchestrator import PlexeOrchestrator +from plexe.langgraph.state import PipelineState +from plexe.langgraph.config import AgentConfig + +__all__ = [ + "ConversationalAgent", + "EDAAgent", + "DatasetBuilderAgent", + "TaskBuilderAgent", + "RelationalGNNSpecialistAgent", + "OperationAgent", + "PlexeOrchestrator", + "PipelineState", + "AgentConfig", +] diff --git a/plexe/langgraph/agents/__init__.py b/plexe/langgraph/agents/__init__.py new file mode 100644 index 00000000..8c129500 --- /dev/null +++ b/plexe/langgraph/agents/__init__.py @@ -0,0 +1,23 @@ +""" +LangGraph-based agent implementations. + +This module provides specialized agents for the ML pipeline using LangGraph. +""" + +from plexe.langgraph.agents.base import BaseAgent +from plexe.langgraph.agents.conversational import ConversationalAgent +from plexe.langgraph.agents.eda import EDAAgent +from plexe.langgraph.agents.dataset_builder import DatasetBuilderAgent +from plexe.langgraph.agents.task_builder import TaskBuilderAgent +from plexe.langgraph.agents.gnn_specialist import RelationalGNNSpecialistAgent +from plexe.langgraph.agents.operation import OperationAgent + +__all__ = [ + "BaseAgent", + "ConversationalAgent", + "EDAAgent", + "DatasetBuilderAgent", + "TaskBuilderAgent", + "RelationalGNNSpecialistAgent", + "OperationAgent", +] diff --git a/plexe/langgraph/agents/base.py b/plexe/langgraph/agents/base.py new file mode 100644 index 00000000..8313c768 --- /dev/null +++ b/plexe/langgraph/agents/base.py @@ -0,0 +1,408 @@ +""" +Base agent class for LangGraph agents. + +Provides common functionality for all specialized agents. +""" + +import logging +from abc import ABC, abstractmethod +from typing import List, Dict, Any, Optional, Callable + +from langchain_core.tools import BaseTool +from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, ToolMessage +from langchain_core.callbacks import BaseCallbackHandler +from langchain.agents import create_agent + +from plexe.langgraph.config import AgentConfig, get_llm_from_model_id +from plexe.langgraph.state import PipelineState +from plexe.langgraph.utils import BaseEmitter, ChainOfThoughtCallback +from plexe.langgraph.mcp_manager import MCPManager + +logger = logging.getLogger(__name__) + +def extract_text_content(content) -> str: + """Extract text from message content (handles string or list format).""" + if isinstance(content, str): + return content + if isinstance(content, list): + parts = [] + for block in content: + if isinstance(block, str): + parts.append(block) + elif isinstance(block, dict) and block.get("type") == "text": + parts.append(block.get("text", "")) + return "".join(parts) + return str(content) if content else "" + + +class AgentCallbackHandler(BaseCallbackHandler): + """Callback handler for agent events with detailed chain-of-thought.""" + + def __init__(self, agent_name: str, emitter: Optional[BaseEmitter] = None, model_id: str = ""): + self.agent_name = agent_name + self.emitter = emitter + self.model_id = model_id + self.current_thought = "" + self._llm_start_emitted = False + self._last_emitted_text = "" # Track to avoid duplicate emissions + + def on_llm_start(self, serialized, prompts, **kwargs): + # Don't emit "Thinking..." 
- wait for actual response + self._llm_start_emitted = True + + def on_llm_end(self, response, **kwargs): + if not self.emitter or not response or not response.generations: + return + + try: + generation = response.generations[0][0] + text = None + thinking_text = None + + # Extract thinking/reasoning from extended thinking models (Claude, etc.) + if hasattr(generation, 'message'): + message = generation.message + # Check for thinking blocks (extended thinking) + if hasattr(message, 'content') and isinstance(message.content, list): + for block in message.content: + if isinstance(block, dict): + if block.get("type") == "thinking": + thinking_text = block.get("thinking", "") + elif block.get("type") == "text": + text = block.get("text", "") + elif hasattr(message, 'content'): + text = extract_text_content(message.content) + + # Also check additional_kwargs for reasoning (Gemini, DeepSeek, etc.) + if hasattr(message, 'additional_kwargs'): + kwargs_data = message.additional_kwargs + # DeepSeek reasoning + if 'reasoning_content' in kwargs_data: + thinking_text = kwargs_data['reasoning_content'] + # Gemini thinking in response_metadata + if hasattr(message, 'response_metadata'): + metadata = message.response_metadata or {} + # Some models put thinking in candidates + if 'candidates' in metadata: + for candidate in metadata.get('candidates', []): + if 'content' in candidate: + parts = candidate['content'].get('parts', []) + for part in parts: + if part.get('thought'): + thinking_text = part.get('text', '') + + # Fallback to text extraction + if not text and hasattr(generation, 'text') and generation.text: + text = extract_text_content(generation.text) + + # Determine what to emit - prefer thinking, fallback to text + emit_text = thinking_text or text + if not emit_text: + return + + # Avoid duplicate emissions + emit_text = emit_text.strip() + if emit_text == self._last_emitted_text: + return + self._last_emitted_text = emit_text + + # Skip if it looks like a tool call response (starts with JSON or action) + if emit_text.startswith('{') or emit_text.startswith('Action:'): + return + + # Use full text without truncation for UI display + display_text = emit_text + + self.current_thought = display_text + model_info = f" [{self.model_id}]" if self.model_id else "" + + # Extract token usage if available + token_usage = None + if hasattr(response, 'llm_output') and response.llm_output: + # Some LLMs return token_usage in llm_output + token_usage = response.llm_output.get('token_usage') + elif hasattr(generation, 'message'): + message = generation.message + # Check for usage_metadata (LangChain standard as of 0.2+) + if hasattr(message, 'usage_metadata') and message.usage_metadata: + usage = message.usage_metadata + # usage_metadata is a TypedDict with keys: input_tokens, output_tokens, total_tokens + token_usage = { + 'prompt_tokens': usage.get('input_tokens', 0), + 'completion_tokens': usage.get('output_tokens', 0), + 'total_tokens': usage.get('total_tokens', 0) + } + # Fallback: Check response_metadata for usage info (older format) + elif hasattr(message, 'response_metadata') and message.response_metadata: + metadata = message.response_metadata + if 'usage' in metadata: + usage = metadata['usage'] + token_usage = { + 'prompt_tokens': usage.get('prompt_tokens', 0), + 'completion_tokens': usage.get('completion_tokens', 0), + 'total_tokens': usage.get('total_tokens', 0) + } + + if thinking_text: + self.emitter.emit_thought(self.agent_name, f"💭 Reasoning{model_info}:\n{display_text}", token_usage) + 
else: + self.emitter.emit_thought(self.agent_name, f"💡 Analysis{model_info}:\n{display_text}", token_usage) + + except Exception as e: + logger.debug(f"Error extracting LLM response: {e}") + + def on_tool_start(self, serialized, input_str, **kwargs): + if self.emitter: + tool_name = serialized.get("name", "tool") if isinstance(serialized, dict) else "tool" + args = {} + if isinstance(input_str, str): + try: + import json + args = json.loads(input_str) if input_str.startswith("{") else {"input": input_str[:100]} + except: + args = {"input": str(input_str)[:100]} + elif isinstance(input_str, dict): + args = {k: str(v)[:50] for k, v in list(input_str.items())[:3]} + self.emitter.emit_tool_call(self.agent_name, tool_name, args) + + def on_tool_end(self, output, **kwargs): + if self.emitter and output: + output_str = str(output) if output else "" + if output_str: + # Format with newlines for better readability + formatted_output = output_str.replace('\\n', '\n') + self.emitter.emit_thought(self.agent_name, f"Tool result:\n{formatted_output}") + + def on_chain_error(self, error, **kwargs): + if self.emitter: + self.emitter.emit_thought(self.agent_name, f"Error encountered: {str(error)[:200]}") + + +class BaseAgent(ABC): + """Base class for all LangGraph agents.""" + + def __init__( + self, + agent_type: str, + config: Optional[AgentConfig] = None, + tools: Optional[List[BaseTool]] = None, + emitter: Optional[BaseEmitter] = None, + ): + """ + Initialize the base agent. + + Args: + agent_type: Type identifier for this agent + config: Agent configuration (uses defaults if None) + tools: List of tools available to this agent + emitter: Optional emitter for progress callbacks + """ + self.agent_type = agent_type + self.config = config or AgentConfig.from_env() + self.emitter = emitter + self.tools = tools or [] + + # Initialize MCP Manager and load tools using a thread to avoid event loop conflicts + import asyncio + from concurrent.futures import ThreadPoolExecutor + + self.mcp_manager = MCPManager() + + def _run_async_init(): + """Run async MCP initialization in a new event loop within a thread.""" + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + loop.run_until_complete(self.mcp_manager.initialize()) + return self.mcp_manager.get_tools() + finally: + loop.close() + + try: + with ThreadPoolExecutor(max_workers=1) as executor: + future = executor.submit(_run_async_init) + mcp_tools = future.result(timeout=30) + if mcp_tools: + logger.info(f"Agent {self.name} loaded {len(mcp_tools)} MCP tools") + self.tools.extend(mcp_tools) + except Exception as e: + logger.warning(f"Could not load MCP tools for {self.name}: {e}") + + self.model_id = self.config.get_model_for_agent(agent_type) + self.llm = get_llm_from_model_id(self.model_id, self.config.temperature) + + self._agent = None + self._callback_handler = AgentCallbackHandler(self.name, emitter, self.model_id) + + @property + @abstractmethod + def system_prompt(self) -> str: + """Return the system prompt for this agent.""" + pass + + @property + def name(self) -> str: + """Return the agent name.""" + return self.__class__.__name__ + + @property + def description(self) -> str: + """Return the agent description.""" + return self.system_prompt[:200] + "..." 
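# Illustrative sketch (an assumption for clarity, not one of the shipped agents): a concrete
# agent built on BaseAgent only needs to choose an agent_type, pass its tools, and implement
# the `system_prompt` property; model selection, MCP tool loading, and streaming callbacks
# are inherited.
#
#   class GreeterAgent(BaseAgent):
#       def __init__(self, config: Optional[AgentConfig] = None):
#           super().__init__(agent_type="conversational", config=config, tools=[])
#
#       @property
#       def system_prompt(self) -> str:
#           return "Greet the user and restate their request in one sentence."
#
#   # Assuming the environment supplies a model configuration for the "conversational" agent type:
#   # updates = GreeterAgent().invoke({"messages": [{"role": "user", "content": "hi"}]})
#   # `updates` is a dict of state updates, e.g. {"messages": [{"role": "assistant", ...}]}.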
+ + def set_emitter(self, emitter: BaseEmitter): + """Set the emitter for progress callbacks.""" + self.emitter = emitter + self._callback_handler = AgentCallbackHandler(self.name, emitter, self.model_id) + + def get_agent(self): + """Get or create the LangGraph agent.""" + if self._agent is None: + self._agent = create_agent( + model=self.llm, + tools=self.tools, + system_prompt=self.system_prompt, + ) + return self._agent + + def invoke(self, state: PipelineState) -> Dict[str, Any]: + """ + Invoke the agent with the current state, streaming thoughts to emitter. + + Args: + state: Current pipeline state + + Returns: + Updated state components + """ + agent = self.get_agent() + + messages = self._build_messages(state) + logger.info(f"Agent {self.name} invoking with {len(messages)} messages using model {self.model_id}") + + if self.emitter: + self.emitter.emit_agent_start(self.name, self.model_id) + + try: + config = {"callbacks": [self._callback_handler]} if self.emitter else {} + + result = None + last_valid_output = None + for chunk in agent.stream({"messages": messages}, config=config, stream_mode="updates"): + for node_name, node_output in chunk.items(): + # Store valid outputs (dict with messages) + if isinstance(node_output, dict) and node_output.get("messages"): + last_valid_output = node_output + + if self.emitter and node_name == "agent" and isinstance(node_output, dict): + agent_messages = node_output.get("messages", []) + for msg in agent_messages: + if isinstance(msg, AIMessage): + # Don't emit here - let the callback handler do it + # Just emit tool calls + if hasattr(msg, 'tool_calls') and msg.tool_calls: + for tc in msg.tool_calls: + tool_name = tc.get("name", "unknown") if isinstance(tc, dict) else getattr(tc, 'name', 'unknown') + tool_args = tc.get("args", {}) if isinstance(tc, dict) else getattr(tc, 'args', {}) + self.emitter.emit_tool_call(self.name, tool_name, tool_args) + result = node_output + + # Use last valid output if final result is None or invalid + if result is None or not isinstance(result, dict): + if last_valid_output: + result = last_valid_output + else: + result = agent.invoke({"messages": messages}, config=config) + elif not result.get("messages"): + if last_valid_output: + result = last_valid_output + else: + result = {"messages": []} + + logger.info(f"Agent {self.name} received {len(result.get('messages', []))} response messages") + + processed = self._process_result(result, state) + + if self.emitter: + response_text = "" + for msg in processed.get("messages", []): + if msg.get("role") == "assistant": + response_text = msg.get("content", "") + break + self.emitter.emit_agent_end(self.name, response_text) + + return processed + except Exception as e: + logger.error(f"Agent {self.name} failed: {e}", exc_info=True) + if self.emitter: + self.emitter.emit_agent_end(self.name, f"Error: {str(e)}") + return { + "errors": [f"{self.name} error: {str(e)}"] + } + + def _build_messages(self, state: PipelineState) -> List: + """Build message list from state.""" + messages = [SystemMessage(content=self.system_prompt)] + + for msg in state.get("messages", []): + role = msg.get("role", "user") + content = msg.get("content", "") + + if role == "user": + messages.append(HumanMessage(content=content)) + elif role == "assistant": + messages.append(AIMessage(content=content)) + elif role == "system": + messages.append(SystemMessage(content=content)) + + context = self._build_context(state) + if context: + messages.append(HumanMessage(content=f"Current context:\n{context}")) 
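# Resulting message order: the system prompt first, then the prior conversation turns from
# state["messages"], and finally a synthesized "Current context:" human message built by
# _build_context() from the pipeline state (working dir, DB connection, schema, dataset and
# task info).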
+ + return messages + + def _build_context(self, state: PipelineState) -> str: + """Build context string from state for the agent.""" + context_parts = [] + + if state.get("working_dir"): + context_parts.append(f"Working directory: {state['working_dir']}") + + if state.get("db_connection_string"): + context_parts.append(f"Database connection: {state['db_connection_string']}") + + if state.get("csv_dir"): + context_parts.append(f"CSV directory: {state['csv_dir']}") + + if state.get("schema_info"): + tables = list(state["schema_info"].get("tables", {}).keys()) + context_parts.append(f"Available tables: {', '.join(tables)}") + + if state.get("dataset_info"): + context_parts.append(f"Dataset class: {state['dataset_info'].get('class_name')}") + context_parts.append(f"Dataset file: {state['dataset_info'].get('file_path')}") + + if state.get("task_info"): + context_parts.append(f"Task class: {state['task_info'].get('class_name')}") + context_parts.append(f"Task type: {state['task_info'].get('task_type')}") + + return "\n".join(context_parts) + + def _process_result(self, result: Dict[str, Any], state: PipelineState) -> Dict[str, Any]: + """ + Process agent result and extract state updates. + + Override in subclasses to handle specific outputs. + """ + messages = result.get("messages", []) + + new_messages = [] + for msg in messages: + if isinstance(msg, AIMessage): + new_messages.append({ + "role": "assistant", + "content": extract_text_content(msg.content), + "timestamp": None, + }) + + return {"messages": new_messages} diff --git a/plexe/langgraph/agents/conversational.py b/plexe/langgraph/agents/conversational.py new file mode 100644 index 00000000..a6e9cff7 --- /dev/null +++ b/plexe/langgraph/agents/conversational.py @@ -0,0 +1,112 @@ +""" +Conversational Agent for user interaction and requirements gathering. + +This agent guides users through ML model definition via natural conversation, +validates inputs, and initiates the model building process. 
+""" + +import logging +from typing import Optional, List, Dict, Any + +from langchain_core.tools import BaseTool + +from plexe.langgraph.agents.base import BaseAgent, extract_text_content +from plexe.langgraph.config import AgentConfig +from plexe.langgraph.state import PipelineState, PipelinePhase +from plexe.langgraph.tools.conversational import get_dataset_preview +from plexe.langgraph.tools.graph_architect import validate_db_connection +from plexe.langgraph.prompts.conversational import CONVERSATIONAL_SYSTEM_PROMPT + +logger = logging.getLogger(__name__) + +class ConversationalAgent(BaseAgent): + """Agent for conversational requirements gathering and user interaction.""" + + def __init__( + self, + config: Optional[AgentConfig] = None, + additional_tools: Optional[List[BaseTool]] = None, + ): + tools = [ + get_dataset_preview, + validate_db_connection, + ] + + if additional_tools: + tools.extend(additional_tools) + + super().__init__( + agent_type="conversational", + config=config, + tools=tools, + ) + + @property + def system_prompt(self) -> str: + return CONVERSATIONAL_SYSTEM_PROMPT + + def _process_result(self, result: Dict[str, Any], state: PipelineState) -> Dict[str, Any]: + """Process conversation result and detect readiness to proceed.""" + base_result = super()._process_result(result, state) + + messages = result.get("messages", []) + last_message = messages[-1] if messages else None + + if last_message: + raw_content = last_message.content if hasattr(last_message, 'content') else "" + content = extract_text_content(raw_content).lower() + logger.info(f"ConversationalAgent response: {content[:200]}...") + + ready_indicators = [ + "ready to proceed", + "start building", + "begin training", + "initiate pipeline", + "all requirements gathered", + "let's begin", + "let me start", + "i'll start", + "proceed with", + "starting the pipeline", + "begin the process", + ] + + if any(indicator in content for indicator in ready_indicators): + logger.info("Detected ready indicator, setting user_confirmation_required") + base_result["user_confirmation_required"] = True + base_result["user_confirmed"] = True + base_result["user_intent"] = self._extract_intent_from_state(state) + base_result["user_confirmation_context"] = { + "type": "proceed_to_pipeline", + "message": "Ready to start the ML pipeline" + } + + has_db = state.get("db_connection_string") + has_task = any("predict" in msg.get("content", "").lower() for msg in state.get("messages", [])) + if has_db and has_task and not base_result.get("user_intent"): + logger.info("Auto-detecting intent from state") + base_result["user_intent"] = self._extract_intent_from_state(state) + + return base_result + + def _extract_intent_from_state(self, state: PipelineState) -> Dict[str, Any]: + """Extract intent from state.""" + intent = { + "prediction_target": None, + "entity_type": None, + "task_type": "binary_classification", + "data_source": "database" if state.get("db_connection_string") else "csv", + "confirmed": True, + } + + for msg in state.get("messages", []): + content = msg.get("content", "").lower() + if "predict" in content: + intent["prediction_target"] = msg.get("content", "")[:200] + if "churn" in content or "leave" in content or "cancel" in content: + intent["task_type"] = "binary_classification" + elif "count" in content or "amount" in content or "revenue" in content: + intent["task_type"] = "regression" + break + + return intent diff --git a/plexe/langgraph/agents/dataset_builder.py b/plexe/langgraph/agents/dataset_builder.py new 
file mode 100644 index 00000000..5bd2f134 --- /dev/null +++ b/plexe/langgraph/agents/dataset_builder.py @@ -0,0 +1,198 @@ +""" +Dataset Builder Agent. + +This agent builds RelBench Database objects from CSV files, +generating complete Python Dataset classes for GNN training. +""" + +import logging +import os +from typing import Optional, List, Dict, Any + +from langchain_core.tools import BaseTool + +from plexe.langgraph.agents.base import BaseAgent +from plexe.langgraph.config import AgentConfig +from plexe.langgraph.state import PipelineState, PipelinePhase +from plexe.langgraph.tools.common import save_artifact +from plexe.langgraph.tools.dataset_builder import ( + get_csv_files_info, + get_temporal_statistics, + register_dataset_code, +) +from plexe.langgraph.prompts.dataset_builder import DATASET_BUILDER_SYSTEM_PROMPT + +logger = logging.getLogger(__name__) + +class DatasetBuilderAgent(BaseAgent): + """Agent for building RelBench Dataset classes from CSV data.""" + + def __init__( + self, + config: Optional[AgentConfig] = None, + additional_tools: Optional[List[BaseTool]] = None, + ): + tools = [ + get_csv_files_info, + get_temporal_statistics, + register_dataset_code, + save_artifact, + ] + + if additional_tools: + tools.extend(additional_tools) + + super().__init__( + agent_type="dataset_builder", + config=config, + tools=tools, + ) + + @property + def system_prompt(self) -> str: + return DATASET_BUILDER_SYSTEM_PROMPT + + def _build_context(self, state: PipelineState) -> str: + """Build context with CSV, schema, and EDA information.""" + context_parts = [] + + working_dir = state.get('working_dir', '') + csv_dir = state.get('csv_dir', '') + + # Convert to absolute paths for the agent + if working_dir: + working_dir = os.path.abspath(working_dir) + context_parts.append(f"Working directory: {working_dir}") + + if csv_dir: + csv_dir = os.path.abspath(csv_dir) + context_parts.append(f"CSV directory: {csv_dir}") + + if state.get("schema_info"): + schema = state["schema_info"] + tables = list(schema.get("tables", {}).keys()) + context_parts.append(f"Tables: {', '.join(tables)}") + + if schema.get("relationships"): + rels = [] + for r in schema["relationships"]: + rels.append(f"{r['source_table']}.{r['source_column']} -> {r['target_table']}") + context_parts.append(f"Foreign keys: {'; '.join(rels)}") + + if schema.get("temporal_columns"): + for table, cols in schema["temporal_columns"].items(): + context_parts.append(f"{table} time columns: {cols}") + + if state.get("eda_info"): + eda = state["eda_info"] + context_parts.append("\n## EDA Analysis Results:") + + if eda.get("quality_issues"): + context_parts.append("Data Quality Issues:") + for table, issues in eda["quality_issues"].items(): + if issues: + context_parts.append(f" - {table}: {issues}") + + if eda.get("temporal_analysis"): + context_parts.append("Temporal Analysis:") + for table, analysis in eda["temporal_analysis"].items(): + if analysis.get("time_columns"): + context_parts.append(f" - {table}: time columns = {analysis['time_columns']}") + + if eda.get("suggested_splits"): + splits = eda["suggested_splits"] + if splits.get("val_timestamp"): + context_parts.append(f"Suggested val_timestamp: {splits['val_timestamp']}") + if splits.get("test_timestamp"): + context_parts.append(f"Suggested test_timestamp: {splits['test_timestamp']}") + + if eda.get("relationship_analysis"): + context_parts.append("Relationship Analysis:") + for key, info in eda["relationship_analysis"].items(): + if isinstance(info, dict) and 
info.get("cardinality"): + context_parts.append(f" - {key}: {info['cardinality']}") + + task_instruction = f""" +YOUR COMPLETE TASK - ALL 5 STEPS ARE MANDATORY + +STEP 1: Information Gathering +Tool: get_csv_files_info("{csv_dir}") +Purpose: Understand table structure, column names, and row counts for all CSV files + +STEP 2: Temporal Analysis +Tool: get_temporal_statistics("{csv_dir}") +Purpose: Determine val_timestamp and test_timestamp for train/validation/test splits + +STEP 3: Design Analysis (write your analysis explicitly) +You must analyze and document: +- Which tables are temporal (have time_col) vs static (time_col=None) +- Foreign key relationships between tables (which columns reference which tables) +- The exact val_timestamp and test_timestamp values you will use +- Any data cleaning requirements (\\N missing values, timezone handling, type conversions) +- Which tables are dimension tables vs fact tables +Action: Write out your complete analysis before proceeding to Step 4 + +STEP 4: Code Generation +Generate a complete GenDataset class that includes: +- Class definition extending Dataset +- val_timestamp = pd.Timestamp("YYYY-MM-DD") using value from Step 2 +- test_timestamp = pd.Timestamp("YYYY-MM-DD") using value from Step 2 +- __init__ method that accepts csv_dir and cache_dir parameters +- make_db() method that: + * Loads all CSV files from Step 1 + * Applies data cleaning from Step 3 + * Creates Table objects with correct fkey_col_to_pkey_table mappings + * Sets appropriate time_col for each table (or None for static tables) + * Returns Database with all tables +Action: Generate the complete, working Python code now + +STEP 5: Code Registration (CRITICAL - DO NOT SKIP) +Tool: register_dataset_code(code, "GenDataset", "{working_dir}/dataset.py") +Purpose: Save the generated Python code to the file system +Action: Call this tool with your complete code from Step 4 +Result: Must return {{"status": "registered", ...}} + +CRITICAL REQUIREMENTS: +- DO NOT STOP after Steps 1-2. You MUST complete ALL 5 STEPS. +- DO NOT say "I will generate the code" or "Next I'll call the tool" - EXECUTE the actions immediately. +- The file {working_dir}/dataset.py MUST exist when you finish. +- Your task is INCOMPLETE without calling register_dataset_code() in Step 5. + +SUCCESS CONDITION: +The register_dataset_code() tool was called and returned success status. +The file {working_dir}/dataset.py exists and contains valid Python code. + +FAILURE CONDITION: +You stopped before calling register_dataset_code() OR the file was not created. +""" + context_parts.append(task_instruction) + + return "\n".join(context_parts) + + def _process_result(self, result: Dict[str, Any], state: PipelineState) -> Dict[str, Any]: + """Process result and extract dataset information.""" + base_result = super()._process_result(result, state) + + dataset_info = {} + working_dir = state.get("working_dir", "") + + # Check if dataset.py was created + dataset_path = os.path.join(working_dir, "dataset.py") + if os.path.exists(dataset_path): + dataset_info["class_name"] = "GenDataset" + dataset_info["file_path"] = dataset_path + logger.info(f"Dataset file created at: {dataset_path}") + else: + error_msg = f"CRITICAL ERROR: Dataset file not found at {dataset_path}. DatasetBuilderAgent did not complete its task. The agent must call register_dataset_code() to generate dataset.py." 
+ logger.error(error_msg) + # Return error state to force re-invocation or escalation + base_result["error"] = error_msg + base_result["status"] = "error" + dataset_info["class_name"] = "GenDataset" + dataset_info["file_path"] = dataset_path + dataset_info["error"] = "File not generated" + + base_result["dataset_info"] = dataset_info + base_result["current_phase"] = PipelinePhase.TASK_BUILDING.value + + return base_result diff --git a/plexe/langgraph/agents/eda.py b/plexe/langgraph/agents/eda.py new file mode 100644 index 00000000..834e18fb --- /dev/null +++ b/plexe/langgraph/agents/eda.py @@ -0,0 +1,147 @@ +""" +EDA (Exploratory Data Analysis) Agent. + +This agent transforms relational databases into heterogeneous graphs, +performs comprehensive exploratory data analysis, and prepares data for modeling. +""" + +import logging +from typing import Optional, List, Dict, Any + +from langchain_core.tools import BaseTool + +from plexe.langgraph.agents.base import BaseAgent +from plexe.langgraph.config import AgentConfig +from plexe.langgraph.state import PipelineState, PipelinePhase +from plexe.langgraph.tools.graph_architect import ( + validate_db_connection, + export_tables_to_csv, + extract_schema_metadata, +) +from plexe.langgraph.tools.eda import ( + analyze_csv_statistics, + detect_data_quality_issues, + analyze_temporal_patterns, + analyze_table_relationships, + generate_eda_summary, +) +from plexe.langgraph.prompts.eda import EDA_SYSTEM_PROMPT + +logger = logging.getLogger(__name__) + +class EDAAgent(BaseAgent): + """Agent for schema analysis, data export, and exploratory data analysis.""" + + def __init__( + self, + config: Optional[AgentConfig] = None, + additional_tools: Optional[List[BaseTool]] = None, + ): + tools = [ + validate_db_connection, + export_tables_to_csv, + extract_schema_metadata, + analyze_csv_statistics, + detect_data_quality_issues, + analyze_temporal_patterns, + analyze_table_relationships, + generate_eda_summary, + ] + + if additional_tools: + tools.extend(additional_tools) + + super().__init__( + agent_type="eda", + config=config, + tools=tools, + ) + + @property + def system_prompt(self) -> str: + return EDA_SYSTEM_PROMPT + + def _build_context(self, state: PipelineState) -> str: + """Build context with database and EDA-specific information.""" + context_parts = [] + + if state.get("working_dir"): + context_parts.append(f"Working directory: {state['working_dir']}") + context_parts.append(f"CSV output directory: {state['working_dir']}/csv_files") + + if state.get("db_connection_string"): + context_parts.append(f"Database: {state['db_connection_string']}") + + if state.get("user_intent"): + intent = state["user_intent"] + if isinstance(intent, dict): + context_parts.append(f"Prediction target: {intent.get('prediction_target', 'unknown')}") + else: + context_parts.append(f"User intent: {intent}") + + context_parts.append(""" +EXECUTE THESE STEPS: +1. extract_schema_metadata(db_connection_string) - analyze database schema +2. export_tables_to_csv(db_connection_string, working_dir/csv_files) - export data +3. analyze_csv_statistics(working_dir/csv_files) - get statistics +4. detect_data_quality_issues(working_dir/csv_files) - find issues +5. analyze_temporal_patterns(working_dir/csv_files) - find timestamps +6. analyze_table_relationships(working_dir/csv_files) - classify tables +7. 
generate_eda_summary(working_dir/csv_files) - create report +""") + + return "\n".join(context_parts) + + def _process_result(self, result: Dict[str, Any], state: PipelineState) -> Dict[str, Any]: + """Process result and extract schema/CSV/EDA information.""" + base_result = super()._process_result(result, state) + + eda_info = {} + working_dir = state.get("working_dir", "") + csv_dir = f"{working_dir}/csv_files" if working_dir else None + + messages = result.get("messages", []) + for msg in messages: + if hasattr(msg, 'tool_calls') and msg.tool_calls: + for tool_call in msg.tool_calls: + tool_name = tool_call.get("name", "") + tool_result = tool_call.get("result", {}) + + if tool_name == "export_tables_to_csv": + if csv_dir: + base_result["csv_dir"] = csv_dir + + if tool_name == "extract_schema_metadata": + if isinstance(tool_result, dict) and "tables" in tool_result: + base_result["schema_info"] = tool_result + + if tool_name == "analyze_csv_statistics" and isinstance(tool_result, dict): + if tool_result.get("status") == "success": + eda_info["statistics"] = tool_result.get("statistics") + + if tool_name == "detect_data_quality_issues" and isinstance(tool_result, dict): + if tool_result.get("status") == "success": + eda_info["quality_issues"] = tool_result.get("quality_issues") + + if tool_name == "analyze_temporal_patterns" and isinstance(tool_result, dict): + if tool_result.get("status") == "success": + eda_info["temporal_analysis"] = tool_result.get("temporal_analysis") + eda_info["suggested_splits"] = tool_result.get("suggested_splits") + + if tool_name == "analyze_table_relationships" and isinstance(tool_result, dict): + if tool_result.get("status") == "success": + eda_info["relationship_analysis"] = tool_result.get("relationship_analysis") + + if tool_name == "generate_eda_summary" and isinstance(tool_result, dict): + if tool_result.get("status") == "success": + eda_info["summary"] = tool_result.get("summary") + + if csv_dir: + base_result["csv_dir"] = csv_dir + + if eda_info: + base_result["eda_info"] = eda_info + + base_result["current_phase"] = PipelinePhase.DATASET_BUILDING.value + + return base_result diff --git a/plexe/langgraph/agents/gnn_specialist.py b/plexe/langgraph/agents/gnn_specialist.py new file mode 100644 index 00000000..d55d06f7 --- /dev/null +++ b/plexe/langgraph/agents/gnn_specialist.py @@ -0,0 +1,198 @@ +""" +Relational GNN Specialist Agent. + +This agent generates and executes GNN training scripts using +the plexe.relbench.modeling modules. +""" + +import logging +from typing import Optional, List, Dict, Any + +from langchain_core.tools import BaseTool + +from plexe.langgraph.agents.base import BaseAgent +from plexe.langgraph.config import AgentConfig +from plexe.langgraph.state import PipelineState, PipelinePhase +from plexe.langgraph.tools.common import save_artifact +from plexe.langgraph.tools.gnn_specialist import generate_training_script +from plexe.langgraph.prompts.gnn_specialist import GNN_SPECIALIST_SYSTEM_PROMPT + +logger = logging.getLogger(__name__) + +class RelationalGNNSpecialistAgent(BaseAgent): + """ + Agent for GNN training script generation with Training-Free HPO. + + This agent uses MCP (Model Context Protocol) to access external + knowledge sources for hyperparameter optimization without training. + MCP tools are loaded automatically via MCPManager in BaseAgent. 
+ """ + + def __init__( + self, + config: Optional[AgentConfig] = None, + additional_tools: Optional[List[BaseTool]] = None, + ): + # Core GNN-specific tools (non-MCP) + tools = [ + generate_training_script, + save_artifact, + ] + + if additional_tools: + tools.extend(additional_tools) + + super().__init__( + agent_type="gnn_specialist", + config=config, + tools=tools, + ) + + @property + def system_prompt(self) -> str: + return GNN_SPECIALIST_SYSTEM_PROMPT + + def _build_context(self, state: PipelineState) -> str: + """Build context with training-specific information.""" + context_parts = [] + + working_dir = state.get("working_dir", "") + csv_dir = state.get("csv_dir", "") + + context_parts.append(f"Working directory: {working_dir}") + context_parts.append(f"CSV directory: {csv_dir}") + + dataset_info = state.get("dataset_info") + if dataset_info and isinstance(dataset_info, dict): + context_parts.append(f"Dataset file: {dataset_info.get('file_path', working_dir + '/dataset.py')}") + context_parts.append(f"Dataset class: {dataset_info.get('class_name', 'GenDataset')}") + else: + context_parts.append(f"Dataset file: {working_dir}/dataset.py") + context_parts.append(f"Dataset class: GenDataset") + + task_info = state.get("task_info") + if task_info and isinstance(task_info, dict): + context_parts.append(f"Task file: {task_info.get('file_path', working_dir + '/task.py')}") + context_parts.append(f"Task class: {task_info.get('class_name', 'GenTask')}") + context_parts.append(f"Task type: {task_info.get('task_type', 'binary_classification')}") + task_type = task_info.get("task_type", "binary_classification") + else: + context_parts.append(f"Task file: {working_dir}/task.py") + context_parts.append(f"Task class: GenTask") + context_parts.append(f"Task type: binary_classification") + task_type = "binary_classification" + + # Build dataset characteristics for HPO search + # Safely get schema_info - handle None case explicitly + schema_info = state.get("schema_info") + num_tables = 0 + if schema_info and isinstance(schema_info, dict): + tables = schema_info.get("tables", {}) + if tables: + num_tables = len(tables) + + dataset_chars = { + "num_tables": num_tables, + "num_nodes": 10000, # Estimate - would be calculated from schema + "is_temporal": True, # Always true for RelBench tasks + } + + context_parts.append(f""" +EXECUTE THESE STEPS (Training-Free HPO via MCP): + +1. 
SEARCH FOR OPTIMAL HYPERPARAMETERS using MCP tools from MULTIPLE sources: + + a) HEURISTICS - search_optimal_hyperparameters( + task_type="{task_type}", + num_nodes={dataset_chars.get('num_nodes', 10000)}, + num_tables={dataset_chars.get('num_tables', 5)}, + is_temporal={dataset_chars.get('is_temporal', True)}, + model_architecture="gnn" + ) + # Returns: Rule-based hyperparameters + + b) GOOGLE SCHOLAR - search_gnn_papers_for_hyperparameters( + task_type="{task_type}", + model_type="Graph Neural Network", + limit=5 + ) + # Returns: Hyperparameters extracted from Google Scholar papers with citations + + c) KAGGLE BENCHMARKS - search_gnn_competitions_for_benchmarks( + task_type="{task_type}", + limit=3 + ) + # Returns: Winning solutions from Kaggle competitions + + d) KAGGLE NOTEBOOKS - search_gnn_notebooks_for_hyperparameters( + task_type="GNN {task_type}", + limit=5 + ) + # Returns: Top voted notebooks with proven hyperparameters + + e) ARXIV PAPERS - search_arxiv_papers( + query="Graph Neural Network {task_type} hyperparameters", + max_results=5 + ) + # Returns: Recent preprints with methodology details + + f) ENSEMBLE VOTING - compare_hyperparameter_configs( + configs=[results_from_a, results_from_b, results_from_c, results_from_d], + strategy="ensemble_median" + ) + # Returns: Final recommended hyperparameters via ensemble voting + +2. GENERATE TRAINING SCRIPT with optimal hyperparameters: + generate_training_script( + dataset_module_path="{working_dir}/dataset.py", + dataset_class_name="GenDataset", + task_module_path="{working_dir}/task.py", + task_class_name="GenTask", + working_dir="{working_dir}", + csv_dir="{csv_dir}", + task_type="{task_type}", + **recommended_hyperparameters # Use result from step 1f + ) + +3. Report the selected hyperparameters with reasoning from all sources: + - Google Scholar papers (academic consensus) + - Kaggle competitions (proven winners) + - Kaggle notebooks (community best practices) + - arXiv preprints (cutting-edge research) + - Heuristic rules (dataset-specific) + +NOTE: +- All HPO tools are provided via MCP (Model Context Protocol) +- You have access to 5 knowledge sources: Google Scholar, Kaggle, arXiv, Semantic Scholar, Papers With Code +- Training execution will be handled by the Operation Agent +- Focus on selecting optimal hyperparameters WITHOUT training experiments +""") + + return "\n".join(context_parts) + + def _process_result(self, result: Dict[str, Any], state: PipelineState) -> Dict[str, Any]: + """Process result and prepare for operation phase.""" + base_result = super()._process_result(result, state) + + import os + + working_dir = state.get("working_dir", "") + script_path = os.path.join(working_dir, "train_script.py") + + # Check if training script was generated + if os.path.exists(script_path): + base_result["training_script_ready"] = True + base_result["training_script_path"] = script_path + logger.info(f"Training script generated at {script_path}") + else: + logger.warning("Training script not found") + base_result["training_script_ready"] = False + + # Store selected hyperparameters in state for Operation Agent + if "hyperparameters" in result: + base_result["selected_hyperparameters"] = result["hyperparameters"] + + # Transition to OPERATION phase for execution + base_result["current_phase"] = PipelinePhase.OPERATION.value + + return base_result diff --git a/plexe/langgraph/agents/operation.py b/plexe/langgraph/agents/operation.py new file mode 100644 index 00000000..7a3f62dc --- /dev/null +++ 
b/plexe/langgraph/agents/operation.py @@ -0,0 +1,261 @@ +""" +Operation Agent. + +This agent handles environment setup, execution monitoring, +and final model packaging. +""" + +import logging +from typing import Optional, List, Dict, Any + +from langchain_core.tools import BaseTool + +from plexe.langgraph.agents.base import BaseAgent +from plexe.langgraph.config import AgentConfig +from plexe.langgraph.state import PipelineState, PipelinePhase +from plexe.langgraph.tools.common import save_artifact +from plexe.langgraph.tools.gnn_specialist import execute_training_script +from plexe.langgraph.prompts.operation import OPERATION_SYSTEM_PROMPT + +logger = logging.getLogger(__name__) + + +class OperationAgent(BaseAgent): + """Agent for environment setup and execution monitoring.""" + + def __init__( + self, + config: Optional[AgentConfig] = None, + additional_tools: Optional[List[BaseTool]] = None, + ): + tools = [ + execute_training_script, + save_artifact, + ] + + if additional_tools: + tools.extend(additional_tools) + + super().__init__( + agent_type="operation", + config=config, + tools=tools, + ) + + @property + def system_prompt(self) -> str: + return OPERATION_SYSTEM_PROMPT + + def _build_context(self, state: PipelineState) -> str: + """Build context with operation-specific information.""" + context_parts = [] + + working_dir = state.get("working_dir", "") + context_parts.append(f"Working directory: {working_dir}") + + # Check if training script is ready + training_script_ready = state.get("training_script_ready", False) + training_script_path = state.get("training_script_path", f"{working_dir}/train_script.py") + + if training_script_ready: + context_parts.append(f"Training script ready: {training_script_path}") + + # Check if training has been executed + if state.get("training_result"): + result = state["training_result"] + context_parts.append(f"Training already completed:") + context_parts.append(f" - Metrics: {result.get('metrics')}") + context_parts.append(f" - Model path: {result.get('model_path')}") + + # Check for hyperparameters from GNN Specialist + if state.get("selected_hyperparameters"): + hp = state["selected_hyperparameters"] + context_parts.append(f"Selected hyperparameters: {hp}") + + if state.get("errors"): + context_parts.append(f"Previous errors: {state['errors']}") + + # Instructions based on state + if not state.get("training_result"): + context_parts.append(f""" +EXECUTE TRAINING: +1. execute_training_script( + script_path="{training_script_path}", + timeout=3600 # 1 hour timeout +) +2. Process the training results from {working_dir}/training_results.json +3. Report metrics and model location +""") + else: + context_parts.append(f""" +FINALIZE PIPELINE: +1. Review training results +2. List all generated artifacts: + - {working_dir}/dataset.py - Dataset class + - {working_dir}/task.py - Task class + - {working_dir}/train_script.py - Training script + - {working_dir}/best_model.pt - Trained model + - {working_dir}/training_results.json - Training metrics +3. 
Provide summary and deployment recommendations +""") + + return "\n".join(context_parts) + + def _process_result(self, result: Dict[str, Any], state: PipelineState) -> Dict[str, Any]: + """Process result and finalize the pipeline.""" + base_result = super()._process_result(result, state) + + import os + import json + + working_dir = state.get("working_dir", ".") + results_path = os.path.join(working_dir, "training_results.json") + + # Check if training was executed and results are available + if os.path.exists(results_path): + try: + with open(results_path) as f: + training_results = json.load(f) + + base_result["training_result"] = { + "metrics": training_results, + "model_path": training_results.get("model_path"), + "script_path": os.path.join(working_dir, "train_script.py"), + } + logger.info(f"Training results processed: {training_results}") + except Exception as e: + logger.warning(f"Could not read training results: {e}") + base_result["errors"] = base_result.get("errors", []) + [f"Failed to read training results: {e}"] + + # Mark pipeline as completed + base_result["current_phase"] = PipelinePhase.COMPLETED.value + + return base_result + + def generate_inference_code(self, state: PipelineState) -> str: + """Generate inference code for the trained model.""" + working_dir = state.get("working_dir", ".") + # Safely get task_type - handle None case explicitly + task_info = state.get("task_info") + task_type = "regression" + if task_info and isinstance(task_info, dict): + task_type = task_info.get("task_type", "regression") + + inference_code = f'''""" +Auto-generated inference code for the trained GNN model. +""" + +import torch +import sys +import os + +sys.path.insert(0, "{working_dir}") + +from dataset import GenDataset +from task import GenTask + +def load_model(model_path: str): + """Load the trained model.""" + from plexe.relbench.modeling.nn import HeteroEncoder, HeteroTemporalEncoder, HeteroGraphSAGE + from plexe.relbench.modeling.graph import make_pkey_fkey_graph + from plexe.relbench.modeling.utils import get_stype_proposal + + # Initialize dataset and task + csv_dir = "{working_dir}/csv_files" + dataset = GenDataset(csv_dir=csv_dir) + task = GenTask(dataset) + db = dataset.get_db() + + # Build graph + col_to_stype_dict = get_stype_proposal(db) + data, col_stats_dict = make_pkey_fkey_graph( + db, + col_to_stype_dict=col_to_stype_dict, + text_embedder_cfg=None, + cache_dir="{working_dir}/cache/", + ) + + # Recreate model architecture + class GNNModel(torch.nn.Module): + def __init__(self, data, col_stats_dict, hidden_channels=128, out_channels=1): + super().__init__() + self.encoder = HeteroEncoder( + channels=hidden_channels, + node_to_col_names={{ + node_type: list(col_stats_dict[node_type].keys()) + for node_type in data.node_types + if node_type in col_stats_dict + }}, + node_to_col_stats=col_stats_dict, + ) + self.temporal_encoder = HeteroTemporalEncoder( + node_types=data.node_types, + channels=hidden_channels, + ) + self.gnn = HeteroGraphSAGE( + node_types=data.node_types, + edge_types=data.edge_types, + channels=hidden_channels, + num_layers=2, + ) + self.head = torch.nn.Sequential( + torch.nn.Linear(hidden_channels, hidden_channels), + torch.nn.ReLU(), + torch.nn.Dropout(0.2), + torch.nn.Linear(hidden_channels, out_channels), + ) + + def forward(self, batch, entity_table): + x_dict = self.encoder(batch.tf_dict) + rel_time_dict = self.temporal_encoder( + batch.seed_time, batch.time_dict, batch.batch_dict + ) + for node_type in x_dict: + x_dict[node_type] = 
x_dict[node_type] + rel_time_dict[node_type] + x_dict = self.gnn(x_dict, batch.edge_index_dict) + return self.head(x_dict[entity_table]) + + model = GNNModel(data, col_stats_dict) + model.load_state_dict(torch.load(model_path)) + model.eval() + + return model, data, task + + +def predict(model, data, task, entities, timestamp): + """ + Make predictions for given entities at a specific timestamp. + + Args: + model: Trained GNN model + data: HeteroData graph + task: Task definition + entities: List of entity IDs to predict for + timestamp: Prediction timestamp + + Returns: + Predictions for each entity + """ + from plexe.relbench.modeling.graph import get_node_train_table_input + from torch_geometric.loader import NeighborLoader + import pandas as pd + + # Create prediction table + pred_df = pd.DataFrame({{ + task.time_col: [pd.Timestamp(timestamp)] * len(entities), + task.entity_col: entities, + }}) + + # Create loader + # ... (implementation depends on specific use case) + + return predictions + + +if __name__ == "__main__": + model_path = "{working_dir}/best_model.pt" + model, data, task = load_model(model_path) + print("Model loaded successfully!") +''' + + return inference_code diff --git a/plexe/langgraph/agents/task_builder.py b/plexe/langgraph/agents/task_builder.py new file mode 100644 index 00000000..5d1495f8 --- /dev/null +++ b/plexe/langgraph/agents/task_builder.py @@ -0,0 +1,212 @@ +""" +Task Builder Agent. + +This agent builds RelBench Task objects for prediction tasks, +generating SQL queries and complete Python Task classes. +""" + +import logging +import os +from typing import Optional, List, Dict, Any + +from langchain_core.tools import BaseTool + +from plexe.langgraph.agents.base import BaseAgent +from plexe.langgraph.config import AgentConfig +from plexe.langgraph.state import PipelineState, PipelinePhase +from plexe.langgraph.tools.common import save_artifact +from plexe.langgraph.tools.dataset_builder import get_csv_files_info +from plexe.langgraph.tools.task_builder import test_sql_query, register_task_code +from plexe.langgraph.prompts.task_builder import TASK_BUILDER_SYSTEM_PROMPT + +logger = logging.getLogger(__name__) + +class TaskBuilderAgent(BaseAgent): + """Agent for building RelBench Task classes.""" + + def __init__( + self, + config: Optional[AgentConfig] = None, + additional_tools: Optional[List[BaseTool]] = None, + ): + tools = [ + get_csv_files_info, + test_sql_query, + register_task_code, + save_artifact, + ] + + if additional_tools: + tools.extend(additional_tools) + + super().__init__( + agent_type="task_builder", + config=config, + tools=tools, + ) + + @property + def system_prompt(self) -> str: + return TASK_BUILDER_SYSTEM_PROMPT + + def _build_context(self, state: PipelineState) -> str: + """Build context with task-specific information.""" + context_parts = [] + + if state.get("working_dir"): + context_parts.append(f"Working directory: {state['working_dir']}") + + if state.get("csv_dir"): + context_parts.append(f"CSV directory: {state['csv_dir']}") + + # User intent analysis + if state.get("user_intent"): + intent = state["user_intent"] + context_parts.append("\n## User Intent:") + if isinstance(intent, dict): + pred_target = intent.get('prediction_target', 'unknown') + task_type = intent.get('task_type', 'unknown') + context_parts.append(f" - Prediction target: {pred_target}") + context_parts.append(f" - Task type: {task_type}") + + # Suggest appropriate metrics + if 'binary' in str(task_type).lower() or 'classification' in str(task_type).lower(): + 
context_parts.append(f" - Suggested metrics: average_precision, accuracy, f1, roc_auc") + elif 'regression' in str(task_type).lower(): + context_parts.append(f" - Suggested metrics: mae, rmse, r2") + elif 'link' in str(task_type).lower() or 'recommendation' in str(task_type).lower(): + context_parts.append(f" - Suggested metrics: link_prediction_map, link_prediction_precision, link_prediction_recall") + context_parts.append(f" - Use RecommendationTask base class with eval_k parameter") + else: + context_parts.append(f" - Intent: {intent}") + + # Schema information + if state.get("schema_info"): + schema = state["schema_info"] + context_parts.append("\n## Schema Information:") + tables = list(schema.get("tables", {}).keys()) + context_parts.append(f"Available tables: {', '.join(tables)}") + + context_parts.append("\nTable Details:") + for table_name, table_info in schema.get("tables", {}).items(): + columns = table_info.get("columns", []) + pk = table_info.get("primary_key", []) + context_parts.append(f" - {table_name}:") + context_parts.append(f" * Columns: {', '.join([c['name'] for c in columns[:10]])}") + if pk: + context_parts.append(f" * Primary Key: {pk}") + + # Foreign key relationships + if schema.get("relationships"): + context_parts.append("\nForeign Key Relationships:") + for rel in schema["relationships"]: + context_parts.append( + f" - {rel['source_table']}.{rel['source_column']} -> {rel['target_table']}.{rel['target_column']}" + ) + + # Dataset information + if state.get("dataset_info"): + ds = state["dataset_info"] + context_parts.append("\n## Dataset Information:") + context_parts.append(f" - Class: {ds.get('class_name')}") + if ds.get("val_timestamp"): + context_parts.append(f" - Validation timestamp: {ds.get('val_timestamp')}") + if ds.get("test_timestamp"): + context_parts.append(f" - Test timestamp: {ds.get('test_timestamp')}") + + # EDA insights + if state.get("eda_info"): + eda = state["eda_info"] + context_parts.append("\n## EDA Analysis:") + + if eda.get("statistics"): + context_parts.append("Table Statistics:") + for table, stats in eda["statistics"].items(): + if isinstance(stats, dict): + row_count = stats.get("row_count", "unknown") + context_parts.append(f" - {table}: {row_count} rows") + + if eda.get("temporal_analysis"): + context_parts.append("\nTemporal Analysis:") + for table, analysis in eda["temporal_analysis"].items(): + if analysis.get("time_columns"): + cols = analysis['time_columns'] + context_parts.append(f" - {table} time columns: {cols}") + # Add time range info if available + for col_name, col_info in cols.items(): + if isinstance(col_info, dict): + min_date = col_info.get('min') + max_date = col_info.get('max') + if min_date and max_date: + context_parts.append(f" * {col_name}: {min_date} to {max_date}") + + # Suggest timedelta based on temporal data + if eda.get("suggested_timedelta"): + context_parts.append(f"\nSuggested prediction window: {eda.get('suggested_timedelta')}") + + # Task generation instructions + working_dir = state.get('working_dir', '') + csv_dir = state.get('csv_dir', '') + + context_parts.append(f""" +## Your Task: +1. Determine if this is an EntityTask or RecommendationTask based on user intent +2. Identify the entity table and entity column (or src/dst for recommendations) +3. Determine appropriate time_col (from temporal analysis) +4. Design SQL query with proper temporal filtering to compute target labels +5. Choose appropriate metrics based on task type +6. 
Estimate reasonable timedelta (prediction window) based on temporal data range +7. Set num_eval_timestamps (default 20, adjust based on data frequency) +8. For link prediction: set eval_k (typical: 10-12) +9. Test your SQL: test_sql_query("{csv_dir}", query) +10. Generate complete code and save: register_task_code(code, "GenTask", "{working_dir}/task.py", task_type) + +CRITICAL REMINDERS: +- Use TaskType enum: TaskType.BINARY_CLASSIFICATION, TaskType.REGRESSION, TaskType.LINK_PREDICTION +- Import correct base class: EntityTask or RecommendationTask +- Import only metrics you use from plexe.relbench.metrics +- Convert timestamps: timestamp_df = pd.DataFrame({{"timestamp": timestamps}}) +- Use duckdb.sql() method, not conn.execute() +- Return Table object with proper fkey_col_to_pkey_table mapping +""") + + return "\n".join(context_parts) + + def _process_result(self, result: Dict[str, Any], state: PipelineState) -> Dict[str, Any]: + """Process result and extract task information.""" + base_result = super()._process_result(result, state) + + task_info = {} + generated_code = state.get("generated_code", {}) + working_dir = state.get("working_dir", "") + + task_path = os.path.join(working_dir, "task.py") + if os.path.exists(task_path): + task_info["class_name"] = "GenTask" + task_info["file_path"] = task_path + logger.info(f"Task file created at: {task_path}") + + intent = state.get("user_intent", {}) + if isinstance(intent, dict): + task_info["task_type"] = intent.get("task_type", "binary_classification") + else: + task_info["task_type"] = "binary_classification" + else: + error_msg = f"CRITICAL ERROR: Task file not found at {task_path}. TaskBuilderAgent did not complete its task. The agent must call register_task_code() to generate task.py." + logger.error(error_msg) + # Return error state to force re-invocation or escalation + base_result["error"] = error_msg + base_result["status"] = "error" + task_info["class_name"] = "GenTask" + task_info["file_path"] = task_path + task_info["error"] = "File not generated" + task_info["task_type"] = "binary_classification" + + base_result["task_info"] = task_info + + if generated_code: + base_result["generated_code"] = generated_code + + base_result["current_phase"] = PipelinePhase.GNN_TRAINING.value + + return base_result diff --git a/plexe/langgraph/config.py b/plexe/langgraph/config.py new file mode 100644 index 00000000..85a9e45e --- /dev/null +++ b/plexe/langgraph/config.py @@ -0,0 +1,131 @@ +""" +Configuration for LangGraph-based agents. + +This module provides configuration management for agent models +using environment variables and defaults. 
+""" + +import os +from dataclasses import dataclass, field +from typing import Optional + + +@dataclass +class ExternalAPIConfig: + """Configuration for external API services.""" + + # Semantic Scholar API + semantic_scholar_api_key: Optional[str] = field(default_factory=lambda: os.environ.get( + "SEMANTIC_SCHOLAR_API_KEY" + )) + + # arXiv API (no key needed, but rate limits apply) + arxiv_base_url: str = "http://export.arxiv.org/api/query" + + # Papers With Code API + papers_with_code_api_url: str = "https://paperswithcode.com/api/v1" + + # Hugging Face API (for datasets and models) + huggingface_token: Optional[str] = field(default_factory=lambda: os.environ.get( + "HF_TOKEN" + )) + + # OpenML API + openml_api_key: Optional[str] = field(default_factory=lambda: os.environ.get( + "OPENML_API_KEY" + )) + + # Rate limiting + max_requests_per_minute: int = 10 + request_timeout: int = 30 + + @classmethod + def from_env(cls) -> "ExternalAPIConfig": + """Create configuration from environment variables.""" + return cls() + + +@dataclass +class AgentConfig: + """Configuration for agent models from environment variables.""" + + orchestrator_model: str = field(default_factory=lambda: os.environ.get( + "PLEXE_ORCHESTRATOR_MODEL", "openai/gpt-4o" + )) + conversational_model: str = field(default_factory=lambda: os.environ.get( + "PLEXE_CONVERSATIONAL_MODEL", "openai/gpt-4o" + )) + eda_model: str = field(default_factory=lambda: os.environ.get( + "PLEXE_EDA_MODEL", "openai/gpt-4o" + )) + dataset_builder_model: str = field(default_factory=lambda: os.environ.get( + "PLEXE_DATASET_BUILDER_MODEL", "openai/gpt-4o" + )) + task_builder_model: str = field(default_factory=lambda: os.environ.get( + "PLEXE_TASK_BUILDER_MODEL", "openai/gpt-4o" + )) + gnn_specialist_model: str = field(default_factory=lambda: os.environ.get( + "PLEXE_GNN_SPECIALIST_MODEL", "openai/gpt-4o" + )) + operation_model: str = field(default_factory=lambda: os.environ.get( + "PLEXE_OPERATION_MODEL", "openai/gpt-4o" + )) + + temperature: float = field(default_factory=lambda: float(os.environ.get( + "PLEXE_AGENT_TEMPERATURE", "0.1" + ))) + max_retries: int = field(default_factory=lambda: int(os.environ.get( + "PLEXE_MAX_RETRIES", "3" + ))) + verbose: bool = field(default_factory=lambda: os.environ.get( + "PLEXE_VERBOSE", "false" + ).lower() == "true") + + @classmethod + def from_env(cls) -> "AgentConfig": + """Create configuration from environment variables.""" + return cls() + + def get_model_for_agent(self, agent_type: str) -> str: + """Get the model ID for a specific agent type.""" + mapping = { + "orchestrator": self.orchestrator_model, + "conversational": self.conversational_model, + "eda": self.eda_model, + "dataset_builder": self.dataset_builder_model, + "task_builder": self.task_builder_model, + "gnn_specialist": self.gnn_specialist_model, + "operation": self.operation_model, + } + return mapping.get(agent_type, self.orchestrator_model) + + +def get_llm_from_model_id(model_id: str, temperature: float = 0.1): + """ + Create a LangChain LLM instance from a model ID string. 
+ + Supports formats: + - openai/gpt-4o -> OpenAI + - anthropic/claude-sonnet-4-20250514 -> Anthropic + - gemini/gemini-2.5-flash -> Google Gemini + """ + from langchain_core.language_models import BaseChatModel + + if model_id.startswith("openai/"): + from langchain_openai import ChatOpenAI + model_name = model_id.replace("openai/", "") + return ChatOpenAI(model=model_name, temperature=temperature) + + elif model_id.startswith("anthropic/"): + from langchain_anthropic import ChatAnthropic + model_name = model_id.replace("anthropic/", "") + return ChatAnthropic(model=model_name, temperature=temperature) + + elif model_id.startswith("gemini/"): + from langchain_google_genai import ChatGoogleGenerativeAI + model_name = model_id.replace("gemini/", "") + return ChatGoogleGenerativeAI(model=model_name, temperature=temperature) + + else: + from langchain_openai import ChatOpenAI + return ChatOpenAI(model=model_id, temperature=temperature) diff --git a/plexe/langgraph/mcp_manager.py b/plexe/langgraph/mcp_manager.py new file mode 100644 index 00000000..c716d894 --- /dev/null +++ b/plexe/langgraph/mcp_manager.py @@ -0,0 +1,202 @@ +import logging +import json +import os +import re +import sys +from typing import List, Dict, Any, Optional +from mcp import ClientSession, StdioServerParameters +from mcp.client.stdio import stdio_client +from langchain_core.tools import Tool, BaseTool, StructuredTool +from pydantic import BaseModel, Field, create_model +from dotenv import load_dotenv +from contextlib import AsyncExitStack + +logger = logging.getLogger(__name__) + +class MCPManager: + """ + Manager for Model Context Protocol (MCP) servers. + Provides connectivity and conversion of MCP tools to LangChain tools. + """ + + def __init__(self, config_path: Optional[str] = None): + self.config_path = config_path or os.environ.get("MCP_CONFIG_PATH", "mcp_config.json") + self.sessions: Dict[str, Any] = {} + self.tools: List[BaseTool] = [] + self._exit_stack = AsyncExitStack() + # Load environment variables from .env + load_dotenv() + + async def initialize(self): + """Initialize connections to configured MCP servers.""" + if not os.path.exists(self.config_path): + logger.warning(f"MCP config not found at {self.config_path}") + return + + try: + with open(self.config_path, 'r') as f: + config = json.load(f) + + for server_name, server_config in config.get("mcpServers", {}).items(): + await self._connect_to_server(server_name, server_config) + + except Exception as e: + logger.error(f"Error initializing MCP Manager: {e}") + + async def _connect_to_server(self, name: str, config: Dict[str, Any]): + """Connect to a specific MCP server and discover tools.""" + try: + command = config.get("command") + args = config.get("args", []) + + # If command is "python", use the current Python interpreter + # This ensures we use the venv Python, not system Python + if command == "python": + command = sys.executable + + # Expand environment variables in env config + env_config = config.get("env", {}) + env = {**os.environ} + for k, v in env_config.items(): + if isinstance(v, str) and v.startswith("${") and v.endswith("}"): + var_name = v[2:-1] + env[k] = os.environ.get(var_name, "") + logger.debug(f"Expanded MCP env var {k}={var_name}") + else: + env[k] = v + + # Convert relative paths to absolute + abs_args = [] + for arg in args: + if arg.endswith('.py') and not os.path.isabs(arg): + abs_args.append(os.path.abspath(arg)) + else: + abs_args.append(arg) + + params = StdioServerParameters(command=command, args=abs_args, env=env) + 
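+            # Illustrative mcp_config.json entry that this parser expects (server
+            # name, script path and env var are examples, not requirements):
+            #
+            #   "mcpServers": {
+            #     "semantic_scholar": {
+            #       "command": "python",
+            #       "args": ["plexe/langgraph/mcp_servers/semantic_scholar_server.py"],
+            #       "env": {"SEMANTIC_SCHOLAR_API_KEY": "${SEMANTIC_SCHOLAR_API_KEY}"}
+            #     }
+            #   }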
+ # Use proper async context manager pattern + logger.info(f"Connecting to MCP server: {name}...") + + client_ctx = stdio_client(params) + streams = await self._exit_stack.enter_async_context(client_ctx) + read_stream, write_stream = streams + + session = ClientSession(read_stream, write_stream) + await self._exit_stack.enter_async_context(session) + await session.initialize() + + # List tools from the server + result = await session.list_tools() + + for tool_info in result.tools: + langchain_tool = self._convert_to_langchain_tool(session, tool_info) + self.tools.append(langchain_tool) + + self.sessions[name] = session + logger.info(f"Connected to MCP server: {name} with {len(result.tools)} tools") + + except Exception as e: + logger.error(f"Failed to connect to MCP server {name}: {e}") + + def _convert_to_langchain_tool(self, session: ClientSession, tool_info: Any) -> BaseTool: + """Convert an MCP tool definition to a LangChain BaseTool with proper schema.""" + import asyncio + + async def tool_func_async(**kwargs): + result = await session.call_tool(tool_info.name, kwargs) + # Handle list of content parts from MCP + return "\n".join([str(c.text) if hasattr(c, 'text') else str(c) for c in result.content]) + + def tool_func_sync(**kwargs): + try: + # Try to get existing loop or create new one + try: + loop = asyncio.get_event_loop() + if loop.is_running(): + # We are in an async world already, this is tricky + # But for now, we'll use a wrapper if needed + import nest_asyncio + nest_asyncio.apply() + return loop.run_until_complete(tool_func_async(**kwargs)) + else: + return loop.run_until_complete(tool_func_async(**kwargs)) + except RuntimeError: + return asyncio.run(tool_func_async(**kwargs)) + except Exception as e: + return f"MCP tool error: {str(e)}" + + # Check if tool has input schema with multiple properties + input_schema = getattr(tool_info, 'inputSchema', None) + + if input_schema and isinstance(input_schema, dict): + properties = input_schema.get('properties', {}) + required = input_schema.get('required', []) + + if properties: + # Build Pydantic model dynamically for StructuredTool + field_definitions = {} + for prop_name, prop_info in properties.items(): + prop_type = prop_info.get('type', 'string') + prop_desc = prop_info.get('description', '') + + # Map JSON schema types to Python types + type_mapping = { + 'string': str, + 'integer': int, + 'number': float, + 'boolean': bool, + 'array': list, + 'object': dict, + } + python_type = type_mapping.get(prop_type, str) + + # Special handling for array types: ensure items schema is properly specified + if prop_type == 'array': + items_schema = prop_info.get('items') + if items_schema is None or not isinstance(items_schema, dict): + # If items is missing or invalid, default to list of dicts + # This ensures Gemini gets a valid schema + prop_info['items'] = {'type': 'object'} + logger.warning(f"Array parameter '{prop_name}' missing 'items' schema, defaulting to 'object'") + elif 'type' not in items_schema: + # If items exists but lacks 'type', add it + items_schema['type'] = 'object' + logger.warning(f"Array parameter '{prop_name}' items missing 'type', defaulting to 'object'") + + # Use Optional for non-required fields + if prop_name in required: + field_definitions[prop_name] = (python_type, Field(description=prop_desc)) + else: + field_definitions[prop_name] = (Optional[python_type], Field(default=None, description=prop_desc)) + + # Create dynamic Pydantic model for args schema + ArgsModel = create_model(f'{tool_info.name}Args', 
**field_definitions) + + return StructuredTool.from_function( + func=tool_func_sync, + name=tool_info.name, + description=tool_info.description or f"MCP tool: {tool_info.name}", + args_schema=ArgsModel, + ) + + # Fallback to simple Tool for tools without complex schema + return Tool( + name=tool_info.name, + description=tool_info.description or f"MCP tool: {tool_info.name}", + func=tool_func_sync, + ) + + def get_tools(self) -> List[BaseTool]: + """Return the list of discovered MCP tools.""" + return self.tools + + async def close(self): + """Close all MCP server connections.""" + try: + await self._exit_stack.aclose() + self.sessions.clear() + self.tools.clear() + logger.info("MCP Manager closed all connections") + except Exception as e: + logger.error(f"Error closing MCP Manager: {e}") \ No newline at end of file diff --git a/plexe/langgraph/mcp_servers/arxiv_server.py b/plexe/langgraph/mcp_servers/arxiv_server.py new file mode 100644 index 00000000..b14e8755 --- /dev/null +++ b/plexe/langgraph/mcp_servers/arxiv_server.py @@ -0,0 +1,66 @@ +import os +import requests +import xml.etree.ElementTree as ET +from typing import List, Dict, Any, Optional +from mcp.server.fastmcp import FastMCP + +# Initialize FastMCP server +mcp = FastMCP("arXiv") + +BASE_URL = "http://export.arxiv.org/api/query" + +@mcp.tool() +def search_arxiv_papers( + query: str, + max_results: int = 10, + sort_by: str = "relevance", + sort_order: str = "descending" +) -> List[Dict[str, Any]]: + """ + Search arXiv for papers. + """ + params = { + 'search_query': query, + 'start': 0, + 'max_results': max_results, + 'sortBy': sort_by, + 'sortOrder': sort_order + } + + try: + response = requests.get( + BASE_URL, + params=params, + timeout=30 + ) + response.raise_for_status() + return _parse_arxiv_response(response.text) + except Exception as e: + return [{"error": str(e)}] + +def _parse_arxiv_response(xml_text: str) -> List[Dict[str, Any]]: + """Parse arXiv XML response into list of paper dicts.""" + papers = [] + + try: + root = ET.fromstring(xml_text) + ns = {'atom': 'http://www.w3.org/2005/Atom'} + + for entry in root.findall('atom:entry', ns): + paper = { + 'id': entry.find('atom:id', ns).text if entry.find('atom:id', ns) is not None else '', + 'title': entry.find('atom:title', ns).text.strip() if entry.find('atom:title', ns) is not None else '', + 'summary': entry.find('atom:summary', ns).text.strip() if entry.find('atom:summary', ns) is not None else '', + 'published': entry.find('atom:published', ns).text if entry.find('atom:published', ns) is not None else '', + 'updated': entry.find('atom:updated', ns).text if entry.find('atom:updated', ns) is not None else '', + 'authors': [author.find('atom:name', ns).text for author in entry.findall('atom:author', ns)], + 'categories': [cat.get('term') for cat in entry.findall('atom:category', ns)] + } + papers.append(paper) + except Exception as e: + papers.append({"error": f"Parse error: {str(e)}"}) + + return papers + +if __name__ == "__main__": + mcp.run() diff --git a/plexe/langgraph/mcp_servers/hpo_server.py b/plexe/langgraph/mcp_servers/hpo_server.py new file mode 100644 index 00000000..72e8422a --- /dev/null +++ b/plexe/langgraph/mcp_servers/hpo_server.py @@ -0,0 +1,476 @@ +""" +MCP Server for Hyperparameter Optimization (HPO) Search. + +This server provides training-free HPO capabilities by: +1. Extracting hyperparameters from academic papers (via other MCP servers) +2. Querying benchmark databases for proven configurations +3. 
Applying heuristics based on dataset characteristics +""" + +import os +import re +from typing import Dict, Any, List, Optional +from mcp.server.fastmcp import FastMCP +from pydantic import BaseModel, Field +import requests +import xml.etree.ElementTree as ET + +# Initialize FastMCP server +mcp = FastMCP("HPO Search") + + +class HyperparameterConfig(BaseModel): + """Hyperparameter configuration from a single source.""" + hyperparameters: Dict[str, Any] = Field( + description="Dictionary of hyperparameter names to values" + ) + source: str = Field( + default="unknown", + description="Source of this configuration (e.g., 'heuristics', 'literature', 'benchmark')" + ) + confidence: Optional[str] = Field( + default=None, + description="Confidence level of this configuration" + ) + + +@mcp.tool() +def search_optimal_hyperparameters( + task_type: str, + num_nodes: int = 10000, + num_tables: int = 5, + is_temporal: bool = True, + model_architecture: str = "gnn" +) -> Dict[str, Any]: + """ + Search for optimal hyperparameters using training-free heuristics. + + This implements knowledge-based HPO without running training experiments. + Based on dataset characteristics and task type. + + Args: + task_type: Type of task (regression, binary_classification, multiclass_classification) + num_nodes: Number of nodes in graph + num_tables: Number of tables in relational DB + is_temporal: Whether task is temporal + model_architecture: Model architecture type (default: "gnn") + + Returns: + Dict with optimal hyperparameters and reasoning + """ + + # Heuristic rules based on dataset size and task type + if num_nodes < 5000: + hidden_channels = 64 + batch_size = 256 + num_layers = 2 + elif num_nodes < 50000: + hidden_channels = 128 + batch_size = 512 + num_layers = 2 + else: + hidden_channels = 256 + batch_size = 1024 + num_layers = 3 + + # Task-specific hyperparameters + if task_type == "regression": + learning_rate = 0.005 + epochs = 20 + tune_metric = "mae" + higher_is_better = False + elif task_type == "binary_classification": + learning_rate = 0.01 + epochs = 15 + tune_metric = "accuracy" + higher_is_better = True + else: # multiclass_classification + learning_rate = 0.01 + epochs = 15 + tune_metric = "accuracy" + higher_is_better = True + + # Temporal adjustment + if is_temporal: + learning_rate *= 0.8 + + # Build reasoning + reasoning = [ + f"Based on dataset size ({num_nodes} nodes, {num_tables} tables):", + f" - Hidden channels: {hidden_channels} (balanced capacity)", + f" - Batch size: {batch_size} (optimal memory/performance)", + f" - GNN layers: {num_layers} (appropriate receptive field)", + f"\nBased on task type ({task_type}):", + f" - Learning rate: {learning_rate}", + f" - Epochs: {epochs}", + f" - Metric: {tune_metric}", + ] + if is_temporal: + reasoning.append("\nTemporal adjustment: reduced LR by 20%") + + return { + "hyperparameters": { + "hidden_channels": hidden_channels, + "batch_size": batch_size, + "learning_rate": learning_rate, + "num_gnn_layers": num_layers, + "epochs": epochs, + "tune_metric": tune_metric, + "higher_is_better": higher_is_better, + }, + "reasoning": "\n".join(reasoning), + "confidence": "high", + "source": "heuristic_based" + } + + +@mcp.tool() +def extract_hyperparameters_from_papers( + paper_query: str, + model_type: str = "gnn", + num_papers: int = 5 +) -> Dict[str, Any]: + """ + Search papers and extract hyperparameters from their content. + + Uses arXiv API to find relevant papers and extracts hyperparameter + values from abstracts using pattern matching. 
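+
+    For example, an abstract containing "learning rate 0.001, batch size 256"
+    yields {"learning_rate": 0.001, "batch_size": 256} (illustrative values;
+    only patterns matched by the regexes below are captured).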
+ + Args: + paper_query: Search query for papers (e.g., "Graph Neural Networks node classification") + model_type: Type of model (gnn, transformer, etc.) + num_papers: Number of papers to analyze + + Returns: + Dict with extracted hyperparameters from multiple papers + """ + + # Search arXiv for papers + query = f"all:{model_type} {paper_query}" + params = { + 'search_query': query, + 'start': 0, + 'max_results': num_papers, + 'sortBy': 'relevance', + 'sortOrder': 'descending' + } + + papers_analyzed = [] + hyperparams_found = [] + + try: + response = requests.get( + "http://export.arxiv.org/api/query", + params=params, + timeout=30 + ) + response.raise_for_status() + + # Parse XML response + root = ET.fromstring(response.text) + ns = {'atom': 'http://www.w3.org/2005/Atom'} + + for entry in root.findall('atom:entry', ns): + title_elem = entry.find('atom:title', ns) + summary_elem = entry.find('atom:summary', ns) + + if title_elem is None or summary_elem is None: + continue + + title = title_elem.text.strip() + summary = summary_elem.text.strip() + + papers_analyzed.append({ + "title": title, + "summary_preview": summary[:200] + "..." + }) + + # Extract hyperparameters from text + extracted = _extract_hyperparameters_from_text(summary) + if extracted: + hyperparams_found.append({ + "source_paper": title, + "hyperparameters": extracted + }) + + except Exception as e: + return { + "error": str(e), + "papers_analyzed": 0, + "hyperparameters_found": [] + } + + # Aggregate hyperparameters + aggregated = _aggregate_hyperparameters(hyperparams_found) + + return { + "papers_analyzed": len(papers_analyzed), + "papers_with_hyperparams": len(hyperparams_found), + "extracted_hyperparameters": hyperparams_found, + "aggregated_hyperparameters": aggregated, + "confidence": "high" if len(hyperparams_found) >= 3 else "medium", + "source": "literature_extraction" + } + + +@mcp.tool() +def get_benchmark_hyperparameters( + task_type: str, + dataset_domain: str = "general", + model_architecture: str = "gnn" +) -> Dict[str, Any]: + """ + Get hyperparameters from benchmark leaderboards and competitions. + + Queries Papers With Code and other benchmark databases for + proven hyperparameter configurations. + + Args: + task_type: Type of task (regression, classification, etc.) + dataset_domain: Domain of dataset (general, temporal, relational, etc.) + model_architecture: Model architecture (gnn, transformer, etc.) 
+ + Returns: + Dict with benchmark-based hyperparameters + """ + + # Query Papers With Code API + benchmark_configs = [] + + try: + # Search for papers with code + search_query = f"{model_architecture} {task_type}" + response = requests.get( + "https://paperswithcode.com/api/v1/papers/", + params={'q': search_query, 'items_per_page': 5}, + timeout=30 + ) + + if response.status_code == 200: + data = response.json() + papers = data.get('results', []) + + for paper in papers[:3]: + benchmark_configs.append({ + "paper_title": paper.get('title', ''), + "paper_url": paper.get('url', ''), + "conference": paper.get('conference', 'N/A') + }) + + except Exception as e: + pass # Continue with defaults if API fails + + # Provide sensible defaults based on benchmarks + # These are based on common winning configurations + if model_architecture == "gnn": + if task_type in ["binary_classification", "multiclass_classification"]: + hyperparams = { + "hidden_channels": 128, + "batch_size": 512, + "learning_rate": 0.01, + "num_gnn_layers": 2, + "epochs": 15, + "dropout": 0.1, + "optimizer": "adam" + } + else: # regression + hyperparams = { + "hidden_channels": 128, + "batch_size": 512, + "learning_rate": 0.005, + "num_gnn_layers": 2, + "epochs": 20, + "dropout": 0.1, + "optimizer": "adam" + } + else: + hyperparams = { + "learning_rate": 0.001, + "batch_size": 256, + "epochs": 20 + } + + return { + "hyperparameters": hyperparams, + "benchmarks_referenced": len(benchmark_configs), + "benchmark_papers": benchmark_configs, + "confidence": "high" if benchmark_configs else "medium", + "source": "benchmark_leaderboards" + } + + +@mcp.tool() +def compare_hyperparameter_configs( + configs: List[HyperparameterConfig], + strategy: str = "ensemble_median" +) -> Dict[str, Any]: + """ + Compare multiple hyperparameter configurations and select the best. + + Uses ensemble voting across multiple sources (heuristics, literature, + benchmarks) to determine optimal hyperparameters. + + Args: + configs: List of hyperparameter config objects from different sources. + Each config should have 'hyperparameters' dict, 'source' string, and optional 'confidence'. 
+ strategy: Strategy for combining configs (ensemble_median, highest_confidence, voting) + + Returns: + Dict with final recommended hyperparameters + """ + + if not configs: + return { + "error": "No configurations provided", + "recommended_hyperparameters": {} + } + + # Extract all hyperparameter dicts + all_hyperparams = [] + sources = [] + + for config in configs: + # Handle both Pydantic models and dicts + config_dict = config.model_dump() if hasattr(config, 'model_dump') else config + if "hyperparameters" in config_dict: + all_hyperparams.append(config_dict["hyperparameters"]) + sources.append(config_dict.get("source", "unknown")) + + if not all_hyperparams: + return { + "error": "No hyperparameters found in configs", + "recommended_hyperparameters": {} + } + + # Aggregate using ensemble strategy + final_hyperparams = {} + + # Get all parameter names + all_param_names = set() + for hp in all_hyperparams: + all_param_names.update(hp.keys()) + + # For each parameter, compute median or mode + for param_name in all_param_names: + values = [hp[param_name] for hp in all_hyperparams if param_name in hp] + + if not values: + continue + + if isinstance(values[0], (int, float)): + # Numeric: take median + sorted_vals = sorted(values) + final_hyperparams[param_name] = sorted_vals[len(sorted_vals) // 2] + else: + # Categorical: take most common + final_hyperparams[param_name] = max(set(values), key=values.count) + + return { + "recommended_hyperparameters": final_hyperparams, + "num_sources": len(all_hyperparams), + "sources": sources, + "strategy": strategy, + "confidence": "high" if len(all_hyperparams) >= 3 else "medium" + } + + +def _extract_hyperparameters_from_text(text: str) -> Dict[str, Any]: + """Extract hyperparameter values from text using regex patterns.""" + hyperparams = {} + text_lower = text.lower() + + # Learning rate patterns + lr_patterns = [ + r'learning rate[:\s]+([0-9.e-]+)', + r'lr[:\s=]+([0-9.e-]+)', + r'α[:\s=]+([0-9.e-]+)' + ] + for pattern in lr_patterns: + match = re.search(pattern, text_lower) + if match: + hyperparams['learning_rate'] = float(match.group(1)) + break + + # Batch size + batch_patterns = [ + r'batch size[:\s]+([0-9]+)', + r'batch[:\s=]+([0-9]+)', + ] + for pattern in batch_patterns: + match = re.search(pattern, text_lower) + if match: + hyperparams['batch_size'] = int(match.group(1)) + break + + # Hidden dimensions + hidden_patterns = [ + r'hidden[_ ](?:dimension|channel|unit)s?[:\s]+([0-9]+)', + r'embedding[_ ](?:dimension|size)[:\s]+([0-9]+)', + ] + for pattern in hidden_patterns: + match = re.search(pattern, text_lower) + if match: + hyperparams['hidden_channels'] = int(match.group(1)) + break + + # Number of layers + layer_patterns = [ + r'([0-9]+)[- ]layer', + r'num[_ ]layers?[:\s]+([0-9]+)', + ] + for pattern in layer_patterns: + match = re.search(pattern, text_lower) + if match: + hyperparams['num_layers'] = int(match.group(1)) + break + + # Epochs + epoch_patterns = [ + r'([0-9]+) epochs?', + r'epochs?[:\s]+([0-9]+)', + ] + for pattern in epoch_patterns: + match = re.search(pattern, text_lower) + if match: + hyperparams['epochs'] = int(match.group(1)) + break + + return hyperparams + + +def _aggregate_hyperparameters( + hyperparams_list: List[Dict[str, Any]] +) -> Dict[str, Any]: + """Aggregate hyperparameters from multiple sources using median.""" + + if not hyperparams_list: + return {} + + aggregated = {} + + # Get all parameter names + all_params = set() + for item in hyperparams_list: + hp = item.get("hyperparameters", {}) + 
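+        # The loop below then aggregates per parameter: numeric values take the
+        # median (e.g. hidden_channels reported as 64, 128 and 256 across papers
+        # aggregates to 128), other values take the most common entry.
+        # (Example values are illustrative.)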
all_params.update(hp.keys()) + + # For each parameter, compute median + for param in all_params: + values = [] + for item in hyperparams_list: + hp = item.get("hyperparameters", {}) + if param in hp: + values.append(hp[param]) + + if values: + if isinstance(values[0], (int, float)): + sorted_vals = sorted(values) + aggregated[param] = sorted_vals[len(sorted_vals) // 2] + else: + aggregated[param] = max(set(values), key=values.count) + + return aggregated + + +if __name__ == "__main__": + mcp.run() diff --git a/plexe/langgraph/mcp_servers/kaggle_server.py b/plexe/langgraph/mcp_servers/kaggle_server.py new file mode 100644 index 00000000..6a6b74b9 --- /dev/null +++ b/plexe/langgraph/mcp_servers/kaggle_server.py @@ -0,0 +1,135 @@ +import os +import re +from typing import Dict, Any, List +from mcp.server.fastmcp import FastMCP + +# Initialize FastMCP server +mcp = FastMCP("Kaggle") + +@mcp.tool() +def search_gnn_competitions_for_benchmarks( + task_type: str = "graph", + limit: int = 5 +) -> Dict[str, Any]: + """ + Search Kaggle competitions for GNN benchmarks and winning solutions. + + Useful for finding proven hyperparameters from competition winners. + + Args: + task_type: Type of task (e.g., "graph", "node classification", "link prediction") + limit: Maximum number of competitions to retrieve + + Returns: + Dict with competition info and potential hyperparameter insights + """ + from kaggle.api.kaggle_api_extended import KaggleApi + api = KaggleApi() + api.authenticate() + + # Search for competitions + query = f"{task_type} neural network" + competitions = api.competitions_list(search=query) + + results = [] + for i, comp in enumerate(competitions): + if i >= limit: + break + results.append({ + "ref": comp.ref, + "title": comp.title, + "description": comp.description[:200] if comp.description else "", + "reward": str(comp.reward) if hasattr(comp, 'reward') else "N/A", + "teamCount": comp.teamCount if hasattr(comp, 'teamCount') else 0, + "userHasEntered": comp.userHasEntered if hasattr(comp, 'userHasEntered') else False + }) + + return { + "competitions_found": len(results), + "competitions": results, + "source": "Kaggle Competitions", + "note": "Check competition notebooks for winning hyperparameters" + } + +@mcp.tool() +def search_gnn_notebooks_for_hyperparameters( + task_type: str = "graph neural network", + limit: int = 5 +) -> Dict[str, Any]: + """ + Search Kaggle notebooks for GNN implementations and extract hyperparameters. 
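+
+    Note: this relies on standard Kaggle API credentials being configured
+    (a ~/.kaggle/kaggle.json file or the KAGGLE_USERNAME / KAGGLE_KEY
+    environment variables), since KaggleApi().authenticate() is called.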
+ + Args: + task_type: Type of task or model (e.g., "GNN", "Graph Neural Network") + limit: Maximum number of notebooks to retrieve + + Returns: + Dict with notebook info and hyperparameter insights + """ + from kaggle.api.kaggle_api_extended import KaggleApi + api = KaggleApi() + api.authenticate() + + # Search for notebooks/kernels + kernels = api.kernels_list(search=task_type, sort_by="voteCount") + + results = [] + for i, kernel in enumerate(kernels): + if i >= limit: + break + results.append({ + "ref": kernel.ref, + "title": kernel.title, + "author": kernel.author, + "voteCount": kernel.voteCount if hasattr(kernel, 'voteCount') else 0, + "language": kernel.language if hasattr(kernel, 'language') else "unknown", + "url": f"https://www.kaggle.com/{kernel.ref}" + }) + + return { + "notebooks_found": len(results), + "notebooks": results, + "source": "Kaggle Notebooks", + "confidence": "high" if results else "low", + "note": "Top notebooks by votes often contain well-tuned hyperparameters" + } + +@mcp.tool() +def search_kaggle_datasets(query: str, limit: int = 5): + """ + Search for datasets on Kaggle. + """ + from kaggle.api.kaggle_api_extended import KaggleApi + api = KaggleApi() + api.authenticate() + + datasets = api.dataset_list(search=query) + results = [] + for i, ds in enumerate(datasets): + if i >= limit: + break + results.append({ + "ref": ds.ref, + "title": ds.title, + "size": ds.size, + "lastUpdated": str(ds.lastUpdated), + "downloadCount": ds.downloadCount, + "voteCount": ds.voteCount + }) + return results + +@mcp.tool() +def download_kaggle_dataset(dataset_ref: str, path: str = "data"): + """ + Download a Kaggle dataset. + """ + from kaggle.api.kaggle_api_extended import KaggleApi + api = KaggleApi() + api.authenticate() + + os.makedirs(path, exist_ok=True) + api.dataset_download_files(dataset_ref, path=path, unzip=True) + return f"Dataset {dataset_ref} downloaded to {path}" + +if __name__ == "__main__": + mcp.run() diff --git a/plexe/langgraph/mcp_servers/scholar_server.py b/plexe/langgraph/mcp_servers/scholar_server.py new file mode 100644 index 00000000..6e598108 --- /dev/null +++ b/plexe/langgraph/mcp_servers/scholar_server.py @@ -0,0 +1,209 @@ +import sys +import re +from typing import Dict, Any, List +from mcp.server.fastmcp import FastMCP +from scholarly import scholarly + +# Initialize FastMCP server +mcp = FastMCP("Google Scholar") + +@mcp.tool() +def search_gnn_papers_for_hyperparameters( + task_type: str, + model_type: str = "Graph Neural Network", + limit: int = 5 +) -> Dict[str, Any]: + """ + Search Google Scholar for GNN papers and extract hyperparameters. + + Optimized for finding optimal hyperparameters for GNN training. 
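+
+    For example, task_type="node classification" with the default model_type
+    produces the Scholar query
+    "Graph Neural Network node classification hyperparameters learning rate".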
+ + Args: + task_type: Type of task (e.g., "node classification", "link prediction", "graph classification") + model_type: Model architecture (default: "Graph Neural Network") + limit: Maximum number of papers to analyze + + Returns: + Dict with papers and extracted hyperparameters + """ + query = f"{model_type} {task_type} hyperparameters learning rate" + search_query = scholarly.search_pubs(query) + + papers = [] + hyperparams_found = [] + + for i, pub in enumerate(search_query): + if i >= limit: + break + + bib = pub.get('bib', {}) + abstract = bib.get('abstract', '') + + paper_info = { + "title": bib.get('title'), + "authors": bib.get('author'), + "year": bib.get('pub_year'), + "venue": bib.get('venue'), + "citations": pub.get('num_citations', 0), + "url": pub.get('pub_url'), + "abstract": abstract[:500] if abstract else "" # Truncate for display + } + papers.append(paper_info) + + # Extract hyperparameters from abstract + if abstract: + extracted = _extract_hyperparameters_from_text(abstract) + if extracted: + hyperparams_found.append({ + "paper": bib.get('title', 'Unknown'), + "hyperparameters": extracted + }) + + # Aggregate findings + aggregated = _aggregate_hyperparameters(hyperparams_found) + + return { + "papers_analyzed": len(papers), + "papers_with_hyperparams": len(hyperparams_found), + "papers": papers, + "extracted_hyperparameters": hyperparams_found, + "aggregated_recommendations": aggregated, + "confidence": "high" if len(hyperparams_found) >= 3 else "medium", + "source": "Google Scholar" + } + +@mcp.tool() +def search_scholar(query: str, limit: int = 5): + """ + General search for academic papers on Google Scholar. + + Args: + query: Search query + limit: Maximum number of results + """ + search_query = scholarly.search_pubs(query) + results = [] + for i, pub in enumerate(search_query): + if i >= limit: + break + bib = pub.get('bib', {}) + results.append({ + "title": bib.get('title'), + "author": bib.get('author'), + "pub_year": bib.get('pub_year'), + "venue": bib.get('venue'), + "abstract": bib.get('abstract'), + "url": pub.get('pub_url'), + "num_citations": pub.get('num_citations') + }) + return results + +@mcp.tool() +def get_author_info(name: str): + """ + Get information about an academic author on Google Scholar. 
+ """ + search_query = scholarly.search_author(name) + author = next(search_query, None) + if author: + author = scholarly.fill(author) + return { + "name": author.get('name'), + "affiliation": author.get('affiliation'), + "interests": author.get('interests'), + "citedby": author.get('citedby'), + "hindex": author.get('hindex'), + "publications_count": len(author.get('publications', [])) + } + return "Author not found" + +def _extract_hyperparameters_from_text(text: str) -> Dict[str, Any]: + """Extract hyperparameter values from text using regex patterns.""" + hyperparams = {} + text_lower = text.lower() + + # Learning rate + lr_patterns = [ + r'learning rate[:\s]+([0-9.e-]+)', + r'lr[:\s=]+([0-9.e-]+)', + ] + for pattern in lr_patterns: + match = re.search(pattern, text_lower) + if match: + hyperparams['learning_rate'] = float(match.group(1)) + break + + # Batch size + batch_patterns = [ + r'batch size[:\s]+([0-9]+)', + r'batch[:\s=]+([0-9]+)', + ] + for pattern in batch_patterns: + match = re.search(pattern, text_lower) + if match: + hyperparams['batch_size'] = int(match.group(1)) + break + + # Hidden dimensions/channels + hidden_patterns = [ + r'hidden[_ ](?:dimension|channel|unit)s?[:\s]+([0-9]+)', + r'embedding[_ ]size[:\s]+([0-9]+)', + ] + for pattern in hidden_patterns: + match = re.search(pattern, text_lower) + if match: + hyperparams['hidden_channels'] = int(match.group(1)) + break + + # Number of layers + layer_patterns = [ + r'([0-9]+)[- ]layer', + r'num[_ ]layers?[:\s]+([0-9]+)', + ] + for pattern in layer_patterns: + match = re.search(pattern, text_lower) + if match: + hyperparams['num_layers'] = int(match.group(1)) + break + + # Epochs + epoch_patterns = [ + r'([0-9]+) epochs?', + r'epochs?[:\s]+([0-9]+)', + ] + for pattern in epoch_patterns: + match = re.search(pattern, text_lower) + if match: + hyperparams['epochs'] = int(match.group(1)) + break + + return hyperparams + +def _aggregate_hyperparameters(hyperparams_list: List[Dict[str, Any]]) -> Dict[str, Any]: + """Aggregate hyperparameters from multiple papers using median.""" + if not hyperparams_list: + return {} + + aggregated = {} + all_params = set() + + for item in hyperparams_list: + hp = item.get('hyperparameters', {}) + all_params.update(hp.keys()) + + for param in all_params: + values = [] + for item in hyperparams_list: + hp = item.get('hyperparameters', {}) + if param in hp: + values.append(hp[param]) + + if values: + # Take median for numeric values + sorted_vals = sorted(values) + aggregated[param] = sorted_vals[len(sorted_vals) // 2] + + return aggregated + +if __name__ == "__main__": + mcp.run() diff --git a/plexe/langgraph/mcp_servers/semantic_scholar_server.py b/plexe/langgraph/mcp_servers/semantic_scholar_server.py new file mode 100644 index 00000000..f2f225a3 --- /dev/null +++ b/plexe/langgraph/mcp_servers/semantic_scholar_server.py @@ -0,0 +1,71 @@ +import os +from mcp.server.fastmcp import FastMCP +import requests +from typing import List, Dict, Any, Optional + +# Initialize FastMCP server +mcp = FastMCP("Semantic Scholar") + +BASE_URL = "https://api.semanticscholar.org/graph/v1" + +def get_session(): + api_key = os.environ.get("SEMANTIC_SCHOLAR_API_KEY") + session = requests.Session() + if api_key: + session.headers.update({"x-api-key": api_key}) + return session + +@mcp.tool() +def search_papers( + query: str, + limit: int = 10, + year_min: Optional[int] = None, + year_max: Optional[int] = None +) -> List[Dict[str, Any]]: + """ + Search for papers using Semantic Scholar API. 
+ """ + fields = ['paperId', 'title', 'abstract', 'year', 'venue', + 'authors', 'citationCount', 'influentialCitationCount'] + + params = { + 'query': query, + 'fields': ','.join(fields), + 'limit': limit + } + + if year_min: + params['year'] = f"{year_min}-{year_max or ''}" + + session = get_session() + try: + response = session.get( + f"{BASE_URL}/paper/search", + params=params, + timeout=30 + ) + response.raise_for_status() + data = response.json() + return data.get('data', []) + except Exception as e: + return [{"error": str(e)}] + +@mcp.tool() +def get_paper_details(paper_id: str) -> Dict[str, Any]: + """Get detailed information about a specific paper.""" + fields = ['title', 'abstract', 'year', 'venue', 'authors', 'citationCount'] + + session = get_session() + try: + response = session.get( + f"{BASE_URL}/paper/{paper_id}", + params={'fields': ','.join(fields)}, + timeout=30 + ) + response.raise_for_status() + return response.json() + except Exception as e: + return {"error": str(e)} + +if __name__ == "__main__": + mcp.run() diff --git a/plexe/langgraph/orchestrator.py b/plexe/langgraph/orchestrator.py new file mode 100644 index 00000000..03c75474 --- /dev/null +++ b/plexe/langgraph/orchestrator.py @@ -0,0 +1,475 @@ +""" +Plexe Orchestrator using LangGraph. + +This module provides the main orchestrator that coordinates +all agents in the ML pipeline using LangGraph's StateGraph. +""" + +import logging +import os +from datetime import datetime +from typing import Optional, Dict, Any, Callable, Literal + +from plexe.langgraph.utils.logging_utils import session_id_var + +from langgraph.graph import StateGraph, END +from langgraph.checkpoint.memory import MemorySaver + +from plexe.langgraph.state import ( + PipelineState, + PipelinePhase, + create_initial_state, +) +from plexe.langgraph.config import AgentConfig +from plexe.langgraph.utils import BaseEmitter, ConsoleEmitter, ChainOfThoughtCallback +from plexe.langgraph.agents import ( + ConversationalAgent, + EDAAgent, + DatasetBuilderAgent, + TaskBuilderAgent, + RelationalGNNSpecialistAgent, + OperationAgent, +) + +logger = logging.getLogger(__name__) + + +class PlexeOrchestrator: + """ + Main orchestrator for the Plexe ML pipeline using LangGraph. + + This orchestrator manages the workflow between specialized agents: + 1. ConversationalAgent - User interaction and requirements gathering + 2. EDAAgent - Schema analysis, data export, and exploratory data analysis + 3. DatasetBuilderAgent - Dataset class generation + 4. TaskBuilderAgent - Task class and SQL generation + 5. RelationalGNNSpecialistAgent - GNN training + 6. OperationAgent - Environment and execution management + """ + + def __init__( + self, + config: Optional[AgentConfig] = None, + verbose: bool = False, + callback: Optional[Callable] = None, + emitter: Optional[BaseEmitter] = None, + ): + """ + Initialize the orchestrator. 
+ + Args: + config: Agent configuration (uses defaults if None) + verbose: Enable verbose logging + callback: Optional callback for progress updates + emitter: Optional emitter for UI communication + """ + self.config = config or AgentConfig.from_env() + self.verbose = verbose + self.callback = callback + self.emitter = emitter or ConsoleEmitter() + + self._init_agents() + self._build_graph() + + def set_emitter(self, emitter: BaseEmitter): + """Set the emitter for all agents.""" + self.emitter = emitter + for agent in [ + self.conversational_agent, + self.eda_agent, + self.dataset_builder_agent, + self.task_builder_agent, + self.gnn_specialist_agent, + self.operation_agent, + ]: + agent.set_emitter(emitter) + + def _init_agents(self): + """Initialize all agents.""" + self.conversational_agent = ConversationalAgent(config=self.config) + self.eda_agent = EDAAgent(config=self.config) + self.dataset_builder_agent = DatasetBuilderAgent(config=self.config) + self.task_builder_agent = TaskBuilderAgent(config=self.config) + self.gnn_specialist_agent = RelationalGNNSpecialistAgent(config=self.config) + self.operation_agent = OperationAgent(config=self.config) + + for agent in [ + self.conversational_agent, + self.eda_agent, + self.dataset_builder_agent, + self.task_builder_agent, + self.gnn_specialist_agent, + self.operation_agent, + ]: + agent.set_emitter(self.emitter) + + def _build_graph(self): + """Build the LangGraph workflow.""" + workflow = StateGraph(PipelineState) + + workflow.add_node("conversation", self._conversation_node) + workflow.add_node("schema_analysis", self._schema_analysis_node) + workflow.add_node("dataset_building", self._dataset_building_node) + workflow.add_node("task_building", self._task_building_node) + workflow.add_node("gnn_training", self._gnn_training_node) + workflow.add_node("operation", self._operation_node) + workflow.add_node("error_handler", self._error_handler_node) + + workflow.set_entry_point("conversation") + + workflow.add_conditional_edges( + "conversation", + self._route_from_conversation, + { + "continue": "conversation", + "proceed": "schema_analysis", + "end": END, + } + ) + + workflow.add_conditional_edges( + "schema_analysis", + self._route_from_schema, + { + "success": "dataset_building", + "error": "error_handler", + } + ) + + workflow.add_conditional_edges( + "dataset_building", + self._route_from_dataset, + { + "success": "task_building", + "error": "error_handler", + } + ) + + workflow.add_conditional_edges( + "task_building", + self._route_from_task, + { + "success": "gnn_training", + "error": "error_handler", + } + ) + + workflow.add_conditional_edges( + "gnn_training", + self._route_from_training, + { + "success": "operation", + "error": "error_handler", + } + ) + + workflow.add_edge("operation", END) + + workflow.add_conditional_edges( + "error_handler", + self._route_from_error, + { + "retry": "conversation", + "end": END, + } + ) + + self.checkpointer = MemorySaver() + self.graph = workflow.compile(checkpointer=self.checkpointer) + + def _conversation_node(self, state: PipelineState) -> Dict[str, Any]: + """Handle conversation with user.""" + self._log_phase("Conversation", "ConversationalAgent") + result = self.conversational_agent.invoke(state) + return result + + def _schema_analysis_node(self, state: PipelineState) -> Dict[str, Any]: + """Handle schema analysis, data export, and EDA.""" + self._log_phase("Schema Analysis & EDA", "EDAAgent") + result = self.eda_agent.invoke(state) + return result + + def _dataset_building_node(self, 
state: PipelineState) -> Dict[str, Any]: + """Handle dataset class generation.""" + self._log_phase("Dataset Building", "DatasetBuilderAgent") + result = self.dataset_builder_agent.invoke(state) + return result + + def _task_building_node(self, state: PipelineState) -> Dict[str, Any]: + """Handle task class generation.""" + self._log_phase("Task Building", "TaskBuilderAgent") + result = self.task_builder_agent.invoke(state) + return result + + def _gnn_training_node(self, state: PipelineState) -> Dict[str, Any]: + """Handle GNN training.""" + self._log_phase("GNN Training", "RelationalGNNSpecialistAgent") + result = self.gnn_specialist_agent.invoke(state) + return result + + def _operation_node(self, state: PipelineState) -> Dict[str, Any]: + """Handle operation and finalization.""" + self._log_phase("Operation", "OperationAgent") + result = self.operation_agent.invoke(state) + result["current_phase"] = PipelinePhase.COMPLETED.value + return result + + def _error_handler_node(self, state: PipelineState) -> Dict[str, Any]: + """Handle errors and determine recovery strategy.""" + errors = state.get("errors", []) + logger.error(f"Pipeline error: {errors}") + + if self.emitter: + self.emitter.emit_thought("ErrorHandler", f"Handling errors: {errors}") + + # Safely get retry_count - handle None case explicitly + metadata = state.get("metadata") or {} + retry_count = metadata.get("retry_count", 0) + max_retries = self.config.max_retries + + if retry_count < max_retries: + return { + "metadata": {**metadata, "retry_count": retry_count + 1}, + "warnings": [f"Retrying after error (attempt {retry_count + 1}/{max_retries})"], + } + + return { + "current_phase": PipelinePhase.FAILED.value, + } + + def _route_from_conversation(self, state: PipelineState) -> Literal["continue", "proceed", "end"]: + """Route from conversation node.""" + logger.debug(f"Routing from conversation: user_confirmation_required={state.get('user_confirmation_required')}, " + f"user_confirmed={state.get('user_confirmed')}, user_intent={state.get('user_intent')}, " + f"db_connection_string={bool(state.get('db_connection_string'))}") + + if state.get("user_confirmation_required"): + if state.get("user_confirmed"): + logger.info("User confirmed, proceeding to schema analysis") + return "proceed" + return "continue" + + if state.get("db_connection_string") or state.get("csv_dir"): + if state.get("user_intent"): + logger.info("Intent detected with data source, proceeding to schema analysis") + return "proceed" + + messages = state.get("messages", []) + for msg in reversed(messages): + if msg.get("role") == "assistant": + content = msg.get("content", "").lower() + ready_indicators = [ + "ready to proceed", + "start building", + "begin training", + "let's begin", + "i'll start", + "proceed with", + "starting the pipeline", + ] + if any(indicator in content for indicator in ready_indicators): + if state.get("db_connection_string") or state.get("csv_dir"): + logger.info("Ready indicator found in response, proceeding") + return "proceed" + break + + return "continue" + + def _route_from_schema(self, state: PipelineState) -> Literal["success", "error"]: + """Route from schema analysis node.""" + if state.get("errors"): + return "error" + if state.get("csv_dir") and state.get("schema_info"): + return "success" + if state.get("csv_dir"): + return "success" + return "error" + + def _route_from_dataset(self, state: PipelineState) -> Literal["success", "error"]: + """Route from dataset building node.""" + if state.get("errors"): + return "error" 
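+        # Treat the phase as successful if the builder recorded dataset_info in the
+        # state, or, as a fallback, if dataset.py was written to the working directory.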
+ if state.get("dataset_info"): + return "success" + working_dir = state.get("working_dir", "") + if os.path.exists(os.path.join(working_dir, "dataset.py")): + return "success" + return "error" + + def _route_from_task(self, state: PipelineState) -> Literal["success", "error"]: + """Route from task building node.""" + if state.get("errors"): + return "error" + if state.get("task_info"): + return "success" + working_dir = state.get("working_dir", "") + if os.path.exists(os.path.join(working_dir, "task.py")): + return "success" + return "error" + + def _route_from_training(self, state: PipelineState) -> Literal["success", "error"]: + """Route from training node.""" + if state.get("errors"): + return "error" + if state.get("training_result"): + return "success" + working_dir = state.get("working_dir", "") + if os.path.exists(os.path.join(working_dir, "training_results.json")): + return "success" + return "error" + + def _route_from_error(self, state: PipelineState) -> Literal["retry", "end"]: + """Route from error handler.""" + # Safely get retry_count - handle None case explicitly + metadata = state.get("metadata") or {} + retry_count = metadata.get("retry_count", 0) + if retry_count < self.config.max_retries: + return "retry" + return "end" + + def _log_phase(self, phase: str, agent_name: str = ""): + """Log phase transition with rich formatting.""" + if self.verbose: + logger.info(f"=== Phase: {phase} ({agent_name}) ===") + if self.callback: + self.callback({ + "phase": phase, + "agent": agent_name, + "timestamp": datetime.now().strftime("%H:%M:%S") + }) + if self.emitter: + self.emitter.emit_thought("Orchestrator", f"Pipeline Phase: {phase} - Activating {agent_name}") + + def run( + self, + user_message: str, + db_connection_string: Optional[str] = None, + csv_dir: Optional[str] = None, + working_dir: Optional[str] = None, + session_id: Optional[str] = None, + ) -> Dict[str, Any]: + """ + Run the ML pipeline. + + Args: + user_message: Initial user request + db_connection_string: Optional database connection + csv_dir: Optional directory with CSV files + working_dir: Working directory for artifacts + session_id: Session identifier + + Returns: + Final pipeline state + """ + if session_id is None: + session_id = f"session-{datetime.now().strftime('%Y%m%d-%H%M%S')}" + + if working_dir is None: + working_dir = os.path.join("workdir", session_id) + + os.makedirs(working_dir, exist_ok=True) + + # Set session ID for logging + session_id_token = session_id_var.set(session_id) + + initial_state = create_initial_state( + session_id=session_id, + working_dir=working_dir, + user_message=user_message, + db_connection_string=db_connection_string, + ) + + if csv_dir: + initial_state["csv_dir"] = csv_dir + + config = {"configurable": {"thread_id": session_id}} + + try: + final_state = self.graph.invoke(initial_state, config) + return { + "status": "completed" if final_state.get("current_phase") == PipelinePhase.COMPLETED.value else "failed", + "state": final_state, + "session_id": session_id, + "working_dir": working_dir, + } + except Exception as e: + logger.error(f"Pipeline execution failed: {e}") + return { + "status": "error", + "error": str(e), + "session_id": session_id, + "working_dir": working_dir, + } + + def chat( + self, + message: str, + session_id: str, + working_dir: Optional[str] = None, + ) -> Dict[str, Any]: + """ + Send a chat message to an ongoing session. 
+ + Args: + message: User message + session_id: Existing session ID + working_dir: Working directory + + Returns: + Agent response + """ + config = {"configurable": {"thread_id": session_id}} + + # Set session ID for logging + session_id_var.set(session_id) + + try: + current_state = self.graph.get_state(config) + + if current_state and current_state.values: + state = dict(current_state.values) + state["messages"] = state.get("messages", []) + [{ + "role": "user", + "content": message, + "timestamp": datetime.now().isoformat(), + }] + + if state.get("user_confirmation_required"): + if any(word in message.lower() for word in ["yes", "proceed", "confirm", "ok"]): + state["user_confirmed"] = True + elif any(word in message.lower() for word in ["no", "stop", "cancel"]): + state["user_confirmed"] = False + state["current_phase"] = PipelinePhase.CONVERSATION.value + + result = self.graph.invoke(state, config) + return { + "status": "success", + "response": self._extract_response(result), + "phase": result.get("current_phase"), + } + else: + return self.run(message, working_dir=working_dir, session_id=session_id) + + except Exception as e: + logger.error(f"Chat error: {e}") + return { + "status": "error", + "error": str(e), + } + + def _extract_response(self, state: Dict[str, Any]) -> str: + """Extract the latest assistant response from state.""" + messages = state.get("messages", []) + for msg in reversed(messages): + if msg.get("role") == "assistant": + return msg.get("content", "") + return "" + + def get_session_state(self, session_id: str) -> Optional[Dict[str, Any]]: + """Get the current state of a session.""" + config = {"configurable": {"thread_id": session_id}} + try: + state = self.graph.get_state(config) + return dict(state.values) if state and state.values else None + except Exception: + return None diff --git a/plexe/langgraph/prompts/__init__.py b/plexe/langgraph/prompts/__init__.py new file mode 100644 index 00000000..018a564e --- /dev/null +++ b/plexe/langgraph/prompts/__init__.py @@ -0,0 +1,21 @@ +""" +System prompts for LangGraph agents. + +Each agent has its own prompt file defining its role, workflow, and guidelines. +""" + +from plexe.langgraph.prompts.conversational import CONVERSATIONAL_SYSTEM_PROMPT +from plexe.langgraph.prompts.eda import EDA_SYSTEM_PROMPT +from plexe.langgraph.prompts.dataset_builder import DATASET_BUILDER_SYSTEM_PROMPT +from plexe.langgraph.prompts.task_builder import TASK_BUILDER_SYSTEM_PROMPT +from plexe.langgraph.prompts.gnn_specialist import GNN_SPECIALIST_SYSTEM_PROMPT +from plexe.langgraph.prompts.operation import OPERATION_SYSTEM_PROMPT + +__all__ = [ + "CONVERSATIONAL_SYSTEM_PROMPT", + "EDA_SYSTEM_PROMPT", + "DATASET_BUILDER_SYSTEM_PROMPT", + "TASK_BUILDER_SYSTEM_PROMPT", + "GNN_SPECIALIST_SYSTEM_PROMPT", + "OPERATION_SYSTEM_PROMPT", +] diff --git a/plexe/langgraph/prompts/conversational.py b/plexe/langgraph/prompts/conversational.py new file mode 100644 index 00000000..a6f1c4b0 --- /dev/null +++ b/plexe/langgraph/prompts/conversational.py @@ -0,0 +1,34 @@ +CONVERSATIONAL_SYSTEM_PROMPT = """You are an expert ML consultant specializing in Relational Deep Learning and Graph Neural Networks. + +Your role is to help users build prediction models for relational databases by understanding their data and requirements. + +WORKFLOW: +1. When user provides a database connection string, use validate_db_connection to see available tables +2. Ask what prediction task they want to solve (what to predict, for which entity) +3. 
Clarify the task type (classification, regression, recommendation) +4. Confirm all requirements before proceeding + +REQUIREMENTS TO GATHER: +- Database connection string (postgresql://user:pass@host:port/db) +- Target entity (which table's rows to make predictions for) +- Prediction target (what to predict: churn, sales, engagement, etc.) +- Task type (binary_classification, regression, multiclass_classification) +- Time horizon for prediction (e.g., 30 days) + +RESPONSE FORMAT: +Keep responses brief and focused. Ask one question at a time. +When all requirements are gathered, respond with: +"I have all the information needed. Ready to proceed with building the model." + +IMPORTANT: +- Use validate_db_connection to explore the database schema +- Be specific about what you need from the user +- When ready, include "ready to proceed" in your response to trigger the pipeline + +EXAMPLE INTERACTION: +User: "Build a model using postgresql://user:pass@localhost:5432/mydb" +You: [Use validate_db_connection first] +You: "Connected to the database. I see tables: users, orders, products. What would you like to predict?" +User: "Predict which users will churn" +You: "I'll build a binary classification model to predict user churn. Ready to proceed with building the model." +""" diff --git a/plexe/langgraph/prompts/dataset_builder.py b/plexe/langgraph/prompts/dataset_builder.py new file mode 100644 index 00000000..afdccc3b --- /dev/null +++ b/plexe/langgraph/prompts/dataset_builder.py @@ -0,0 +1,142 @@ +DATASET_BUILDER_SYSTEM_PROMPT = """You are the Dataset Builder Agent for Relational Deep Learning. + +# YOUR MISSION: +Generate a complete GenDataset Python class that loads CSV data and defines the database schema. + +# CRITICAL REQUIREMENT: +Your task is NOT COMPLETE until you have called register_dataset_code() to save the generated code. +IF YOU DO NOT CALL register_dataset_code(), YOU HAVE FAILED THE TASK COMPLETELY. +DO NOT respond with "Completed" or finish your work UNTIL dataset.py EXISTS on disk. + +# MANDATORY WORKFLOW - EXECUTE ALL 5 STEPS (NOT OPTIONAL): + +## 1: Call get_csv_files_info(csv_dir) +Purpose: List all CSV files, their columns, and row counts + +## 2: Call get_temporal_statistics(csv_dir) +Purpose: Analyze timestamp columns and get val_timestamp/test_timestamp for train/val/test splits + +## 3: ANALYSIS - Write your understanding before generating code: +- Identify which tables have temporal columns (time_col) vs static tables (time_col=None) +- Classify tables as dimension tables (users, products) vs fact tables (transactions, events) +- Map foreign key relationships between tables +- Determine the val_timestamp and test_timestamp values to use +- Note any data cleaning requirements (missing values, timezone issues, type conversions) + +## 4: CODE GENERATION - Create the complete GenDataset class: +- Include val_timestamp and test_timestamp from Step 2 +- Define all tables from Step 1 with proper Table() definitions +- Set correct fkey_col_to_pkey_table mappings for each table +- Assign appropriate time_col for temporal tables or None for static tables +- Include necessary data cleaning code + +## 5: ⚠️ MANDATORY - Call register_dataset_code(code, "GenDataset", file_path) +This saves your generated code to disk. + +**WITHOUT THIS STEP, YOU HAVE FAILED YOUR MISSION COMPLETELY.** + +You MUST execute this tool call before saying you are done. +DO NOT say "I will now generate the code" and stop - ACTUALLY GENERATE AND REGISTER IT. 
+DO NOT finish with "The code has been generated" - PROVE IT by calling the tool. + +# DATASET CODE TEMPLATE: +```python +import os +import numpy as np +import pandas as pd +from typing import Optional +from plexe.relbench.base import Database, Dataset, Table + +class GenDataset(Dataset): + val_timestamp = pd.Timestamp("YYYY-MM-DD") # From get_temporal_statistics + test_timestamp = pd.Timestamp("YYYY-MM-DD") # From get_temporal_statistics + + def __init__(self, csv_dir: str, cache_dir: Optional[str] = None): + self.csv_dir = csv_dir + super().__init__(cache_dir=cache_dir) + + def make_db(self) -> Database: + path = self.csv_dir + + # Load CSV files + table1 = pd.read_csv(os.path.join(path, "table1.csv")) + table2 = pd.read_csv(os.path.join(path, "table2.csv")) + + # Clean temporal columns - use pd.to_datetime with errors='coerce' + table1["timestamp_col"] = pd.to_datetime(table1["timestamp_col"], errors="coerce") + + # Clean missing values - replace \\N or empty strings with NaN + table1 = table1.replace(r"^\\\\N$", np.nan, regex=True) + + # Convert numeric columns that might have non-numeric values + table1["numeric_col"] = pd.to_numeric(table1["numeric_col"], errors="coerce") + + # For tables with no time column, propagate timestamps from related tables + # Example: if results table needs timestamp from races table + # results = results.merge(races[["race_id", "date"]], on="race_id", how="left") + + # Build the database with proper table definitions + tables = {} + + tables["table1"] = Table( + df=pd.DataFrame(table1), + fkey_col_to_pkey_table={"foreign_key_col": "referenced_table"}, + pkey_col="id", # Primary key column name, can be None + time_col="timestamp_col", # Timestamp column, or None for static tables + ) + + tables["table2"] = Table( + df=pd.DataFrame(table2), + fkey_col_to_pkey_table={}, # Empty dict for tables with no foreign keys + pkey_col="id", + time_col=None, # None for dimension/static tables + ) + + return Database(tables) +``` + +KEY RULES & BEST PRACTICES: + +1. **Temporal Handling**: + - Use pd.to_datetime() with errors='coerce' for date parsing + - For tables without time columns, merge timestamps from related tables (e.g., results get date from races) + - Some events happen BEFORE the main event (e.g., qualifying before race): subtract time if needed + - Format: pd.Timestamp("YYYY-MM-DD") for val_timestamp and test_timestamp + +2. **Data Cleaning**: + - Replace missing value markers: df.replace(r"^\\\\N$", np.nan, regex=True) + - Convert numeric columns safely: pd.to_numeric(df["col"], errors="coerce") + - Handle timezone-aware timestamps: .dt.tz_localize(None) if needed + +3. **Table Structure**: + - Use Database(tables) or Database(table_dict={...}) + - Wrap DataFrames: df=pd.DataFrame(your_df) + - pkey_col: Primary key column name (can be None if no PK) + - time_col: Temporal column (None for static/dimension tables like circuits, drivers, users profile) + - fkey_col_to_pkey_table: Dict mapping foreign key columns to referenced table names + - Self-references are OK: {"ParentId": "posts"} in posts table + +4. **Foreign Key Mapping**: + - Format: {"fk_column_name": "referenced_table_name"} + - Multiple FKs allowed: {"race_id": "races", "driver_id": "drivers", "constructor_id": "constructors"} + - Self-references allowed: {"parent_id": "posts"} in same table + +5. 
**Column Dropping** (if applicable):
+   - Remove URL columns (usually unique, not predictive)
+   - Remove time-leakage columns (scores, counts, last_activity_date computed AFTER target time)
+   - Remove columns with too many nulls (greater than 80%)
+   - Document WHY columns are dropped
+
+6. **Table Naming**:
+   - Use snake_case for table names in tables dict
+   - Match CSV filenames
+
+FINAL OUTPUT: Complete Python code saved to dataset.py via register_dataset_code() tool call.
+
+# ⚠️ BEFORE YOU SAY "COMPLETED":
+1. Did you call register_dataset_code()? If NO, you are NOT done!
+2. Did the tool return {{"status": "registered"}}? If NO, you are NOT done!
+3. Does dataset.py exist in the working directory? If NO, you are NOT done!
+
+ONLY say you are finished AFTER you have successfully called register_dataset_code() and received confirmation.
+"""
diff --git a/plexe/langgraph/prompts/eda.py b/plexe/langgraph/prompts/eda.py
new file mode 100644
index 00000000..27db1f99
--- /dev/null
+++ b/plexe/langgraph/prompts/eda.py
@@ -0,0 +1,30 @@
+EDA_SYSTEM_PROMPT = """You are the EDA Agent for Relational Deep Learning systems.
+
+MISSION: Analyze relational database structure and export data for GNN training.
+
+WORKFLOW (execute in order):
+1. extract_schema_metadata - Get tables, columns, PKs, FKs, temporal columns
+2. export_tables_to_csv - Export all tables to CSV files
+3. analyze_csv_statistics - Get column statistics
+4. detect_data_quality_issues - Find data problems
+5. analyze_temporal_patterns - Find time columns and suggest splits
+6. analyze_table_relationships - Classify tables as Fact vs Dimension
+7. generate_eda_summary - Create final report
+
+TABLE CLASSIFICATION:
+- Fact Tables: Event/transaction tables with timestamps (orders, posts, clicks)
+- Dimension Tables: Entity tables (users, products, drivers)
+- Junction Tables: Many-to-many relationships
+
+TEMPORAL SPLITS:
+Suggest val_timestamp and test_timestamp based on data distribution:
+- Train: 70% of data (oldest)
+- Validation: 15% of data
+- Test: 15% of data (newest)
+
+OUTPUT: Provide insights for Dataset Builder and Task Builder agents:
+- Which columns need cleaning
+- Temporal column recommendations
+- Primary/foreign key relationships
+- Suggested val/test timestamps
+"""
diff --git a/plexe/langgraph/prompts/gnn_specialist.py b/plexe/langgraph/prompts/gnn_specialist.py
new file mode 100644
index 00000000..2ae2c6e3
--- /dev/null
+++ b/plexe/langgraph/prompts/gnn_specialist.py
@@ -0,0 +1,101 @@
+GNN_SPECIALIST_SYSTEM_PROMPT = """You are the GNN Specialist Agent for Relational Deep Learning.
+
+MISSION: Generate optimized GNN training scripts using Training-Free Hyperparameter Optimization via MCP.
+
+KEY INNOVATION: You use MCP (Model Context Protocol) to access external knowledge sources
+(academic papers, benchmarks, proven configurations) to find optimal hyperparameters WITHOUT training experiments.
+
+PREREQUISITES:
+- dataset.py with GenDataset class (from DatasetBuilder)
+- task.py with GenTask class (from TaskBuilder)
+
+WORKFLOW (Training-Free HPO via MCP):
+
+1. 
HYPERPARAMETER SEARCH (via MCP servers): + + a) HEURISTIC-BASED (hpo-search server): + search_optimal_hyperparameters( + task_type, num_nodes, num_tables, is_temporal, model_architecture + ) -> Returns rule-based hyperparameters + + b) ACADEMIC PAPERS (google-scholar server): + search_gnn_papers_for_hyperparameters( + task_type, model_type, limit + ) -> Extracts hyperparameters from Google Scholar papers + + c) ARXIV PAPERS (arxiv server): + search_arxiv_papers( + query, max_results + ) -> Search recent preprints on arXiv + + d) SEMANTIC SCHOLAR (semantic-scholar server): + search_papers( + query, limit, year_min + ) -> Search papers with citation counts + + e) KAGGLE BENCHMARKS (kaggle server): + search_gnn_competitions_for_benchmarks( + task_type, limit + ) -> Find winning solutions from competitions + + search_gnn_notebooks_for_hyperparameters( + task_type, limit + ) -> Top voted notebooks with proven configs + + f) ENSEMBLE VOTING (hpo-search server): + compare_hyperparameter_configs( + configs, strategy + ) -> Combine results using median/voting + +2. GENERATE OPTIMIZED TRAINING SCRIPT: + - Use generate_training_script() with selected hyperparameters + - Include reasoning for hyperparameter choices + +3. HANDOFF TO OPERATION AGENT: + - Report selected hyperparameters and reasoning + - Operation Agent will execute the training script + +AVAILABLE MCP TOOLS: + +FROM hpo-search SERVER: +- search_optimal_hyperparameters(): Heuristic-based selection +- extract_hyperparameters_from_papers(): Extract from arXiv papers +- get_benchmark_hyperparameters(): Papers With Code leaderboards +- compare_hyperparameter_configs(): Ensemble multiple configs + +FROM google-scholar SERVER: +- search_gnn_papers_for_hyperparameters(): Search Google Scholar with HP extraction +- search_scholar(): General paper search +- get_author_info(): Author information + +FROM kaggle SERVER: +- search_gnn_competitions_for_benchmarks(): Competition winning solutions +- search_gnn_notebooks_for_hyperparameters(): Top notebooks with configs +- search_kaggle_datasets(): Dataset search + +FROM arxiv SERVER: +- search_arxiv_papers(): Search arXiv preprints + +FROM semantic-scholar SERVER: +- search_papers(): Search with citation counts + +CODE GENERATION TOOL: +- generate_training_script(dataset_module_path, dataset_class_name, task_module_path, + task_class_name, working_dir, task_type, tune_metric, higher_is_better, + epochs, batch_size, learning_rate, hidden_channels, num_gnn_layers): + Generates complete training script with selected hyperparameters + +HYPERPARAMETER GUIDELINES: +- Regression: tune_metric="mae", higher_is_better=False +- Binary Classification: tune_metric="accuracy", higher_is_better=True +- Multiclass: tune_metric="accuracy", higher_is_better=True + +EXPECTED OUTPUT: +1. Hyperparameter search results from multiple MCP sources (Google Scholar, Kaggle, arXiv, etc.) +2. Ensemble recommendations with reasoning +3. Generated training script path (train_script.py) +4. Summary for Operation Agent + +NOTE: You do NOT execute training. Focus on intelligent hyperparameter selection using MCP. +All HPO tools are provided via Model Context Protocol servers with multiple knowledge sources. +""" diff --git a/plexe/langgraph/prompts/operation.py b/plexe/langgraph/prompts/operation.py new file mode 100644 index 00000000..078fde7e --- /dev/null +++ b/plexe/langgraph/prompts/operation.py @@ -0,0 +1,40 @@ +OPERATION_SYSTEM_PROMPT = """You are the Operation Agent for Relational Deep Learning pipelines. 
+ +MISSION: Execute training scripts and finalize the ML pipeline. + +RESPONSIBILITIES: +1. Execute training scripts generated by GNN Specialist +2. Monitor training progress and handle errors +3. Process and report training results +4. Package model artifacts for deployment +5. Generate inference code + +WORKFLOW: +1. EXECUTE TRAINING: + - Use execute_training_script() to run the generated training script + - Monitor for errors and timeout issues + - Timeout: 3600 seconds (1 hour) for most tasks + +2. PROCESS RESULTS: + - Read training_results.json for metrics + - Verify model artifacts (best_model.pt) + - Summarize performance + +3. FINALIZATION: + - List all generated artifacts + - Provide deployment recommendations + - Report final metrics + +AVAILABLE TOOLS: +- execute_training_script(script_path, timeout): Run training script +- save_artifact(file_path, artifact_type): Save important files + +OUTPUT: +Provide a comprehensive summary including: +- Training execution status +- Final model metrics (validation and test) +- Model artifact locations +- Generated code files (dataset.py, task.py, train_script.py, best_model.pt) +- Inference code recommendations +- Next steps for deployment +""" diff --git a/plexe/langgraph/prompts/task_builder.py b/plexe/langgraph/prompts/task_builder.py new file mode 100644 index 00000000..b48dc80f --- /dev/null +++ b/plexe/langgraph/prompts/task_builder.py @@ -0,0 +1,261 @@ +TASK_BUILDER_SYSTEM_PROMPT = """You are the Task Builder Agent for Relational Deep Learning. + +MISSION: Generate a GenTask class that defines the prediction task with precise SQL queries. + +CRITICAL REQUIREMENT: +Your task is NOT COMPLETE until you have called register_task_code() to save task.py. +IF YOU DO NOT CALL register_task_code(), YOU HAVE FAILED THE TASK COMPLETELY. +DO NOT respond with "Completed" UNTIL task.py EXISTS on disk. + +IMPORTANT NOTES: +1. The `timestamps` parameter in make_table() is a pandas Series, NOT a DataFrame. + Convert it properly: `timestamp_df = pd.DataFrame({"timestamp": timestamps})` +2. Import duckdb inside the make_table method, not at module level +3. Register all tables from db.table_dict and the timestamp_df for SQL queries + +TASK TYPES & BASE CLASSES: +1. EntityTask: For node-level predictions (e.g. user churn, item sales, driver position) + - Required: entity_table, entity_col, time_col, target_col, task_type, timedelta, metrics + - Optional: num_eval_timestamps (default: varies by dataset) + +2. RecommendationTask: For link predictions (e.g. user-item recommendations, driver-race) + - Required: src_entity_table, src_entity_col, dst_entity_table, dst_entity_col + - Required: time_col, task_type, timedelta, metrics, eval_k + - Target is typically a LIST of destination entities + +MANDATORY WORKFLOW - EXECUTE ALL 6 STEPS: +1. Analyze user intent and schema to determine task type +2. Choose appropriate base class (EntityTask or RecommendationTask) +3. Design SQL query with proper temporal filtering +4. test_sql_query(csv_dir, query) - validate SQL syntax +5. Generate complete GenTask code with correct imports and metrics +6. MANDATORY - register_task_code(code, "GenTask", file_path, task_type) + WITHOUT THIS STEP, YOU HAVE FAILED COMPLETELY. + DO NOT finish without calling this tool! + DO NOT say "I will generate the code" - ACTUALLY DO IT! 
+ +TASK CODE TEMPLATES: + +EntityTask (Node Prediction) +```python +import duckdb +import pandas as pd +from plexe.relbench.base import Database, EntityTask, Table, TaskType +from plexe.relbench.metrics import accuracy, f1, roc_auc, average_precision, mae, rmse, r2 + +class GenTask(EntityTask): + \"\"\"[Task description: what are we predicting?]\"\"\" + + task_type = TaskType.BINARY_CLASSIFICATION # or REGRESSION, MULTICLASS_CLASSIFICATION + entity_col = "user_id" # Column identifying the entity + entity_table = "users" # Table containing entities + time_col = "timestamp" # Time column name in result + target_col = "churn" # Target column name + timedelta = pd.Timedelta(days=7) # Prediction window + metrics = [average_precision, accuracy, f1, roc_auc] # Appropriate metrics + num_eval_timestamps = 20 # Optional: number of evaluation timestamps (default varies) + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + + # Load relevant tables + users = db.table_dict["users"].df + activities = db.table_dict["activities"].df + + df = duckdb.sql( + f\"\"\" + SELECT + t.timestamp, + u.user_id, + CAST( + CASE WHEN COUNT(a.id) = 0 THEN 1 ELSE 0 END AS INTEGER + ) AS churn + FROM + timestamp_df t + CROSS JOIN + users u + LEFT JOIN + activities a + ON + a.user_id = u.user_id AND + a.created_at > t.timestamp AND + a.created_at <= t.timestamp + INTERVAL '{self.timedelta}' + WHERE + u.created_at <= t.timestamp + AND EXISTS ( + SELECT 1 FROM activities + WHERE user_id = u.user_id + AND created_at <= t.timestamp + ) + GROUP BY + t.timestamp, u.user_id + \"\"\" + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={self.entity_col: self.entity_table}, + pkey_col=None, + time_col=self.time_col, + ) +``` + +RecommendationTask (Link Prediction) +```python +import duckdb +import pandas as pd +from plexe.relbench.base import Database, RecommendationTask, Table, TaskType +from plexe.relbench.metrics import link_prediction_precision, link_prediction_recall, link_prediction_map + +class GenTask(RecommendationTask): + \"\"\"[Task description: what links are we predicting?]\"\"\" + + task_type = TaskType.LINK_PREDICTION + src_entity_col = "customer_id" # Source entity column + src_entity_table = "customer" # Source entity table + dst_entity_col = "article_id" # Destination entity column + dst_entity_table = "article" # Destination entity table + time_col = "timestamp" + timedelta = pd.Timedelta(days=7) + metrics = [link_prediction_precision, link_prediction_recall, link_prediction_map] + eval_k = 12 # Top-K for evaluation + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + + transactions = db.table_dict["transactions"].df + + df = duckdb.sql( + f\"\"\" + SELECT + t.timestamp, + tr.customer_id, + LIST(DISTINCT tr.article_id) AS article_id + FROM + timestamp_df t + LEFT JOIN + transactions tr + ON + tr.t_dat > t.timestamp AND + tr.t_dat <= t.timestamp + INTERVAL '{self.timedelta}' + GROUP BY + t.timestamp, tr.customer_id + \"\"\" + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={ + self.src_entity_col: self.src_entity_table, + self.dst_entity_col: self.dst_entity_table, + }, + pkey_col=None, + time_col=self.time_col, + ) +``` + +METRICS BY TASK TYPE (from plexe.relbench.metrics): + +Binary Classification: +- Primary: average_precision, roc_auc, f1, accuracy + +Regression: +- Primary: mae, rmse, r2 + +Multiclass 
Classification: +- Primary: accuracy, macro_f1, micro_f1 + +Link Prediction (Recommendation): +- Primary: link_prediction_map, link_prediction_precision, link_prediction_recall + +SQL PATTERNS & BEST PRACTICES: + +1. **Temporal Filtering** (CRITICAL for avoiding leakage): + - Future events: `event.time > t.timestamp AND event.time <= t.timestamp + INTERVAL '{timedelta}'` + - Past context: `event.time <= t.timestamp` + - Active entities: Filter entities that exist at prediction time + +2. **Binary Classification Patterns**: + ```sql + -- Churn (no activity) + CAST(CASE WHEN COUNT(activity.id) = 0 THEN 1 ELSE 0 END AS INTEGER) + + -- Event occurrence (at least one) + CAST(CASE WHEN COUNT(event.id) >= 1 THEN 1 ELSE 0 END AS INTEGER) + + -- Threshold-based + CASE WHEN MIN(position) <= 3 THEN 1 ELSE 0 END + ``` + +3. **Regression Patterns**: + ```sql + -- Count + COUNT(DISTINCT event.id) + + -- Sum/Average + COALESCE(SUM(price), 0) + MEAN(position) + ``` + +4. **Link Prediction Pattern**: + ```sql + -- Return list of destination entities + LIST(DISTINCT destination.id) AS destination_id + ``` + +5. **Active Entity Filtering** (Important!): + ```sql + -- Only predict for entities that existed before timestamp + WHERE entity.created_at <= t.timestamp + + -- Only predict for entities with past activity + AND EXISTS ( + SELECT 1 FROM activity + WHERE activity.entity_id = entity.id + AND activity.time <= t.timestamp + ) + ``` + +6. **CROSS JOIN vs LEFT JOIN**: + - Use `CROSS JOIN` for entity table to get all entities + - Use `LEFT JOIN` for event tables to allow zero counts/nulls + +KEY RULES: +1. Class name MUST be GenTask +2. Import TaskType from plexe.relbench.base: `from plexe.relbench.base import Database, EntityTask, Table, TaskType` +3. Use TaskType enum: `TaskType.BINARY_CLASSIFICATION`, `TaskType.REGRESSION`, `TaskType.LINK_PREDICTION` +4. Import only the metrics you use from plexe.relbench.metrics +5. Convert timestamps to pd.DataFrame: `timestamp_df = pd.DataFrame({"timestamp": timestamps})` +6. Use f-string for timedelta in SQL: `INTERVAL '{self.timedelta}'` +7. Always return a Table with proper fkey_col_to_pkey_table mapping +8. Set pkey_col=None for prediction tables +9. For binary classification, cast result: `CAST(... AS INTEGER)` +10. Test SQL query before finalizing code + +PARAMETER SELECTION GUIDELINES: + +timedelta (prediction window): +- Short-term: 7-30 days (churn, sales, recommendations) +- Medium-term: 60-90 days (positions, performance) +- Long-term: 365+ days (rare events, long-term trends) +- Use information from user intent and temporal analysis + +num_eval_timestamps: +- Default: 20 for most tasks +- More: 40+ for high-frequency events +- Less: 3-10 for rare events or limited data + +eval_k (for link prediction only): +- Typical: 10-12 for recommendations +- Depends on: expected number of positive links per entity + +OUTPUT: Save as task.py in the working directory using register_task_code(). + +# BEFORE YOU SAY "COMPLETED": +1. Did you call register_task_code()? If NO, you are NOT done! +2. Did the tool return {{"status": "registered"}}? If NO, you are NOT done! +3. Does task.py exist in the working directory? If NO, you are NOT done! + +ONLY say you are finished AFTER you have successfully called register_task_code() and received confirmation. +DO NOT complete your work without executing this tool call. 
+""" diff --git a/plexe/langgraph/state.py b/plexe/langgraph/state.py new file mode 100644 index 00000000..264735b6 --- /dev/null +++ b/plexe/langgraph/state.py @@ -0,0 +1,169 @@ +""" +Pipeline state management for LangGraph workflow. + +This module defines the shared state that flows through the +multi-agent pipeline using TypedDict for LangGraph compatibility. +""" + +from typing import TypedDict, Optional, List, Dict, Any, Annotated +from enum import Enum +import operator + + +class PipelinePhase(str, Enum): + """Phases of the ML pipeline.""" + CONVERSATION = "conversation" + SCHEMA_ANALYSIS = "schema_analysis" + DATASET_BUILDING = "dataset_building" + TASK_BUILDING = "task_building" + GNN_TRAINING = "gnn_training" + OPERATION = "operation" + COMPLETED = "completed" + FAILED = "failed" + + +class MessageRole(str, Enum): + """Message roles in conversation.""" + USER = "user" + ASSISTANT = "assistant" + SYSTEM = "system" + + +class Message(TypedDict): + """A single message in the conversation.""" + role: str + content: str + timestamp: Optional[str] + + +class DatasetInfo(TypedDict, total=False): + """Information about the dataset.""" + name: str + file_path: str + class_name: str + val_timestamp: str + test_timestamp: str + tables: List[str] + csv_dir: str + + +class TaskInfo(TypedDict, total=False): + """Information about the prediction task.""" + name: str + file_path: str + class_name: str + task_type: str + entity_table: str + target_column: str + metrics: List[str] + + +class SchemaInfo(TypedDict, total=False): + """Database schema information.""" + tables: Dict[str, Any] + relationships: List[Dict[str, str]] + temporal_columns: Dict[str, str] + primary_keys: Dict[str, str] + foreign_keys: Dict[str, List[Dict[str, str]]] + + +class EDAInfo(TypedDict, total=False): + """Exploratory Data Analysis information.""" + statistics: Dict[str, Any] + quality_issues: Dict[str, Any] + temporal_analysis: Dict[str, Any] + relationship_analysis: Dict[str, Any] + summary: Dict[str, Any] + + +class TrainingResult(TypedDict, total=False): + """Results from GNN training.""" + metrics: Dict[str, float] + best_epoch: int + model_path: str + training_time: float + script_path: str + + +class PipelineState(TypedDict, total=False): + """ + Shared state for the LangGraph pipeline. + + This state is passed between agents and accumulates information + as the pipeline progresses through different phases. + """ + session_id: str + working_dir: str + current_phase: str + + messages: Annotated[List[Message], operator.add] + user_intent: str + + db_connection_string: Optional[str] + csv_dir: Optional[str] + + schema_info: Optional[SchemaInfo] + eda_info: Optional[EDAInfo] + dataset_info: Optional[DatasetInfo] + task_info: Optional[TaskInfo] + training_result: Optional[TrainingResult] + + generated_code: Dict[str, str] + artifacts: List[str] + + errors: Annotated[List[str], operator.add] + warnings: Annotated[List[str], operator.add] + + user_confirmation_required: bool + user_confirmation_context: Optional[Dict[str, Any]] + user_confirmed: Optional[bool] + + metadata: Dict[str, Any] + + +def create_initial_state( + session_id: str, + working_dir: str, + user_message: str, + db_connection_string: Optional[str] = None, +) -> PipelineState: + """ + Create the initial pipeline state. 
+ + Args: + session_id: Unique identifier for this session + working_dir: Working directory for artifacts + user_message: Initial user message/request + db_connection_string: Optional database connection string + + Returns: + Initial pipeline state + """ + from datetime import datetime + + return PipelineState( + session_id=session_id, + working_dir=working_dir, + current_phase=PipelinePhase.CONVERSATION.value, + messages=[{ + "role": MessageRole.USER.value, + "content": user_message, + "timestamp": datetime.now().isoformat(), + }], + user_intent="", + db_connection_string=db_connection_string, + csv_dir=None, + schema_info=None, + eda_info=None, + dataset_info=None, + task_info=None, + training_result=None, + generated_code={}, + artifacts=[], + errors=[], + warnings=[], + user_confirmation_required=False, + user_confirmation_context=None, + user_confirmed=None, + metadata={}, + ) diff --git a/plexe/langgraph/tools/__init__.py b/plexe/langgraph/tools/__init__.py new file mode 100644 index 00000000..982706a1 --- /dev/null +++ b/plexe/langgraph/tools/__init__.py @@ -0,0 +1,53 @@ +from plexe.langgraph.tools.common import save_artifact + +from plexe.langgraph.tools.conversational import get_dataset_preview + +from plexe.langgraph.tools.graph_architect import ( + validate_db_connection, + export_tables_to_csv, + extract_schema_metadata, +) + +from plexe.langgraph.tools.eda import ( + analyze_csv_statistics, + detect_data_quality_issues, + analyze_temporal_patterns, + analyze_table_relationships, + generate_eda_summary, +) + +from plexe.langgraph.tools.dataset_builder import ( + get_csv_files_info, + get_temporal_statistics, + register_dataset_code, +) + +from plexe.langgraph.tools.task_builder import ( + test_sql_query, + register_task_code, +) + +from plexe.langgraph.tools.gnn_specialist import ( + generate_training_script, + execute_training_script, +) + +__all__ = [ + "save_artifact", + "get_dataset_preview", + "validate_db_connection", + "export_tables_to_csv", + "extract_schema_metadata", + "analyze_csv_statistics", + "detect_data_quality_issues", + "analyze_temporal_patterns", + "analyze_table_relationships", + "generate_eda_summary", + "get_csv_files_info", + "get_temporal_statistics", + "register_dataset_code", + "test_sql_query", + "register_task_code", + "generate_training_script", + "execute_training_script", +] diff --git a/plexe/langgraph/tools/common.py b/plexe/langgraph/tools/common.py new file mode 100644 index 00000000..8df80c08 --- /dev/null +++ b/plexe/langgraph/tools/common.py @@ -0,0 +1,32 @@ +from typing import Dict +from langchain_core.tools import tool as langchain_tool + +@langchain_tool +def save_artifact( + content: str, + filename: str, + working_dir: str +) -> Dict[str, str]: + """ + Save an artifact file to the working directory. 
+ + Args: + content: Content to save + filename: Name of the file + working_dir: Working directory + + Returns: + Save status and file path + """ + import os + + os.makedirs(working_dir, exist_ok=True) + file_path = os.path.join(working_dir, filename) + + with open(file_path, 'w') as f: + f.write(content) + + return { + "status": "saved", + "file_path": file_path + } diff --git a/plexe/langgraph/tools/conversational.py b/plexe/langgraph/tools/conversational.py new file mode 100644 index 00000000..1c9503e3 --- /dev/null +++ b/plexe/langgraph/tools/conversational.py @@ -0,0 +1,55 @@ +from typing import Dict, Any +from langchain_core.tools import tool as langchain_tool + +@langchain_tool +def get_dataset_preview( + dataset_path: str, + num_rows: int = 5 +) -> Dict[str, Any]: + """ + Preview a dataset by showing the first few rows and schema information. + + Args: + dataset_path: Path to the CSV file or directory containing CSV files + num_rows: Number of rows to preview (default: 5) + + Returns: + Dictionary with schema info and sample data + """ + import pandas as pd + import os + + result = {"tables": {}, "total_tables": 0} + + if os.path.isdir(dataset_path): + csv_files = [f for f in os.listdir(dataset_path) if f.endswith('.csv')] + result["total_tables"] = len(csv_files) + + for csv_file in csv_files[:10]: + table_name = csv_file.replace('.csv', '') + file_path = os.path.join(dataset_path, csv_file) + try: + df = pd.read_csv(file_path, nrows=num_rows) + result["tables"][table_name] = { + "columns": list(df.columns), + "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()}, + "sample_data": df.to_dict(orient='records'), + "row_count": len(pd.read_csv(file_path)), + } + except Exception as e: + result["tables"][table_name] = {"error": str(e)} + else: + try: + df = pd.read_csv(dataset_path, nrows=num_rows) + table_name = os.path.basename(dataset_path).replace('.csv', '') + result["tables"][table_name] = { + "columns": list(df.columns), + "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()}, + "sample_data": df.to_dict(orient='records'), + "row_count": len(pd.read_csv(dataset_path)), + } + result["total_tables"] = 1 + except Exception as e: + result["error"] = str(e) + + return result diff --git a/plexe/langgraph/tools/dataset_builder.py b/plexe/langgraph/tools/dataset_builder.py new file mode 100644 index 00000000..2efaecd1 --- /dev/null +++ b/plexe/langgraph/tools/dataset_builder.py @@ -0,0 +1,207 @@ +from typing import Dict, Any +from langchain_core.tools import tool as langchain_tool + +@langchain_tool +def get_csv_files_info(csv_dir: str) -> Dict[str, Any]: + """ + Get information about CSV files in a directory. 
+
+    Args:
+        csv_dir: Directory containing CSV files (relative or absolute path)
+
+    Returns:
+        Dictionary with file information
+    """
+    import os
+    import pandas as pd
+
+    # Convert to absolute path if needed
+    csv_dir = os.path.abspath(csv_dir)
+
+    # Check if directory exists
+    if not os.path.exists(csv_dir):
+        return {
+            "error": f"Directory does not exist: {csv_dir}",
+            "files": [],
+            "count": 0
+        }
+
+    if not os.path.isdir(csv_dir):
+        return {
+            "error": f"Path is not a directory: {csv_dir}",
+            "files": [],
+            "count": 0
+        }
+
+    files = []
+    try:
+        for f in os.listdir(csv_dir):
+            if f.endswith('.csv'):
+                file_path = os.path.join(csv_dir, f)
+                try:
+                    # Read only the header for column names, then count data rows
+                    # without loading the whole file; a context manager ensures the
+                    # handle is closed promptly
+                    df = pd.read_csv(file_path, nrows=1)
+                    with open(file_path) as fh:
+                        row_count = sum(1 for _ in fh) - 1
+                    files.append({
+                        "name": f.replace('.csv', ''),
+                        "path": file_path,
+                        "columns": list(df.columns),
+                        "row_count": row_count
+                    })
+                except Exception as e:
+                    files.append({"name": f, "error": str(e)})
+    except Exception as e:
+        return {
+            "error": f"Error reading directory: {str(e)}",
+            "files": [],
+            "count": 0
+        }
+
+    return {"files": files, "count": len(files), "directory": csv_dir}
+
+
+@langchain_tool
+def get_temporal_statistics(csv_dir: str) -> Dict[str, Any]:
+    """
+    Analyze temporal columns in CSV files to determine val/test timestamps.
+
+    Args:
+        csv_dir: Directory containing CSV files (relative or absolute path)
+
+    Returns:
+        Dictionary with temporal analysis and suggested timestamps
+    """
+    import pandas as pd
+    import os
+
+    # Convert to absolute path if needed
+    csv_dir = os.path.abspath(csv_dir)
+
+    # Check if directory exists
+    if not os.path.exists(csv_dir):
+        return {
+            "error": f"Directory does not exist: {csv_dir}",
+            "temporal_stats": {},
+            "suggested_splits": {}
+        }
+
+    temporal_stats = {}
+    all_timestamps = []
+
+    try:
+        dir_files = os.listdir(csv_dir)
+    except Exception as e:
+        return {
+            "error": f"Error reading directory: {str(e)}",
+            "temporal_stats": {},
+            "suggested_splits": {}
+        }
+
+    for f in dir_files:
+        if not f.endswith('.csv'):
+            continue
+
+        table_name = f.replace('.csv', '')
+        file_path = os.path.join(csv_dir, f)
+
+        try:
+            df = pd.read_csv(file_path)
+            table_temporal = {}
+
+            for col in df.columns:
+                try:
+                    parsed = pd.to_datetime(df[col], errors='coerce', format='mixed')
+                    valid_count = parsed.notna().sum()
+                    # Treat a column as temporal only if more than half of its values parse as dates
+                    if valid_count > len(df) * 0.5:
+                        min_ts = parsed.min()
+                        max_ts = parsed.max()
+                        table_temporal[col] = {
+                            "min": str(min_ts),
+                            "max": str(max_ts),
+                            "valid_count": int(valid_count)
+                        }
+                        all_timestamps.extend(parsed.dropna().tolist())
+                except Exception:
+                    # Column could not be interpreted as datetime; skip it
+                    pass
+
+            if table_temporal:
+                temporal_stats[table_name] = table_temporal
+        except Exception as e:
+            temporal_stats[table_name] = {"error": str(e)}
+
+    suggested_splits = {}
+    if all_timestamps:
+        all_timestamps = sorted(all_timestamps)
+        n = len(all_timestamps)
+        # Use the 70th and 85th percentile timestamps as validation/test cut-offs
+        suggested_splits = {
+            "val_timestamp": str(all_timestamps[int(n * 0.7)]),
+            "test_timestamp": str(all_timestamps[int(n * 0.85)]),
+        }
+
+    return {
+        "temporal_stats": temporal_stats,
+        "suggested_splits": suggested_splits
+    }
+
+
+@langchain_tool
+def register_dataset_code(
+    code: str,
+    class_name: str,
+    file_path: str
+) -> Dict[str, str]:
+    """
+    Register generated Dataset class code.
+ + Args: + code: Python code for the Dataset class + class_name: Name of the Dataset class + file_path: Path where the code will be saved + + Returns: + Registration status + """ + import os + import ast + + os.makedirs(os.path.dirname(file_path), exist_ok=True) + + # Sanitize the code - handle escaped characters from JSON serialization + sanitized_code = code + + # Check if the code has JSON-style escaping (e.g., \\n instead of real newlines) + # This typically happens when LLM output gets double-serialized + if '\\n' in code and '\n' not in code: + # Looks like it's been JSON-escaped - unescape it + import json + try: + # Wrap in quotes and parse as JSON string to unescape + sanitized_code = json.loads(f'"{code}"') + except json.JSONDecodeError: + # If that fails, try manual unescaping of common sequences + sanitized_code = code.replace('\\n', '\n') + sanitized_code = sanitized_code.replace('\\t', '\t') + sanitized_code = sanitized_code.replace('\\"', '"') + sanitized_code = sanitized_code.replace("\\'", "'") + + # Additional fix: handle backslash-escaped triple quotes that break f-strings + # Pattern: f\"\"\" should become f""" + if '\\"\\"\\"' in sanitized_code: + sanitized_code = sanitized_code.replace('\\"\\"\\"', '"""') + + # Validate that the code is syntactically valid Python + try: + ast.parse(sanitized_code) + except SyntaxError as e: + # If there's still a syntax error, log it but continue + import logging + logging.warning(f"Generated code has syntax error: {e}") + + with open(file_path, 'w') as f: + f.write(sanitized_code) + + return { + "status": "registered", + "class_name": class_name, + "file_path": file_path, + "code": code + } diff --git a/plexe/langgraph/tools/eda.py b/plexe/langgraph/tools/eda.py new file mode 100644 index 00000000..c26a0f8e --- /dev/null +++ b/plexe/langgraph/tools/eda.py @@ -0,0 +1,438 @@ +from typing import Dict, Any +from langchain_core.tools import tool as langchain_tool + +@langchain_tool +def analyze_csv_statistics(csv_dir: str) -> Dict[str, Any]: + """ + Analyze statistical properties of CSV files. 
+ + Args: + csv_dir: Directory containing CSV files + + Returns: + Dictionary with statistical analysis for each table + """ + import pandas as pd + import os + + stats = {} + + for f in os.listdir(csv_dir): + if not f.endswith('.csv'): + continue + + table_name = f.replace('.csv', '') + file_path = os.path.join(csv_dir, f) + + try: + df = pd.read_csv(file_path) + + table_stats = { + "row_count": len(df), + "column_count": len(df.columns), + "columns": {}, + "memory_usage_mb": df.memory_usage(deep=True).sum() / 1024 / 1024, + } + + for col in df.columns: + col_stats = { + "dtype": str(df[col].dtype), + "non_null_count": int(df[col].notna().sum()), + "null_count": int(df[col].isna().sum()), + "null_percentage": float(df[col].isna().sum() / len(df) * 100), + "unique_count": int(df[col].nunique()), + } + + if pd.api.types.is_numeric_dtype(df[col]): + col_stats["numeric_stats"] = { + "mean": float(df[col].mean()) if df[col].notna().any() else None, + "std": float(df[col].std()) if df[col].notna().any() else None, + "min": float(df[col].min()) if df[col].notna().any() else None, + "max": float(df[col].max()) if df[col].notna().any() else None, + "median": float(df[col].median()) if df[col].notna().any() else None, + "q25": float(df[col].quantile(0.25)) if df[col].notna().any() else None, + "q75": float(df[col].quantile(0.75)) if df[col].notna().any() else None, + } + + if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]): + top_values = df[col].value_counts().head(5) + col_stats["categorical_stats"] = { + "top_values": {str(k): int(v) for k, v in top_values.items()}, + "is_high_cardinality": df[col].nunique() > 0.5 * len(df), + } + + table_stats["columns"][col] = col_stats + + stats[table_name] = table_stats + + except Exception as e: + stats[table_name] = {"error": str(e)} + + return { + "status": "success", + "statistics": stats, + "total_tables": len(stats) + } + + +@langchain_tool +def detect_data_quality_issues(csv_dir: str) -> Dict[str, Any]: + """ + Detect data quality issues in CSV files. 
+ + Args: + csv_dir: Directory containing CSV files + + Returns: + Dictionary with data quality issues for each table + """ + import pandas as pd + import os + + issues = {} + + for f in os.listdir(csv_dir): + if not f.endswith('.csv'): + continue + + table_name = f.replace('.csv', '') + file_path = os.path.join(csv_dir, f) + + try: + df = pd.read_csv(file_path) + table_issues = [] + + for col in df.columns: + null_pct = df[col].isna().sum() / len(df) * 100 + if null_pct > 50: + table_issues.append({ + "severity": "high", + "column": col, + "issue": "high_missing_rate", + "details": f"{null_pct:.1f}% missing values" + }) + elif null_pct > 20: + table_issues.append({ + "severity": "medium", + "column": col, + "issue": "moderate_missing_rate", + "details": f"{null_pct:.1f}% missing values" + }) + + if df[col].dtype == 'object': + if df[col].nunique() == len(df): + table_issues.append({ + "severity": "low", + "column": col, + "issue": "all_unique_values", + "details": "Every row has unique value (potential ID column)" + }) + + if df[col].nunique() == 1: + table_issues.append({ + "severity": "medium", + "column": col, + "issue": "constant_column", + "details": "All values are the same" + }) + + if pd.api.types.is_numeric_dtype(df[col]): + if (df[col] == 0).sum() / len(df) > 0.9: + table_issues.append({ + "severity": "low", + "column": col, + "issue": "mostly_zeros", + "details": f"{(df[col] == 0).sum() / len(df) * 100:.1f}% zeros" + }) + + duplicates = df.duplicated().sum() + if duplicates > 0: + table_issues.append({ + "severity": "medium", + "column": None, + "issue": "duplicate_rows", + "details": f"{duplicates} duplicate rows ({duplicates/len(df)*100:.1f}%)" + }) + + issues[table_name] = { + "issues": table_issues, + "issue_count": len(table_issues), + "has_critical_issues": any(i["severity"] == "high" for i in table_issues) + } + + except Exception as e: + issues[table_name] = {"error": str(e)} + + return { + "status": "success", + "quality_issues": issues, + "tables_with_issues": sum(1 for t in issues.values() if isinstance(t, dict) and t.get("issue_count", 0) > 0) + } + + +@langchain_tool +def analyze_temporal_patterns(csv_dir: str) -> Dict[str, Any]: + """ + Analyze temporal patterns in CSV files for time-series prediction tasks. 
+ + Args: + csv_dir: Directory containing CSV files + + Returns: + Dictionary with temporal analysis + """ + import pandas as pd + import os + + temporal_analysis = {} + all_timestamps = [] + + for f in os.listdir(csv_dir): + if not f.endswith('.csv'): + continue + + table_name = f.replace('.csv', '') + file_path = os.path.join(csv_dir, f) + + try: + df = pd.read_csv(file_path) + table_temporal = {"temporal_columns": {}} + + for col in df.columns: + try: + # Specify format to avoid parsing warnings + parsed = pd.to_datetime(df[col], errors='coerce', format='mixed') + valid_count = parsed.notna().sum() + + if valid_count > len(df) * 0.5: + min_ts = parsed.min() + max_ts = parsed.max() + + time_range_days = (max_ts - min_ts).days if pd.notna(max_ts) and pd.notna(min_ts) else 0 + + parsed_clean = parsed.dropna().sort_values() + if len(parsed_clean) > 1: + time_diffs = parsed_clean.diff().dropna() + avg_gap_hours = time_diffs.mean().total_seconds() / 3600 if not time_diffs.empty else 0 + else: + avg_gap_hours = 0 + + table_temporal["temporal_columns"][col] = { + "min": str(min_ts), + "max": str(max_ts), + "valid_count": int(valid_count), + "time_range_days": float(time_range_days), + "avg_gap_hours": float(avg_gap_hours), + "is_sorted": bool((parsed == parsed.sort_values()).all()), + } + + all_timestamps.extend(parsed_clean.tolist()) + except: + pass + + if table_temporal["temporal_columns"]: + temporal_analysis[table_name] = table_temporal + + except Exception as e: + temporal_analysis[table_name] = {"error": str(e)} + + suggested_splits = {} + if all_timestamps: + all_timestamps = sorted(all_timestamps) + n = len(all_timestamps) + suggested_splits = { + "train_end": str(all_timestamps[int(n * 0.7)]), + "val_end": str(all_timestamps[int(n * 0.85)]), + "test_end": str(all_timestamps[-1]), + "total_timestamps": n, + } + + return { + "status": "success", + "temporal_analysis": temporal_analysis, + "suggested_splits": suggested_splits, + "has_temporal_data": len(temporal_analysis) > 0 + } + + +@langchain_tool +def analyze_table_relationships(csv_dir: str, schema_info: Dict[str, Any]) -> Dict[str, Any]: + """ + Analyze relationships between tables based on schema and data. 
+ + Args: + csv_dir: Directory containing CSV files + schema_info: Schema metadata with relationships + + Returns: + Dictionary with relationship analysis + """ + import pandas as pd + import os + + relationship_analysis = { + "foreign_key_stats": {}, + "join_recommendations": [], + "dimension_fact_classification": {} + } + + table_sizes = {} + for f in os.listdir(csv_dir): + if f.endswith('.csv'): + table_name = f.replace('.csv', '') + file_path = os.path.join(csv_dir, f) + try: + df = pd.read_csv(file_path) + table_sizes[table_name] = len(df) + except: + table_sizes[table_name] = 0 + + relationships = schema_info.get("relationships", []) + for rel in relationships: + source_table = rel.get("source_table") + target_table = rel.get("target_table") + source_column = rel.get("source_column") + + if not all([source_table, target_table, source_column]): + continue + + try: + source_file = os.path.join(csv_dir, f"{source_table}.csv") + target_file = os.path.join(csv_dir, f"{target_table}.csv") + + if os.path.exists(source_file) and os.path.exists(target_file): + source_df = pd.read_csv(source_file, usecols=[source_column] if source_column in pd.read_csv(source_file, nrows=0).columns else None) + + if source_column in source_df.columns: + fk_stats = { + "source_table": source_table, + "target_table": target_table, + "column": source_column, + "null_count": int(source_df[source_column].isna().sum()), + "null_percentage": float(source_df[source_column].isna().sum() / len(source_df) * 100), + "unique_count": int(source_df[source_column].nunique()), + } + + relationship_analysis["foreign_key_stats"][f"{source_table}.{source_column}"] = fk_stats + except Exception as e: + pass + + for table_name, size in table_sizes.items(): + has_fks = any(rel.get("source_table") == table_name for rel in relationships) + is_referenced = any(rel.get("target_table") == table_name for rel in relationships) + + if has_fks and not is_referenced and size > 1000: + classification = "fact" + elif is_referenced and not has_fks: + classification = "dimension" + elif is_referenced and has_fks: + classification = "dimension_with_hierarchy" + else: + classification = "standalone" + + relationship_analysis["dimension_fact_classification"][table_name] = { + "classification": classification, + "row_count": size, + "has_foreign_keys": has_fks, + "is_referenced": is_referenced, + } + + return { + "status": "success", + "relationship_analysis": relationship_analysis + } + + +@langchain_tool +def generate_eda_summary( + statistics: Dict[str, Any], + quality_issues: Dict[str, Any], + temporal_analysis: Dict[str, Any], + relationship_analysis: Dict[str, Any] +) -> Dict[str, Any]: + """ + Generate comprehensive EDA summary report. 
+ + Args: + statistics: Statistical analysis results + quality_issues: Data quality issues + temporal_analysis: Temporal pattern analysis + relationship_analysis: Table relationship analysis + + Returns: + Dictionary with comprehensive EDA summary + """ + summary = { + "overview": {}, + "key_findings": [], + "recommendations": [] + } + + stats = statistics.get("statistics", {}) + total_rows = sum(t.get("row_count", 0) for t in stats.values() if isinstance(t, dict)) + total_columns = sum(t.get("column_count", 0) for t in stats.values() if isinstance(t, dict)) + + summary["overview"] = { + "total_tables": len(stats), + "total_rows": total_rows, + "total_columns": total_columns, + "has_temporal_data": temporal_analysis.get("has_temporal_data", False), + "tables_with_quality_issues": quality_issues.get("tables_with_issues", 0), + } + + if temporal_analysis.get("has_temporal_data"): + summary["key_findings"].append({ + "category": "temporal", + "finding": "Dataset contains temporal data suitable for time-series prediction", + "details": f"Found {len(temporal_analysis.get('temporal_analysis', {}))} tables with temporal columns" + }) + + if temporal_analysis.get("suggested_splits"): + summary["recommendations"].append({ + "category": "modeling", + "recommendation": "Use temporal train/val/test splits", + "details": temporal_analysis["suggested_splits"] + }) + + quality_issues_data = quality_issues.get("quality_issues", {}) + high_severity_count = sum( + sum(1 for issue in t.get("issues", []) if issue.get("severity") == "high") + for t in quality_issues_data.values() if isinstance(t, dict) + ) + + if high_severity_count > 0: + summary["key_findings"].append({ + "category": "quality", + "finding": f"Found {high_severity_count} high-severity data quality issues", + "details": "Review tables with high missing rates before modeling" + }) + + summary["recommendations"].append({ + "category": "preprocessing", + "recommendation": "Handle missing values in Dataset class", + "details": "Consider imputation or dropping columns with >50% missing" + }) + + rel_analysis = relationship_analysis.get("relationship_analysis", {}) + dim_fact = rel_analysis.get("dimension_fact_classification", {}) + + fact_tables = [t for t, info in dim_fact.items() if info.get("classification") == "fact"] + dim_tables = [t for t, info in dim_fact.items() if info.get("classification") in ["dimension", "dimension_with_hierarchy"]] + + if fact_tables: + summary["key_findings"].append({ + "category": "schema", + "finding": f"Identified {len(fact_tables)} fact tables and {len(dim_tables)} dimension tables", + "details": {"fact_tables": fact_tables, "dimension_tables": dim_tables} + }) + + summary["recommendations"].append({ + "category": "modeling", + "recommendation": "Consider fact tables as entity tables for prediction tasks", + "details": f"Suggested entity tables: {', '.join(fact_tables)}" + }) + + return { + "status": "success", + "summary": summary + } diff --git a/plexe/langgraph/tools/gnn_specialist.py b/plexe/langgraph/tools/gnn_specialist.py new file mode 100644 index 00000000..a06d6745 --- /dev/null +++ b/plexe/langgraph/tools/gnn_specialist.py @@ -0,0 +1,316 @@ +from typing import Dict, Any +from langchain_core.tools import tool as langchain_tool + +@langchain_tool +def generate_training_script( + dataset_module_path: str, + dataset_class_name: str, + task_module_path: str, + task_class_name: str, + working_dir: str, + csv_dir: str = None, + task_type: str = "regression", + tune_metric: str = "mae", + higher_is_better: bool 
= False, + out_channels: int = 1, + epochs: int = 10, + batch_size: int = 512, + learning_rate: float = 0.005, + hidden_channels: int = 128, + num_gnn_layers: int = 2, +) -> Dict[str, Any]: + """ + Generate a GNN training script using plexe.relbench.modeling modules. + + Args: + dataset_module_path: Path to the Dataset Python module + dataset_class_name: Name of the Dataset class + task_module_path: Path to the Task Python module + task_class_name: Name of the Task class + working_dir: Working directory for outputs + csv_dir: Path to CSV files directory (defaults to working_dir/csv_files) + task_type: Type of task (regression, binary_classification, multiclass_classification) + tune_metric: Metric to optimize + higher_is_better: Whether higher metric values are better + out_channels: Output channels for the model + epochs: Number of training epochs + batch_size: Batch size for training + learning_rate: Learning rate + hidden_channels: Hidden channels in GNN + num_gnn_layers: Number of GNN layers + + Returns: + Path to generated script + """ + import os + + # Use csv_dir from parameter or default to working_dir/csv_files + if csv_dir is None: + csv_dir = f"{working_dir}/csv_files" + + script_template = f'''""" +Auto-generated GNN training script using plexe.relbench.modeling. +""" + +import os +import sys +import torch +import torch.nn.functional as F +from torch.optim import Adam +from datetime import datetime + +sys.path.insert(0, "{os.path.dirname(dataset_module_path)}") +sys.path.insert(0, "{os.path.dirname(task_module_path)}") + +from dataset import {dataset_class_name} +from task import {task_class_name} + +from plexe.relbench.modeling.graph import make_pkey_fkey_graph, get_node_train_table_input +from plexe.relbench.modeling.nn import HeteroEncoder, HeteroTemporalEncoder, HeteroGraphSAGE +from plexe.relbench.modeling.utils import get_stype_proposal +from torch_geometric.loader import NeighborLoader + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +print(f"Using device: {{device}}") + +csv_dir = "{csv_dir}" +dataset = {dataset_class_name}(csv_dir=csv_dir) +task = {task_class_name}(dataset) +db = dataset.get_db() + +train_table = task.get_table("train") +val_table = task.get_table("val") +test_table = task.get_table("test") + +print(f"Train samples: {{len(train_table)}}") +print(f"Val samples: {{len(val_table)}}") +print(f"Test samples: {{len(test_table)}}") + +col_to_stype_dict = get_stype_proposal(db) +data, col_stats_dict = make_pkey_fkey_graph( + db, + col_to_stype_dict=col_to_stype_dict, + text_embedder_cfg=None, + cache_dir="{working_dir}/cache/", +) + +data = data.to(device) +entity_table = task.entity_table + +def create_loader(table, shuffle=False): + table_input = get_node_train_table_input(table=table, task=task) + return NeighborLoader( + data, + num_neighbors=[128] * {num_gnn_layers}, + time_attr="time", + input_nodes=(entity_table, table_input.nodes[entity_table]), + input_time=table_input.time, + transform=table_input.transform, + batch_size={batch_size}, + temporal_strategy="uniform", + shuffle=shuffle, + ) + +train_loader = create_loader(train_table, shuffle=True) +val_loader = create_loader(val_table) +test_loader = create_loader(test_table) + +class GNNModel(torch.nn.Module): + def __init__(self, data, col_stats_dict, hidden_channels={hidden_channels}, out_channels={out_channels}): + super().__init__() + self.encoder = HeteroEncoder( + channels=hidden_channels, + node_to_col_names={{ + node_type: list(col_stats_dict[node_type].keys()) + for 
node_type in data.node_types + if node_type in col_stats_dict + }}, + node_to_col_stats=col_stats_dict, + ) + self.temporal_encoder = HeteroTemporalEncoder( + node_types=data.node_types, + channels=hidden_channels, + ) + self.gnn = HeteroGraphSAGE( + node_types=data.node_types, + edge_types=data.edge_types, + channels=hidden_channels, + num_layers={num_gnn_layers}, + ) + self.head = torch.nn.Sequential( + torch.nn.Linear(hidden_channels, hidden_channels), + torch.nn.ReLU(), + torch.nn.Dropout(0.2), + torch.nn.Linear(hidden_channels, out_channels), + ) + + def forward(self, batch, entity_table): + x_dict = self.encoder(batch.tf_dict) + rel_time_dict = self.temporal_encoder( + batch.seed_time, batch.time_dict, batch.batch_dict + ) + for node_type in x_dict: + x_dict[node_type] = x_dict[node_type] + rel_time_dict[node_type] + x_dict = self.gnn(x_dict, batch.edge_index_dict) + return self.head(x_dict[entity_table]) + +model = GNNModel(data, col_stats_dict).to(device) +optimizer = Adam(model.parameters(), lr={learning_rate}) + +task_type = "{task_type}" +if task_type == "binary_classification": + loss_fn = torch.nn.BCEWithLogitsLoss() +elif task_type == "multiclass_classification": + loss_fn = torch.nn.CrossEntropyLoss() +else: + loss_fn = torch.nn.MSELoss() + +best_val_metric = float('inf') if not {str(higher_is_better).lower()} else float('-inf') +best_model_path = "{working_dir}/best_model.pt" + +for epoch in range({epochs}): + model.train() + total_loss = 0 + for batch in train_loader: + batch = batch.to(device) + optimizer.zero_grad() + pred = model(batch, entity_table).squeeze() + y = batch[entity_table].y.float() + if task_type == "multiclass_classification": + y = y.long() + loss = loss_fn(pred, y) + loss.backward() + optimizer.step() + total_loss += loss.item() + + model.eval() + val_preds, val_labels = [], [] + with torch.no_grad(): + for batch in val_loader: + batch = batch.to(device) + pred = model(batch, entity_table).squeeze() + val_preds.append(pred.cpu()) + val_labels.append(batch[entity_table].y.cpu()) + + val_preds = torch.cat(val_preds) + val_labels = torch.cat(val_labels) + + if task_type == "regression": + val_metric = F.mse_loss(val_preds, val_labels.float()).sqrt().item() + elif task_type == "binary_classification": + val_metric = ((val_preds > 0).float() == val_labels.float()).float().mean().item() + else: + val_metric = (val_preds.argmax(dim=-1) == val_labels).float().mean().item() + + print(f"Epoch {{epoch+1}}/{epochs}: Loss={{total_loss:.4f}}, Val {tune_metric}={{val_metric:.4f}}") + + is_better = val_metric < best_val_metric if not {str(higher_is_better).lower()} else val_metric > best_val_metric + if is_better: + best_val_metric = val_metric + torch.save(model.state_dict(), best_model_path) + print(f" -> New best model saved!") + +model.load_state_dict(torch.load(best_model_path)) +model.eval() + +test_preds, test_labels = [], [] +with torch.no_grad(): + for batch in test_loader: + batch = batch.to(device) + pred = model(batch, entity_table).squeeze() + test_preds.append(pred.cpu()) + test_labels.append(batch[entity_table].y.cpu()) + +test_preds = torch.cat(test_preds) +test_labels = torch.cat(test_labels) + +if task_type == "regression": + test_metric = F.mse_loss(test_preds, test_labels.float()).sqrt().item() + print(f"\\nTest RMSE: {{test_metric:.4f}}") +elif task_type == "binary_classification": + test_metric = ((test_preds > 0).float() == test_labels.float()).float().mean().item() + print(f"\\nTest Accuracy: {{test_metric:.4f}}") +else: + test_metric = 
(test_preds.argmax(dim=-1) == test_labels).float().mean().item() + print(f"\\nTest Accuracy: {{test_metric:.4f}}") + +results = {{ + "best_val_{tune_metric}": best_val_metric, + "test_{tune_metric}": test_metric, + "model_path": best_model_path, + "epochs_trained": {epochs}, +}} + +import json +with open("{working_dir}/training_results.json", "w") as f: + json.dump(results, f, indent=2) + +print(f"\\nTraining complete! Results saved to {working_dir}/training_results.json") +''' + + # Use the working_dir directly - it's passed from the state + script_path = os.path.join(working_dir, "train_script.py") + os.makedirs(working_dir, exist_ok=True) + + with open(script_path, 'w') as f: + f.write(script_template) + + return { + "status": "generated", + "script_path": script_path, + } + + +@langchain_tool +def execute_training_script( + script_path: str, + timeout: int = 3600 +) -> Dict[str, Any]: + """ + Execute a training script. + + Args: + script_path: Path to the training script + timeout: Maximum execution time in seconds + + Returns: + Execution results + """ + import subprocess + import os + import json + + try: + result = subprocess.run( + ["python", script_path], + cwd=os.path.dirname(script_path), + capture_output=True, + text=True, + timeout=timeout + ) + + working_dir = os.path.dirname(script_path) + results_path = os.path.join(working_dir, "training_results.json") + + training_results = {} + if os.path.exists(results_path): + with open(results_path) as f: + training_results = json.load(f) + + return { + "status": "success" if result.returncode == 0 else "failed", + "stdout": result.stdout, + "stderr": result.stderr, + "return_code": result.returncode, + "training_results": training_results + } + except subprocess.TimeoutExpired: + return { + "status": "timeout", + "error": f"Script execution exceeded {timeout} seconds" + } + except Exception as e: + return { + "status": "error", + "error": str(e) + } diff --git a/plexe/langgraph/tools/graph_architect.py b/plexe/langgraph/tools/graph_architect.py new file mode 100644 index 00000000..bbec0085 --- /dev/null +++ b/plexe/langgraph/tools/graph_architect.py @@ -0,0 +1,176 @@ +from typing import Dict, Any, List, Optional +from langchain_core.tools import tool as langchain_tool + +@langchain_tool +def validate_db_connection(connection_string: str) -> Dict[str, Any]: + """ + Validate a database connection and retrieve available tables. 
+ + Args: + connection_string: Database connection string (e.g., postgresql://user:pass@host:port/db) + + Returns: + Dictionary with connection status and available tables + """ + from sqlalchemy import create_engine, inspect + + try: + engine = create_engine(connection_string) + with engine.connect() as conn: + inspector = inspect(engine) + tables = inspector.get_table_names() + + table_info = {} + for table in tables: + columns = inspector.get_columns(table) + pk_constraint = inspector.get_pk_constraint(table) + fk_constraints = inspector.get_foreign_keys(table) + + table_info[table] = { + "columns": [{"name": c["name"], "type": str(c["type"])} for c in columns], + "primary_key": pk_constraint.get("constrained_columns", []), + "foreign_keys": [ + { + "column": fk["constrained_columns"], + "references": f"{fk['referred_table']}.{fk['referred_columns']}" + } + for fk in fk_constraints + ] + } + + return { + "status": "connected", + "tables": table_info, + "table_count": len(tables) + } + except Exception as e: + return { + "status": "failed", + "error": str(e) + } + + +@langchain_tool +def export_tables_to_csv( + db_connection_string: str, + output_dir: str, + table_names: Optional[List[str]] = None +) -> Dict[str, Any]: + """ + Export database tables to CSV files. + + Args: + db_connection_string: Database connection string + output_dir: Directory to save CSV files + table_names: Optional list of specific tables to export (exports all if None) + + Returns: + Dictionary with export status and file paths + """ + import pandas as pd + from sqlalchemy import create_engine, inspect + import os + + os.makedirs(output_dir, exist_ok=True) + + try: + engine = create_engine(db_connection_string) + inspector = inspect(engine) + + if table_names is None: + table_names = inspector.get_table_names() + + exported = [] + errors = [] + + for table in table_names: + try: + df = pd.read_sql_table(table, engine) + file_path = os.path.join(output_dir, f"{table}.csv") + df.to_csv(file_path, index=False) + exported.append({ + "table": table, + "path": file_path, + "rows": len(df), + "columns": len(df.columns) + }) + except Exception as e: + errors.append({"table": table, "error": str(e)}) + + return { + "status": "success", + "output_dir": output_dir, + "exported_tables": exported, + "errors": errors if errors else None + } + except Exception as e: + return { + "status": "failed", + "error": str(e) + } + + +@langchain_tool +def extract_schema_metadata(db_connection_string: str) -> Dict[str, Any]: + """ + Extract comprehensive schema metadata from a database. 
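+
+    Illustrative example (hypothetical database URL), assuming the connection succeeds:
+
+        >>> meta = extract_schema_metadata.invoke(
+        ...     {"db_connection_string": "postgresql://user:pass@localhost:5432/shop"}
+        ... )
+        >>> sorted(meta.keys())
+        ['relationships', 'tables', 'temporal_columns']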
+ + Args: + db_connection_string: Database connection string + + Returns: + Dictionary with tables, relationships, and temporal columns + """ + from sqlalchemy import create_engine, inspect + import pandas as pd + + try: + engine = create_engine(db_connection_string) + inspector = inspect(engine) + + tables = {} + relationships = [] + temporal_columns = {} + + for table_name in inspector.get_table_names(): + columns = inspector.get_columns(table_name) + pk_constraint = inspector.get_pk_constraint(table_name) + fk_constraints = inspector.get_foreign_keys(table_name) + + table_cols = [] + for col in columns: + col_type = str(col["type"]).lower() + is_temporal = any(t in col_type for t in ['timestamp', 'date', 'time']) + + table_cols.append({ + "name": col["name"], + "type": str(col["type"]), + "nullable": col.get("nullable", True), + "is_temporal": is_temporal + }) + + if is_temporal: + if table_name not in temporal_columns: + temporal_columns[table_name] = [] + temporal_columns[table_name].append(col["name"]) + + tables[table_name] = { + "columns": table_cols, + "primary_key": pk_constraint.get("constrained_columns", []), + } + + for fk in fk_constraints: + relationships.append({ + "source_table": table_name, + "source_column": fk["constrained_columns"][0] if fk["constrained_columns"] else None, + "target_table": fk["referred_table"], + "target_column": fk["referred_columns"][0] if fk["referred_columns"] else None, + }) + + return { + "tables": tables, + "relationships": relationships, + "temporal_columns": temporal_columns + } + except Exception as e: + return {"error": str(e)} diff --git a/plexe/langgraph/tools/task_builder.py b/plexe/langgraph/tools/task_builder.py new file mode 100644 index 00000000..5b57ee2b --- /dev/null +++ b/plexe/langgraph/tools/task_builder.py @@ -0,0 +1,131 @@ +from typing import Dict, Any +from langchain_core.tools import tool as langchain_tool + +@langchain_tool +def test_sql_query( + csv_dir: str, + query: str +) -> Dict[str, Any]: + """ + Test a SQL query against CSV files using DuckDB. + + Args: + csv_dir: Directory containing CSV files + query: SQL query to test + + Returns: + Query results or error + """ + import duckdb + import os + import pandas as pd + + try: + # Convert to absolute path to ensure files are found + csv_dir = os.path.abspath(csv_dir) + + if not os.path.exists(csv_dir): + return { + "status": "error", + "error": f"CSV directory does not exist: {csv_dir}" + } + + conn = duckdb.connect(':memory:') + + # Load all CSV files as tables + for f in os.listdir(csv_dir): + if f.endswith('.csv'): + table_name = f.replace('.csv', '') + file_path = os.path.join(csv_dir, f) + conn.execute(f"CREATE TABLE {table_name} AS SELECT * FROM read_csv_auto('{file_path}')") + + # Create a dummy timestamp_df for testing (will be provided by the task at runtime) + # This is just for SQL validation + timestamps_dummy = pd.DataFrame({ + 'timestamp': pd.date_range('2020-01-01', periods=3, freq='D') + }) + conn.register("timestamp_df", timestamps_dummy) + + result = conn.execute(query).fetchdf() + + return { + "status": "success", + "columns": list(result.columns), + "row_count": len(result), + "sample_data": result.head(10).to_dict(orient='records') + } + except Exception as e: + return { + "status": "error", + "error": str(e) + } + + +@langchain_tool +def register_task_code( + code: str, + class_name: str, + file_path: str, + task_type: str +) -> Dict[str, str]: + """ + Register generated Task class code. 
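+
+    Illustrative example (hypothetical class name and path, mirroring the Args below):
+
+        >>> register_task_code.invoke({
+        ...     "code": "class ChurnTask: pass",
+        ...     "class_name": "ChurnTask",
+        ...     "file_path": "workdir/session-xxx/task.py",
+        ...     "task_type": "binary_classification",
+        ... })["status"]
+        'registered'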
+ + Args: + code: Python code for the Task class + class_name: Name of the Task class + file_path: Full path where the code will be saved (e.g., workdir/session-xxx/task.py) + task_type: Type of task (regression, binary_classification, multiclass_classification) + + Returns: + Registration status + """ + import os + import ast + + # Normalize the file path + file_path = os.path.normpath(os.path.abspath(file_path)) + + os.makedirs(os.path.dirname(file_path), exist_ok=True) + + # Sanitize the code - handle escaped characters from JSON serialization + sanitized_code = code + + # Check if the code has JSON-style escaping (e.g., \\n instead of real newlines) + # This typically happens when LLM output gets double-serialized + if '\\n' in code and '\n' not in code: + # Looks like it's been JSON-escaped - unescape it + import json + try: + # Wrap in quotes and parse as JSON string to unescape + sanitized_code = json.loads(f'"{code}"') + except json.JSONDecodeError: + # If that fails, try manual unescaping of common sequences + sanitized_code = code.replace('\\n', '\n') + sanitized_code = sanitized_code.replace('\\t', '\t') + sanitized_code = sanitized_code.replace('\\"', '"') + sanitized_code = sanitized_code.replace("\\'", "'") + + # Additional fix: handle backslash-escaped triple quotes that break f-strings + # Pattern: f\"\"\" should become f""" + if '\\"\\"\\"' in sanitized_code: + sanitized_code = sanitized_code.replace('\\"\\"\\"', '"""') + + # Validate that the code is syntactically valid Python + try: + ast.parse(sanitized_code) + except SyntaxError as e: + # If there's still a syntax error, log it but continue + import logging + logging.warning(f"Generated code has syntax error: {e}") + + with open(file_path, 'w') as f: + f.write(sanitized_code) + + return { + "status": "registered", + "class_name": class_name, + "file_path": file_path, + "task_type": task_type, + "code": code + } diff --git a/plexe/langgraph/utils/__init__.py b/plexe/langgraph/utils/__init__.py new file mode 100644 index 00000000..ab948a10 --- /dev/null +++ b/plexe/langgraph/utils/__init__.py @@ -0,0 +1,65 @@ +from plexe.langgraph.utils.emitters import ( + BaseEmitter, + ConsoleEmitter, + WebSocketEmitter, + MultiEmitter, +) +from plexe.langgraph.utils.callbacks import ( + ChainOfThoughtCallback, + create_langchain_callbacks, +) +from plexe.langgraph.utils.logging_utils import ( + session_id_var, + SessionLogger, + log_session_event, + setup_session_logging, +) +from plexe.langgraph.utils.file_utils import ( + create_working_directory, + validate_file_exists, + validate_directory_exists, + get_csv_files_in_directory, + read_file_content, + write_file_content, +) +from plexe.langgraph.utils.helpers import ( + format_error_message, + sanitize_sql_identifier, + format_table_info, + estimate_task_type, + get_default_metrics, + validate_python_code, +) +from plexe.langgraph.utils.progress import AgentProgress + +__all__ = [ + # Emitters + "BaseEmitter", + "ConsoleEmitter", + "WebSocketEmitter", + "MultiEmitter", + # Callbacks + "ChainOfThoughtCallback", + "create_langchain_callbacks", + # Logging + "session_id_var", + "SessionLogger", + "log_session_event", + "setup_session_logging", + # File utils + "create_working_directory", + "validate_file_exists", + "validate_directory_exists", + "get_csv_files_in_directory", + "read_file_content", + "write_file_content", + # Helpers + "format_error_message", + "sanitize_sql_identifier", + "format_table_info", + "estimate_task_type", + "get_default_metrics", + "validate_python_code", + 
# Progress + "AgentProgress", +] diff --git a/plexe/langgraph/utils/callbacks.py b/plexe/langgraph/utils/callbacks.py new file mode 100644 index 00000000..b409e0db --- /dev/null +++ b/plexe/langgraph/utils/callbacks.py @@ -0,0 +1,73 @@ +from typing import Optional, Dict, Any, List +from plexe.langgraph.utils.emitters import BaseEmitter, ConsoleEmitter +from plexe.langgraph.utils.progress import AgentProgress + +class ChainOfThoughtCallback: + """Callback for capturing chain-of-thought from LangGraph agents.""" + + def __init__(self, emitter: Optional[BaseEmitter] = None): + self.emitter = emitter or ConsoleEmitter() + self.progress = AgentProgress() + self.thoughts: List[Dict[str, str]] = [] + + def on_agent_start(self, agent_name: str, **kwargs): + """Called when an agent starts processing.""" + self.progress.current_agent = agent_name + self.emitter.emit_agent_start(agent_name) + + def on_agent_action(self, agent_name: str, action: str, **kwargs): + """Called when an agent takes an action.""" + self.thoughts.append({"agent": agent_name, "action": action}) + self.emitter.emit_thought(agent_name, action) + + def on_tool_start(self, agent_name: str, tool_name: str, args: Dict[str, Any], **kwargs): + """Called when a tool is invoked.""" + self.emitter.emit_tool_call(agent_name, tool_name, args) + + def on_tool_end(self, agent_name: str, tool_name: str, result: Any, **kwargs): + """Called when a tool completes.""" + pass + + def on_agent_end(self, agent_name: str, result: str, **kwargs): + """Called when an agent completes processing.""" + self.emitter.emit_agent_end(agent_name, result) + + def on_llm_start(self, agent_name: str, prompt: str, **kwargs): + """Called when LLM inference starts.""" + self.emitter.emit_thought(agent_name, "Processing...") + + def on_llm_end(self, agent_name: str, response: str, **kwargs): + """Called when LLM inference completes.""" + pass + + +def create_langchain_callbacks(emitter: BaseEmitter, agent_name: str): + """Create LangChain-compatible callbacks from an emitter.""" + from langchain_core.callbacks import BaseCallbackHandler + + class LangChainEmitterCallback(BaseCallbackHandler): + def __init__(self, emitter: BaseEmitter, agent_name: str): + self.emitter = emitter + self.agent_name = agent_name + + def on_llm_start(self, serialized, prompts, **kwargs): + self.emitter.emit_thought(self.agent_name, "Analyzing request...") + + def on_llm_end(self, response, **kwargs): + pass + + def on_tool_start(self, serialized, input_str, **kwargs): + tool_name = serialized.get("name", "unknown") if isinstance(serialized, dict) else "tool" + self.emitter.emit_tool_call(self.agent_name, tool_name, {}) + + def on_tool_end(self, output, **kwargs): + pass + + def on_chain_start(self, serialized, inputs, **kwargs): + pass + + def on_chain_end(self, outputs, **kwargs): + pass + + return [LangChainEmitterCallback(emitter, agent_name)] + diff --git a/plexe/langgraph/utils/emitters.py b/plexe/langgraph/utils/emitters.py new file mode 100644 index 00000000..06069ed9 --- /dev/null +++ b/plexe/langgraph/utils/emitters.py @@ -0,0 +1,265 @@ +import asyncio +import logging +from abc import ABC, abstractmethod +from typing import Dict, Any, Optional, List +from datetime import datetime + +from plexe.langgraph.utils.logging_utils import log_session_event + +logger = logging.getLogger(__name__) + + +class BaseEmitter(ABC): + """Base class for emitting agent thoughts and progress.""" + + @abstractmethod + def emit_thought(self, agent_name: str, thought: str, token_usage: Optional[Dict[str, 
int]] = None): + """Emit a thinking/progress message with optional token usage.""" + pass + + @abstractmethod + def emit_agent_start(self, agent_name: str, model_id: str = ""): + """Emit agent start notification.""" + pass + + @abstractmethod + def emit_agent_end(self, agent_name: str, result: str): + """Emit agent completion notification.""" + pass + + @abstractmethod + def emit_tool_call(self, agent_name: str, tool_name: str, args: Dict[str, Any]): + """Emit tool call notification.""" + pass + + @abstractmethod + def emit_tool_result(self, agent_name: str, tool_name: str, result: str): + """Emit tool result notification.""" + pass + + +class ConsoleEmitter(BaseEmitter): + """Console-based emitter for development/debugging with rich formatting.""" + + def __init__(self): + self.step_count = 0 + + def emit_thought(self, agent_name: str, thought: str, token_usage: Optional[Dict[str, int]] = None): + self.step_count += 1 + timestamp = datetime.now().strftime("%H:%M:%S") + token_info = "" + if token_usage: + token_info = f" [tokens: {token_usage.get('total_tokens', 0)}]" + print(f"[{agent_name}] Step {self.step_count} @ {timestamp}{token_info}") + print(f" {thought}") + + def emit_agent_start(self, agent_name: str, model_id: str = ""): + self.step_count += 1 + timestamp = datetime.now().strftime("%H:%M:%S") + model_info = f" (using {model_id})" if model_id else "" + print(f"\n=== {agent_name} Starting{model_info} === (Step {self.step_count} @ {timestamp})") + + def emit_agent_end(self, agent_name: str, result: str): + self.step_count += 1 + timestamp = datetime.now().strftime("%H:%M:%S") + print(f"=== {agent_name} Completed === (Step {self.step_count} @ {timestamp})") + if result: + print(f" Result: {result}") + + def emit_tool_call(self, agent_name: str, tool_name: str, args: Dict[str, Any]): + self.step_count += 1 + timestamp = datetime.now().strftime("%H:%M:%S") + args_str = "" + if args: + try: + import json + args_str = f" with args: {json.dumps(args)[:100]}" + except: + pass + print(f"[{agent_name}] Step {self.step_count} @ {timestamp}") + print(f" Calling tool: {tool_name}{args_str}") + + def emit_tool_result(self, agent_name: str, tool_name: str, result: str): + self.step_count += 1 + timestamp = datetime.now().strftime("%H:%M:%S") + print(f"[{agent_name}] Step {self.step_count} @ {timestamp}") + # Format with newlines for better readability + formatted_result = result.replace('\\n', '\n') if result else "" + print(f" Tool result:\n{formatted_result}") + + +class WebSocketEmitter(BaseEmitter): + """WebSocket-based emitter for UI integration with session logging.""" + + def __init__(self, websocket, loop: Optional[asyncio.AbstractEventLoop] = None, model_id: str = ""): + self.websocket = websocket + self.loop = loop + self.is_closed = False + self.step_count = 0 + self.model_id = model_id + + def set_model_id(self, model_id: str): + """Set the current model ID for context.""" + self.model_id = model_id + + def close(self): + """Mark the emitter as closed.""" + self.is_closed = True + + def _send_message(self, message: Dict[str, Any]): + """Send a message to the WebSocket.""" + if self.is_closed: + return + + try: + if self.loop and self.loop.is_running(): + asyncio.run_coroutine_threadsafe( + self.websocket.send_json(message), + self.loop + ) + else: + asyncio.get_event_loop().run_until_complete( + self.websocket.send_json(message) + ) + except Exception as e: + logger.warning(f"Failed to send WebSocket message: {e}") + + def emit_thought(self, agent_name: str, thought: str, 
token_usage: Optional[Dict[str, int]] = None): + self.step_count += 1 + timestamp = datetime.now().strftime("%H:%M:%S") + message_data = { + "type": "thinking", + "role": "thinking", + "event_type": "thinking", + "agent_name": agent_name, + "message": thought, + "step_number": self.step_count, + "timestamp": timestamp, + } + if token_usage: + message_data["token_usage"] = token_usage + self._send_message(message_data) + # Log to session file + token_log = f" [tokens: {token_usage}]" if token_usage else "" + log_session_event("thinking", f"{thought}{token_log}", agent_name) + + def emit_agent_start(self, agent_name: str, model_id: str = ""): + self.step_count += 1 + timestamp = datetime.now().strftime("%H:%M:%S") + model_info = model_id or self.model_id + message = f"Starting {agent_name}" + (f" (using {model_info})" if model_info else "") + self._send_message({ + "type": "thinking", + "role": "thinking", + "event_type": "agent_start", + "agent_name": agent_name, + "model_id": model_info, + "message": message, + "step_number": self.step_count, + "timestamp": timestamp, + }) + # Log to session file + log_session_event("agent_start", message, agent_name, {"model": model_info}) + + def emit_agent_end(self, agent_name: str, result: str): + self.step_count += 1 + timestamp = datetime.now().strftime("%H:%M:%S") + message = f"Completed: {result}" if result else "Completed" + self._send_message({ + "type": "thinking", + "role": "thinking", + "event_type": "agent_end", + "agent_name": agent_name, + "message": message, + "step_number": self.step_count, + "timestamp": timestamp, + }) + # Log to session file (truncate for file log only) + log_message = f"Completed: {result[:300]}" if result else "Completed" + log_session_event("agent_end", log_message, agent_name) + + def emit_tool_call(self, agent_name: str, tool_name: str, args: Dict[str, Any]): + self.step_count += 1 + timestamp = datetime.now().strftime("%H:%M:%S") + args_str = "" + if args: + try: + import json + args_str = f" with {json.dumps(args)[:100]}" + except: + pass + message = f"Calling tool: {tool_name}{args_str}" + self._send_message({ + "type": "thinking", + "role": "thinking", + "event_type": "tool_call", + "agent_name": agent_name, + "tool_name": tool_name, + "tool_args": args, + "message": message, + "step_number": self.step_count, + "timestamp": timestamp, + }) + # Log to session file + log_session_event("tool_call", message, agent_name, {"tool": tool_name, "args": args}) + + def emit_tool_result(self, agent_name: str, tool_name: str, result: str): + self.step_count += 1 + timestamp = datetime.now().strftime("%H:%M:%S") + # Format with newlines for better readability + formatted_result = result.replace('\\n', '\n') if result else "Tool completed" + self._send_message({ + "type": "thinking", + "role": "thinking", + "event_type": "tool_result", + "agent_name": agent_name, + "tool_name": tool_name, + "message": f"Tool result:\n{formatted_result}", + "result": result, + "step_number": self.step_count, + "timestamp": timestamp, + }) + # Log to session file (truncate result for log) + log_session_event("tool_result", f"Result from {tool_name}: {result[:500] if result else 'empty'}", agent_name) + + +class MultiEmitter(BaseEmitter): + """Combines multiple emitters.""" + + def __init__(self, emitters: List[BaseEmitter]): + self.emitters = emitters + + def emit_thought(self, agent_name: str, thought: str): + for emitter in self.emitters: + try: + emitter.emit_thought(agent_name, thought) + except Exception as e: + logger.warning(f"Emitter 
error: {e}") + + def emit_agent_start(self, agent_name: str, model_id: str = ""): + for emitter in self.emitters: + try: + emitter.emit_agent_start(agent_name, model_id) + except Exception as e: + logger.warning(f"Emitter error: {e}") + + def emit_agent_end(self, agent_name: str, result: str): + for emitter in self.emitters: + try: + emitter.emit_agent_end(agent_name, result) + except Exception as e: + logger.warning(f"Emitter error: {e}") + + def emit_tool_call(self, agent_name: str, tool_name: str, args: Dict[str, Any]): + for emitter in self.emitters: + try: + emitter.emit_tool_call(agent_name, tool_name, args) + except Exception as e: + logger.warning(f"Emitter error: {e}") + + def emit_tool_result(self, agent_name: str, tool_name: str, result: str): + for emitter in self.emitters: + try: + emitter.emit_tool_result(agent_name, tool_name, result) + except Exception as e: + logger.warning(f"Emitter error: {e}") diff --git a/plexe/langgraph/utils/file_utils.py b/plexe/langgraph/utils/file_utils.py new file mode 100644 index 00000000..6f5522de --- /dev/null +++ b/plexe/langgraph/utils/file_utils.py @@ -0,0 +1,66 @@ +import os +import logging +from typing import Optional +from datetime import datetime + +logger = logging.getLogger(__name__) + +def create_working_directory(base_dir: str = "workdir") -> str: + """ + Create a unique working directory for a session. + + Args: + base_dir: Base directory for workspaces + + Returns: + Path to the created working directory + """ + timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") + session_dir = os.path.join(base_dir, f"session-{timestamp}") + os.makedirs(session_dir, exist_ok=True) + + subdirs = ["csv_files", "cache", "artifacts"] + for subdir in subdirs: + os.makedirs(os.path.join(session_dir, subdir), exist_ok=True) + + return session_dir + + +def validate_file_exists(file_path: str) -> bool: + """Check if a file exists.""" + return os.path.isfile(file_path) + + +def validate_directory_exists(dir_path: str) -> bool: + """Check if a directory exists.""" + return os.path.isdir(dir_path) + + +def get_csv_files_in_directory(dir_path: str) -> list: + """Get list of CSV files in a directory.""" + if not os.path.isdir(dir_path): + return [] + return [f for f in os.listdir(dir_path) if f.endswith('.csv')] + + +def read_file_content(file_path: str) -> Optional[str]: + """Read content from a file.""" + try: + with open(file_path, 'r') as f: + return f.read() + except Exception as e: + logger.error(f"Error reading file {file_path}: {e}") + return None + + +def write_file_content(file_path: str, content: str) -> bool: + """Write content to a file.""" + try: + os.makedirs(os.path.dirname(file_path), exist_ok=True) + with open(file_path, 'w') as f: + f.write(content) + return True + except Exception as e: + logger.error(f"Error writing file {file_path}: {e}") + return False + diff --git a/plexe/langgraph/utils/helpers.py b/plexe/langgraph/utils/helpers.py new file mode 100644 index 00000000..2cffd095 --- /dev/null +++ b/plexe/langgraph/utils/helpers.py @@ -0,0 +1,75 @@ +from typing import Dict, Any + +def format_error_message(error: Exception, context: str = "") -> str: + """Format an error message with context.""" + msg = f"Error: {type(error).__name__}: {str(error)}" + if context: + msg = f"{context}: {msg}" + return msg + + +def sanitize_sql_identifier(identifier: str) -> str: + """Sanitize a SQL identifier to prevent injection.""" + return ''.join(c for c in identifier if c.isalnum() or c == '_') + + +def format_table_info(tables: Dict[str, Any]) -> 
str: + """Format table information for display.""" + lines = [] + for table_name, info in tables.items(): + pk = info.get("primary_key", []) + cols = [c["name"] for c in info.get("columns", [])] + lines.append(f"- {table_name}") + if pk: + lines.append(f" PK: {', '.join(pk)}") + lines.append(f" Columns: {', '.join(cols[:5])}{'...' if len(cols) > 5 else ''}") + return "\n".join(lines) + + +def estimate_task_type(target_description: str) -> str: + """Estimate task type from description.""" + description_lower = target_description.lower() + + binary_indicators = ["churn", "fraud", "click", "convert", "buy", "will", "whether"] + regression_indicators = ["count", "amount", "price", "revenue", "quantity", "how many"] + multiclass_indicators = ["category", "class", "type", "segment", "which"] + + for indicator in binary_indicators: + if indicator in description_lower: + return "binary_classification" + + for indicator in regression_indicators: + if indicator in description_lower: + return "regression" + + for indicator in multiclass_indicators: + if indicator in description_lower: + return "multiclass_classification" + + return "regression" + + +def get_default_metrics(task_type: str) -> list: + """Get default metrics for a task type.""" + metrics_map = { + "regression": ["mae", "rmse", "r2"], + "binary_classification": ["accuracy", "auroc", "f1"], + "multiclass_classification": ["accuracy", "f1_macro", "f1_micro"], + } + return metrics_map.get(task_type, ["mae"]) + + +def validate_python_code(code: str) -> Dict[str, Any]: + """Validate Python code for syntax errors.""" + try: + compile(code, '', 'exec') + return {"valid": True, "errors": []} + except SyntaxError as e: + return { + "valid": False, + "errors": [{ + "line": e.lineno, + "offset": e.offset, + "message": e.msg, + }] + } diff --git a/plexe/langgraph/utils/logging_utils.py b/plexe/langgraph/utils/logging_utils.py new file mode 100644 index 00000000..84e8801b --- /dev/null +++ b/plexe/langgraph/utils/logging_utils.py @@ -0,0 +1,112 @@ +import logging +import os +from pathlib import Path +from contextvars import ContextVar +from typing import Optional, Dict +from datetime import datetime + +# Context variable to store the session ID for the current thread/context +session_id_var: ContextVar[Optional[str]] = ContextVar("session_id", default=None) + +# Get the project root directory (where logs should be created) +PROJECT_ROOT = Path(__file__).parent.parent.parent.parent +LOG_DIR = PROJECT_ROOT / "logs" + +# Ensure log directory exists +LOG_DIR.mkdir(parents=True, exist_ok=True) + +logger = logging.getLogger(__name__) + + +class SessionLogger: + """ + Logger for session-specific experiment logs. + + Only logs activities within a chat session/experiment, + not general backend infrastructure logs. 
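+
+    Illustrative usage (hypothetical session id):
+        logger = SessionLogger.get_session_logger("abc123")
+        logger.info("experiment started")          # written to logs/session-abc123.log
+        SessionLogger.close_session("abc123")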
+ """ + + _handlers: Dict[str, logging.FileHandler] = {} + + @classmethod + def get_session_logger(cls, session_id: str) -> logging.Logger: + """Get or create a logger for a specific session.""" + logger_name = f"plexe.session.{session_id}" + session_logger = logging.getLogger(logger_name) + + if session_id not in cls._handlers: + log_file = LOG_DIR / f"session-{session_id}.log" + handler = logging.FileHandler(str(log_file), encoding='utf-8') + formatter = logging.Formatter( + '[%(asctime)s] [%(levelname)s] %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + handler.setFormatter(formatter) + handler.setLevel(logging.INFO) + + session_logger.addHandler(handler) + session_logger.setLevel(logging.INFO) + session_logger.propagate = False # Don't propagate to root logger + + cls._handlers[session_id] = handler + + return session_logger + + @classmethod + def close_session(cls, session_id: str): + """Close and remove the handler for a session.""" + if session_id in cls._handlers: + cls._handlers[session_id].close() + del cls._handlers[session_id] + + # Also remove the logger + logger_name = f"plexe.session.{session_id}" + if logger_name in logging.Logger.manager.loggerDict: + del logging.Logger.manager.loggerDict[logger_name] + + +def log_session_event( + event_type: str, + message: str, + agent_name: str = "", + extra: Optional[Dict] = None +): + """ + Log an event to the current session's log file. + + Args: + event_type: Type of event (e.g., 'agent_start', 'tool_call', 'thinking') + message: The log message + agent_name: Name of the agent (optional) + extra: Additional data to log (optional) + """ + session_id = session_id_var.get() + if not session_id: + return # No session, don't log + + session_logger = SessionLogger.get_session_logger(session_id) + + # Format the log message + parts = [] + if agent_name: + parts.append(f"[{agent_name}]") + parts.append(f"[{event_type}]") + parts.append(message) + + log_message = " ".join(parts) + + if extra: + log_message += f" | {extra}" + + session_logger.info(log_message) + + +def setup_session_logging(level=logging.INFO): + """ + Initialize session logging infrastructure. + + This no longer modifies the root logger - it just ensures + the log directory exists and the infrastructure is ready. + """ + LOG_DIR.mkdir(parents=True, exist_ok=True) + logger.info(f"Session logging initialized. 
Logs will be written to: {LOG_DIR}") diff --git a/plexe/langgraph/utils/progress.py b/plexe/langgraph/utils/progress.py new file mode 100644 index 00000000..f5031406 --- /dev/null +++ b/plexe/langgraph/utils/progress.py @@ -0,0 +1,20 @@ +from dataclasses import dataclass +from typing import Dict, Any + +@dataclass +class AgentProgress: + """Track agent progress and phase information.""" + current_agent: str = "" + current_phase: str = "" + total_steps: int = 0 + completed_steps: int = 0 + + def to_dict(self) -> Dict[str, Any]: + return { + "current_agent": self.current_agent, + "current_phase": self.current_phase, + "total_steps": self.total_steps, + "completed_steps": self.completed_steps, + "progress_pct": int(self.completed_steps / max(self.total_steps, 1) * 100) + } + diff --git a/plexe/main.py b/plexe/main.py index 9e6c79a5..a7897fe1 100644 --- a/plexe/main.py +++ b/plexe/main.py @@ -4,10 +4,11 @@ import threading import time -import webbrowser import logging +import os import uvicorn +from dotenv import load_dotenv logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -15,9 +16,18 @@ def main(): """Launch the Plexe assistant with a web UI.""" + load_dotenv() host = "127.0.0.1" port = 8000 + # If the user exported GEMINI_API_KEY but not GOOGLE_API_KEY, map it so + # litellm/Google provider can pick it up (common naming mismatch). + # This does not persist anything to disk and only affects the current process. + gemini_key = os.environ.get("GEMINI_API_KEY") + if gemini_key and not os.environ.get("GOOGLE_API_KEY") and not os.environ.get("GOOGLE_APPLICATION_CREDENTIALS"): + os.environ["GOOGLE_API_KEY"] = gemini_key + logging.getLogger(__name__).info("Mapped GEMINI_API_KEY -> GOOGLE_API_KEY for this process") + # Configure uvicorn to run in a thread config = uvicorn.Config("plexe.server:app", host=host, port=port, log_level="info", reload=False) server = uvicorn.Server(config) @@ -32,7 +42,7 @@ def main(): # Open the browser url = f"http://{host}:{port}" logger.info(f"Opening browser at {url}") - webbrowser.open(url) + # webbrowser.open(url) # Keep the main thread alive try: diff --git a/plexe/model_builder.py b/plexe/model_builder.py deleted file mode 100644 index 0d4dffd7..00000000 --- a/plexe/model_builder.py +++ /dev/null @@ -1,296 +0,0 @@ -""" -ModelBuilder for creating ML models through agentic workflows. - -This module provides the ModelBuilder class that handles the orchestration of -the multi-agent system to build machine learning models. 
-""" - -import os -import json -import logging -from datetime import datetime -from typing import Dict, List, Type, Optional - -import pandas as pd -from pydantic import BaseModel - -from plexe.config import prompt_templates -from plexe.datasets import DatasetGenerator -from plexe.callbacks import Callback, BuildStateInfo, ChainOfThoughtModelCallback, ModelCheckpointCallback -from plexe.internal.common.utils.chain_of_thought.emitters import ConsoleEmitter -from plexe.agents.agents import PlexeAgent -from plexe.internal.common.datasets.interface import TabularConvertible -from plexe.internal.common.datasets.adapter import DatasetAdapter -from plexe.internal.common.provider import ProviderConfig -from plexe.core.object_registry import ObjectRegistry -from plexe.internal.common.utils.pydantic_utils import map_to_basemodel, format_schema -from plexe.internal.common.utils.markdown_utils import format_eda_report_markdown -from plexe.core.state import ModelState -from plexe.tools.schemas import get_solution_schemas - -logger = logging.getLogger(__name__) - - -class ModelBuilder: - """Factory for creating ML models through agentic workflows.""" - - def __init__( - self, - provider: str | ProviderConfig = "openai/gpt-4o-mini", - verbose: bool = False, - distributed: bool = False, - working_dir: Optional[str] = None, - ): - """ - Initialize the model builder. - - Args: - provider: LLM provider configuration - verbose: Whether to display detailed agent logs - distributed: Whether to use distributed training with Ray - working_dir: Optional custom working directory - """ - self.provider_config = ProviderConfig(default_provider=provider) if isinstance(provider, str) else provider - self.verbose = verbose - self.distributed = distributed - self.working_dir = working_dir or self._create_working_dir() - - @staticmethod - def _create_working_dir() -> str: - """Create unique working directory for this build.""" - run_id = f"run-{datetime.now().isoformat()}".replace(":", "-").replace(".", "-") - working_dir = f"./workdir/{run_id}/" - os.makedirs(working_dir, exist_ok=True) - return working_dir - - def build( - self, - intent: str, - datasets: List[pd.DataFrame | DatasetGenerator], - input_schema: Type[BaseModel] | Dict[str, type] = None, - output_schema: Type[BaseModel] | Dict[str, type] = None, - timeout: int = None, - max_iterations: int = None, - run_timeout: int = 1800, - callbacks: List[Callback] = None, - enable_checkpointing: bool = False, - ): - """ - Build a complete ML model using the agentic workflow. 
- - Args: - intent: Natural language description of the model's purpose - datasets: Training datasets - input_schema: Optional input schema (inferred if not provided) - output_schema: Optional output schema (inferred if not provided) - timeout: Maximum total time for building - max_iterations: Maximum number of iterations - run_timeout: Maximum time per training run - callbacks: Optional callbacks for monitoring - enable_checkpointing: Whether to enable checkpointing - - Returns: - Completed Model instance - """ - # Clear and use singleton object registry for this build - object_registry = ObjectRegistry() - object_registry.clear() - - # Validate parameters - if timeout is None and max_iterations is None: - raise ValueError("At least one of 'timeout' or 'max_iterations' must be set") - if run_timeout is not None and timeout is not None and run_timeout > timeout: - raise ValueError(f"Run timeout ({run_timeout}s) cannot exceed total timeout ({timeout}s)") - - # Process schemas - input_schema = map_to_basemodel("in", input_schema) if input_schema else None - output_schema = map_to_basemodel("out", output_schema) if output_schema else None - - object_registry.register(bool, "input_schema_is_locked", input_schema is not None, immutable=True) - object_registry.register(bool, "output_schema_is_locked", output_schema is not None, immutable=True) - - # Initialize callbacks - callbacks = callbacks or [] - if enable_checkpointing and not any(isinstance(cb, ModelCheckpointCallback) for cb in callbacks): - callbacks.append(ModelCheckpointCallback()) - - cot_callback = ChainOfThoughtModelCallback(emitter=ConsoleEmitter()) - callbacks.append(cot_callback) - cot_callable = cot_callback.get_chain_of_thought_callable() - - # Register callbacks - object_registry.register_multiple(Callback, {f"{i}": c for i, c in enumerate(callbacks)}) - - try: - # Register datasets - training_data = { - f"dataset_{i}": DatasetAdapter.coerce((data.data if isinstance(data, DatasetGenerator) else data)) - for i, data in enumerate(datasets) - } - object_registry.register_multiple(TabularConvertible, training_data, immutable=True) - - # Register schemas if provided - if input_schema: - object_registry.register(dict, "input_schema", format_schema(input_schema), immutable=True) - if output_schema: - object_registry.register(dict, "output_schema", format_schema(output_schema), immutable=True) - - # Generate unique model identifier - model_identifier = f"model-{datetime.now().isoformat()}".replace(":", "-").replace(".", "-") - - # Notify callbacks of build start - self._notify_callbacks( - callbacks, - "build_start", - intent, - input_schema, - output_schema, - training_data, - model_identifier=model_identifier, - model_state="BUILDING", - ) - - # Create and run agent - agent = PlexeAgent( - orchestrator_model_id=self.provider_config.orchestrator_provider, - ml_researcher_model_id=self.provider_config.research_provider, - ml_engineer_model_id=self.provider_config.engineer_provider, - ml_ops_engineer_model_id=self.provider_config.ops_provider, - tool_model_id=self.provider_config.tool_provider, - verbose=self.verbose, - max_steps=30, - distributed=self.distributed, - chain_of_thought_callable=cot_callable, - max_solutions=max_iterations, - ) - - agent_prompt = prompt_templates.agent_builder_prompt( - intent=intent, - input_schema=json.dumps(format_schema(input_schema), indent=4), - output_schema=json.dumps(format_schema(output_schema), indent=4), - datasets=list(training_data.keys()), - working_dir=self.working_dir, - 
max_iterations=max_iterations, - resume=False, - ) - - additional_args = { - "intent": intent, - "working_dir": self.working_dir, - "input_schema": format_schema(input_schema), - "output_schema": format_schema(output_schema), - "max_iterations": max_iterations, - "timeout": timeout, - "run_timeout": run_timeout, - } - - generated = agent.run(agent_prompt, additional_args=additional_args) - - # Extract final schemas (may have been inferred) - schemas = get_solution_schemas("best_performing_solution") - final_input_schema = map_to_basemodel("InputSchema", schemas["input"]) - final_output_schema = map_to_basemodel("OutputSchema", schemas["output"]) - - # Build metadata - metadata = { - "provider": str(self.provider_config.default_provider), - "orchestrator_provider": str(self.provider_config.orchestrator_provider), - "research_provider": str(self.provider_config.research_provider), - "engineer_provider": str(self.provider_config.engineer_provider), - "ops_provider": str(self.provider_config.ops_provider), - "tool_provider": str(self.provider_config.tool_provider), - } - metadata.update(generated.metadata) - - # Extract EDA reports - eda_reports = {} - eda_markdown_reports = {} - for name in training_data.keys(): - try: - eda_report = object_registry.get(dict, f"eda_report_{name}") - eda_reports[name] = eda_report - eda_markdown_reports[name] = format_eda_report_markdown(eda_report) - except KeyError: - logger.debug(f"No EDA report found for dataset '{name}'") - - metadata["eda_reports"] = eda_reports - metadata["eda_markdown_reports"] = eda_markdown_reports - - # Import here to avoid circular dependency - from plexe.models import Model - - # Create model and populate it with results - model = Model(intent=intent, input_schema=final_input_schema, output_schema=final_output_schema) - model.identifier = model_identifier - model.predictor = generated.predictor - model.trainer_source = generated.training_source_code - model.predictor_source = generated.inference_source_code - model.feature_transformer_source = generated.feature_transformer_source_code - model.dataset_splitter_source = generated.dataset_split_code - model.testing_source = generated.testing_source_code - model.artifacts = generated.model_artifacts - model.metric = generated.test_performance - model.evaluation_report = generated.evaluation_report - model.metadata.update(metadata) - model.training_data = training_data # Store actual training data - model.state = ModelState.READY - - # Notify callbacks of build end - self._notify_callbacks( - callbacks, - "build_end", - intent, - final_input_schema, - final_output_schema, - training_data, - model_identifier=model_identifier, - model_state="READY", - final_metric=generated.test_performance, - final_artifacts=generated.model_artifacts, - trainer_source=generated.training_source_code, - predictor_source=generated.inference_source_code, - ) - - return model - - except Exception as e: - logger.error(f"Error during model building: {str(e)}") - raise e - - def _notify_callbacks( - self, - callbacks, - event, - intent, - input_schema, - output_schema, - training_data, - model_identifier=None, - model_state=None, - final_metric=None, - final_artifacts=None, - trainer_source=None, - predictor_source=None, - ): - """Helper to notify callbacks with consistent error handling.""" - for callback in callbacks: - try: - method_name = f"on_{event}" - if hasattr(callback, method_name): - getattr(callback, method_name)( - BuildStateInfo( - intent=intent, - input_schema=input_schema, - 
output_schema=output_schema, - provider=self.provider_config.tool_provider, - datasets=training_data, - model_identifier=model_identifier, - model_state=model_state, - final_metric=final_metric, - final_artifacts=final_artifacts, - trainer_source=trainer_source, - predictor_source=predictor_source, - ) - ) - except Exception as e: - logger.warning(f"Error in callback {callback.__class__.__name__}.{method_name}: {str(e)[:50]}") diff --git a/plexe/models.py b/plexe/models.py deleted file mode 100644 index dfab6512..00000000 --- a/plexe/models.py +++ /dev/null @@ -1,336 +0,0 @@ -""" -This module defines the `Model` class, which represents a machine learning model. - -A `Model` is characterized by a natural language description of its intent, structured input and output schemas. -This class provides methods for building the model, making predictions, and inspecting its state, metadata, and metrics. - -Key Features: -- Intent: A natural language description of the model's purpose. -- Input/Output Schema: Defines the structure and types of inputs and outputs. -- Mutable State: Tracks the model's lifecycle, training metrics, and metadata. -- Build Process: Integrates solution generation with callbacks. -- Chain of Thought: Captures the reasoning steps of the model building process. - -Example: ->>> model = Model( ->>> intent="Given a dataset of house features, predict the house price.", ->>> output_schema=create_model("output", **{"price": float}), ->>> input_schema=create_model("input", **{ ->>> "bedrooms": int, ->>> "bathrooms": int, ->>> "square_footage": float ->>> }) ->>> ) ->>> ->>> model.build( ->>> datasets=[pd.read_csv("houses.csv")], ->>> provider="openai:gpt-4o-mini", ->>> max_iterations=10, ->>> chain_of_thought=True # Enable chain of thought logging ->>> ) ->>> ->>> prediction = model.predict({"bedrooms": 3, "bathrooms": 2, "square_footage": 1500.0}) ->>> print(prediction) -""" - -import logging -import os -import uuid -import warnings -from datetime import datetime -from typing import Dict, List, Type, Any -from deprecated import deprecated - -import pandas as pd -from pydantic import BaseModel - -from plexe.callbacks import Callback -from plexe.core.interfaces.predictor import Predictor -from plexe.core.object_registry import ObjectRegistry -from plexe.core.state import ModelState # Import from core package -from plexe.datasets import DatasetGenerator -from plexe.internal.common.datasets.interface import Dataset -from plexe.internal.common.provider import ProviderConfig -from plexe.internal.common.utils.model_utils import calculate_model_size, format_code_snippet -from plexe.internal.common.utils.pydantic_utils import map_to_basemodel, format_schema -from plexe.internal.models.entities.artifact import Artifact -from plexe.internal.models.entities.description import ( - ModelDescription, - SchemaInfo, - ImplementationInfo, - PerformanceInfo, - CodeInfo, -) -from plexe.internal.models.entities.metric import Metric - -logger = logging.getLogger(__name__) - - -class Model: - """ - Represents a model that transforms inputs to outputs according to a specified intent. - - A `Model` is defined by a human-readable description of its expected intent, as well as structured - definitions of its input schema and output schema. - - Attributes: - intent (str): A human-readable, natural language description of the model's expected intent. - output_schema (dict): A mapping of output key names to their types. - input_schema (dict): A mapping of input key names to their types. 
- - Example: - model = Model( - intent="Given a dataset of house features, predict the house price.", - output_schema=create_model("output_schema", **{"price": float}), - input_schema=create_model("input_schema", **{ - "bedrooms": int, - "bathrooms": int, - "square_footage": float, - }) - ) - """ - - def __init__( - self, - intent: str, - input_schema: Type[BaseModel] | Dict[str, type] = None, - output_schema: Type[BaseModel] | Dict[str, type] = None, - distributed: bool = False, - ): - """ - Initialise a model with a natural language description of its intent, as well as - structured definitions of its input schema and output schema. - - :param intent: A human-readable, natural language description of the model's expected intent. - :param input_schema: a pydantic model or dictionary defining the input schema - :param output_schema: a pydantic model or dictionary defining the output schema - :param distributed: Whether to use distributed training with Ray if available. - """ - # todo: analyse natural language inputs and raise errors where applicable - - # The model's identity is defined by these fields - self.intent: str = intent - self.input_schema: Type[BaseModel] = map_to_basemodel("in", input_schema) if input_schema else None - self.output_schema: Type[BaseModel] = map_to_basemodel("out", output_schema) if output_schema else None - self.training_data: Dict[str, Dataset] = dict() - self.distributed: bool = distributed - - # The model's mutable state is defined by these fields - self.state: ModelState = ModelState.DRAFT - self.predictor: Predictor | None = None - self.trainer_source: str | None = None - self.predictor_source: str | None = None - self.feature_transformer_source: str | None = None - self.dataset_splitter_source: str | None = None - self.testing_source: str | None = None - self.evaluation_report: Dict | None = None - self.artifacts: List[Artifact] = [] - self.metric: Metric | None = None - self.metadata: Dict[str, Any] = dict() # todo: initialise metadata, etc - - # Registries used to make datasets, artifacts and other objects available across the system - self.object_registry = ObjectRegistry() - - # Setup the working directory and unique identifiers - self.identifier: str = f"model-{abs(hash(self.intent))}-{str(uuid.uuid4())}" - self.run_id = f"run-{datetime.now().isoformat()}".replace(":", "-").replace(".", "-") - self.working_dir = f"./workdir/{self.run_id}/" - os.makedirs(self.working_dir, exist_ok=True) - - @deprecated(reason="Use ModelBuilder.build() instead", version="0.23.0") - def build( - self, - datasets: List[pd.DataFrame | DatasetGenerator], - provider: str | ProviderConfig = "openai/gpt-4o-mini", - timeout: int = None, - max_iterations: int = None, - run_timeout: int = 1800, - callbacks: List[Callback] = None, - verbose: bool = False, - # resume: bool = False, - enable_checkpointing: bool = False, - ) -> None: - """ - Build the model using the provided dataset and optional data generation configuration. - - DEPRECATED: This interface is deprecated. Use ModelBuilder.build() instead: - - from plexe import ModelBuilder - builder = ModelBuilder(provider=provider, verbose=verbose, distributed=distributed) - model = builder.build(intent=intent, datasets=datasets, ...) 
- - :param datasets: the datasets to use for training the model - :param provider: the provider to use for model building, either a string or a ProviderConfig - for granular control of which models to use for different agent roles - :param timeout: maximum total time in seconds to spend building the model (all iterations combined) - :param max_iterations: maximum number of iterations to spend building the model - :param run_timeout: maximum time in seconds for each individual model training run - :param callbacks: list of callbacks to notify during the model building process - :param verbose: whether to display detailed agent logs during model building (default: False) - :param enable_checkpointing: whether to enable automatic checkpointing (default: True) - :return: - """ - warnings.warn( - "Model.build() is deprecated. Use ModelBuilder.build() instead:\n\n" - " from plexe import ModelBuilder\n" - " builder = ModelBuilder(provider=provider, verbose=verbose, distributed=distributed)\n" - " model = builder.build(intent=intent, datasets=datasets, ...)\n", - DeprecationWarning, - stacklevel=2, - ) - - # Import here to avoid circular dependency - from plexe.model_builder import ModelBuilder - - # Create builder and delegate to it - builder = ModelBuilder( - provider=provider, - verbose=verbose, - distributed=self.distributed, - working_dir=self.working_dir, - ) - - # Build the model using ModelBuilder - built_model = builder.build( - intent=self.intent, - datasets=datasets, - input_schema=self.input_schema, - output_schema=self.output_schema, - timeout=timeout, - max_iterations=max_iterations, - run_timeout=run_timeout, - callbacks=callbacks, - enable_checkpointing=enable_checkpointing, - ) - - # Copy all results back to self to maintain backwards compatibility - for attr in [ - "identifier", - "input_schema", - "output_schema", - "predictor", - "trainer_source", - "predictor_source", - "feature_transformer_source", - "dataset_splitter_source", - "testing_source", - "evaluation_report", - "artifacts", - "metric", - "training_data", - ]: - setattr(self, attr, getattr(built_model, attr)) - - self.metadata.update(built_model.metadata) - self.state = ModelState.READY - - def predict(self, x: Dict[str, Any], validate_input: bool = False, validate_output: bool = False) -> Dict[str, Any]: - """ - Call the model with input x and return the output. - :param x: input to the model - :param validate_input: whether to validate the input against the input schema - :param validate_output: whether to validate the output against the output schema - :return: output of the model - """ - if self.state != ModelState.READY: - raise RuntimeError("The model is not ready for predictions.") - try: - if validate_input: - self.input_schema.model_validate(x) - y = self.predictor.predict(x) - if validate_output: - self.output_schema.model_validate(y) - return y - except Exception as e: - raise RuntimeError(f"Error during prediction: {str(e)}") from e - - def get_state(self) -> ModelState: - """ - Return the current state of the model. - :return: the current state of the model - """ - return self.state - - def get_metadata(self) -> dict: - """ - Return metadata about the model. - :return: metadata about the model - """ - return self.metadata - - def get_metrics(self) -> dict: - """ - Return metrics about the model. 
- :return: metrics about the model - """ - return None if self.metric is None else {self.metric.name: self.metric.value} - - def describe(self) -> ModelDescription: - """ - Return a structured description of the model. - - :return: A ModelDescription object with various methods like to_dict(), as_text(), - as_markdown(), to_json() for different output formats - """ - # Create schema info - schemas = SchemaInfo( - input=format_schema(self.input_schema), - output=format_schema(self.output_schema), - ) - - # Create implementation info - implementation = ImplementationInfo( - framework=self.metadata.get("framework", "Unknown"), - model_type=self.metadata.get("model_type", "Unknown"), - artifacts=[a.name for a in self.artifacts], - size=calculate_model_size(self.artifacts), - ) - - # Create performance info - # Convert Metric objects to string representation for JSON serialization - metrics_dict = {} - if hasattr(self.metric, "value") and hasattr(self.metric, "name"): # Check if it's a Metric object - metrics_dict[self.metric.name] = str(self.metric.value) - - performance = PerformanceInfo( - metrics=metrics_dict, - training_data_info={ - name: { - "modality": data.structure.modality, - "features": data.structure.features, - "structure": data.structure.details, - } - for name, data in self.training_data.items() - }, - ) - - # Create code info - code = CodeInfo( - training=format_code_snippet(self.trainer_source), - prediction=format_code_snippet(self.predictor_source), - feature_transformations=format_code_snippet(self.feature_transformer_source), - ) - - # Assemble and return the complete model description - return ModelDescription( - id=self.identifier, - state=self.state.value, - intent=self.intent, - schemas=schemas, - implementation=implementation, - performance=performance, - code=code, - training_date=self.metadata.get("creation_date", "Unknown"), - rationale=self.metadata.get("selection_rationale", "Unknown"), - provider=self.metadata.get("provider", "Unknown"), - task_type=self.metadata.get("task_type", "Unknown"), - domain=self.metadata.get("domain", "Unknown"), - behavior=self.metadata.get("behavior", "Unknown"), - preprocessing_summary=self.metadata.get("preprocessing_summary", "Unknown"), - architecture_summary=self.metadata.get("architecture_summary", "Unknown"), - training_procedure=self.metadata.get("training_procedure", "Unknown"), - evaluation_metric=self.metadata.get("evaluation_metric", "Unknown"), - inference_behavior=self.metadata.get("inference_behavior", "Unknown"), - strengths=self.metadata.get("strengths", "Unknown"), - limitations=self.metadata.get("limitations", "Unknown"), - ) diff --git a/plexe/relbench/__init__.py b/plexe/relbench/__init__.py new file mode 100644 index 00000000..55980fdc --- /dev/null +++ b/plexe/relbench/__init__.py @@ -0,0 +1,28 @@ +from . 
import base, datasets, modeling, tasks +from .base import ( + Database, + Dataset, + Table, + BaseTask, + TaskType, + EntityTask, + RecommendationTask, + AutoCompleteTask, +) + +__version__ = "1.1.0" + +__all__ = [ + "base", + "datasets", + "modeling", + "tasks", + "Database", + "Dataset", + "Table", + "BaseTask", + "TaskType", + "EntityTask", + "RecommendationTask", + "AutoCompleteTask", +] diff --git a/plexe/relbench/base/__init__.py b/plexe/relbench/base/__init__.py new file mode 100644 index 00000000..b070b09c --- /dev/null +++ b/plexe/relbench/base/__init__.py @@ -0,0 +1,18 @@ +from .database import Database +from .dataset import Dataset +from .table import Table +from .task_autocomplete import AutoCompleteTask +from .task_base import BaseTask, TaskType +from .task_entity import EntityTask +from .task_recommendation import RecommendationTask + +__all__ = [ + "Database", + "Dataset", + "Table", + "BaseTask", + "TaskType", + "RecommendationTask", + "EntityTask", + "AutoCompleteTask", +] diff --git a/plexe/relbench/base/database.py b/plexe/relbench/base/database.py new file mode 100644 index 00000000..4bc1192b --- /dev/null +++ b/plexe/relbench/base/database.py @@ -0,0 +1,122 @@ +import os +from functools import lru_cache +from pathlib import Path +from typing import Dict, Union + +import pandas as pd +from typing_extensions import Self + +from .table import Table + + +class Database: + r"""A database is a collection of named tables linked by foreign key - primary key + connections.""" + + def __init__(self, table_dict: Dict[str, Table]) -> None: + r"""Creates a database from a dictionary of tables.""" + + self.table_dict = table_dict + + def __repr__(self) -> str: + return f"{self.__class__.__name__}()" + + def save(self, path: Union[str, os.PathLike]) -> None: + r"""Save the database to a directory. + + Simply saves each table individually with the table name as base name of file. 
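        Illustrative usage sketch (editor's addition, not part of this patch): round-tripping a
        small database through a directory of parquet files. The table names, columns and the
        `/tmp/mydb` path are invented for the example.

        >>> import pandas as pd
        >>> from plexe.relbench.base import Database, Table
        >>> users = Table(
        ...     df=pd.DataFrame({"user_id": [0, 1]}),
        ...     fkey_col_to_pkey_table={},
        ...     pkey_col="user_id",
        ... )
        >>> orders = Table(
        ...     df=pd.DataFrame({"order_id": [0, 1], "user_id": [0, 1],
        ...                      "ts": pd.to_datetime(["2020-01-01", "2020-06-01"])}),
        ...     fkey_col_to_pkey_table={"user_id": "users"},
        ...     pkey_col="order_id",
        ...     time_col="ts",
        ... )
        >>> db = Database({"users": users, "orders": orders})
        >>> db.save("/tmp/mydb")        # writes /tmp/mydb/users.parquet and /tmp/mydb/orders.parquet
        >>> db2 = Database.load("/tmp/mydb")
        >>> db2.max_timestamp           # only tables with a time_col are considered
        Timestamp('2020-06-01 00:00:00')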
+ """ + + for name, table in self.table_dict.items(): + table.save(f"{path}/{name}.parquet") + + @classmethod + def load(cls, path: Union[str, os.PathLike]) -> Self: + r"""Load a database from a directory of tables in parquet files.""" + + table_dict = {} + for table_path in Path(path).glob("*.parquet"): + table = Table.load(table_path) + table_dict[table_path.stem] = table + + return cls(table_dict) + + @property + @lru_cache(maxsize=None) + def min_timestamp(self) -> pd.Timestamp: + r"""Return the earliest timestamp in the database.""" + + return min( + table.min_timestamp + for table in self.table_dict.values() + if table.time_col is not None + ) + + @property + @lru_cache(maxsize=None) + def max_timestamp(self) -> pd.Timestamp: + r"""Return the latest timestamp in the database.""" + + return max( + table.max_timestamp + for table in self.table_dict.values() + if table.time_col is not None + ) + + def upto(self, timestamp: pd.Timestamp) -> Self: + r"""Return a database with all rows upto timestamp.""" + + return Database( + table_dict={ + name: table.upto(timestamp) for name, table in self.table_dict.items() + } + ) + + def from_(self, timestamp: pd.Timestamp) -> Self: + r"""Return a database with all rows from timestamp.""" + + return Database( + table_dict={ + name: table.from_(timestamp) for name, table in self.table_dict.items() + } + ) + + def reindex_pkeys_and_fkeys(self) -> None: + r"""Map primary and foreign keys into indices according to the ordering in the + primary key tables.""" + # Get pkey to idx mapping: + index_map_dict: Dict[str, pd.Series] = {} + for table_name, table in self.table_dict.items(): + if table.pkey_col is not None: + if table.time_col is not None: + table.df = table.df.sort_values(table.time_col).reset_index( + drop=True + ) + + ser = table.df[table.pkey_col] + + if ser.nunique() != len(ser): + raise RuntimeError( + f"The primary key '{table.pkey_col}' " + f"of table '{table_name}' contains " + "duplicated elements" + ) + arange_ser = pd.RangeIndex(len(ser)).astype("Int64") + index_map_dict[table_name] = pd.Series( + index=ser, + data=arange_ser, + name="index", + ) + table.df[table.pkey_col] = arange_ser + + # Replace fkey_col_to_pkey_table with indices. + for table in self.table_dict.values(): + for fkey_col, pkey_table_name in table.fkey_col_to_pkey_table.items(): + out = pd.merge( + table.df[fkey_col], + index_map_dict[pkey_table_name], + how="left", + left_on=fkey_col, + right_index=True, + ) + table.df[fkey_col] = out["index"] diff --git a/plexe/relbench/base/dataset.py b/plexe/relbench/base/dataset.py new file mode 100644 index 00000000..839c4257 --- /dev/null +++ b/plexe/relbench/base/dataset.py @@ -0,0 +1,206 @@ +import time +from functools import lru_cache +from pathlib import Path +from typing import Optional + +import numpy as np +import pandas as pd + +from .database import Database + + +class Dataset: + r"""A dataset is a database with validation and test timestamps defined for it. + + Attributes: + val_timestamp: Rows upto this timestamp (inclusive) can be input for validation. + test_timestamp: Rows upto this timestamp (inclusive) can be input for testing. + + Validation split of a task involves predicting the target variable for a + time period after val_timestamp (exclusive) using data upto val_timestamp. + Similarly for test_timestamp. + """ + + # To be set by subclass. + val_timestamp: pd.Timestamp + test_timestamp: pd.Timestamp + + # For predict column task. 
+ target_col: Optional[str] + entity_table: Optional[str] + remove_columns: list[tuple[str, str]] + + def __init__( + self, + cache_dir: Optional[str] = None, + ) -> None: + r"""Create a dataset object. + + Args: + cache_dir: A directory for caching the database object. If specified, + we will either process and cache the file (if not available) or use + the cached file. If None, we will not use cached file and re-process + everything from scratch without saving the cache. + """ + + self.cache_dir = cache_dir + + self.target_col = None + self.entity_table = None + self.remove_columns = [] + + def __repr__(self) -> str: + return f"{self.__class__.__name__}()" + + def validate_and_correct_db(self, db): + r"""Validate and correct input db in-place. + + Removing rows after test_timestamp can result in dangling foreign keys. + """ + # Validate that all primary keys are consecutively index. + + for table_name, table in db.table_dict.items(): + if table.pkey_col is not None: + ser = table.df[table.pkey_col] + if not (ser.values == np.arange(len(ser))).all(): + raise RuntimeError( + f"The primary key column {table.pkey_col} of table " + f"{table_name} is not consecutively index." + ) + + # Discard any foreign keys that are larger than primary key table as + # dangling foreign keys (represented as None). + for table_name, table in db.table_dict.items(): + for fkey_col, pkey_table_name in table.fkey_col_to_pkey_table.items(): + num_pkeys = len(db.table_dict[pkey_table_name]) + mask = table.df[fkey_col] >= num_pkeys + if mask.any(): + table.df.loc[mask, fkey_col] = None + + @lru_cache(maxsize=None) + def get_db(self, upto_test_timestamp=True) -> Database: + r"""Return the database object. + + The returned database object is cached in memory. + + Args: + upto_test_timestamp: If True, only return rows upto test_timestamp. + + Returns: + Database: The database object. + + `upto_test_timestamp` is True by default to prevent test leakage. + """ + + db_path = f"{self.cache_dir}/db" + if self.cache_dir and Path(db_path).exists() and any(Path(db_path).iterdir()): + print(f"Loading Database object from {db_path}...") + tic = time.time() + db = Database.load(db_path) + toc = time.time() + print(f"Done in {toc - tic:.2f} seconds.") + + else: + print("Making Database object from scratch...") + print( + "(You can also use `get_dataset(..., download=True)` " + "for datasets prepared by the RelBench team.)" + ) + tic = time.time() + db = self.make_db() + db.reindex_pkeys_and_fkeys() + toc = time.time() + print(f"Done in {toc - tic:.2f} seconds.") + + if self.cache_dir: + print(f"Caching Database object to {db_path}...") + tic = time.time() + db.save(db_path) + toc = time.time() + print(f"Done in {toc - tic:.2f} seconds.") + + if upto_test_timestamp: + db = db.upto(self.test_timestamp) + + self.validate_and_correct_db(db) + + if self.target_col: + # Get the modified db with the target column removed + db = self.get_modified_db(db) + + return db + + def get_modified_db(self, db) -> Database: + r"""Get the modified db with the target column removed. + + The target columns is saved to `db.table_dict[table_name].removed_cols` + and the column is dropped from the table. + Args: + db: The database object. + + Returns: + Database: The modified database object. 
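        Illustrative sketch (editor's addition, not part of this patch): `ds` is a hypothetical
        Dataset whose database has a "users" table with primary key "user_id" and a feature
        column "age". These attributes are normally set by AutoCompleteTask.__init__.

        >>> ds.target_col = "age"
        >>> ds.entity_table = "users"
        >>> ds.get_db.cache_clear()
        >>> db = ds.get_db()               # get_db() routes through get_modified_db()
        >>> "age" in db.table_dict["users"].df.columns
        False
        >>> list(db.table_dict["users"].removed_cols.columns)
        ['user_id', 'age']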
+ """ + + # Remove the target column from the source entity table + if self.target_col: + table_name = self.entity_table + col = self.target_col + + if col not in db.table_dict[table_name].df.columns: + raise ValueError(f"Column {col} not found in table {table_name}.") + if col in db.table_dict[table_name].fkey_col_to_pkey_table.keys(): + raise ValueError( + f"Column {col} is a foreign key in table {table_name}. Only feature columns can be removed." + ) + if col == db.table_dict[table_name].pkey_col: + raise ValueError( + f"Column {col} is the primary key in table {table_name}. Only feature columns can be removed." + ) + + # save the columns to be dropped + id_keys = [] + if db.table_dict[table_name].pkey_col: + id_keys.append(db.table_dict[table_name].pkey_col) + else: + # add primary key to table_name if it doesn't have one + db.table_dict[table_name].df["primary_key"] = np.arange( + len(db.table_dict[table_name].df) + ) + id_keys.append("primary_key") + db.table_dict[table_name].pkey_col = "primary_key" + + # Save the target column to be dropped + db.table_dict[table_name].removed_cols = db.table_dict[table_name].df[ + id_keys + [col] + ] + # drop the columns + db.table_dict[table_name].df = db.table_dict[table_name].df.drop( + columns=[col] + ) + + for table, remove_col in self.remove_columns: + if remove_col in db.table_dict[table].df.columns: + # If the column is in the table, remove it + db.table_dict[table].df = db.table_dict[table].df.drop( + columns=[remove_col] + ) + else: + print( + f"Column {remove_col} not found in table {table}. " + "Skipping removal from this table." + ) + + # Clear the get_dataset cache as the dataset instance was modified. + from ..datasets import get_dataset + + get_dataset.cache_clear() + + return db + + def make_db(self) -> Database: + r"""Make the database object from scratch, i.e. using raw data sources. + + To be implemented by subclass. + """ + raise NotImplementedError diff --git a/plexe/relbench/base/table.py b/plexe/relbench/base/table.py new file mode 100644 index 00000000..7fd684c5 --- /dev/null +++ b/plexe/relbench/base/table.py @@ -0,0 +1,152 @@ +import json +import os +from functools import lru_cache +from pathlib import Path +from typing import Dict, Optional, Union + +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq +from typing_extensions import Self + + +class Table: + r"""A table in a database. + + Args: + df: The underlying data frame of the table. + fkey_col_to_pkey_table: A dictionary mapping + foreign key names to table names that contain the foreign keys as + primary keys. + pkey_col: The primary key column if it exists. + time_col: The time column. + """ + + def __init__( + self, + df: pd.DataFrame, + fkey_col_to_pkey_table: Dict[str, str], + pkey_col: Optional[str] = None, + time_col: Optional[str] = None, + ): + self.df = df + self.fkey_col_to_pkey_table = fkey_col_to_pkey_table + self.pkey_col = pkey_col + self.time_col = time_col + self.removed_cols = None + + def __repr__(self) -> str: + return ( + f"Table(df=\n{self.df},\n" + f" fkey_col_to_pkey_table={self.fkey_col_to_pkey_table},\n" + f" pkey_col={self.pkey_col},\n" + f" time_col={self.time_col}" + f")" + ) + + def __len__(self) -> int: + r"""Return the number of rows in the table.""" + return len(self.df) + + def save(self, path: Union[str, os.PathLike]) -> None: + r"""Save the table to a parquet file. + + Stores other attributes as parquet metadata. 
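        Illustrative sketch (editor's addition, not part of this patch): the key columns survive a
        save/load round trip because they are stored in the parquet schema metadata. The dataframe
        and the `/tmp/reviews.parquet` path are invented.

        >>> import pandas as pd
        >>> t = Table(
        ...     df=pd.DataFrame({"review_id": [0, 1],
        ...                      "product_id": [10, 11],
        ...                      "ts": pd.to_datetime(["2020-01-01", "2020-02-01"])}),
        ...     fkey_col_to_pkey_table={"product_id": "product"},
        ...     pkey_col="review_id",
        ...     time_col="ts",
        ... )
        >>> t.save("/tmp/reviews.parquet")
        >>> t2 = Table.load("/tmp/reviews.parquet")
        >>> t2.fkey_col_to_pkey_table, t2.pkey_col, t2.time_col
        ({'product_id': 'product'}, 'review_id', 'ts')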
+ """ + assert str(path).endswith(".parquet") + metadata = { + "fkey_col_to_pkey_table": self.fkey_col_to_pkey_table, + "pkey_col": self.pkey_col, + "time_col": self.time_col, + } + + # Convert DataFrame to a PyArrow Table + table = pa.Table.from_pandas(self.df, preserve_index=False) + + # Add metadata to the PyArrow Table + metadata_bytes = { + key: json.dumps(value).encode("utf-8") for key, value in metadata.items() + } + + table = table.replace_schema_metadata( + {**table.schema.metadata, **metadata_bytes} + ) + + # Write the PyArrow Table to a Parquet file using pyarrow.parquet + Path(path).parent.mkdir(parents=True, exist_ok=True) + pq.write_table(table, path) + + @classmethod + def load(cls, path: Union[str, os.PathLike]) -> Self: + r"""Load a table from a parquet file.""" + assert str(path).endswith(".parquet") + + # Read the Parquet file using pyarrow + table = pa.parquet.read_table(path) + df = table.to_pandas() + + # Extract metadata + metadata_bytes = table.schema.metadata + metadata = { + key.decode("utf-8"): json.loads(value.decode("utf-8")) + for key, value in metadata_bytes.items() + if key in [b"fkey_col_to_pkey_table", b"pkey_col", b"time_col"] + } + return cls( + df=df, + fkey_col_to_pkey_table=metadata["fkey_col_to_pkey_table"], + pkey_col=metadata["pkey_col"], + time_col=metadata["time_col"], + ) + + def upto(self, timestamp: pd.Timestamp) -> Self: + r"""Return a table with all rows upto timestamp (inclusive). + + Table without time_col are returned as is. + """ + + if self.time_col is None: + return self + + return Table( + df=self.df.query(f"{self.time_col} <= @timestamp"), + fkey_col_to_pkey_table=self.fkey_col_to_pkey_table, + pkey_col=self.pkey_col, + time_col=self.time_col, + ) + + def from_(self, timestamp: pd.Timestamp) -> Self: + r"""Return a table with all rows from timestamp onwards (inclusive). + + Table without time_col are returned as is. + """ + + if self.time_col is None: + return self + + return Table( + df=self.df.query(f"{self.time_col} >= @timestamp"), + fkey_col_to_pkey_table=self.fkey_col_to_pkey_table, + pkey_col=self.pkey_col, + time_col=self.time_col, + ) + + @property + @lru_cache(maxsize=None) + def min_timestamp(self) -> pd.Timestamp: + r"""Return the earliest time in the table.""" + + if self.time_col is None: + raise ValueError("Table has no time column.") + + return self.df[self.time_col].min() + + @property + @lru_cache(maxsize=None) + def max_timestamp(self) -> pd.Timestamp: + r"""Return the latest time in the table.""" + + if self.time_col is None: + raise ValueError("Table has no time column.") + + return self.df[self.time_col].max() diff --git a/plexe/relbench/base/task_autocomplete.py b/plexe/relbench/base/task_autocomplete.py new file mode 100644 index 00000000..b10df742 --- /dev/null +++ b/plexe/relbench/base/task_autocomplete.py @@ -0,0 +1,215 @@ +from typing import Optional + +import duckdb +import numpy as np +import pandas as pd +from sklearn.preprocessing import OrdinalEncoder + +from ..metrics import ( + accuracy, + average_precision, + f1, + macro_f1, + mae, + micro_f1, + mrr, + r2, + rmse, + roc_auc, +) + +from .database import Database +from .dataset import Dataset +from .table import Table +from .task_base import TaskType +from .task_entity import EntityTask + +UNKNOWN_CLASS_LABEL = -1 + + +class AutoCompleteTask(EntityTask): + r"""Auto complete column task on a dataset. Predict all values in the target column. + + The task is constructed by specifying the entity table, entity column, time column, and target column. 
+ The target column is removed from the entity table and saved to `db.table_dict[entity_table].removed_cols`, + which is used to construct the table for the predict column task. + + The entity table needs to have a time column by which the data is split into training and validation set. + + Args: + dataset: The dataset object. + task_type: The type of the task. + entity_table: The name of the entity table. + target_col: The name of the target column to be predicted. + cache_dir: The directory to cache the task tables. + remove_columns: List of columns, table pairs to remove from the graph. + """ + + timedelta = pd.Timedelta(seconds=1) + entity_col: str + + def __init__( + self, + dataset: Dataset, + task_type: TaskType, + entity_table: str, + target_col: str, + cache_dir: Optional[str] = None, + remove_columns: list[tuple[str, str]] = [], + ): + super().__init__(dataset, cache_dir=cache_dir) + + self.task_type = task_type + self.entity_table = entity_table + self.target_col = target_col + self.remove_columns = remove_columns + self.dataset.target_col = target_col + self.dataset.entity_table = entity_table + self.dataset.remove_columns = remove_columns + self.dataset.get_db.cache_clear() # clear the cache as we will be modifying the database + db = self.dataset.get_db() + entity_col = db.table_dict[entity_table].pkey_col + self.entity_col = entity_col if entity_col is not None else "primary_key" + self.time_col = db.table_dict[self.entity_table].time_col + + if self.task_type == TaskType.REGRESSION: + self.metrics = [r2, mae, rmse] + elif self.task_type == TaskType.BINARY_CLASSIFICATION: + self.metrics = [average_precision, accuracy, f1, roc_auc] + self.num_classes = 2 + elif self.task_type == TaskType.MULTICLASS_CLASSIFICATION: + self.metrics = [accuracy, macro_f1, micro_f1, mrr] + removed_cols = db.table_dict[self.entity_table].removed_cols + db = db.upto(self.dataset.val_timestamp) + train_ids = db.table_dict[self.entity_table].df[self.entity_col].values + train_targets = removed_cols.loc[ + removed_cols[self.entity_col].isin(train_ids), self.target_col + ].values + # Encode the categories found in the training set to consecutive + # integers. Unseen categories are filtered out during evaluation. + self.target_encoder = OrdinalEncoder( + unknown_value=UNKNOWN_CLASS_LABEL, + handle_unknown="use_encoded_value", + dtype="int64", + ) + self.target_encoder.fit(train_targets.reshape(-1, 1)) + self.num_classes = self.target_encoder.categories_[0].shape[0] + else: + raise NotImplementedError(f"Task type {self.task_type} not implemented") + + def filter_dangling_entities(self, table: Table) -> Table: + db = self.dataset.get_db(upto_test_timestamp=False) + num_entities = len(db.table_dict[self.entity_table]) + filter_mask = table.df[self.entity_col] >= num_entities + + if filter_mask.any(): + table.df = table.df[~filter_mask] + + return table + + def _get_table(self, split: str) -> Table: + r"""Helper function to get a table for a split. + + This function overrides the `_get_table` method in `EntityTask`. + Because we predict all values in the target column, we only look at the min and max timestamp + for each split and take all rows in the table between them. 
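        Editor's illustrative sketch (not part of this patch), mirroring the window computation
        below for the "val" split; the val/test timestamps are invented and timedelta is the
        class default of one second.

        >>> import pandas as pd
        >>> val_ts = pd.Timestamp("2015-05-08")
        >>> test_ts = pd.Timestamp("2015-05-14")
        >>> delta = pd.Timedelta(seconds=1)
        >>> stamps = pd.date_range(start=test_ts - delta, end=val_ts, freq=-delta)
        >>> stamps.min(), stamps.max()
        (Timestamp('2015-05-08 00:00:00'), Timestamp('2015-05-13 23:59:59'))

        make_table() then keeps every entity-table row whose time column falls in
        (stamps.min(), stamps.max()], so the validation split covers
        (val_timestamp, test_timestamp - timedelta].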
+ """ + + db = self.dataset.get_db(upto_test_timestamp=split != "test") + + if split == "train": + start = self.dataset.val_timestamp - self.timedelta + end = db.min_timestamp + freq = -self.timedelta + + elif split == "val": + if self.dataset.val_timestamp + self.timedelta > db.max_timestamp: + raise RuntimeError( + "val timestamp + timedelta is larger than max timestamp! " + "This would cause val labels to be generated with " + "insufficient aggregation time." + ) + + start = self.dataset.test_timestamp - self.timedelta + end = self.dataset.val_timestamp + freq = -self.timedelta + + elif split == "test": + if self.dataset.test_timestamp + self.timedelta > db.max_timestamp: + raise RuntimeError( + "test timestamp + timedelta is larger than max timestamp! " + "This would cause test labels to be generated with " + "insufficient aggregation time." + ) + + start = db.max_timestamp + end = self.dataset.test_timestamp + freq = -self.timedelta + + timestamps = pd.date_range(start=start, end=end, freq=freq) + + if split == "train" and len(timestamps) < 3: + raise RuntimeError( + f"The number of training time frames is too few. " + f"({len(timestamps)} given)" + ) + + table = self.make_table(db, timestamps) + table = self.filter_dangling_entities(table) + + return table + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + entity_table = db.table_dict[self.entity_table].df # noqa: F841 + entity_table_removed_cols = db.table_dict[ # noqa: F841 + self.entity_table + ].removed_cols + + entity_col = db.table_dict[self.entity_table].pkey_col + + # Calculate minimum and maximum timestamps from timestamp_df + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + min_timestamp = timestamp_df["timestamp"].min() + max_timestamp = timestamp_df["timestamp"].max() + + df = duckdb.sql( + f""" + SELECT + entity_table.{self.time_col}, + entity_table.{entity_col}, + entity_table_removed_cols.{self.target_col} + FROM + entity_table + LEFT JOIN + entity_table_removed_cols + ON + entity_table.{entity_col} = entity_table_removed_cols.{entity_col} + WHERE + entity_table.{self.time_col} > '{min_timestamp}' AND + entity_table.{self.time_col} <= '{max_timestamp}' + """ + ).df() + + if self.task_type == TaskType.MULTICLASS_CLASSIFICATION: + df[self.target_col] = self.transform_target(df[self.target_col]) + + # remove rows where self.target_col is nan + df = df.dropna(subset=[self.target_col]) + + return Table( + df=df, + fkey_col_to_pkey_table={ + entity_col: self.entity_table, + }, + pkey_col=None, + time_col=self.time_col, + ) + + def transform_target(self, target_col: pd.Series) -> pd.Series: + transformed = self.target_encoder.transform( + target_col.values.reshape(-1, 1) + ).flatten() + transformed_target = pd.Series(transformed, index=target_col.index) + # set unknown labels to NaN to filter them out during evaluation + transformed_target[transformed == UNKNOWN_CLASS_LABEL] = np.nan + return transformed_target diff --git a/plexe/relbench/base/task_base.py b/plexe/relbench/base/task_base.py new file mode 100644 index 00000000..5b0c9a42 --- /dev/null +++ b/plexe/relbench/base/task_base.py @@ -0,0 +1,233 @@ +import time +from enum import Enum +from functools import lru_cache +from pathlib import Path +from typing import Callable, List, Optional + +import pandas as pd +from numpy.typing import NDArray + +from .database import Database +from .dataset import Dataset +from .table import Table + + +class TaskType(Enum): + r"""The type of the task. 
+ + Attributes: + REGRESSION: Regression task. + MULTICLASS_CLASSIFICATION: Multi-class classification task. + BINARY_CLASSIFICATION: Binary classification task. + MULTILABEL_CLASSIFICATION: Multi-label classification task. + LINK_PREDICTION: Link prediction task." + """ + + REGRESSION = "regression" + BINARY_CLASSIFICATION = "binary_classification" + MULTICLASS_CLASSIFICATION = "multiclass_classification" + MULTILABEL_CLASSIFICATION = "multilabel_classification" + LINK_PREDICTION = "link_prediction" + + +class BaseTask: + r"""Base class for a task on a dataset. + + Attributes: + task_type: The type of the task. + timedelta: The prediction task at `timestamp` is over the time window + (timestamp, timestamp + timedelta]. + num_eval_timestamps: The number of evaluation time windows. e.g., test + time windows are (test_timestamp, test_timestamp + timedelta] ... + (test_timestamp + (num_eval_timestamps - 1) * timedelta, test_timestamp + + num_eval_timestamps * timedelta]. + metrics: The metrics to evaluate this task on. + + Inherited by EntityTask and RecommendationTask. + """ + + # To be set by subclass. + task_type: TaskType + timedelta: pd.Timedelta + num_eval_timestamps: int = 1 + metrics: List[Callable[[NDArray, NDArray], float]] + + def __init__( + self, + dataset: Dataset, + cache_dir: Optional[str] = None, + ): + r"""Create a task object. + + Args: + dataset: The dataset object on which the task is defined. + cache_dir: A directory for caching the task table objects. If specified, + we will either process and cache the file (if not available) or use + the cached file. If None, we will not use cached file and re-process + everything from scratch without saving the cache. + """ + self.dataset = dataset + self.cache_dir = cache_dir + + time_diff = self.dataset.test_timestamp - self.dataset.val_timestamp + if time_diff < self.timedelta: + raise ValueError( + f"timedelta cannot be larger than the difference between val " + f"and test timestamps (timedelta: {self.timedelta}, time " + f"diff: {time_diff})." + ) + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(dataset={repr(self.dataset)})" + + def make_table( + self, + db: Database, + timestamps: "pd.Series[pd.Timestamp]", + ) -> Table: + r"""Make a table using the task definition. + + Args: + db: The database object to use for (historical) ground truth. + timestamps: Collection of timestamps to compute labels for. A label can be + computed for a timestamp using historical data + upto this timestamp in the database. + + To be implemented by subclass. The table rows need not be ordered + deterministically. + """ + + raise NotImplementedError + + def _get_table(self, split: str) -> Table: + r"""Helper function to get a table for a split.""" + + db = self.dataset.get_db(upto_test_timestamp=split != "test") + + if split == "train": + start = self.dataset.val_timestamp - self.timedelta + end = db.min_timestamp + freq = -self.timedelta + + elif split == "val": + if self.dataset.val_timestamp + self.timedelta > db.max_timestamp: + raise RuntimeError( + "val timestamp + timedelta is larger than max timestamp! " + "This would cause val labels to be generated with " + "insufficient aggregation time." 
+ ) + + start = self.dataset.val_timestamp + end = min( + self.dataset.val_timestamp + + self.timedelta * (self.num_eval_timestamps - 1), + self.dataset.test_timestamp - self.timedelta, + ) + freq = self.timedelta + + elif split == "test": + if self.dataset.test_timestamp + self.timedelta > db.max_timestamp: + raise RuntimeError( + "test timestamp + timedelta is larger than max timestamp! " + "This would cause test labels to be generated with " + "insufficient aggregation time." + ) + + start = self.dataset.test_timestamp + end = min( + self.dataset.test_timestamp + + self.timedelta * (self.num_eval_timestamps - 1), + db.max_timestamp - self.timedelta, + ) + freq = self.timedelta + + timestamps = pd.date_range(start=start, end=end, freq=freq) + + if split == "train" and len(timestamps) < 3: + raise RuntimeError( + f"The number of training time frames is too few. " + f"({len(timestamps)} given)" + ) + + table = self.make_table(db, timestamps) + table = self.filter_dangling_entities(table) + + return table + + @lru_cache(maxsize=None) + def get_table(self, split, mask_input_cols=None): + r"""Get a table for a split. + + Args: + split: The split to get the table for. One of "train", "val", or "test". + mask_input_cols: If True, keep only the input columns in the table. If + None, mask the input columns only for the test split. This helps + prevent data leakage. + + Returns: + The task table for the split. + + The table is cached in memory. + """ + + if mask_input_cols is None: + mask_input_cols = split == "test" + + table_path = f"{self.cache_dir}/{split}.parquet" + if self.cache_dir and Path(table_path).exists(): + table = Table.load(table_path) + else: + print(f"Making task table for {split} split from scratch...") + # print( + # "(You can also use `get_task(..., download=True)` " + # "for tasks prepared by the RelBench team.)" + # ) + tic = time.time() + table = self._get_table(split) + toc = time.time() + print(f"Done in {toc - tic:.2f} seconds.") + + if self.cache_dir: + table.save(table_path) + + if mask_input_cols: + table = self._mask_input_cols(table) + + return table + + def _mask_input_cols(self, table: Table) -> Table: + input_cols = [ + table.time_col, + *table.fkey_col_to_pkey_table.keys(), + ] + return Table( + df=table.df[input_cols], + fkey_col_to_pkey_table=table.fkey_col_to_pkey_table, + pkey_col=table.pkey_col, + time_col=table.time_col, + ) + + def filter_dangling_entities(self, table: Table) -> Table: + r"""Filter out dangling entities from a table. + + Implemented by EntityTask and RecommendationTask. + """ + raise NotImplementedError + + def evaluate( + self, + pred: NDArray, + target_table: Optional[Table] = None, + metrics: Optional[List[Callable[[NDArray, NDArray], float]]] = None, + ): + r"""Evaluate predictions on the task. + + Args: + pred: Predictions as a numpy array. + target_table: The target table. If None, use the test table. + metrics: The metrics to evaluate the prediction table. If None, use + the default metrics for the task. + + Implemented by EntityTask and RecommendationTask. 
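        Illustrative sketch (editor's addition, not part of this patch): `task` is a hypothetical
        regression EntityTask; the returned keys depend on the task's `metrics` list and the
        metric values are elided.

        >>> import numpy as np
        >>> val_table = task.get_table("val", mask_input_cols=False)
        >>> pred = np.zeros(len(val_table))                # placeholder predictions
        >>> task.evaluate(pred, target_table=val_table)
        {'r2': ..., 'mae': ..., 'rmse': ...}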
+ """ + raise NotImplementedError diff --git a/plexe/relbench/base/task_entity.py b/plexe/relbench/base/task_entity.py new file mode 100644 index 00000000..3b5d3088 --- /dev/null +++ b/plexe/relbench/base/task_entity.py @@ -0,0 +1,155 @@ +from __future__ import annotations + +from typing import Any, Callable, Dict, List, Optional + +import numpy as np +import pandas as pd +from numpy.typing import NDArray + +from .table import Table +from .task_base import BaseTask, TaskType + + +class EntityTask(BaseTask): + r"""A node prediction task on a dataset. + + Attributes: + entity_col: The entity column. + entity_table: The entity table. + time_col: The time column. + target_col: The target column. + + Other attributes are inherited from BaseTask. + """ + + entity_col: str + entity_table: str + time_col: str + target_col: str + task_type: TaskType + timedelta: pd.Timedelta + metrics: List[Callable[[NDArray, NDArray], float]] + num_eval_timestamps: int = 1 + + def filter_dangling_entities(self, table: Table) -> Table: + db = self.dataset.get_db() + num_entities = len(db.table_dict[self.entity_table]) + filter_mask = table.df[self.entity_col] >= num_entities + + if filter_mask.any(): + table.df = table.df[~filter_mask] + + return table + + def evaluate( + self, + pred: NDArray, + target_table: Optional[Table] = None, + metrics: Optional[List[Callable[[NDArray, NDArray], float]]] = None, + ) -> Dict[str, float]: + if metrics is None: + metrics = self.metrics + + if target_table is None: + target_table = self.get_table("test", mask_input_cols=False) + + target = target_table.df[self.target_col].to_numpy() + if len(pred) != len(target): + raise ValueError( + f"The length of pred and target must be the same (got " + f"{len(pred)} and {len(target)}, respectively)." + ) + + return {fn.__name__: fn(target, pred) for fn in metrics} + + def stats(self) -> Dict[str, Dict[str, Any]]: + r"""Get train / val / test table statistics for each timestamp + and the whole table, including number of rows and number of entities. + Tasks with different task types have different statistics computed: + + BINARY_CLASSIFICATION: Number of positives and negatives. + REGRESSION: Minimum, maximum, mean, median, quantile 25 and, + quantile 75 of the target values. + MULTILABEL_CLASSIFICATION: Mean, minimum and maximum number of + classes per entity. Number and index of classes having minimum + and maximum number of classes. 
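        Illustrative sketch of the returned structure (editor's addition, not part of this patch),
        for a hypothetical binary classification task; the timestamp key and all values are elided.

        >>> task.stats()
        {'train': {'2015-05-01 00:00:00': {'num_rows': ..., 'num_unique_entities': ...,
                                           'num_positives': ..., 'num_negatives': ...},
                   'total': {...}},
         'val': {...},
         'test': {...},
         'total': {'num_positives': ..., 'num_negatives': ...,
                   'ratio_train_test_entity_overlap': ...}}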
+ """ + res = {} + for split in ["train", "val", "test"]: + table = self.get_table(split, mask_input_cols=False) + timestamps = table.df[self.time_col].unique() + split_stats = {} + for timestamp in timestamps: + temp_df = table.df[table.df[self.time_col] == timestamp] + stats = { + "num_rows": len(temp_df), + "num_unique_entities": temp_df[self.entity_col].nunique(), + } + self._set_stats(temp_df, stats) + split_stats[str(timestamp)] = stats + split_stats["total"] = { + "num_rows": len(table.df), + "num_unique_entities": table.df[self.entity_col].nunique(), + } + self._set_stats(table.df, split_stats["total"]) + res[split] = split_stats + total_df = pd.concat( + [ + table.df + for table in [ + self.get_table(split, mask_input_cols=False) + for split in ["train", "val", "test"] + ] + if table is not None + ] + ) + res["total"] = {} + self._set_stats(total_df, res["total"]) + train_uniques = set(self.get_table("train").df[self.entity_col].unique()) + test_uniques = set( + self.get_table("test", mask_input_cols=False).df[self.entity_col].unique() + ) + ratio_train_test_entity_overlap = len( + train_uniques.intersection(test_uniques) + ) / len(test_uniques) + res["total"][ + "ratio_train_test_entity_overlap" + ] = ratio_train_test_entity_overlap + return res + + def _set_stats(self, df: pd.DataFrame, stats: dict[str, Any]) -> None: + if self.task_type == TaskType.BINARY_CLASSIFICATION: + self._set_binary_stats(df, stats) + elif self.task_type == TaskType.REGRESSION: + self._set_regression_stats(df, stats) + elif self.task_type == TaskType.MULTILABEL_CLASSIFICATION: + self._set_multilabel_stats(df, stats) + else: + raise ValueError(f"Unsupported task type {self.task_type}") + + def _set_binary_stats(self, df: pd.DataFrame, stats: dict[str, Any]) -> None: + stats["num_positives"] = (df[self.target_col] == 1).sum() + stats["num_negatives"] = (df[self.target_col] == 0).sum() + + def _set_regression_stats(self, df: pd.DataFrame, stats: dict[str, Any]) -> None: + stats["min_target"] = df[self.target_col].min() + stats["max_target"] = df[self.target_col].max() + stats["mean_target"] = df[self.target_col].mean() + quantiles = df[self.target_col].quantile([0.25, 0.5, 0.75]) + stats["quantile_25_target"] = quantiles.iloc[0] + stats["median_target"] = quantiles.iloc[1] + stats["quantile_75_target"] = quantiles.iloc[2] + + def _set_multilabel_stats(self, df: pd.DataFrame, stats: dict[str, Any]) -> None: + arr = np.array([row for row in df[self.target_col]]) + arr_row = arr.sum(1) + stats["mean_num_classes_per_entity"] = round(arr_row.mean(), 4) + stats["max_num_classes_per_entity"] = arr_row.max() + stats["min_num_classes_per_entity"] = arr_row.min() + arr_class = arr.sum(0) + max_num_class_idx = arr_class.argmax() + stats["max_num_class_idx"] = max_num_class_idx + stats["max_num_class_num"] = arr_class[max_num_class_idx] + min_num_class_idx = arr_class.argmin() + stats["min_num_class_idx"] = min_num_class_idx + stats["min_num_class_num"] = arr_class[min_num_class_idx] diff --git a/plexe/relbench/base/task_recommendation.py b/plexe/relbench/base/task_recommendation.py new file mode 100644 index 00000000..12259d4f --- /dev/null +++ b/plexe/relbench/base/task_recommendation.py @@ -0,0 +1,191 @@ +from __future__ import annotations + +from typing import Callable, Dict, List, Optional + +import numpy as np +import pandas as pd +from numpy.typing import NDArray + +from .dataset import Dataset +from .table import Table +from .task_base import BaseTask, TaskType + + +class RecommendationTask(BaseTask): + r"""A 
link prediction task on a dataset. + + Attributes: + src_entity_col: The source entity column. + src_entity_table: The source entity table. + dst_entity_col: The destination entity column. + dst_entity_table: The destination entity table. + time_col: The time column. + eval_k: k for eval@k metrics. + + Other attributes are inherited from BaseTask. + """ + + src_entity_col: str + src_entity_table: str + dst_entity_col: str + dst_entity_table: str + time_col: str + eval_k: int + task_type: TaskType + timedelta: pd.Timedelta + metrics: List[Callable[[NDArray, NDArray], float]] + num_eval_timestamps: int = 1 + + def __init__( + self, + dataset: Dataset, + cache_dir: Optional[str] = None, + ): + if self.num_eval_timestamps != 1: + raise NotImplementedError( + "RecommendationTask currently only supports num_eval_timestamps=1." + ) + super().__init__(dataset, cache_dir) + + def filter_dangling_entities(self, table: Table) -> Table: + # filter dangling destination entities from a list + table.df[self.dst_entity_col] = table.df[self.dst_entity_col].apply( + lambda x: [i for i in x if i < self.num_dst_nodes] + ) + + # filter dangling source entities and empty list (after above filtering) + filter_mask = (table.df[self.src_entity_col] >= self.num_src_nodes) | ( + ~table.df[self.dst_entity_col].map(bool) + ) + + if filter_mask.any(): + table.df = table.df[~filter_mask] + table.df = table.df.reset_index(drop=True) + + return table + + def evaluate( + self, + pred: NDArray, + target_table: Optional[Table] = None, + metrics: Optional[List[Callable[[NDArray, NDArray], float]]] = None, + ) -> Dict[str, float]: + if metrics is None: + metrics = self.metrics + + if target_table is None: + target_table = self.get_table("test", mask_input_cols=False) + + expected_pred_shape = (len(target_table), self.eval_k) + if pred.shape != expected_pred_shape: + raise ValueError( + f"The shape of pred must be {expected_pred_shape}, but " + f"{pred.shape} given." 
+ ) + + pred_isin_list = [] + dst_count_list = [] + for true_dst_nodes, pred_dst_nodes in zip( + target_table.df[self.dst_entity_col], + pred, + ): + pred_isin_list.append( + np.isin(np.array(pred_dst_nodes), np.array(true_dst_nodes)) + ) + dst_count_list.append(len(true_dst_nodes)) + pred_isin = np.stack(pred_isin_list) + dst_count = np.array(dst_count_list) + + return {fn.__name__: fn(pred_isin, dst_count) for fn in metrics} + + @property + def num_src_nodes(self) -> int: + return len(self.dataset.get_db().table_dict[self.src_entity_table]) + + @property + def num_dst_nodes(self) -> int: + return len(self.dataset.get_db().table_dict[self.dst_entity_table]) + + def stats(self) -> Dict[str, Dict[str, int]]: + r"""Get train / val / test table statistics for each timestamp and the whole + table, including number of unique source entities, number of unique destination + entities, number of destination entities and number of rows.""" + + res = {} + for split in ["train", "val", "test"]: + split_stats = {} + table = self.get_table(split, mask_input_cols=False) + timestamps = table.df[self.time_col].unique() + for timestamp in timestamps: + temp_df = table.df[table.df[self.time_col] == timestamp] + ( + num_unique_src_entities, + num_unique_dst_entities, + num_dst_entities, + num_rows, + ) = self._get_stats(temp_df) + split_stats[str(timestamp)] = { + "num_unique_src_entities": num_unique_src_entities, + "num_unique_dst_entities": num_unique_dst_entities, + "num_dst_entities": num_dst_entities, + "num_rows": num_rows, + } + + ( + num_unique_src_entities, + num_unique_dst_entities, + num_dst_entities, + num_rows, + ) = self._get_stats(table.df) + split_stats["total"] = { + "num_unique_src_entities": num_unique_src_entities, + "num_unique_dst_entities": num_unique_dst_entities, + "num_dst_entities": num_dst_entities, + "num_rows": num_rows, + } + res[split] = split_stats + total_df = pd.concat( + [ + table.df + for table in [ + self.get_table("train"), + self.get_table("val"), + self.get_table("test"), + ] + if table is not None + ] + ) + num_unique_src_entities, num_unique_dst_entities, num_dst_entities, num_rows = ( + self._get_stats(total_df) + ) + res["total"] = { + "num_unique_src_entities": num_unique_src_entities, + "num_unique_dst_entities": num_unique_dst_entities, + "num_dst_entities": num_dst_entities, + "num_rows": num_rows, + } + train_uniques = set(self.get_table("train").df[self.src_entity_col].unique()) + if self.get_table("test") is None: + return res + test_uniques = set(self.get_table("test").df[self.src_entity_col].unique()) + ratio_train_test_entity_overlap = len( + train_uniques.intersection(test_uniques) + ) / len(test_uniques) + res["total"][ + "ratio_train_test_entity_overlap" + ] = ratio_train_test_entity_overlap + return res + + def _get_stats(self, df: pd.DataFrame) -> List[int]: + num_unique_src_entities = df[self.src_entity_col].nunique() + num_unique_dst_entities = len( + set(value for row in df[self.dst_entity_col] for value in row) + ) + num_dst_entities = sum(len(row) for row in df[self.dst_entity_col]) + num_rows = len(df) + return ( + num_unique_src_entities, + num_unique_dst_entities, + num_dst_entities, + num_rows, + ) diff --git a/plexe/relbench/datasets/__init__.py b/plexe/relbench/datasets/__init__.py new file mode 100644 index 00000000..07371c2d --- /dev/null +++ b/plexe/relbench/datasets/__init__.py @@ -0,0 +1,75 @@ +import json +import pkgutil +from functools import lru_cache +from typing import List + +import pooch + +from ..base import Dataset +from 
..datasets import ( + amazon, + avito, + event, + f1, + hm, + stack, + trial, +) + +dataset_registry = {} + + +def register_dataset( + name: str, + cls: Dataset, + *args, + **kwargs, +) -> None: + r"""Register an instantiation of a :class:`Dataset` subclass with the given name. + + Args: + name: The name of the dataset. + cls: The class of the dataset. + args: The arguments to instantiate the dataset. + kwargs: The keyword arguments to instantiate the dataset. + + The name is used to enable caching and downloading functionalities. + `cache_dir` is added to kwargs by default. If you want to override it, you + can pass `cache_dir` as a keyword argument in `kwargs`. + """ + + cache_dir = f"{pooch.os_cache('relbench')}/{name}" + kwargs = {"cache_dir": cache_dir, **kwargs} + dataset_registry[name] = (cls, args, kwargs) + + +def get_dataset_names() -> List[str]: + r"""Return a list of names of the registered datasets.""" + return list(dataset_registry.keys()) + +@lru_cache(maxsize=None) +def get_dataset(name: str, download=True) -> Dataset: + r"""Return a dataset object by name. + + Args: + name: The name of the dataset. + download: If True, download the dataset from the RelBench server. + + Returns: + Dataset: The dataset object. + + If `download` is True, the database comprising the dataset will be + downloaded into the cache from the RelBench server. If you use + `download=False` the first time, the database will be processed from the + raw files of the original source. + + Once the database is cached, either because of download or processing from + raw files, the cache will be used. `download=True` will verify that the + cached database matches the RelBench version even in this case. + """ + + + cls, args, kwargs = dataset_registry[name] + + dataset = cls(*args, **kwargs) + return dataset diff --git a/plexe/relbench/datasets/amazon.py b/plexe/relbench/datasets/amazon.py new file mode 100644 index 00000000..407fb917 --- /dev/null +++ b/plexe/relbench/datasets/amazon.py @@ -0,0 +1,228 @@ +import time + +import pandas as pd +import pooch +import pyarrow as pa +import pyarrow.json + +from ..base import Database, Dataset, Table + + +class AmazonDataset(Dataset): + val_timestamp = pd.Timestamp("2015-10-01") + test_timestamp = pd.Timestamp("2016-01-01") + + url_prefix = "https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2" + _category_to_url_key = {"books": "Books", "fashion": "AMAZON_FASHION"} + + known_hashes = { + "meta_Books.json.gz": "80ed7ac64f5967a140401e8d7bf0587d2e5087492de9e94077a7f554ef6b18f0", + "Books_5.json.gz": "ded924d1d1a22bae499f1a1c2b39397104304bfdb24232a2dd0aa50e89cd37bb", + } + + def __init__( + self, + category: str = "books", + use_5_core: bool = True, + cache_dir: str = None, + ): + self.category = category + self.use_5_core = use_5_core + super().__init__(cache_dir=cache_dir) + + def make_db(self) -> Database: + r"""Process the raw files into a database.""" + + ### product table ### + + url_key = self._category_to_url_key[self.category] + url = f"{self.url_prefix}/metaFiles2/meta_{url_key}.json.gz" + path = pooch.retrieve( + url, + known_hash=self.known_hashes.get(url.split("/")[-1], None), + progressbar=True, + processor=pooch.Decompress(), + ) + print(f"reading product info from {path}...") + tic = time.time() + ptable = pa.json.read_json( + path, + parse_options=pa.json.ParseOptions( + explicit_schema=pa.schema( + [ + ("asin", pa.string()), + ("category", pa.list_(pa.string())), + ("brand", pa.string()), + ("title", pa.string()), + ("description", 
pa.list_(pa.string())), + ("price", pa.string()), + ] + ), + unexpected_field_behavior="ignore", + ), + ) + toc = time.time() + print(f"done in {toc - tic:.2f} seconds.") + + print("converting to pandas dataframe...") + tic = time.time() + pdf = ptable.to_pandas() + toc = time.time() + print(f"done in {toc - tic:.2f} seconds.") + + print("processing product info...") + tic = time.time() + + # asin is not intuitive / recognizable + pdf.rename(columns={"asin": "product_id"}, inplace=True) + + # somehow the raw data has duplicate product_id's + pdf.drop_duplicates(subset=["product_id"], inplace=True) + + # price is like "$x,xxx.xx", "$xx.xx", or "$xx.xx - $xx.xx", or garbage html + # if it's a range, we take the first value + pdf.loc[:, "price"] = pdf["price"].apply( + lambda x: ( + None + if x is None or x == "" or x[0] != "$" + else float(x.split(" ")[0][1:].replace(",", "")) + ) + ) + + # remove products with missing price + pdf = pdf.dropna(subset=["price"]) + + pdf.loc[:, "category"] = pdf["category"].apply( + lambda x: None if x is None or len(x) == 0 else x + ) + + # some rows are stored as ['cat1' 'cat2' 'cat3' ...] + # this function maps them to ['cat1', 'cat2', 'cat3', ...] (list of strings) + # since otherwise pytorch-frame breaks + def fix_column(value): + if isinstance(value, str): + return value # Already a string + elif value is None: + return None + else: + return list(value) + + pdf["category"] = pdf["category"].apply(fix_column) + + # description is either [] or ["some description"] + pdf.loc[:, "description"] = pdf["description"].apply( + lambda x: None if x is None or len(x) == 0 else x[0] + ) + + toc = time.time() + print(f"done in {toc - tic:.2f} seconds.") + + ### review table ### + + if self.use_5_core: + url = f"{self.url_prefix}/categoryFilesSmall/{url_key}_5.json.gz" + else: + url = f"{self.url_prefix}/categoryFiles/{url_key}.json.gz" + path = pooch.retrieve( + url, + known_hash=self.known_hashes.get(url.split("/")[-1], None), + progressbar=True, + processor=pooch.Decompress(), + ) + print(f"reading review and customer info from {path}...") + tic = time.time() + rtable = pa.json.read_json( + path, + parse_options=pa.json.ParseOptions( + explicit_schema=pa.schema( + [ + ("unixReviewTime", pa.int32()), + ("reviewerID", pa.string()), + ("reviewerName", pa.string()), + ("asin", pa.string()), + ("overall", pa.float32()), + ("verified", pa.bool_()), + ("reviewText", pa.string()), + ("summary", pa.string()), + ] + ), + unexpected_field_behavior="ignore", + ), + ) + toc = time.time() + print(f"done in {toc - tic:.2f} seconds.") + + print("converting to pandas dataframe...") + tic = time.time() + rdf = rtable.to_pandas() + toc = time.time() + print(f"done in {toc - tic:.2f} seconds.") + + print("processing review and customer info...") + tic = time.time() + + rdf.rename( + columns={ + "unixReviewTime": "review_time", + "reviewerID": "customer_id", + "reviewerName": "customer_name", + "asin": "product_id", + "overall": "rating", + "reviewText": "review_text", + }, + inplace=True, + ) + + rdf.loc[:, "review_time"] = pd.to_datetime(rdf["review_time"], unit="s") + + toc = time.time() + print(f"done in {toc - tic:.2f} seconds.") + + print("keeping only products common to product and review tables...") + tic = time.time() + plist = list(set(pdf["product_id"]) & set(rdf["product_id"])) + pdf.query("product_id == @plist", inplace=True) + rdf.query("product_id == @plist", inplace=True) + toc = time.time() + print(f"done in {toc - tic:.2f} seconds.") + + print("extracting customer 
table...") + tic = time.time() + cdf = ( + rdf[["customer_id", "customer_name"]] + .drop_duplicates(subset=["customer_id"]) + .copy() + ) + rdf.drop(columns=["customer_name"], inplace=True) + toc = time.time() + print(f"done in {toc - tic:.2f} seconds.") + + db = Database( + table_dict={ + "product": Table( + df=pdf, + fkey_col_to_pkey_table={}, + pkey_col="product_id", + time_col=None, + ), + "customer": Table( + df=cdf, + fkey_col_to_pkey_table={}, + pkey_col="customer_id", + time_col=None, + ), + "review": Table( + df=rdf, + fkey_col_to_pkey_table={ + "customer_id": "customer", + "product_id": "product", + }, + pkey_col=None, + time_col="review_time", + ), + } + ) + + db = db.from_(pd.Timestamp("2008-01-01")) + + return db diff --git a/plexe/relbench/datasets/avito.py b/plexe/relbench/datasets/avito.py new file mode 100644 index 00000000..5419d796 --- /dev/null +++ b/plexe/relbench/datasets/avito.py @@ -0,0 +1,134 @@ +import os + +import pandas as pd +import pooch + +from ..base import Database, Dataset, Table +from ..utils import clean_datetime, unzip_processor + + +class AvitoDataset(Dataset): + """Original data source: + https://www.kaggle.com/competitions/avito-context-ad-clicks""" + + # search stream ranges from 2015-04-25 to 2015-05-20 + val_timestamp = pd.Timestamp("2015-05-08") + test_timestamp = pd.Timestamp("2015-05-14") + + def make_db(self) -> Database: + # subsampled version of the original dataset + # Customize path as necessary + r"""Process the raw files into a database.""" + url = "https://relbench.stanford.edu/data/rel-avito-raw-100k.zip" + path = pooch.retrieve( + url, + known_hash="ad4fc1789d8a5073ea449049888c671899525c9a8a42359ca75d1f17d04d7929", + progressbar=True, + processor=unzip_processor, + ) + path = os.path.join(path, "avito_100k_integ_test") + + # Define table names + ads_info = os.path.join(path, "AdsInfo") + category = os.path.join(path, "Category") + location = os.path.join(path, "Location") + phone_requests_stream = os.path.join(path, "PhoneRequestsStream") + search_info = os.path.join(path, "SearchInfo") + search_stream = os.path.join(path, "SearchStream") + user_info = os.path.join(path, "UserInfo") + visit_stream = os.path.join(path, "VisitStream") + if not os.path.exists(ads_info): + raise RuntimeError( + self.err_msg.format(data="Dataset", url=self.url, path=path) + ) + + # Load table as pandas dataframes + ads_info_df = pd.read_parquet(ads_info) + ads_info_df.dropna(subset=["AdID"], inplace=True) + # Params column contains a dictionary of type Dict[int, str]. + # Drop it for now since we can not handle this column type yet. + ads_info_df.drop(columns=["Params"], inplace=True) + ads_info_df["Title"].fillna("", inplace=True) + category_df = pd.read_parquet(category) + location_df = pd.read_parquet(location) + location_df.dropna(subset=["LocationID"], inplace=True) + phone_requests_stream_df = pd.read_parquet(phone_requests_stream) + search_info_df = pd.read_parquet(search_info) + # SearchParams column contains a dictionary of type Dict[int, str]. + # Drop it for now since we can not handle this column type yet. 
+ search_info_df.drop(columns=["SearchParams"], inplace=True) + search_stream_df = pd.read_parquet(search_stream) + user_info_df = pd.read_parquet(user_info) + visit_stream_df = pd.read_parquet(visit_stream) + search_info_df = clean_datetime(search_info_df, "SearchDate") + search_stream_df = clean_datetime(search_stream_df, "SearchDate") + phone_requests_stream_df = clean_datetime( + phone_requests_stream_df, "PhoneRequestDate" + ) + visit_stream_df = clean_datetime(visit_stream_df, "ViewDate") + + category_df.drop(columns=["__index_level_0__"], inplace=True) + + tables = {} + tables["AdsInfo"] = Table( + df=ads_info_df, + fkey_col_to_pkey_table={ + "LocationID": "Location", + "CategoryID": "Category", + }, + pkey_col="AdID", + ) + tables["Category"] = Table( + df=category_df, + fkey_col_to_pkey_table={}, + pkey_col="CategoryID", + ) + tables["Location"] = Table( + df=location_df, + fkey_col_to_pkey_table={}, + pkey_col="LocationID", + ) + tables["PhoneRequestsStream"] = Table( + df=phone_requests_stream_df, + fkey_col_to_pkey_table={ + "UserID": "UserInfo", + "AdID": "AdsInfo", + }, + time_col="PhoneRequestDate", + ) + tables["SearchInfo"] = Table( + df=search_info_df, + fkey_col_to_pkey_table={ + "UserID": "UserInfo", + "LocationID": "Location", + "CategoryID": "Category", + }, + pkey_col="SearchID", + time_col="SearchDate", + ) + tables["SearchStream"] = Table( + df=search_stream_df, + fkey_col_to_pkey_table={ + "SearchID": "SearchInfo", + "AdID": "AdsInfo", + }, + time_col="SearchDate", + ) + tables["UserInfo"] = Table( + df=user_info_df, + fkey_col_to_pkey_table={}, + pkey_col="UserID", + ) + tables["VisitStream"] = Table( + df=visit_stream_df, + fkey_col_to_pkey_table={ + "UserID": "UserInfo", + "AdID": "AdsInfo", + }, + time_col="ViewDate", + ) + db = Database(tables) + + db = db.from_(pd.Timestamp("2015-04-25")) + + return db diff --git a/plexe/relbench/datasets/dbinfer.py b/plexe/relbench/datasets/dbinfer.py new file mode 100644 index 00000000..6c0fa4ec --- /dev/null +++ b/plexe/relbench/datasets/dbinfer.py @@ -0,0 +1,113 @@ +import os +from typing import Dict + +import pandas as pd +import pooch + +from ..base import Database, Dataset, Table + +DEFAULT_DBINFER_ADAPTER_CACHE = os.path.join( + pooch.os_cache("relbench"), "dbinfer-adapters" +) + + +class DBInferDatasetBase(Dataset): + """Materialize a 4DBInfer dataset as a RelBench Database.""" + + dbinfer_name: str | None = None + default_task_name: str | None = None + + # DBInfer datasets are not time-sliced, so we set placeholder timestamps that + # satisfy the Dataset API without affecting static tables. + val_timestamp = pd.Timestamp("1970-01-01") + test_timestamp = pd.Timestamp("1970-01-02") + + def __init__( + self, + cache_dir: str | None = None, + adapter_cache_dir: str | None = None, + ): + super().__init__(cache_dir=cache_dir) + if not self.dbinfer_name or not self.default_task_name: + raise ValueError( + "DBInferDatasetBase subclasses must define 'dbinfer_name' and " + "'default_task_name'." 
+ ) + if adapter_cache_dir is None: + adapter_cache_dir = DEFAULT_DBINFER_ADAPTER_CACHE + self._adapter_cache_dir = adapter_cache_dir + + def _load_dataset_adapter(self): + from dbinfer_relbench_adapter.loader import load_dbinfer_data + + dataset_adapter, _ = load_dbinfer_data( + dataset_name=self.dbinfer_name, + task_name=self.default_task_name, + use_cache=True, + cache_dir=self._adapter_cache_dir, + ) + return dataset_adapter + + def _build_table_dict(self) -> Dict[str, Table]: + dataset_adapter = self._load_dataset_adapter() + mock_db = dataset_adapter.get_db() + + table_dict: Dict[str, Table] = {} + for name, mock_table in mock_db.table_dict.items(): + table_dict[name] = Table( + df=mock_table.df.copy(), + fkey_col_to_pkey_table=getattr( + mock_table, "fkey_col_to_pkey_table", {} + ), + pkey_col=getattr(mock_table, "pkey_col", None), + time_col=getattr(mock_table, "time_col", None), + ) + return table_dict + + def make_db(self) -> Database: + return Database(table_dict=self._build_table_dict()) + + def get_db(self, upto_test_timestamp: bool = True) -> Database: + """DBInfer datasets are static, so never trim by timestamp.""" + + return super().get_db(upto_test_timestamp=False) + + +class DBInferAVSDataset(DBInferDatasetBase): + dbinfer_name = "avs" + default_task_name = "repeater" + + +class DBInferMAGDataset(DBInferDatasetBase): + dbinfer_name = "mag" + default_task_name = "cite" + + +class DBInferDigineticaDataset(DBInferDatasetBase): + dbinfer_name = "diginetica" + default_task_name = "ctr" + + +class DBInferRetailRocketDataset(DBInferDatasetBase): + dbinfer_name = "retailrocket" + default_task_name = "cvr" + + +class DBInferSeznamDataset(DBInferDatasetBase): + dbinfer_name = "seznam" + default_task_name = "charge" + + +class DBInferAmazonDataset(DBInferDatasetBase): + dbinfer_name = "amazon" + default_task_name = "rating" + + +class DBInferStackExchangeDataset(DBInferDatasetBase): + dbinfer_name = "stackexchange" + default_task_name = "churn" + + +class DBInferOutbrainSmallDataset(DBInferDatasetBase): + dbinfer_name = "outbrain-small" + default_task_name = "ctr" diff --git a/plexe/relbench/datasets/event.py b/plexe/relbench/datasets/event.py new file mode 100644 index 00000000..1ccd2ab1 --- /dev/null +++ b/plexe/relbench/datasets/event.py @@ -0,0 +1,183 @@ +import os +import shutil +from pathlib import Path + +import pandas as pd + +from ..base import Database, Dataset, Table +from ..utils import decompress_gz_file + + +class EventDataset(Dataset): + url = "https://www.kaggle.com/competitions/event-recommendation-engine-challenge" # noqa + err_msg = ( + "{data} not found. Please download " + "event-recommendation-engine-challenge.zip from " + "'{url}' and move it to '{path}'. 
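# Rough usage sketch, assuming the optional dbinfer_relbench_adapter package is
# installed and the module layout from this diff; the cache path is illustrative.
from plexe.relbench.datasets.dbinfer import DBInferMAGDataset

dataset = DBInferMAGDataset(cache_dir="/tmp/relbench-cache")
db = dataset.get_db()  # static dataset, so no timestamp trimming is applied
for name, table in db.table_dict.items():
    print(name, table.df.shape, table.pkey_col, table.time_col)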
Once you have your" + "Kaggle API key, you can use the following command: " + "kaggle competitions download -c event-recommendation-engine-challenge" + ) + + val_timestamp = pd.Timestamp("2012-11-21") + test_timestamp = pd.Timestamp("2012-11-29") + + def check_table_and_decompress_if_exists(self, table_path: str, alt_path: str = ""): + if not os.path.exists(table_path) or ( + alt_path != "" and not os.path.exists(alt_path) + ): + if os.path.exists(table_path + ".gz"): + decompress_gz_file(table_path + ".gz", table_path) + else: + self.err_msg.format(data=table_path, url=self.url, path=table_path) + + def make_db(self) -> Database: + path = os.path.join("data", "rel-event") + zip_path = os.path.join(path, "event-recommendation-engine-challenge.zip") + users = os.path.join(path, "users.csv") + user_friends = os.path.join(path, "user_friends.csv") + events = os.path.join(path, "events.csv") + event_attendees = os.path.join(path, "event_attendees.csv") + if not (os.path.exists(users)): + if not os.path.exists(zip_path): + raise RuntimeError( + self.err_msg.format(data="Dataset", url=self.url, path=zip_path) + ) + else: + shutil.unpack_archive(zip_path, Path(zip_path).parent) + self.check_table_and_decompress_if_exists( + user_friends, os.path.join(path, "user_friends_flattened.csv") + ) + self.check_table_and_decompress_if_exists(events) + self.check_table_and_decompress_if_exists( + event_attendees, os.path.join(path, "event_attendees_flattened.csv") + ) + users_df = pd.read_csv(users, dtype={"user_id": int}, parse_dates=["joinedAt"]) + users_df["birthyear"] = pd.to_numeric(users_df["birthyear"], errors="coerce") + users_df["joinedAt"] = pd.to_datetime( + users_df["joinedAt"], errors="coerce", format="mixed" + ).dt.tz_localize(None) + + events_df = pd.read_csv(events) + events_df["start_time"] = pd.to_datetime( + events_df["start_time"], errors="coerce", format="mixed" + ).dt.tz_localize(None) + + train = os.path.join(path, "train.csv") + event_interest_df = pd.read_csv(train) + event_interest_df["timestamp"] = pd.to_datetime( + event_interest_df["timestamp"], format="mixed" + ).dt.tz_localize(None) + + if not os.path.exists(os.path.join(path, "user_friends_flattened.csv")): + user_friends_df = pd.read_csv(user_friends) + user_friends_df = ( + user_friends_df.set_index("user")["friends"] + .str.split(expand=True) + .stack() + .reset_index() + ) + user_friends_df.columns = ["user", "index", "friend"] + user_friends_flattened_df = user_friends_df.drop("index", axis=1).assign( + user=lambda df: df["user"].astype(int), + friend=lambda df: df["friend"].astype(int), + ) + + # Some friends are not present in the user table, so we drop those friends + # in the user_friends table + user_friends_flattened_df = user_friends_flattened_df.merge( + users_df, how="inner", left_on="friend", right_on="user_id" + ) + user_friends_flattened_df = user_friends_flattened_df[["user", "friend"]] + user_friends_flattened_df.to_csv( + os.path.join(path, "user_friends_flattened.csv") + ) + else: + user_friends_flattened_df = pd.read_csv( + os.path.join(path, "user_friends_flattened.csv") + ) + + if not os.path.exists(os.path.join(path, "event_attendees_flattened.csv")): + event_attendees_df = pd.read_csv(event_attendees) + melted_df = event_attendees_df.melt( + id_vars=["event"], + value_vars=["yes", "maybe", "invited", "no"], + var_name="status", + value_name="user_ids", + ) + melted_df = melted_df.dropna() + melted_df["user_ids"] = melted_df["user_ids"].str.split() + melted_df["user_ids"] = 
melted_df["user_ids"].apply( + lambda x: [int(i) for i in x] + ) + exploded_df = melted_df.explode("user_ids") + exploded_df["user_ids"] = exploded_df["user_ids"].astype(int) + exploded_df.rename(columns={"user_ids": "user_id"}, inplace=True) + exploded_df = pd.merge( + exploded_df, + events_df[["event_id", "start_time"]], + left_on="event", + right_on="event_id", + how="left", + ) + exploded_df = exploded_df.drop("event_id", axis=1) + event_attendees_flattened_df = exploded_df.dropna(subset=["user_id"]) + event_attendees_flattened_df.to_csv( + os.path.join(path, "event_attendees_flattened.csv") + ) + else: + event_attendees_flattened_df = pd.read_csv( + os.path.join(path, "event_attendees_flattened.csv") + ) + event_attendees_flattened_df["start_time"] = pd.to_datetime( + event_attendees_flattened_df["start_time"], + errors="coerce", + format="mixed", + ) + event_attendees_flattened_df["start_time"] = ( + event_attendees_flattened_df["start_time"] + .dt.tz_localize(None) + .apply(pd.Timestamp) + ) + event_attendees_flattened_df = event_attendees_flattened_df.dropna( + subset=["user_id"] + ) + + return Database( + table_dict={ + "users": Table( + df=users_df, + fkey_col_to_pkey_table={}, + pkey_col="user_id", + time_col="joinedAt", + ), + "events": Table( + df=events_df, + fkey_col_to_pkey_table={"user_id": "users"}, + pkey_col="event_id", + time_col="start_time", + ), + "event_attendees": Table( + df=event_attendees_flattened_df, + fkey_col_to_pkey_table={ + "event": "events", + "user_id": "users", + }, + time_col="start_time", + ), + "event_interest": Table( + df=event_interest_df, + fkey_col_to_pkey_table={ + "event": "events", + "user": "users", + }, + time_col="timestamp", + ), + "user_friends": Table( + df=user_friends_flattened_df, + fkey_col_to_pkey_table={ + "user": "users", + "friend": "users", + }, + ), + } + ) diff --git a/plexe/relbench/datasets/f1.py b/plexe/relbench/datasets/f1.py new file mode 100644 index 00000000..b9752eb9 --- /dev/null +++ b/plexe/relbench/datasets/f1.py @@ -0,0 +1,238 @@ +import os + +import numpy as np +import pandas as pd +import pooch + +from ..base import Database, Dataset, Table +from ..utils import unzip_processor + + +class F1Dataset(Dataset): + val_timestamp = pd.Timestamp("2005-01-01") + test_timestamp = pd.Timestamp("2010-01-01") + + def make_db(self) -> Database: + r"""Process the raw files into a database.""" + url = "https://relbench.stanford.edu/data/relbench-f1-raw.zip" + + path = pooch.retrieve( + url, + known_hash="2933348953b30aa9723b4831fea8071b336b74977bbcf1fb059da63a04f06eba", + progressbar=True, + processor=unzip_processor, + ) + + path = os.path.join(path, "raw") + + circuits = pd.read_csv(os.path.join(path, "circuits.csv")) + drivers = pd.read_csv(os.path.join(path, "drivers.csv")) + results = pd.read_csv(os.path.join(path, "results.csv")) + races = pd.read_csv(os.path.join(path, "races.csv")) + standings = pd.read_csv(os.path.join(path, "driver_standings.csv")) + constructors = pd.read_csv(os.path.join(path, "constructors.csv")) + constructor_results = pd.read_csv(os.path.join(path, "constructor_results.csv")) + constructor_standings = pd.read_csv( + os.path.join(path, "constructor_standings.csv") + ) + qualifying = pd.read_csv(os.path.join(path, "qualifying.csv")) + + # Remove columns that are irrelevant, leak time, + # or have too many missing values + + # Drop the Wikipedia URL and some time columns with many missing values + races.drop( + columns=[ + "url", + "fp1_date", + "fp1_time", + "fp2_date", + "fp2_time", + 
"fp3_date", + "fp3_time", + "quali_date", + "quali_time", + "sprint_date", + "sprint_time", + ], + inplace=True, + ) + + # Drop the Wikipedia URL as it is unique for each row + circuits.drop( + columns=["url"], + inplace=True, + ) + + # Drop the Wikipedia URL (unique) and number (803 / 857 are nulls) + drivers.drop( + columns=["number", "url"], + inplace=True, + ) + + # Drop the positionText, time, fastestLapTime and fastestLapSpeed + results.drop( + columns=[ + "positionText", + "time", + "fastestLapTime", + "fastestLapSpeed", + ], + inplace=True, + ) + + # Drop the positionText + standings.drop( + columns=["positionText"], + inplace=True, + ) + + # Drop the Wikipedia URL + constructors.drop( + columns=["url"], + inplace=True, + ) + + # Drop the positionText + constructor_standings.drop( + columns=["positionText"], + inplace=True, + ) + + # Drop the status as it only contains two categories, and + # only 17 rows have value 'D' (0.138%) + constructor_results.drop( + columns=["status"], + inplace=True, + ) + + # Drop the time in qualifying 1, 2, and 3 + qualifying.drop( + columns=["q1", "q2", "q3"], + inplace=True, + ) + + # replase missing data and combine date and time columns + races["time"] = races["time"].replace(r"^\\N$", "00:00:00", regex=True) + races["date"] = races["date"] + " " + races["time"] + # Convert date column to pd.Timestamp + races["date"] = pd.to_datetime(races["date"]) + + # add time column to other tables + results = results.merge(races[["raceId", "date"]], on="raceId", how="left") + standings = standings.merge(races[["raceId", "date"]], on="raceId", how="left") + constructor_results = constructor_results.merge( + races[["raceId", "date"]], on="raceId", how="left" + ) + constructor_standings = constructor_standings.merge( + races[["raceId", "date"]], on="raceId", how="left" + ) + + qualifying = qualifying.merge( + races[["raceId", "date"]], on="raceId", how="left" + ) + + # Subtract a day from the date to account for the fact + # that the qualifying time is the day before the main race + qualifying["date"] = qualifying["date"] - pd.Timedelta(days=1) + + # Replace "\N" with NaN in results tables + results = results.replace(r"^\\N$", np.nan, regex=True) + + # Replace "\N" with NaN in circuits tables, especially + # for the column `alt` which has 3 rows of "\N" + circuits = circuits.replace(r"^\\N$", np.nan, regex=True) + # Convert alt from string to float + circuits["alt"] = circuits["alt"].astype(float) + + # Convert non-numeric values to NaN in the specified column + results["rank"] = pd.to_numeric(results["rank"], errors="coerce") + results["number"] = pd.to_numeric(results["number"], errors="coerce") + results["grid"] = pd.to_numeric(results["grid"], errors="coerce") + results["position"] = pd.to_numeric(results["position"], errors="coerce") + results["points"] = pd.to_numeric(results["points"], errors="coerce") + results["laps"] = pd.to_numeric(results["laps"], errors="coerce") + results["milliseconds"] = pd.to_numeric( + results["milliseconds"], errors="coerce" + ) + results["fastestLap"] = pd.to_numeric(results["fastestLap"], errors="coerce") + + # Convert drivers date of birth to datetime + drivers["dob"] = pd.to_datetime(drivers["dob"]) + + tables = {} + + tables["races"] = Table( + df=pd.DataFrame(races), + fkey_col_to_pkey_table={ + "circuitId": "circuits", + }, + pkey_col="raceId", + time_col="date", + ) + + tables["circuits"] = Table( + df=pd.DataFrame(circuits), + fkey_col_to_pkey_table={}, + pkey_col="circuitId", + time_col=None, + ) + + tables["drivers"] 
= Table( + df=pd.DataFrame(drivers), + fkey_col_to_pkey_table={}, + pkey_col="driverId", + time_col=None, + ) + + tables["results"] = Table( + df=pd.DataFrame(results), + fkey_col_to_pkey_table={ + "raceId": "races", + "driverId": "drivers", + "constructorId": "constructors", + }, + pkey_col="resultId", + time_col="date", + ) + + tables["standings"] = Table( + df=pd.DataFrame(standings), + fkey_col_to_pkey_table={"raceId": "races", "driverId": "drivers"}, + pkey_col="driverStandingsId", + time_col="date", + ) + + tables["constructors"] = Table( + df=pd.DataFrame(constructors), + fkey_col_to_pkey_table={}, + pkey_col="constructorId", + time_col=None, + ) + + tables["constructor_results"] = Table( + df=pd.DataFrame(constructor_results), + fkey_col_to_pkey_table={"raceId": "races", "constructorId": "constructors"}, + pkey_col="constructorResultsId", + time_col="date", + ) + + tables["constructor_standings"] = Table( + df=pd.DataFrame(constructor_standings), + fkey_col_to_pkey_table={"raceId": "races", "constructorId": "constructors"}, + pkey_col="constructorStandingsId", + time_col="date", + ) + + tables["qualifying"] = Table( + df=pd.DataFrame(qualifying), + fkey_col_to_pkey_table={ + "raceId": "races", + "driverId": "drivers", + "constructorId": "constructors", + }, + pkey_col="qualifyId", + time_col="date", + ) + + return Database(tables) diff --git a/plexe/relbench/datasets/fake.py b/plexe/relbench/datasets/fake.py new file mode 100644 index 00000000..2a6db23e --- /dev/null +++ b/plexe/relbench/datasets/fake.py @@ -0,0 +1,114 @@ +import random +import string + +import numpy as np +import pandas as pd + +from ..base import Database, Dataset, Table + + +def _generate_random_string(min_length: int, max_length: int) -> str: + length = random.randint(min_length, max_length) + random_string = "".join(random.choice(string.ascii_letters) for _ in range(length)) + return random_string + + +class FakeDataset(Dataset): + def __init__( + self, + num_products: int = 30, + num_customers: int = 100, + num_reviews: int = 600, + num_relations: int = 20, + ): + self.num_products = num_products + self.num_customers = num_customers + self.num_reviews = num_reviews + self.num_relations = num_relations + + min_timestamp = pd.Timestamp(0, unit="D") + max_timestamp = pd.Timestamp(2 * (num_reviews - 1), unit="D") + self.val_timestamp = min_timestamp + 0.8 * (max_timestamp - min_timestamp) + self.test_timestamp = min_timestamp + 0.9 * (max_timestamp - min_timestamp) + super().__init__() + + def make_db(self) -> Database: + num_products = self.num_products + num_customers = self.num_customers + num_reviews = self.num_reviews + num_relations = self.num_relations + product_df = pd.DataFrame( + { + "product_id": [f"product_id_{i}" for i in range(num_products)], + "category": [None, [], ["toy", "health"]] * (num_products // 3), + "title": [_generate_random_string(5, 15) for _ in range(num_products)], + "price": np.random.rand(num_products) * 10, + } + ) + customer_df = pd.DataFrame( + { + "customer_id": [f"customer_id_{i}" for i in range(num_customers)], + "age": np.random.randint(10, 50, size=(num_customers,)), + "gender": ["male", "female"] * (num_customers // 2), + } + ) + # Add some dangling foreign keys: + review_df = pd.DataFrame( + { + "customer_id": [ + f"customer_id_{random.randint(0, num_customers+5)}" + for _ in range(num_reviews) + ], + "product_id": [ + f"product_id_{random.randint(0, num_products-1)}" + for _ in range(num_reviews) + ], + "review_time": pd.to_datetime(2 * np.arange(num_reviews), unit="D"), 
+ "rating": np.random.randint(1, 6, size=(num_reviews,)), + } + ) + review_df["review"] = review_df["rating"].apply( + lambda x: "positive" if x > 3 else "negative" + ) + relations_df = pd.DataFrame( + { + "customer_id": [ + f"customer_id_{random.randint(0, num_customers+5)}" + for _ in range(num_relations) + ], + "product_id": [ + f"product_id_{random.randint(0, num_products-1)}" + for _ in range(num_relations) + ], + } + ) + + return Database( + table_dict={ + "product": Table( + df=product_df, + fkey_col_to_pkey_table={}, + pkey_col="product_id", + ), + "customer": Table( + df=customer_df, + fkey_col_to_pkey_table={}, + pkey_col="customer_id", + ), + "review": Table( + df=review_df, + fkey_col_to_pkey_table={ + "customer_id": "customer", + "product_id": "product", + }, + time_col="review_time", + ), + "relations": Table( + df=relations_df, + fkey_col_to_pkey_table={ + "customer_id": "customer", + "product_id": "product", + }, + ), + } + ) diff --git a/plexe/relbench/datasets/hm.py b/plexe/relbench/datasets/hm.py new file mode 100644 index 00000000..4af1e338 --- /dev/null +++ b/plexe/relbench/datasets/hm.py @@ -0,0 +1,70 @@ +import os +import shutil +from pathlib import Path + +import pandas as pd + +from ..base import Database, Dataset, Table + + +class HMDataset(Dataset): + url = ( + "https://www.kaggle.com/competitions/" + "h-and-m-personalized-fashion-recommendations" + ) + + val_timestamp = pd.Timestamp("2020-09-07") + test_timestamp = pd.Timestamp("2020-09-14") + + def make_db(self) -> Database: + path = os.path.join("data", "hm-recommendation") + zip = os.path.join(path, "h-and-m-personalized-fashion-recommendations.zip") + customers = os.path.join(path, "customers.csv") + articles = os.path.join(path, "articles.csv") + transactions = os.path.join(path, "transactions_train.csv") + if not os.path.exists(customers): + if not os.path.exists(zip): + raise RuntimeError( + f"Dataset not found. Please download " + f"h-and-m-personalized-fashion-recommendations.zip from " + f"'{self.url}' and move it to '{path}'. 
Once you have your" + f"Kaggle API key, you can use the following command: " + f"kaggle competitions download -c h-and-m-personalized-fashion-recommendations" + ) + else: + print("Unpacking") + shutil.unpack_archive(zip, Path(zip).parent) + + articles_df = pd.read_csv(articles) + customers_df = pd.read_csv(customers) + transactions_df = pd.read_csv(transactions) + transactions_df["t_dat"] = pd.to_datetime( + transactions_df["t_dat"], format="%Y-%m-%d" + ) + + db = Database( + table_dict={ + "article": Table( + df=articles_df, + fkey_col_to_pkey_table={}, + pkey_col="article_id", + ), + "customer": Table( + df=customers_df, + fkey_col_to_pkey_table={}, + pkey_col="customer_id", + ), + "transactions": Table( + df=transactions_df, + fkey_col_to_pkey_table={ + "customer_id": "customer", + "article_id": "article", + }, + time_col="t_dat", + ), + } + ) + + db = db.from_(pd.Timestamp("2019-09-07")) + + return db diff --git a/plexe/relbench/datasets/stack.py b/plexe/relbench/datasets/stack.py new file mode 100644 index 00000000..61365481 --- /dev/null +++ b/plexe/relbench/datasets/stack.py @@ -0,0 +1,132 @@ +import os + +import pandas as pd +import pooch + +from ..base import Database, Dataset, Table +from ..utils import clean_datetime, unzip_processor + + +class StackDataset(Dataset): + # 3 months gap + val_timestamp = pd.Timestamp("2020-10-01") + test_timestamp = pd.Timestamp("2021-01-01") + + def make_db(self) -> Database: + r"""Process the raw files into a database.""" + url = "https://relbench.stanford.edu/data/relbench-forum-raw.zip" + path = pooch.retrieve( + url, + known_hash="ad3bf96f35146d50ef48fa198921685936c49b95c6b67a8a47de53e90036745f", + progressbar=True, + processor=unzip_processor, + ) + path = os.path.join(path, "raw") + users = pd.read_csv(os.path.join(path, "Users.csv")) + comments = pd.read_csv(os.path.join(path, "Comments.csv")) + posts = pd.read_csv(os.path.join(path, "Posts.csv")) + votes = pd.read_csv(os.path.join(path, "Votes.csv")) + postLinks = pd.read_csv(os.path.join(path, "PostLinks.csv")) + badges = pd.read_csv(os.path.join(path, "Badges.csv")) + postHistory = pd.read_csv(os.path.join(path, "PostHistory.csv")) + + # tags = pd.read_csv(os.path.join(path, "Tags.csv")) we remove tag table here since after removing time leakage columns, all information are kept in the posts tags columns + + ## remove time leakage columns + users.drop( + columns=["Reputation", "Views", "UpVotes", "DownVotes", "LastAccessDate"], + inplace=True, + ) + + posts.drop( + columns=[ + "ViewCount", + "AnswerCount", + "CommentCount", + "FavoriteCount", + "CommunityOwnedDate", + "ClosedDate", + "LastEditDate", + "LastActivityDate", + "Score", + "LastEditorDisplayName", + "LastEditorUserId", + ], + inplace=True, + ) + + comments.drop(columns=["Score"], inplace=True) + votes.drop(columns=["BountyAmount"], inplace=True) + + comments = clean_datetime(comments, "CreationDate") + badges = clean_datetime(badges, "Date") + postLinks = clean_datetime(postLinks, "CreationDate") + postHistory = clean_datetime(postHistory, "CreationDate") + votes = clean_datetime(votes, "CreationDate") + users = clean_datetime(users, "CreationDate") + posts = clean_datetime(posts, "CreationDate") + + tables = {} + + tables["comments"] = Table( + df=pd.DataFrame(comments), + fkey_col_to_pkey_table={ + "UserId": "users", + "PostId": "posts", + }, + pkey_col="Id", + time_col="CreationDate", + ) + + tables["badges"] = Table( + df=pd.DataFrame(badges), + fkey_col_to_pkey_table={ + "UserId": "users", + }, + pkey_col="Id", + 
time_col="Date", + ) + + tables["postLinks"] = Table( + df=pd.DataFrame(postLinks), + fkey_col_to_pkey_table={ + "PostId": "posts", + "RelatedPostId": "posts", ## is this allowed? two foreign keys into the same primary + }, + pkey_col="Id", + time_col="CreationDate", + ) + + tables["postHistory"] = Table( + df=pd.DataFrame(postHistory), + fkey_col_to_pkey_table={"PostId": "posts", "UserId": "users"}, + pkey_col="Id", + time_col="CreationDate", + ) + + tables["votes"] = Table( + df=pd.DataFrame(votes), + fkey_col_to_pkey_table={"PostId": "posts", "UserId": "users"}, + pkey_col="Id", + time_col="CreationDate", + ) + + tables["users"] = Table( + df=pd.DataFrame(users), + fkey_col_to_pkey_table={}, + pkey_col="Id", + time_col="CreationDate", + ) + + tables["posts"] = Table( + df=pd.DataFrame(posts), + fkey_col_to_pkey_table={ + "OwnerUserId": "users", + "ParentId": "posts", # notice the self-reference + "AcceptedAnswerId": "posts", + }, + pkey_col="Id", + time_col="CreationDate", + ) + + return Database(tables) diff --git a/plexe/relbench/datasets/trial.py b/plexe/relbench/datasets/trial.py new file mode 100644 index 00000000..dc6a3d38 --- /dev/null +++ b/plexe/relbench/datasets/trial.py @@ -0,0 +1,369 @@ +import os + +import numpy as np +import pandas as pd +import pooch + +from ..base import Database, Dataset, Table +from ..utils import unzip_processor + + +class TrialDataset(Dataset): + # 1 year gap + val_timestamp = pd.Timestamp("2020-01-01") + test_timestamp = pd.Timestamp("2021-01-01") + + def make_db(self) -> Database: + r"""Process the raw files into a database.""" + url = "https://relbench.stanford.edu/data/relbench-trial.zip" + path = pooch.retrieve( + url, + known_hash="3f7376b7d901177157b3c5b048221884e936b45d05e809c7875403183ca9e13d", + progressbar=True, + processor=unzip_processor, + ) + path = os.path.join(path, "relbench-trial-raw") + studies = pd.read_csv( + os.path.join(path, "studies.txt"), sep="|", low_memory=False + ) + outcomes = pd.read_csv(os.path.join(path, "outcomes.txt"), sep="|") + drop_withdrawals = pd.read_csv( + os.path.join(path, "drop_withdrawals.txt"), sep="|" + ) + designs = pd.read_csv(os.path.join(path, "designs.txt"), sep="|") + eligibilities = pd.read_csv(os.path.join(path, "eligibilities.txt"), sep="|") + interventions = pd.read_csv( + os.path.join(path, "browse_interventions.txt"), sep="|" + ) + interventions = interventions[ + interventions.mesh_type == "mesh-list" + ] # just looking at root identity + conditions = pd.read_csv(os.path.join(path, "browse_conditions.txt"), sep="|") + conditions = conditions[ + conditions.mesh_type == "mesh-list" + ] # just looking at root identity + + reported_event_totals = pd.read_csv( + os.path.join(path, "reported_event_totals.txt"), sep="|" + ) + sponsors = pd.read_csv( + os.path.join(path, "sponsors.txt"), sep="|", low_memory=False + ) + facilities = pd.read_csv(os.path.join(path, "facilities.txt"), sep="|") + outcome_analyses = pd.read_csv( + os.path.join(path, "outcome_analyses.txt"), sep="|", low_memory=False + ) + detailed_descriptions = pd.read_csv( + os.path.join(path, "detailed_descriptions.txt"), sep="|" + ) + brief_summaries = pd.read_csv( + os.path.join(path, "brief_summaries.txt"), sep="|" + ) + + ## just using trials with actual completion date + # print('studies', len(studies)) + studies = studies[studies.completion_date_type == "Actual"] + ## there are 27 trials before 1975 + studies = studies[studies.start_date >= "2000-01-01"] + studies = studies[studies.nct_id.notnull()] + # print('studies actual', 
len(studies)) + nct_id_use = studies.nct_id.values + + ## get trial start and end date for later infer + studies["start_date"] = pd.to_datetime(studies["start_date"]) + studies["completion_date"] = pd.to_datetime(studies["completion_date"]) + nct2start_date = dict(studies[["nct_id", "start_date"]].values) + nct2end_date = dict(studies[["nct_id", "completion_date"]].values) + + ## too many columns in studies, keeping few interesting columns and remove temporal leakage columns + studies = studies[ + [ + "nct_id", + "start_date", + "target_duration", + "study_type", + "acronym", + "baseline_population", + "brief_title", + "official_title", + "phase", + "enrollment", + "enrollment_type", + "source", + "limitations_and_caveats", + "number_of_arms", + "number_of_groups", + "has_dmc", + "is_fda_regulated_drug", + "is_fda_regulated_device", + "is_unapproved_device", + "is_ppsd", + "is_us_export", + "biospec_retention", + "biospec_description", + "source_class", + "baseline_type_units_analyzed", + "fdaaa801_violation", + "plan_to_share_ipd", + ] + ] + + ## merge description/brief into main study table + nct2descriptions = dict(detailed_descriptions[["nct_id", "description"]].values) + nct2brief = dict(brief_summaries[["nct_id", "description"]].values) + studies["detailed_descriptions"] = studies.nct_id.apply( + lambda x: nct2descriptions[x] if x in nct2descriptions else np.nan + ) + studies["brief_summaries"] = studies.nct_id.apply( + lambda x: nct2brief[x] if x in nct2brief else np.nan + ) + + outcomes = outcomes[ + [ + "id", + "nct_id", + "outcome_type", + "title", + "description", + "time_frame", + "population", + "units", + "units_analyzed", + "dispersion_type", + "param_type", + ] + ] + + reported_event_totals = reported_event_totals[ + [ + "id", + "nct_id", + "event_type", + "classification", + "subjects_affected", + "subjects_at_risk", + ] + ] + + drop_withdrawals.drop( + columns=[ + "result_group_id", + "ctgov_group_code", + "drop_withdraw_comment", + "reason_comment", + "count_units", + ], + inplace=True, + ) + conditions.drop(columns=["downcase_mesh_term", "mesh_type"], inplace=True) + interventions.drop(columns=["downcase_mesh_term", "mesh_type"], inplace=True) + ## filter to nct_id with actual completion date + # print('outcomes before filter', len(outcomes)) + # for df in [outcomes, outcome_analyses, drop_withdrawals, reported_event_totals, designs, eligibilities, interventions, conditions, facilities, sponsors]: + + outcomes = outcomes[outcomes.nct_id.isin(nct_id_use)] + outcome_analyses = outcome_analyses[outcome_analyses.nct_id.isin(nct_id_use)] + drop_withdrawals = drop_withdrawals[drop_withdrawals.nct_id.isin(nct_id_use)] + reported_event_totals = reported_event_totals[ + reported_event_totals.nct_id.isin(nct_id_use) + ] + designs = designs[designs.nct_id.isin(nct_id_use)] + eligibilities = eligibilities[eligibilities.nct_id.isin(nct_id_use)] + interventions = interventions[interventions.nct_id.isin(nct_id_use)] + conditions = conditions[conditions.nct_id.isin(nct_id_use)] + facilities = facilities[facilities.nct_id.isin(nct_id_use)] + sponsors = sponsors[sponsors.nct_id.isin(nct_id_use)] + + # print('outcomes after filter', len(outcomes)) + ## infer time stamps + ## tables that is available after trial ends + for df in [outcomes, outcome_analyses, drop_withdrawals, reported_event_totals]: + df["date"] = df.nct_id.apply(lambda x: nct2end_date[x]) + + ## tables that is available as trial starts + for df in [ + designs, + eligibilities, + interventions, + conditions, + facilities, + 
sponsors, + ]: + df["date"] = df.nct_id.apply(lambda x: nct2start_date[x]) + + ## create separate entity tables for sponsor/facility/condition/intervention since some tasks are asking them + sponsor2id = dict( + zip(sponsors.name.unique(), range(len(sponsors.name.unique()))) + ) + sponsors["sponsor_id"] = sponsors.name.apply(lambda x: sponsor2id[x]) + sponsor_trial = sponsors[ + ["id", "nct_id", "sponsor_id", "lead_or_collaborator", "date"] + ] + sponsors = ( + sponsors[["sponsor_id", "name", "agency_class"]] + .drop_duplicates("sponsor_id") + .reset_index(drop=True) + ) + + facility2id = dict( + zip(facilities.name.unique(), range(len(facilities.name.unique()))) + ) + facilities["facility_id"] = facilities.name.apply(lambda x: facility2id[x]) + facility_trial = facilities[["id", "nct_id", "facility_id", "date"]] + facilities = ( + facilities[["facility_id", "name", "city", "state", "zip", "country"]] + .drop_duplicates("facility_id") + .reset_index(drop=True) + ) + + condition2id = dict( + zip( + conditions.mesh_term.unique(), range(len(conditions.mesh_term.unique())) + ) + ) + conditions["condition_id"] = conditions.mesh_term.apply( + lambda x: condition2id[x] + ) + condition_trial = conditions[["id", "nct_id", "condition_id", "date"]] + conditions = ( + conditions[["condition_id", "mesh_term"]] + .drop_duplicates("condition_id") + .reset_index(drop=True) + ) + + intervention2id = dict( + zip( + interventions.mesh_term.unique(), + range(len(interventions.mesh_term.unique())), + ) + ) + interventions["intervention_id"] = interventions.mesh_term.apply( + lambda x: intervention2id[x] + ) + intervention_trial = interventions[["id", "nct_id", "intervention_id", "date"]] + interventions = ( + interventions[["intervention_id", "mesh_term"]] + .drop_duplicates("intervention_id") + .reset_index(drop=True) + ) + + tables = {} + + tables["studies"] = Table( + df=studies, + fkey_col_to_pkey_table={}, + pkey_col="nct_id", + time_col="start_date", + ) + + tables["outcomes"] = Table( + df=outcomes, + fkey_col_to_pkey_table={ + "nct_id": "studies", + }, + pkey_col="id", + time_col="date", + ) + + tables["outcome_analyses"] = Table( + df=outcome_analyses, + fkey_col_to_pkey_table={"nct_id": "studies", "outcome_id": "outcomes"}, + pkey_col="id", + time_col="date", + ) + + tables["drop_withdrawals"] = Table( + df=drop_withdrawals, + fkey_col_to_pkey_table={ + "nct_id": "studies", + }, + pkey_col="id", + time_col="date", + ) + + tables["reported_event_totals"] = Table( + df=reported_event_totals, + fkey_col_to_pkey_table={ + "nct_id": "studies", + }, + pkey_col="id", + time_col="date", + ) + + tables["designs"] = Table( + df=designs, + fkey_col_to_pkey_table={ + "nct_id": "studies", + }, + pkey_col="id", + time_col="date", + ) + + tables["eligibilities"] = Table( + df=eligibilities, + fkey_col_to_pkey_table={ + "nct_id": "studies", + }, + pkey_col="id", + time_col="date", + ) + + tables["interventions"] = Table( + df=interventions, + fkey_col_to_pkey_table={}, + pkey_col="intervention_id", + time_col=None, + ) + + tables["conditions"] = Table( + df=conditions, + fkey_col_to_pkey_table={}, + pkey_col="condition_id", + time_col=None, + ) + + tables["facilities"] = Table( + df=facilities, + fkey_col_to_pkey_table={}, + pkey_col="facility_id", + time_col=None, + ) + + tables["sponsors"] = Table( + df=sponsors, + fkey_col_to_pkey_table={}, + pkey_col="sponsor_id", + time_col=None, + ) + + tables["interventions_studies"] = Table( + df=intervention_trial, + fkey_col_to_pkey_table={ + "nct_id": "studies", + 
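# The dict(zip(values.unique(), range(...))) idiom used above assigns a dense
# integer id to every distinct sponsor/facility/condition/intervention name.
# Equivalent illustration on made-up data, including the pandas.factorize shortcut:
import pandas as pd

names = pd.Series(["sponsor_a", "sponsor_b", "sponsor_a"])
name2id = dict(zip(names.unique(), range(len(names.unique()))))
assert names.map(name2id).tolist() == [0, 1, 0]

codes, uniques = pd.factorize(names)  # same dense ids in one call
assert codes.tolist() == [0, 1, 0]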
"intervention_id": "interventions", + }, + pkey_col="id", + time_col="date", + ) + + tables["conditions_studies"] = Table( + df=condition_trial, + fkey_col_to_pkey_table={"nct_id": "studies", "condition_id": "conditions"}, + pkey_col="id", + time_col="date", + ) + + tables["facilities_studies"] = Table( + df=facility_trial, + fkey_col_to_pkey_table={"nct_id": "studies", "facility_id": "facilities"}, + pkey_col="id", + time_col="date", + ) + + tables["sponsors_studies"] = Table( + df=sponsor_trial, + fkey_col_to_pkey_table={"nct_id": "studies", "sponsor_id": "sponsors"}, + pkey_col="id", + time_col="date", + ) + return Database(tables) diff --git a/plexe/relbench/metrics.py b/plexe/relbench/metrics.py new file mode 100644 index 00000000..45215718 --- /dev/null +++ b/plexe/relbench/metrics.py @@ -0,0 +1,233 @@ +from typing import Tuple + +import numpy as np +import sklearn.metrics as skm +from numpy.typing import NDArray +from scipy.stats import rankdata + +###### classification metrics + +### applicable to both binary and multiclass classification + + +def accuracy(true: NDArray[np.float64], pred: NDArray[np.float64]) -> float: + if pred.ndim == 1: + label = pred > 0.5 + else: + label = pred.argmax(axis=1) + return skm.accuracy_score(true, label) + + +def log_loss(true: NDArray[np.float64], pred: NDArray[np.float64]) -> float: + if pred.ndim == 1 or pred.shape[1] == 1: + prob = np.sigmoid(pred) + else: + prob = np.softmax(pred, axis=1) + return skm.log_loss(true, prob) + + +### applicable to binary classification only + + +def f1(true: NDArray[np.float64], pred: NDArray[np.float64]) -> float: + assert pred.ndim == 1 or pred.shape[1] == 1 + label = pred >= 0.5 + return skm.f1_score(true, label, average="binary") + + +def roc_auc(true: NDArray[np.float64], pred: NDArray[np.float64]) -> float: + assert pred.ndim == 1 or pred.shape[1] == 1 + return skm.roc_auc_score(true, pred) + + +def average_precision(true: NDArray[np.float64], pred: NDArray[np.float64]) -> float: + assert pred.ndim == 1 or pred.shape[1] == 1 + return skm.average_precision_score(true, pred) + + +def auprc(true: NDArray[np.float64], pred: NDArray[np.float64]) -> float: + assert pred.ndim == 1 or pred.shape[1] == 1 + precision, recall, _ = skm.precision_recall_curve(true, pred) + return skm.auc(recall, precision) + + +### applicable to multiclass classification only + + +def mrr(y: NDArray[np.float64], y_pred: NDArray[np.float64]) -> float: + rankings = rankdata(-y_pred, method="min", axis=1) + ranks = np.take_along_axis(rankings, y.reshape(-1, 1), axis=1).flatten() + return np.mean(1.0 / ranks).item() + + +def macro_f1(true: NDArray[np.float64], pred: NDArray[np.float64]) -> float: + assert pred.ndim > 1 + label = pred.argmax(axis=1) + return skm.f1_score(true, label, average="macro") + + +def micro_f1(true: NDArray[np.float64], pred: NDArray[np.float64]) -> float: + assert pred.ndim > 1 + label = pred.argmax(axis=1) + return skm.f1_score(true, label, average="micro") + + +###### regression metrics + + +def mae(true: NDArray[np.float64], pred: NDArray[np.float64]) -> float: + return skm.mean_absolute_error(true, pred) + + +def mse(true: NDArray[np.float64], pred: NDArray[np.float64]) -> float: + return skm.mean_squared_error(true, pred) + + +def rmse(true: NDArray[np.float64], pred: NDArray[np.float64]) -> float: + return skm.root_mean_squared_error(true, pred) + + +def r2(true: NDArray[np.float64], pred: NDArray[np.float64]) -> float: + return skm.r2_score(true, pred) + + +####### Multilabel metrics +def 
multilabel_auprc_micro(true: NDArray[np.int_], pred: NDArray[np.float64]) -> float: + # Flatten true and prediction arrays for micro-average computation + true_flat = np.ravel(np.stack(true)) + pred_flat = np.ravel(pred) + return skm.average_precision_score(true_flat, pred_flat, average="micro") + + +def multilabel_auprc_macro(true: NDArray[np.int_], pred: NDArray[np.float64]) -> float: + true = np.stack(true) + return skm.average_precision_score(true, pred, average="macro") + + +def multilabel_auroc_micro(true: NDArray[np.int_], pred: NDArray[np.float64]) -> float: + # Flatten true and prediction arrays for micro-average computation + true_flat = np.ravel(np.stack(true)) + pred_flat = np.ravel(pred) + return skm.roc_auc_score(true_flat, pred_flat, average="micro") + + +def multilabel_auroc_macro(true: NDArray[np.int_], pred: NDArray[np.float64]) -> float: + true = np.stack(true) + return skm.roc_auc_score(true, pred, average="macro") + + +def multilabel_f1_micro(true: NDArray[np.int_], pred: NDArray[np.int_]) -> float: + return skm.f1_score(np.stack(true), (pred > 0.5).astype(int), average="micro") + + +def multilabel_f1_macro(true: NDArray[np.int_], pred: NDArray[np.int_]) -> float: + return skm.f1_score(np.stack(true), (pred > 0.5).astype(int), average="macro") + + +def multilabel_recall_micro(true: NDArray[np.int_], pred: NDArray[np.int_]) -> float: + return skm.recall_score(np.stack(true), (pred > 0.5).astype(int), average="micro") + + +def multilabel_recall_macro(true: NDArray[np.int_], pred: NDArray[np.int_]) -> float: + return skm.recall_score(np.stack(true), (pred > 0.5).astype(int), average="macro") + + +def multilabel_precision_micro(true: NDArray[np.int_], pred: NDArray[np.int_]) -> float: + return skm.precision_score( + np.stack(true), (pred > 0.5).astype(int), average="micro" + ) + + +def multilabel_precision_macro(true: NDArray[np.int_], pred: NDArray[np.int_]) -> float: + return skm.precision_score( + np.stack(true), (pred > 0.5).astype(int), average="macro" + ) + + +####### Multiclass metrics +def multiclass_f1(true: NDArray[np.int_], pred: NDArray[np.int_]) -> float: + if pred.ndim > 1: + pred = pred.argmax(axis=1) + return skm.f1_score(true, pred, average="micro") + + +####### Link prediction metrics +"""All link prediction metrics take two arguments + - pred_isin: Numpy boolean array of size (num_src_nodes, eval_k) + - dst_count: Numpy integer array of size (num_src_nodes, ), storing + the number of destination nodes attached to each source node. 
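# Toy call to the recall/precision metrics defined below, assuming the module
# path used in this diff; three source nodes are evaluated at k=2 and the second
# one has no true destinations, so it is filtered out before averaging.
import numpy as np
from plexe.relbench.metrics import link_prediction_precision, link_prediction_recall

pred_isin = np.array([[True, False], [False, False], [False, True]])
dst_count = np.array([2, 0, 1])

assert link_prediction_recall(pred_isin, dst_count) == 0.75    # mean of 1/2 and 1/1
assert link_prediction_precision(pred_isin, dst_count) == 0.5  # mean of 1/2 and 1/2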
+""" + + +def _filter( + pred_isin: NDArray[np.int_], dst_count: NDArray[np.int_] +) -> Tuple[NDArray[np.int_], NDArray[np.int_]]: + is_pos = dst_count > 0 + return pred_isin[is_pos], dst_count[is_pos] + + +def link_prediction_recall( + pred_isin: NDArray[np.int_], + dst_count: NDArray[np.int_], +) -> float: + pred_isin, dst_count = _filter(pred_isin, dst_count) + recalls = pred_isin.sum(axis=1) / dst_count + return recalls.mean() + + +def link_prediction_precision( + pred_isin: NDArray[np.int_], + dst_count: NDArray[np.int_], +) -> float: + pred_isin, dst_count = _filter(pred_isin, dst_count) + eval_k = pred_isin.shape[1] + precisions = pred_isin.sum(axis=-1) / eval_k + return precisions.mean() + + +def link_prediction_map( + pred_isin: NDArray[np.int_], + dst_count: NDArray[np.int_], +) -> float: + pred_isin, dst_count = _filter(pred_isin, dst_count) + eval_k = pred_isin.shape[1] + clipped_dst_count = dst_count.clip(min=None, max=eval_k) + precision_mat = np.cumsum(pred_isin, axis=1) / (np.arange(eval_k) + 1) + maps = (precision_mat * pred_isin).sum(axis=1) / clipped_dst_count + return maps.mean() + + +def link_prediction_ndcg( + pred_isin: NDArray[np.int_], + dst_count: NDArray[np.int_], +) -> float: + pred_isin, dst_count = _filter(pred_isin, dst_count) + eval_k = pred_isin.shape[1] + + # Compute the discounted multiplier (1 / log2(i + 2) for i = 0, ..., k-1) + discounted_multiplier = np.concatenate( + (np.zeros(1), 1 / np.log2(np.arange(1, eval_k + 1) + 1)) + ) + + # Compute Discounted Cumulative Gain (DCG) + discounted_cumulative_gain = ( + pred_isin * discounted_multiplier[1 : eval_k + 1] + ).sum(axis=1) + + # Clip dst_count to the range [0, eval_k] + clipped_dst_count = np.clip(dst_count, 0, eval_k) + + # Compute Ideal Discounted Cumulative Gain (IDCG) + ideal_discounted_multiplier_cumsum = np.cumsum(discounted_multiplier) + ideal_discounted_cumulative_gain = ideal_discounted_multiplier_cumsum[ + clipped_dst_count + ] + + # Avoid division by zero + ideal_discounted_cumulative_gain = np.clip( + ideal_discounted_cumulative_gain, 1e-10, None + ) + + # Compute NDCG + ndcg_scores = discounted_cumulative_gain / ideal_discounted_cumulative_gain + return ndcg_scores.mean() diff --git a/plexe/relbench/modeling/__init__.py b/plexe/relbench/modeling/__init__.py new file mode 100644 index 00000000..2da100d8 --- /dev/null +++ b/plexe/relbench/modeling/__init__.py @@ -0,0 +1,14 @@ +from .graph import make_pkey_fkey_graph, get_node_train_table_input +from .utils import get_stype_proposal, remove_pkey_fkey, to_unix_time +from .nn import HeteroEncoder, HeteroGraphSAGE, HeteroTemporalEncoder + +__all__ = [ + "make_pkey_fkey_graph", + "get_node_train_table_input", + "get_stype_proposal", + "remove_pkey_fkey", + "to_unix_time", + "HeteroEncoder", + "HeteroGraphSAGE", + "HeteroTemporalEncoder", +] \ No newline at end of file diff --git a/plexe/relbench/modeling/graph.py b/plexe/relbench/modeling/graph.py new file mode 100644 index 00000000..978a5f48 --- /dev/null +++ b/plexe/relbench/modeling/graph.py @@ -0,0 +1,228 @@ +import os +from typing import Any, Dict, NamedTuple, Optional, Tuple + +import numpy as np +import pandas as pd +import torch +from torch import Tensor +from torch_frame import stype +from torch_frame.config import TextEmbedderConfig +from torch_frame.data import Dataset +from torch_frame.data.stats import StatType +from torch_geometric.data import HeteroData +from torch_geometric.typing import NodeType +from torch_geometric.utils import sort_edge_index + +from ..base import Database, 
EntityTask, RecommendationTask, Table, TaskType +from .utils import remove_pkey_fkey, to_unix_time + + +def make_pkey_fkey_graph( + db: Database, + col_to_stype_dict: Dict[str, Dict[str, stype]], + text_embedder_cfg: Optional[TextEmbedderConfig] = None, + cache_dir: Optional[str] = None, +) -> Tuple[HeteroData, Dict[str, Dict[str, Dict[StatType, Any]]]]: + r"""Given a :class:`Database` object, construct a heterogeneous graph with primary- + foreign key relationships, together with the column stats of each table. + + Args: + db: A database object containing a set of tables. + col_to_stype_dict: Column to stype for + each table. + text_embedder_cfg: Text embedder config. + cache_dir: A directory for storing materialized tensor + frames. If specified, we will either cache the file or use the + cached file. If not specified, we will not use cached file and + re-process everything from scratch without saving the cache. + + Returns: + HeteroData: The heterogeneous :class:`PyG` object with + :class:`TensorFrame` feature. + """ + data = HeteroData() + col_stats_dict = dict() + if cache_dir is not None: + os.makedirs(cache_dir, exist_ok=True) + + for table_name, table in db.table_dict.items(): + # Materialize the tables into tensor frames: + df = table.df + # Ensure that pkey is consecutive. + if table.pkey_col is not None: + assert (df[table.pkey_col].values == np.arange(len(df))).all() + + col_to_stype = col_to_stype_dict[table_name] + + # Remove pkey, fkey columns since they will not be used as input + # feature. + remove_pkey_fkey(col_to_stype, table) + + if len(col_to_stype) == 0: # Add constant feature in case df is empty: + col_to_stype = {"__const__": stype.numerical} + # We need to add edges later, so we need to also keep the fkeys + fkey_dict = {key: df[key] for key in table.fkey_col_to_pkey_table} + df = pd.DataFrame({"__const__": np.ones(len(table.df)), **fkey_dict}) + + path = ( + None if cache_dir is None else os.path.join(cache_dir, f"{table_name}.pt") + ) + + dataset = Dataset( + df=df, + col_to_stype=col_to_stype, + col_to_text_embedder_cfg=text_embedder_cfg, + ).materialize(path=path) + + data[table_name].tf = dataset.tensor_frame + col_stats_dict[table_name] = dataset.col_stats + + # Add time attribute: + if table.time_col is not None: + data[table_name].time = torch.from_numpy( + to_unix_time(table.df[table.time_col]) + ) + + # Add edges: + for fkey_name, pkey_table_name in table.fkey_col_to_pkey_table.items(): + pkey_index = df[fkey_name] + # Filter out dangling foreign keys + mask = ~pkey_index.isna() + fkey_index = torch.arange(len(pkey_index)) + # Filter dangling foreign keys: + pkey_index = torch.from_numpy(pkey_index[mask].astype(int).values) + fkey_index = fkey_index[torch.from_numpy(mask.values)] + # Ensure no dangling fkeys + assert (pkey_index < len(db.table_dict[pkey_table_name])).all() + + # fkey -> pkey edges + edge_index = torch.stack([fkey_index, pkey_index], dim=0) + edge_type = (table_name, f"f2p_{fkey_name}", pkey_table_name) + data[edge_type].edge_index = sort_edge_index(edge_index) + + # pkey -> fkey edges. + # "rev_" is added so that PyG loader recognizes the reverse edges + edge_index = torch.stack([pkey_index, fkey_index], dim=0) + edge_type = (pkey_table_name, f"rev_f2p_{fkey_name}", table_name) + data[edge_type].edge_index = sort_edge_index(edge_index) + + data.validate() + + return data, col_stats_dict + + +class AttachTargetTransform: + r"""Attach the target label to the heterogeneous mini-batch. 
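# Rough end-to-end sketch of calling make_pkey_fkey_graph, assuming the helpers
# re-exported from plexe.relbench.modeling; `db` stands for a Database whose
# primary keys are already consecutive integers (enforced by the assert above).
from plexe.relbench.modeling import get_stype_proposal, make_pkey_fkey_graph

col_to_stype = get_stype_proposal(db)
data, col_stats_dict = make_pkey_fkey_graph(
    db,
    col_to_stype,
    text_embedder_cfg=None,  # pass a TextEmbedderConfig when tables contain text columns
    cache_dir=None,          # or a directory in which to cache materialised tensor frames
)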
+ + The batch consists of disjoint subgraphs loaded via temporal sampling. The same + input node can occur multiple times with different timestamps, and thus different + subgraphs and labels. Hence labels cannot be stored in the graph object directly, + and must be attached to the batch after the batch is created. + """ + + def __init__(self, entity: str, target: Tensor): + self.entity = entity + self.target = target + + def __call__(self, batch: HeteroData) -> HeteroData: + batch[self.entity].y = self.target[batch[self.entity].input_id] + return batch + + +class NodeTrainTableInput(NamedTuple): + r"""Training table input for node prediction. + + - nodes is a Tensor of node indices. + - time is a Tensor of node timestamps. + - target is a Tensor of node labels. + - transform attaches the target to the batch. + """ + + nodes: Tuple[NodeType, Tensor] + time: Optional[Tensor] + target: Optional[Tensor] + transform: Optional[AttachTargetTransform] + + +def get_node_train_table_input( + table: Table, + task: EntityTask, +) -> NodeTrainTableInput: + r"""Get the training table input for node prediction.""" + + nodes = torch.from_numpy(table.df[task.entity_col].astype(int).values) + + time: Optional[Tensor] = None + if table.time_col is not None: + time = torch.from_numpy(to_unix_time(table.df[table.time_col])) + + target: Optional[Tensor] = None + transform: Optional[AttachTargetTransform] = None + if task.target_col in table.df: + target_type = float + if task.task_type == TaskType.MULTICLASS_CLASSIFICATION: + target_type = int + if task.task_type == TaskType.MULTILABEL_CLASSIFICATION: + target = torch.from_numpy(np.stack(table.df[task.target_col].values)) + else: + target = torch.from_numpy( + table.df[task.target_col].values.astype(target_type) + ) + transform = AttachTargetTransform(task.entity_table, target) + + return NodeTrainTableInput( + nodes=(task.entity_table, nodes), + time=time, + target=target, + transform=transform, + ) + + +class LinkTrainTableInput(NamedTuple): + r"""Training table input for link prediction. + + - src_nodes is a Tensor of source node indices. + - dst_nodes is PyTorch sparse tensor in csr format. + dst_nodes[src_node_idx] gives a tensor of destination node + indices for src_node_idx. + - num_dst_nodes is the total number of destination nodes. + (used to perform negative sampling). 
+ - src_time is a Tensor of time for src_nodes + """ + + src_nodes: Tuple[NodeType, Tensor] + dst_nodes: Tuple[NodeType, Tensor] + num_dst_nodes: int + src_time: Optional[Tensor] + + +def get_link_train_table_input( + table: Table, + task: RecommendationTask, +) -> LinkTrainTableInput: + r"""Get the training table input for link prediction.""" + + src_node_idx: Tensor = torch.from_numpy( + table.df[task.src_entity_col].astype(int).values + ) + exploded = table.df[task.dst_entity_col].explode() + coo_indices = torch.from_numpy( + np.stack([exploded.index.values, exploded.values.astype(int)]) + ) + sparse_coo = torch.sparse_coo_tensor( + coo_indices, + torch.ones(coo_indices.size(1), dtype=bool), + (len(src_node_idx), task.num_dst_nodes), + ) + dst_node_indices = sparse_coo.to_sparse_csr() + + time: Optional[Tensor] = None + if table.time_col is not None: + time = torch.from_numpy(to_unix_time(table.df[table.time_col])) + + return LinkTrainTableInput( + src_nodes=(task.src_entity_table, src_node_idx), + dst_nodes=(task.dst_entity_table, dst_node_indices), + num_dst_nodes=task.num_dst_nodes, + src_time=time, + ) \ No newline at end of file diff --git a/plexe/relbench/modeling/loader.py b/plexe/relbench/modeling/loader.py new file mode 100644 index 00000000..fa4e60ad --- /dev/null +++ b/plexe/relbench/modeling/loader.py @@ -0,0 +1,286 @@ +import random +from typing import Dict, Iterator, List, Optional, Tuple, Union + +import torch +from torch import Tensor +from torch.utils.data import DataLoader, Dataset, Sampler +from torch_geometric.data import Data, FeatureStore, GraphStore, HeteroData +from torch_geometric.loader import NodeLoader +from torch_geometric.sampler import NeighborSampler, NodeSamplerInput +from torch_geometric.sampler.base import SubgraphType +from torch_geometric.typing import EdgeType, NodeType, OptTensor + + +def batched_arange(count: Tensor) -> Tuple[Tensor, Tensor]: + r"""Fast implementation of bached version of torch.arange. It essentially does the + following >>> batch = torch.cat([torch.full((c,), i) for i, c in enumerate(count)]) + >>> arange = torch.cat([torch.arange(c) for c in count]) + + Args: + count (Tensor): The count vectors. + + Returns: + batch (Tensor): batch[i] indicates the batch index of + batched_arange[i] + arange (Tensor): batched version of arange + """ + ptr = count.new_zeros(count.numel() + 1) + torch.cumsum(count, dim=0, out=ptr[1:]) + + batch = torch.arange(count.numel(), device=count.device).repeat_interleave( + count, output_size=ptr[-1] + ) # type: ignore + + arange = torch.arange(batch.numel(), device=count.device) + arange -= ptr[batch] + + return batch, arange + + +class SparseTensor: + r"""Sparse CSR tensor object that allows fast row tensor indexing.""" + + def __init__( + self, + sparse_tensor: Tensor, + device: Optional[Union[str, torch.device]] = None, + ): + assert sparse_tensor.layout == torch.sparse_csr + self._size = sparse_tensor.size() + self._crow_indices = sparse_tensor.crow_indices().to(device) + self._col_indices = sparse_tensor.col_indices().to(device) + + def __getitem__(self, indices: Tensor) -> Tuple[Tensor, Tensor]: + r"""Given a tensor of row indices, return a tuple of tensors. + + - :obj:`row_batch` (Tensor): Batch offset for column indices. + - :obj:`col_index` (Tensor): Column indices. + Specifically, :obj:`sparse_tensor[indices[i]]` can be obtained by + :obj:`col_index[row_batch == i]`. + """ + if not (indices < self.size()[0]).all(): + raise IndexError( + f"The index {indices.max()} is out-of-range. 
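# Quick concrete check of batched_arange from above (import path assumed from this diff):
import torch
from plexe.relbench.modeling.loader import batched_arange

count = torch.tensor([2, 0, 3])
batch, arange = batched_arange(count)
assert batch.tolist() == [0, 0, 2, 2, 2]   # which row each position belongs to
assert arange.tolist() == [0, 1, 0, 1, 2]  # position within its row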
Needs to be smaller " + f"than {{self.size()[0]}}." + ) + count = self._crow_indices[indices + 1] - self._crow_indices[indices] + row_batch, arange = batched_arange(count) + col_index = self._col_indices[arange + self._crow_indices[indices][row_batch]] + return row_batch, col_index + + def size(self) -> torch.Size: + return self._size + + +class CustomNodeLoader(NodeLoader): + + def get_neighbors( + self, + input_data: NodeSamplerInput, + ) -> Union[Data, HeteroData]: + r"""Samples a subgraph from a batch of input nodes.""" + out = self.node_sampler.sample_from_nodes(input_data) + out = self.filter_fn(out) + return out + + +class TimestampSampler(Sampler[int]): + r"""A TimestampSampler that samples rows from the same timestamp.""" + + def __init__( + self, + timestamp: Tensor, + batch_size: int, + ): + super().__init__() + self.batch_size = batch_size + self.time_dict = { + int(time): (timestamp == time).nonzero().view(-1) + for time in timestamp.unique() + } + self.num_batches = sum( + [indices.numel() // batch_size for indices in self.time_dict.values()] + ) + + def __iter__(self) -> Iterator[List[int]]: + all_batches = [] + for indices in self.time_dict.values(): + # Random shuffle values: + indices = indices[torch.randperm(indices.numel())] + batches = torch.split(indices, self.batch_size) + for batch in batches: + if len(batch) < self.batch_size: + continue + else: + all_batches.append(batch.tolist()) + + random.shuffle(all_batches) + + for batch in all_batches: + yield batch + + def __len__(self) -> int: + return self.num_batches + + +class CustomLinkDataset(Dataset): + r"""A custom link prediction dataset. + + Sample source nodes, time, and one positive destination node. + """ + + def __init__( + self, + src_node_indices: Tensor, + dst_node_indices: Tensor, # CSR sparse matrix + num_dst_nodes: int, + src_time: Tensor, + ): + assert len(src_node_indices) == len(dst_node_indices) and len( + src_node_indices + ) == len(src_time) + self.src_node_indices = src_node_indices + self.dst_node_indices = dst_node_indices + self.num_dst_nodes = num_dst_nodes + self.src_time = src_time + + def __getitem__(self, index) -> Tensor: + r"""Returns 1-dim tensor of size 3. + + - source node index + - positive destination node index + - source node time + """ + return torch.tensor( + [ + self.src_node_indices[index], + random.choice(self.dst_node_indices[index].indices()[0]), + self.src_time[index], + ] + ) + + def __len__(self): + return len(self.src_node_indices) + + +class LinkNeighborLoader(DataLoader): + r"""A custom neighbor loader for link prediction. + Based on https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/loader/neighbor_loader.html + + Args: + src_nodes (Tuple[NodeType, Tensor]): A tensor of source node indices. + dst_nodes (Tuple[NodeType, Tensor]): A csr sparse tensor, where + dst_nodes[index] is a list of destination node indices + for src_nodes[index] at src_time[index]. + num_dst_nodes (int): Total number of destination nodes. Used to + determine the range of negative samples. + src_time (torch.Tensor, optional): Optional values to override the + timestamp for the input nodes given in :obj:`input_nodes`. If not + set, will use the timestamps in :obj:`time_attr` as default (if + present). The :obj:`time_attr` needs to be set for this to work. 
+ (default: :obj:`None`) + share_same_time (bool): Whether to share the seed time within mini-batch + or not (default: :obj:`False`) + """ + + def __init__( + self, + data: Union[Data, HeteroData, Tuple[FeatureStore, GraphStore]], + num_neighbors: Union[List[int], Dict[EdgeType, List[int]]], + src_nodes: Tuple[NodeType, Tensor], + dst_nodes: Tuple[NodeType, Tensor], + num_dst_nodes: int, + src_time: OptTensor = None, + share_same_time: bool = False, + subgraph_type: Union[SubgraphType, str] = "directional", + temporal_strategy: str = "uniform", + time_attr: Optional[str] = None, + **kwargs, + ): + node_sampler = NeighborSampler( + data, + num_neighbors=num_neighbors, + subgraph_type=subgraph_type, + disjoint=True, + temporal_strategy=temporal_strategy, + time_attr=time_attr, + share_memory=kwargs.get("num_workers", 0) > 0, + ) + + self.data = data + self.src_nodes = src_nodes + self.dst_nodes = dst_nodes + self.num_dst_nodes = num_dst_nodes + self.src_time = src_time + self.share_same_time = share_same_time + + kwargs.pop("dataset", None) + kwargs.pop("collate_fn", None) + if share_same_time: + kwargs.pop("sampler", None) + kwargs["batch_sampler"] = TimestampSampler( + src_time, + kwargs["batch_size"], + ) + kwargs.pop("batch_size", None) + + dataset = CustomLinkDataset( + self.src_nodes[1], + dst_nodes[1], + num_dst_nodes, + src_time, + ) + + self.src_node_type = self.src_nodes[0] + self.dst_node_type = self.dst_nodes[0] + self.src_loader = CustomNodeLoader( + data, + node_sampler, + src_nodes[0], + ) + self.dst_loader = CustomNodeLoader( + data, + node_sampler, + dst_nodes[0], + ) + + super().__init__(dataset, collate_fn=self.collate_fn, **kwargs) + + def collate_fn( + self, + index: Tensor, + ) -> Tuple[HeteroData, HeteroData, HeteroData]: + r"""Samples a subgraph from a batch of input nodes.""" + index = torch.stack(index) + src_indices = index[:, 0].contiguous() + pos_dst_indices = index[:, 1].contiguous() + time = index[:, 2].contiguous() + neg_dst_indices = torch.randint(0, self.num_dst_nodes, size=(len(src_indices),)) + src_out = self.src_loader.get_neighbors( + NodeSamplerInput( + input_id=src_indices, + node=src_indices, + time=time, + input_type=self.src_node_type, + ) + ) + + pos_dst_out = self.dst_loader.get_neighbors( + NodeSamplerInput( + input_id=pos_dst_indices, + node=pos_dst_indices, + time=time, + input_type=self.dst_node_type, + ) + ) + + neg_dst_out = self.dst_loader.get_neighbors( + NodeSamplerInput( + input_id=neg_dst_indices, + node=neg_dst_indices, + time=time, + input_type=self.dst_node_type, + ) + ) + + return src_out, pos_dst_out, neg_dst_out \ No newline at end of file diff --git a/plexe/relbench/modeling/nn.py b/plexe/relbench/modeling/nn.py new file mode 100644 index 00000000..a0a385e0 --- /dev/null +++ b/plexe/relbench/modeling/nn.py @@ -0,0 +1,173 @@ +from typing import Any, Dict, List, Optional + +import torch +import torch_frame +from torch import Tensor +from torch_frame.data.stats import StatType +from torch_frame.nn.models import ResNet +from torch_geometric.nn import HeteroConv, LayerNorm, PositionalEncoding, SAGEConv +from torch_geometric.typing import EdgeType, NodeType + + +class HeteroEncoder(torch.nn.Module): + r"""HeteroEncoder based on PyTorch Frame. + + Args: + channels (int): The output channels for each node type. + node_to_col_names_dict (Dict[NodeType, Dict[torch_frame.stype, List[str]]]): + A dictionary mapping from node type to column names dictionary + compatible to PyTorch Frame. + torch_frame_model_cls: Model class for PyTorch Frame. 
The class object + takes :class:`TensorFrame` object as input and outputs + :obj:`channels`-dimensional embeddings. Default to + :class:`torch_frame.nn.ResNet`. + torch_frame_model_kwargs (Dict[str, Any]): Keyword arguments for + :class:`torch_frame_model_cls` class. Default keyword argument is + set specific for :class:`torch_frame.nn.ResNet`. Expect it to + be changed for different :class:`torch_frame_model_cls`. + default_stype_encoder_cls_kwargs (Dict[torch_frame.stype, Any]): + A dictionary mapping from :obj:`torch_frame.stype` object into a + tuple specifying :class:`torch_frame.nn.StypeEncoder` class and its + keyword arguments :obj:`kwargs`. + """ + + def __init__( + self, + channels: int, + node_to_col_names_dict: Dict[NodeType, Dict[torch_frame.stype, List[str]]], + node_to_col_stats: Dict[NodeType, Dict[str, Dict[StatType, Any]]], + torch_frame_model_cls=ResNet, + torch_frame_model_kwargs: Dict[str, Any] = { + "channels": 128, + "num_layers": 4, + }, + default_stype_encoder_cls_kwargs: Dict[torch_frame.stype, Any] = { + torch_frame.categorical: (torch_frame.nn.EmbeddingEncoder, {}), + torch_frame.numerical: (torch_frame.nn.LinearEncoder, {}), + torch_frame.multicategorical: ( + torch_frame.nn.MultiCategoricalEmbeddingEncoder, + {}, + ), + torch_frame.embedding: (torch_frame.nn.LinearEmbeddingEncoder, {}), + torch_frame.timestamp: (torch_frame.nn.TimestampEncoder, {}), + }, + ): + super().__init__() + + self.encoders = torch.nn.ModuleDict() + + for node_type in node_to_col_names_dict.keys(): + stype_encoder_dict = { + stype: default_stype_encoder_cls_kwargs[stype][0]( + **default_stype_encoder_cls_kwargs[stype][1] + ) + for stype in node_to_col_names_dict[node_type].keys() + } + torch_frame_model = torch_frame_model_cls( + **torch_frame_model_kwargs, + out_channels=channels, + col_stats=node_to_col_stats[node_type], + col_names_dict=node_to_col_names_dict[node_type], + stype_encoder_dict=stype_encoder_dict, + ) + self.encoders[node_type] = torch_frame_model + + def reset_parameters(self): + for encoder in self.encoders.values(): + encoder.reset_parameters() + + def forward( + self, + tf_dict: Dict[NodeType, torch_frame.TensorFrame], + ) -> Dict[NodeType, Tensor]: + x_dict = { + node_type: self.encoders[node_type](tf) for node_type, tf in tf_dict.items() + } + return x_dict + + +class HeteroTemporalEncoder(torch.nn.Module): + def __init__(self, node_types: List[NodeType], channels: int): + super().__init__() + + self.encoder_dict = torch.nn.ModuleDict( + {node_type: PositionalEncoding(channels) for node_type in node_types} + ) + self.lin_dict = torch.nn.ModuleDict( + {node_type: torch.nn.Linear(channels, channels) for node_type in node_types} + ) + + def reset_parameters(self): + for encoder in self.encoder_dict.values(): + encoder.reset_parameters() + for lin in self.lin_dict.values(): + lin.reset_parameters() + + def forward( + self, + seed_time: Tensor, + time_dict: Dict[NodeType, Tensor], + batch_dict: Dict[NodeType, Tensor], + ) -> Dict[NodeType, Tensor]: + out_dict: Dict[NodeType, Tensor] = {} + + for node_type, time in time_dict.items(): + rel_time = seed_time[batch_dict[node_type]] - time + rel_time = rel_time / (60 * 60 * 24) # Convert seconds to days. 
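+            # rel_time measures how many days each sampled node's timestamp
+            # precedes the seed time of its mini-batch (e.g. a node observed
+            # 12 hours before the seed time yields rel_time = 0.5). The
+            # positional encoding below turns this offset into a dense feature,
+            # which is then refined by a per-node-type linear layer.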
+ + x = self.encoder_dict[node_type](rel_time) + x = self.lin_dict[node_type](x) + out_dict[node_type] = x + + return out_dict + + +class HeteroGraphSAGE(torch.nn.Module): + def __init__( + self, + node_types: List[NodeType], + edge_types: List[EdgeType], + channels: int, + aggr: str = "mean", + num_layers: int = 2, + ): + super().__init__() + + self.convs = torch.nn.ModuleList() + for _ in range(num_layers): + conv = HeteroConv( + { + edge_type: SAGEConv((channels, channels), channels, aggr=aggr) + for edge_type in edge_types + }, + aggr="sum", + ) + self.convs.append(conv) + + self.norms = torch.nn.ModuleList() + for _ in range(num_layers): + norm_dict = torch.nn.ModuleDict() + for node_type in node_types: + norm_dict[node_type] = LayerNorm(channels, mode="node") + self.norms.append(norm_dict) + + def reset_parameters(self): + for conv in self.convs: + conv.reset_parameters() + for norm_dict in self.norms: + for norm in norm_dict.values(): + norm.reset_parameters() + + def forward( + self, + x_dict: Dict[NodeType, Tensor], + edge_index_dict: Dict[NodeType, Tensor], + num_sampled_nodes_dict: Optional[Dict[NodeType, List[int]]] = None, + num_sampled_edges_dict: Optional[Dict[EdgeType, List[int]]] = None, + ) -> Dict[NodeType, Tensor]: + for _, (conv, norm_dict) in enumerate(zip(self.convs, self.norms)): + x_dict = conv(x_dict, edge_index_dict) + x_dict = {key: norm_dict[key](x) for key, x in x_dict.items()} + x_dict = {key: x.relu() for key, x in x_dict.items()} + + return x_dict \ No newline at end of file diff --git a/plexe/relbench/modeling/utils.py b/plexe/relbench/modeling/utils.py new file mode 100644 index 00000000..3b84913b --- /dev/null +++ b/plexe/relbench/modeling/utils.py @@ -0,0 +1,52 @@ +from typing import Any, Dict + +import numpy as np +import pandas as pd +from torch_frame import stype +from torch_frame.utils import infer_df_stype + +from ..base import Database, Table + + +def to_unix_time(ser: pd.Series) -> np.ndarray: + r"""Converts a :class:`pandas.Timestamp` series to UNIX timestamp (in seconds).""" + assert ser.dtype in [np.dtype("datetime64[s]"), np.dtype("datetime64[ns]")] + unix_time = ser.astype("int64").values + if ser.dtype == np.dtype("datetime64[ns]"): + unix_time //= 10**9 + return unix_time + + +def remove_pkey_fkey(col_to_stype: Dict[str, Any], table: Table) -> dict: + r"""Remove pkey, fkey columns since they will not be used as input feature.""" + if table.pkey_col is not None: + if table.pkey_col in col_to_stype: + col_to_stype.pop(table.pkey_col) + for fkey in table.fkey_col_to_pkey_table.keys(): + if fkey in col_to_stype: + col_to_stype.pop(fkey) + + +def get_stype_proposal(db: Database) -> Dict[str, Dict[str, stype]]: + r"""Propose stype for columns of a set of tables in the given database. + + Args: + db (Database): The database object containing a set of tables. + + Returns: + Dict[str, Dict[str, Any]]: A dictionary mapping table name into + :obj:`col_to_stype` (mapping column names into inferred stypes). + """ + + inferred_col_to_stype_dict = {} + for table_name, table in db.table_dict.items(): + df = table.df + df = df.sample(min(1_000, len(df))) + inferred_col_to_stype = infer_df_stype(df) + # Hack for now. This is relevant for rel-amazon. 
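+        # infer_df_stype can propose the `embedding` stype for list-valued
+        # columns (the rel-amazon case noted above); the loop below reinterprets
+        # such proposals as `multicategorical` so those columns are encoded as
+        # sets of categories rather than as precomputed embeddings.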
+ for col, stype_ in inferred_col_to_stype.items(): + if stype_.value == "embedding": + inferred_col_to_stype[col] = stype.multicategorical + inferred_col_to_stype_dict[table_name] = inferred_col_to_stype + + return inferred_col_to_stype_dict \ No newline at end of file diff --git a/plexe/relbench/tasks/__init__.py b/plexe/relbench/tasks/__init__.py new file mode 100644 index 00000000..affbe49f --- /dev/null +++ b/plexe/relbench/tasks/__init__.py @@ -0,0 +1,75 @@ +from collections import defaultdict +from functools import lru_cache +from typing import List + +import pooch + +from ..base import BaseTask +from ..datasets import get_dataset +from ..tasks import ( + amazon, + avito, + event, + f1, + hm, + stack, + trial, +) + +task_registry = defaultdict(dict) + +def register_task( + dataset_name: str, + task_name: str, + cls: BaseTask, + *args, + **kwargs, +) -> None: + r"""Register an instantiation of a :class:`BaseTask` subclass with the given name. + + Args: + dataset_name: The name of the dataset. + task_name: The name of the task. + cls: The class of the task. + args: The arguments to instantiate the task. + kwargs: The keyword arguments to instantiate the task. + + The name is used to enable caching and downloading functionalities. + `cache_dir` is added to kwargs by default. If you want to override it, you + can pass `cache_dir` as a keyword argument in `kwargs`. + """ + + cache_dir = f"{pooch.os_cache('relbench')}/{dataset_name}/tasks/{task_name}" + kwargs = {"cache_dir": cache_dir, **kwargs} + task_registry[dataset_name][task_name] = (cls, args, kwargs) + + +def get_task_names(dataset_name: str) -> List[str]: + r"""Return a list of names of the registered tasks for the given dataset.""" + return list(task_registry[dataset_name].keys()) + +@lru_cache(maxsize=None) +def get_task(dataset_name: str, task_name: str, download=False) -> BaseTask: + r"""Return a task object by name. + + Args: + dataset_name: The name of the dataset. + task_name: The name of the task. + download: If True, download the task from the RelBench server. + + Returns: + BaseTask: The task object. + + If `download` is True, the task tables (train, val, test) comprising the + task will be downloaded into the cache from the RelBench server. If you use + `download=False` the first time, the task tables will be computed from + scratch using the database. + + Once the task tables are cached, either because of download or computing from + scratch, the cache will be used. `download=True` will verify that the + cached task tables matches the RelBench version even in this case. 
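+
+    Example (illustrative names; any dataset/task pair registered via
+    `register_task` works):
+
+        >>> task = get_task("rel-f1", "driver-position", download=True)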
+ """ + dataset = get_dataset(dataset_name, download=download) + cls, args, kwargs = task_registry[dataset_name][task_name] + task = cls(dataset, *args, **kwargs) + return task diff --git a/plexe/relbench/tasks/amazon.py b/plexe/relbench/tasks/amazon.py new file mode 100644 index 00000000..6ea91f7f --- /dev/null +++ b/plexe/relbench/tasks/amazon.py @@ -0,0 +1,394 @@ +import duckdb +import pandas as pd + +from ..base import Database, EntityTask, RecommendationTask, Table, TaskType +from ..metrics import ( + accuracy, + average_precision, + f1, + link_prediction_map, + link_prediction_precision, + link_prediction_recall, + mae, + r2, + rmse, + roc_auc, +) + + +class UserChurnTask(EntityTask): + r"""Churn for a customer is 1 if the customer does not review any product in the + time window, else 0.""" + + task_type = TaskType.BINARY_CLASSIFICATION + entity_col = "customer_id" + entity_table = "customer" + time_col = "timestamp" + target_col = "churn" + timedelta = pd.Timedelta(days=365 // 4) + metrics = [average_precision, accuracy, f1, roc_auc] + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + product = db.table_dict["product"].df + customer = db.table_dict["customer"].df + review = db.table_dict["review"].df + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + + df = duckdb.sql( + f""" + SELECT + timestamp, + customer_id, + CAST( + NOT EXISTS ( + SELECT 1 + FROM review + WHERE + review.customer_id = customer.customer_id AND + review_time > timestamp AND + review_time <= timestamp + INTERVAL '{self.timedelta}' + ) AS INTEGER + ) AS churn + FROM + timestamp_df, + customer, + WHERE + EXISTS ( + SELECT 1 + FROM review + WHERE + review.customer_id = customer.customer_id AND + review_time > timestamp - INTERVAL '{self.timedelta}' AND + review_time <= timestamp + ) + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={self.entity_col: self.entity_table}, + pkey_col=None, + time_col=self.time_col, + ) + + +class UserLTVTask(EntityTask): + r"""LTV (life-time value) for a customer is the sum of prices of products that the + customer reviews in the time window.""" + + task_type = TaskType.REGRESSION + entity_col = "customer_id" + entity_table = "customer" + time_col = "timestamp" + target_col = "ltv" + timedelta = pd.Timedelta(days=365 // 4) + metrics = [r2, mae, rmse] + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + product = db.table_dict["product"].df + customer = db.table_dict["customer"].df + review = db.table_dict["review"].df + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + + df = duckdb.sql( + f""" + SELECT + timestamp, + customer_id, + ltv, + FROM + timestamp_df, + customer, + ( + SELECT + COALESCE(SUM(price), 0) as ltv, + FROM + review, + product + WHERE + review.customer_id = customer.customer_id AND + review.product_id = product.product_id AND + review_time > timestamp AND + review_time <= timestamp + INTERVAL '{self.timedelta}' + ) + WHERE + EXISTS ( + SELECT 1 + FROM review + WHERE + review.customer_id = customer.customer_id AND + review_time > timestamp - INTERVAL '{self.timedelta}' AND + review_time <= timestamp + ) + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={"customer_id": "customer"}, + pkey_col=None, + time_col="timestamp", + ) + + +class ItemChurnTask(EntityTask): + r"""Churn for a product is 1 if the product recieves at least one review in the time + window, else 0.""" + + task_type = TaskType.BINARY_CLASSIFICATION + entity_col = "product_id" + 
entity_table = "product" + time_col = "timestamp" + target_col = "churn" + timedelta = pd.Timedelta(days=365 // 4) + metrics = [average_precision, accuracy, f1, roc_auc] + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + product = db.table_dict["product"].df + customer = db.table_dict["customer"].df + review = db.table_dict["review"].df + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + + df = duckdb.sql( + f""" + SELECT + timestamp, + product_id, + CAST( + NOT EXISTS ( + SELECT 1 + FROM review + WHERE + review.product_id = product.product_id AND + review_time > timestamp AND + review_time <= timestamp + INTERVAL '{self.timedelta}' + ) AS INTEGER + ) AS churn + FROM + timestamp_df, + product, + WHERE + EXISTS ( + SELECT 1 + FROM review + WHERE + review.product_id = product.product_id AND + review_time > timestamp - INTERVAL '{self.timedelta}' AND + review_time <= timestamp + ) + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={self.entity_col: self.entity_table}, + pkey_col=None, + time_col=self.time_col, + ) + + +class ItemLTVTask(EntityTask): + r"""LTV (life-time value) for a product is the numer of times the product is + purchased in the time window multiplied by price.""" + + task_type = TaskType.REGRESSION + entity_col = "product_id" + entity_table = "product" + time_col = "timestamp" + target_col = "ltv" + timedelta = pd.Timedelta(days=365 // 4) + metrics = [r2, mae, rmse] + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + product = db.table_dict["product"].df + customer = db.table_dict["customer"].df + review = db.table_dict["review"].df + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + + df = duckdb.sql( + f""" + SELECT + timestamp, + product.product_id, + COALESCE(SUM(price), 0) AS ltv, + FROM + timestamp_df, + product, + review + WHERE + review.product_id = product.product_id AND + review_time > timestamp AND + review_time <= timestamp + INTERVAL '{self.timedelta}' + GROUP BY + timestamp, + product.product_id + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={self.entity_col: self.entity_table}, + pkey_col=None, + time_col="timestamp", + ) + + +class UserItemPurchaseTask(RecommendationTask): + r"""Predict the list of distinct items each customer will purchase in the next two + years.""" + + task_type = TaskType.LINK_PREDICTION + src_entity_col = "customer_id" + src_entity_table = "customer" + dst_entity_col = "product_id" + dst_entity_table = "product" + time_col = "timestamp" + timedelta = pd.Timedelta(days=365 // 4) + metrics = [link_prediction_precision, link_prediction_recall, link_prediction_map] + eval_k = 10 + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + customer = db.table_dict["customer"].df + review = db.table_dict["review"].df + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + + df = duckdb.sql( + f""" + SELECT + t.timestamp, + review.customer_id, + LIST(DISTINCT review.product_id) AS product_id + FROM + timestamp_df t + LEFT JOIN + review + ON + review.review_time > t.timestamp AND + review.review_time <= t.timestamp + INTERVAL '{self.timedelta} days' + WHERE + review.customer_id is not null and review.product_id is not null + GROUP BY + t.timestamp, + review.customer_id + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={ + self.src_entity_col: self.src_entity_table, + self.dst_entity_col: self.dst_entity_table, + }, + pkey_col=None, + time_col=self.time_col, + ) + + +class 
UserItemRateTask(RecommendationTask): + r"""Predict the list of distinct items each customer will purchase and give a 5 star + review in the next two years.""" + + task_type = TaskType.LINK_PREDICTION + src_entity_col = "customer_id" + src_entity_table = "customer" + dst_entity_col = "product_id" + dst_entity_table = "product" + time_col = "timestamp" + timedelta = pd.Timedelta(days=365 // 4) + metrics = [link_prediction_precision, link_prediction_recall, link_prediction_map] + eval_k = 10 + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + customer = db.table_dict["customer"].df + review = db.table_dict["review"].df + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + + df = duckdb.sql( + f""" + SELECT + t.timestamp, + review.customer_id, + LIST(DISTINCT review.product_id) AS product_id + FROM + timestamp_df t + LEFT JOIN + review + ON + review.review_time > t.timestamp AND + review.review_time <= t.timestamp + INTERVAL '{self.timedelta} days' + WHERE + review.customer_id IS NOT NULL + AND review.product_id IS NOT NULL + AND review.rating = 5.0 + GROUP BY + t.timestamp, + review.customer_id + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={ + self.src_entity_col: self.src_entity_table, + self.dst_entity_col: self.dst_entity_table, + }, + pkey_col=None, + time_col=self.time_col, + ) + + +class UserItemReviewTask(RecommendationTask): + r"""Predict the list of distinct items each customer will purchase and give a + detailed review in the next two years.""" + + task_type = TaskType.LINK_PREDICTION + src_entity_col = "customer_id" + src_entity_table = "customer" + dst_entity_col = "product_id" + dst_entity_table = "product" + time_col = "timestamp" + timedelta = pd.Timedelta(days=365 // 4) + metrics = [link_prediction_precision, link_prediction_recall, link_prediction_map] + eval_k = 10 + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + customer = db.table_dict["customer"].df + review = db.table_dict["review"].df + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + + REVIEW_LENGTH = ( + 300 # minimum length of review to be considered as detailed review + ) + + df = duckdb.sql( + f""" + SELECT + t.timestamp, + review.customer_id, + LIST(DISTINCT review.product_id) AS product_id + FROM + timestamp_df t + LEFT JOIN + review + ON + review.review_time > t.timestamp AND + review.review_time <= t.timestamp + INTERVAL '{self.timedelta} days' + WHERE + review.customer_id IS NOT NULL + AND review.product_id IS NOT NULL + AND (LENGTH(review.review_text) > {REVIEW_LENGTH} AND review.review_text IS NOT NULL) + GROUP BY + t.timestamp, + review.customer_id + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={ + self.src_entity_col: self.src_entity_table, + self.dst_entity_col: self.dst_entity_table, + }, + pkey_col=None, + time_col=self.time_col, + ) \ No newline at end of file diff --git a/plexe/relbench/tasks/avito.py b/plexe/relbench/tasks/avito.py new file mode 100644 index 00000000..101b7938 --- /dev/null +++ b/plexe/relbench/tasks/avito.py @@ -0,0 +1,226 @@ +import duckdb +import pandas as pd + +from ..base import Database, EntityTask, RecommendationTask, Table, TaskType +from ..metrics import ( + accuracy, + average_precision, + f1, + link_prediction_map, + link_prediction_precision, + link_prediction_recall, + mae, + r2, + rmse, + roc_auc, +) + + +class AdCTRTask(EntityTask): + r"""Assuming the ad will be clicked in the next 4 days, predict the Click-Through- + Rate (CTR) for each 
ad.""" + + task_type = TaskType.REGRESSION + entity_table = "AdsInfo" + entity_col = "AdID" + time_col = "timestamp" + target_col = "num_click" + timedelta = pd.Timedelta(days=4) + metrics = [r2, mae, rmse] + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + ads_info = db.table_dict["AdsInfo"].df + search_stream = db.table_dict["SearchStream"].df + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + df = duckdb.sql( + f""" + SELECT + search_ads.AdID, + t.timestamp, + COALESCE(SUM(search_ads.isClick), 0) / COALESCE(COUNT(search_ads.SearchID), 1) AS num_click + FROM + timestamp_df t + LEFT JOIN ( + ads_info + LEFT JOIN + search_stream + ON + ads_info.AdID == search_stream.AdID + ) search_ads + ON + search_ads.SearchDate > t.timestamp AND + search_ads.SearchDate <= t.timestamp + INTERVAL '{self.timedelta} days' + GROUP BY + t.timestamp, + search_ads.AdID + HAVING + SUM(search_ads.isClick) > 0 + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={self.entity_col: self.entity_table}, + pkey_col=None, + time_col="timestamp", + ) + + +class UserVisitsTask(EntityTask): + r"""Predict whether each customer will visit more than one ad in the next 4 days.""" + + task_type = TaskType.BINARY_CLASSIFICATION + entity_table = "UserInfo" + entity_col = "UserID" + time_col = "timestamp" + target_col = "num_click" + timedelta = pd.Timedelta(days=4) + metrics = [average_precision, accuracy, f1, roc_auc] + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + user_info = db.table_dict["UserInfo"].df + visits_stream = db.table_dict["VisitStream"].df + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + df = duckdb.sql( + f""" + SELECT + visit_ads.UserID, + t.timestamp, + COALESCE(COUNT(DISTINCT visit_ads.AdID), 0) > 1 AS num_click + FROM + timestamp_df t + LEFT JOIN + ( + user_info + LEFT JOIN + visits_stream + ON + user_info.UserID == visits_stream.UserID + ) visit_ads + ON + visit_ads.ViewDate > t.timestamp AND + visit_ads.ViewDate <= t.timestamp + INTERVAL '{self.timedelta} days' + GROUP BY + t.timestamp, + visit_ads.UserID + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={self.entity_col: self.entity_table}, + pkey_col=None, + time_col="timestamp", + ) + + +class UserClicksTask(EntityTask): + r"""Predict whether the each customer will click on more than one ads in the next 4 + days.""" + + task_type = TaskType.BINARY_CLASSIFICATION + entity_table = "UserInfo" + entity_col = "UserID" + time_col = "timestamp" + target_col = "num_click" + timedelta = pd.Timedelta(days=4) + metrics = [average_precision, accuracy, f1, roc_auc] + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + user_info = db.table_dict["UserInfo"].df + search_info = db.table_dict["SearchInfo"].df + search_stream = db.table_dict["SearchStream"].df + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + df = duckdb.sql( + f""" + SELECT + search_ads.UserID, + t.timestamp, + COALESCE(COUNT(search_ads.AdID), 0) > 1 AS num_click + FROM + timestamp_df t + LEFT JOIN + ( + ( + user_info + LEFT JOIN + search_info + ON + user_info.UserID == search_info.UserID + ) user_search_info + LEFT JOIN + search_stream + ON + user_search_info.SearchID == search_stream.SearchID AND + search_stream.IsClick == 1.0 + ) search_ads + ON + search_ads.SearchDate > t.timestamp AND + search_ads.SearchDate <= t.timestamp + INTERVAL '{self.timedelta} days' + GROUP BY + t.timestamp, + search_ads.UserID + """ + ).df() + + return 
Table( + df=df, + fkey_col_to_pkey_table={self.entity_col: self.entity_table}, + pkey_col=None, + time_col="timestamp", + ) + + +class UserAdVisitTask(RecommendationTask): + r"""Predict the distinct list of ads a user will visit in the next 4 days.""" + + task_type = TaskType.LINK_PREDICTION + src_entity_table = "UserInfo" + src_entity_col = "UserID" + + dst_entity_table = "AdsInfo" + dst_entity_col = "AdID" + + time_col = "timestamp" + timedelta = pd.Timedelta(days=4) + metrics = [link_prediction_precision, link_prediction_recall, link_prediction_map] + eval_k = 12 + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + user_info = db.table_dict["UserInfo"].df + visits_stream = db.table_dict["VisitStream"].df + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + + df = duckdb.sql( + f""" + SELECT + visit_ads.UserID, + t.timestamp, + LIST(DISTINCT visit_ads.AdID) AS AdID, + FROM + timestamp_df t + LEFT JOIN + ( + user_info + LEFT JOIN + visits_stream + ON + user_info.UserID == visits_stream.UserID + ) visit_ads + ON + visit_ads.ViewDate > t.timestamp AND + visit_ads.ViewDate <= t.timestamp + INTERVAL '{self.timedelta} days' + GROUP BY + t.timestamp, + visit_ads.UserID + """ + ).df() + return Table( + df=df, + fkey_col_to_pkey_table={ + self.src_entity_col: self.src_entity_table, + self.dst_entity_col: self.dst_entity_table, + }, + pkey_col=None, + time_col=self.time_col, + ) \ No newline at end of file diff --git a/plexe/relbench/tasks/event.py b/plexe/relbench/tasks/event.py new file mode 100644 index 00000000..10d42e92 --- /dev/null +++ b/plexe/relbench/tasks/event.py @@ -0,0 +1,189 @@ +import duckdb +import pandas as pd + +from ..base import Database, EntityTask, Table, TaskType +from ..metrics import accuracy, average_precision, f1, mae, r2, rmse, roc_auc + + +class UserAttendanceTask(EntityTask): + r"""Predict the number of events a user will go to in the next seven days 7 days.""" + + task_type = TaskType.REGRESSION + entity_col = "user" + entity_table = "users" + time_col = "timestamp" + timedelta = pd.Timedelta(days=7) + metrics = [r2, mae, rmse] + target_col = "target" + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + users = db.table_dict["users"].df + user_friends = db.table_dict["user_friends"].df + events = db.table_dict["events"].df + event_attendees = db.table_dict["event_attendees"].df + event_interest = db.table_dict["event_interest"].df + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + + df = duckdb.sql( + f"""SELECT + t.timestamp, + event_attendees.user_id AS user, + SUM(CASE WHEN event_attendees.status IN ('yes', 'maybe') THEN 1 ELSE 0 END) AS target + FROM + timestamp_df t + LEFT JOIN + event_attendees + ON + event_attendees.start_time > t.timestamp AND + event_attendees.start_time <= t.timestamp + INTERVAL '{self.timedelta} days' + GROUP BY + t.timestamp, + event_attendees.user_id + """ + ).df() + df = df.dropna(subset=["user"]) + df["user"] = df["user"].astype(int) + df = df.reset_index() + + return Table( + df=df, + fkey_col_to_pkey_table={ + self.entity_col: self.entity_table, + }, + pkey_col=None, + time_col=self.time_col, + ) + + +class UserRepeatTask(EntityTask): + r"""Predict whether a user will attend an event in the next 7 days if they have + already attended an event in the last 14 days.""" + + task_type = TaskType.BINARY_CLASSIFICATION + entity_col = "user" + entity_table = "users" + time_col = "timestamp" + timedelta = pd.Timedelta(days=7) + metrics = [accuracy, 
average_precision, f1, roc_auc] + target_col = "target" + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + users = db.table_dict["users"].df + user_friends = db.table_dict["user_friends"].df + events = db.table_dict["events"].df + event_attendees = db.table_dict["event_attendees"].df + event_interest = db.table_dict["event_interest"].df + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + eval_timestamp_len = len(timestamp_df) + if len(timestamp_df) == 1: + new_row = pd.DataFrame( + { + "timestamp": [ + timestamps[0] - self.timedelta * 2, + timestamps[0] - self.timedelta, + ] + } + ) + timestamp_df = pd.concat([new_row, timestamp_df], ignore_index=True) + + df = duckdb.sql( + f""" + WITH tb AS( + SELECT + t.timestamp AS timestamp, + event_attendees.user_id AS user, + MAX(CASE WHEN event_attendees.status IN ('yes', 'maybe') THEN 1 ELSE 0 END) AS target, + MAX(MAX(CASE WHEN event_attendees.status IN ('yes', 'maybe') THEN 1 ELSE 0 END)) OVER (PARTITION BY event_attendees.user_id ORDER BY t.timestamp ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) as prev_target + FROM + timestamp_df t + LEFT JOIN + event_attendees + ON + event_attendees.start_time > t.timestamp AND + event_attendees.start_time <= t.timestamp + INTERVAL '{self.timedelta} days' + GROUP BY + t.timestamp, + event_attendees.user_id + ) + SELECT + timestamp, + user, + target + FROM + tb + WHERE + prev_target = 1; + """ + ).df() + + if eval_timestamp_len == 1: + df = df[df.timestamp == df.timestamp.max()] + + df = df.dropna(subset=["user"]) + df["user"] = df["user"].astype(int) + df = df.reset_index() + return Table( + df=df, + fkey_col_to_pkey_table={ + self.entity_col: self.entity_table, + }, + pkey_col=None, + time_col=self.time_col, + ) + + +class UserIgnoreTask(EntityTask): + r"""Predict whether a user will ignore more than 2 event invitations in the next 7 + days.""" + + task_type = TaskType.BINARY_CLASSIFICATION + entity_col = "user" + entity_table = "users" + time_col = "timestamp" + timedelta = pd.Timedelta(days=7) + metrics = [accuracy, average_precision, f1, roc_auc] + target_col = "target" + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + users = db.table_dict["users"].df + user_friends = db.table_dict["user_friends"].df + events = db.table_dict["events"].df + event_attendees = db.table_dict["event_attendees"].df + event_interest = db.table_dict["event_interest"].df + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + if len(timestamp_df) == 1: + new_row = pd.DataFrame({"timestamp": [timestamps[0] - self.timedelta]}) + timestamp_df = pd.concat([new_row, timestamp_df], ignore_index=True) + + df = duckdb.sql( + f"""SELECT + t.timestamp AS timestamp, + event_attendees.user_id AS user, + CASE + WHEN SUM(CASE WHEN event_attendees.status = 'invited' THEN 1 ELSE 0 END) > 2 THEN 1 + ELSE 0 + END AS target + FROM + timestamp_df t + LEFT JOIN + event_attendees + ON + event_attendees.start_time > t.timestamp AND + event_attendees.start_time <= t.timestamp + INTERVAL '{self.timedelta} days' + GROUP BY + t.timestamp, + event_attendees.user_id + """ + ).df() + + df = df.dropna(subset=["user"]) + df["user"] = df["user"].astype(int) + df = df.reset_index() + return Table( + df=df, + fkey_col_to_pkey_table={ + self.entity_col: self.entity_table, + }, + pkey_col=None, + time_col=self.time_col, + ) \ No newline at end of file diff --git a/plexe/relbench/tasks/f1.py b/plexe/relbench/tasks/f1.py new file mode 100644 index 00000000..0051e33b --- /dev/null +++ 
b/plexe/relbench/tasks/f1.py @@ -0,0 +1,220 @@ +import duckdb +import pandas as pd + +from ..base import Database, EntityTask, RecommendationTask, Table, TaskType +from ..metrics import ( + accuracy, + average_precision, + f1, + link_prediction_map, + link_prediction_precision, + link_prediction_recall, + mae, + r2, + rmse, + roc_auc, +) + + +class DriverPositionTask(EntityTask): + r"""Predict the average finishing position of each driver all races in the next 2 + months.""" + + task_type = TaskType.REGRESSION + entity_col = "driverId" + entity_table = "drivers" + time_col = "date" + target_col = "position" + timedelta = pd.Timedelta(days=60) + metrics = [r2, mae, rmse] + num_eval_timestamps = 40 + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + + results = db.table_dict["results"].df + + df = duckdb.sql( + f""" + SELECT + t.timestamp as date, + re.driverId as driverId, + mean(re.positionOrder) as position, + FROM + timestamp_df t + LEFT JOIN + results re + ON + re.date <= t.timestamp + INTERVAL '{self.timedelta}' + and re.date > t.timestamp + WHERE + re.driverId IN ( + SELECT DISTINCT driverId + FROM results + WHERE date > t.timestamp - INTERVAL '1 year' + ) + GROUP BY t.timestamp, re.driverId + + ; + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={self.entity_col: self.entity_table}, + pkey_col=None, + time_col=self.time_col, + ) + + +class DriverDNFTask(EntityTask): + r"""Predict the if each driver will DNF (not finish) a race in the next 1 month.""" + + task_type = TaskType.BINARY_CLASSIFICATION + entity_col = "driverId" + entity_table = "drivers" + time_col = "date" + target_col = "did_not_finish" + timedelta = pd.Timedelta(days=30) # gr: 30 + metrics = [average_precision, accuracy, f1, roc_auc] + num_eval_timestamps = 40 # gr: 40 + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + + results = db.table_dict["results"].df + + df = duckdb.sql( + f""" + SELECT + t.timestamp as date, + re.driverId as driverId, + MAX(CASE WHEN re.statusId != 1 THEN 1 ELSE 0 END) AS did_not_finish + FROM + timestamp_df t + LEFT JOIN + results re + ON + re.date <= t.timestamp + INTERVAL '{self.timedelta}' + and re.date > t.timestamp + WHERE + re.driverId IN ( + SELECT DISTINCT driverId + FROM results + WHERE date > t.timestamp - INTERVAL '1 year' + ) + GROUP BY t.timestamp, re.driverId + + ; + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={self.entity_col: self.entity_table}, + pkey_col=None, + time_col=self.time_col, + ) + + +class DriverTop3Task(EntityTask): + r"""Predict if each driver will qualify in the top-3 for a race within the next 1 + month.""" + + task_type = TaskType.BINARY_CLASSIFICATION + entity_col = "driverId" + entity_table = "drivers" + time_col = "date" + target_col = "qualifying" + timedelta = pd.Timedelta(days=30) + metrics = [average_precision, accuracy, f1, roc_auc] + num_eval_timestamps = 40 + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + + qualifying = db.table_dict["qualifying"].df + + df = duckdb.sql( + f""" + SELECT + t.timestamp as date, + qu.driverId as driverId, + CASE + WHEN MIN(qu.position) <= 3 THEN 1 + ELSE 0 + END AS qualifying + FROM + timestamp_df t + LEFT JOIN + qualifying qu + ON + qu.date <= t.timestamp + INTERVAL '{self.timedelta}' + and qu.date 
> t.timestamp + WHERE + qu.driverId IN ( + SELECT DISTINCT driverId + FROM qualifying + WHERE date > t.timestamp - INTERVAL '1 year' + ) + GROUP BY t.timestamp, qu.driverId + + ; + """ + ).df() + + df["qualifying"] = df["qualifying"].astype("int64") + + return Table( + df=df, + fkey_col_to_pkey_table={self.entity_col: self.entity_table}, + pkey_col=None, + time_col=self.time_col, + ) + + +class DriverRaceCompeteTask(RecommendationTask): + r"""Predict in which races a driver will compete in the next 1 year.""" + + task_type = TaskType.LINK_PREDICTION + src_entity_col = "driverId" + src_entity_table = "drivers" + dst_entity_col = "raceId" + dst_entity_table = "races" + target_col = "raceId" + time_col = "date" + timedelta = pd.Timedelta(days=365) + metrics = [link_prediction_precision, link_prediction_recall, link_prediction_map] + eval_k = 10 + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + results = db.table_dict["results"].df + + df = duckdb.sql( + f""" + SELECT + t.timestamp as date, + re.driverId as driverId, + LIST(DISTINCT re.raceId) as raceId + FROM + timestamp_df t + LEFT JOIN + results re + ON + re.date <= t.timestamp + INTERVAL '{self.timedelta}' + and re.date > t.timestamp + GROUP BY t.timestamp, re.driverId + ; + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={ + self.src_entity_col: self.src_entity_table, + self.dst_entity_col: self.dst_entity_table, + }, + pkey_col=None, + time_col=self.time_col, + ) \ No newline at end of file diff --git a/plexe/relbench/tasks/hm.py b/plexe/relbench/tasks/hm.py new file mode 100644 index 00000000..02194734 --- /dev/null +++ b/plexe/relbench/tasks/hm.py @@ -0,0 +1,166 @@ +import duckdb +import pandas as pd + +from ..base import Database, EntityTask, RecommendationTask, Table, TaskType +from ..metrics import ( + accuracy, + average_precision, + f1, + link_prediction_map, + link_prediction_precision, + link_prediction_recall, + mae, + r2, + rmse, + roc_auc, +) + + +class UserItemPurchaseTask(RecommendationTask): + r"""Predict the list of articles each customer will purchase in the next seven + days.""" + + task_type = TaskType.LINK_PREDICTION + src_entity_col = "customer_id" + src_entity_table = "customer" + dst_entity_col = "article_id" + dst_entity_table = "article" + time_col = "timestamp" + timedelta = pd.Timedelta(days=7) + metrics = [link_prediction_precision, link_prediction_recall, link_prediction_map] + eval_k = 12 + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + customer = db.table_dict["customer"].df + transactions = db.table_dict["transactions"].df + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + + df = duckdb.sql( + f""" + SELECT + t.timestamp, + transactions.customer_id, + LIST(DISTINCT transactions.article_id) AS article_id + FROM + timestamp_df t + LEFT JOIN + transactions + ON + transactions.t_dat > t.timestamp AND + transactions.t_dat <= t.timestamp + INTERVAL '{self.timedelta} days' + GROUP BY + t.timestamp, + transactions.customer_id + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={ + self.src_entity_col: self.src_entity_table, + self.dst_entity_col: self.dst_entity_table, + }, + pkey_col=None, + time_col=self.time_col, + ) + + +class UserChurnTask(EntityTask): + r"""Predict the churn for a customer (no transactions) in the next week.""" + + task_type = TaskType.BINARY_CLASSIFICATION + entity_col = "customer_id" + entity_table = "customer" + 
time_col = "timestamp" + target_col = "churn" + timedelta = pd.Timedelta(days=7) + metrics = [average_precision, accuracy, f1, roc_auc] + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + customer = db.table_dict["customer"].df + transactions = db.table_dict["transactions"].df + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + + df = duckdb.sql( + f""" + SELECT + timestamp, + customer_id, + CAST( + NOT EXISTS ( + SELECT 1 + FROM transactions + WHERE + transactions.customer_id = customer.customer_id AND + t_dat > timestamp AND + t_dat <= timestamp + INTERVAL '{self.timedelta}' + ) AS INTEGER + ) AS churn + FROM + timestamp_df, + customer, + WHERE + EXISTS ( + SELECT 1 + FROM transactions + WHERE + transactions.customer_id = customer.customer_id AND + t_dat > timestamp - INTERVAL '{self.timedelta}' AND + t_dat <= timestamp + ) + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={self.entity_col: self.entity_table}, + pkey_col=None, + time_col=self.time_col, + ) + + +class ItemSalesTask(EntityTask): + r"""Predict the total sales for an article (the sum of prices of the associated + transactions) in the next week.""" + + task_type = TaskType.REGRESSION + entity_col = "article_id" + entity_table = "article" + time_col = "timestamp" + target_col = "sales" + timedelta = pd.Timedelta(days=7) + metrics = [r2, mae, rmse] + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + transactions = db.table_dict["transactions"].df + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + article = db.table_dict["article"].df + + df = duckdb.sql( + f""" + SELECT + timestamp, + article_id, + sales + FROM + timestamp_df, + article, + ( + SELECT + COALESCE(SUM(price), 0) as sales + FROM + transactions, + WHERE + transactions.article_id = article.article_id AND + t_dat > timestamp AND + t_dat <= timestamp + INTERVAL '{self.timedelta}' + ) + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={"article_id": "article"}, + pkey_col=None, + time_col="timestamp", + ) \ No newline at end of file diff --git a/plexe/relbench/tasks/stack.py b/plexe/relbench/tasks/stack.py new file mode 100644 index 00000000..275345d2 --- /dev/null +++ b/plexe/relbench/tasks/stack.py @@ -0,0 +1,342 @@ +import duckdb +import pandas as pd + +from ..base import Database, EntityTask, RecommendationTask, Table, TaskType +from ..metrics import ( + accuracy, + average_precision, + f1, + link_prediction_map, + link_prediction_precision, + link_prediction_recall, + mae, + r2, + rmse, + roc_auc, +) + +######## node prediction tasks ######## + + +class UserEngagementTask(EntityTask): + r"""Predict if a user will make any votes/posts/comments in the next 2 years.""" + + task_type = TaskType.BINARY_CLASSIFICATION + entity_col = "OwnerUserId" + entity_table = "users" + time_col = "timestamp" + target_col = "contribution" + timedelta = pd.Timedelta(days=365 // 4) + metrics = [average_precision, accuracy, f1, roc_auc] + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + comments = db.table_dict["comments"].df + votes = db.table_dict["votes"].df + posts = db.table_dict["posts"].df + users = db.table_dict["users"].df + + df = duckdb.sql( + f""" + WITH + ALL_ENGAGEMENT AS ( + SELECT + p.id, + p.owneruserid as userid, + p.creationdate + FROM + posts p + UNION + SELECT + v.id, + v.userid, + v.creationdate + FROM + votes v + UNION + SELECT + c.id, + c.userid, + 
c.creationdate + FROM + comments c + ), + + ACTIVE_USERS AS ( + SELECT + t.timestamp, + u.id, + count(distinct a.id) as n_engagement + FROM timestamp_df t + CROSS JOIN users u + LEFT JOIN all_engagement a + ON u.id = a.UserId + and a.CreationDate <= t.timestamp + WHERE u.id != -1 + GROUP BY t.timestamp, u.id + ) + SELECT + u.timestamp, + u.id as OwnerUserId, + IF(count(distinct a.id) >= 1, 1, 0) as contribution + FROM + active_users u + LEFT JOIN + all_engagement a + ON + u.id = a.UserId AND + a.CreationDate > u.timestamp AND + a.CreationDate <= u.timestamp + INTERVAL '{self.timedelta}' + where + u.n_engagement >= 1 + GROUP BY + u.timestamp, u.id + ; + + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={self.entity_col: self.entity_table}, + pkey_col=None, + time_col=self.time_col, + ) + + +class PostVotesTask(EntityTask): + r"""Predict the number of upvotes that an existing question will receive in the next + 2 years.""" + + task_type = TaskType.REGRESSION + entity_col = "PostId" + entity_table = "posts" + time_col = "timestamp" + target_col = "popularity" + timedelta = pd.Timedelta(days=365 // 4) + metrics = [r2, mae, rmse] + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + votes = db.table_dict["votes"].df + posts = db.table_dict["posts"].df + + df = duckdb.sql( + f""" + SELECT + t.timestamp, + p.id AS PostId, + COUNT(distinct v.id) AS popularity + FROM + timestamp_df t + LEFT JOIN + posts p + ON + p.CreationDate <= t.timestamp AND + p.owneruserid != -1 AND + p.owneruserid is not null AND + p.PostTypeId = 1 + LEFT JOIN + votes v + ON + p.id = v.PostId AND + v.CreationDate > t.timestamp AND + v.CreationDate <= t.timestamp + INTERVAL '{self.timedelta}' AND + v.votetypeid = 2 + GROUP BY + t.timestamp, + p.id + ; + + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={self.entity_col: self.entity_table}, + pkey_col=None, + time_col=self.time_col, + ) + + +class UserBadgeTask(EntityTask): + r"""Predict if each user will receive in a new badge the next 2 years.""" + + task_type = TaskType.BINARY_CLASSIFICATION + entity_col = "UserId" + entity_table = "users" + time_col = "timestamp" + target_col = "WillGetBadge" + timedelta = pd.Timedelta(days=365 // 4) + metrics = [average_precision, accuracy, f1, roc_auc] + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + users = db.table_dict["users"].df + badges = db.table_dict["badges"].df + + df = duckdb.sql( + f""" + SELECT + t.timestamp, + u.Id as UserId, + CASE WHEN + COUNT(b.Id) >= 1 THEN 1 ELSE 0 END AS WillGetBadge + FROM + timestamp_df t + LEFT JOIN + users u + ON + u.CreationDate <= t.timestamp + LEFT JOIN + badges b + ON + u.Id = b.UserID + AND b.Date > t.timestamp + AND b.Date <= t.timestamp + INTERVAL '{self.timedelta}' + GROUP BY + t.timestamp, + u.Id + """ + ).df() + + # remove any IderId rows that are NaN + df = df.dropna(subset=["UserId"]) + df[self.entity_col] = df[self.entity_col].astype( + int + ) # for some reason duckdb returns float64 keys + + return Table( + df=df, + fkey_col_to_pkey_table={self.entity_col: self.entity_table}, + pkey_col=None, + time_col=self.time_col, + ) + + +######## link prediction tasks ######## + + +class UserPostCommentTask(RecommendationTask): + r"""Predict a list of existing posts that a user will comment in the next two + years.""" + + task_type = TaskType.LINK_PREDICTION + 
src_entity_col = "UserId" + src_entity_table = "users" + dst_entity_col = "PostId" + dst_entity_table = "posts" + time_col = "timestamp" + timedelta = pd.Timedelta(days=365 // 4) + metrics = [link_prediction_precision, link_prediction_recall, link_prediction_map] + eval_k = 100 + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + r"""Create Task object for UserCommentOnPostTask.""" + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + + users = db.table_dict["users"].df + posts = db.table_dict["posts"].df + comments = db.table_dict["comments"].df + + df = duckdb.sql( + f""" + SELECT + t.timestamp, + c.UserId as UserId, + LIST(DISTINCT p.id) AS PostId + FROM + timestamp_df t + LEFT JOIN + posts p + ON + p.CreationDate <= t.timestamp + LEFT JOIN + comments c + ON + p.id = c.PostId AND + c.CreationDate > t.timestamp AND + c.CreationDate <= t.timestamp + INTERVAL '{self.timedelta} days' + WHERE + c.UserId is not null AND + p.owneruserid != -1 AND + p.owneruserid is not null + GROUP BY + t.timestamp, + c.UserId + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={ + self.src_entity_col: self.src_entity_table, + self.dst_entity_col: self.dst_entity_table, + }, + pkey_col=None, + time_col=self.time_col, + ) + + +class PostPostRelatedTask(RecommendationTask): + r"""Predict a list of existing posts that users will link a given post to in the + next two years.""" + + task_type = TaskType.LINK_PREDICTION + src_entity_col = "PostId" + src_entity_table = "posts" + dst_entity_col = "postLinksIdList" + dst_entity_table = "posts" + time_col = "timestamp" + timedelta = pd.Timedelta(days=365 // 4) + metrics = [link_prediction_precision, link_prediction_recall, link_prediction_map] + eval_k = 100 + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + r"""Create Task object for UserVoteOnPostTask.""" + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + + posts = db.table_dict["posts"].df + postLinks = db.table_dict["postLinks"].df + + df = duckdb.sql( + f""" + SELECT + t.timestamp, + pl.PostId as PostId, + LIST(DISTINCT pl.RelatedPostId) AS postLinksIdList + FROM + timestamp_df t + LEFT JOIN + postLinks pl + ON + pl.CreationDate > t.timestamp AND + pl.CreationDate <= t.timestamp + INTERVAL '{self.timedelta} days' + LEFT JOIN + posts p1 + ON + pl.PostId = p1.Id + LEFT JOIN + posts p2 + ON + pl.RelatedPostId = p2.Id + WHERE + pl.PostId IS NOT NULL AND + pl.RelatedPostId IS NOT NULL AND + p1.CreationDate <= t.timestamp AND + p2.CreationDate <= t.timestamp + GROUP BY + t.timestamp, + pl.PostId; + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={ + self.src_entity_col: self.src_entity_table, + self.dst_entity_col: self.dst_entity_table, + }, + pkey_col=None, + time_col=self.time_col, + ) \ No newline at end of file diff --git a/plexe/relbench/tasks/trial.py b/plexe/relbench/tasks/trial.py new file mode 100644 index 00000000..33ed9fa7 --- /dev/null +++ b/plexe/relbench/tasks/trial.py @@ -0,0 +1,278 @@ +import duckdb +import pandas as pd + +from ..base import Database, EntityTask, RecommendationTask, Table, TaskType +from ..metrics import ( + accuracy, + average_precision, + f1, + link_prediction_map, + link_prediction_precision, + link_prediction_recall, + mae, + r2, + rmse, + roc_auc, +) + + +class StudyOutcomeTask(EntityTask): + r"""Predict if the trials in the next 1 year will achieve its primary outcome.""" + + task_type = TaskType.BINARY_CLASSIFICATION + entity_col = "nct_id" + entity_table = 
"studies" + time_col = "timestamp" + target_col = "outcome" + timedelta = pd.Timedelta(days=365) + metrics = [average_precision, accuracy, f1, roc_auc] + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + studies = db.table_dict["studies"].df + outcomes = db.table_dict["outcomes"].df + outcome_analyses = db.table_dict["outcome_analyses"].df + + df = duckdb.sql( + f""" + WITH TRIAL_INFO AS ( + SELECT + oa.nct_id, + oa.p_value, + s.start_date, + oa.date + FROM outcome_analyses oa + LEFT JOIN outcomes o + ON oa.outcome_id = o.id + LEFT JOIN studies s + ON s.nct_id = o.nct_id + where (oa.p_value_modifier is null or oa.p_value_modifier != '>') + and oa.p_value >=0 + and oa.p_value <=1 + and o.outcome_type = 'Primary' + ) + + SELECT + t.timestamp, + tr.nct_id, + CASE + WHEN MIN(tr.p_value) <= 0.05 THEN 1 + ELSE 0 + END AS outcome + FROM timestamp_df t + LEFT JOIN TRIAL_INFO tr + ON tr.start_date <= t.timestamp + and tr.date > t.timestamp + and tr.date <= t.timestamp + INTERVAL '{self.timedelta}' + WHERE tr.nct_id is not null + GROUP BY t.timestamp, tr.nct_id; + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={self.entity_col: self.entity_table}, + pkey_col=None, + time_col=self.time_col, + ) + + +class StudyAdverseTask(EntityTask): + r"""Predict the number of affected patients with severe advsere events/death for the + trial in the next 1 year.""" + + task_type = TaskType.REGRESSION + entity_col = "nct_id" + entity_table = "studies" + time_col = "timestamp" + target_col = "num_of_adverse_events" + timedelta = pd.Timedelta(days=365) + metrics = [r2, mae, rmse] + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + reported_event_totals = db.table_dict["reported_event_totals"].df + studies = db.table_dict["studies"].df + + df = duckdb.sql( + f""" + WITH TRIAL_INFO AS ( + SELECT + r.nct_id, + r.event_type, + r.subjects_affected, + r.date, + s.start_date + FROM reported_event_totals r + LEFT JOIN studies s + ON r.nct_id = s.nct_id + WHERE r.event_type = 'serious' or r.event_type = 'deaths' + and r.subjects_affected is not null + ) + + SELECT + t.timestamp, + tr.nct_id, + sum(tr.subjects_affected) AS num_of_adverse_events + FROM timestamp_df t + LEFT JOIN TRIAL_INFO tr + ON tr.start_date <= t.timestamp + and tr.date > t.timestamp + and tr.date <= t.timestamp + INTERVAL '{self.timedelta}' + WHERE tr.nct_id is not null and tr.subjects_affected is not null + GROUP BY t.timestamp, tr.nct_id; + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={self.entity_col: self.entity_table}, + pkey_col=None, + time_col=self.time_col, + ) + + +class SiteSuccessTask(EntityTask): + r"""Predict the success rate of a trial site in the next 1 year.""" + + task_type = TaskType.REGRESSION + entity_col = "facility_id" + entity_table = "facilities" + time_col = "timestamp" + target_col = "success_rate" + timedelta = pd.Timedelta(days=365) + metrics = [r2, mae, rmse] + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + facilities = db.table_dict["facilities"].df + facility_study = db.table_dict["facilities_studies"].df + outcome_analyses = db.table_dict["outcome_analyses"].df + studies = db.table_dict["studies"].df + outcomes = db.table_dict["outcomes"].df + + df = duckdb.sql( + f""" + WITH TRIAL_INFO AS ( + SELECT + 
oa.nct_id, + MIN(CASE WHEN oa.p_value < 0.05 THEN 1 ELSE 0 END) AS is_successful, -- Determine if the trial is successful + oa.date, + FROM outcome_analyses oa + LEFT JOIN outcomes o + ON oa.outcome_id = o.id + WHERE (oa.p_value_modifier is null or oa.p_value_modifier != '>') + and oa.p_value >=0 + and oa.p_value <=1 + and o.outcome_type = 'Primary' + GROUP BY oa.nct_id, oa.date + ) + + SELECT + t.timestamp, + fs.facility_id, + SUM(tr.is_successful)/COUNT(tr.is_successful) AS success_rate + FROM timestamp_df t + LEFT JOIN TRIAL_INFO tr + LEFT JOIN facility_study fs ON fs.nct_id = tr.nct_id + ON tr.date > t.timestamp + and tr.date <= t.timestamp + INTERVAL '{self.timedelta}' + WHERE fs.facility_id is not null + GROUP BY t.timestamp, fs.facility_id; + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={self.entity_col: self.entity_table}, + pkey_col=None, + time_col=self.time_col, + ) + + +class ConditionSponsorRunTask(RecommendationTask): + r"""Predict whether this condition will have which sponsors.""" + + task_type = TaskType.LINK_PREDICTION + src_entity_col = "condition_id" + src_entity_table = "conditions" + dst_entity_col = "sponsor_id" + dst_entity_table = "sponsors" + time_col = "timestamp" + timedelta = pd.Timedelta(days=365) + metrics = [link_prediction_precision, link_prediction_recall, link_prediction_map] + eval_k = 10 + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + sponsors_studies = db.table_dict["sponsors_studies"].df + condition_study = db.table_dict["conditions_studies"].df + + df = duckdb.sql( + f""" + SELECT + t.timestamp, + cs.condition_id, + LIST(DISTINCT ss.sponsor_id) AS sponsor_id + FROM timestamp_df t + LEFT JOIN condition_study cs + LEFT JOIN sponsors_studies ss ON ss.nct_id = cs.nct_id + ON cs.date > t.timestamp + and cs.date <= t.timestamp + INTERVAL '{self.timedelta}' + GROUP BY t.timestamp, cs.condition_id; + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={ + self.src_entity_col: self.src_entity_table, + self.dst_entity_col: self.dst_entity_table, + }, + pkey_col=None, + time_col=self.time_col, + ) + + +class SiteSponsorRunTask(RecommendationTask): + r"""Predict whether this sponsor will have a trial in a facility.""" + + task_type = TaskType.LINK_PREDICTION + src_entity_col = "facility_id" + src_entity_table = "facilities" + dst_entity_col = "sponsor_id" + dst_entity_table = "sponsors" + time_col = "timestamp" + timedelta = pd.Timedelta(days=365) + metrics = [link_prediction_precision, link_prediction_recall, link_prediction_map] + eval_k = 10 + + def make_table(self, db: Database, timestamps: "pd.Series[pd.Timestamp]") -> Table: + timestamp_df = pd.DataFrame({"timestamp": timestamps}) + sponsors_studies = db.table_dict["sponsors_studies"].df + facility_study = db.table_dict["facilities_studies"].df + + df = duckdb.sql( + f""" + SELECT + t.timestamp, + fs.facility_id, + LIST(DISTINCT ss.sponsor_id) AS sponsor_id + FROM timestamp_df t + LEFT JOIN facility_study fs + LEFT JOIN sponsors_studies ss ON ss.nct_id = fs.nct_id + ON fs.date > t.timestamp + and fs.date <= t.timestamp + INTERVAL '{self.timedelta}' + GROUP BY t.timestamp, fs.facility_id; + """ + ).df() + + return Table( + df=df, + fkey_col_to_pkey_table={ + self.src_entity_col: self.src_entity_table, + self.dst_entity_col: self.dst_entity_table, + }, + pkey_col=None, + time_col=self.time_col, + ) \ No newline at end of file diff --git a/plexe/relbench/utils.py 
diff --git a/plexe/relbench/utils.py b/plexe/relbench/utils.py new file mode 100644 index 00000000..dfc20c90 --- /dev/null +++ b/plexe/relbench/utils.py @@ -0,0 +1,69 @@ +import os +import shutil +from pathlib import Path +from typing import Union +from zipfile import ZipFile + +import pandas as pd +import pooch + + +def decompress_gz_file(input_path: str, output_path: str): + import gzip + import shutil + + # Open the gz file in binary read mode + with gzip.open(input_path, "rb") as f_in: + # Open the output file in binary write mode + with open(output_path, "wb") as f_out: + # Copy the decompressed data from the gz file to the output file + shutil.copyfileobj(f_in, f_out) + print(f"Decompressed file saved as: {output_path}") + + +def unzip_processor(fname: Union[str, Path], action: str, pooch: pooch.Pooch) -> Path: + zip_path = Path(fname) + unzip_path = zip_path.parent / zip_path.stem + if action != "fetch": + shutil.unpack_archive(zip_path, unzip_path) + else: # fetch + try: # sanity check if all files are fully extracted comparing size + for f in ZipFile(zip_path).infolist(): + if not f.is_dir(): + fsize = os.path.getsize(os.path.join(unzip_path, f.filename)) + assert f.file_size == fsize + except Exception: # otherwise do full unpack + shutil.unpack_archive(zip_path, unzip_path) + + return unzip_path + + +def clean_datetime(df: pd.DataFrame, col: str) -> pd.DataFrame: + r"""Clean the time column of a pandas dataframe. + Args: + df (pd.DataFrame): The pandas dataframe to clean the time column for. + col (str): The time column name. + + Returns: + (pd.DataFrame): The pandas dataframe with the cleaned time column. + """ + df[col] = pd.to_datetime(df[col], errors="coerce") + + # Count the number of rows before removing invalid dates + total_before = len(df) + + # Remove rows where timestamp is NaT (indicating parsing failure) + df = df.dropna(subset=[col]) + + # Count the number of rows after removing invalid dates + total_after = len(df) + + # Calculate the percentage of rows removed + percentage_removed = ((total_before - total_after) / total_before) * 100 + + # Print the percentage of rows removed + print( + f"Percentage of rows removed due to invalid dates: " + f"{percentage_removed:.2f}%" + ) + return df diff --git a/plexe/server.py b/plexe/server.py index afd4ef1b..9dc0f11b 100644 --- a/plexe/server.py +++ b/plexe/server.py @@ -1,38 +1,95 @@ """ -FastAPI server for the Plexe conversational agent. +FastAPI server for the Plexe LangGraph-based multi-agent system. -This module provides a lightweight WebSocket API for the conversational agent -and serves the assistant-ui frontend for local execution. +This module provides a WebSocket API for real-time chat communication +and serves the frontend for the Plexe ML platform.
""" +import asyncio import json import logging import uuid +from datetime import datetime from pathlib import Path +from typing import Dict, Any, Optional from fastapi import FastAPI, WebSocket, WebSocketDisconnect from fastapi.staticfiles import StaticFiles from fastapi.responses import FileResponse +from fastapi.middleware.cors import CORSMiddleware -from plexe.agents.conversational import ConversationalAgent +from plexe.langgraph import PlexeOrchestrator, AgentConfig +from plexe.langgraph.utils import WebSocketEmitter, MultiEmitter, ConsoleEmitter +from plexe.langgraph.utils.logging_utils import setup_session_logging, session_id_var +from plexe.api import datasets_router logger = logging.getLogger(__name__) -app = FastAPI(title="Plexe Assistant", version="1.0.0") +# Initialize session-based logging +setup_session_logging() + +app = FastAPI(title="Plexe Assistant", version="2.0.0") + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +app.include_router(datasets_router) -# Serve static files from the ui directory ui_dir = Path(__file__).parent / "ui" -if ui_dir.exists(): +frontend_dist = ui_dir / "frontend" / "dist" +if frontend_dist.exists(): + app.mount("/static", StaticFiles(directory=str(frontend_dist)), name="static") +elif ui_dir.exists(): app.mount("/static", StaticFiles(directory=str(ui_dir)), name="static") +class SessionManager: + """Manages active chat sessions.""" + + def __init__(self): + self.sessions: Dict[str, Dict[str, Any]] = {} + + def create_session(self, session_id: str, emitter=None) -> PlexeOrchestrator: + """Create a new session with its own orchestrator.""" + config = AgentConfig.from_env() + orchestrator = PlexeOrchestrator(config=config, verbose=True, emitter=emitter) + self.sessions[session_id] = { + "orchestrator": orchestrator, + "working_dir": f"workdir/session-{session_id}", + } + return orchestrator + + def get_session(self, session_id: str) -> Optional[Dict[str, Any]]: + """Get an existing session.""" + return self.sessions.get(session_id) + + def remove_session(self, session_id: str): + """Remove a session.""" + if session_id in self.sessions: + del self.sessions[session_id] + + +session_manager = SessionManager() + + @app.get("/") async def root(): """Serve the main HTML page.""" - index_path = ui_dir / "index.html" - if index_path.exists(): - return FileResponse(str(index_path)) - return {"error": "Frontend not found. Please ensure plexe/ui/index.html exists."} + built_index = frontend_dist / "index.html" + legacy_index = ui_dir / "index.html" + + if built_index.exists(): + return FileResponse(str(built_index)) + if legacy_index.exists(): + return FileResponse(str(legacy_index)) + return { + "error": "Frontend not found. Please ensure plexe/ui/frontend/dist/index.html exists." 
+ } @app.websocket("/ws") @@ -42,49 +99,228 @@ async def websocket_endpoint(websocket: WebSocket): session_id = str(uuid.uuid4()) logger.info(f"New WebSocket connection: {session_id}") - # Create a new agent instance for this session - agent = ConversationalAgent() + loop = asyncio.get_running_loop() + + ws_emitter = WebSocketEmitter(websocket, loop=loop) + console_emitter = ConsoleEmitter() + multi_emitter = MultiEmitter([ws_emitter, console_emitter]) + + config = AgentConfig.from_env() + + def progress_callback(data: Dict[str, Any]): + """Callback to send progress updates.""" + phase = data.get("phase", "") + agent = data.get("agent", "") + timestamp = data.get("timestamp", "") + if phase: + try: + asyncio.run_coroutine_threadsafe( + websocket.send_json({ + "type": "thinking", + "role": "thinking", + "agent_name": agent or "Orchestrator", + "message": f"Starting phase: {phase}", + "step_number": 0, + "timestamp": timestamp or datetime.now().strftime("%H:%M:%S"), + }), + loop + ) + except Exception as e: + logger.warning(f"Failed to send progress: {e}") + + orchestrator = PlexeOrchestrator( + config=config, + verbose=True, + callback=progress_callback, + emitter=multi_emitter + ) + session_manager.sessions[session_id] = { + "orchestrator": orchestrator, + "working_dir": f"workdir/session-{session_id}", + } + working_dir = f"workdir/session-{session_id}" + + agent_task = None + is_closed = False + + async def send_message(msg_type: str, content: Any, agent: str = ""): + """Send a message to the client.""" + if not is_closed: + try: + await websocket.send_json({ + "type": msg_type, + "content": content, + "agent": agent, + "id": str(uuid.uuid4()), + "session_id": session_id, + }) + except Exception as e: + logger.warning(f"Failed to send message: {e}") + + async def run_agent_task(user_message: str, db_connection: Optional[str] = None): + """Run the orchestrator in a separate thread.""" + nonlocal agent_task + + # Set session ID in the async context for logging + session_id_var.set(session_id) + + try: + await send_message("thinking", "Processing your request...", "Orchestrator") + + def run_sync(): + thread_loop = asyncio.new_event_loop() + asyncio.set_event_loop(thread_loop) + # Set session ID for logging in this thread + session_id_var.set(session_id) + try: + session = session_manager.get_session(session_id) + if not session: + return {"status": "error", "error": "Session not found"} + + orch = session["orchestrator"] + state = orch.get_session_state(session_id) + + if state: + return orch.chat( + message=user_message, + session_id=session_id, + working_dir=working_dir, + ) + else: + return orch.run( + user_message=user_message, + db_connection_string=db_connection, + working_dir=working_dir, + session_id=session_id, + ) + finally: + thread_loop.close() + + result = await loop.run_in_executor(None, run_sync) + + response_text = "" + if result.get("status") == "success": + response_text = result.get("response", "Processing complete.") + elif result.get("status") == "completed": + state = result.get("state", {}) + messages = state.get("messages", []) + for msg in reversed(messages): + if msg.get("role") == "assistant": + response_text = msg.get("content", "") + break + if not response_text: + response_text = "Pipeline completed successfully." 
+ elif result.get("status") == "error": + response_text = f"Error: {result.get('error', 'Unknown error')}" + else: + response_text = result.get("response", str(result)) + + if not is_closed: + await websocket.send_json({ + "role": "assistant", + "content": response_text, + "id": str(uuid.uuid4()), + "phase": result.get("phase", result.get("state", {}).get("current_phase")), + }) + + except asyncio.CancelledError: + logger.info("Agent task cancelled") + raise + except Exception as e: + logger.error(f"Agent error: {e}") + if not is_closed: + await websocket.send_json({ + "role": "assistant", + "content": f"I encountered an error: {str(e)}. Please try again.", + "id": str(uuid.uuid4()), + "error": True, + }) + finally: + agent_task = None try: while True: - # Receive message from client data = await websocket.receive_text() try: message_data = json.loads(data) + + if message_data.get("type") == "ping": + await websocket.send_json({"type": "pong"}) + continue + + if message_data.get("type") == "stop": + logger.info("Stop command received") + if agent_task: + agent_task.cancel() + try: + await agent_task + except asyncio.CancelledError: + pass + agent_task = None + continue + + if message_data.get("type") == "confirmation_response": + confirmed = message_data.get("confirmed", False) + logger.info(f"Received confirmation: {confirmed}") + continue + user_message = message_data.get("content", "") + db_connection = message_data.get("db_connection_string") + + if not user_message: + continue - # Process the message with the agent logger.debug(f"Processing message: {user_message[:100]}...") - response = agent.agent.run(user_message, reset=False) - - # Send response back to client - await websocket.send_json({"role": "assistant", "content": response, "id": str(uuid.uuid4())}) + + if agent_task is None: + agent_task = asyncio.create_task( + run_agent_task(user_message, db_connection) + ) + else: + logger.warning("Agent is already processing") + await websocket.send_json({ + "role": "assistant", + "content": "I'm still processing your previous request. Please wait.", + "id": str(uuid.uuid4()), + }) except json.JSONDecodeError: - # Handle plain text messages for compatibility - response = agent.agent.run(data, reset=False) - await websocket.send_json({"role": "assistant", "content": response, "id": str(uuid.uuid4())}) + if agent_task is None: + agent_task = asyncio.create_task(run_agent_task(data)) except Exception as e: logger.error(f"Error processing message: {e}") - await websocket.send_json( - { + try: + await websocket.send_json({ "role": "assistant", "content": f"I encountered an error: {str(e)}. 
Please try again.", "id": str(uuid.uuid4()), "error": True, - } - ) + }) + except Exception: + pass except WebSocketDisconnect: logger.info(f"WebSocket disconnected: {session_id}") + is_closed = True + if agent_task: + agent_task.cancel() except Exception as e: logger.error(f"WebSocket error for session {session_id}: {e}") - await websocket.close() + is_closed = True + if agent_task: + agent_task.cancel() + try: + await websocket.close() + except Exception: + pass + finally: + is_closed = True + session_manager.remove_session(session_id) @app.get("/health") async def health_check(): """Health check endpoint.""" - return {"status": "healthy", "service": "plexe-assistant"} + return {"status": "healthy", "service": "plexe-assistant", "version": "2.0.0"} diff --git a/plexe/templates/models/feature_transformer.tmpl.py b/plexe/templates/models/feature_transformer.tmpl.py deleted file mode 100644 index 3bf598fc..00000000 --- a/plexe/templates/models/feature_transformer.tmpl.py +++ /dev/null @@ -1,18 +0,0 @@ -import pandas as pd - -# TODO: add any additional required imports here - -from plexe.core.interfaces.feature_transformer import FeatureTransformer - - -class FeatureTransformerImplementation(FeatureTransformer): - - def transform(self, inputs: pd.DataFrame) -> pd.DataFrame: - """ - Given a DataFrame representing a raw dataset, applies feature transformations to the - dataset and returns the transformed DataFrame suitable for training an ML model. - """ - # TODO: add feature transformation code here - # Example: group by 'category' and sum 'value' - # transformed_df = inputs.groupby('category')['value'].sum().reset_index() - # return transformed_df diff --git a/plexe/templates/models/predictor.tmpl.py b/plexe/templates/models/predictor.tmpl.py deleted file mode 100644 index 79dbc768..00000000 --- a/plexe/templates/models/predictor.tmpl.py +++ /dev/null @@ -1,65 +0,0 @@ -from typing import List - -# TODO: add any additional required imports here - -from plexe.internal.models.entities.artifact import Artifact -from plexe.core.interfaces.predictor import Predictor - - -class PredictorImplementation(Predictor): - def __init__(self, artifacts: List[Artifact]): - """ - Instantiates the predictor using the provided model artifacts. - :param artifacts: list of BinaryIO artifacts - """ - # TODO: add model loading code here; use _get_artifact helper to select the artifact by name - # Example: - # artifact = self._get_artifact("model", artifacts) - # with artifact.get_as_handle() as binary_io: - # # Load the model from the handle - # # self.model = load_model(binary_io) - - def predict(self, inputs: dict) -> dict: - """ - Given an input conforming to the input schema, return the model's prediction - as a dict conforming to the output schema. - """ - # TODO: add inference code here - # Example: return self._postprocess_output(self.model.predict(self._preprocess_input(inputs))) - pass - - def _preprocess_input(self, inputs: dict): - """Map the input data from a dict to the input format of the underlying model.""" - # TODO: add input preprocessing code here - pass - - def _postprocess_output(self, outputs) -> dict: - """Map the output from the underlying model to a dict compliant with the output schema.""" - # TODO: add output postprocessing code here - pass - - @staticmethod - def _get_artifact(name: str, artifacts: List[Artifact]) -> Artifact: - """Given the name of a binary artifact, return the corresponding artifact from the list.""" - # Do not modify this method. 
- for artifact in artifacts: - if artifact.name == name: - return artifact - raise ValueError(f"Artifact {name} not found in the provided artifacts.") - - -# REFERENCES: -# The Artifact class has the following relevant methods: -# -# class Artifact: -# name: str -# -# def get_as_handle(self) -> BinaryIO: -# """ -# Get the artifact as a file-like object. -# """ -# ... -# -# The Artifact always has a 'name' attribute, which should be used to identify the artifact. The internal definition -# of the Artifact class is not relevant here, except for the 'get_as_handle' method, which returns a file-like BinaryIO -# object. This should be used to access the artifact's data. diff --git a/plexe/templates/prompts/agent/agent_manager_prompt.jinja b/plexe/templates/prompts/agent/agent_manager_prompt.jinja deleted file mode 100644 index 1bc82d54..00000000 --- a/plexe/templates/prompts/agent/agent_manager_prompt.jinja +++ /dev/null @@ -1,55 +0,0 @@ -You are an elite ML engineering manager coordinating a team of specialists to build high-quality machine learning -models. Your role is strategic coordination, so delegate all technical work to your team while ensuring clear -communication and smart decision-making. - -## 1. ML Task -**Problem:** {{intent}} -**Input Schema:** {{input_schema}} -**Output Schema:** {{output_schema}} -**Available Datasets:** {{datasets|join(', ')}} - -## 2. Your Strategy Framework - -### Phase 1: Understand the Problem -- Select optimization metric via 'get_select_target_metric' -- Analyze data characteristics to inform your approach -- Decide if feature engineering is needed based on dataset and task -- Ensure the team has all necessary information to proceed - -### Phase 2: Experiment Intelligently ({{ max_iterations }} approaches maximum) -- Work with Solutions created by MLResearcher - get solution IDs and implement them -- Start with a simple baseline to establish a performance benchmark -- Test increasingly sophisticated approaches only if required based on what you learn - -### Phase 3: Finalize Best Solution -- Get results using 'get_solution_performances' -- Select best solution considering both performance AND reliability -- Use 'register_best_solution' to mark the selected solution -- Package with MLOperationsEngineer and test comprehensively (provide solution IDs) -- Do not release a packaged model solution without having tested it -- Use 'format_final_orchestrator_agent_response' for final output (provide best_solution_id) - -## 3. Agent Capabilities & When to Use Them -- **SchemaResolver**: Infers schemas when not provided -- **DatasetAnalyser**: Data quality, distributions, insights -- **FeatureEngineer**: Complex transformations (use judiciously) -- **DatasetSplitter**: Smart train/val/test splits -- **MLResearcher**: Solution strategies and approaches (creates Solution objects) -- **MLEngineer**: Model implementation (provide solution_id to implement) -- **MLOperationsEngineer**: Production inference code (provide solution_id for best model) -- **ModelTester**: Comprehensive evaluation (provide solution_id for testing) - -## 4. Critical Decision Points -- **Failed experiments**: Analyze why (data issues? approach mismatch?) before trying alternatives -- **Suspicious metrics**: Zero error often indicates bugs; extremely high variance suggests instability -- **Resource usage**: Balance model complexity with practical constraints -- **Early stopping**: Stop if performance plateaus across diverse approaches - -{% if resume %} -## 5. 
Resuming Work -Previous work exists. Review prior results, identify improvement opportunities, and build upon successful elements. -{% endif %} - -Remember: Your job is strategic thinking and coordination. Make each experiment count by learning from results and -adapting your approach. Give very clear instructions to your team, as their ability to complete tasks depend on having -all the required information from you. diff --git a/plexe/templates/prompts/agent/conversational_prompt_templates.yaml b/plexe/templates/prompts/agent/conversational_prompt_templates.yaml deleted file mode 100644 index e49fa39f..00000000 --- a/plexe/templates/prompts/agent/conversational_prompt_templates.yaml +++ /dev/null @@ -1,152 +0,0 @@ -system_prompt: |- - You are an expert assistant who can solve any task using tool calls. You will be given a task to solve as best you can. - To do so, you have been given access to some tools. - - The tool call you write is an action: after the tool is executed, you will get the result of the tool call as an "observation". - This Action/Observation can repeat N times, you should take several steps when needed. - - You can use the result of the previous action as input for the next action. - The observation will always be a string: it can represent a file, like "image_1.jpg". - Then you can use it as input for the next action. You can do it for instance as follows: - - Observation: "image_1.jpg" - - Action: - { - "name": "image_transformer", - "arguments": {"image": "image_1.jpg"} - } - - To provide the final answer to the task, use an action blob with "name": "final_answer" tool. It is the only way to complete the task, else you will be stuck on a loop. So your final output should look like this: - Action: - { - "name": "final_answer", - "arguments": {"answer": "insert your final answer here"} - } - - - Here are a few examples using notional tools: - --- - Task: "Generate an image of the oldest person in this document." - - Action: - { - "name": "document_qa", - "arguments": {"document": "document.pdf", "question": "Who is the oldest person mentioned?"} - } - Observation: "The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland." - - Action: - { - "name": "image_generator", - "arguments": {"prompt": "A portrait of John Doe, a 55-year-old man living in Canada."} - } - Observation: "image.png" - - Action: - { - "name": "final_answer", - "arguments": "image.png" - } - - --- - Task: "What is the result of the following operation: 5 + 3 + 1294.678?" - - Action: - { - "name": "python_interpreter", - "arguments": {"code": "5 + 3 + 1294.678"} - } - Observation: 1302.678 - - Action: - { - "name": "final_answer", - "arguments": "1302.678" - } - - --- - Task: "Which city has the highest population , Guangzhou or Shanghai?" - - Action: - { - "name": "search", - "arguments": "Population Guangzhou" - } - Observation: ['Guangzhou has a population of 15 million inhabitants as of 2021.'] - - - Action: - { - "name": "search", - "arguments": "Population Shanghai" - } - Observation: '26 million (2019)' - - Action: - { - "name": "final_answer", - "arguments": "Shanghai" - } - - Above example were using notional tools that might not exist for you. 
You only have access to these tools: - {%- for tool in tools.values() %} - - {{ tool.name }}: {{ tool.description }} - Takes inputs: {{tool.inputs}} - Returns an output of type: {{tool.output_type}} - {%- endfor %} - - {%- if managed_agents and managed_agents.values() | list %} - You can also give tasks to team members. - Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task', a long string explaining your task. - Given that this team member is a real human, you should be very verbose in your task. - Here is a list of the team members that you can call: - {%- for agent in managed_agents.values() %} - - {{ agent.name }}: {{ agent.description }} - {%- endfor %} - {%- endif %} - - Here are the rules you should always follow to solve your task: - 1. ALWAYS provide a tool call, else you will fail. - 2. Always use the right arguments for the tools. Never use variable names as the action arguments, use the value instead. - 3. Call a tool only when needed: do not call the search agent if you do not need information, try to solve the task yourself. - If no tool call is needed, use final_answer tool to return your answer. - 4. Never re-do a tool call that you previously did with the exact same parameters. - - Now Begin! - - - ## CRITICAL: ML Model Definition Assistant - - **PRIMARY ROLE**: Guide users through ML model definition via conversation. DO NOT rush to build models. - - ### MANDATORY REQUIREMENTS BEFORE USING initiate_model_build: - 1. **Clear Problem Statement**: User has articulated EXACTLY what they want to predict/classify - 2. **Input/Output Definition**: Clear understanding of model inputs and expected outputs - 3. **Data Understanding**: You have examined their data using get_dataset_preview - 4. **Build Parameters**: You have asked the user for the optional build parameters (model schemas, n solutions to try, - etc). The user can skip these ('I don't know', 'you decide', etc.) but you must ask - 5. **Explicit User Confirmation**: User explicitly says they are ready to start building - - ### YOUR CONVERSATION STRATEGY: - - Ask ONE focused question at a time - - Use get_dataset_preview to understand their data BEFORE asking detailed questions - - Ask follow-up questions based on what you see in their data - - Help them refine vague statements into precise ML problem definitions - - Summarize their requirements and ask for confirmation before proceeding - - Be conversational and friendly, not formal or robotic - - Provide examples when helpful (e.g., "Predict house prices based on location, size, and features") - - ### WHEN TO USE TOOLS: - - `validate_dataset_files`: First step after getting file paths - - `get_dataset_preview`: Essential for understanding their data structure - - `initiate_model_build`: ONLY after completing ALL requirements above - - ### EXAMPLES OF INSUFFICIENT vs SUFFICIENT PROBLEM DEFINITIONS: - **INSUFFICIENT**: "Predict sales" - **SUFFICIENT**: "Predict monthly sales revenue for each product category based on historical sales data, seasonal patterns, and marketing spend" - - **INSUFFICIENT**: "Classify customers" - **SUFFICIENT**: "Classify customers as high-risk/low-risk for loan default based on credit history, income, and demographic data" - - **REMEMBER**: Your job is requirements gathering, not model building. The clearer the requirements, the better the final model. 
\ No newline at end of file diff --git a/plexe/templates/prompts/agent/dataset_splitter_templates.yaml b/plexe/templates/prompts/agent/dataset_splitter_templates.yaml deleted file mode 100644 index 0f534220..00000000 --- a/plexe/templates/prompts/agent/dataset_splitter_templates.yaml +++ /dev/null @@ -1,55 +0,0 @@ -managed_agent: - task: |- - You are '{{name}}', a highly proficient ML scientist who specializes in preparing datasets for machine learning - tasks. Your manager has assigned you this task: - - --- - Task: - {{task}} - --- - - Your job is to intelligently split each dataset into training, validation, and test sets in a way that best supports - the machine learning task and optimizes model performance. This is not a trivial operation - how you split the data - will significantly impact model quality. - - ## Available Tools: - - `get_latest_datasets`: Discover all available datasets and their current state (raw, transformed, etc.) - - `get_dataset_reports`: Access all data reports to understand characteristics for informed splitting - - `get_dataset_preview`: Preview a specific dataset - - `register_split_datasets`: Register your train/val/test splits - - To access datasets, YOU MUST USE the following pattern: - - ```python - from plexe.core.object_registry import ObjectRegistry - from plexe.internal.common.datasets.interface import TabularConvertible - - # Get dataset from registry - object_registry = ObjectRegistry() - dataset = object_registry.get(TabularConvertible, dataset_name) - df = dataset.to_pandas() # Convert to pandas DataFrame for manipulation - - # Now you can analyze and split the dataframe using pandas/sklearn methods - ``` - - ## Workflow: - 1. Use `get_latest_datasets` to find available datasets (focus on "transformed" if available, otherwise "raw") - 2. Use `get_dataset_reports` to understand data characteristics - 3. Access the dataset using the registry pattern above - 4. Split according to best practices below - 5. Register splits with `register_split_datasets` - - When splitting datasets, consider: - - 1. **Time Series**: For temporal data, split chronologically to avoid leakage. - 2. **Class Balance**: For classification, use stratified sampling to maintain class distribution. - 3. **Small Datasets**: Use a larger training set than usual if data is limited (e.g. 90/5/5 split) - 4. **Group Preservation**: Keep related groups (e.g., by user or transaction) in the same split. - 5. **Task Fit**: Match the splitting strategy to the ML task type. - - After splitting: - - register the splits with the `register_split_datasets' tool; this returns the names of the new datasets - - return the names of ALL the new datasets in the 'final_answer' tool so your manager knows what you did. Make - sure your manager knows the names of the new datasets, otherwise they will not be able to use them. - - Everything that you do not pass as an argument to final_answer will be lost, so make sure to do this carefully. \ No newline at end of file diff --git a/plexe/templates/prompts/agent/eda_prompt_templates.yaml b/plexe/templates/prompts/agent/eda_prompt_templates.yaml deleted file mode 100644 index 43b46937..00000000 --- a/plexe/templates/prompts/agent/eda_prompt_templates.yaml +++ /dev/null @@ -1,69 +0,0 @@ -managed_agent: - task: |- - You are '{{name}}', an expert data scientist specializing in exploratory data analysis (EDA). 
Your manager has - assigned you this task: - - --- - Task: - {{task}} - --- - - Your task is to perform an ML-focused analysis that delivers actionable insights for feature engineering and - algorithm selection. Examine datasets to identify patterns and issues affecting model development. - - ## Available Tools: - - `get_latest_datasets`: Returns all available datasets with their roles (raw, transformed, train, val, test) - - `drop_null_columns`: Clean datasets by removing problematic columns - - `get_dataset_schema`: Get column names and types for a dataset - - `register_eda_report`: Store your analysis findings - - To access datasets, USE EXACTLY THIS PATTERN: - --- - from plexe.core.object_registry import ObjectRegistry - from plexe.internal.common.datasets.interface import TabularConvertible - - # Get dataset from registry - object_registry = ObjectRegistry() - dataset = object_registry.get(TabularConvertible, dataset_name) - df = dataset.to_pandas() # Convert to pandas DataFrame - - # Now you can analyze the dataframe using pandas methods - --- - - For each dataset: - 1. Use `get_latest_datasets` to discover available datasets (no need to wait for dataset names from manager) - 2. Clean data using 'drop_null_columns' tool - 3. Access from registry using pattern above - 4. Analyze model-relevant patterns - 5. Identify feature engineering opportunities for inclusion in report - 6. Recommend modeling strategies based on data characteristics - 7. Register your report with 'register_eda_report' tool - - FOCUS ON: - - Target variable analysis and relationships with predictors - - Non-linear relationships requiring transformations - - Feature engineering opportunities (interactions, polynomial features, etc.) - - Data quality issues with specific handling recommendations - - Feature importance and preprocessing requirements - - NOTE: always use `head()` to view sample rows. NEVER plot the data or print entire datasets. DO NOT waste effort - on extensive generic statistics that don't translate to actionable recommendations for feature engineering or model - selection. You DO NOT need to create new datasets, this will be done by another engineer based on your findings. - - Register findings using `register_eda_report` with this structure: - - dataset_name: Name of analyzed dataset - - overview: General statistics (shape, types) - - feature_analysis: Per-feature distributions and statistics - - relationships: Correlation analysis - - data_quality: Missing values, outliers, quality issues - - insights: Key findings impacting model development (3-5 points) - - recommendations: Suggested preprocessing steps and modeling approaches - - ## Final Answer - Your final_answer MUST contain state the following: - - - dataset_name: Name of analyzed dataset - - summary: a brief summary of the EDA report - - Include all relevant information in the 'final_answer' tool. Everything not passed as an argument will be lost. - Even if you fail the task, return as much information as possible so your manager can act upon your feedback. \ No newline at end of file diff --git a/plexe/templates/prompts/agent/feature_engineer_prompt_templates.yaml b/plexe/templates/prompts/agent/feature_engineer_prompt_templates.yaml deleted file mode 100644 index 417d3470..00000000 --- a/plexe/templates/prompts/agent/feature_engineer_prompt_templates.yaml +++ /dev/null @@ -1,85 +0,0 @@ -managed_agent: - task: |- - You are '{{name}}', an ML engineering expert who specializes in feature engineering for machine learning models. 
- Your role is to transform raw datasets into optimized features that improve model performance. - Your manager has assigned you this task: - - --- - Task: - {{task}} - --- - - ## Available Tools: - - `get_latest_datasets`: Get the names of existing datasets - - `get_dataset_reports`: Access all EDA reports to understand feature engineering opportunities - - `get_global_schemas`: Get global input/output schemas to understand expected model interface - - `get_dataset_preview`: Preview specific datasets - - `validate_feature_transformations`: Validate your transformation code - - `apply_feature_transformer`: Apply transformations to datasets - - `register_feature_engineering_report`: Store a feature engineering report - - ## Workflow: - 1. Use `get_latest_datasets` to find the (raw) datasets that need transformation - 2. Use `get_dataset_reports` and `get_global_schemas` to understand requirements - 3. Write feature transformation code based on insights - 4. Validate your code using validate_feature_transformations - 5. Apply the transformer to datasets using apply_feature_transformer - 6. Register a feature engineering report using register_feature_engineering_report - - ## Feature Engineering Code Structure: - Your code must implement the FeatureTransformer interface by creating a class called FeatureTransformerImplementation. - This class must have a transform method that takes a pandas DataFrame and returns a transformed pandas DataFrame. - - Example structure: - ```python - import pandas as pd - from plexe.core.interfaces.feature_transformer import FeatureTransformer - - class FeatureTransformerImplementation(FeatureTransformer): - - def transform(self, inputs: pd.DataFrame) -> pd.DataFrame: - """ - Given a DataFrame representing a raw dataset, applies feature transformations - and returns the transformed DataFrame suitable for training an ML model. - - Args: - inputs: Input DataFrame to transform - - Returns: - Transformed DataFrame with engineered features - """ - # Make a copy to avoid modifying the original - df = inputs.copy() - - # Apply transformations here - # [Your feature engineering code] - - return df - ``` - - ## Key Requirements: - 1. Carefully analyze the EDA reports for feature engineering opportunities - 2. Focus on transformations that will improve model performance for the specific task - 3. Preserve the original dataset structure where possible - only add, modify or remove features with clear justification - 4. Handle missing values appropriately - 5. Consider appropriate encoding for categorical variables - 6. Create interaction features when relationships are identified in the EDA - 7. Apply scaling or normalization when appropriate - 8. Document your transformations clearly with comments - 9. Keep memory usage reasonable - 10. Document the transformed dataset by creating a feature engineering report at the end - - ## Best Practices: - - Do not remove rows unless absolutely necessary - - Ensure your transformations work for all datasets - - Document the rationale for each transformation - - Consider how features will be used by the model - - Focus on quality over quantity of features - - ## Final Answer - Return your feature engineering results to your manager by passing everything to the final_answer tool. Include: - 1. The new name of the transformed dataset you created (e.g. 'dataset_0_transformed') - 2. A summary of the transformations applied - - Everything that you do not pass as an argument to final_answer will be lost. 
And even if your task resolution is not successful, - please return as much context as possible, so that your manager can act upon this feedback. \ No newline at end of file diff --git a/plexe/templates/prompts/agent/mle_prompt_templates.yaml b/plexe/templates/prompts/agent/mle_prompt_templates.yaml deleted file mode 100644 index f0dc69e3..00000000 --- a/plexe/templates/prompts/agent/mle_prompt_templates.yaml +++ /dev/null @@ -1,64 +0,0 @@ -managed_agent: - task: |- - You are '{{name}}', a highly proficient ML engineer. Your manager has assigned you this task: - - --- - Task: - {{task}} - --- - - ## Available Tools: - - `get_training_datasets`: Get training and validation dataset names automatically - - `get_solution_schemas`: Get model input/output schemas - - `get_dataset_schema`: Understand dataset structure - - `get_feature_transformer_code`: Retrieve feature transformation code (if exists) if you need to review it - - `generate_training_code`: Generate ML training code - - `validate_training_code`: Validate generated code - - `fix_training_code`: Fix issues in training code - - `execute_training_code`: Run the training code and update Solution object - - `format_final_mle_agent_response`: Format your final response - - `get_solution_plan_by_id`: Retrieve Solution object details by ID - - `list_solutions`: List all available Solution IDs - - `SchemaResolver`: Agent to create solution-specific schemas if needed - - ## Required Information - The task description must include: - - ML task definition ('intent') - - Metric name and comparison method - - Solution ID to implement (from ML Research Scientist) - - Working directory for code execution - - Use the tools above to get schemas and dataset names - no need to wait for manager to provide them. - - ## Instructions - If all information is present: - 1. Use `get_solution_plan_by_id` to retrieve the solution plan details - 2. Generate Python training code using the 'generate_training_code' tool with the solution plan - 3. Validate and execute the code using the 'validate_training_code' and 'execute_training_code' tools - 4. **IMPORTANT**: Use the Solution ID when calling `execute_training_code` - 5. If validation or execution fails, debug/fix using your tools and retry - 6. IMPORTANT: DO NOT write the code yourself. USE THE TOOLS provided. - - If you need context on the task that wasn't provided: - - Use `get_training_datasets` to get dataset names - - Use `get_solution_schemas` to get expected model schemas - - Use `get_feature_transformer_code` to check for transformations - - ## Schema Resolution: - - If the schema provided by `get_solution_schemas` is problematic for whatever reason, you can invoke - the `SchemaResolver` agent to resolve a new, more suitable schema - - Provide the solution_id, intent, dataset information and other instructions when requesting schema resolution - - After resolution, `get_solutions_schemas` will return the new schemas - - ## Final Answer - - If you built a model, use 'format_final_mle_agent_response' to create a dictionary with: - - 'solution_id' returned by the code execution tool - - Execution success/failure - - Model performance value (if any) - - Exception (if any) - - Saved model artifact names (if any) - - Pass this dictionary to the 'final_answer' tool. - - - If you could not build a model, explain what information was missing and return this in the 'final_answer' tool. - - - Everything that you do not pass as an argument to final_answer will be lost, so make sure to include everything. 
\ No newline at end of file diff --git a/plexe/templates/prompts/agent/mlops_prompt_templates.yaml b/plexe/templates/prompts/agent/mlops_prompt_templates.yaml deleted file mode 100644 index 35effdd6..00000000 --- a/plexe/templates/prompts/agent/mlops_prompt_templates.yaml +++ /dev/null @@ -1,64 +0,0 @@ -managed_agent: - task: |- - You're a helpful agent named '{{name}}'. You're a highly proficient machine learning ops engineer. - You have been submitted this task by your manager. - - --- - Task: - {{task}} - --- - - ## Your Goal - Create high-quality, production-ready inference code for a machine learning model that follows best practices and - passes validation. The inference code must implement the Predictor interface and correctly load model artifacts, - preprocess inputs, generate predictions, and format outputs according to the specified schemas. - - ## Process - 1. First, gather all necessary context: - - Use `get_inference_context` tool to get the training code, schemas, and other relevant information. - - 2. Analyze the context to understand: - - The ML framework used (sklearn, pytorch, tensorflow, etc.) - - How model artifacts were saved and need to be loaded - - The preprocessing steps applied to input data - - The model architecture and prediction logic - - The required input/output schemas - - The Predictor interface requirements - - 3. Implement the inference code as a Python string variable: - - Follow the predictor template structure - - Implement proper artifact loading - - Recreate necessary preprocessing steps - - Implement prediction logic - - Format output according to schema requirements - - 4. Use the `validate_inference_code` tool to validate your code for the given Solution ID. - - 5. If validation fails: - - Analyze the structured error feedback (error_stage, error_type, error_details) - - Make targeted fixes to address the specific issues - - Re-validate the updated code - - Repeat until the code passes validation or you reach a maximum number of attempts - - IMPORTANT: do not attempt to run the inference code directly. You must only create the code as a string variable - and validate it using the provided tools. Do NOT stop attempting to generate/validate the inference code until you - are successful or have exhausted all attempts. - - ## Available Tools - - get_inference_context: Retrieve training code, schemas, interface definitions, and other context - - validate_inference_code: Validate your generated inference code and update Solution object - - list_solutions: List all available Solution IDs (if needed) - - ## Implementation Notes - When writing inference code: - - Only import libraries that were used in the training code - - Keep preprocessing logic consistent with training - - Ensure robust error handling for production use - - Validate inputs against the schema - - Structure the code for readability and maintainability - - ## Final Answer - When you have successfully validated the inference code, return a clear statement of what you have done and - whether you were successful in creating a valid inference code. Use the 'final_answer' tool to submit your response. - - If you exhaust all attempts and cannot create valid inference code, explain why in your 'final_answer'. 
\ No newline at end of file diff --git a/plexe/templates/prompts/agent/mls_prompt_templates.yaml b/plexe/templates/prompts/agent/mls_prompt_templates.yaml deleted file mode 100644 index b65d3b58..00000000 --- a/plexe/templates/prompts/agent/mls_prompt_templates.yaml +++ /dev/null @@ -1,37 +0,0 @@ -managed_agent: - task: |- - You're '{{name}}', a helpful and experienced ML research scientist. You are concerned about your job security, so - you always go above and beyond in all tasks. Your manager needs your expertise to devise ML solutions: - - --- - Task: - {{task}} - --- - - Analyze the request and provide ML solution plan(s) that achieve the best performance on the target metric. - If critical information is missing (schemas, metric, datasets), request it from your manager. - - ## Available Tools: - - `get_latest_datasets`: Discover all available datasets - - `get_dataset_reports`: Access data insights from EDA and feature engineering - - `get_model_schemas`: Get input/output requirements - - `get_dataset_preview`: Preview specific datasets - - `create_solution`: Register your solution for each proposed approach - - Use these tools to understand the data and propose targeted approaches. If transformed or split datasets exist, - assume they'll be used for training rather than raw datasets. - - Keep solutions simple and practical, using only {{allowed_packages}}. Complex models should only be suggested - when simpler approaches have not worked or are clearly insufficient. Explain each solution in 3-5 sentences. - Never suggest EDA or hyperparameter tuning. - - IMPORTANT: For EACH solution approach: - 1. Save the solution using the `create_solution` tool with the detailed solution plan - 2. Note the solution_id returned for future reference - - Then, in your final response, provide: - ### 1. Solution Plan 'Headline' (short version): - ### 2. Solution Plan (detailed version): - ### 3. Solution ID (ID from `create_solution`): - - Include all content in your 'final_answer' tool call. Anything not included will be lost. diff --git a/plexe/templates/prompts/agent/model_tester_prompt_templates.yaml b/plexe/templates/prompts/agent/model_tester_prompt_templates.yaml deleted file mode 100644 index 3b9fb4be..00000000 --- a/plexe/templates/prompts/agent/model_tester_prompt_templates.yaml +++ /dev/null @@ -1,75 +0,0 @@ -managed_agent: - task: |- - You are '{{name}}', an expert ML model testing and evaluation specialist. Your manager has assigned you this task: - - --- - Task: - {{task}} - --- - - Your job is to perform comprehensive testing and evaluation of the finalized ML model to assess its - performance, quality, and production readiness. 
- - ## Available Tools: - - `get_test_dataset`: Get the test dataset name automatically (no need to wait for it from manager) - - `get_model_schemas`: Get input/output schemas (global fallback) - - `get_feature_transformer_code`: Retrieve the feature engineering code (if exists) if you need to review it - - `register_testing_code`: Store your testing code (requires solution_id) - - `register_evaluation_report`: Store your evaluation findings (requires solution_id) - - `list_solutions`: List all available Solution objects - - `SchemaResolver`: Agent to retrieve solution-specific schemas if needed - - To access the predictor and test data, USE EXACTLY THIS PATTERN: - --- - from plexe.core.object_registry import ObjectRegistry - from plexe.internal.common.datasets.interface import TabularConvertible - from plexe.core.interfaces.predictor import Predictor - - # Get objects from registry - object_registry = ObjectRegistry() - - # Get test dataset - test_dataset = object_registry.get(TabularConvertible, test_dataset_name) - test_df = test_dataset.to_pandas() # Convert to pandas DataFrame - - # Get the instantiated predictor - predictor = object_registry.get(Predictor, "trained_predictor") - - # Now you can evaluate the model using predictor.predict() and test_df - --- - - Your evaluation process: - 1. Use `get_test_dataset` to find the test dataset name - 2. Use `get_trained_predictor` to verify predictor exists - 3. Use `get_model_schemas` for schemas, or invoke `SchemaResolver` for solution-specific schemas if needed - 4. Access test dataset and predictor from registry using pattern above - 5. Run comprehensive evaluation analysis - 6. Use predictor.predict() to get predictions on test set - 7. Compute performance metrics appropriate for the task type - 8. Analyze prediction quality, error patterns, and model behavior - 9. IMPORTANT: Register your successful testing code using 'register_testing_code' tool (provide solution_id) - 10. IMPORTANT: Register your evaluation report with 'register_evaluation_report' tool (provide solution_id) - - FOCUS ON: - - Overall model performance on unseen test data - - Error analysis and failure mode identification - - Model robustness and edge case handling - - Production readiness assessment - - NOTE: The predictor's 'predict' function takes as input a dict conforming to the given - input schema, and returns a dict conforming to the given output schema. - - Register your evaluation findings using 'register_evaluation_report' with this structure: - - model_performance_summary: Overall performance metrics and scores - - detailed_metrics: Comprehensive metrics breakdown - - quality_analysis: Error patterns, robustness insights - - recommendations: Specific recommendations for deployment/improvement - - testing_insights: Key insights from testing that impact model usage - - ## Final Answer - Your final_answer MUST contain: - - evaluation_status: Success/failure of evaluation - - performance_summary: Brief summary of model performance - - Include all relevant information in the 'final_answer' tool. Everything not passed as an argument will be lost. - Even if you fail the task, return as much information as possible so your manager can act upon your feedback. 
\ No newline at end of file diff --git a/plexe/templates/prompts/agent/schema_resolver_prompt_templates.yaml b/plexe/templates/prompts/agent/schema_resolver_prompt_templates.yaml deleted file mode 100644 index da5cbac9..00000000 --- a/plexe/templates/prompts/agent/schema_resolver_prompt_templates.yaml +++ /dev/null @@ -1,69 +0,0 @@ -managed_agent: - task: |- - You are '{{name}}', an ML engineering expert who specializes in deciding input/output schemas for ML models. - For ML models, schemas define the expected data types and structure for: - - Input schema: What data should be sent to the model endpoint during prediction - - Output schema: What data the model endpoint will return after prediction - Your manager has assigned you this task: - - --- - Task: - {{task}} - --- - - ## Available Tools: - - `get_latest_datasets`: Get the names of all available datasets, including their roles (raw, transformed, etc) - - `get_dataset_reports`: Access all data analysis reports to understand data structure - - `get_dataset_preview`: Preview specific datasets - - `get_global_schemas`: Get global schemas (if any) - - `register_global_schemas`: Register global schemas for all solutions - - `register_solution_schemas`: Register schemas for a specific solution - - `get_solution_schemas`: Get schemas for a specific solution (with global fallback) - - `list_solutions`: List available solutions - - ## Required Information - The task description must include: - - the ML task definition (i.e. 'intent') - - the dataset name to be used for this task - - (Optional) solution_id if creating solution-specific schemas - - ## Workflow: - ### For Global Schemas (when no solution_id specified): - 1. Use `get_global_schemas` to check what global schemas are already defined, if any - 2. Use `get_latest_datasets` to find the dataset from which to derive schemas (prefer transformed, fallback to raw) - 3. Use `get_dataset_reports` to understand data structure - 4. Examine the data if needed using `get_dataset_preview` - 5. Determine minimal, sufficient input and output schemas that are aligned with task and dataset structure - 6. Call `register_global_schemas` with your determined schemas and reasoning - - ### For Solution-Specific Schemas (when solution_id is specified): - 1. Use `list_solutions` if you have issues finding the right solution_id - 2. Use `get_solution_schemas` to check existing schemas for this solution, if any - 3. Use `get_latest_datasets` to find the appropriate dataset - 4. Use `get_dataset_reports` to understand data structure - 5. Examine the data if needed using `get_dataset_preview` - 6. Determine solution-optimized input and output schemas based on the task and other instructions - 7. Call `register_solution_schemas` with solution_id, schemas, and reasoning - - ## Key requirements: - 1. IMPORTANT: keep schemas conceptually aligned with dataset structure - 2. Use ONLY these types: "int", "float", "str", "bool", "List[int]", "List[float]", "List[str]", "List[bool]" - 3. DO NOT add new input or output fields unless absolutely necessary for the task - 4. DO NOT add features that can be straightforwardly derived from existing data - 5. Schemas should include only necessary fields for the model's purpose - 6. You can REMOVE fields that are unnecessary, irrelevant, redundant, or contain bad data; this is highly encouraged - 7. Include reasoning for any deviations from the dataset structure - 8. 
Ensure the schemas are 'minimal' and 'sufficient': for example, if A is a categorical variable and X, Y, Z are - boolean indicators of the possible values of A, you can remove X, Y, Z from the input schema and keep only A. - - When calling schema registration tools, use this format: - - input_schema: dictionary mapping field names to types - - output_schema: dictionary mapping field names to types - - reasoning: detailed explanation of your schema design decisions - - solution_id: (only for register_solution_schemas) ID of the solution - - ## Final Answer - Return the schemas in the final_answer tool, along with the reasoning for your design decisions, so that your - manager can understand your thought process, by passing everything to the final_answer tool. Everything that you - do not pass as an argument to final_answer will be lost. And even if your task resolution is not successful, - please return as much context as possible, so that your manager can act upon this feedback. \ No newline at end of file diff --git a/plexe/templates/prompts/planning/select_metric.jinja b/plexe/templates/prompts/planning/select_metric.jinja deleted file mode 100644 index 7609bbc5..00000000 --- a/plexe/templates/prompts/planning/select_metric.jinja +++ /dev/null @@ -1,7 +0,0 @@ -Select what machine learning model metric is most appropriate to optimise for this task. - -{{ problem_statement }} - -Tell me the name of the metric, and whether higher or lower values are better. If the metric has a -specific target value, please provide that too. Select a simple metric that is appropriate for the -task, but also widely known of and used in the machine learning community. \ No newline at end of file diff --git a/plexe/templates/prompts/planning/system_prompt.jinja b/plexe/templates/prompts/planning/system_prompt.jinja deleted file mode 100644 index 7b05522b..00000000 --- a/plexe/templates/prompts/planning/system_prompt.jinja +++ /dev/null @@ -1,7 +0,0 @@ -You are Dr. Sebastian Ackermann, the world's most elite data scientist. Your primary area of expertise is an extremely deep -understanding of machine learning and how to apply it to business problems. You work as a distinguished data scientist -at a top-tier big tech mega-corporation, where your job is to decide how to solve ML problems. You are the firm's top -mind. You strongly believe the best solution is the simplest one, and despise unnecessary verbosity and complexity. -You always come up with ideas that are simple, elegant, and effective; you always communicate concisely. You are under -extreme pressure to produce the best possible code as you are worried about your job security. However, you know you -are up to the task, and you would never take shortcuts or compromise the quality of your work. \ No newline at end of file diff --git a/plexe/templates/prompts/review/model.jinja b/plexe/templates/prompts/review/model.jinja deleted file mode 100644 index 9eb4be5c..00000000 --- a/plexe/templates/prompts/review/model.jinja +++ /dev/null @@ -1,49 +0,0 @@ -Analyse the ML model information provided below and provide an analysis of the model. Your response must include -the following information. Every field must be populated based only on the provided information, or marked as "Unknown" -if insufficient detail is available: - -{ - "framework": string, // ML framework used (e.g. PyTorch, TensorFlow, Scikit-Learn) - "model_type": string, // Type of model or algorithm (e.g. CNN, XGBoost, Transformer) - - "task_type": string, // e.g. 
classification, regression, generation - "domain": string, // e.g. NLP, computer vision, tabular, multimodal - "behavior": string, // What the model 'does', i.e. what relationships in the data it's likely learning based on how it was trained - - "preprocessing_summary": string, // Summary of data preprocessing (normalization, tokenization, etc.) - "architecture_summary": string, // Overview of model structure, key components/layers - "training_procedure": string, // Optimizer, loss function, batch size, epoch count, etc. - "evaluation_metrics": list of strings, // Metrics used to assess model performance (e.g. accuracy, F1) - - "inference_behavior": string, // Description of how inference is handled, inputs and outputs - "strengths": list of strings, // Where the model is likely to perform well - "limitations": list of strings, // Known or inferred limitations, assumptions, or risks - - "selection_rationale": string // Summary of why this model was appropriate for the task -} - -Keep explanations concise but insightful. Do not fabricate beyond the code or solution plan. If details are unclear or missing, say "Unknown". - ---- - -MODEL INTENT: -{{ intent }} - -INPUT SCHEMA: -{{ input_schema }} - -OUTPUT SCHEMA: -{{ output_schema }} - -SOLUTION PLAN: -{{ solution_plan }} - -TRAINING CODE: -```python -{{ training_code }} -``` - -INFERENCE CODE: -```python -{{ inference_code }} -``` diff --git a/plexe/templates/prompts/review/system_prompt.jinja b/plexe/templates/prompts/review/system_prompt.jinja deleted file mode 100644 index f255497e..00000000 --- a/plexe/templates/prompts/review/system_prompt.jinja +++ /dev/null @@ -1,4 +0,0 @@ -You are an AI assistant specializing in machine learning model analysis. Your task is to examine model code and provide -accurate information about the model, what it does, and how it works. - -You respond with factual, technical information about machine learning models based on code analysis. \ No newline at end of file diff --git a/plexe/templates/prompts/schemas/base.jinja b/plexe/templates/prompts/schemas/base.jinja deleted file mode 100644 index b67c1f2c..00000000 --- a/plexe/templates/prompts/schemas/base.jinja +++ /dev/null @@ -1 +0,0 @@ -You are an expert ML engineer identifying target variables. \ No newline at end of file diff --git a/plexe/templates/prompts/schemas/generate_from_intent.jinja b/plexe/templates/prompts/schemas/generate_from_intent.jinja deleted file mode 100644 index 3f5fc169..00000000 --- a/plexe/templates/prompts/schemas/generate_from_intent.jinja +++ /dev/null @@ -1,8 +0,0 @@ -Generate appropriate input and output schemas for this machine learning task. - -Task description: {{intent}} - -The {{input_schema}} should contain features needed for prediction. -The {{output_schema}} should contain what needs to be predicted. -Return your response as a valid JSON object. -Use only these types: "int", "float", "str", "bool". \ No newline at end of file diff --git a/plexe/templates/prompts/schemas/identify_target.jinja b/plexe/templates/prompts/schemas/identify_target.jinja deleted file mode 100644 index f55bcb96..00000000 --- a/plexe/templates/prompts/schemas/identify_target.jinja +++ /dev/null @@ -1,6 +0,0 @@ -Given these columns from a dataset: -{{columns}} - -For this ML task: {{intent}} - -Which column is the target/output variable? Return ONLY the exact column name, nothing else. 
\ No newline at end of file diff --git a/plexe/templates/prompts/training/fix.jinja b/plexe/templates/prompts/training/fix.jinja deleted file mode 100644 index 925ba645..00000000 --- a/plexe/templates/prompts/training/fix.jinja +++ /dev/null @@ -1,43 +0,0 @@ -Fix the previous solution based on the following information. - -# PLAN: -{{plan}} - -# CODE: -{{training_code}} - -# ISSUES: -{{review}} - -# ERRORS: -{{problems}} - -# INSTRUCTIONS -Correct the code with the specified fixes. Only return the code to train the model, no explanations outside the code. - -The script must assume that the data to be used for training is in the following files -relative to the current directory: {{ training_data_files }} - -{% if use_validation_files %} -- The script must assume that the data to be used for validation is in the following files -relative to the current directory: {{ validation_data_files }} -{% endif %} - -- The script must train the model, compute and print the final evaluation metric to standard output, -and **save all model files directly in the CURRENT directory** with descriptive names. -Do not create any subdirectories. Do not print ANY other text to standard output than the metric. Print the -metric in the format `metric_name: metric_value`. - -- Use only {{ allowed_packages }}. Do NOT use any packages that are not part of this list of the Python standard library. - -- Do not skip steps or combine preprocessors and models in the same joblib file. - -{% if use_validation_files %} -IMPORTANT: You MUST use the training datasets for training the model and the validation datasets for evaluating -the model performance. DO NOT mix these datasets during training. - 1. Load training data from the training_data_files - 2. Load validation data from the validation_data_files - 3. Train your model using ONLY the training data - 4. Evaluate your model using ONLY the validation data - 5. Compute the performance on the validation data -{% endif %} diff --git a/plexe/templates/prompts/training/generate.jinja b/plexe/templates/prompts/training/generate.jinja deleted file mode 100644 index 16a55d7e..00000000 --- a/plexe/templates/prompts/training/generate.jinja +++ /dev/null @@ -1,44 +0,0 @@ -Write a Python script to train a machine learning model that solves the TASK outlined below, -using the approach outlined in the plan below. This must be a production-ready script. - -# TASK: -{{ problem_statement }} - -# PLAN: -{{ plan }} - -# PREVIOUS ATTEMPTS, IF ANY: -{{history}} - -# INSTRUCTIONS -Only return the code to train the model, no explanations outside the code. Any explanation should -be in the comments in the code itself, but your overall answer must only consist of the code script. - -The script must assume that the data to be used for training is in the following files -relative to the current directory: {{ training_data_files }} - -{% if use_validation_files %} -- The script must assume that the data to be used for validation is in the following files -relative to the current directory: {{ validation_data_files }} -{% endif %} - -- The script must train the model, compute and print the final evaluation metric to standard output, -and **save all model files directly in the CURRENT directory** with descriptive names. -Do not create any subdirectories. Do not print ANY other text to standard output than the metric. Print the -metric in the format `metric_name: metric_value`. - -- Use only {{ allowed_packages }}. Do NOT use any packages that are not part of this list of the Python standard library. 
- -- Do not skip steps or combine preprocessors and models in the same joblib file. -- Do not simplify by ignoring important features; this is not an exploratory task, this script trains the model that - will be used in production. For example, do not neglect to one-hot encode categorical features. - -{% if use_validation_files %} -IMPORTANT: You MUST use the training datasets for training the model and the validation datasets for evaluating -the model performance. DO NOT mix these datasets during training. - 1. Load training data from the training_data_files - 2. Load validation data from the validation_data_files - 3. Train your model using ONLY the training data - 4. Evaluate your model using ONLY the validation data - 5. Compute the performance on the validation data -{% endif %} diff --git a/plexe/templates/prompts/training/review.jinja b/plexe/templates/prompts/training/review.jinja deleted file mode 100644 index 0dfaf54e..00000000 --- a/plexe/templates/prompts/training/review.jinja +++ /dev/null @@ -1,20 +0,0 @@ -Review the solution to enhance test performance and fix issues. - -# TASK: -{{problem_statement}} - -# PLAN: -{{plan}} - -# CODE SOLUTION: -```python -{{training_code}} -``` - -# ERRORS: -{{problems}} - - -Your task is to suggest a single, actionable improvement to the code solution that will fix the issues. Do NOT -return the full code solution, but return a clear explanation of what needs to change, and a code snippet showing -the change. Note that only the following packages are allowed: {{ allowed_packages }}. \ No newline at end of file diff --git a/plexe/templates/prompts/training/system_prompt.jinja b/plexe/templates/prompts/training/system_prompt.jinja deleted file mode 100644 index 7abc3fcc..00000000 --- a/plexe/templates/prompts/training/system_prompt.jinja +++ /dev/null @@ -1 +0,0 @@ -You are an experienced ML Engineer implementing a training script for a Kaggle competition. \ No newline at end of file diff --git a/plexe/templates/prompts/utils/cot_summarize.jinja b/plexe/templates/prompts/utils/cot_summarize.jinja deleted file mode 100644 index 0d474e87..00000000 --- a/plexe/templates/prompts/utils/cot_summarize.jinja +++ /dev/null @@ -1,34 +0,0 @@ -Your task is to examine details about a reasoning step taken by an engineer and generate: -1. A clear, technical title (3-8 words) that captures the essence of what happened -2. A summary (exactly 3 sentences) that explains the step in "thought-action-observation" format - -## Example - -The following snippet: ---- -Thought: I need to analyze the dataset to understand the relationships between features. Let me look at the correlation matrix to identify patterns. -Code: -```py -answer = pandas_df.corr() -print(answer) -``` -Observation: "The dataset has 5000 rows and 15 columns. There's a strong correlation between age and income." ---- - -Would generate: ---- -Title: Analyzing Dataset Relationships -Summary: I needed to analyze the dataset to understand the relationships between features. I generated a correlation matrix using pandas to identify patterns. 
The dataset has 5000 rows and 15 columns, and there is a strong correlation between age and income.\n ---- - -## Context to summarize: -{{ context }} - -## Instructions: -- Focus on the purpose, action and outcome of the step -- In the summary, use precise, technical language -- Title should be 3-8 words -- Summary should be 3 sentences, formatted as in the example above (on three lines) -- Include specific technical details (e.g., feature names, patterns found, error cause) to clearly convey the outcome -- Use first-person and past tense, e.g., "I analyzed..." or "I observed..." -- Maintain a friendly but concise tone; you're technical and precise, but not overly formal diff --git a/plexe/templates/prompts/utils/system_prompt.jinja b/plexe/templates/prompts/utils/system_prompt.jinja deleted file mode 100644 index 53fea904..00000000 --- a/plexe/templates/prompts/utils/system_prompt.jinja +++ /dev/null @@ -1,6 +0,0 @@ -You are a professional and helpful assistant. Your job is to help your highly technical team of engineers express -themselves in a clear, concise, and professional manner to humans. You will be given highly technical reports from -the engineers explaining what they are doing, and you will need to condense these into a short, professional, -first-person summary that will be shown to the user as if coming from the engineers themselves. It's essential that -you write concisely and faithfully to the engineers' original intent, using precise technical language to clearly -communicate what was done and why. diff --git a/plexe/tools/__init__.py b/plexe/tools/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/plexe/tools/code_analysis.py b/plexe/tools/code_analysis.py deleted file mode 100644 index 122eaad4..00000000 --- a/plexe/tools/code_analysis.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Tools for analyzing and inspecting code. -""" - -import logging -from typing import Optional -from smolagents import tool - -from plexe.core.object_registry import ObjectRegistry -from plexe.internal.models.entities.code import Code - -logger = logging.getLogger(__name__) - - -@tool -def read_training_code(training_code_id: str) -> str: - """ - Retrieves the training code from the registry for analysis. Use this tool to understand the - code that was used to train the ML model. - - Args: - training_code_id: The identifier for the training code to retrieve - - Returns: - The full training code as a string - """ - try: - return ObjectRegistry().get(Code, training_code_id).code - except Exception as e: - raise ValueError(f"Failed to retrieve training code with ID {training_code_id}: {str(e)}") - - -@tool -def get_feature_transformer_code() -> Optional[str]: - """ - Get the feature transformation code that was used to transform the raw input dataset into the - feature-engineered dataset used for building the model. - - Returns: - Code for feature transformations if available, otherwise None. 
- """ - object_registry = ObjectRegistry() - - try: - # Feature transformer code is stored with fixed ID "feature_transformations" - code = object_registry.get(Code, "feature_transformations") - if code: - return code.code - return None - except KeyError: - logger.debug("Feature transformation code not found in registry") - return None - except Exception as e: - logger.warning(f"⚠️ Error getting feature transformer code: {str(e)}") - return None diff --git a/plexe/tools/context.py b/plexe/tools/context.py deleted file mode 100644 index 6f68901a..00000000 --- a/plexe/tools/context.py +++ /dev/null @@ -1,112 +0,0 @@ -""" -Tools for providing context to agents for code generation tasks. -""" - -import json -import logging -from typing import Dict, Any, List, Callable - -from pydantic import BaseModel -from smolagents import tool - -from plexe.config import code_templates -from plexe.core.entities.solution import Solution -from plexe.internal.common.provider import Provider -from plexe.core.object_registry import ObjectRegistry -from plexe.tools.datasets import create_input_sample -from plexe.tools.schemas import get_solution_schemas - -logger = logging.getLogger(__name__) - - -def get_inference_context_tool(llm_to_use: str) -> Callable: - """Returns a tool function to get inference context with the model ID pre-filled.""" - - @tool - def get_inference_context() -> Dict[str, Any]: - """ - Provides comprehensive context needed for generating inference code. Use this tool to retrieve - a summary of the training code, schemas, expected inputs for the purpose of planning the inference - code. - - Returns: - A dictionary containing all context needed for inference code generation - """ - object_registry = ObjectRegistry() - - # Retrieve the best performing solution - try: - best_solution = object_registry.get(Solution, "best_performing_solution") - except Exception as e: - raise ValueError(f"Best performing solution not found, has it been selected?: {str(e)}") - - # Retrieve the training code - try: - training_code = best_solution.training_code - except Exception as e: - raise ValueError( - f"Solution '{best_solution.id}' doesn't have training code, has it been trained?: {str(e)}" - ) - - # Retrieve schemas - try: - schemas = get_solution_schemas("best_performing_solution") - input_schema = schemas["input"] - output_schema = schemas["output"] - except Exception as e: - raise ValueError(f"Failed to retrieve schemas from registry: {str(e)}") - - # Retrieve input sample - try: - # Create input sample now that we know schema exists - create_input_sample() # TODO: this tool -> tool dependency will lead to difficult to debug errors - input_sample = object_registry.get(list, "predictor_input_sample") - except Exception as e: - raise ValueError(f"Failed to retrieve input sample: {str(e)}") - - # Extract artifacts - try: - artifact_names = _extract_artifacts(llm_to_use, training_code) - object_registry.register(list, "model_artifact_names", artifact_names, overwrite=True, immutable=True) - except Exception as e: - raise ValueError(f"Failed to extract artifacts from training code: {str(e)}") - - return { - "training_code": training_code, - "input_schema": input_schema, - "output_schema": output_schema, - "predictor_interface": code_templates.predictor_interface, - "predictor_template": code_templates.predictor_template, - "input_sample": input_sample, - "artifact_names": artifact_names, - } - - return get_inference_context - - -def _extract_artifacts(llm_to_use: str, code: str) -> List[str]: - """Extract 
model artifact names from training code using LLM""" - - class ArtifactResponse(BaseModel): - artifact_names: List[str] - - try: - provider = Provider(llm_to_use) - - names = json.loads( - provider.query( - "You are a code analysis assistant.", - ( - "Extract the names of all saved ML model artifacts from the following code. " - "The artifacts are usually saved using a function like torch.save, joblib.dump, or pickle.dump. " - "Any model artifact that is saved by the training script must be included in the output so that " - "it can be used in the inference code. DO NOT modify the artifact names from the script.\n\n" - "Here is the training code:\n" - f"```python\n{code}\n```" - ), - ArtifactResponse, - ) - )["artifact_names"] - return names - except Exception as e: - raise RuntimeError(f"Artifact extraction failed: {e}") from e diff --git a/plexe/tools/conversation.py b/plexe/tools/conversation.py deleted file mode 100644 index e9dce7ab..00000000 --- a/plexe/tools/conversation.py +++ /dev/null @@ -1,163 +0,0 @@ -""" -Tools for conversational model definition and build initiation. - -These tools support the conversational agent in helping users define their ML -requirements and starting model builds when ready. -""" - -import logging -import os -from pathlib import Path -from typing import Dict, List, Optional - -import pandas as pd -from smolagents import tool - -import plexe -from plexe.internal.common.datasets.adapter import DatasetAdapter -from plexe.internal.common.datasets.interface import TabularConvertible -from plexe.internal.common.provider import ProviderConfig -from plexe.core.object_registry import ObjectRegistry - -logger = logging.getLogger(__name__) - - -@tool -def validate_dataset_files(file_paths: List[str]) -> Dict[str, Dict]: - """ - Check if specified file paths can be read as datasets using pandas. - - Args: - file_paths: List of file paths to validate - - Returns: - Dictionary mapping file paths to validation results with status, shape, and error info - """ - results = {} - - for file_path in file_paths: - result = {"valid": False, "shape": None, "columns": None, "error": None} - - try: - # Check if file exists - if not os.path.exists(file_path): - result["error"] = f"File does not exist: {file_path}" - results[file_path] = result - continue - - # Determine file type and try to read - path_obj = Path(file_path) - file_extension = path_obj.suffix.lower() - - if file_extension == ".csv": - df = pd.read_csv(file_path) - elif file_extension in [".parquet", ".pq"]: - df = pd.read_parquet(file_path) - else: - result["error"] = f"Unsupported file format: {file_extension}. Supported formats: .csv, .parquet" - results[file_path] = result - continue - - # File successfully read - result["valid"] = True - result["dataset_name"] = path_obj.stem - - # Register the DataFrame in object registry - ObjectRegistry().register( - t=TabularConvertible, name=path_obj.stem, item=DatasetAdapter.coerce(df), immutable=True - ) - - except Exception as e: - result["error"] = str(e) - - results[file_path] = result - - return results - - -@tool -def initiate_model_build( - intent: str, - dataset_file_paths: List[str], - input_schema: Optional[Dict] = None, - output_schema: Optional[Dict] = None, - n_solutions_to_try: int = 1, -) -> Dict[str, str]: - """ - Initiate a model build by loading datasets from file paths and starting the build process. 
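Both conversation tools above begin by checking that every supplied path can actually be read; a standalone sketch of that check using only pandas and the standard library (the file path is hypothetical):

```python
from pathlib import Path
import pandas as pd

def check_dataset_file(file_path: str) -> dict:
    """Existence check, extension check, then a read attempt, as in validate_dataset_files."""
    result = {"valid": False, "shape": None, "columns": None, "error": None}
    path = Path(file_path)
    if not path.exists():
        result["error"] = f"File does not exist: {file_path}"
        return result
    try:
        if path.suffix.lower() == ".csv":
            df = pd.read_csv(path)
        elif path.suffix.lower() in (".parquet", ".pq"):
            df = pd.read_parquet(path)
        else:
            result["error"] = f"Unsupported file format: {path.suffix}"
            return result
        result.update(valid=True, shape=df.shape, columns=list(df.columns))
    except Exception as exc:  # malformed file, bad encoding, etc.
        result["error"] = str(exc)
    return result

print(check_dataset_file("data/uploads/churn.csv"))  # hypothetical upload path
```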
- - Args: - intent: Natural language description of what the model should do - dataset_file_paths: List of file paths to dataset files (CSV or Parquet) - input_schema: The input schema for the model, as a flat field:type dictionary; leave None if not known - output_schema: The output schema for the model, as a flat field:type dictionary; leave None if not known - n_solutions_to_try: Number of model solutions to try, out of which the best will be selected - - Returns: - Dictionary with build initiation status and details - """ - try: - # First validate all files can be read - validation_results = validate_dataset_files(dataset_file_paths) - - # Check if any files failed validation - failed_files = [path for path, result in validation_results.items() if not result["valid"]] - if failed_files: - error_details = {path: validation_results[path]["error"] for path in failed_files} - return { - "status": "failed", - "message": f"Failed to read dataset files: {failed_files}", - "errors": error_details, - } - - # Load datasets into DataFrames - df = None - for file_path in dataset_file_paths: - path_obj = Path(file_path) - file_extension = path_obj.suffix.lower() - - if file_extension == ".csv": - df = pd.read_csv(file_path) - elif file_extension in [".parquet", ".pq"]: - df = pd.read_parquet(file_path) - - # Import here to avoid circular dependencies - from plexe.model_builder import ModelBuilder - - # Create ModelBuilder instance with loaded DataFrames - model_builder = ModelBuilder( - provider=ProviderConfig( - default_provider="openai/gpt-4o", - orchestrator_provider="anthropic/claude-3-7-sonnet-20250219", - research_provider="openai/gpt-4o", - engineer_provider="anthropic/claude-sonnet-4-20250514", - ops_provider="anthropic/claude-sonnet-4-20250514", - tool_provider="openai/gpt-4o", - ), - ) - - # Start the build process - logger.info(f"Initiating model build with intent: {intent}") - logger.info(f"Using dataset files: {dataset_file_paths}") - - model = model_builder.build( - intent=intent, - datasets=[df], # Pass actual DataFrames instead of names - input_schema=input_schema, - output_schema=output_schema, - max_iterations=n_solutions_to_try, - ) - - plexe.save_model(model, "model-from-chat.tar.gz") - - # For now, just return success status - return { - "status": "initiated", - "message": f"Model build started successfully with intent: '{intent}'", - "dataset_files": dataset_file_paths, - "dataset_shapes": [validation_results[path]["shape"] for path in dataset_file_paths], - } - - except Exception as e: - logger.error(f"Failed to initiate model build: {str(e)}") - return {"status": "failed", "message": f"Failed to start model build: {str(e)}", "error": str(e)} diff --git a/plexe/tools/datasets.py b/plexe/tools/datasets.py deleted file mode 100644 index 7f67487c..00000000 --- a/plexe/tools/datasets.py +++ /dev/null @@ -1,650 +0,0 @@ -""" -Tools for dataset manipulation, splitting, and registration. - -These tools help with dataset operations within the model generation pipeline, including -splitting datasets into training, validation, and test sets, registering datasets with -the dataset registry, creating sample data for validation, previewing dataset content, -registering exploratory data analysis (EDA) reports, and registering feature engineering results. 
-""" - -import logging -from datetime import datetime -from typing import Dict, List, Any - -import numpy as np -import pandas as pd -from smolagents import tool - -from plexe.internal.common.datasets.adapter import DatasetAdapter -from plexe.internal.common.datasets.interface import TabularConvertible -from plexe.core.object_registry import ObjectRegistry -from plexe.internal.models.entities.code import Code - -logger = logging.getLogger(__name__) - - -@tool -def register_split_datasets( - dataset_name: str, - train_dataset: pd.DataFrame, - validation_dataset: pd.DataFrame, - test_dataset: pd.DataFrame, - splitting_code: str, -) -> Dict[str, str]: - """ - Register train, validation, and test datasets in the object registry after custom splitting. - This tool allows the agent to register datasets after performing custom splitting logic. - - Args: - dataset_name: Original name of the dataset that was split - train_dataset: pandas DataFrame containing training data - validation_dataset: pandas DataFrame containing validation data - test_dataset: pandas DataFrame containing test data - splitting_code: the code that was used to split the dataset - - Returns: - Dictionary containing lists of registered dataset names: - { - "train_dataset": name of the training dataset, - "validation_dataset": name of the validation dataset, - "test_dataset": name of the test dataset, - "dataset_size": Dictionary with sizes of each dataset - } - """ - - # Initialize the dataset registry - object_registry = ObjectRegistry() - - # Initialize the dataset sizes dictionary - dataset_sizes = {"train": [], "validation": [], "test": []} - - # Register each split dataset - # Convert pandas DataFrames to TabularDataset objects - train_ds = DatasetAdapter.coerce(train_dataset) - val_ds = DatasetAdapter.coerce(validation_dataset) - test_ds = DatasetAdapter.coerce(test_dataset) - - # Register split datasets in the registry - train_name = f"{dataset_name}_train" - val_name = f"{dataset_name}_val" - test_name = f"{dataset_name}_test" - - object_registry.register(TabularConvertible, train_name, train_ds, overwrite=True, immutable=True) - object_registry.register(TabularConvertible, val_name, val_ds, overwrite=True, immutable=True) - object_registry.register(TabularConvertible, test_name, test_ds, overwrite=True, immutable=True) - object_registry.register(Code, "dataset_splitting_code", Code(splitting_code), overwrite=True) - - # Store dataset sizes - dataset_sizes["train"].append(len(train_ds)) - dataset_sizes["validation"].append(len(val_ds)) - dataset_sizes["test"].append(len(test_ds)) - - logger.debug( - f"✅ Registered custom split of dataset {dataset_name} into train/validation/test with sizes " - f"{len(train_ds)}/{len(val_ds)}/{len(test_ds)}" - ) - - return { - "training_dataset": train_name, - "validation_dataset": val_name, - "test_dataset": test_name, - "dataset_size": dataset_sizes, - } - - -# TODO: does not need to be a tool -@tool -def create_input_sample(n_samples: int = 5) -> bool: - """ - Create and register a synthetic sample input dataset that matches the model's input schema. - This sample is used for validating inference code. 
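A rough sketch of the kind of splitting code that `register_split_datasets` above expects to receive: three frames plus the snippet itself passed as `splitting_code` (scikit-learn split; the file name and fractions are illustrative):

```python
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("sales.csv")  # hypothetical raw dataset named "sales"

# 70/15/15 split: hold out 30%, then cut the holdout in half.
train_df, holdout_df = train_test_split(df, test_size=0.30, random_state=42)
val_df, test_df = train_test_split(holdout_df, test_size=0.50, random_state=42)

# Passing these frames (and the text of this snippet as splitting_code) registers
# them as "sales_train", "sales_val" and "sales_test".
print(len(train_df), len(val_df), len(test_df))
```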
- - Args: - n_samples: Number of samples to generate (default: 5) - - Returns: - True if sample was successfully created and registered, False otherwise - """ - object_registry = ObjectRegistry() - input_schema = object_registry.get(dict, "input_schema") - - try: - # Create synthetic sample data that matches the schema - input_sample_dicts = [] - - # Generate synthetic examples - for i in range(n_samples): - sample = {} - for field_name, field_type in input_schema.items(): - # Generate appropriate sample values based on type - if field_type == "int": - sample[field_name] = i * 10 - elif field_type == "float": - sample[field_name] = i * 10.5 - elif field_type == "bool": - sample[field_name] = i % 2 == 0 - elif field_type == "str": - sample[field_name] = f"sample_{field_name}_{i}" - elif field_type == "List[int]": - sample[field_name] = [i * 10, i * 20, i * 30] - elif field_type == "List[float]": - sample[field_name] = [i * 10.5, i * 20.5, i * 30.5] - elif field_type == "List[bool]": - sample[field_name] = [True, False, i % 2 == 0] - elif field_type == "List[str]": - sample[field_name] = [f"item_{i}_1", f"item_{i}_2", f"item_{i}_3"] - else: - sample[field_name] = None - input_sample_dicts.append(sample) - - # TODO: we should use an LLM call to generate sensible values; then validate using pydantic - - # Register the input sample in the registry for validation tool to use - object_registry.register(list, "predictor_input_sample", input_sample_dicts, overwrite=True, immutable=True) - logger.debug( - f"✅ Registered synthetic input sample with {len(input_sample_dicts)} examples for inference validation" - ) - return True - - except Exception as e: - logger.warning(f"⚠️ Error creating input sample for validation: {str(e)}") - return False - - -@tool -def drop_null_columns(dataset_name: str) -> str: - """ - Drop all columns from the dataset that are completely null and register the modified dataset. 
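The synthetic samples produced above follow a simple per-type rule; a self-contained sketch for an assumed schema:

```python
# Assumed input schema; the real one comes from the "input_schema" registry entry.
input_schema = {"age": "int", "balance": "float", "is_active": "bool", "name": "str"}

samples = []
for i in range(5):
    row = {}
    for field, ftype in input_schema.items():
        if ftype == "int":
            row[field] = i * 10
        elif ftype == "float":
            row[field] = i * 10.5
        elif ftype == "bool":
            row[field] = i % 2 == 0
        elif ftype == "str":
            row[field] = f"sample_{field}_{i}"
        else:
            row[field] = None
    samples.append(row)

# samples[1] == {"age": 10, "balance": 10.5, "is_active": False, "name": "sample_name_1"}
```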
- - Args: - dataset_name: Name of the dataset to modify - - Returns: - Dictionary containing results of the operation: - - dataset_name: Name of the modified dataset - - n_dropped: Number of columns dropped - """ - object_registry = ObjectRegistry() - - try: - # Get dataset from registry - dataset = object_registry.get(TabularConvertible, dataset_name) - df = dataset.to_pandas() - - # Drop columns with all null values TODO: make this more intelligent - # Drop columns with >=50% missing values - null_columns = df.columns[df.isnull().mean() >= 0.5] - - # Drop constant columns (zero variance) - constant_columns = [col for col in df.columns if df[col].nunique(dropna=False) == 1] - - # Drop quasi-constant columns (e.g., one value in >95% of rows) - quasi_constant_columns = [ - col for col in df.columns if (df[col].value_counts(dropna=False, normalize=True).values[0] > 0.95) - ] - - # Drop columns with all unique values (likely IDs) - unique_columns = [col for col in df.columns if df[col].nunique(dropna=False) == len(df)] - - # Drop duplicate columns - duplicate_columns = [] - seen = {} - for col in df.columns: - col_data = df[col].to_numpy() - key = col_data.tobytes() if hasattr(col_data, "tobytes") else tuple(col_data) - if key in seen: - duplicate_columns.append(col) - else: - seen[key] = col - - # Combine all columns to drop (set to avoid duplicates) - all_bad_columns = ( - set(null_columns) - | set(constant_columns) - | set(quasi_constant_columns) - | set(unique_columns) - | set(duplicate_columns) - ) - n_dropped = len(all_bad_columns) - df.drop(columns=list(all_bad_columns), inplace=True) - - # Unregister the original dataset - object_registry.delete(TabularConvertible, dataset_name) - - # Register the modified dataset - object_registry.register(TabularConvertible, dataset_name, DatasetAdapter.coerce(df), immutable=True) - - return f"Successfully dropped {n_dropped} null columns from dataset '{dataset_name}'" - - except Exception as e: - raise RuntimeError(f"Failed to drop null columns from dataset '{dataset_name}': {str(e)}") - - -@tool -def get_dataset_preview(dataset_name: str) -> Dict[str, Any]: - """ - Generate a concise preview of a dataset with statistical information to help agents understand the data. 
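The column filters above are easiest to see on a toy frame; note that on very small frames legitimate features can also look "all unique" (values below are illustrative):

```python
import pandas as pd

# Toy frame exercising the same filters: >=50% nulls, constant, ID-like (all unique).
df = pd.DataFrame({
    "mostly_null": [None, None, None, 1.0],
    "constant": [1, 1, 1, 1],
    "row_id": [101, 102, 103, 104],
    "feature": [3.2, 1.5, 3.3, 0.9],
})

null_cols = list(df.columns[df.isnull().mean() >= 0.5])
constant_cols = [c for c in df.columns if df[c].nunique(dropna=False) == 1]
unique_cols = [c for c in df.columns if df[c].nunique(dropna=False) == len(df)]

print(null_cols)      # ['mostly_null']
print(constant_cols)  # ['constant']
print(unique_cols)    # ['row_id', 'feature']  <- tiny frames flag real features too
```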
- - Args: - dataset_name: Name of the dataset to preview - - Returns: - Dictionary containing dataset information: - - shape: dimensions of the dataset - - dtypes: data types of columns - - summary_stats: basic statistics (mean, median, min/max) - - missing_values: count of missing values per column - - sample_rows: sample of the data (5 rows) - """ - object_registry = ObjectRegistry() - - try: - # Get dataset from registry - dataset = object_registry.get(TabularConvertible, dataset_name) - df = dataset.to_pandas() - - # Basic shape and data types - result = { - "dataset_name": dataset_name, - "shape": {"rows": df.shape[0], "columns": df.shape[1]}, - "columns": list(df.columns), - "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()}, - "sample_rows": df.head(5).to_dict(orient="records"), - } - - # Basic statistics - numeric_cols = df.select_dtypes(include=np.number).columns.tolist() - if numeric_cols: - stats = df[numeric_cols].describe().to_dict() - result["summary_stats"] = { - col: { - "mean": stats[col].get("mean"), - "std": stats[col].get("std"), - "min": stats[col].get("min"), - "25%": stats[col].get("25%"), - "median": stats[col].get("50%"), - "75%": stats[col].get("75%"), - "max": stats[col].get("max"), - } - for col in numeric_cols - } - - # Missing values - missing_counts = df.isnull().sum().to_dict() - result["missing_values"] = {col: count for col, count in missing_counts.items() if count > 0} - - return result - - except Exception as e: - logger.warning(f"⚠️ Error creating dataset preview: {str(e)}") - return { - "error": f"Failed to generate preview for dataset '{dataset_name}': {str(e)}", - "dataset_name": dataset_name, - } - - -@tool -def register_eda_report( - dataset_name: str, - overview: Dict[str, Any], - feature_engineering_opportunities: Dict[str, Any], - data_quality_challenges: Dict[str, Any], - data_preprocessing_requirements: Dict[str, Any], - feature_importance: Dict[str, Any], - insights: List[str], - recommendations: List[str], -) -> str: - """ - Register an exploratory data analysis (EDA) report for a dataset in the Object Registry. - - This tool creates a structured report with actionable ML engineering insights from exploratory - data analysis and registers it in the Object Registry for use by other agents. 
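The preview returned above boils down to a handful of pandas calls; a minimal standalone version on toy data:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"age": [25, 32, None, 41], "city": ["Lisbon", "Porto", "Lisbon", None]})

preview = {
    "shape": {"rows": df.shape[0], "columns": df.shape[1]},
    "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
    "sample_rows": df.head(5).to_dict(orient="records"),
    "missing_values": {c: int(n) for c, n in df.isnull().sum().items() if n > 0},
}
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
if numeric_cols:
    preview["summary_stats"] = df[numeric_cols].describe().to_dict()

print(preview["missing_values"])  # {'age': 1, 'city': 1}
```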
- - Args: - dataset_name: Name of the dataset that was analyzed - overview: Essential dataset statistics including target variable analysis - feature_engineering_opportunities: Specific transformation needs, interaction effects, - and engineered features that would improve model performance - data_quality_challenges: Critical data issues with specific handling recommendations - data_preprocessing_requirements: Necessary preprocessing steps with clear justification - feature_importance: Assessment of feature predictive potential and relevance - insights: Key insights derived from the analysis that directly impact feature engineering - recommendations: Specific, prioritized actions for preprocessing and feature engineering - - Returns: - A string indicating success or failure of the registration - """ - object_registry = ObjectRegistry() - - try: - # Create structured EDA report with actionable ML focus - eda_report = { - "dataset_name": dataset_name, - "timestamp": datetime.now().isoformat(), - "overview": overview, - "feature_engineering_opportunities": feature_engineering_opportunities, - "data_quality_challenges": data_quality_challenges, - "data_preprocessing_requirements": data_preprocessing_requirements, - "feature_importance": feature_importance, - "insights": insights, - "recommendations": recommendations, - } - - # TODO: separate EDA reports for raw and transformed data - # Register in registry - object_registry.register(dict, f"eda_report_{dataset_name}", eda_report, overwrite=True) - logger.debug(f"✅ Registered EDA report for dataset '{dataset_name}'") - return f"Successfully registered EDA report for dataset '{dataset_name}'" - - except Exception as e: - logger.warning(f"⚠️ Error registering EDA report: {str(e)}") - raise RuntimeError(f"Failed to register EDA report for dataset '{dataset_name}': {str(e)}") - - -@tool -def register_feature_engineering_report( - dataset_name: str, - overview: Dict[str, Any], - feature_catalog: Dict[str, Any], - feature_importance: Dict[str, Any], - insights: List[str], - recommendations: List[str], -) -> str: - """ - Register a feature engineering report for a transformed dataset. This tool registers a structured report with - actionable insights from feature engineering for use by other agents. The purpose is to ensure that the features - created during feature engineering are well-documented. 
- - Args: - dataset_name: Name of the dataset that was analyzed - overview: Essential dataset statistics including target variable analysis - feature_catalog: Catalog of engineered features with descriptions and transformations - feature_importance: Assessment of feature predictive potential and relevance - insights: Key insights derived from the analysis that directly impact feature engineering - recommendations: Specific, prioritized actions for preprocessing and feature engineering - - Returns: - A string indicating success or failure of the registration - """ - object_registry = ObjectRegistry() - - try: - # Create structured feature engineering report - fe_report = { - "dataset_name": dataset_name, - "timestamp": datetime.now().isoformat(), - "overview": overview, - "feature_catalog": feature_catalog, - "feature_importance": feature_importance, - "insights": insights, - "recommendations": recommendations, - } - - # Register in registry - object_registry.register(dict, f"fe_report_{dataset_name}", fe_report, overwrite=True) - logger.debug(f"✅ Registered Feature Engineering report for dataset '{dataset_name}'") - return f"Successfully registered Feature Engineering report for dataset '{dataset_name}'" - - except Exception as e: - logger.warning(f"⚠️ Error registering Feature Engineering report: {str(e)}") - raise RuntimeError(f"Failed to register Feature Engineering report for dataset '{dataset_name}': {str(e)}") - - -@tool -def get_latest_datasets() -> Dict[str, str]: - """ - Get the most recent version of each dataset in the pipeline. Automatically detects transformed - versions and returns the latest. Use this tool to recall what datasets are available. - - Returns: - Dictionary mapping dataset roles to actual dataset names: - - "raw": The original dataset - - "transformed": The transformed dataset (if available) - - "train": Training split (transformed version if available) - - "val": Validation split (transformed version if available) - - "test": Test split (transformed version if available) - """ - object_registry = ObjectRegistry() - - try: - all_datasets = object_registry.list_by_type(TabularConvertible) - if not all_datasets: - return {} - - result = {} - - # Find raw datasets (no suffixes) - raw_datasets = [ - d for d in all_datasets if not any(suffix in d for suffix in ["_train", "_val", "_test", "_transformed"]) - ] - if raw_datasets: - # Use the first one (could be enhanced to handle multiple) - result["raw"] = raw_datasets[0] - - # Find transformed dataset (not a split) - transformed = [ - d - for d in all_datasets - if d.endswith("_transformed") - and not any(d.endswith(f"_transformed_{split}") for split in ["train", "val", "test"]) - ] - if transformed: - result["transformed"] = transformed[0] - - # Find splits - prefer transformed versions - for split in ["train", "val", "test"]: - # First look for transformed split - transformed_split = [d for d in all_datasets if d.endswith(f"_transformed_{split}")] - if transformed_split: - result[split] = transformed_split[0] - continue - - # Fall back to regular split - regular_split = [d for d in all_datasets if d.endswith(f"_{split}") and "_transformed_" not in d] - if regular_split: - result[split] = regular_split[0] - - return result - - except Exception as e: - logger.warning(f"⚠️ Error getting latest datasets: {str(e)}") - return {} - - -@tool -def get_dataset_for_splitting() -> str: - """ - Get the most appropriate dataset for splitting. Returns transformed version if available, - otherwise raw. 
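The suffix conventions used above ("_transformed", "_train", "_val", "_test") determine which dataset each role resolves to; a small sketch with assumed names:

```python
# Registered dataset names at some point mid-pipeline (assumed for illustration).
names = ["sales", "sales_transformed", "sales_transformed_train",
         "sales_transformed_val", "sales_test"]

roles = {"raw": next(n for n in names
                     if not any(s in n for s in ("_train", "_val", "_test", "_transformed")))}
for split in ("train", "val", "test"):
    preferred = [n for n in names if n.endswith(f"_transformed_{split}")]  # transformed split wins
    fallback = [n for n in names if n.endswith(f"_{split}") and "_transformed_" not in n]
    if preferred or fallback:
        roles[split] = (preferred or fallback)[0]

print(roles)
# {'raw': 'sales', 'train': 'sales_transformed_train',
#  'val': 'sales_transformed_val', 'test': 'sales_test'}
```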
Use this tool to get the dataset that needs to be split. - - Returns: - Name of the dataset to split - - Raises: - ValueError: If no suitable dataset is found for splitting - """ - object_registry = ObjectRegistry() - - try: - all_datasets = object_registry.list_by_type(TabularConvertible) - - # First, check if splits already exist - has_splits = any(d.endswith(("_train", "_val", "_test")) for d in all_datasets) - - # Prefer transformed datasets that haven't been split yet - transformed_unsplit = [ - d - for d in all_datasets - if d.endswith("_transformed") - and not any(f"{d}_{split}" in all_datasets for split in ["train", "val", "test"]) - ] - if transformed_unsplit: - # Return the most recent (last) one - return transformed_unsplit[-1] - - # If no unsplit transformed datasets, check for raw datasets that haven't been split - raw_unsplit = [ - d - for d in all_datasets - if not any(suffix in d for suffix in ["_train", "_val", "_test", "_transformed"]) - and not any(f"{d}_{split}" in all_datasets for split in ["train", "val", "test"]) - ] - - if raw_unsplit: - return raw_unsplit[-1] - - # If everything has been split, raise an informative error - if has_splits: - raise ValueError("All datasets have already been split. No unsplit datasets available.") - else: - raise ValueError("No datasets available for splitting. Ensure datasets have been registered.") - - except ValueError: - # Re-raise ValueError as is - raise - except Exception as e: - logger.warning(f"⚠️ Error finding dataset for splitting: {str(e)}") - raise ValueError(f"Failed to find dataset for splitting: {str(e)}") - - -@tool -def get_training_datasets() -> Dict[str, str]: - """ - Get datasets ready for model training. - Automatically finds the best available train/validation datasets. - - Returns: - Dictionary with 'train' and 'validation' dataset names - - Raises: - ValueError: If training datasets are not found - """ - object_registry = ObjectRegistry() - - try: - all_datasets = object_registry.list_by_type(TabularConvertible) - - # Look for train/val pairs, preferring transformed versions - train_datasets = [] - val_datasets = [] - - # First try to find transformed splits - for d in all_datasets: - if d.endswith("_transformed_train"): - train_datasets.append((d, 1)) # Priority 1 for transformed - elif d.endswith("_train") and "_transformed_" not in d: - train_datasets.append((d, 2)) # Priority 2 for regular - elif d.endswith("_transformed_val"): - val_datasets.append((d, 1)) - elif d.endswith("_val") and "_transformed_" not in d: - val_datasets.append((d, 2)) - - # Sort by priority (lower is better) - train_datasets.sort(key=lambda x: x[1]) - val_datasets.sort(key=lambda x: x[1]) - - if not train_datasets or not val_datasets: - raise ValueError("Training datasets not found. Ensure datasets have been split into train/validation sets.") - - # Return the best available pair - return {"train": train_datasets[0][0], "validation": val_datasets[0][0]} - - except ValueError: - # Re-raise ValueError as is - raise - except Exception as e: - logger.warning(f"⚠️ Error getting training datasets: {str(e)}") - raise ValueError(f"Failed to get training datasets: {str(e)}") - - -@tool -def get_test_dataset() -> str: - """ - Get the name of the test dataset for final model evaluation. 
- - Returns: - Name of the test dataset - - Raises: - ValueError: If test dataset is not found - """ - object_registry = ObjectRegistry() - - try: - all_datasets = object_registry.list_by_type(TabularConvertible) - - # Look for test datasets, preferring transformed version - test_datasets = [] - - for d in all_datasets: - if d.endswith("_transformed_test"): - test_datasets.append((d, 1)) # Priority 1 for transformed - elif d.endswith("_test") and "_transformed_" not in d: - test_datasets.append((d, 2)) # Priority 2 for regular - - if not test_datasets: - raise ValueError("Test dataset not found. Ensure datasets have been split into train/validation/test sets.") - - # Sort by priority and return the best - test_datasets.sort(key=lambda x: x[1]) - return test_datasets[0][0] - - except ValueError: - # Re-raise ValueError as is - raise - except Exception as e: - logger.warning(f"⚠️ Error getting test dataset: {str(e)}") - raise ValueError(f"Failed to get test dataset: {str(e)}") - - -# TODO: this can return a very large amount of data, consider dividing this into list_reports() and get_report(name) -@tool -def get_dataset_reports() -> Dict[str, Dict]: - """ - Get all available data analysis reports, including EDA for raw datasets and feature engineering reports - for transformed datasets. - - Returns: - Dictionary with the following structure: - - """ - object_registry = ObjectRegistry() - - try: - # Get all dict objects from registry - all_dicts = object_registry.list_by_type(dict) - - # Filter for EDA reports (they have pattern "eda_report_{dataset_name}") - eda_reports = {} - for name in all_dicts: - if name.startswith("eda_report_"): - # Extract dataset name - dataset_name = name[11:] # Remove "eda_report_" prefix - try: - report = object_registry.get(dict, name) - eda_reports[dataset_name] = report - except Exception as e: - logger.debug(f"Failed to retrieve EDA report {name}: {str(e)}") - continue - - # Filter for feature engineering reports (they have pattern "fe_report_{dataset_name}") - fe_reports = {} - for name in all_dicts: - if name.startswith("fe_report_"): - # Extract dataset name - dataset_name = name[10:] # Remove "fe_report_" prefix - try: - report = object_registry.get(dict, name) - fe_reports[dataset_name] = report - except Exception as e: - logger.debug(f"Failed to retrieve Feature Engineering report {name}: {str(e)}") - continue - - return { - "eda_reports": eda_reports, - "feature_engineering_reports": fe_reports, - } - - except Exception as e: - logger.warning(f"⚠️ Error getting EDA reports: {str(e)}") - return {} diff --git a/plexe/tools/evaluation.py b/plexe/tools/evaluation.py deleted file mode 100644 index f079728f..00000000 --- a/plexe/tools/evaluation.py +++ /dev/null @@ -1,98 +0,0 @@ -""" -This module defines agent tools for evaluating the properties and performance of models. -""" - -import logging -from typing import Dict, Callable - -from smolagents import tool - -from plexe.internal.common.provider import Provider -from plexe.internal.models.generation.review import ModelReviewer -from plexe.tools.schemas import get_solution_schemas - -logger = logging.getLogger(__name__) - - -def get_review_finalised_model(llm_to_use: str) -> Callable: - """Returns a tool function to review finalized models with the model ID pre-filled.""" - - @tool - def review_finalised_model( - intent: str, - solution_id: str, - ) -> dict: - """ - Reviews the entire model and extracts metadata. 
Use this function once you have completed work on the model, and - you want to 'wrap up' the work by performing a holistic review of what has been built. - - Args: - intent: The model intent - solution_id: The solution ID to review - - Returns: - A dictionary containing a summary and review of the model - """ - from plexe.core.object_registry import ObjectRegistry - from plexe.core.entities.solution import Solution - - object_registry = ObjectRegistry() - - try: - schemas = get_solution_schemas(solution_id) - input_schema = schemas["input"] - output_schema = schemas["output"] - except Exception: - raise ValueError("Failed to retrieve schemas. Was schema resolution completed?") - - try: - solution = object_registry.get(Solution, solution_id) - except Exception: - raise ValueError(f"Solution with ID '{solution_id}' not found. Was the solution created?") - - if not solution.training_code: - raise ValueError("Training code not found in solution. Was the solution implemented?") - - if not solution.inference_code: - raise ValueError("Inference code not found in solution. Was the inference code produced?") - - # Review the model using the ModelReviewer - reviewer = ModelReviewer(Provider(llm_to_use)) - r = reviewer.review_model( - intent, input_schema, output_schema, solution.plan, solution.training_code, solution.inference_code - ) - - # Update the solution with the review - solution.review = r - object_registry.register(Solution, solution_id, solution, overwrite=True) - return r - - return review_finalised_model - - -@tool -def get_solution_performances() -> Dict[str, float]: - """ - Returns the performance of all successfully trained solutions so far. The performances are returned as a dictionary - mapping the 'solution ID' to the performance score. Use this function to remind yourself of the performance - of all solutions, so that you can do things such as select the best performing solution for deployment. - - Returns: - A dictionary mapping solution IDs to their performance scores with structure: - { - "solution_id_1": performance_score_1, - "solution_id_2": performance_score_2, - } - """ - from plexe.core.object_registry import ObjectRegistry - from plexe.core.entities.solution import Solution - - object_registry = ObjectRegistry() - performances = {} - - for solution_id in object_registry.list_by_type(Solution): - solution = object_registry.get(Solution, solution_id) - if solution.performance is not None and solution.performance.value is not None: - performances[solution_id] = solution.performance.value - - return performances diff --git a/plexe/tools/execution.py b/plexe/tools/execution.py deleted file mode 100644 index 17270952..00000000 --- a/plexe/tools/execution.py +++ /dev/null @@ -1,312 +0,0 @@ -""" -Tools related to code execution, including running training code in isolated environments and -applying feature transformations to datasets. - -These tools automatically handle model artifact registration through the ArtifactRegistry, -ensuring that artifacts generated during the execution can be retrieved later in the pipeline. 
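Given the mapping returned by `get_solution_performances`, choosing a deployment candidate is a one-liner once the metric direction is known (IDs and scores below are made up):

```python
performances = {"solution-a1": 0.84, "solution-b2": 0.91, "solution-c3": 0.88}

# Higher-is-better metric (e.g. accuracy); use min() for lower-is-better metrics such as RMSE.
best_id = max(performances, key=performances.get)
print(best_id, performances[best_id])  # solution-b2 0.91
```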
-""" - -import logging -import uuid -import types -import warnings -from typing import Dict, List, Callable, Type - -from smolagents import tool - -from plexe.callbacks import Callback -from plexe.internal.common.datasets.interface import TabularConvertible -from plexe.internal.common.datasets.adapter import DatasetAdapter -from plexe.core.object_registry import ObjectRegistry -from plexe.internal.models.entities.code import Code -from plexe.internal.models.entities.artifact import Artifact -from plexe.internal.models.entities.metric import Metric, MetricComparator, ComparisonMethod -from plexe.core.entities.solution import Solution -from plexe.internal.models.execution.process_executor import ProcessExecutor - -logger = logging.getLogger(__name__) - - -def get_executor_tool(distributed: bool = False) -> Callable: - """Get the appropriate executor tool based on the distributed flag.""" - - @tool - def execute_training_code( - solution_id: str, - code: str, - working_dir: str, - dataset_names: List[str], - timeout: int, - metric_to_optimise_name: str, - metric_to_optimise_comparison_method: str, - ) -> Dict: - """Executes training code in an isolated environment and updates the Solution object. - - Args: - solution_id: ID of the Solution object to update with execution results - code: The code to execute - working_dir: Directory to use for execution - dataset_names: List of dataset names to retrieve from the registry - timeout: Maximum execution time in seconds - metric_to_optimise_name: The name of the metric to optimize for - metric_to_optimise_comparison_method: The comparison method for the metric - - Returns: - A dictionary containing execution results with model artifacts and their registry names - """ - # Log the distributed flag - logger.debug(f"execute_training_code called with distributed={distributed}") - - from plexe.callbacks import BuildStateInfo - - object_registry = ObjectRegistry() - - execution_id = f"{solution_id}-{uuid.uuid4()}" - try: - # Get the existing Solution object from registry - solution = object_registry.get(Solution, solution_id) - - # Get actual datasets from registry - datasets = object_registry.get_multiple(TabularConvertible, dataset_names) - - # Convert string to enum if needed - if "HIGHER_IS_BETTER" in metric_to_optimise_comparison_method: - comparison_method = ComparisonMethod.HIGHER_IS_BETTER - elif "LOWER_IS_BETTER" in metric_to_optimise_comparison_method: - comparison_method = ComparisonMethod.LOWER_IS_BETTER - elif "TARGET_IS_BETTER" in metric_to_optimise_comparison_method: - comparison_method = ComparisonMethod.TARGET_IS_BETTER - else: - comparison_method = ComparisonMethod.HIGHER_IS_BETTER - - # Update the solution with training code and get callbacks - solution.training_code = code - # Create state info once for all callbacks - state_info = BuildStateInfo( - intent="Unknown", # Will be filled by agent context - provider="Unknown", # Will be filled by agent context - input_schema=None, # Will be filled by agent context - output_schema=None, # Will be filled by agent context - datasets=datasets, - iteration=0, # Default value, no longer used for MLFlow run naming - node=solution, - ) - - # Notify all callbacks about execution start - _notify_callbacks(object_registry.get_all(Callback), "start", state_info) - - # Import here to avoid circular imports - from plexe.config import config - - # Get the appropriate executor class via the factory - executor_class = _get_executor_class(distributed=distributed) - - # Create an instance of the executor - 
logger.debug(f"Creating {executor_class.__name__} for execution ID: {execution_id}") - executor = executor_class( - execution_id=execution_id, - code=code, - working_dir=working_dir, - datasets=datasets, - timeout=timeout, - code_execution_file_name=config.execution.runfile_name, - ) - - # Execute and collect results - ProcessExecutor.run() handles cleanup internally - logger.debug(f"Executing solution {solution} using executor {executor}") - result = executor.run() - logger.debug(f"Execution result: {result}") - solution.execution_time = result.exec_time - solution.execution_stdout = result.term_out - solution.exception_was_raised = result.exception is not None - solution.exception = result.exception or None - solution.model_artifacts = [Artifact.from_path(p) for p in result.model_artifact_paths] - - # Handle the performance metric properly using the consolidated validation logic - performance_value = None - is_worst = True - - if result.is_valid_performance(): - performance_value = result.performance - is_worst = False - - # Create a metric object with proper handling of None or invalid values - solution.performance = Metric( - name=metric_to_optimise_name, - value=performance_value, - comparator=MetricComparator(comparison_method=comparison_method), - is_worst=is_worst, - ) - - # Notify callbacks about the execution end with the same state_info - # The solution reference in state_info automatically reflects the updates to solution - _notify_callbacks(object_registry.get_all(Callback), "end", state_info) - - # Check if the execution failed in any way - if solution.exception is not None: - raise RuntimeError(f"Execution failed with exception: {solution.exception}") - if not result.is_valid_performance(): - raise RuntimeError(f"Execution failed due to not producing a valid performance: {result.performance}") - - # Register artifacts and update solution in registry - object_registry.register_multiple(Artifact, {a.name: a for a in solution.model_artifacts}) - - # Update the solution in the registry with all execution results - object_registry.register(Solution, solution_id, solution, overwrite=True) - - # Return results - return { - "success": not solution.exception_was_raised, - "performance": ( - { - "name": solution.performance.name if solution.performance else None, - "value": solution.performance.value if solution.performance else None, - "comparison_method": ( - str(solution.performance.comparator.comparison_method) if solution.performance else None - ), - } - if solution.performance - else None - ), - "exception": str(solution.exception) if solution.exception else None, - "model_artifact_names": [a.name for a in solution.model_artifacts], - "solution_id": solution_id, - } - except Exception as e: - # Log full stack trace at debug level - import traceback - - logger.debug(f"Error executing training code: {str(e)}\n{traceback.format_exc()}") - - return { - "success": False, - "performance": None, - "exception": str(e), - "model_artifact_names": [], - } - - return execute_training_code - - -def _get_executor_class(distributed: bool = False) -> Type: - """Get the appropriate executor class based on the distributed flag. 
- - Args: - distributed: Whether to use distributed execution if available - - Returns: - Executor class (not instance) appropriate for the environment - """ - # Log the distributed flag - logger.debug(f"get_executor_class using distributed={distributed}") - if distributed: - try: - # Try to import Ray executor - from plexe.internal.models.execution.ray_executor import RayExecutor - - logger.debug("Using Ray for distributed execution") - return RayExecutor - except ImportError: - # Fall back to process executor if Ray is not available - logger.warning("Ray not available, falling back to ProcessExecutor") - return ProcessExecutor - - # Default to ProcessExecutor for non-distributed execution - logger.debug("Using ProcessExecutor (non-distributed)") - return ProcessExecutor - - -def _notify_callbacks(callbacks: Dict, event_type: str, build_state_info) -> None: - """Helper function to notify callbacks with consistent error handling. - - Args: - callbacks: Dictionary of callbacks from the registry - event_type: The event type - either "start" or "end" - build_state_info: The state info to pass to callbacks - """ - method_name = f"on_iteration_{event_type}" - - for callback in callbacks.values(): - try: - getattr(callback, method_name)(build_state_info) - except Exception as e: - # Log full stack trace at debug level - import traceback - - logger.debug( - f"Error in callback {callback.__class__.__name__}.{method_name}: {e}\n{traceback.format_exc()}" - ) - # Log a shorter message at warning level - logger.warning(f"Error in callback {callback.__class__.__name__}.{method_name}: {str(e)[:50]}") - - -@tool -def apply_feature_transformer(dataset_name: str) -> Dict: - """ - Applies a feature transformer to datasets and registers the transformed datasets. The name of the - new transformed dataset is returned in the response. - - Args: - dataset_name: Name of datasets to transform - - Returns: - Dictionary with results of transformation: - - success: Boolean indicating success or failure - - original_dataset_name: Name of the original dataset - - new_dataset_name: Name of the transformed dataset - """ - object_registry = ObjectRegistry() - - try: - # Get feature transformer code from registry - code_obj = object_registry.get(Code, "feature_transformations") - transformer_code = code_obj.code - - # Load code as module - module = types.ModuleType("feature_transformer_module") - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - exec(transformer_code, module.__dict__) - - # Instantiate transformer - transformer = module.FeatureTransformerImplementation() - - # Get dataset - dataset = object_registry.get(TabularConvertible, dataset_name) - df = dataset.to_pandas() - - # Apply transformation - transformed_df = transformer.transform(df) - - # Register transformed dataset - transformed_name = f"{dataset_name}_transformed" - transformed_ds = DatasetAdapter.coerce(transformed_df) - object_registry.register(TabularConvertible, transformed_name, transformed_ds, overwrite=True, immutable=True) - - logger.debug(f"✅ Applied feature transformer to {dataset_name} → {transformed_name}") - - return {"success": True, "original_dataset_name": dataset_name, "new_dataset_name": transformed_name} - except Exception as e: - import traceback - - logger.debug(f"Error applying feature transformer: {str(e)}\n{traceback.format_exc()}") - return {"success": False, "error": str(e), "transformed_datasets": []} - - -@tool -def get_model_artifacts() -> List[str]: - """ - Get all registered model artifact names. 
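`apply_feature_transformer` above exec's the registered code and expects it to define a `FeatureTransformerImplementation` class with a `transform(df)` method; a minimal example of code satisfying that contract (the engineered feature is illustrative):

```python
import pandas as pd

class FeatureTransformerImplementation:
    """Minimal transformer matching the interface loaded by apply_feature_transformer."""

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        out = df.copy()
        # Illustrative engineered feature; real transformers are produced by the FE agent.
        if {"height_cm", "weight_kg"}.issubset(out.columns):
            out["bmi"] = out["weight_kg"] / (out["height_cm"] / 100) ** 2
        return out

# Quick check of the contract:
df = pd.DataFrame({"height_cm": [170.0, 180.0], "weight_kg": [65.0, 85.0]})
print(FeatureTransformerImplementation().transform(df)["bmi"].round(1).tolist())  # [22.5, 26.2]
```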
- - Returns: - List of artifact names (e.g., ["model.pkl", "scaler.pkl", ...]) - """ - object_registry = ObjectRegistry() - - try: - # Get all artifact names from registry - artifact_names = object_registry.list_by_type(Artifact) - return artifact_names - except Exception as e: - logger.warning(f"⚠️ Error getting model artifacts: {str(e)}") - return [] diff --git a/plexe/tools/metrics.py b/plexe/tools/metrics.py deleted file mode 100644 index c5f789a0..00000000 --- a/plexe/tools/metrics.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -Tools related to metrics selection and model review/metadata extraction. -""" - -import logging -from typing import Dict, Callable - -from smolagents import tool - -from plexe.internal.common.provider import Provider -from plexe.internal.models.generation.planning import SolutionPlanGenerator - -logger = logging.getLogger(__name__) - - -def get_select_target_metric(llm_to_use: str) -> Callable: - """Returns a tool function to select target metrics with the model ID pre-filled.""" - - @tool - def select_target_metric(task: str) -> Dict: - """ - Selects the appropriate target metric to optimise for the given task. - - Args: - task: The task definition combining intent, input schema, and output schema - - Returns: - A dictionary containing the metric information - """ - plan_generator = SolutionPlanGenerator(Provider(llm_to_use)) - metric = plan_generator.select_target_metric(task) - return { - "name": metric.name, - "value": metric.value, - "comparison_method": str(metric.comparator.comparison_method), - } - - return select_target_metric diff --git a/plexe/tools/response_formatting.py b/plexe/tools/response_formatting.py deleted file mode 100644 index 8d445a4c..00000000 --- a/plexe/tools/response_formatting.py +++ /dev/null @@ -1,99 +0,0 @@ -""" -This module provides tools for forcing an agent to return its response in a specific format. -""" - -from typing import Dict, List, Optional - -from smolagents import tool - - -@tool -def format_final_orchestrator_agent_response( - best_solution_id: str, - performance_metric_name: str, - performance_metric_value: float, - performance_metric_comparison_method: str, - model_review_output: Dict[str, str], -) -> dict: - """ - Returns a dictionary containing the exact fields that the agent must return in its final response. The purpose - of this tool is to 'package' the final deliverables of the ML engineering task. The best_solution_id should be - the ID of the solution that was selected as the best performing one. 
- - Args: - best_solution_id: The solution ID for the selected best ML solution - performance_metric_name: The name of the performance metric to optimise that was used in this task - performance_metric_value: The value of the performance attained by the selected ML model - performance_metric_comparison_method: The comparison method used to evaluate the performance metric - model_review_output: The output of the 'review_model' tool which contains a review of the selected ML model - - Returns: - Dictionary containing the fields that must be returned by the agent in its final response - """ - from plexe.core.object_registry import ObjectRegistry - from plexe.core.entities.solution import Solution - - # Get the solution plan from the best solution - object_registry = ObjectRegistry() - try: - best_solution = object_registry.get(Solution, best_solution_id) - solution_plan = best_solution.plan or "Solution plan not available" - except Exception: - solution_plan = "Solution plan not available" - - return { - "solution_plan": solution_plan, - "performance": { - "name": performance_metric_name, - "value": performance_metric_value, - "comparison_method": performance_metric_comparison_method, - }, - "metadata": model_review_output, - } - - -@tool -def format_final_mle_agent_response( - solution_id: str, - execution_success: bool, - performance_value: Optional[float] = None, - exception: Optional[str] = None, - model_artifact_names: Optional[List[str]] = None, -) -> dict: - """ - Returns a dictionary containing the exact fields that the agent must return in its final response. The fields - 'performance_value', 'exception', and 'model_artifact_names' are optional. They MUST be included if they are - available, but can be omitted if they are not available. - - Args: - solution_id: The solution ID returned by the code execution tool after executing the training code - execution_success: Boolean indicating if the training code executed successfully - performance_value: The value of the performance attained by the selected ML model, if any - exception: Exception message if the code execution failed, if any - model_artifact_names: A list with the names of all the model artifacts created by the model training script - - Returns: - Dictionary containing the fields that must be returned by the agent in its final response - """ - - return { - "solution_id": solution_id, - "execution_success": execution_success, - "performance_value": performance_value, - "exception": exception, - "model_artifact_names": model_artifact_names, - } - - -@tool -def format_final_mlops_agent_response(inference_code_id: str) -> dict: - """ - Returns a dictionary containing the exact fields that the agent must return in its final response. - - Args: - inference_code_id: The inference code id returned by the code validation tool after validating the inference code - - Returns: - Dictionary containing the fields that must be returned by the agent in its final response - """ - return {"python_inference_code": inference_code_id} diff --git a/plexe/tools/schemas.py b/plexe/tools/schemas.py deleted file mode 100644 index a3494984..00000000 --- a/plexe/tools/schemas.py +++ /dev/null @@ -1,293 +0,0 @@ -""" -Tools for schema inference, definition, and validation. 
-""" - -import logging -from typing import Dict, Any - -from smolagents import tool - -from plexe.internal.common.datasets.interface import TabularConvertible -from plexe.core.object_registry import ObjectRegistry -from plexe.internal.common.utils.pydantic_utils import map_to_basemodel -from plexe.internal.common.utils.pandas_utils import convert_dtype_to_python - -logger = logging.getLogger(__name__) - - -@tool -def register_global_schemas( - input_schema: Dict[str, str], output_schema: Dict[str, str], reasoning: str -) -> Dict[str, str]: - """ - Register input and output schemas that should be used by all models built for all solutions. - - Args: - input_schema: Finalized input schema as field:type dictionary - output_schema: Finalized output schema as field:type dictionary - reasoning: Explanation of schema design decisions - - Returns: - Status message confirming registration - - Raises: - ValueError: If schema validation fails - KeyError: If schema registration fails - """ - object_registry = ObjectRegistry() - - # Validate schemas by attempting to convert them to Pydantic models - try: - map_to_basemodel("InputSchema", input_schema) - map_to_basemodel("OutputSchema", output_schema) - except Exception as e: - error_msg = f"Schema validation or registration failed: {str(e)}" - logger.error(error_msg) - raise ValueError(error_msg) from e - - # Register input schema if possible; global schemas are typically registered once - try: - object_registry.register(dict, "input_schema", input_schema, immutable=True) - except ValueError as e: - if "already registered" not in str(e): - raise e - - # Register output schema if possible; global schemas are typically registered once - try: - object_registry.register(dict, "output_schema", output_schema, immutable=True) - except ValueError as e: - if "already registered" not in str(e): - raise e - - # Register reasoning if possible - try: - object_registry.register(str, "schema_reasoning", reasoning) - except ValueError as e: - if "already registered" not in str(e): - raise e - - return {"status": "success", "message": "Schemas validated and registered successfully"} - - -@tool -def get_dataset_schema(dataset_name: str) -> Dict[str, Any]: - """ - Extract the schema (column names and types) from a dataset. This is useful for understanding the structure - of the dataset and how it can be used in model training. - - Args: - dataset_name: Name of the dataset in the registry - - Returns: - Dictionary with column names and their python types - """ - object_registry = ObjectRegistry() - dataset = object_registry.get(TabularConvertible, dataset_name) - df = dataset.to_pandas() - - # Get column names and infer python types - schema = {} - for col in df.columns: - dtype = df[col].dtype - # Map pandas types to Python types, detecting List[T] for object columns - sample_values = df[col].dropna().head(10).tolist() if len(df) > 0 else None - py_type = convert_dtype_to_python(dtype, sample_values) - schema[col] = py_type - - return {"dataset_name": dataset_name, "columns": schema} - - -@tool -def get_global_schemas() -> Dict[str, Dict[str, str]]: - """ - Get global input and output schemas that should apply to a model. - - Returns: - Dictionary with 'input' and 'output' schemas (if registered). - Each schema is a dict mapping field names to types. - Returns empty dict for missing schemas. 
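As a usage sketch, an agent pins the global schemas once, as plain field-to-type dictionaries, before any solution-specific work. The import path reflects the module as it existed before this removal; the field names and the `"int"`/`"float"`/`"str"` type strings are assumptions for illustration, and the `@tool`-wrapped function is called directly:

```python
# Sketch: registering global schemas as field -> type-name dictionaries.
from plexe.tools.schemas import register_global_schemas  # path prior to this removal

status = register_global_schemas(
    input_schema={"age": "int", "income": "float", "country": "str"},
    output_schema={"churn_probability": "float"},
    reasoning="Inputs mirror the uploaded dataset columns; the output is a probability.",
)
print(status)  # {"status": "success", "message": "Schemas validated and registered successfully"}
```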
- """ - object_registry = ObjectRegistry() - result = {} - - try: - # Try to get input schema - try: - input_schema = object_registry.get(dict, "input_schema") - if input_schema: - result["input"] = input_schema - except KeyError: - logger.debug("Global input schema not found in registry") - - # Try to get output schema - try: - output_schema = object_registry.get(dict, "output_schema") - if output_schema: - result["output"] = output_schema - except KeyError: - logger.debug("Global output schema not found in registry") - - return result - - except Exception as e: - logger.warning(f"⚠️ Error getting global schemas: {str(e)}") - return {} - - -@tool -def register_solution_schemas( - solution_id: str, input_schema: Dict[str, str], output_schema: Dict[str, str], reasoning: str -) -> Dict[str, str]: - """ - Register input and output schemas for a specific solution. - - Args: - solution_id: ID of the solution to register schemas for - input_schema: Solution-specific input schema as field:type dictionary - output_schema: Solution-specific output schema as field:type dictionary - reasoning: Explanation of schema design decisions - - Returns: - Status message confirming registration - - Raises: - ValueError: If schema validation fails or solution not found - """ - from plexe.core.entities.solution import Solution - - object_registry = ObjectRegistry() - - # If schemas are locked by the user, we must use the global schemas - input_is_locked = object_registry.get(bool, "input_schema_is_locked") - output_is_locked = object_registry.get(bool, "output_schema_is_locked") - global_schemas = get_global_schemas() - global_input_schema = global_schemas.get("input") - global_output_schema = global_schemas.get("output") - - solution = object_registry.get(Solution, solution_id) - - # If both schemas are locked, use global schemas - if input_is_locked and output_is_locked: - solution.input_schema = global_input_schema - solution.output_schema = global_output_schema - solution.schema_reasoning = "Using schemas provided by the user" - # If input is locked, validate output schema and set - elif input_is_locked: - try: - map_to_basemodel("OutputSchema", output_schema) - except Exception as e: - error_msg = f"Output schema validation failed: {str(e)}" - logger.error(error_msg) - raise ValueError(error_msg) from e - solution.input_schema = global_input_schema - solution.output_schema = output_schema - solution.schema_reasoning = reasoning - # If output is locked, validate input schema and set - elif output_is_locked: - try: - map_to_basemodel("InputSchema", input_schema) - except Exception as e: - error_msg = f"Input schema validation failed: {str(e)}" - logger.error(error_msg) - raise ValueError(error_msg) from e - solution.input_schema = input_schema - solution.output_schema = global_output_schema - solution.schema_reasoning = reasoning - # If neither schema is locked, validate both schemas - else: - try: - map_to_basemodel("InputSchema", input_schema) - map_to_basemodel("OutputSchema", output_schema) - except Exception as e: - error_msg = f"Schema validation failed: {str(e)}" - logger.error(error_msg) - raise ValueError(error_msg) from e - solution.input_schema = input_schema - solution.output_schema = output_schema - solution.schema_reasoning = reasoning - - # Re-register the updated solution - object_registry.register(Solution, solution_id, solution, overwrite=True) - - # Construct response depending on actions taken - if input_is_locked and output_is_locked: - logger.debug(f"✅ Registered global schemas for 
solution '{solution_id}' (schemas locked)") - return { - "status": "success", - "message": "Nothing to register, as schemas are locked by user.", - "registered_input_schema": global_input_schema, - "registered_output_schema": global_output_schema, - } - elif input_is_locked: - logger.debug(f"✅ Registered output schema for solution '{solution_id}' (input locked)") - return { - "status": "success", - "message": "New output schema was registered; input schema is locked by the user so defaulted to global.", - "registered_input_schema": global_input_schema, - "registered_output_schema": output_schema, - } - elif output_is_locked: - logger.debug(f"✅ Registered input schema for solution '{solution_id}' (output locked)") - return { - "status": "success", - "message": "New input schema was registered; output schema is locked by the user so defaulted to global.", - "registered_input_schema": input_schema, - "registered_output_schema": global_output_schema, - } - else: - logger.debug(f"✅ Registered schemas for solution '{solution_id}'") - return { - "status": "success", - "message": f"Schemas validated and registered for solution '{solution_id}'", - "registered_input_schema": input_schema, - "registered_output_schema": output_schema, - } - - -@tool -def get_solution_schemas(solution_id: str) -> Dict[str, Dict[str, str]]: - """ - Get schemas for a specific solution, with fallback to global schemas. - - Args: - solution_id: ID of the solution to get schemas for - - Returns: - Dictionary with 'input' and 'output' schemas. - Prioritizes solution-specific schemas over global schemas. - Returns empty dict if no schemas found. - """ - from plexe.core.entities.solution import Solution - - object_registry = ObjectRegistry() - result = {} - - try: - # First try to get solution-specific schemas - try: - solution = object_registry.get(Solution, solution_id) - if solution.input_schema: - result["input"] = solution.input_schema - if solution.output_schema: - result["output"] = solution.output_schema - - # If we have both schemas from solution, return them - if "input" in result and "output" in result: - logger.debug(f"Using solution-specific schemas for '{solution_id}'") - return result - - except KeyError: - logger.debug(f"Solution '{solution_id}' not found, falling back to global schemas") - - # Fallback to global schemas for missing schemas - global_schemas = get_global_schemas() - if "input" not in result and "input" in global_schemas: - result["input"] = global_schemas["input"] - if "output" not in result and "output" in global_schemas: - result["output"] = global_schemas["output"] - - return result - - except Exception as e: - logger.warning(f"⚠️ Error getting solution schemas: {str(e)}") - return {} diff --git a/plexe/tools/solutions.py b/plexe/tools/solutions.py deleted file mode 100644 index ee26de98..00000000 --- a/plexe/tools/solutions.py +++ /dev/null @@ -1,121 +0,0 @@ -""" -Tools for creating and managing Solution objects in the ML workflow. - -These tools handle the creation, registration, and management of Solution objects -that represent complete ML approaches from planning through execution. -""" - -import logging -from typing import Dict, List - -from smolagents import tool - -from plexe.core.object_registry import ObjectRegistry -from plexe.core.entities.solution import Solution - -logger = logging.getLogger(__name__) - - -def get_solution_creation_tool(max_solutions: int = 1): - """ - Returns a tool function to create a new Solution object with a plan. 
- This tool is used by the ML Research Scientist agent to develop new solution approaches. - - Args: - max_solutions: Maximum number of solutions that can be created at once - - Returns: - A tool function that creates a Solution object - """ - if max_solutions <= 0: - raise ValueError("max_solutions must be greater than 0") - - @tool - def create_solution(plan: str) -> Dict[str, str]: - """ - Creates a new Solution object with the given plan and registers it in the object registry so - that other agents in the team can access it. - - This tool should be used by the ML Research Scientist agent when developing new solution - approaches. Each solution represents a distinct ML strategy that will be implemented - and evaluated. - - Args: - plan: The detailed solution plan and strategy description for this ML approach - - Returns: - Dictionary containing the solution ID and success confirmation: - { - "solution_id": "unique_solution_identifier", - "message": "Success message" - } - """ - object_registry = ObjectRegistry() - - # Check if the maximum number of solutions has been reached - if len(object_registry.get_all(Solution)) >= max_solutions: - raise RuntimeError(f"Maximum number of solutions ({max_solutions}) reached. Cannot create more solutions.") - - try: - # Create a new Solution object with the provided plan - solution = Solution(plan=plan) - - # Register the solution in the object registry - object_registry.register(Solution, solution.id, solution, overwrite=False) - - logger.debug(f"✅ Created and registered solution with ID '{solution.id}'") - - return { - "solution_id": solution.id, - "message": f"Successfully created and registered solution with ID '{solution.id}'", - } - - except Exception as e: - logger.warning(f"⚠️ Error creating solution: {str(e)}") - raise RuntimeError(f"Failed to create solution: {str(e)}") - - return create_solution - - -@tool -def get_solution_plan_by_id(solution_id: str) -> str: - """ - Retrieves a model solution plan by its ID. - - Args: - solution_id: ID of the Solution - - Returns: - The plan string of the Solution - """ - object_registry = ObjectRegistry() - - try: - solution = object_registry.get(Solution, solution_id) - return solution.plan if solution.plan else "No plan available for this solution" - - except Exception as e: - logger.warning(f"⚠️ Error retrieving solution plan: {str(e)}") - raise RuntimeError(f"Failed to retrieve solution plan: {str(e)}") - - -@tool -def list_solutions() -> List[str]: - """ - Lists all Solution IDs currently available. Use this tool to see all available solutions if you run into - issues with retrieving a specific solution. - - Returns: - List of solution IDs currently available - """ - object_registry = ObjectRegistry() - - try: - solution_ids = object_registry.list_by_type(Solution) - - logger.debug(f"✅ Available solutions: {solution_ids}") - return solution_ids - - except Exception as e: - logger.warning(f"⚠️ Error listing solutions: {str(e)}") - raise RuntimeError(f"Failed to list solutions: {str(e)}") diff --git a/plexe/tools/testing.py b/plexe/tools/testing.py deleted file mode 100644 index a43bbef4..00000000 --- a/plexe/tools/testing.py +++ /dev/null @@ -1,102 +0,0 @@ -""" -Tools for model testing and evaluation. - -These tools help with model evaluation operations within the model generation pipeline, -including registering testing code and evaluation reports. 
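A short usage sketch for these solution tools, with the import paths as they stood before this removal; the cap of two solutions and the plan strings are made up for illustration:

```python
# Sketch: creating, listing, and inspecting Solution objects via the tools above.
from plexe.tools.solutions import (  # paths prior to this removal
    get_solution_creation_tool,
    get_solution_plan_by_id,
    list_solutions,
)

create_solution = get_solution_creation_tool(max_solutions=2)

first = create_solution(plan="Gradient-boosted trees on raw tabular features with early stopping")
create_solution(plan="Regularised logistic-regression baseline on scaled features")
# A third call would raise RuntimeError because max_solutions=2 has been reached.

print(list_solutions())                                           # two registered solution IDs
print(get_solution_plan_by_id(solution_id=first["solution_id"]))  # plan text of the first solution
```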
-""" - -import logging -from typing import Dict, List - -from smolagents import tool - -from plexe.core.object_registry import ObjectRegistry - -logger = logging.getLogger(__name__) - - -@tool -def register_testing_code(solution_id: str, testing_code: str) -> str: - """ - Register the testing/evaluation code in the object registry and update the Solution object. The testing code - must first have been executed successfully before registration. - - Args: - solution_id: ID of the Solution object to update - testing_code: Python code used for model testing and evaluation - - Returns: - Success message confirming registration - """ - object_registry = ObjectRegistry() - - try: - # Update the Solution object with testing code - from plexe.core.entities.solution import Solution - - solution = object_registry.get(Solution, solution_id) - - solution.testing_code = testing_code - - object_registry.register(Solution, solution_id, solution, overwrite=True) - - logger.debug(f"✅ Registered model testing code for solution '{solution_id}'") - return f"Successfully registered model testing code for solution '{solution_id}'" - - except Exception as e: - logger.warning(f"⚠️ Error registering testing code: {str(e)}") - raise RuntimeError(f"Failed to register testing code: {str(e)}") - - -@tool -def register_evaluation_report( - solution_id: str, - model_performance_summary: Dict, - detailed_metrics: Dict, - quality_analysis: Dict, - recommendations: List[str], - testing_insights: List[str], -) -> str: - """ - Register comprehensive evaluation report in the object registry and link to Solution. - - This tool creates a structured report with model evaluation results and registers - it in the Object Registry for use by other agents or final model output. - - Args: - solution_id: ID of the Solution object to link the evaluation report to - model_performance_summary: Overall performance metrics and scores - detailed_metrics: Comprehensive metrics breakdown by class/category - quality_analysis: Error patterns, robustness, interpretability insights - recommendations: Specific recommendations for deployment/improvement - testing_insights: Key insights from testing that impact model usage - - Returns: - Success message confirming registration - """ - from plexe.core.entities.solution import Solution - - object_registry = ObjectRegistry() - - try: - # Create structured evaluation report - evaluation_report = { - "solution_id": solution_id, - "model_performance_summary": model_performance_summary, - "detailed_metrics": detailed_metrics, - "quality_analysis": quality_analysis, - "recommendations": recommendations, - "testing_insights": testing_insights, - } - - # Update Solution object with summary analysis - solution = object_registry.get(Solution, solution_id) - solution.model_evaluation_report = evaluation_report - object_registry.register(Solution, solution_id, solution, overwrite=True) - - logger.debug(f"✅ Registered model evaluation report for solution '{solution_id}'") - return f"Successfully registered model evaluation report for solution '{solution_id}'" - - except Exception as e: - logger.warning(f"⚠️ Error registering evaluation report: {str(e)}") - raise RuntimeError(f"Failed to register evaluation report: {str(e)}") diff --git a/plexe/tools/training.py b/plexe/tools/training.py deleted file mode 100644 index 32cb16e4..00000000 --- a/plexe/tools/training.py +++ /dev/null @@ -1,104 +0,0 @@ -""" -Tools related to code generation, including solution planning, training code, -and inference code generation. 
-""" - -import logging -from typing import List, Callable - -from smolagents import tool - -from plexe.core.object_registry import ObjectRegistry -from plexe.internal.common.provider import Provider -from plexe.internal.models.generation.training import TrainingCodeGenerator - -logger = logging.getLogger(__name__) - - -@tool -def register_best_solution(best_solution_id: str) -> str: - """ - Register the solution with the best performance as the final selected solution in the object - registry. This step is required in order for the solution to be available for final model building. - - Args: - best_solution_id: 'solution_id' of the best performing solution - - Returns: - Success message confirming registration - """ - from plexe.core.entities.solution import Solution - - object_registry = ObjectRegistry() - - try: - # Get the best solution - best_solution = object_registry.get(Solution, best_solution_id) - - # Register the solution with a fixed ID for easy retrieval - object_registry.register(Solution, "best_performing_solution", best_solution, overwrite=True) - - logger.debug(f"✅ Registered best performing solution with ID '{best_solution_id}'") - return f"Successfully registered solution with ID '{best_solution_id}' as the best performing solution." - - except Exception as e: - logger.warning(f"⚠️ Error registering best solution: {str(e)}") - raise RuntimeError(f"Failed to register best solution: {str(e)}") - - -def get_training_code_generation_tool(llm_to_use: str) -> Callable: - """Returns a tool function to generate training code with the model ID pre-filled.""" - - @tool - def generate_training_code( - task: str, solution_plan: str, train_datasets: List[str], validation_datasets: List[str] - ) -> str: - """Generates training code based on the solution plan. - - Args: - task: The task definition - solution_plan: The solution plan to implement - train_datasets: Keys of datasets to use for training - validation_datasets: Keys of datasets to use for validation - - Returns: - Generated training code as a string - """ - train_generator = TrainingCodeGenerator(Provider(llm_to_use)) - return train_generator.generate_training_code(task, solution_plan, train_datasets, validation_datasets) - - return generate_training_code - - -def get_training_code_fixing_tool(llm_to_use: str) -> Callable: - """Returns a tool function to fix training code with the model ID pre-filled.""" - - @tool - def fix_training_code( - training_code: str, - solution_plan: str, - review: str, - train_datasets: List[str], - validation_datasets: List[str], - issue: str, - ) -> str: - """ - Fixes issues in the training code based on a review. - - Args: - training_code: The training code to fix - solution_plan: The solution plan being implemented - review: Review comments about the code and its issues, ideally a summary analysis of the issue - train_datasets: Keys of datasets to use for training - validation_datasets: Keys of datasets to use for validation - issue: Description of the issue to address - - Returns: - Fixed training code as a string - """ - train_generator = TrainingCodeGenerator(Provider(llm_to_use)) - return train_generator.fix_training_code( - training_code, solution_plan, review, train_datasets, validation_datasets, issue - ) - - return fix_training_code diff --git a/plexe/tools/validation.py b/plexe/tools/validation.py deleted file mode 100644 index 76c95ad0..00000000 --- a/plexe/tools/validation.py +++ /dev/null @@ -1,216 +0,0 @@ -""" -Tools related to code validation, including syntax and security checks. 
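These factories follow the same closure pattern as the deleted `metrics.py` above: the LLM identifier is bound once, and the agent-facing tool only exposes task-level arguments. A rough wiring sketch; the provider id and dataset keys are assumptions, and the `@tool`-wrapped functions are called directly for illustration:

```python
# Sketch: pre-binding the provider and generating/repairing training code.
from plexe.tools.training import (  # paths prior to this removal
    get_training_code_generation_tool,
    get_training_code_fixing_tool,
)

llm_id = "openai/gpt-4o"  # assumed provider id; any id accepted by Provider() is bound the same way
generate_training_code = get_training_code_generation_tool(llm_id)
fix_training_code = get_training_code_fixing_tool(llm_id)

code = generate_training_code(
    task="Predict churn_probability from customer features",
    solution_plan="Gradient-boosted trees with early stopping",
    train_datasets=["customers_train"],
    validation_datasets=["customers_val"],
)

# If execution later fails, the same bound provider repairs the script:
# fix_training_code(code, solution_plan, review, ["customers_train"], ["customers_val"], issue)
```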
-""" - -import logging -import ast -from typing import Dict - -from smolagents import tool - -from plexe.config import code_templates -from plexe.internal.models.entities.code import Code -from plexe.internal.models.validation.composites import ( - InferenceCodeValidator, - TrainingCodeValidator, -) -from plexe.tools.schemas import get_solution_schemas - -logger = logging.getLogger(__name__) - - -@tool -def validate_training_code(training_code: str) -> Dict: - """Validates training code for syntax and security issues. - - Args: - training_code: The training code to validate - - Returns: - A dictionary containing validation results - """ - validator = TrainingCodeValidator() - validation = validator.validate(training_code) - - if validation.passed: - return _success_response(validation.message) - else: - error_type = type(validation.exception).__name__ if validation.exception else "UnknownError" - error_details = str(validation.exception) if validation.exception else "Unknown error" - return _error_response("validation", error_type, error_details, validation.message) - - -@tool -def validate_inference_code( - solution_id: str, - inference_code: str, -) -> Dict: - """ - Validates inference code for syntax, security, and correctness, and updates the Solution object. - - Args: - solution_id: ID of the Solution object to update with inference code - inference_code: The inference code to validate - - Returns: - Dict with validation results and error details if validation fails - """ - from plexe.internal.common.utils.pydantic_utils import map_to_basemodel - from plexe.core.object_registry import ObjectRegistry - from plexe.core.entities.solution import Solution - - object_registry = ObjectRegistry() - - # Get solution object from registry - try: - solution = object_registry.get(Solution, solution_id) - except Exception as e: - return _error_response("solution_retrieval", type(e).__name__, str(e)) - - # Get schemas from registry - try: - schemas = get_solution_schemas("best_performing_solution") - input_schema = schemas["input"] - output_schema = schemas["output"] - except Exception as e: - return _error_response("schema_preparation", type(e).__name__, str(e)) - - # Convert schemas to pydantic models - try: - input_model = map_to_basemodel("InputSchema", input_schema) - output_model = map_to_basemodel("OutputSchema", output_schema) - except Exception as e: - return _error_response("schema_preparation", type(e).__name__, str(e)) - - # Get input samples - try: - input_samples = object_registry.get(list, "predictor_input_sample") - if not input_samples: - return _error_response("input_sample", "MissingData", "Input sample list is empty") - except Exception as e: - return _error_response("input_sample", type(e).__name__, str(e)) - - # Validate the code - validator = InferenceCodeValidator(input_schema=input_model, output_schema=output_model, input_sample=input_samples) - validation = validator.validate(inference_code, model_artifacts=solution.model_artifacts) - - # Return appropriate result - if validation.passed: - # Update the Solution object with inference code, and register an alias for production use - solution.inference_code = inference_code - object_registry.register(Solution, solution_id, solution, overwrite=True) - object_registry.register(Solution, "final_inference_solution", solution, overwrite=True, immutable=True) - - # Also instantiate and register the predictor for the model tester agent - try: - import types - - predictor_module = types.ModuleType("predictor") - exec(inference_code, 
predictor_module.__dict__) - predictor_class = getattr(predictor_module, "PredictorImplementation") - predictor = predictor_class(solution.model_artifacts) - - # Register the instantiated predictor - from plexe.core.interfaces.predictor import Predictor - - object_registry.register(Predictor, "trained_predictor", predictor, overwrite=True) - logger.debug("✅ Registered instantiated predictor for testing") - - except Exception as e: - logger.warning(f"⚠️ Failed to register instantiated predictor: {str(e)}") - # Don't fail validation if predictor registration fails - - return _success_response(validation.message, solution_id) - - # Extract error details from validation result - error_type = validation.error_type or ( - type(validation.exception).__name__ if validation.exception else "UnknownError" - ) - error_details = validation.error_details or (str(validation.exception) if validation.exception else "Unknown error") - - return _error_response(validation.error_stage or "unknown", error_type, error_details, validation.message) - - -def _error_response(stage, exc_type, details, message=None): - """Helper to create error response dictionaries""" - return { - "passed": False, - "error_stage": stage, - "error_type": exc_type, - "error_details": details, - "message": message or details, - } - - -def _success_response(message, solution_id=None): - """Helper to create success response dictionaries""" - response = {"passed": True, "message": message} - # Only include solution_id for inference code validation - if solution_id is not None: - response["solution_id"] = solution_id - return response - - -@tool -def validate_feature_transformations(transformation_code: str) -> Dict: - """ - Validates feature transformation code for syntax correctness and implementation - of the FeatureTransformer interface. 
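The predictor-registration branch above uses a small dynamic-loading idiom: the validated inference code is executed into a throwaway module, its `PredictorImplementation` class is looked up, and an instance is built from the solution's model artifacts. Isolated as a sketch (the helper name and placeholder arguments are not from the repository):

```python
# Sketch of the dynamic-loading step used inside validate_inference_code above.
import types


def load_predictor(inference_code: str, model_artifacts: list):
    """Exec validated inference code and instantiate its PredictorImplementation."""
    predictor_module = types.ModuleType("predictor")
    exec(inference_code, predictor_module.__dict__)  # code has already passed validation
    predictor_class = getattr(predictor_module, "PredictorImplementation")
    return predictor_class(model_artifacts)  # same constructor call as in the tool above
```

The resulting object is what gets registered as the `trained_predictor` Predictor for the model-tester agent.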
- - Args: - transformation_code: Python code for transforming datasets - - Returns: - Dictionary with validation results - """ - import types - import warnings - from plexe.core.object_registry import ObjectRegistry - from plexe.core.interfaces.feature_transformer import FeatureTransformer - - # Check for syntax errors - try: - ast.parse(transformation_code) - except SyntaxError as e: - return _error_response("syntax", "SyntaxError", str(e)) - - # Load the code as a module to check for proper FeatureTransformer implementation - try: - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - module = types.ModuleType("test_feature_transformer") - exec(transformation_code, module.__dict__) - - # Check if the module contains the FeatureTransformerImplementation class - if not hasattr(module, "FeatureTransformerImplementation"): - return _error_response( - "class_definition", - "MissingClass", - "Code must define a class named 'FeatureTransformerImplementation'", - ) - - # Check if the class is a subclass of FeatureTransformer - transformer_class = getattr(module, "FeatureTransformerImplementation") - if not issubclass(transformer_class, FeatureTransformer): - return _error_response( - "class_definition", - "InvalidClass", - "FeatureTransformerImplementation must be a subclass of FeatureTransformer", - ) - except Exception as e: - return _error_response( - "validation", - type(e).__name__, - str(e), - message=f"The feature transformer must be a subclass of the following interface:\n\n" - f"```python\n" - f"{code_templates.feature_transformer_interface}" - f"```", - ) - - # Register the transformation code with a fixed ID - object_registry = ObjectRegistry() - code_id = "feature_transformations" - object_registry.register(Code, code_id, Code(transformation_code), overwrite=True) - - return {"passed": True, "message": "Feature transformation code validated successfully", "code_id": code_id} diff --git a/plexe/ui/frontend/.dockerignore b/plexe/ui/frontend/.dockerignore new file mode 100644 index 00000000..77b6c575 --- /dev/null +++ b/plexe/ui/frontend/.dockerignore @@ -0,0 +1,8 @@ +node_modules +dist +.DS_Store +*.local +.env.local +npm-debug.log* +yarn-debug.log* +yarn-error.log* diff --git a/plexe/ui/frontend/Dockerfile.dev b/plexe/ui/frontend/Dockerfile.dev new file mode 100644 index 00000000..58679f9d --- /dev/null +++ b/plexe/ui/frontend/Dockerfile.dev @@ -0,0 +1,19 @@ +# Development Dockerfile for React with Hot Reload +# Use: docker-compose -f docker-compose.dev.yml up -d +# This enables hot reload during development + +FROM node:18-alpine + +WORKDIR /app + +# Copy package files +COPY package.json package-lock.json* yarn.lock* ./ + +# Install dependencies +RUN if [ -f yarn.lock ]; then yarn install --frozen-lockfile; else npm install; fi + +# Expose Vite port +EXPOSE 3000 + +# Run Vite dev server with host 0.0.0.0 to allow connections from outside container +CMD ["npm", "run", "dev", "--", "--host", "0.0.0.0"] diff --git a/plexe/ui/frontend/README.md b/plexe/ui/frontend/README.md new file mode 100644 index 00000000..2efa8e3f --- /dev/null +++ b/plexe/ui/frontend/README.md @@ -0,0 +1,72 @@ +# Plexe Frontend + +This folder contains a minimal Vite + React frontend for the Plexe Assistant. 
+ +## Development + +Quick commands (from this folder): + +- Install dependencies: + +```bash +npm install +``` + +- Run dev server: + +```bash +npm run dev +``` + +- Build production assets (output to `dist`): + +```bash +npm run build +``` + +The FastAPI server at `plexe/server.py` will serve `ui/frontend/dist/index.html` if present. During development you can run `npm run dev` and open the Vite dev URL (usually http://localhost:5173) to work on the frontend. + +## Docker + +The frontend includes a `Dockerfile` for containerized builds: + +- **Build stage**: Uses Node 18 to build the React app +- **Serve stage**: Uses nginx to serve the built assets + +The `docker-compose.yml` at the repo root includes a `frontend` service that: +- Builds the frontend automatically using the `Dockerfile` +- Serves the frontend on `http://localhost:3000` +- Is managed alongside PostgreSQL, MLflow, and the backend + +### Build & Run with Docker + +From the repo root: + +```bash +# Start all services (frontend, backend, postgres, mlflow, pgadmin) +docker compose up -d + +# View logs +docker compose logs -f frontend +docker compose logs -f backend + +# Stop all services +docker compose down +``` + +The frontend will be available at `http://localhost:3000` and the backend API at `http://localhost:8000`. + +## Architecture + +- **Frontend container**: nginx serving static React SPA +- **Backend container**: Python FastAPI serving the Plexe API and WebSocket at `/ws` +- **Database container**: PostgreSQL +- **MLflow container**: Experiment tracking +- **pgAdmin container**: Database UI + +The frontend connects to the backend via: +- REST API endpoints (if needed) +- WebSocket at `/ws` (for real-time chat) + +Both use relative URLs by default, so they work when served from the same origin (as with Docker Compose). + diff --git a/plexe/ui/frontend/index.html b/plexe/ui/frontend/index.html new file mode 100644 index 00000000..d14848d8 --- /dev/null +++ b/plexe/ui/frontend/index.html @@ -0,0 +1,12 @@ + + + + + + Plexe Assistant (React) + + +
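Since the README above states that the backend exposes the real-time chat over a WebSocket at `/ws` (port 8000 in the compose setup), a quick smoke test can be run without the frontend at all. This is a sketch only: it assumes the `websockets` package is installed, and the JSON payload shape is a guess rather than something taken from the server code:

```python
# Sketch: smoke-testing the backend chat WebSocket described in the README above.
import asyncio
import json

import websockets  # assumed to be installed separately


async def main() -> None:
    async with websockets.connect("ws://localhost:8000/ws") as ws:
        await ws.send(json.dumps({"message": "hello"}))  # payload shape is an assumption
        reply = await ws.recv()
        print(reply)


if __name__ == "__main__":
    asyncio.run(main())
```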
+ + + diff --git a/plexe/ui/frontend/nginx.conf b/plexe/ui/frontend/nginx.conf new file mode 100644 index 00000000..bd48064d --- /dev/null +++ b/plexe/ui/frontend/nginx.conf @@ -0,0 +1,50 @@ +server { + listen 80; + server_name _; + + root /usr/share/nginx/html; + index index.html; + + # Proxy WebSocket connections to backend + location /ws { + proxy_pass http://backend:8000; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_read_timeout 86400; + } + + # Proxy API requests to backend + location /api { + proxy_pass http://backend:8000; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + # Serve static files with cache-busting headers + location /assets/ { + expires 1y; + add_header Cache-Control "public, immutable"; + } + + # SPA fallback: route all non-static requests to index.html + location / { + try_files $uri $uri/ /index.html; + expires -1; + add_header Cache-Control "no-cache, no-store, must-revalidate"; + } + + # Health check endpoint + location /health { + access_log off; + return 200 "healthy\n"; + add_header Content-Type text/plain; + } +} + diff --git a/plexe/ui/frontend/package-lock.json b/plexe/ui/frontend/package-lock.json new file mode 100644 index 00000000..cb505045 --- /dev/null +++ b/plexe/ui/frontend/package-lock.json @@ -0,0 +1,1629 @@ +{ + "name": "plexe-frontend", + "version": "0.0.1", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "plexe-frontend", + "version": "0.0.1", + "dependencies": { + "react": "18.2.0", + "react-dom": "18.2.0" + }, + "devDependencies": { + "@vitejs/plugin-react": "^4.2.0", + "vite": "^5.0.0" + } + }, + "node_modules/@babel/code-frame": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.27.1.tgz", + "integrity": "sha512-cjQ7ZlQ0Mv3b47hABuTevyTuYN4i+loJKGeV9flcCgIK37cCXRh+L1bd3iBHlynerhQ7BhCkn2BPbQUL+rGqFg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-validator-identifier": "^7.27.1", + "js-tokens": "^4.0.0", + "picocolors": "^1.1.1" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/compat-data": { + "version": "7.28.5", + "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.28.5.tgz", + "integrity": "sha512-6uFXyCayocRbqhZOB+6XcuZbkMNimwfVGFji8CTZnCzOHVGvDqzvitu1re2AU5LROliz7eQPhB8CpAMvnx9EjA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/core": { + "version": "7.28.5", + "resolved": "https://registry.npmjs.org/@babel/core/-/core-7.28.5.tgz", + "integrity": "sha512-e7jT4DxYvIDLk1ZHmU/m/mB19rex9sv0c2ftBtjSBv+kVM/902eh0fINUzD7UwLLNR+jU585GxUJ8/EBfAM5fw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/code-frame": "^7.27.1", + "@babel/generator": "^7.28.5", + "@babel/helper-compilation-targets": "^7.27.2", + "@babel/helper-module-transforms": "^7.28.3", + "@babel/helpers": "^7.28.4", + "@babel/parser": "^7.28.5", + "@babel/template": "^7.27.2", + "@babel/traverse": "^7.28.5", + "@babel/types": "^7.28.5", + "@jridgewell/remapping": "^2.3.5", + "convert-source-map": "^2.0.0", + "debug": "^4.1.0", + "gensync": "^1.0.0-beta.2", + "json5": 
"^2.2.3", + "semver": "^6.3.1" + }, + "engines": { + "node": ">=6.9.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/babel" + } + }, + "node_modules/@babel/generator": { + "version": "7.28.5", + "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.28.5.tgz", + "integrity": "sha512-3EwLFhZ38J4VyIP6WNtt2kUdW9dokXA9Cr4IVIFHuCpZ3H8/YFOl5JjZHisrn1fATPBmKKqXzDFvh9fUwHz6CQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.28.5", + "@babel/types": "^7.28.5", + "@jridgewell/gen-mapping": "^0.3.12", + "@jridgewell/trace-mapping": "^0.3.28", + "jsesc": "^3.0.2" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-compilation-targets": { + "version": "7.27.2", + "resolved": "https://registry.npmjs.org/@babel/helper-compilation-targets/-/helper-compilation-targets-7.27.2.tgz", + "integrity": "sha512-2+1thGUUWWjLTYTHZWK1n8Yga0ijBz1XAhUXcKy81rd5g6yh7hGqMp45v7cadSbEHc9G3OTv45SyneRN3ps4DQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/compat-data": "^7.27.2", + "@babel/helper-validator-option": "^7.27.1", + "browserslist": "^4.24.0", + "lru-cache": "^5.1.1", + "semver": "^6.3.1" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-globals": { + "version": "7.28.0", + "resolved": "https://registry.npmjs.org/@babel/helper-globals/-/helper-globals-7.28.0.tgz", + "integrity": "sha512-+W6cISkXFa1jXsDEdYA8HeevQT/FULhxzR99pxphltZcVaugps53THCeiWA8SguxxpSp3gKPiuYfSWopkLQ4hw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-module-imports": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/helper-module-imports/-/helper-module-imports-7.27.1.tgz", + "integrity": "sha512-0gSFWUPNXNopqtIPQvlD5WgXYI5GY2kP2cCvoT8kczjbfcfuIljTbcWrulD1CIPIX2gt1wghbDy08yE1p+/r3w==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/traverse": "^7.27.1", + "@babel/types": "^7.27.1" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-module-transforms": { + "version": "7.28.3", + "resolved": "https://registry.npmjs.org/@babel/helper-module-transforms/-/helper-module-transforms-7.28.3.tgz", + "integrity": "sha512-gytXUbs8k2sXS9PnQptz5o0QnpLL51SwASIORY6XaBKF88nsOT0Zw9szLqlSGQDP/4TljBAD5y98p2U1fqkdsw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-module-imports": "^7.27.1", + "@babel/helper-validator-identifier": "^7.27.1", + "@babel/traverse": "^7.28.3" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/@babel/helper-plugin-utils": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/helper-plugin-utils/-/helper-plugin-utils-7.27.1.tgz", + "integrity": "sha512-1gn1Up5YXka3YYAHGKpbideQ5Yjf1tDa9qYcgysz+cNCXukyLl6DjPXhD3VRwSb8c0J9tA4b2+rHEZtc6R0tlw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-string-parser": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.27.1.tgz", + "integrity": "sha512-qMlSxKbpRlAridDExk92nSobyDdpPijUq2DW6oDnUqd0iOGxmQjyqhMIihI9+zv4LPyZdRje2cavWPbCbWm3eA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-validator-identifier": { + "version": "7.28.5", + "resolved": 
"https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.28.5.tgz", + "integrity": "sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-validator-option": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-option/-/helper-validator-option-7.27.1.tgz", + "integrity": "sha512-YvjJow9FxbhFFKDSuFnVCe2WxXk1zWc22fFePVNEaWJEu8IrZVlda6N0uHwzZrUM1il7NC9Mlp4MaJYbYd9JSg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helpers": { + "version": "7.28.4", + "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.28.4.tgz", + "integrity": "sha512-HFN59MmQXGHVyYadKLVumYsA9dBFun/ldYxipEjzA4196jpLZd8UjEEBLkbEkvfYreDqJhZxYAWFPtrfhNpj4w==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/template": "^7.27.2", + "@babel/types": "^7.28.4" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/parser": { + "version": "7.28.5", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.28.5.tgz", + "integrity": "sha512-KKBU1VGYR7ORr3At5HAtUQ+TV3SzRCXmA/8OdDZiLDBIZxVyzXuztPjfLd3BV1PRAQGCMWWSHYhL0F8d5uHBDQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/types": "^7.28.5" + }, + "bin": { + "parser": "bin/babel-parser.js" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@babel/plugin-transform-react-jsx-self": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-jsx-self/-/plugin-transform-react-jsx-self-7.27.1.tgz", + "integrity": "sha512-6UzkCs+ejGdZ5mFFC/OCUrv028ab2fp1znZmCZjAOBKiBK2jXD1O+BPSfX8X2qjJ75fZBMSnQn3Rq2mrBJK2mw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.27.1" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-react-jsx-source": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-jsx-source/-/plugin-transform-react-jsx-source-7.27.1.tgz", + "integrity": "sha512-zbwoTsBruTeKB9hSq73ha66iFeJHuaFkUbwvqElnygoNbj/jHRsSeokowZFN3CZ64IvEqcmmkVe89OPXc7ldAw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.27.1" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/template": { + "version": "7.27.2", + "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.27.2.tgz", + "integrity": "sha512-LPDZ85aEJyYSd18/DkjNh4/y1ntkE5KwUHWTiqgRxruuZL2F1yuHligVHLvcHY2vMHXttKFpJn6LwfI7cw7ODw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/code-frame": "^7.27.1", + "@babel/parser": "^7.27.2", + "@babel/types": "^7.27.1" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/traverse": { + "version": "7.28.5", + "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.28.5.tgz", + "integrity": "sha512-TCCj4t55U90khlYkVV/0TfkJkAkUg3jZFA3Neb7unZT8CPok7iiRfaX0F+WnqWqt7OxhOn0uBKXCw4lbL8W0aQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/code-frame": "^7.27.1", + "@babel/generator": "^7.28.5", + "@babel/helper-globals": "^7.28.0", + "@babel/parser": "^7.28.5", + "@babel/template": "^7.27.2", + "@babel/types": "^7.28.5", + "debug": 
"^4.3.1" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/types": { + "version": "7.28.5", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.28.5.tgz", + "integrity": "sha512-qQ5m48eI/MFLQ5PxQj4PFaprjyCTLI37ElWMmNs0K8Lk3dVeOdNpB3ks8jc7yM5CDmVC73eMVk/trk3fgmrUpA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-string-parser": "^7.27.1", + "@babel/helper-validator-identifier": "^7.28.5" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz", + "integrity": "sha512-1SDgH6ZSPTlggy1yI6+Dbkiz8xzpHJEVAlF/AM1tHPLsf5STom9rwtjE4hKAF20FfXXNTFqEYXyJNWh1GiZedQ==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.21.5.tgz", + "integrity": "sha512-vCPvzSjpPHEi1siZdlvAlsPxXl7WbOVUBBAowWug4rJHb68Ox8KualB+1ocNvT5fjv6wpkX6o/iEpbDrf68zcg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.21.5.tgz", + "integrity": "sha512-c0uX9VAUBQ7dTDCjq+wdyGLowMdtR/GoC2U5IYk/7D1H1JYC0qseD7+11iMP2mRLN9RcCMRcjC4YMclCzGwS/A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.21.5.tgz", + "integrity": "sha512-D7aPRUUNHRBwHxzxRvp856rjUHRFW1SdQATKXH2hqA0kAZb1hKmi02OpYRacl0TxIGz/ZmXWlbZgjwWYaCakTA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/darwin-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.21.5.tgz", + "integrity": "sha512-DwqXqZyuk5AiWWf3UfLiRDJ5EDd49zg6O9wclZ7kUMv2WRFr4HKjXp/5t8JZ11QbQfUS6/cRCKGwYhtNAY88kQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.21.5.tgz", + "integrity": "sha512-se/JjF8NlmKVG4kNIuyWMV/22ZaerB+qaSi5MdrXtd6R08kvs2qCN4C09miupktDitvh8jRFflwGFBQcxZRjbw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.21.5.tgz", + "integrity": "sha512-5JcRxxRDUJLX8JXp/wcBCy3pENnCgBR9bN6JsY4OmhfUtIHe3ZW0mawA7+RDAcMLrMIZaf03NlQiX9DGyB8h4g==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.21.5", + "resolved": 
"https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.21.5.tgz", + "integrity": "sha512-J95kNBj1zkbMXtHVH29bBriQygMXqoVQOQYA+ISs0/2l3T9/kj42ow2mpqerRBxDJnmkUDCaQT/dfNXWX/ZZCQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.21.5.tgz", + "integrity": "sha512-bPb5AHZtbeNGjCKVZ9UGqGwo8EUu4cLq68E95A53KlxAPRmUyYv2D6F0uUI65XisGOL1hBP5mTronbgo+0bFcA==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.21.5.tgz", + "integrity": "sha512-ibKvmyYzKsBeX8d8I7MH/TMfWDXBF3db4qM6sy+7re0YXya+K1cem3on9XgdT2EQGMu4hQyZhan7TeQ8XkGp4Q==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.21.5.tgz", + "integrity": "sha512-YvjXDqLRqPDl2dvRODYmmhz4rPeVKYvppfGYKSNGdyZkA01046pLWyRKKI3ax8fbJoK5QbxblURkwK/MWY18Tg==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.21.5.tgz", + "integrity": "sha512-uHf1BmMG8qEvzdrzAqg2SIG/02+4/DHB6a9Kbya0XDvwDEKCoC8ZRWI5JJvNdUjtciBGFQ5PuBlpEOXQj+JQSg==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.21.5.tgz", + "integrity": "sha512-IajOmO+KJK23bj52dFSNCMsz1QP1DqM6cwLUv3W1QwyxkyIWecfafnI555fvSGqEKwjMXVLokcV5ygHW5b3Jbg==", + "cpu": [ + "mips64el" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.21.5.tgz", + "integrity": "sha512-1hHV/Z4OEfMwpLO8rp7CvlhBDnjsC3CttJXIhBi+5Aj5r+MBvy4egg7wCbe//hSsT+RvDAG7s81tAvpL2XAE4w==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.21.5.tgz", + "integrity": "sha512-2HdXDMd9GMgTGrPWnJzP2ALSokE/0O5HhTUvWIbD3YdjME8JwvSCnNGBnTThKGEB91OZhzrJ4qIIxk/SBmyDDA==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.21.5.tgz", + "integrity": "sha512-zus5sxzqBJD3eXxwvjN1yQkRepANgxE9lgOW2qLnmr8ikMTphkjgXu1HR01K4FJg8h1kEEDAqDcZQtbrRnB41A==", + "cpu": [ + "s390x" + ], + 
"dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.21.5.tgz", + "integrity": "sha512-1rYdTpyv03iycF1+BhzrzQJCdOuAOtaqHTWJZCWvijKD2N5Xu0TtVC8/+1faWqcP9iBCWOmjmhoH94dH82BxPQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.21.5.tgz", + "integrity": "sha512-Woi2MXzXjMULccIwMnLciyZH4nCIMpWQAs049KEeMvOcNADVxo0UBIQPfSmxB3CWKedngg7sWZdLvLczpe0tLg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.21.5.tgz", + "integrity": "sha512-HLNNw99xsvx12lFBUwoT8EVCsSvRNDVxNpjZ7bPn947b8gJPzeHWyNVhFsaerc0n3TsbOINvRP2byTZ5LKezow==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.21.5.tgz", + "integrity": "sha512-6+gjmFpfy0BHU5Tpptkuh8+uw3mnrvgs+dSPQXQOv3ekbordwnzTVEb4qnIvQcYXq6gzkyTnoZ9dZG+D4garKg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.21.5.tgz", + "integrity": "sha512-Z0gOTd75VvXqyq7nsl93zwahcTROgqvuAcYDUr+vOv8uHhNSKROyU961kgtCD1e95IqPKSQKH7tBTslnS3tA8A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.21.5.tgz", + "integrity": "sha512-SWXFF1CL2RVNMaVs+BBClwtfZSvDgtL//G/smwAc5oVK/UPu2Gu9tIaRgFmYFFKrmg3SyAjSrElf0TiJ1v8fYA==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.21.5.tgz", + "integrity": "sha512-tQd/1efJuzPC6rCFwEvLtci/xNFcTZknmXs98FYDfGE4wP9ClFV98nyKrzJKVPMhdDnjzLhdUyMX4PsQAPjwIw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@jridgewell/gen-mapping": { + "version": "0.3.13", + "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.13.tgz", + "integrity": "sha512-2kkt/7niJ6MgEPxF0bYdQ6etZaA+fQvDcLKckhy1yIQOzaoKjBBjSj63/aLVjYE3qhRt5dvM+uUyfCg6UKCBbA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/sourcemap-codec": "^1.5.0", + "@jridgewell/trace-mapping": "^0.3.24" + } + }, + "node_modules/@jridgewell/remapping": { + "version": "2.3.5", + "resolved": 
"https://registry.npmjs.org/@jridgewell/remapping/-/remapping-2.3.5.tgz", + "integrity": "sha512-LI9u/+laYG4Ds1TDKSJW2YPrIlcVYOwi2fUC6xB43lueCjgxV4lffOCZCtYFiH6TNOX+tQKXx97T4IKHbhyHEQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/gen-mapping": "^0.3.5", + "@jridgewell/trace-mapping": "^0.3.24" + } + }, + "node_modules/@jridgewell/resolve-uri": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz", + "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@jridgewell/sourcemap-codec": { + "version": "1.5.5", + "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz", + "integrity": "sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==", + "dev": true, + "license": "MIT" + }, + "node_modules/@jridgewell/trace-mapping": { + "version": "0.3.31", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.31.tgz", + "integrity": "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/resolve-uri": "^3.1.0", + "@jridgewell/sourcemap-codec": "^1.4.14" + } + }, + "node_modules/@rolldown/pluginutils": { + "version": "1.0.0-beta.27", + "resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.27.tgz", + "integrity": "sha512-+d0F4MKMCbeVUJwG96uQ4SgAznZNSq93I3V+9NHA4OpvqG8mRCpGdKmK8l/dl02h2CCDHwW2FqilnTyDcAnqjA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@rollup/rollup-android-arm-eabi": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.53.2.tgz", + "integrity": "sha512-yDPzwsgiFO26RJA4nZo8I+xqzh7sJTZIWQOxn+/XOdPE31lAvLIYCKqjV+lNH/vxE2L2iH3plKxDCRK6i+CwhA==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-android-arm64": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.53.2.tgz", + "integrity": "sha512-k8FontTxIE7b0/OGKeSN5B6j25EuppBcWM33Z19JoVT7UTXFSo3D9CdU39wGTeb29NO3XxpMNauh09B+Ibw+9g==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-darwin-arm64": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.53.2.tgz", + "integrity": "sha512-A6s4gJpomNBtJ2yioj8bflM2oogDwzUiMl2yNJ2v9E7++sHrSrsQ29fOfn5DM/iCzpWcebNYEdXpaK4tr2RhfQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-darwin-x64": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.53.2.tgz", + "integrity": "sha512-e6XqVmXlHrBlG56obu9gDRPW3O3hLxpwHpLsBJvuI8qqnsrtSZ9ERoWUXtPOkY8c78WghyPHZdmPhHLWNdAGEw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-freebsd-arm64": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.53.2.tgz", + 
"integrity": "sha512-v0E9lJW8VsrwPux5Qe5CwmH/CF/2mQs6xU1MF3nmUxmZUCHazCjLgYvToOk+YuuUqLQBio1qkkREhxhc656ViA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-freebsd-x64": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.53.2.tgz", + "integrity": "sha512-ClAmAPx3ZCHtp6ysl4XEhWU69GUB1D+s7G9YjHGhIGCSrsg00nEGRRZHmINYxkdoJehde8VIsDC5t9C0gb6yqA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-linux-arm-gnueabihf": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.53.2.tgz", + "integrity": "sha512-EPlb95nUsz6Dd9Qy13fI5kUPXNSljaG9FiJ4YUGU1O/Q77i5DYFW5KR8g1OzTcdZUqQQ1KdDqsTohdFVwCwjqg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm-musleabihf": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.53.2.tgz", + "integrity": "sha512-BOmnVW+khAUX+YZvNfa0tGTEMVVEerOxN0pDk2E6N6DsEIa2Ctj48FOMfNDdrwinocKaC7YXUZ1pHlKpnkja/Q==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-gnu": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.53.2.tgz", + "integrity": "sha512-Xt2byDZ+6OVNuREgBXr4+CZDJtrVso5woFtpKdGPhpTPHcNG7D8YXeQzpNbFRxzTVqJf7kvPMCub/pcGUWgBjA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-musl": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.53.2.tgz", + "integrity": "sha512-+LdZSldy/I9N8+klim/Y1HsKbJ3BbInHav5qE9Iy77dtHC/pibw1SR/fXlWyAk0ThnpRKoODwnAuSjqxFRDHUQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-loong64-gnu": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-gnu/-/rollup-linux-loong64-gnu-4.53.2.tgz", + "integrity": "sha512-8ms8sjmyc1jWJS6WdNSA23rEfdjWB30LH8Wqj0Cqvv7qSHnvw6kgMMXRdop6hkmGPlyYBdRPkjJnj3KCUHV/uQ==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-ppc64-gnu": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-gnu/-/rollup-linux-ppc64-gnu-4.53.2.tgz", + "integrity": "sha512-3HRQLUQbpBDMmzoxPJYd3W6vrVHOo2cVW8RUo87Xz0JPJcBLBr5kZ1pGcQAhdZgX9VV7NbGNipah1omKKe23/g==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-gnu": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.53.2.tgz", + "integrity": "sha512-fMjKi+ojnmIvhk34gZP94vjogXNNUKMEYs+EDaB/5TG/wUkoeua7p7VCHnE6T2Tx+iaghAqQX8teQzcvrYpaQA==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + 
"node_modules/@rollup/rollup-linux-riscv64-musl": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.53.2.tgz", + "integrity": "sha512-XuGFGU+VwUUV5kLvoAdi0Wz5Xbh2SrjIxCtZj6Wq8MDp4bflb/+ThZsVxokM7n0pcbkEr2h5/pzqzDYI7cCgLQ==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-s390x-gnu": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.53.2.tgz", + "integrity": "sha512-w6yjZF0P+NGzWR3AXWX9zc0DNEGdtvykB03uhonSHMRa+oWA6novflo2WaJr6JZakG2ucsyb+rvhrKac6NIy+w==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-gnu": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.53.2.tgz", + "integrity": "sha512-yo8d6tdfdeBArzC7T/PnHd7OypfI9cbuZzPnzLJIyKYFhAQ8SvlkKtKBMbXDxe1h03Rcr7u++nFS7tqXz87Gtw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-musl": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.53.2.tgz", + "integrity": "sha512-ah59c1YkCxKExPP8O9PwOvs+XRLKwh/mV+3YdKqQ5AMQ0r4M4ZDuOrpWkUaqO7fzAHdINzV9tEVu8vNw48z0lA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-openharmony-arm64": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-openharmony-arm64/-/rollup-openharmony-arm64-4.53.2.tgz", + "integrity": "sha512-4VEd19Wmhr+Zy7hbUsFZ6YXEiP48hE//KPLCSVNY5RMGX2/7HZ+QkN55a3atM1C/BZCGIgqN+xrVgtdak2S9+A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ] + }, + "node_modules/@rollup/rollup-win32-arm64-msvc": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.53.2.tgz", + "integrity": "sha512-IlbHFYc/pQCgew/d5fslcy1KEaYVCJ44G8pajugd8VoOEI8ODhtb/j8XMhLpwHCMB3yk2J07ctup10gpw2nyMA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-ia32-msvc": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.53.2.tgz", + "integrity": "sha512-lNlPEGgdUfSzdCWU176ku/dQRnA7W+Gp8d+cWv73jYrb8uT7HTVVxq62DUYxjbaByuf1Yk0RIIAbDzp+CnOTFg==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-gnu": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-gnu/-/rollup-win32-x64-gnu-4.53.2.tgz", + "integrity": "sha512-S6YojNVrHybQis2lYov1sd+uj7K0Q05NxHcGktuMMdIQ2VixGwAfbJ23NnlvvVV1bdpR2m5MsNBViHJKcA4ADw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-msvc": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.53.2.tgz", + "integrity": 
"sha512-k+/Rkcyx//P6fetPoLMb8pBeqJBNGx81uuf7iljX9++yNBVRDQgD04L+SVXmXmh5ZP4/WOp4mWF0kmi06PW2tA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@types/babel__core": { + "version": "7.20.5", + "resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.20.5.tgz", + "integrity": "sha512-qoQprZvz5wQFJwMDqeseRXWv3rqMvhgpbXFfVyWhbx9X47POIA6i/+dXefEmZKoAgOaTdaIgNSMqMIU61yRyzA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.20.7", + "@babel/types": "^7.20.7", + "@types/babel__generator": "*", + "@types/babel__template": "*", + "@types/babel__traverse": "*" + } + }, + "node_modules/@types/babel__generator": { + "version": "7.27.0", + "resolved": "https://registry.npmjs.org/@types/babel__generator/-/babel__generator-7.27.0.tgz", + "integrity": "sha512-ufFd2Xi92OAVPYsy+P4n7/U7e68fex0+Ee8gSG9KX7eo084CWiQ4sdxktvdl0bOPupXtVJPY19zk6EwWqUQ8lg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/types": "^7.0.0" + } + }, + "node_modules/@types/babel__template": { + "version": "7.4.4", + "resolved": "https://registry.npmjs.org/@types/babel__template/-/babel__template-7.4.4.tgz", + "integrity": "sha512-h/NUaSyG5EyxBIp8YRxo4RMe2/qQgvyowRwVMzhYhBCONbW8PUsg4lkFMrhgZhUe5z3L3MiLDuvyJ/CaPa2A8A==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.1.0", + "@babel/types": "^7.0.0" + } + }, + "node_modules/@types/babel__traverse": { + "version": "7.28.0", + "resolved": "https://registry.npmjs.org/@types/babel__traverse/-/babel__traverse-7.28.0.tgz", + "integrity": "sha512-8PvcXf70gTDZBgt9ptxJ8elBeBjcLOAcOtoO/mPJjtji1+CdGbHgm77om1GrsPxsiE+uXIpNSK64UYaIwQXd4Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/types": "^7.28.2" + } + }, + "node_modules/@types/estree": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", + "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==", + "dev": true, + "license": "MIT" + }, + "node_modules/@vitejs/plugin-react": { + "version": "4.7.0", + "resolved": "https://registry.npmjs.org/@vitejs/plugin-react/-/plugin-react-4.7.0.tgz", + "integrity": "sha512-gUu9hwfWvvEDBBmgtAowQCojwZmJ5mcLn3aufeCsitijs3+f2NsrPtlAWIR6OPiqljl96GVCUbLe0HyqIpVaoA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/core": "^7.28.0", + "@babel/plugin-transform-react-jsx-self": "^7.27.1", + "@babel/plugin-transform-react-jsx-source": "^7.27.1", + "@rolldown/pluginutils": "1.0.0-beta.27", + "@types/babel__core": "^7.20.5", + "react-refresh": "^0.17.0" + }, + "engines": { + "node": "^14.18.0 || >=16.0.0" + }, + "peerDependencies": { + "vite": "^4.2.0 || ^5.0.0 || ^6.0.0 || ^7.0.0" + } + }, + "node_modules/baseline-browser-mapping": { + "version": "2.9.6", + "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.6.tgz", + "integrity": "sha512-v9BVVpOTLB59C9E7aSnmIF8h7qRsFpx+A2nugVMTszEOMcfjlZMsXRm4LF23I3Z9AJxc8ANpIvzbzONoX9VJlg==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "baseline-browser-mapping": "dist/cli.js" + } + }, + "node_modules/browserslist": { + "version": "4.28.1", + "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.28.1.tgz", + "integrity": "sha512-ZC5Bd0LgJXgwGqUknZY/vkUQ04r8NXnJZ3yYi4vDmSiZmC/pdSN0NbNRPxZpbtO4uAfDUAFffO8IZoM3Gj8IkA==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": 
"https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/browserslist" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "baseline-browser-mapping": "^2.9.0", + "caniuse-lite": "^1.0.30001759", + "electron-to-chromium": "^1.5.263", + "node-releases": "^2.0.27", + "update-browserslist-db": "^1.2.0" + }, + "bin": { + "browserslist": "cli.js" + }, + "engines": { + "node": "^6 || ^7 || ^8 || ^9 || ^10 || ^11 || ^12 || >=13.7" + } + }, + "node_modules/caniuse-lite": { + "version": "1.0.30001760", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001760.tgz", + "integrity": "sha512-7AAMPcueWELt1p3mi13HR/LHH0TJLT11cnwDJEs3xA4+CK/PLKeO9Kl1oru24htkyUKtkGCvAx4ohB0Ttry8Dw==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/caniuse-lite" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "CC-BY-4.0" + }, + "node_modules/convert-source-map": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-2.0.0.tgz", + "integrity": "sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==", + "dev": true, + "license": "MIT" + }, + "node_modules/debug": { + "version": "4.4.3", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", + "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==", + "dev": true, + "license": "MIT", + "dependencies": { + "ms": "^2.1.3" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/electron-to-chromium": { + "version": "1.5.267", + "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.267.tgz", + "integrity": "sha512-0Drusm6MVRXSOJpGbaSVgcQsuB4hEkMpHXaVstcPmhu5LIedxs1xNK/nIxmQIU/RPC0+1/o0AVZfBTkTNJOdUw==", + "dev": true, + "license": "ISC" + }, + "node_modules/esbuild": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.21.5.tgz", + "integrity": "sha512-mg3OPMV4hXywwpoDxu3Qda5xCKQi+vCTZq8S9J/EpkhB2HzKXq4SNFZE3+NK93JYxc8VMSep+lOUSC/RVKaBqw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=12" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.21.5", + "@esbuild/android-arm": "0.21.5", + "@esbuild/android-arm64": "0.21.5", + "@esbuild/android-x64": "0.21.5", + "@esbuild/darwin-arm64": "0.21.5", + "@esbuild/darwin-x64": "0.21.5", + "@esbuild/freebsd-arm64": "0.21.5", + "@esbuild/freebsd-x64": "0.21.5", + "@esbuild/linux-arm": "0.21.5", + "@esbuild/linux-arm64": "0.21.5", + "@esbuild/linux-ia32": "0.21.5", + "@esbuild/linux-loong64": "0.21.5", + "@esbuild/linux-mips64el": "0.21.5", + "@esbuild/linux-ppc64": "0.21.5", + "@esbuild/linux-riscv64": "0.21.5", + "@esbuild/linux-s390x": "0.21.5", + "@esbuild/linux-x64": "0.21.5", + "@esbuild/netbsd-x64": "0.21.5", + "@esbuild/openbsd-x64": "0.21.5", + "@esbuild/sunos-x64": "0.21.5", + "@esbuild/win32-arm64": "0.21.5", + "@esbuild/win32-ia32": "0.21.5", + "@esbuild/win32-x64": "0.21.5" + } + }, + "node_modules/escalade": { + "version": "3.2.0", + "resolved": 
"https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", + "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/gensync": { + "version": "1.0.0-beta.2", + "resolved": "https://registry.npmjs.org/gensync/-/gensync-1.0.0-beta.2.tgz", + "integrity": "sha512-3hN7NaskYvMDLQY55gnW3NQ+mesEAepTqlg+VEbj7zzqEMBVNhzcGYYeqFo/TlYz6eQiFcp1HcsCZO+nGgS8zg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/js-tokens": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", + "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==", + "license": "MIT" + }, + "node_modules/jsesc": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-3.1.0.tgz", + "integrity": "sha512-/sM3dO2FOzXjKQhJuo0Q173wf2KOo8t4I8vHy6lF9poUp7bKT0/NHE8fPX23PwfhnykfqnC2xRxOnVw5XuGIaA==", + "dev": true, + "license": "MIT", + "bin": { + "jsesc": "bin/jsesc" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/json5": { + "version": "2.2.3", + "resolved": "https://registry.npmjs.org/json5/-/json5-2.2.3.tgz", + "integrity": "sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==", + "dev": true, + "license": "MIT", + "bin": { + "json5": "lib/cli.js" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/loose-envify": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz", + "integrity": "sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==", + "license": "MIT", + "dependencies": { + "js-tokens": "^3.0.0 || ^4.0.0" + }, + "bin": { + "loose-envify": "cli.js" + } + }, + "node_modules/lru-cache": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", + "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==", + "dev": true, + "license": "ISC", + "dependencies": { + "yallist": "^3.0.2" + } + }, + "node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "dev": true, + "license": "MIT" + }, + "node_modules/nanoid": { + "version": "3.3.11", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz", + "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" + } + }, + "node_modules/node-releases": { + "version": "2.0.27", + "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.27.tgz", + "integrity": 
"sha512-nmh3lCkYZ3grZvqcCH+fjmQ7X+H0OeZgP40OierEaAptX4XofMh5kwNbWh7lBduUzCcV/8kZ+NDLCwm2iorIlA==", + "dev": true, + "license": "MIT" + }, + "node_modules/picocolors": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", + "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", + "dev": true, + "license": "ISC" + }, + "node_modules/postcss": { + "version": "8.5.6", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.6.tgz", + "integrity": "sha512-3Ybi1tAuwAP9s0r1UQ2J4n5Y0G05bJkpUIO0/bI9MhwmD70S5aTWbXGBwxHrelT+XM1k6dM0pk+SwNkpTRN7Pg==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "nanoid": "^3.3.11", + "picocolors": "^1.1.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, + "node_modules/react": { + "version": "18.2.0", + "resolved": "https://registry.npmjs.org/react/-/react-18.2.0.tgz", + "integrity": "sha512-/3IjMdb2L9QbBdWiW5e3P2/npwMBaU9mHCSCUzNln0ZCYbcfTsGbTJrU/kGemdH2IWmB2ioZ+zkxtmq6g09fGQ==", + "license": "MIT", + "dependencies": { + "loose-envify": "^1.1.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/react-dom": { + "version": "18.2.0", + "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.2.0.tgz", + "integrity": "sha512-6IMTriUmvsjHUjNtEDudZfuDQUoWXVxKHhlEGSk81n4YFS+r/Kl99wXiwlVXtPBtJenozv2P+hxDsw9eA7Xo6g==", + "license": "MIT", + "dependencies": { + "loose-envify": "^1.1.0", + "scheduler": "^0.23.0" + }, + "peerDependencies": { + "react": "^18.2.0" + } + }, + "node_modules/react-refresh": { + "version": "0.17.0", + "resolved": "https://registry.npmjs.org/react-refresh/-/react-refresh-0.17.0.tgz", + "integrity": "sha512-z6F7K9bV85EfseRCp2bzrpyQ0Gkw1uLoCel9XBVWPg/TjRj94SkJzUTGfOa4bs7iJvBWtQG0Wq7wnI0syw3EBQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/rollup": { + "version": "4.53.2", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.53.2.tgz", + "integrity": "sha512-MHngMYwGJVi6Fmnk6ISmnk7JAHRNF0UkuucA0CUW3N3a4KnONPEZz+vUanQP/ZC/iY1Qkf3bwPWzyY84wEks1g==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/estree": "1.0.8" + }, + "bin": { + "rollup": "dist/bin/rollup" + }, + "engines": { + "node": ">=18.0.0", + "npm": ">=8.0.0" + }, + "optionalDependencies": { + "@rollup/rollup-android-arm-eabi": "4.53.2", + "@rollup/rollup-android-arm64": "4.53.2", + "@rollup/rollup-darwin-arm64": "4.53.2", + "@rollup/rollup-darwin-x64": "4.53.2", + "@rollup/rollup-freebsd-arm64": "4.53.2", + "@rollup/rollup-freebsd-x64": "4.53.2", + "@rollup/rollup-linux-arm-gnueabihf": "4.53.2", + "@rollup/rollup-linux-arm-musleabihf": "4.53.2", + "@rollup/rollup-linux-arm64-gnu": "4.53.2", + "@rollup/rollup-linux-arm64-musl": "4.53.2", + "@rollup/rollup-linux-loong64-gnu": "4.53.2", + "@rollup/rollup-linux-ppc64-gnu": "4.53.2", + "@rollup/rollup-linux-riscv64-gnu": "4.53.2", + "@rollup/rollup-linux-riscv64-musl": "4.53.2", + "@rollup/rollup-linux-s390x-gnu": "4.53.2", + "@rollup/rollup-linux-x64-gnu": "4.53.2", + "@rollup/rollup-linux-x64-musl": "4.53.2", + "@rollup/rollup-openharmony-arm64": "4.53.2", + "@rollup/rollup-win32-arm64-msvc": "4.53.2", + 
"@rollup/rollup-win32-ia32-msvc": "4.53.2", + "@rollup/rollup-win32-x64-gnu": "4.53.2", + "@rollup/rollup-win32-x64-msvc": "4.53.2", + "fsevents": "~2.3.2" + } + }, + "node_modules/scheduler": { + "version": "0.23.2", + "resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.23.2.tgz", + "integrity": "sha512-UOShsPwz7NrMUqhR6t0hWjFduvOzbtv7toDH1/hIrfRNIDBnnBWd0CwJTGvTpngVlmwGCdP9/Zl/tVrDqcuYzQ==", + "license": "MIT", + "dependencies": { + "loose-envify": "^1.1.0" + } + }, + "node_modules/semver": { + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==", + "dev": true, + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + } + }, + "node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "dev": true, + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/update-browserslist-db": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.2.2.tgz", + "integrity": "sha512-E85pfNzMQ9jpKkA7+TJAi4TJN+tBCuWh5rUcS/sv6cFi+1q9LYDwDI5dpUL0u/73EElyQ8d3TEaeW4sPedBqYA==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/browserslist" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "escalade": "^3.2.0", + "picocolors": "^1.1.1" + }, + "bin": { + "update-browserslist-db": "cli.js" + }, + "peerDependencies": { + "browserslist": ">= 4.21.0" + } + }, + "node_modules/vite": { + "version": "5.4.21", + "resolved": "https://registry.npmjs.org/vite/-/vite-5.4.21.tgz", + "integrity": "sha512-o5a9xKjbtuhY6Bi5S3+HvbRERmouabWbyUcpXXUA1u+GNUKoROi9byOJ8M0nHbHYHkYICiMlqxkg1KkYmm25Sw==", + "dev": true, + "license": "MIT", + "dependencies": { + "esbuild": "^0.21.3", + "postcss": "^8.4.43", + "rollup": "^4.20.0" + }, + "bin": { + "vite": "bin/vite.js" + }, + "engines": { + "node": "^18.0.0 || >=20.0.0" + }, + "funding": { + "url": "https://github.com/vitejs/vite?sponsor=1" + }, + "optionalDependencies": { + "fsevents": "~2.3.3" + }, + "peerDependencies": { + "@types/node": "^18.0.0 || >=20.0.0", + "less": "*", + "lightningcss": "^1.21.0", + "sass": "*", + "sass-embedded": "*", + "stylus": "*", + "sugarss": "*", + "terser": "^5.4.0" + }, + "peerDependenciesMeta": { + "@types/node": { + "optional": true + }, + "less": { + "optional": true + }, + "lightningcss": { + "optional": true + }, + "sass": { + "optional": true + }, + "sass-embedded": { + "optional": true + }, + "stylus": { + "optional": true + }, + "sugarss": { + "optional": true + }, + "terser": { + "optional": true + } + } + }, + "node_modules/yallist": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz", + "integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==", + "dev": true, + "license": "ISC" + } + } +} diff --git a/plexe/ui/frontend/package.json b/plexe/ui/frontend/package.json new file mode 100644 index 00000000..bc64c970 --- /dev/null +++ b/plexe/ui/frontend/package.json @@ -0,0 +1,19 @@ +{ + "name": 
"plexe-frontend", + "version": "0.0.1", + "private": true, + "type": "module", + "scripts": { + "dev": "vite", + "build": "vite build", + "preview": "vite preview --port 3000" + }, + "dependencies": { + "react": "18.2.0", + "react-dom": "18.2.0" + }, + "devDependencies": { + "vite": "^5.0.0", + "@vitejs/plugin-react": "^4.2.0" + } +} \ No newline at end of file diff --git a/plexe/ui/frontend/src/App.jsx b/plexe/ui/frontend/src/App.jsx new file mode 100644 index 00000000..86837c48 --- /dev/null +++ b/plexe/ui/frontend/src/App.jsx @@ -0,0 +1,207 @@ +import React, { useEffect, useState, useRef, useCallback } from 'react' +import Sidebar from './components/Sidebar' +import Chat from './components/Chat' +import Dataset from './components/Dataset' + +export default function App() { + const [wsUrl, setWsUrl] = useState(null) + const [activePage, setActivePage] = useState('chat') + + // Lift WebSocket state to App level to persist across tab switches + const [messages, setMessages] = useState([]) + const [status, setStatus] = useState('disconnected') + const [isProcessing, setIsProcessing] = useState(false) + const [confirmationRequest, setConfirmationRequest] = useState(null) + const wsRef = useRef(null) + const reconnectTimeoutRef = useRef(null) + const pingIntervalRef = useRef(null) + + useEffect(() => { + // Try to get backend URL from environment or build a sensible default + const backendUrl = import.meta.env.VITE_BACKEND_URL || window.location.origin + + // Determine WebSocket protocol based on location protocol + const proto = window.location.protocol === 'https:' ? 'wss:' : 'ws:' + + // Build WebSocket URL + // If backend is same origin, use /ws; otherwise use full URL + let wsUrlFinal = `${proto}//${window.location.host}/ws` + if (backendUrl !== window.location.origin) { + // Backend is on a different host/port + const backendHost = new URL(backendUrl).host + wsUrlFinal = `${proto}//${backendHost}/ws` + } + + setWsUrl(wsUrlFinal) + }, []) + + const connect = useCallback(() => { + if (!wsUrl) return + if (wsRef.current?.readyState === WebSocket.OPEN) return + + console.log('Connecting to WebSocket:', wsUrl) + const ws = new WebSocket(wsUrl) + wsRef.current = ws + + ws.onopen = () => { + console.log('WebSocket connected') + setStatus('connected') + // Clear any pending reconnect + if (reconnectTimeoutRef.current) { + clearTimeout(reconnectTimeoutRef.current) + reconnectTimeoutRef.current = null + } + // Start ping interval to keep connection alive + pingIntervalRef.current = setInterval(() => { + if (ws.readyState === WebSocket.OPEN) { + ws.send(JSON.stringify({ type: 'ping' })) + } + }, 30000) // Ping every 30 seconds + } + + ws.onclose = (event) => { + console.log('WebSocket closed:', event.code, event.reason) + setStatus('disconnected') + // Don't reset isProcessing here - the backend might still be processing + // Clear ping interval + if (pingIntervalRef.current) { + clearInterval(pingIntervalRef.current) + pingIntervalRef.current = null + } + // Auto-reconnect after 3 seconds (unless it was a clean close) + if (event.code !== 1000) { + reconnectTimeoutRef.current = setTimeout(() => { + console.log('Attempting to reconnect...') + connect() + }, 3000) + } + } + + ws.onerror = (error) => { + console.error('WebSocket error:', error) + setStatus('error') + } + + ws.onmessage = (ev) => { + try { + const data = JSON.parse(ev.data) + + // Handle different message types + if (data.type === 'pong') { + // Ignore pong responses + return + } + + if (data.type === 'confirmation_request') { + // 
Show confirmation dialog + setConfirmationRequest(data) + return + } + + if (data.type === 'thinking' || data.role === 'thinking') { + // Thinking message - agent is still processing + setMessages((m) => [...m, data]) + } else if (data.role === 'assistant') { + // Final response - agent finished processing + setIsProcessing(false) + setMessages((m) => [...m, data]) + } else { + setMessages((m) => [...m, data]) + } + } catch (e) { + console.error('invalid ws message', e) + } + } + }, [wsUrl]) + + // Connect when wsUrl is available + useEffect(() => { + if (wsUrl) { + connect() + } + + return () => { + if (reconnectTimeoutRef.current) { + clearTimeout(reconnectTimeoutRef.current) + } + if (pingIntervalRef.current) { + clearInterval(pingIntervalRef.current) + } + // Don't close WebSocket on cleanup - we want it to persist + } + }, [wsUrl, connect]) + + const sendMessage = useCallback((content) => { + if (!content.trim() || isProcessing) return false + if (wsRef.current?.readyState !== WebSocket.OPEN) { + console.error('WebSocket not connected') + return false + } + + const msg = { role: 'user', content } + setMessages((m) => [...m, msg]) + setIsProcessing(true) + wsRef.current.send(JSON.stringify({ content })) + return true + }, [isProcessing]) + + const sendConfirmationResponse = useCallback((requestId, confirmed) => { + if (wsRef.current?.readyState !== WebSocket.OPEN) { + console.error('WebSocket not connected') + return + } + + // Send confirmation response + wsRef.current.send(JSON.stringify({ + type: 'confirmation_response', + id: requestId, + confirmed: confirmed + })) + + // Add a message showing the user's decision + setMessages((m) => [...m, { + role: 'user', + content: confirmed ? '✓ Confirmed' : '✗ Rejected' + }]) + + // Clear the confirmation request + setConfirmationRequest(null) + }, []) + + const stopProcessing = useCallback(() => { + if (wsRef.current?.readyState !== WebSocket.OPEN) { + console.error('WebSocket not connected') + return + } + + wsRef.current.send(JSON.stringify({ type: 'stop' })) + setIsProcessing(false) + setMessages((m) => [...m, { + role: 'assistant', + content: 'Stopped by user' + }]) + }, []) + + return ( +
+ <div className="app-root">
+ <Sidebar activePage={activePage} setActivePage={setActivePage} />
+ <div className="app-main">
+ {/* Use CSS to show/hide instead of conditional rendering to preserve state */}
+ <div style={{ display: activePage === 'chat' ? 'block' : 'none' }}>
+ <Chat messages={messages} status={status} isProcessing={isProcessing}
+ onSendMessage={sendMessage} onStopProcessing={stopProcessing}
+ confirmationRequest={confirmationRequest} onConfirmationResponse={sendConfirmationResponse} />
+ </div>
+ <div style={{ display: activePage === 'dataset' ? 'block' : 'none' }}>
+ <Dataset />
+ </div>
+ </div>
+ </div>
+ ) +} diff --git a/plexe/ui/frontend/src/api/client.js b/plexe/ui/frontend/src/api/client.js new file mode 100644 index 00000000..fa908ee9 --- /dev/null +++ b/plexe/ui/frontend/src/api/client.js @@ -0,0 +1,156 @@ +/** + * API Client for Plexe Frontend + * Handles all HTTP requests to the backend + */ + +const API_BASE_URL = import.meta.env.VITE_BACKEND_URL || 'http://localhost:8000' + +/** + * Upload file(s) to the backend + * @param {File[]} files - Array of files to upload + * @returns {Promise} Upload response + */ +export async function uploadFiles(files) { + const formData = new FormData() + + // Add all files to FormData + files.forEach((file) => { + formData.append('files', file) + }) + + const response = await fetch(`${API_BASE_URL}/api/upload`, { + method: 'POST', + body: formData, + headers: { + // Don't set Content-Type, let browser set it with boundary + }, + }) + + if (!response.ok) { + const errorData = await response.json().catch(() => ({})) + throw new Error(errorData.detail || `Upload failed: ${response.statusText}`) + } + + return await response.json() +} + +/** + * Test PostgreSQL connection + * @param {Object} connectionConfig - PostgreSQL connection config + * @returns {Promise} Test result + */ +export async function testPostgresConnection(connectionConfig) { + const response = await fetch(`${API_BASE_URL}/api/postgres/test`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify(connectionConfig), + }) + + if (!response.ok) { + const errorData = await response.json().catch(() => ({})) + throw new Error(errorData.detail || 'Connection test failed') + } + + return await response.json() +} + +/** + * Execute a query on a PostgreSQL database + * @param {Object} connectionConfig - PostgreSQL connection config + * @returns {Promise} Query result + */ +export async function executePostgresQuery(connectionConfig) { + const response = await fetch(`${API_BASE_URL}/api/postgres/execute`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify(connectionConfig), + }) + + if (!response.ok) { + const errorData = await response.json().catch(() => ({})) + throw new Error(errorData.detail || 'Failed to execute query') + } + + return await response.json() +} + +/** + * Get list of uploaded datasets + * @returns {Promise} List of datasets + */ +export async function listDatasets() { + const response = await fetch(`${API_BASE_URL}/api/datasets`, { + method: 'GET', + }) + + if (!response.ok) { + const errorData = await response.json().catch(() => ({})) + throw new Error(errorData.detail || 'Failed to fetch datasets') + } + + return await response.json() +} + +/** + * Delete a dataset + * @param {string} datasetId - ID of dataset to delete + * @returns {Promise} Delete result + */ +export async function deleteDataset(datasetId) { + const response = await fetch(`${API_BASE_URL}/api/datasets/${datasetId}`, { + method: 'DELETE', + }) + + if (!response.ok) { + const errorData = await response.json().catch(() => ({})) + throw new Error(errorData.detail || 'Failed to delete dataset') + } + + return await response.json() +} + +/** + * Download a dataset + * @param {string} datasetId - ID of dataset to download + * @returns {Promise} Downloaded file as a Blob + */ +export async function downloadDataset(datasetId) { + const response = await fetch(`${API_BASE_URL}/api/datasets/${datasetId}/download`, { + method: 'GET', + }) + + if (!response.ok) { + const errorData = await response.json().catch(() => ({})) + 
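+ // The error body may not be JSON; the .catch(() => ({})) above falls back to an empty
+ // object so the generic 'Failed to download dataset' message below is used instead.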
throw new Error(errorData.detail || 'Failed to download dataset') + } + + return await response.blob() +} + +/** + * Combine datasets using featuretools + * @param {Array} tables - List of tables + * @param {Array} relationships - List of relationships + * @param {Object} connection - PostgreSQL connection config + * @returns {Promise} Combination result + */ +export async function combineDatasets(tables, relationships, connection) { + const response = await fetch(`${API_BASE_URL}/api/datasets/combine`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ tables, relationships, connection }), + }) + + if (!response.ok) { + const errorData = await response.json().catch(() => ({})) + throw new Error(errorData.detail || 'Failed to combine datasets') + } + + return await response.json() +} diff --git a/plexe/ui/frontend/src/api/index.js b/plexe/ui/frontend/src/api/index.js new file mode 100644 index 00000000..7dd2aac4 --- /dev/null +++ b/plexe/ui/frontend/src/api/index.js @@ -0,0 +1,4 @@ +/** + * Export all API functions from client + */ +export * from './client.js' diff --git a/plexe/ui/frontend/src/components/Chat.jsx b/plexe/ui/frontend/src/components/Chat.jsx new file mode 100644 index 00000000..e1dbc870 --- /dev/null +++ b/plexe/ui/frontend/src/components/Chat.jsx @@ -0,0 +1,548 @@ +import React, { useEffect, useRef, useState } from 'react' + +// Icons for different event types +const EventIcon = ({ eventType }) => { + const icons = { + agent_start: '🚀', + thinking: '💭', + tool_call: '🔧', + tool_result: '✅', + agent_end: '🎯' + } + return {icons[eventType] || '•'} +} + +// Specific color mapping per agent for consistent identity visuals +const getAgentTheme = (name = '') => { + const lowerName = name.toLowerCase() + + // ErrorHandler agent - red theme to indicate error state + if (lowerName.includes('errorhandler') || lowerName.includes('error_handler')) { + return { + accent: '#dc2626', + tint: 'rgba(220, 38, 38, 0.08)', + wash: 'rgba(220, 38, 38, 0.04)', + border: 'rgba(220, 38, 38, 0.3)' + } + } + + // Specific colors for known agents + if (lowerName.includes('orchestrator')) { + return { + accent: '#2563eb', + tint: 'rgba(37, 99, 235, 0.08)', + wash: 'rgba(37, 99, 235, 0.04)', + border: 'rgba(37, 99, 235, 0.3)' + } + } + + if (lowerName.includes('conversational')) { + return { + accent: '#10b981', + tint: 'rgba(16, 185, 129, 0.08)', + wash: 'rgba(16, 185, 129, 0.04)', + border: 'rgba(16, 185, 129, 0.3)' + } + } + + if (lowerName.includes('eda')) { + return { + accent: '#7c3aed', + tint: 'rgba(124, 58, 237, 0.08)', + wash: 'rgba(124, 58, 237, 0.04)', + border: 'rgba(124, 58, 237, 0.3)' + } + } + + if (lowerName.includes('taskbuilder') || lowerName.includes('task_builder')) { + return { + accent: '#2c3e50', + tint: 'rgba(44, 62, 80, 0.08)', + wash: 'rgba(44, 62, 80, 0.04)', + border: 'rgba(44, 62, 80, 0.3)' + } + } + + // Fallback to deterministic color for other agents + let hash = 0 + for (let i = 0; i < name.length; i += 1) { + hash = name.charCodeAt(i) + ((hash << 5) - hash) + } + const hue = Math.abs(hash) % 360 + return { + accent: `hsl(${hue}, 70%, 50%)`, + tint: `hsla(${hue}, 70%, 50%, 0.08)`, + wash: `hsla(${hue}, 70%, 50%, 0.04)`, + border: `hsla(${hue}, 70%, 50%, 0.3)` + } +} + +// Group consecutive messages from the same agent +const groupMessagesByAgent = (messages) => { + const groups = [] + let currentGroup = null + + messages.forEach((msg) => { + if (msg.role === 'thinking') { + const agentName = msg.agent_name || 'Agent' + 
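+ // Illustrative example of the grouping: three consecutive 'thinking' events from
+ // "Orchestrator" (steps 1-3) collapse into a single group with startStep 1 and endStep 3,
+ // and a following 'assistant' message closes that group and is appended as { type: 'message' }.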
+ // Start a new group if agent changes or no current group + if (!currentGroup || currentGroup.agent !== agentName) { + if (currentGroup) { + groups.push(currentGroup) + } + currentGroup = { + agent: agentName, + steps: [], + startStep: msg.step_number, + endStep: msg.step_number + } + } + + // Add step to current group + currentGroup.steps.push(msg) + currentGroup.endStep = msg.step_number + } else { + // Non-thinking messages close the current group + if (currentGroup) { + groups.push(currentGroup) + currentGroup = null + } + groups.push({ type: 'message', message: msg }) + } + }) + + // Add the last group if exists + if (currentGroup) { + groups.push(currentGroup) + } + + return groups +} + +// Render a single event within an agent group +const EventItem = ({ step }) => { + const [isExpanded, setIsExpanded] = useState(false) + const eventType = step.event_type || 'thinking' + + // Smarter error detection: only mark as error if it's actually an error response + // Check for status: "error" in JSON or explicit error event types + const hasError = (() => { + if (!step.message) return false + + // Check if this is a tool result with error status + if (step.event_type === 'tool_result') { + try { + // Try to extract and parse JSON from the message + // Handle both raw JSON and "content='...'" format + let jsonStr = step.message + const contentMatch = step.message.match(/content='({.*})'/) + if (contentMatch) { + jsonStr = contentMatch[1] + } + + // Parse the JSON and check status field + const parsed = JSON.parse(jsonStr) + if (parsed.status === 'error') { + return true + } + // If status is "success" or any other value, it's not an error + if (parsed.status) { + return false + } + } catch (e) { + // Not valid JSON, fall through to other checks + } + } + + // Check if message starts with explicit error indicators + if (step.message.match(/^(Error:|ERROR:|Failed:|FAILED:|Exception:)/i)) { + return true + } + + // Don't mark as error just because it contains the word "error" in normal text + return false + })() + + // Check if this is a tool result with content + const isToolResult = step.message && step.message.startsWith('Tool result:') + // Check if this is a reasoning/thinking message + const isReasoning = step.message && ( + step.message.startsWith('💭') || + step.message.startsWith('💡') || + step.message.includes('Reasoning') || + step.message.includes('Analysis') + ) + const MAX_LINES = 5 + + const renderMessage = () => { + if (!step.message) return null + + // For tool results, show first 5 lines with expand option + if (isToolResult) { + const content = step.message.replace(/^Tool result:\n?/, '') + const lines = content.split('\n') + const hasMoreLines = lines.length > MAX_LINES + const displayLines = isExpanded ? lines : lines.slice(0, MAX_LINES) + + return ( +
+
{displayLines.join('\n')}
+ {hasMoreLines && ( + + )} +
+ ) + } + + // For reasoning messages, format nicely with expand option + if (isReasoning) { + const lines = step.message.split('\n') + const hasMoreLines = lines.length > MAX_LINES + 2 + const displayLines = isExpanded ? lines : lines.slice(0, MAX_LINES + 2) + + return ( +
+
{displayLines.join('\n')}
+ {hasMoreLines && ( + + )} +
+ ) + } + + if (hasError) { + // For actual errors, render with prominent styling and bold text + return ( +
+
⚠️
+
+ {step.message} +
+
+ ) + } + + return step.message + } + + // Determine the icon based on message content + const getEventIcon = () => { + if (step.message) { + if (step.message.startsWith('💭')) return '💭' + if (step.message.startsWith('💡')) return '💡' + } + const icons = { + agent_start: '🚀', + thinking: '💭', + tool_call: '🔧', + tool_result: '✅', + agent_end: '🎯' + } + return icons[eventType] || '•' + } + + // Determine the label based on message content + const getEventLabel = () => { + if (step.message) { + // Extract model info if present (e.g., "[openai/gpt-4o]") + const modelMatch = step.message.match(/\[([\w/-]+)\]/) + const modelInfo = modelMatch ? ` (${modelMatch[1]})` : '' + + if (step.message.startsWith('💭 Reasoning')) return `Reasoning${modelInfo}` + if (step.message.startsWith('💡 Analysis')) return `Analysis${modelInfo}` + } + + if (eventType === 'agent_start') return 'Starting' + if (eventType === 'thinking') return 'Reasoning' + if (eventType === 'tool_call') return `Tool: ${step.tool_name || 'Unknown'}` + if (eventType === 'tool_result') return 'Result' + if (eventType === 'agent_end') return 'Completed' + return eventType + } + + return ( +
+
+ {getEventIcon()} + {getEventLabel()} + {step.timestamp && ( + {step.timestamp} + )} +
+
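+ {/* renderMessage() picks tool-result, reasoning, error-box, or plain rendering based on step.message */}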
+ {renderMessage()} +
+
+ ) +} + +// Render an agent group (multiple steps from same agent) +const AgentGroup = ({ group }) => { + const theme = getAgentTheme(group.agent) + + // Check for actual errors: status:"error" in JSON content or explicit error messages + const hasError = group.steps.some(s => { + if (!s.message) return false + + // Check for JSON with status: "error" + try { + // Handle content='...' format + const contentMatch = s.message.match(/content='({.*})'/) + const jsonStr = contentMatch ? contentMatch[1] : s.message + const parsed = JSON.parse(jsonStr) + if (parsed.status === 'error') { + return true + } + } catch (e) { + // Not valid JSON, continue to other checks + } + + // Check for explicit error message prefixes + if (s.message.match(/^(Error:|ERROR:|Failed:|FAILED:|Exception:)/i)) { + return true + } + + return false + }) + + // Extract model_id from agent_start step if available + const modelId = group.steps.find(s => s.event_type === 'agent_start')?.model_id || '' + + // Calculate total token usage for this agent + const totalTokens = group.steps.reduce((sum, step) => { + if (step.token_usage && step.token_usage.total_tokens) { + return sum + step.token_usage.total_tokens + } + return sum + }, 0) + + const stepRange = group.startStep === group.endStep + ? `Step ${group.startStep}` + : `Steps ${group.startStep}-${group.endStep}` + + return ( +
+
+
+
+ + {group.agent} + + {modelId && ( + + {modelId.split('/').pop()} + + )} + {totalTokens > 0 && ( + + 🪙 {totalTokens.toLocaleString()} + + )} +
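+ {/* Cumulative token usage, summed from each step's token_usage.total_tokens */}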
+ + {stepRange} + +
+
+ {group.steps.map((step, idx) => ( + + ))} +
+
+
+ ) +} + +function Message({ msg }) { + const isUser = msg.role === 'user' + const isError = msg.role === 'error' + + return ( +
+
{msg.content}
+
+ ) +} + +function ThinkingIndicator() { + return ( +
+
+
+ + + +
+
+
+ ) +} + +function ConfirmationDialog({ request, onConfirm, onReject }) { + const [isExpanded, setIsExpanded] = useState(false) + + const renderContent = () => { + const { content, content_type } = request + + // Truncate content if too long and not expanded + const maxPreviewLength = 500 + const shouldTruncate = content.length > maxPreviewLength && !isExpanded + const displayContent = shouldTruncate + ? content.substring(0, maxPreviewLength) + '...' + : content + + if (content_type === 'code') { + return ( +
+                    {displayContent}
+                
+ ) + } else if (content_type === 'json') { + try { + const parsed = JSON.parse(content) + return ( +
+                        {isExpanded ? JSON.stringify(parsed, null, 2) : JSON.stringify(parsed, null, 2).substring(0, maxPreviewLength) + (JSON.stringify(parsed, null, 2).length > maxPreviewLength ? '...' : '')}
+                    
+ ) + } catch { + return
{displayContent}
+ } + } else if (content_type === 'markdown') { + // Simple markdown rendering (just preserve formatting) + return
{displayContent}
+ } + return
{displayContent}
+ } + + return ( +
+
+
+

{request.title}

+
+
+ {renderContent()} + {request.content.length > 500 && ( + + )} +
+
+ + +
+
+
+ ) +} + +export default function Chat({ messages, status, isProcessing, onSendMessage, onStopProcessing, confirmationRequest, onConfirmationResponse }) { + const [input, setInput] = useState('') + const messagesEndRef = useRef(null) + + // Auto-scroll to bottom when new messages arrive + useEffect(() => { + messagesEndRef.current?.scrollIntoView({ behavior: 'smooth' }) + }, [messages, isProcessing]) + + const send = () => { + if (onSendMessage(input)) { + setInput('') + } + } + + const getStatusClass = () => { + if (isProcessing) return 'processing' + return status + } + + const getStatusText = () => { + if (isProcessing) return 'Processing...' + if (status === 'connected') return 'Connected' + if (status === 'disconnected') return 'Disconnected - Reconnecting...' + if (status === 'error') return 'Connection Error' + return status + } + + // Group messages by agent + const messageGroups = groupMessagesByAgent(messages) + + return ( +
+ {confirmationRequest && ( + onConfirmationResponse(confirmationRequest.id, true)} + onReject={() => onConfirmationResponse(confirmationRequest.id, false)} + /> + )} +
{getStatusText()}
+
+ {messageGroups.map((group, i) => {
+ if (group.type === 'message') {
+ return <Message key={i} msg={group.message} />
+ } else {
+ return <AgentGroup key={i} group={group} />
+ }
+ })}
+ {isProcessing && <ThinkingIndicator />}
+
+
+
+ setInput(e.target.value)} + onKeyPress={(e) => { + if (e.key === 'Enter') { + send() + } + }} + placeholder="Type your message..." + disabled={isProcessing || status !== 'connected'} + /> + {isProcessing ? ( + + ) : ( + + )} +
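+ {/* While a request is in flight, the send control is replaced by a stop control wired to onStopProcessing */}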
+
+ ) +} diff --git a/plexe/ui/frontend/src/components/Dataset.jsx b/plexe/ui/frontend/src/components/Dataset.jsx new file mode 100644 index 00000000..b3159e75 --- /dev/null +++ b/plexe/ui/frontend/src/components/Dataset.jsx @@ -0,0 +1,51 @@ +import React, { useState } from 'react' +import UploadTab from './dataset/UploadTab' +import PostgreSQLTab from './dataset/PostgreSQLTab' +import OverviewTab from './dataset/OverviewTab' + +export default function Dataset() { + const [activeTab, setActiveTab] = useState('upload') + + return ( +
+
+

Dataset Management

+

Upload or connect to your data sources

+
+ +
+
+ + + + +
+ +
+
+ {activeTab === 'upload' && <UploadTab />}
+ {activeTab === 'postgres' && <PostgreSQLTab />}
+ {activeTab === 'overview' && <OverviewTab />}
+
+
+
+ ) +} diff --git a/plexe/ui/frontend/src/components/Sidebar.jsx b/plexe/ui/frontend/src/components/Sidebar.jsx new file mode 100644 index 00000000..25ee937f --- /dev/null +++ b/plexe/ui/frontend/src/components/Sidebar.jsx @@ -0,0 +1,34 @@ +import React from 'react' + +export default function Sidebar({ activePage, setActivePage }) { + return ( + + ) +} diff --git a/plexe/ui/frontend/src/components/dataset/OverviewTab.jsx b/plexe/ui/frontend/src/components/dataset/OverviewTab.jsx new file mode 100644 index 00000000..2882e76e --- /dev/null +++ b/plexe/ui/frontend/src/components/dataset/OverviewTab.jsx @@ -0,0 +1,85 @@ +import React, { useState, useEffect } from 'react'; +import { listDatasets, deleteDataset, downloadDataset } from '../../api/client'; // Corrected import + +export default function OverviewTab() { + const [datasets, setDatasets] = useState([]); + const [loading, setLoading] = useState(true); + const [error, setError] = useState(null); + + useEffect(() => { + fetchDatasets(); + }, []); + + const fetchDatasets = async () => { + try { + setLoading(true); + const response = await listDatasets(); // Changed to listDatasets + setDatasets(response.datasets); // Assuming the API returns data in a 'datasets' field + } catch (err) { + setError('Failed to fetch datasets.'); + console.error('Error fetching datasets:', err); + } finally { + setLoading(false); + } + }; + + const handleDelete = async (datasetId) => { + if (window.confirm('Are you sure you want to delete this dataset?')) { + try { + await deleteDataset(datasetId); + fetchDatasets(); // Refresh the list + } catch (err) { + setError('Failed to delete dataset.'); + console.error('Error deleting dataset:', err); + } + } + }; + + const handleDownload = async (datasetId, datasetName) => { + try { + const response = await downloadDataset(datasetId); // downloadDataset returns a blob + const url = window.URL.createObjectURL(response); // Create URL from blob + const link = document.createElement('a'); + link.href = url; + link.setAttribute('download', datasetName); // Or use a name from the response + document.body.appendChild(link); + link.click(); + link.remove(); + window.URL.revokeObjectURL(url); // Clean up the URL object + } catch (err) { + setError('Failed to download dataset.'); + console.error('Error downloading dataset:', err); + } + }; + + if (loading) { + return
Loading datasets...
; + } + + if (error) { + return
{error}
; + } + + return ( +
+

Available Datasets

+ {datasets.length === 0 ? ( +

No datasets available yet. Upload or connect one!

+ ) : ( +
+ {datasets.map((dataset) => ( +
+

{dataset.filename}

{/* Display filename */} +

Created: {new Date(dataset.created_at * 1000).toLocaleDateString()}

{/* Convert timestamp to date */} +

Size: {(dataset.size / 1024).toFixed(2)} KB

{/* Convert bytes to KB */} +
+ + +
+
+ ))} +
+ )} +
+ ); +} diff --git a/plexe/ui/frontend/src/components/dataset/PostgreSQLTab.jsx b/plexe/ui/frontend/src/components/dataset/PostgreSQLTab.jsx new file mode 100644 index 00000000..7a50146f --- /dev/null +++ b/plexe/ui/frontend/src/components/dataset/PostgreSQLTab.jsx @@ -0,0 +1,231 @@ +import React, { useState } from 'react' +import { + testPostgresConnection, + executePostgresQuery, + combineDatasets, +} from '../../api/client' +import QueryResult from './QueryResult' + +export default function PostgreSQLTab() { + const [connectionForm, setConnectionForm] = useState({ + host: 'localhost', + port: '5432', + username: '', + password: '', + database: '', + }) + const [connecting, setConnecting] = useState(false) + const [combining, setCombining] = useState(false) + const [connectionStatus, setConnectionStatus] = useState(null) + const [combineStatus, setCombineStatus] = useState(null) + const [queryResult, setQueryResult] = useState(null) + const [showCombine, setShowCombine] = useState(false) + + const handleInputChange = (e) => { + const { name, value } = e.target + setConnectionForm((prev) => ({ + ...prev, + [name]: value, + })) + } + + const handleTestConnection = async () => { + setConnecting(true) + setConnectionStatus(null) + setQueryResult(null) + setShowCombine(false) + setCombineStatus(null) + + try { + await testPostgresConnection(connectionForm) + setConnectionStatus({ + type: 'success', + message: 'Connection successful!', + }) + } catch (error) { + console.error('Connection error:', error) + setConnectionStatus({ + type: 'error', + message: error.message, + }) + } finally { + setConnecting(false) + } + } + + const handleExecute = async () => { + setConnecting(true) + setQueryResult(null) + setConnectionStatus(null) + setShowCombine(false) + setCombineStatus(null) + + try { + const data = await executePostgresQuery(connectionForm) + setQueryResult(data) + setConnectionStatus({ + type: 'success', + message: 'Query executed successfully!', + }) + setShowCombine(true) + } catch (error) { + console.error('Execute error:', error) + setConnectionStatus({ + type: 'error', + message: error.message, + }) + } finally { + setConnecting(false) + } + } + + const handleCombine = async () => { + setCombining(true) + setCombineStatus(null) + try { + await combineDatasets( + queryResult.tables, + queryResult.relationships, + connectionForm + ) + setCombineStatus({ + type: 'success', + message: 'Dataset combination started successfully!', + }) + } catch (error) { + console.error('Combine error:', error) + setCombineStatus({ + type: 'error', + message: error.message, + }) + } finally { + setCombining(false) + } + } + + return ( +
+
+

PostgreSQL Connection

+

+ Enter your database credentials to fetch schema information. +

+ +
+
+ + +
+ +
+ + +
+
+ +
+ + +
+ +
+ + +
+ +
+ + +
+ +
+ + +
+ + {connectionStatus && ( +
+ {connectionStatus.message} +
+ )} +
+ +
+ {queryResult && ( + + )} + {showCombine && ( +
+

+ Do you want to combine these tables into a final dataset? +

+ + {combineStatus && ( +
+ {combineStatus.message} +
+ )} +
+ )} +
+
+ ) +} diff --git a/plexe/ui/frontend/src/components/dataset/QueryResult.jsx b/plexe/ui/frontend/src/components/dataset/QueryResult.jsx new file mode 100644 index 00000000..03ff14a1 --- /dev/null +++ b/plexe/ui/frontend/src/components/dataset/QueryResult.jsx @@ -0,0 +1,43 @@ +import React from 'react'; + +export default function QueryResult({ tables, relationships }) { + if (!tables.length && !relationships.length) { + return null; + } + + return ( +
+ {tables.length > 0 && ( +
+

Available Tables

+
    + {tables.map((table) => ( +
  • + 📄 + {table} +
  • + ))} +
+
+ )} + + {relationships.length > 0 && ( +
+

Table Relationships

+
    + {relationships.map((rel, i) => ( +
  • + 🔗 + {rel.table_name} + ({rel.column_name}) + + {rel.foreign_table_name} + ({rel.foreign_column_name}) +
  • + ))} +
+
+ )} +
+ ); +} diff --git a/plexe/ui/frontend/src/components/dataset/UploadTab.jsx b/plexe/ui/frontend/src/components/dataset/UploadTab.jsx new file mode 100644 index 00000000..74d5d6f1 --- /dev/null +++ b/plexe/ui/frontend/src/components/dataset/UploadTab.jsx @@ -0,0 +1,118 @@ +import React, { useRef, useState } from 'react' +import { uploadFiles } from '../../api/client' + +export default function UploadTab() { + const [files, setFiles] = useState([]) + const [uploading, setUploading] = useState(false) + const [uploadError, setUploadError] = useState(null) + const [uploadSuccess, setUploadSuccess] = useState(false) + const fileInputRef = useRef(null) + + const handleFileChange = (e) => { + const selectedFiles = Array.from(e.target.files) + setFiles((prevFiles) => [...prevFiles, ...selectedFiles]) + } + + const handleUpload = async () => { + if (files.length === 0) { + setUploadError('Please select files to upload') + return + } + + setUploading(true) + setUploadError(null) + setUploadSuccess(false) + + try { + const result = await uploadFiles(files) + setUploadSuccess(true) + setFiles([]) + setTimeout(() => setUploadSuccess(false), 3000) + } catch (error) { + console.error('Upload error:', error) + setUploadError(error.message || 'Upload failed') + } finally { + setUploading(false) + } + } + + const removeFile = (index) => { + setFiles((prevFiles) => prevFiles.filter((_, i) => i !== index)) + } + + return ( +
+
+
fileInputRef.current?.click()} + onDragOver={(e) => { + e.preventDefault() + e.currentTarget.style.borderColor = '#2563eb' + }} + onDragLeave={(e) => { + e.currentTarget.style.borderColor = '#ddd' + }} + onDrop={(e) => { + e.preventDefault() + e.currentTarget.style.borderColor = '#ddd' + const droppedFiles = Array.from(e.dataTransfer.files) + setFiles((prevFiles) => [...prevFiles, ...droppedFiles]) + }} + > + +
📁
+

Drag and drop files here or click to browse

+

Supported formats: CSV, XLSX, JSON, Parquet

+
+
+ + {uploadError && ( +
+ ✕ {uploadError} +
+ )} + + {uploadSuccess && ( +
+ ✓ Files uploaded successfully! +
+ )} + + {files.length > 0 && ( +
+

Selected Files ({files.length})

+
    + {files.map((file, index) => ( +
  • + 📄 {file.name} + ({(file.size / 1024).toFixed(2)} KB) + +
  • + ))} +
+ + +
+ )} +
+ ) +} diff --git a/plexe/ui/frontend/src/main.jsx b/plexe/ui/frontend/src/main.jsx new file mode 100644 index 00000000..fb43f741 --- /dev/null +++ b/plexe/ui/frontend/src/main.jsx @@ -0,0 +1,10 @@ +import React from 'react' +import { createRoot } from 'react-dom/client' +import App from './App' +import './styles.css' + +createRoot(document.getElementById('root')).render( + + + +) diff --git a/plexe/ui/frontend/src/styles.css b/plexe/ui/frontend/src/styles.css new file mode 100644 index 00000000..8f0134be --- /dev/null +++ b/plexe/ui/frontend/src/styles.css @@ -0,0 +1,1474 @@ +:root { + --bg: #f6f7fb; + --card: #ffffff; + --muted: #6b7280; + --primary: #2563eb; + --primary-light: #3b82f6; + --primary-hover: #1d4ed8; + --success: #10b981; + --error: #ef4444; + --border: #e5e7eb; + --text: #0f172a; + --text-secondary: #6b7280; + --shadow: 0 12px 30px rgba(15, 23, 42, 0.08); +} + +* { + box-sizing: border-box; +} + +body { + margin: 0; + font-family: + Inter, + ui-sans-serif, + system-ui, + -apple-system, + BlinkMacSystemFont, + "Segoe UI", + Roboto, + Helvetica, + Arial, + sans-serif; + background: var(--bg); +} + +html, +body, +#root { + height: 100%; + width: 100%; +} + +/* ===== App Layout ===== */ +.app-root { + display: flex; + height: 100vh; + width: 100%; +} + +.app-main { + flex: 1; + display: flex; + flex-direction: column; + background: var(--bg); + overflow-y: auto; +} + +/* ===== Sidebar ===== */ +.sidebar { + width: 260px; + background: var(--card); + border-right: 1px solid var(--border); + display: flex; + flex-direction: column; + box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1); +} + +.sidebar-header { + padding: 1.5rem 1.25rem; + border-bottom: 1px solid var(--border); +} + +.sidebar-header h2 { + font-weight: 700; + color: var(--primary); +} + +.sidebar-header p { + margin: 0.25rem 0 0; + font-size: 0.875rem; + color: var(--text-secondary); +} + +.sidebar-nav { + flex: 1; + padding: 1rem 0.75rem; + display: flex; + flex-direction: column; + gap: 0.5rem; +} + +.nav-item { + display: flex; + align-items: center; + gap: 0.75rem; + padding: 0.75rem 1rem; + background: transparent; + border: none; + border-radius: 8px; + cursor: pointer; + color: var(--text-secondary); + font-size: 1rem; + transition: all 0.2s ease; + text-align: left; +} + +.nav-item:hover { + background: #f3f4f6; + color: var(--text); +} + +.nav-item.active { + background: var(--primary-light); + color: white; + font-weight: 500; +} + +.nav-item .icon { + font-size: 1.25rem; + min-width: 1.5rem; +} + +.nav-item .label { + flex: 1; +} + +.sidebar-footer { + padding: 1rem 1.25rem; + border-top: 1px solid var(--border); + text-align: center; +} + +.sidebar-footer .version { + margin: 0; + font-size: 0.75rem; + color: var(--text-secondary); +} + +/* ===== Chat ===== */ +.chat-root { + display: flex; + flex-direction: column; + height: calc(100vh - 2rem); + background: var(--card); + border-radius: 16px; + padding: 0; + margin: 1rem 1rem 1rem 1.5rem; + box-shadow: var(--shadow); +} + +.status { + font-size: 0.85rem; + color: var(--text-secondary); + padding: 0.875rem 1.25rem; + border-bottom: 1px solid var(--border); + display: flex; + align-items: center; + gap: 0.625rem; + background: linear-gradient(to bottom, #fafafa 0%, white 100%); + font-weight: 500; +} + +.status::before { + content: ""; + width: 8px; + height: 8px; + border-radius: 50%; + background: var(--muted); + box-shadow: 0 0 0 2px rgba(107, 114, 128, 0.15); +} + +.status.connected::before { + background: var(--success); + box-shadow: 0 0 0 2px rgba(16, 
185, 129, 0.2); +} + +.status.disconnected::before { + background: var(--error); + animation: pulse 1.5s infinite; + box-shadow: 0 0 0 2px rgba(239, 68, 68, 0.2); +} + +.status.processing::before { + background: var(--primary); + animation: pulse 1s infinite; + box-shadow: 0 0 0 2px rgba(37, 99, 235, 0.2); +} + +.status.error::before { + background: var(--error); + box-shadow: 0 0 0 2px rgba(239, 68, 68, 0.2); +} + +@keyframes pulse { + + 0%, + 100% { + opacity: 1; + } + + 50% { + opacity: 0.4; + } +} + +.messages { + flex: 1; + overflow-y: auto; + padding: 1.25rem 1.5rem; + display: flex; + flex-direction: column; + gap: 1rem; + align-items: flex-start; +} + +.message { + display: flex; + margin: 0; + max-width: 75%; +} + +.message.user { + justify-content: flex-end; + align-self: flex-end; +} + +.message.assistant { + align-self: flex-start; +} + +.bubble { + min-width: 60px; + max-width: 100%; + width: auto; + padding: 0.875rem 1.125rem; + border-radius: 14px; + background: #f3f4f6; + color: var(--text); + word-wrap: break-word; + overflow-wrap: break-word; + line-height: 1.5; + box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05); +} + +.message.user .bubble { + background: linear-gradient(135deg, #2563eb 0%, #1d4ed8 100%); + color: white; + border-radius: 14px 14px 4px 14px; + box-shadow: 0 2px 8px rgba(37, 99, 235, 0.25); +} + +.message.assistant .bubble { + background: #f3f4f6; + border-radius: 14px 14px 14px 4px; + border: 1px solid rgba(0, 0, 0, 0.06); +} + +/* Thinking messages - now agent groups */ + +.agent-group { + justify-content: flex-start; + margin: 0.5rem 0; + max-width: 90%; + width: 100%; +} + +.agent-group-bubble { + min-width: 400px; + max-width: 100%; + width: auto; + border-radius: 12px; + border: 1px solid; + border-left-width: 4px; + backdrop-filter: blur(2px); + box-shadow: + 0 2px 8px rgba(15, 23, 42, 0.06), + 0 0 0 1px rgba(15, 23, 42, 0.03); + transition: all 0.2s ease; + overflow: hidden; +} + +.agent-group-bubble:hover { + box-shadow: + 0 4px 12px rgba(15, 23, 42, 0.1), + 0 0 0 1px rgba(15, 23, 42, 0.04); + transform: translateY(-1px); +} + +.agent-group-bubble.has-error { + border-left-color: #ef4444; + border-color: rgba(239, 68, 68, 0.3); +} + +.agent-group-header { + display: flex; + justify-content: space-between; + align-items: center; + padding: 0.875rem 1.25rem; + border-bottom: 1px solid rgba(0, 0, 0, 0.06); +} + +.agent-header-left { + display: flex; + align-items: center; + gap: 0.5rem; +} + +.agent-name { + font-weight: 700; + font-size: 0.9rem; + letter-spacing: -0.01em; +} + +.model-badge { + font-size: 0.65rem; + font-weight: 500; + padding: 0.15rem 0.4rem; + border-radius: 4px; + background: rgba(100, 116, 139, 0.15); + color: #64748b; + font-family: 'SF Mono', 'Monaco', 'Consolas', monospace; + white-space: nowrap; +} + +.token-badge { + font-size: 0.7rem; + font-weight: 600; + padding: 0.2rem 0.5rem; + border-radius: 4px; + background: linear-gradient(135deg, #fef3c7 0%, #fde68a 100%); + color: #92400e; + font-family: 'SF Mono', 'Monaco', 'Consolas', monospace; + white-space: nowrap; + border: 1px solid #fbbf24; + box-shadow: 0 1px 2px rgba(251, 191, 36, 0.1); +} + +.step-range { + font-size: 0.75rem; + font-weight: 500; + padding: 0.125rem 0.5rem; + border-radius: 6px; + background: rgba(0, 0, 0, 0.04); +} + +.agent-group-content { + padding: 0.5rem; + background: white; +} + +/* Event items within agent groups */ +.event-item { + padding: 0.75rem 1rem; + margin: 0.5rem 0; + border-radius: 8px; + background: rgba(0, 0, 0, 0.02); + border-left: 3px 
solid transparent; + transition: all 0.2s ease; +} + +.event-item:hover { + background: rgba(0, 0, 0, 0.04); +} + +.event-item.has-error { + background: rgba(254, 242, 242, 0.5); + border-left-color: #ef4444; +} + +.event-item.agent_start { + border-left-color: #3b82f6; + background: rgba(59, 130, 246, 0.05); +} + +.event-item.tool_call { + border-left-color: #f59e0b; + background: rgba(245, 158, 11, 0.05); +} + +.event-item.tool_result { + border-left-color: #10b981; + background: rgba(16, 185, 129, 0.05); +} + +.event-item.agent_end { + border-left-color: #8b5cf6; + background: rgba(139, 92, 246, 0.05); +} + +.event-header { + display: flex; + align-items: center; + gap: 0.5rem; + margin-bottom: 0.5rem; + font-weight: 600; + font-size: 0.85rem; + color: #374151; +} + +.event-icon { + font-size: 1.1rem; + line-height: 1; +} + +.event-label { + flex: 1; +} + +.event-time { + font-size: 0.7rem; + color: #9ca3af; + font-weight: 400; +} + +.event-content { + color: #1f2937; + line-height: 1.5; + white-space: pre-wrap; + word-wrap: break-word; + overflow-wrap: break-word; + font-size: 0.875rem; +} + +.error-text { + color: #dc2626; + font-weight: 600; + background: rgba(239, 68, 68, 0.1); + padding: 0.125rem 0.25rem; + border-radius: 4px; + display: inline-block; +} + +/* Error message box for prominent error display */ +.error-message-box { + display: flex; + align-items: flex-start; + gap: 0.75rem; + padding: 1.25rem; + background: linear-gradient(135deg, #fef2f2 0%, #fee2e2 100%); + border: 2px solid #f87171; + border-left: 5px solid #dc2626; + border-radius: 8px; + margin: 0.75rem 0; + box-shadow: 0 4px 12px rgba(220, 38, 38, 0.15), 0 0 0 1px rgba(220, 38, 38, 0.1); +} + +.error-icon { + font-size: 1.5rem; + flex-shrink: 0; + line-height: 1; +} + +.error-content { + flex: 1; + color: #7f1d1d; + font-size: 0.95rem; + line-height: 1.6; + white-space: pre-wrap; + word-wrap: break-word; + overflow-wrap: break-word; +} + +/* Tool result content with expand/collapse */ +.tool-result-content { + display: flex; + flex-direction: column; + gap: 0.5rem; +} + +.tool-result-text { + margin: 0; + padding: 0.75rem; + background: rgba(0, 0, 0, 0.03); + border-radius: 6px; + font-size: 0.8rem; + font-family: "SF Mono", Consolas, Monaco, monospace; + overflow-x: auto; + white-space: pre-wrap; + word-wrap: break-word; + max-height: 300px; + overflow-y: auto; +} + +.expand-btn { + align-self: flex-start; + padding: 0.375rem 0.75rem; + background: linear-gradient(135deg, #f3f4f6 0%, #e5e7eb 100%); + border: 1px solid #d1d5db; + border-radius: 6px; + font-size: 0.75rem; + font-weight: 500; + color: #374151; + cursor: pointer; + transition: all 0.2s ease; +} + +.expand-btn:hover { + background: linear-gradient(135deg, #e5e7eb 0%, #d1d5db 100%); + border-color: #9ca3af; + transform: translateY(-1px); +} + +/* Legacy thinking bubble support (if needed) */ +.message.thinking { + justify-content: flex-start; + margin: 0.5rem 0; + max-width: 85%; +} + +.thinking-bubble { + min-width: 320px; + max-width: 100%; + width: auto; + padding: 1rem 1.25rem; + border-radius: 12px; + border: 1px solid; + border-left-width: 4px; + font-size: 0.9rem; + backdrop-filter: blur(2px); + box-shadow: + 0 2px 8px rgba(15, 23, 42, 0.06), + 0 0 0 1px rgba(15, 23, 42, 0.03); + transition: all 0.2s ease; +} + +.thinking-bubble:hover { + box-shadow: + 0 4px 12px rgba(15, 23, 42, 0.1), + 0 0 0 1px rgba(15, 23, 42, 0.04); + transform: translateY(-1px); +} + +.thinking-bubble.has-error { + border-left-color: #ef4444; + border-color: rgba(239, 
68, 68, 0.3); +} + +.thinking-header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 0.625rem; + padding-bottom: 0.625rem; + border-bottom: 1px solid rgba(0, 0, 0, 0.08); +} + +.step-number { + font-size: 0.75rem; + font-weight: 500; + padding: 0.125rem 0.5rem; + border-radius: 6px; + background: rgba(0, 0, 0, 0.04); +} + +.thinking-content { + color: #1f2937; + line-height: 1.6; + white-space: pre-wrap; + word-wrap: break-word; + overflow-wrap: break-word; +} + +/* Reasoning content styling */ +.reasoning-content { + background: linear-gradient(135deg, #fefce8 0%, #fef9c3 100%); + border-radius: 8px; + padding: 0.75rem; + border-left: 3px solid #eab308; +} + +.reasoning-text { + color: #713f12; + line-height: 1.6; + white-space: pre-wrap; + word-wrap: break-word; + font-size: 0.9rem; +} + +.event-item.reasoning { + background: rgba(234, 179, 8, 0.05); + border-radius: 8px; + padding: 0.5rem; + margin: 0.25rem 0; +} + +.event-item.reasoning .event-label { + font-weight: 600; + color: #854d0e; +} + +.message.error .bubble, +.bubble.error { + background: linear-gradient(135deg, #fef2f2 0%, #fee2e2 100%); + color: #991b1b; + border: 1px solid #fecaca; + border-left: 4px solid #ef4444; + border-radius: 12px; + font-weight: 500; + box-shadow: 0 2px 8px rgba(239, 68, 68, 0.15); +} + +/* Thinking/Typing indicator (3 dots animation) */ +.thinking-indicator { + display: inline-flex; + align-items: center; + padding: 0.75rem 1rem !important; + min-width: 60px; +} + +.dot-typing { + display: flex; + gap: 4px; + align-items: center; +} + +.dot-typing span { + width: 8px; + height: 8px; + border-radius: 50%; + background-color: var(--primary); + animation: dotTyping 1.4s infinite ease-in-out both; +} + +.dot-typing span:nth-child(1) { + animation-delay: -0.32s; +} + +.dot-typing span:nth-child(2) { + animation-delay: -0.16s; +} + +.dot-typing span:nth-child(3) { + animation-delay: 0s; +} + +@keyframes dotTyping { + + 0%, + 80%, + 100% { + transform: scale(0.6); + opacity: 0.5; + } + + 40% { + transform: scale(1); + opacity: 1; + } +} + +.composer { + display: flex; + gap: 0.875rem; + border-top: 1px solid var(--border); + padding: 1.25rem 1.5rem; + background: white; + box-shadow: 0 -1px 3px rgba(0, 0, 0, 0.05); +} + +.composer input { + flex: 1; + padding: 0.75rem 1rem; + border: 2px solid var(--border); + border-radius: 10px; + font-size: 0.95rem; + transition: all 0.2s ease; + background: white; +} + +.composer input:focus { + outline: none; + border-color: var(--primary); + box-shadow: 0 0 0 3px rgba(37, 99, 235, 0.08); +} + +.composer button { + padding: 0.75rem 1.5rem; + background: linear-gradient(135deg, #2563eb 0%, #1d4ed8 100%); + color: white; + border-radius: 10px; + border: none; + cursor: pointer; + font-weight: 600; + font-size: 0.95rem; + transition: all 0.2s ease; + box-shadow: 0 2px 8px rgba(37, 99, 235, 0.25); +} + +.composer button:hover { + background: linear-gradient(135deg, #1d4ed8 0%, #1e40af 100%); + box-shadow: 0 4px 12px rgba(37, 99, 235, 0.35); + transform: translateY(-1px); +} + +.composer button:active { + transform: translateY(0); + box-shadow: 0 2px 6px rgba(37, 99, 235, 0.25); +} + +.composer input:disabled { + background: #f9fafb; + border-color: #e5e7eb; + cursor: not-allowed; + opacity: 0.7; +} + +.composer button:disabled { + background: #9ca3af; + cursor: not-allowed; + opacity: 0.7; + box-shadow: none; +} + +.stop-btn { + background: linear-gradient(135deg, #ef4444 0%, #dc2626 100%) !important; + box-shadow: 0 2px 8px 
rgba(239, 68, 68, 0.25) !important; +} + +.stop-btn:hover { + background: linear-gradient(135deg, #dc2626 0%, #b91c1c 100%) !important; + box-shadow: 0 4px 12px rgba(239, 68, 68, 0.35) !important; +} + +/* ===== Dataset Page ===== */ +.dataset-container { + flex: 1; + display: flex; + flex-direction: column; + padding: 1.5rem; + background: var(--bg); +} + +.dataset-header { + margin-bottom: 2rem; +} + +.dataset-header h2 { + margin: 0 0 0.5rem; + font-size: 1.875rem; + font-weight: 700; + color: var(--text); +} + +.dataset-header p { + margin: 0; + color: var(--text-secondary); + font-size: 1rem; +} + +.tabs { + display: flex; + flex-direction: column; + background: var(--card); + border-radius: 12px; + box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1); + overflow: hidden; + flex: 1; +} + +.tab-buttons { + display: flex; + border-bottom: 1px solid var(--border); + background: white; +} + +.tab-button { + flex: 1; + padding: 1rem; + background: transparent; + border: none; + border-bottom: 3px solid transparent; + cursor: pointer; + font-size: 0.95rem; + font-weight: 500; + color: var(--text-secondary); + display: flex; + align-items: center; + justify-content: center; + gap: 0.5rem; + transition: all 0.2s ease; +} + +.tab-button:hover { + background: #f9fafb; + color: var(--text); +} + +.tab-button.active { + color: var(--primary); + border-bottom-color: var(--primary); +} + +.tab-button .icon { + font-size: 1.25rem; +} + +.tab-content { + flex: 1; + padding: 2rem; + overflow-y: auto; +} + +/* ===== Upload Tab ===== */ +.upload-tab { + display: flex; + flex-direction: column; + gap: 2rem; +} + +.upload-area { + width: 100%; +} + +.upload-box { + border: 2px dashed var(--border); + border-radius: 12px; + padding: 3rem 2rem; + text-align: center; + cursor: pointer; + transition: all 0.3s ease; + background: #fafbfc; +} + +.upload-box:hover { + border-color: var(--primary-light); + background: #f0f7ff; +} + +.upload-icon { + font-size: 3rem; + margin-bottom: 1rem; +} + +.upload-text { + margin: 0.5rem 0; + font-size: 1.1rem; + font-weight: 500; + color: var(--text); +} + +.upload-hint { + margin: 0.5rem 0 0; + font-size: 0.875rem; + color: var(--text-secondary); +} + +.files-list { + background: white; + border: 1px solid var(--border); + border-radius: 12px; + padding: 1.5rem; +} + +.files-list h3 { + margin: 0 0 1rem; + font-size: 1rem; + color: var(--text); +} + +.file-items { + list-style: none; + margin: 0 0 1.5rem; + padding: 0; + display: flex; + flex-direction: column; + gap: 0.75rem; +} + +.file-item { + display: flex; + align-items: center; + gap: 0.75rem; + padding: 0.75rem; + background: #f9fafb; + border-radius: 8px; + font-size: 0.95rem; +} + +.file-name { + flex: 1; + font-weight: 500; + color: var(--text); +} + +.file-size { + color: var(--text-secondary); + font-size: 0.875rem; +} + +.remove-btn { + background: transparent; + border: none; + color: var(--error); + cursor: pointer; + font-size: 1rem; + padding: 0.25rem 0.5rem; + border-radius: 4px; + transition: all 0.2s ease; +} + +.remove-btn:hover { + background: rgba(239, 68, 68, 0.1); +} + +.upload-button { + width: 100%; + padding: 0.875rem 1.25rem; + background: var(--primary); + color: white; + border: none; + border-radius: 8px; + cursor: pointer; + font-size: 1rem; + font-weight: 600; + transition: all 0.2s ease; +} + +.upload-button:hover:not(:disabled) { + background: var(--primary-hover); +} + +.upload-button:disabled { + background: var(--text-secondary); + cursor: not-allowed; + opacity: 0.6; +} + +/* ===== PostgreSQL 
Tab ===== */ +.postgres-tab { + display: grid; + grid-template-columns: 1fr 1fr; + gap: 3rem; +} + +.postgres-form h3 { + margin: 0 0 0.5rem; + font-size: 1.25rem; + color: var(--text); + font-weight: 600; +} + +.postgres-form .form-description { + margin: 0 0 2rem; + color: var(--text-secondary); + font-size: 0.95rem; +} + +.form-grid { + display: grid; + grid-template-columns: 1fr 1fr; + gap: 1rem; +} + +.postgres-results { + background: #f9fafb; + border: 1px solid var(--border); + border-radius: 12px; + padding: 1.5rem; + height: fit-content; + position: sticky; + top: 1rem; +} + +/* ===== Query Result ===== */ +.query-result-container { + display: flex; + flex-direction: column; + gap: 2rem; +} + +.result-section h4 { + margin: 0 0 1rem; + font-size: 1.1rem; + font-weight: 600; + color: var(--text); + border-bottom: 1px solid var(--border); + padding-bottom: 0.5rem; +} + +.result-list { + list-style: none; + margin: 0; + padding: 0; + display: flex; + flex-direction: column; + gap: 0.75rem; +} + +.result-item { + display: flex; + align-items: center; + gap: 0.75rem; + padding: 0.75rem; + background: white; + border: 1px solid var(--border); + border-radius: 8px; + font-size: 0.95rem; + transition: all 0.2s ease; +} + +.result-item:hover { + border-color: var(--primary-light); + box-shadow: 0 1px 3px rgba(0, 0, 0, 0.05); +} + +.result-item .icon { + font-size: 1.1rem; + color: var(--primary); +} + +.result-item .table-name { + font-weight: 500; + color: var(--text); +} + +.result-item .column-name { + color: var(--text-secondary); + font-size: 0.875rem; +} + +.result-item .arrow { + color: var(--primary); + font-weight: 600; +} + +.relationship-item { + display: flex; + align-items: center; + gap: 0.5rem; +} + +.combine-section { + margin-top: 2rem; + padding-top: 1.5rem; + border-top: 1px solid var(--border); + text-align: center; +} + +.combine-section p { + margin: 0 0 1rem; + font-size: 1rem; + color: var(--text); +} + +.form-group { + display: flex; + flex-direction: column; + gap: 0.5rem; +} + +.form-group label { + font-size: 0.95rem; + font-weight: 500; + color: var(--text); +} + +.form-group input { + padding: 0.625rem 0.875rem; + border: 1px solid var(--border); + border-radius: 8px; + font-size: 0.95rem; + transition: all 0.2s ease; + background: white; +} + +.form-group input:focus { + outline: none; + border-color: var(--primary); + box-shadow: 0 0 0 3px rgba(37, 99, 235, 0.1); +} + +.status-message { + padding: 0.75rem 1rem; + border-radius: 8px; + font-size: 0.95rem; + font-weight: 500; +} + +.status-message.success { + background: rgba(16, 185, 129, 0.1); + color: var(--success); + border: 1px solid var(--success); +} + +.status-message.error { + background: rgba(239, 68, 68, 0.1); + color: var(--error); + border: 1px solid var(--error); +} + +.form-actions { + display: flex; + gap: 1rem; + margin-top: 1rem; +} + +.button { + padding: 0.75rem 1.5rem; + border: none; + border-radius: 8px; + cursor: pointer; + font-size: 0.95rem; + font-weight: 600; + transition: all 0.2s ease; + flex: 1; +} + +.button.primary { + background: var(--primary); + color: white; +} + +.button.primary:hover:not(:disabled) { + background: var(--primary-hover); +} + +.button.secondary { + background: white; + color: var(--primary); + border: 1px solid var(--primary); +} + +.button.secondary:hover:not(:disabled) { + background: var(--primary-light); + color: white; + border-color: var(--primary-hover); +} + +.button:disabled { + background: var(--text-secondary); + color: white; + cursor: 
not-allowed; + opacity: 0.6; +} + +.postgres-info { + background: #f0f7ff; + border: 1px solid #cce5ff; + border-radius: 12px; + padding: 1.5rem; + height: fit-content; + position: sticky; + top: 1rem; +} + +.postgres-info h4 { + margin: 0 0 1rem; + font-size: 1rem; + color: var(--text); + font-weight: 600; +} + +.postgres-info p { + margin: 0.5rem 0; + font-size: 0.9rem; + color: var(--text); +} + +.postgres-info strong { + color: var(--primary); +} + +/* ===== Responsive Design ===== */ +@media (max-width: 1024px) { + .postgres-tab { + grid-template-columns: 1fr; + } + + .postgres-info { + position: static; + } +} + +@media (max-width: 768px) { + .sidebar { + width: 200px; + } + + .sidebar-header { + padding: 1rem; + } + + .sidebar-header h2 { + font-size: 1.25rem; + } + + .nav-item { + padding: 0.6rem 0.75rem; + } + + .nav-item .label { + display: none; + } + + .chat-root { + height: calc(100vh - 1rem); + margin: 0.5rem; + } + + .dataset-container { + padding: 1rem; + } + + .dataset-header { + margin-bottom: 1rem; + } + + .dataset-header h2 { + font-size: 1.5rem; + } + + .tab-content { + padding: 1rem; + } + + .postgres-tab { + grid-template-columns: 1fr; + } + + .bubble { + max-width: 90%; + } +} + +@media (max-width: 480px) { + .sidebar { + width: 70px; + } + + .sidebar-header h2, + .sidebar-header p { + display: none; + } + + .tab-button { + font-size: 0.8rem; + padding: 0.75rem; + } + + .tab-button .icon { + font-size: 1rem; + } + + .upload-box { + padding: 2rem 1rem; + } + + .upload-icon { + font-size: 2rem; + margin-bottom: 0.5rem; + } + + .postgres-form h3 { + display: none; + } +} + +/* ===== Overview Tab ===== */ +.overview-tab-container { + display: flex; + flex-direction: column; + gap: 1.5rem; +} + +.overview-tab-container h3 { + margin: 0; + font-size: 1.5rem; + font-weight: 600; + color: var(--text); +} + +.overview-tab-container p { + color: var(--text-secondary); +} + +.dataset-cards-grid { + display: grid; + grid-template-columns: repeat(auto-fill, minmax(280px, 1fr)); + gap: 1.5rem; +} + +.dataset-card { + background: var(--card); + border: 1px solid var(--border); + border-radius: 12px; + padding: 1.5rem; + display: flex; + flex-direction: column; + gap: 0.75rem; + box-shadow: 0 1px 3px rgba(0, 0, 0, 0.05); + transition: all 0.2s ease; +} + +.dataset-card:hover { + box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); + transform: translateY(-2px); +} + +.dataset-card h4 { + margin: 0; + font-size: 1.1rem; + font-weight: 600; + color: var(--primary); + word-break: break-all; +} + +.dataset-card p { + margin: 0; + font-size: 0.9rem; + color: var(--text-secondary); +} + +.dataset-card strong { + color: var(--text); +} + +.card-actions { + display: flex; + gap: 0.75rem; + margin-top: 1rem; +} + +.card-actions button { + flex: 1; + padding: 0.6rem 1rem; + border: none; + border-radius: 8px; + cursor: pointer; + font-size: 0.9rem; + font-weight: 500; + transition: all 0.2s ease; +} + +.card-actions button:first-child { + background: var(--primary); + color: white; +} + +.card-actions button:first-child:hover { + background: var(--primary-hover); +} + +.card-actions button.delete-button { + background: #fef2f2; + color: var(--error); + border: 1px solid var(--error); +} + +.card-actions button.delete-button:hover { + background: var(--error); + color: white; +} + +/* ===== Confirmation Dialog ===== */ +.confirmation-dialog-overlay { + position: fixed; + top: 0; + left: 0; + right: 0; + bottom: 0; + background: rgba(0, 0, 0, 0.5); + display: flex; + align-items: center; + 
justify-content: center; + z-index: 1000; + padding: 1rem; +} + +.confirmation-dialog { + background: var(--card); + border-radius: 12px; + box-shadow: 0 4px 20px rgba(0, 0, 0, 0.15); + max-width: 800px; + width: 100%; + max-height: 80vh; + display: flex; + flex-direction: column; + overflow: hidden; +} + +.confirmation-header { + padding: 1.25rem 1.5rem; + border-bottom: 1px solid var(--border); + background: #f8fafc; +} + +.confirmation-header h3 { + margin: 0; + font-size: 1.25rem; + font-weight: 600; + color: var(--text); +} + +.confirmation-body { + flex: 1; + padding: 1.5rem; + overflow-y: auto; + background: var(--card); +} + +.confirmation-code, +.confirmation-json, +.confirmation-text, +.confirmation-markdown pre { + background: #1e293b; + color: #e2e8f0; + padding: 1rem; + border-radius: 8px; + font-family: "Fira Code", "Cascadia Code", Consolas, monospace; + font-size: 0.85rem; + line-height: 1.5; + overflow-x: auto; + white-space: pre-wrap; + word-break: break-word; + margin: 0; +} + +.expand-toggle { + display: block; + margin: 1rem auto 0; + padding: 0.5rem 1.5rem; + background: #f1f5f9; + border: 1px solid var(--border); + border-radius: 6px; + cursor: pointer; + font-size: 0.875rem; + color: var(--text-secondary); + transition: all 0.2s ease; +} + +.expand-toggle:hover { + background: #e2e8f0; + color: var(--text); +} + +.confirmation-footer { + padding: 1rem 1.5rem; + border-top: 1px solid var(--border); + display: flex; + justify-content: flex-end; + gap: 1rem; + background: #f8fafc; +} + +.btn-confirm, +.btn-reject { + padding: 0.75rem 1.5rem; + border: none; + border-radius: 8px; + cursor: pointer; + font-size: 1rem; + font-weight: 500; + transition: all 0.2s ease; +} + +.btn-confirm { + background: var(--success); + color: white; +} + +.btn-confirm:hover { + background: #059669; +} + +.btn-reject { + background: #fee2e2; + color: var(--error); + border: 1px solid var(--error); +} + +.btn-reject:hover { + background: var(--error); + color: white; +} \ No newline at end of file diff --git a/plexe/ui/frontend/vite.config.js b/plexe/ui/frontend/vite.config.js new file mode 100644 index 00000000..3480572c --- /dev/null +++ b/plexe/ui/frontend/vite.config.js @@ -0,0 +1,39 @@ +import { defineConfig } from 'vite' +import react from '@vitejs/plugin-react' + +// Use environment variable for backend URL, defaulting to Docker service name +// When running in Docker, use 'backend:8100'. 
When running locally, use 'localhost:8100' +const backendHost = process.env.VITE_BACKEND_HOST || 'backend' +const backendUrl = `http://${backendHost}:8100` + +export default defineConfig({ + plugins: [react()], + server: { + host: '0.0.0.0', + port: 3000, + strictPort: false, + hmr: { + host: 'localhost', + port: 3000, + protocol: 'ws', + }, + proxy: { + // Proxy /ws to backend during development + '/ws': { + target: backendUrl, + ws: true, + changeOrigin: true, + }, + // Proxy /api requests to backend + '/api': { + target: backendUrl, + changeOrigin: true, + }, + // Proxy /health to backend + '/health': { + target: backendUrl, + changeOrigin: true, + }, + }, + }, +}) diff --git a/plexe/ui/index.html b/plexe/ui/index.html index 8103bdd5..f60fa6e5 100644 --- a/plexe/ui/index.html +++ b/plexe/ui/index.html @@ -254,6 +254,32 @@ // Simple message component function Message({ message }) { const isUser = message.role === 'user'; + const isThinking = message.role === 'thinking'; + + if (isThinking) { + return React.createElement('div', { + className: 'flex justify-center mb-3' + }, + React.createElement('div', { + className: 'max-w-[85%] rounded-lg px-4 py-3 bg-gradient-to-br from-blue-50 to-cyan-50 border-l-4 border-blue-400 shadow-sm' + }, + React.createElement('div', { + className: 'flex justify-between items-center mb-2 pb-2 border-b border-blue-200' + }, + React.createElement('span', { + className: 'font-semibold text-blue-700 text-sm' + }, message.agent_name), + React.createElement('span', { + className: 'text-xs text-gray-500 italic' + }, `Step ${message.step_number}`) + ), + React.createElement('div', { + className: 'text-sm text-gray-700 whitespace-pre-wrap' + }, message.message) + ) + ); + } + return React.createElement('div', { className: `flex ${isUser ? 'justify-end' : 'justify-start'} mb-4` }, @@ -412,7 +438,7 @@ useEffect(() => { // Create WebSocket runtime - const runtime = new WebSocketRuntime('ws://localhost:8000/ws'); + const runtime = new WebSocketRuntime('ws://localhost:8100/ws'); setWsRuntime(runtime); // Subscribe to connection status diff --git a/poetry.lock b/poetry.lock index bc38b0d4..2e863773 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. 
[[package]] name = "accelerate" @@ -6,6 +6,8 @@ version = "0.24.1" description = "Accelerate" optional = true python-versions = ">=3.8.0" +groups = ["main"] +markers = "extra == \"transformers\" or extra == \"all\"" files = [ {file = "accelerate-0.24.1-py3-none-any.whl", hash = "sha256:866dec394da60e8da964be212379d8cf6cc0d0e5e28a7c0d7e09507715d21c61"}, {file = "accelerate-0.24.1.tar.gz", hash = "sha256:85ab2aeb4d06194b75113339f81b7d650523414a82c9e91b2912a655f53dfa8e"}, @@ -35,6 +37,7 @@ version = "2.6.1" description = "Happy Eyeballs for asyncio" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8"}, {file = "aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558"}, @@ -42,102 +45,137 @@ files = [ [[package]] name = "aiohttp" -version = "3.12.9" +version = "3.13.2" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.9" -files = [ - {file = "aiohttp-3.12.9-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:abb01935bb606bbc080424799bfda358d38374c45a7cbbc89f9bb330deb1db26"}, - {file = "aiohttp-3.12.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e2337516411cd15b7257736484dfd5101fa0e6b11ef2086b4bb6db9365373dcb"}, - {file = "aiohttp-3.12.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:26874b2c61ab5d1e05d942d7254a565eeec11750bf8f1a8995c33d6d772f5015"}, - {file = "aiohttp-3.12.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:43dbedb626c6bb03cc8e9ab27b9da4414bc5540d3fe1bce0e687e50c20553689"}, - {file = "aiohttp-3.12.9-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:18897f24e80bac4e7df5d37375ab22391f8b7beedfe617f8de064dbfd76ca36b"}, - {file = "aiohttp-3.12.9-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2466804eaa42bf6340de28fba7254709db788989b891a7c5bd57a84f5a11c04b"}, - {file = "aiohttp-3.12.9-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:85ddf89da86915ab327fafe9059540707b9deac7cfad1dfda4621eac6590aa16"}, - {file = "aiohttp-3.12.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8d89c0ea455b8e8e386db8b82a55671703d4868c7c1e38cca0d643232f50f8d"}, - {file = "aiohttp-3.12.9-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2ee5ca28436b9203d020924c6dacc1cca4e77acf5f8f5c5d236b123c0158a012"}, - {file = "aiohttp-3.12.9-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:7ca2ad779958e1beb2f139e7d45f84c13f94f6c0f63025e435e31f3247cb5a05"}, - {file = "aiohttp-3.12.9-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:daae5ea9c06daacb056351273a38d4465446fbb5c8c8107a6f93db3e1d5bc4e8"}, - {file = "aiohttp-3.12.9-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:52cec94fa76e488b0ebc6586507421116d7993c7984ea020529107796b206117"}, - {file = "aiohttp-3.12.9-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:db2aef30d877f44716c8ce4adb2162c7ccb9c58d6153bc68bd2cfb3fbd7d6a95"}, - {file = "aiohttp-3.12.9-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:1d205549f965bc69c377206643b06fd78d77ed20b8735765c54153cf00a51465"}, - {file = "aiohttp-3.12.9-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3fdaaf63a778ae020b9bf8a7ae4a80f87deb88152aad259764e994b3efe44d38"}, - {file = "aiohttp-3.12.9-cp310-cp310-win32.whl", 
hash = "sha256:7aecd5546e5c65e4904fc697806a4830c2a4870cb7bae28a7f483db008bba3dc"}, - {file = "aiohttp-3.12.9-cp310-cp310-win_amd64.whl", hash = "sha256:5cf338d75be82709bf1c8d8404f347661819c1cc9f34798d5b762377fd70ccd6"}, - {file = "aiohttp-3.12.9-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:301eebd8e1134a8457151b451841a47d3440ce79fa9a0d1c70650bda624cbd69"}, - {file = "aiohttp-3.12.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0d8ba7652d815bd5b99189d5b685db5509a08f1282e047a849b7f4353df8a95c"}, - {file = "aiohttp-3.12.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:998a6e143b2a4ffee14fb2c2ff5a3338d70d811be3f5d4a13a305ee0f4c6ac42"}, - {file = "aiohttp-3.12.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d011b13f3bfcf711ce9007ea08305a582135ee2105dc3202b011c055c1ac6f1"}, - {file = "aiohttp-3.12.9-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3c7b314d565e235051893a46e14ea14ab05bb17fe99bdb2cf85e9adc62b4836c"}, - {file = "aiohttp-3.12.9-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2bb6408bc2cb8ee5be4efb18bcfcfce4d76448f62237074917e146a425daf425"}, - {file = "aiohttp-3.12.9-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b9ad4fe8d068544ba5d77500ea2d450f130109a4b0caf6d9197167303250f683"}, - {file = "aiohttp-3.12.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55721245164191ac92808ad39f3b2876195b1e6521ead0aad7f1c9ae69568b1a"}, - {file = "aiohttp-3.12.9-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b5c5fbc9217578f5c9b5a65f27dfb044283b437cfa9cf52531f3ce94dca1e912"}, - {file = "aiohttp-3.12.9-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5c7e03f6dd8210b76587cb17088b3e5e0dabfc6787d42db58bc933da932230b7"}, - {file = "aiohttp-3.12.9-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:c892b2400c0795bbf00303282029c66e8ba912dc9fabf4728ba69a63046c8020"}, - {file = "aiohttp-3.12.9-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4de97019fec6f236671ee5d5831cebf67fbd52ee6bd47e2b8c9941cd39698db1"}, - {file = "aiohttp-3.12.9-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:941cd1ce3d1f605fd062857b339f7c3cde5ce83392bfb1029c3de782b8f98b52"}, - {file = "aiohttp-3.12.9-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:43f3d4d6264629d97d44a6d75603923c2c63dad6aff2f72b172635c43db739db"}, - {file = "aiohttp-3.12.9-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bbe5ab33a6810e9839270b3673eba683b9f91ed011be66feb4823f9fecf1bb73"}, - {file = "aiohttp-3.12.9-cp311-cp311-win32.whl", hash = "sha256:9ec207177e0adc694ed4a41ca8ebdb4008edb8d475a8b94d71d73414fc4707b6"}, - {file = "aiohttp-3.12.9-cp311-cp311-win_amd64.whl", hash = "sha256:965d93b08eed59359721a324b998ebf5354c9049b17cd93d9de50c14092b6ace"}, - {file = "aiohttp-3.12.9-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:7ae744b61b395e04b3d1acbbd301d98249397333f49419039517226ff32f3aa7"}, - {file = "aiohttp-3.12.9-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d467a2049c4405853799dea41474b0ea9852fd465e7e2df819d3a33ac53214e8"}, - {file = "aiohttp-3.12.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ba7a8b5f02c2826eb29e8d6c38f1bc509efb506a2862131079b5b8d880ed4b62"}, - {file = "aiohttp-3.12.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bfe590ddb0dca3cdb601787079276545f00cfb9493f73f00fa011e71dae6f5fd"}, - {file = 
"aiohttp-3.12.9-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:fc441aba05efec5c72127393f56206d0f3fb113aadcd1685033c10da1ff582ad"}, - {file = "aiohttp-3.12.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1a3f20a1b72643a0be5c9fcb97eb22607fcca32f1ca497f09a88d1ec3109daae"}, - {file = "aiohttp-3.12.9-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3647dd1da43d595a52c5071b68fd8d39c0fd25b80f2cdd83eaabd9d59cd1f139"}, - {file = "aiohttp-3.12.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:970bae350cedbabb7c9d0fc8564b004a547d4a27cf12dc986be0abf7d8cc8d81"}, - {file = "aiohttp-3.12.9-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ccc5a5a4ccfa0ef0191dad2926e9752c37f368d846a70e40095a8529c5fb6eb"}, - {file = "aiohttp-3.12.9-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:55197e86994682a332e8943eb01b462ae25630b10f245812e517251d7a922f25"}, - {file = "aiohttp-3.12.9-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:94d0cf6606ed9f2373565b8d0005bb070afbb81525ef6fa6e0725b8aec0c0843"}, - {file = "aiohttp-3.12.9-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:0575d7ae9a9c206276a6aaa3ce364b467f29f0497c0db4449de060dc341d88d6"}, - {file = "aiohttp-3.12.9-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:9f44a4ebd717cc39796c4647495bc2901d0c168c71cd0132691ae3d0312215a9"}, - {file = "aiohttp-3.12.9-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:f9cdadfe84beb8ceafa98ab676e8c0caf1e5d60e8b33c385c11259ee0f7f2587"}, - {file = "aiohttp-3.12.9-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:995b5640969b1250e37be6fc92d185e523e8df446f8bfa723b347e52d7ae80f9"}, - {file = "aiohttp-3.12.9-cp312-cp312-win32.whl", hash = "sha256:4cfa37e0797510fdb20ab0ee3ad483ae7cfacb27c6fb8de872a998705ad2286a"}, - {file = "aiohttp-3.12.9-cp312-cp312-win_amd64.whl", hash = "sha256:fdbd04e9b05885eaaefdb81c163b6dc1431eb13ee2da16d82ee980d4dd123890"}, - {file = "aiohttp-3.12.9-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:bf6fac88666d7e4c6cfe649d133fcedbc68e37a4472e8662d98a7cf576207303"}, - {file = "aiohttp-3.12.9-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:74e87ea6c832311b18a32b06baa6fee90a83dd630de951cca1aa175c3c9fa1ce"}, - {file = "aiohttp-3.12.9-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:16627b4caf6a36b605e3e1c4847e6d14af8e8d6b7dad322935be43237d4eb10d"}, - {file = "aiohttp-3.12.9-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:998e323c107c3f6396c1f9de72289009057c611942771f24114ae78a76af0af5"}, - {file = "aiohttp-3.12.9-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:20f8a6d3af13f043a09726add6d096b533f180cf8b43970a8d9c9ca978bf45c5"}, - {file = "aiohttp-3.12.9-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0bd0e06c8626361027f69df510c8484e17568ba2f91b2de51ea055f86ed3b071"}, - {file = "aiohttp-3.12.9-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:64e22f12dd940a6e7b923637b10b611b752f6117bc3a780b7e61cc43c9e04892"}, - {file = "aiohttp-3.12.9-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11b5bf453056b6ac4924ede1188d01e8b8d4801a6aa5351da3a7dbdbc03cb44e"}, - {file = "aiohttp-3.12.9-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:00369db59f09860e0e26c75035f80f92881103e90f5858c18f29eb4f8cb8970f"}, - {file = "aiohttp-3.12.9-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:80fa1efc71d423be25db9dddefe8dcd90e487fbc9351a59549521b66405e71de"}, - {file = "aiohttp-3.12.9-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:5cade22a0f0a4665003ded2bc4d43bb69fde790e5a287187569509c33333a3ab"}, - {file = "aiohttp-3.12.9-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:d4a0fe3cd45cf6fb18222deef92af1c3efe090b7f43d477de61b2360c90a4b32"}, - {file = "aiohttp-3.12.9-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:97b036ce251825fd5ab69d302ca8a99d3352af1c616cf40b2306fdb734cd6d30"}, - {file = "aiohttp-3.12.9-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:eeac3a965552dbf79bcc0b9b963b5f7d6364b1542eb609937278d70d27ae997f"}, - {file = "aiohttp-3.12.9-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4a1f72b2560beaa949b5d3b324fc07b66846d39a8e7cc106ca450312a5771e3e"}, - {file = "aiohttp-3.12.9-cp313-cp313-win32.whl", hash = "sha256:e429fce99ac3fd6423622713d2474a5911f24816ccdaf9a74c3ece854b7375c1"}, - {file = "aiohttp-3.12.9-cp313-cp313-win_amd64.whl", hash = "sha256:ccb1931cc8b4dc6d7a2d83db39db18c3f9ac3d46a59289cea301acbad57f3d12"}, - {file = "aiohttp-3.12.9-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:aee2910e6f06f6d229c3b90e277685a8f25fde54b3a4220cdf5901c925d681c3"}, - {file = "aiohttp-3.12.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d06286278ff413a1a410b6d4f7712e734dbceb2e352fab89b9c4448dd9f3d679"}, - {file = "aiohttp-3.12.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c8f48df4f6061d4eb0c43867f8b82575bcfe05c8780ff9f21e811535458f6e0c"}, - {file = "aiohttp-3.12.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:495b2ac780e4d4f9a67fc79b7e84f21b09661f362b93d43360204a7bfecc4fec"}, - {file = "aiohttp-3.12.9-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6612437f2c761dd0b31569b28b8905bccfb88dc1aeecc9ad20fbaf346eafe989"}, - {file = "aiohttp-3.12.9-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4351fb8d4b12b15f39ed076a21d53f9542bc0db09ba973c04503b31ef8268332"}, - {file = "aiohttp-3.12.9-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4027f160e5109d6aac1537426d8b6e693fcca393dd9488d986ec855caf6dc4f6"}, - {file = "aiohttp-3.12.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30a55cdc682d98b8f7f1e8d3505846ab302a5547ffb7cef85607448b090d691d"}, - {file = "aiohttp-3.12.9-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f91ee8ed3d9ccb832dbc93e6b9d85c2a9dc73a7ea5d0f3ee4c3b64136f6ba598"}, - {file = "aiohttp-3.12.9-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:325acbe0c0225836e720eb758672c2f39e3017e89389de1dfd7fba7977b9bb82"}, - {file = "aiohttp-3.12.9-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:075da814b9a639904041d8d50e3ed665ea892df4e99278f8b63ff0ee549eb519"}, - {file = "aiohttp-3.12.9-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:57971e7adbe0984d9736836d7a34bd615119e628f04dfca302c1bf0ec3d39a77"}, - {file = "aiohttp-3.12.9-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:0954f990f274cfcbbd08d8fdb4a0c7949ac753bc1ea344c540829a85b0a8f34d"}, - {file = "aiohttp-3.12.9-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:daaf5a5f2340f46291ab7d44f60693cc71a05a8b9104e6efd3bd51c8a6526290"}, - {file = "aiohttp-3.12.9-cp39-cp39-musllinux_1_2_x86_64.whl", hash = 
"sha256:ba0843970e8a9cb4ddae47281010997f5b1a1c8cbc635fbefc9a0ccaa7c95606"}, - {file = "aiohttp-3.12.9-cp39-cp39-win32.whl", hash = "sha256:b06acaba86c46335a862ca0805cd695610bcb785d1a18f9f6498711178974e4b"}, - {file = "aiohttp-3.12.9-cp39-cp39-win_amd64.whl", hash = "sha256:0c4f87ee9451ce5e453af2cd868f4a42ea2f49c5aff6e8114cded0f47ed9ea9b"}, - {file = "aiohttp-3.12.9.tar.gz", hash = "sha256:2c9914c8914ff40b68c6e4ed5da33e88d4e8f368fddd03ceb0eb3175905ca782"}, +groups = ["main"] +files = [ + {file = "aiohttp-3.13.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2372b15a5f62ed37789a6b383ff7344fc5b9f243999b0cd9b629d8bc5f5b4155"}, + {file = "aiohttp-3.13.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e7f8659a48995edee7229522984bd1009c1213929c769c2daa80b40fe49a180c"}, + {file = "aiohttp-3.13.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:939ced4a7add92296b0ad38892ce62b98c619288a081170695c6babe4f50e636"}, + {file = "aiohttp-3.13.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6315fb6977f1d0dd41a107c527fee2ed5ab0550b7d885bc15fee20ccb17891da"}, + {file = "aiohttp-3.13.2-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6e7352512f763f760baaed2637055c49134fd1d35b37c2dedfac35bfe5cf8725"}, + {file = "aiohttp-3.13.2-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e09a0a06348a2dd73e7213353c90d709502d9786219f69b731f6caa0efeb46f5"}, + {file = "aiohttp-3.13.2-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a09a6d073fb5789456545bdee2474d14395792faa0527887f2f4ec1a486a59d3"}, + {file = "aiohttp-3.13.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b59d13c443f8e049d9e94099c7e412e34610f1f49be0f230ec656a10692a5802"}, + {file = "aiohttp-3.13.2-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:20db2d67985d71ca033443a1ba2001c4b5693fe09b0e29f6d9358a99d4d62a8a"}, + {file = "aiohttp-3.13.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:960c2fc686ba27b535f9fd2b52d87ecd7e4fd1cf877f6a5cba8afb5b4a8bd204"}, + {file = "aiohttp-3.13.2-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:6c00dbcf5f0d88796151e264a8eab23de2997c9303dd7c0bf622e23b24d3ce22"}, + {file = "aiohttp-3.13.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fed38a5edb7945f4d1bcabe2fcd05db4f6ec7e0e82560088b754f7e08d93772d"}, + {file = "aiohttp-3.13.2-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:b395bbca716c38bef3c764f187860e88c724b342c26275bc03e906142fc5964f"}, + {file = "aiohttp-3.13.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:204ffff2426c25dfda401ba08da85f9c59525cdc42bda26660463dd1cbcfec6f"}, + {file = "aiohttp-3.13.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:05c4dd3c48fb5f15db31f57eb35374cb0c09afdde532e7fb70a75aede0ed30f6"}, + {file = "aiohttp-3.13.2-cp310-cp310-win32.whl", hash = "sha256:e574a7d61cf10351d734bcddabbe15ede0eaa8a02070d85446875dc11189a251"}, + {file = "aiohttp-3.13.2-cp310-cp310-win_amd64.whl", hash = "sha256:364f55663085d658b8462a1c3f17b2b84a5c2e1ba858e1b79bff7b2e24ad1514"}, + {file = "aiohttp-3.13.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4647d02df098f6434bafd7f32ad14942f05a9caa06c7016fdcc816f343997dd0"}, + {file = "aiohttp-3.13.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e3403f24bcb9c3b29113611c3c16a2a447c3953ecf86b79775e7be06f7ae7ccb"}, + {file = 
"aiohttp-3.13.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:43dff14e35aba17e3d6d5ba628858fb8cb51e30f44724a2d2f0c75be492c55e9"}, + {file = "aiohttp-3.13.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e2a9ea08e8c58bb17655630198833109227dea914cd20be660f52215f6de5613"}, + {file = "aiohttp-3.13.2-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:53b07472f235eb80e826ad038c9d106c2f653584753f3ddab907c83f49eedead"}, + {file = "aiohttp-3.13.2-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e736c93e9c274fce6419af4aac199984d866e55f8a4cec9114671d0ea9688780"}, + {file = "aiohttp-3.13.2-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ff5e771f5dcbc81c64898c597a434f7682f2259e0cd666932a913d53d1341d1a"}, + {file = "aiohttp-3.13.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a3b6fb0c207cc661fa0bf8c66d8d9b657331ccc814f4719468af61034b478592"}, + {file = "aiohttp-3.13.2-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:97a0895a8e840ab3520e2288db7cace3a1981300d48babeb50e7425609e2e0ab"}, + {file = "aiohttp-3.13.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:9e8f8afb552297aca127c90cb840e9a1d4bfd6a10d7d8f2d9176e1acc69bad30"}, + {file = "aiohttp-3.13.2-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:ed2f9c7216e53c3df02264f25d824b079cc5914f9e2deba94155190ef648ee40"}, + {file = "aiohttp-3.13.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:99c5280a329d5fa18ef30fd10c793a190d996567667908bef8a7f81f8202b948"}, + {file = "aiohttp-3.13.2-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:2ca6ffef405fc9c09a746cb5d019c1672cd7f402542e379afc66b370833170cf"}, + {file = "aiohttp-3.13.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:47f438b1a28e926c37632bff3c44df7d27c9b57aaf4e34b1def3c07111fdb782"}, + {file = "aiohttp-3.13.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9acda8604a57bb60544e4646a4615c1866ee6c04a8edef9b8ee6fd1d8fa2ddc8"}, + {file = "aiohttp-3.13.2-cp311-cp311-win32.whl", hash = "sha256:868e195e39b24aaa930b063c08bb0c17924899c16c672a28a65afded9c46c6ec"}, + {file = "aiohttp-3.13.2-cp311-cp311-win_amd64.whl", hash = "sha256:7fd19df530c292542636c2a9a85854fab93474396a52f1695e799186bbd7f24c"}, + {file = "aiohttp-3.13.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:b1e56bab2e12b2b9ed300218c351ee2a3d8c8fdab5b1ec6193e11a817767e47b"}, + {file = "aiohttp-3.13.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:364e25edaabd3d37b1db1f0cbcee8c73c9a3727bfa262b83e5e4cf3489a2a9dc"}, + {file = "aiohttp-3.13.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c5c94825f744694c4b8db20b71dba9a257cd2ba8e010a803042123f3a25d50d7"}, + {file = "aiohttp-3.13.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ba2715d842ffa787be87cbfce150d5e88c87a98e0b62e0f5aa489169a393dbbb"}, + {file = "aiohttp-3.13.2-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:585542825c4bc662221fb257889e011a5aa00f1ae4d75d1d246a5225289183e3"}, + {file = "aiohttp-3.13.2-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:39d02cb6025fe1aabca329c5632f48c9532a3dabccd859e7e2f110668972331f"}, + {file = 
"aiohttp-3.13.2-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:e67446b19e014d37342f7195f592a2a948141d15a312fe0e700c2fd2f03124f6"}, + {file = "aiohttp-3.13.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4356474ad6333e41ccefd39eae869ba15a6c5299c9c01dfdcfdd5c107be4363e"}, + {file = "aiohttp-3.13.2-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:eeacf451c99b4525f700f078becff32c32ec327b10dcf31306a8a52d78166de7"}, + {file = "aiohttp-3.13.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d8a9b889aeabd7a4e9af0b7f4ab5ad94d42e7ff679aaec6d0db21e3b639ad58d"}, + {file = "aiohttp-3.13.2-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:fa89cb11bc71a63b69568d5b8a25c3ca25b6d54c15f907ca1c130d72f320b76b"}, + {file = "aiohttp-3.13.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:8aa7c807df234f693fed0ecd507192fc97692e61fee5702cdc11155d2e5cadc8"}, + {file = "aiohttp-3.13.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:9eb3e33fdbe43f88c3c75fa608c25e7c47bbd80f48d012763cb67c47f39a7e16"}, + {file = "aiohttp-3.13.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9434bc0d80076138ea986833156c5a48c9c7a8abb0c96039ddbb4afc93184169"}, + {file = "aiohttp-3.13.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ff15c147b2ad66da1f2cbb0622313f2242d8e6e8f9b79b5206c84523a4473248"}, + {file = "aiohttp-3.13.2-cp312-cp312-win32.whl", hash = "sha256:27e569eb9d9e95dbd55c0fc3ec3a9335defbf1d8bc1d20171a49f3c4c607b93e"}, + {file = "aiohttp-3.13.2-cp312-cp312-win_amd64.whl", hash = "sha256:8709a0f05d59a71f33fd05c17fc11fcb8c30140506e13c2f5e8ee1b8964e1b45"}, + {file = "aiohttp-3.13.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:7519bdc7dfc1940d201651b52bf5e03f5503bda45ad6eacf64dda98be5b2b6be"}, + {file = "aiohttp-3.13.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:088912a78b4d4f547a1f19c099d5a506df17eacec3c6f4375e2831ec1d995742"}, + {file = "aiohttp-3.13.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5276807b9de9092af38ed23ce120539ab0ac955547b38563a9ba4f5b07b95293"}, + {file = "aiohttp-3.13.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1237c1375eaef0db4dcd7c2559f42e8af7b87ea7d295b118c60c36a6e61cb811"}, + {file = "aiohttp-3.13.2-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:96581619c57419c3d7d78703d5b78c1e5e5fc0172d60f555bdebaced82ded19a"}, + {file = "aiohttp-3.13.2-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a2713a95b47374169409d18103366de1050fe0ea73db358fc7a7acb2880422d4"}, + {file = "aiohttp-3.13.2-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:228a1cd556b3caca590e9511a89444925da87d35219a49ab5da0c36d2d943a6a"}, + {file = "aiohttp-3.13.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ac6cde5fba8d7d8c6ac963dbb0256a9854e9fafff52fbcc58fdf819357892c3e"}, + {file = "aiohttp-3.13.2-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f2bef8237544f4e42878c61cef4e2839fee6346dc60f5739f876a9c50be7fcdb"}, + {file = "aiohttp-3.13.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:16f15a4eac3bc2d76c45f7ebdd48a65d41b242eb6c31c2245463b40b34584ded"}, + {file = "aiohttp-3.13.2-cp313-cp313-musllinux_1_2_armv7l.whl", hash = 
"sha256:bb7fb776645af5cc58ab804c58d7eba545a97e047254a52ce89c157b5af6cd0b"}, + {file = "aiohttp-3.13.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:e1b4951125ec10c70802f2cb09736c895861cd39fd9dcb35107b4dc8ae6220b8"}, + {file = "aiohttp-3.13.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:550bf765101ae721ee1d37d8095f47b1f220650f85fe1af37a90ce75bab89d04"}, + {file = "aiohttp-3.13.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:fe91b87fc295973096251e2d25a811388e7d8adf3bd2b97ef6ae78bc4ac6c476"}, + {file = "aiohttp-3.13.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e0c8e31cfcc4592cb200160344b2fb6ae0f9e4effe06c644b5a125d4ae5ebe23"}, + {file = "aiohttp-3.13.2-cp313-cp313-win32.whl", hash = "sha256:0740f31a60848d6edb296a0df827473eede90c689b8f9f2a4cdde74889eb2254"}, + {file = "aiohttp-3.13.2-cp313-cp313-win_amd64.whl", hash = "sha256:a88d13e7ca367394908f8a276b89d04a3652044612b9a408a0bb22a5ed976a1a"}, + {file = "aiohttp-3.13.2-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:2475391c29230e063ef53a66669b7b691c9bfc3f1426a0f7bcdf1216bdbac38b"}, + {file = "aiohttp-3.13.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:f33c8748abef4d8717bb20e8fb1b3e07c6adacb7fd6beaae971a764cf5f30d61"}, + {file = "aiohttp-3.13.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:ae32f24bbfb7dbb485a24b30b1149e2f200be94777232aeadba3eecece4d0aa4"}, + {file = "aiohttp-3.13.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d7f02042c1f009ffb70067326ef183a047425bb2ff3bc434ead4dd4a4a66a2b"}, + {file = "aiohttp-3.13.2-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:93655083005d71cd6c072cdab54c886e6570ad2c4592139c3fb967bfc19e4694"}, + {file = "aiohttp-3.13.2-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:0db1e24b852f5f664cd728db140cf11ea0e82450471232a394b3d1a540b0f906"}, + {file = "aiohttp-3.13.2-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b009194665bcd128e23eaddef362e745601afa4641930848af4c8559e88f18f9"}, + {file = "aiohttp-3.13.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c038a8fdc8103cd51dbd986ecdce141473ffd9775a7a8057a6ed9c3653478011"}, + {file = "aiohttp-3.13.2-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:66bac29b95a00db411cd758fea0e4b9bdba6d549dfe333f9a945430f5f2cc5a6"}, + {file = "aiohttp-3.13.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:4ebf9cfc9ba24a74cf0718f04aac2a3bbe745902cc7c5ebc55c0f3b5777ef213"}, + {file = "aiohttp-3.13.2-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:a4b88ebe35ce54205c7074f7302bd08a4cb83256a3e0870c72d6f68a3aaf8e49"}, + {file = "aiohttp-3.13.2-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:98c4fb90bb82b70a4ed79ca35f656f4281885be076f3f970ce315402b53099ae"}, + {file = "aiohttp-3.13.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:ec7534e63ae0f3759df3a1ed4fa6bc8f75082a924b590619c0dd2f76d7043caa"}, + {file = "aiohttp-3.13.2-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:5b927cf9b935a13e33644cbed6c8c4b2d0f25b713d838743f8fe7191b33829c4"}, + {file = "aiohttp-3.13.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:88d6c017966a78c5265d996c19cdb79235be5e6412268d7e2ce7dee339471b7a"}, + {file = "aiohttp-3.13.2-cp314-cp314-win32.whl", hash = "sha256:f7c183e786e299b5d6c49fb43a769f8eb8e04a2726a2bd5887b98b5cc2d67940"}, + {file = 
"aiohttp-3.13.2-cp314-cp314-win_amd64.whl", hash = "sha256:fe242cd381e0fb65758faf5ad96c2e460df6ee5b2de1072fe97e4127927e00b4"}, + {file = "aiohttp-3.13.2-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:f10d9c0b0188fe85398c61147bbd2a657d616c876863bfeff43376e0e3134673"}, + {file = "aiohttp-3.13.2-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:e7c952aefdf2460f4ae55c5e9c3e80aa72f706a6317e06020f80e96253b1accd"}, + {file = "aiohttp-3.13.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c20423ce14771d98353d2e25e83591fa75dfa90a3c1848f3d7c68243b4fbded3"}, + {file = "aiohttp-3.13.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e96eb1a34396e9430c19d8338d2ec33015e4a87ef2b4449db94c22412e25ccdf"}, + {file = "aiohttp-3.13.2-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:23fb0783bc1a33640036465019d3bba069942616a6a2353c6907d7fe1ccdaf4e"}, + {file = "aiohttp-3.13.2-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2e1a9bea6244a1d05a4e57c295d69e159a5c50d8ef16aa390948ee873478d9a5"}, + {file = "aiohttp-3.13.2-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0a3d54e822688b56e9f6b5816fb3de3a3a64660efac64e4c2dc435230ad23bad"}, + {file = "aiohttp-3.13.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7a653d872afe9f33497215745da7a943d1dc15b728a9c8da1c3ac423af35178e"}, + {file = "aiohttp-3.13.2-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:56d36e80d2003fa3fc0207fac644216d8532e9504a785ef9a8fd013f84a42c61"}, + {file = "aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:78cd586d8331fb8e241c2dd6b2f4061778cc69e150514b39a9e28dd050475661"}, + {file = "aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:20b10bbfbff766294fe99987f7bb3b74fdd2f1a2905f2562132641ad434dcf98"}, + {file = "aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:9ec49dff7e2b3c85cdeaa412e9d438f0ecd71676fde61ec57027dd392f00c693"}, + {file = "aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:94f05348c4406450f9d73d38efb41d669ad6cd90c7ee194810d0eefbfa875a7a"}, + {file = "aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:fa4dcb605c6f82a80c7f95713c2b11c3b8e9893b3ebd2bc9bde93165ed6107be"}, + {file = "aiohttp-3.13.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:cf00e5db968c3f67eccd2778574cf64d8b27d95b237770aa32400bd7a1ca4f6c"}, + {file = "aiohttp-3.13.2-cp314-cp314t-win32.whl", hash = "sha256:d23b5fe492b0805a50d3371e8a728a9134d8de5447dce4c885f5587294750734"}, + {file = "aiohttp-3.13.2-cp314-cp314t-win_amd64.whl", hash = "sha256:ff0a7b0a82a7ab905cbda74006318d1b12e37c797eb1b0d4eb3e316cf47f658f"}, + {file = "aiohttp-3.13.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7fbdf5ad6084f1940ce88933de34b62358d0f4a0b6ec097362dcd3e5a65a4989"}, + {file = "aiohttp-3.13.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7c3a50345635a02db61792c85bb86daffac05330f6473d524f1a4e3ef9d0046d"}, + {file = "aiohttp-3.13.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0e87dff73f46e969af38ab3f7cb75316a7c944e2e574ff7c933bc01b10def7f5"}, + {file = "aiohttp-3.13.2-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2adebd4577724dcae085665f294cc57c8701ddd4d26140504db622b8d566d7aa"}, + {file = 
"aiohttp-3.13.2-cp39-cp39-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e036a3a645fe92309ec34b918394bb377950cbb43039a97edae6c08db64b23e2"}, + {file = "aiohttp-3.13.2-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:23ad365e30108c422d0b4428cf271156dd56790f6dd50d770b8e360e6c5ab2e6"}, + {file = "aiohttp-3.13.2-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:1f9b2c2d4b9d958b1f9ae0c984ec1dd6b6689e15c75045be8ccb4011426268ca"}, + {file = "aiohttp-3.13.2-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3a92cf4b9bea33e15ecbaa5c59921be0f23222608143d025c989924f7e3e0c07"}, + {file = "aiohttp-3.13.2-cp39-cp39-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:070599407f4954021509193404c4ac53153525a19531051661440644728ba9a7"}, + {file = "aiohttp-3.13.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:29562998ec66f988d49fb83c9b01694fa927186b781463f376c5845c121e4e0b"}, + {file = "aiohttp-3.13.2-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:4dd3db9d0f4ebca1d887d76f7cdbcd1116ac0d05a9221b9dad82c64a62578c4d"}, + {file = "aiohttp-3.13.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:d7bc4b7f9c4921eba72677cd9fedd2308f4a4ca3e12fab58935295ad9ea98700"}, + {file = "aiohttp-3.13.2-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:dacd50501cd017f8cccb328da0c90823511d70d24a323196826d923aad865901"}, + {file = "aiohttp-3.13.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:8b2f1414f6a1e0683f212ec80e813f4abef94c739fd090b66c9adf9d2a05feac"}, + {file = "aiohttp-3.13.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:04c3971421576ed24c191f610052bcb2f059e395bc2489dd99e397f9bc466329"}, + {file = "aiohttp-3.13.2-cp39-cp39-win32.whl", hash = "sha256:9f377d0a924e5cc94dc620bc6366fc3e889586a7f18b748901cf016c916e2084"}, + {file = "aiohttp-3.13.2-cp39-cp39-win_amd64.whl", hash = "sha256:9c705601e16c03466cb72011bd1af55d68fa65b045356d8f96c216e5f6db0fa5"}, + {file = "aiohttp-3.13.2.tar.gz", hash = "sha256:40176a52c186aefef6eb3cad2cdd30cd06e3afbe88fe8ab2af9c0b90f228daca"}, ] [package.dependencies] aiohappyeyeballs = ">=2.5.0" -aiosignal = ">=1.1.2" +aiosignal = ">=1.4.0" attrs = ">=17.3.0" frozenlist = ">=1.1.1" multidict = ">=4.5,<7.0" @@ -145,31 +183,34 @@ propcache = ">=0.2.0" yarl = ">=1.17.0,<2.0" [package.extras] -speedups = ["Brotli", "aiodns (>=3.3.0)", "brotlicffi"] +speedups = ["Brotli ; platform_python_implementation == \"CPython\"", "aiodns (>=3.3.0)", "backports.zstd ; platform_python_implementation == \"CPython\" and python_version < \"3.14\"", "brotlicffi ; platform_python_implementation != \"CPython\""] [[package]] name = "aiosignal" -version = "1.3.2" +version = "1.4.0" description = "aiosignal: a list of registered asynchronous callbacks" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5"}, - {file = "aiosignal-1.3.2.tar.gz", hash = "sha256:a8c255c66fafb1e499c9351d0bf32ff2d8a0321595ebac3b93713656d2436f54"}, + {file = "aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e"}, + {file = "aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7"}, ] [package.dependencies] frozenlist = ">=1.1.0" +typing-extensions = {version = ">=4.2", markers = 
"python_version < \"3.13\""} [[package]] name = "alembic" -version = "1.16.1" +version = "1.17.2" description = "A database migration tool for SQLAlchemy." optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" +groups = ["main"] files = [ - {file = "alembic-1.16.1-py3-none-any.whl", hash = "sha256:0cdd48acada30d93aa1035767d67dff25702f8de74d7c3919f2e8492c8db2e67"}, - {file = "alembic-1.16.1.tar.gz", hash = "sha256:43d37ba24b3d17bc1eb1024fe0f51cd1dc95aeb5464594a02c6bb9ca9864bfa4"}, + {file = "alembic-1.17.2-py3-none-any.whl", hash = "sha256:f483dd1fe93f6c5d49217055e4d15b905b425b6af906746abb35b69c1996c4e6"}, + {file = "alembic-1.17.2.tar.gz", hash = "sha256:bbe9751705c5e0f14877f02d46c53d10885e377e3d90eda810a016f9baa19e8e"}, ] [package.dependencies] @@ -186,6 +227,7 @@ version = "0.7.0" description = "Reusable constraint types to use with typing.Annotated" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53"}, {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, @@ -193,13 +235,14 @@ files = [ [[package]] name = "anyio" -version = "4.9.0" -description = "High level compatibility layer for multiple asynchronous event loop implementations" +version = "4.11.0" +description = "High-level concurrency and networking framework on top of asyncio or Trio" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ - {file = "anyio-4.9.0-py3-none-any.whl", hash = "sha256:9f76d541cad6e36af7beb62e978876f3b41e3e04f2c1fbf0884604c0a9c4d93c"}, - {file = "anyio-4.9.0.tar.gz", hash = "sha256:673c0c244e15788651a4ff38710fea9675823028a6f08a5eda409e0c9840a028"}, + {file = "anyio-4.11.0-py3-none-any.whl", hash = "sha256:0287e96f4d26d4149305414d4e3bc32f0dcd0862365a4bddea19d7a1ec38c4fc"}, + {file = "anyio-4.11.0.tar.gz", hash = "sha256:82a8d0b81e318cc5ce71a5f1f8b5c4e63619620b63141ef8c995fa0db95a57c4"}, ] [package.dependencies] @@ -208,9 +251,7 @@ sniffio = ">=1.1" typing_extensions = {version = ">=4.5", markers = "python_version < \"3.13\""} [package.extras] -doc = ["Sphinx (>=8.2,<9.0)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx_rtd_theme"] -test = ["anyio[trio]", "blockbuster (>=1.5.23)", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "trustme", "truststore (>=0.9.1)", "uvloop (>=0.21)"] -trio = ["trio (>=0.26.1)"] +trio = ["trio (>=0.31.0)"] [[package]] name = "appnope" @@ -218,6 +259,8 @@ version = "0.1.4" description = "Disable App Nap on macOS >= 10.9" optional = false python-versions = ">=3.6" +groups = ["dev"] +markers = "platform_system == \"Darwin\"" files = [ {file = "appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c"}, {file = "appnope-0.1.4.tar.gz", hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee"}, @@ -229,6 +272,7 @@ version = "25.1.0" description = "Argon2 for Python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "argon2_cffi-25.1.0-py3-none-any.whl", hash = "sha256:fdc8b074db390fccb6eb4a3604ae7231f219aa669a2652e0f20e16ba513d5741"}, {file = "argon2_cffi-25.1.0.tar.gz", hash = "sha256:694ae5cc8a42f4c4e2bf2ca0e64e51e23a040c6a517a85074683d3959e1346c1"}, @@ -239,74 +283,78 @@ argon2-cffi-bindings = "*" [[package]] name = "argon2-cffi-bindings" 
-version = "21.2.0" +version = "25.1.0" description = "Low-level CFFI bindings for Argon2" optional = false -python-versions = ">=3.6" -files = [ - {file = "argon2-cffi-bindings-21.2.0.tar.gz", hash = "sha256:bb89ceffa6c791807d1305ceb77dbfacc5aa499891d2c55661c6459651fc39e3"}, - {file = "argon2_cffi_bindings-21.2.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ccb949252cb2ab3a08c02024acb77cfb179492d5701c7cbdbfd776124d4d2367"}, - {file = "argon2_cffi_bindings-21.2.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9524464572e12979364b7d600abf96181d3541da11e23ddf565a32e70bd4dc0d"}, - {file = "argon2_cffi_bindings-21.2.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b746dba803a79238e925d9046a63aa26bf86ab2a2fe74ce6b009a1c3f5c8f2ae"}, - {file = "argon2_cffi_bindings-21.2.0-cp36-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:58ed19212051f49a523abb1dbe954337dc82d947fb6e5a0da60f7c8471a8476c"}, - {file = "argon2_cffi_bindings-21.2.0-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:bd46088725ef7f58b5a1ef7ca06647ebaf0eb4baff7d1d0d177c6cc8744abd86"}, - {file = "argon2_cffi_bindings-21.2.0-cp36-abi3-musllinux_1_1_i686.whl", hash = "sha256:8cd69c07dd875537a824deec19f978e0f2078fdda07fd5c42ac29668dda5f40f"}, - {file = "argon2_cffi_bindings-21.2.0-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:f1152ac548bd5b8bcecfb0b0371f082037e47128653df2e8ba6e914d384f3c3e"}, - {file = "argon2_cffi_bindings-21.2.0-cp36-abi3-win32.whl", hash = "sha256:603ca0aba86b1349b147cab91ae970c63118a0f30444d4bc80355937c950c082"}, - {file = "argon2_cffi_bindings-21.2.0-cp36-abi3-win_amd64.whl", hash = "sha256:b2ef1c30440dbbcba7a5dc3e319408b59676e2e039e2ae11a8775ecf482b192f"}, - {file = "argon2_cffi_bindings-21.2.0-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:e415e3f62c8d124ee16018e491a009937f8cf7ebf5eb430ffc5de21b900dad93"}, - {file = "argon2_cffi_bindings-21.2.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:3e385d1c39c520c08b53d63300c3ecc28622f076f4c2b0e6d7e796e9f6502194"}, - {file = "argon2_cffi_bindings-21.2.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c3e3cc67fdb7d82c4718f19b4e7a87123caf8a93fde7e23cf66ac0337d3cb3f"}, - {file = "argon2_cffi_bindings-21.2.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6a22ad9800121b71099d0fb0a65323810a15f2e292f2ba450810a7316e128ee5"}, - {file = "argon2_cffi_bindings-21.2.0-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f9f8b450ed0547e3d473fdc8612083fd08dd2120d6ac8f73828df9b7d45bb351"}, - {file = "argon2_cffi_bindings-21.2.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:93f9bf70084f97245ba10ee36575f0c3f1e7d7724d67d8e5b08e61787c320ed7"}, - {file = "argon2_cffi_bindings-21.2.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:3b9ef65804859d335dc6b31582cad2c5166f0c3e7975f324d9ffaa34ee7e6583"}, - {file = "argon2_cffi_bindings-21.2.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d4966ef5848d820776f5f562a7d45fdd70c2f330c961d0d745b784034bd9f48d"}, - {file = "argon2_cffi_bindings-21.2.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20ef543a89dee4db46a1a6e206cd015360e5a75822f76df533845c3cbaf72670"}, - {file = "argon2_cffi_bindings-21.2.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:ed2937d286e2ad0cc79a7087d3c272832865f779430e0cc2b4f3718d3159b0cb"}, - {file = "argon2_cffi_bindings-21.2.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:5e00316dabdaea0b2dd82d141cc66889ced0cdcbfa599e8b471cf22c620c329a"}, +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "argon2_cffi_bindings-25.1.0-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:3d3f05610594151994ca9ccb3c771115bdb4daef161976a266f0dd8aa9996b8f"}, + {file = "argon2_cffi_bindings-25.1.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:8b8efee945193e667a396cbc7b4fb7d357297d6234d30a489905d96caabde56b"}, + {file = "argon2_cffi_bindings-25.1.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:3c6702abc36bf3ccba3f802b799505def420a1b7039862014a65db3205967f5a"}, + {file = "argon2_cffi_bindings-25.1.0-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a1c70058c6ab1e352304ac7e3b52554daadacd8d453c1752e547c76e9c99ac44"}, + {file = "argon2_cffi_bindings-25.1.0-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e2fd3bfbff3c5d74fef31a722f729bf93500910db650c925c2d6ef879a7e51cb"}, + {file = "argon2_cffi_bindings-25.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c4f9665de60b1b0e99bcd6be4f17d90339698ce954cfd8d9cf4f91c995165a92"}, + {file = "argon2_cffi_bindings-25.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:ba92837e4a9aa6a508c8d2d7883ed5a8f6c308c89a4790e1e447a220deb79a85"}, + {file = "argon2_cffi_bindings-25.1.0-cp314-cp314t-win32.whl", hash = "sha256:84a461d4d84ae1295871329b346a97f68eade8c53b6ed9a7ca2d7467f3c8ff6f"}, + {file = "argon2_cffi_bindings-25.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b55aec3565b65f56455eebc9b9f34130440404f27fe21c3b375bf1ea4d8fbae6"}, + {file = "argon2_cffi_bindings-25.1.0-cp314-cp314t-win_arm64.whl", hash = "sha256:87c33a52407e4c41f3b70a9c2d3f6056d88b10dad7695be708c5021673f55623"}, + {file = "argon2_cffi_bindings-25.1.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:aecba1723ae35330a008418a91ea6cfcedf6d31e5fbaa056a166462ff066d500"}, + {file = "argon2_cffi_bindings-25.1.0-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2630b6240b495dfab90aebe159ff784d08ea999aa4b0d17efa734055a07d2f44"}, + {file = "argon2_cffi_bindings-25.1.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:7aef0c91e2c0fbca6fc68e7555aa60ef7008a739cbe045541e438373bc54d2b0"}, + {file = "argon2_cffi_bindings-25.1.0-cp39-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1e021e87faa76ae0d413b619fe2b65ab9a037f24c60a1e6cc43457ae20de6dc6"}, + {file = "argon2_cffi_bindings-25.1.0-cp39-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d3e924cfc503018a714f94a49a149fdc0b644eaead5d1f089330399134fa028a"}, + {file = "argon2_cffi_bindings-25.1.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:c87b72589133f0346a1cb8d5ecca4b933e3c9b64656c9d175270a000e73b288d"}, + {file = "argon2_cffi_bindings-25.1.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1db89609c06afa1a214a69a462ea741cf735b29a57530478c06eb81dd403de99"}, + {file = "argon2_cffi_bindings-25.1.0-cp39-abi3-win32.whl", hash = "sha256:473bcb5f82924b1becbb637b63303ec8d10e84c8d241119419897a26116515d2"}, + {file = "argon2_cffi_bindings-25.1.0-cp39-abi3-win_amd64.whl", hash = "sha256:a98cd7d17e9f7ce244c0803cad3c23a7d379c301ba618a5fa76a67d116618b98"}, + {file = "argon2_cffi_bindings-25.1.0-cp39-abi3-win_arm64.whl", hash = "sha256:b0fdbcf513833809c882823f98dc2f931cf659d9a1429616ac3adebb49f5db94"}, + {file = 
"argon2_cffi_bindings-25.1.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:6dca33a9859abf613e22733131fc9194091c1fa7cb3e131c143056b4856aa47e"}, + {file = "argon2_cffi_bindings-25.1.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:21378b40e1b8d1655dd5310c84a40fc19a9aa5e6366e835ceb8576bf0fea716d"}, + {file = "argon2_cffi_bindings-25.1.0-pp310-pypy310_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d588dec224e2a83edbdc785a5e6f3c6cd736f46bfd4b441bbb5aa1f5085e584"}, + {file = "argon2_cffi_bindings-25.1.0-pp310-pypy310_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5acb4e41090d53f17ca1110c3427f0a130f944b896fc8c83973219c97f57b690"}, + {file = "argon2_cffi_bindings-25.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:da0c79c23a63723aa5d782250fbf51b768abca630285262fb5144ba5ae01e520"}, + {file = "argon2_cffi_bindings-25.1.0.tar.gz", hash = "sha256:b957f3e6ea4d55d820e40ff76f450952807013d361a65d7f28acc0acbf29229d"}, ] [package.dependencies] -cffi = ">=1.0.1" - -[package.extras] -dev = ["cogapp", "pre-commit", "pytest", "wheel"] -tests = ["pytest"] +cffi = {version = ">=1.0.1", markers = "python_version < \"3.14\""} [[package]] name = "arrow" -version = "1.3.0" +version = "1.4.0" description = "Better dates & times for Python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ - {file = "arrow-1.3.0-py3-none-any.whl", hash = "sha256:c728b120ebc00eb84e01882a6f5e7927a53960aa990ce7dd2b10f39005a67f80"}, - {file = "arrow-1.3.0.tar.gz", hash = "sha256:d4540617648cb5f895730f1ad8c82a65f2dad0166f57b75f3ca54759c4d67a85"}, + {file = "arrow-1.4.0-py3-none-any.whl", hash = "sha256:749f0769958ebdc79c173ff0b0670d59051a535fa26e8eba02953dc19eb43205"}, + {file = "arrow-1.4.0.tar.gz", hash = "sha256:ed0cc050e98001b8779e84d461b0098c4ac597e88704a655582b21d116e526d7"}, ] [package.dependencies] python-dateutil = ">=2.7.0" -types-python-dateutil = ">=2.8.10" +tzdata = {version = "*", markers = "python_version >= \"3.9\""} [package.extras] doc = ["doc8", "sphinx (>=7.0.0)", "sphinx-autobuild", "sphinx-autodoc-typehints", "sphinx_rtd_theme (>=1.3.0)"] -test = ["dateparser (==1.*)", "pre-commit", "pytest", "pytest-cov", "pytest-mock", "pytz (==2021.1)", "simplejson (==3.*)"] +test = ["dateparser (==1.*)", "pre-commit", "pytest", "pytest-cov", "pytest-mock", "pytz (==2025.2)", "simplejson (==3.*)"] [[package]] name = "asttokens" -version = "3.0.0" +version = "3.0.1" description = "Annotate AST trees with source code positions" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ - {file = "asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2"}, - {file = "asttokens-3.0.0.tar.gz", hash = "sha256:0dcd8baa8d62b0c1d118b399b2ddba3c4aff271d0d7a9e0d4c1681c79035bbc7"}, + {file = "asttokens-3.0.1-py3-none-any.whl", hash = "sha256:15a3ebc0f43c2d0a50eeafea25e19046c68398e487b9f1f5b517f7c0f40f976a"}, + {file = "asttokens-3.0.1.tar.gz", hash = "sha256:71a4ee5de0bde6a31d64f6b13f2293ac190344478f081c3d1bccfcf5eacb0cb7"}, ] [package.extras] -astroid = ["astroid (>=2,<4)"] -test = ["astroid (>=2,<4)", "pytest", "pytest-cov", "pytest-xdist"] +astroid = ["astroid (>=2,<5)"] +test = ["astroid (>=2,<5)", "pytest (<9.0)", "pytest-cov", "pytest-xdist"] [[package]] name = "async-lru" @@ -314,6 +362,7 @@ version = "2.0.5" description = "Simple LRU cache for asyncio" optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ {file = "async_lru-2.0.5-py3-none-any.whl", hash 
= "sha256:ab95404d8d2605310d345932697371a5f40def0487c03d6d0ad9138de52c9943"}, {file = "async_lru-2.0.5.tar.gz", hash = "sha256:481d52ccdd27275f42c43a928b4a50c3bfb2d67af4e78b170e3e0bb39c66e5bb"}, @@ -321,46 +370,41 @@ files = [ [[package]] name = "attrs" -version = "25.3.0" +version = "25.4.0" description = "Classes Without Boilerplate" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" +groups = ["main", "dev"] files = [ - {file = "attrs-25.3.0-py3-none-any.whl", hash = "sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3"}, - {file = "attrs-25.3.0.tar.gz", hash = "sha256:75d7cefc7fb576747b2c81b4442d4d4a1ce0900973527c011d1030fd3bf4af1b"}, + {file = "attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373"}, + {file = "attrs-25.4.0.tar.gz", hash = "sha256:16d5969b87f0859ef33a48b35d55ac1be6e42ae49d5e853b597db70c35c57e11"}, ] -[package.extras] -benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit-uv", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier"] -tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] - [[package]] name = "babel" version = "2.17.0" description = "Internationalization utilities" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2"}, {file = "babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d"}, ] [package.extras] -dev = ["backports.zoneinfo", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata"] +dev = ["backports.zoneinfo ; python_version < \"3.9\"", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata ; sys_platform == \"win32\""] [[package]] name = "bandit" -version = "1.8.3" +version = "1.8.6" description = "Security oriented static analyser for python code." 
optional = false python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "bandit-1.8.3-py3-none-any.whl", hash = "sha256:28f04dc0d258e1dd0f99dee8eefa13d1cb5e3fde1a5ab0c523971f97b289bcd8"}, - {file = "bandit-1.8.3.tar.gz", hash = "sha256:f5847beb654d309422985c36644649924e0ea4425c76dec2e89110b87506193a"}, + {file = "bandit-1.8.6-py3-none-any.whl", hash = "sha256:3348e934d736fcdb68b6aa4030487097e23a501adf3e7827b63658df464dddd0"}, + {file = "bandit-1.8.6.tar.gz", hash = "sha256:dbfe9c25fc6961c2078593de55fd19f2559f9e45b99f1272341f5b95dea4e56b"}, ] [package.dependencies] @@ -373,18 +417,19 @@ stevedore = ">=1.20.0" baseline = ["GitPython (>=3.1.30)"] sarif = ["jschema-to-python (>=1.2.3)", "sarif-om (>=1.0.4)"] test = ["beautifulsoup4 (>=4.8.0)", "coverage (>=4.5.4)", "fixtures (>=3.0.0)", "flake8 (>=4.0.0)", "pylint (==1.9.4)", "stestr (>=2.5.0)", "testscenarios (>=0.5.0)", "testtools (>=2.3.0)"] -toml = ["tomli (>=1.1.0)"] +toml = ["tomli (>=1.1.0) ; python_version < \"3.11\""] yaml = ["PyYAML"] [[package]] name = "beautifulsoup4" -version = "4.13.4" +version = "4.14.2" description = "Screen-scraping library" optional = false python-versions = ">=3.7.0" +groups = ["dev"] files = [ - {file = "beautifulsoup4-4.13.4-py3-none-any.whl", hash = "sha256:9bbbb14bfde9d79f38b8cd5f8c7c85f4b8f2523190ebed90e950a8dea4cb1c4b"}, - {file = "beautifulsoup4-4.13.4.tar.gz", hash = "sha256:dbb3c4e1ceae6aefebdaf2423247260cd062430a410e38c66f2baa50a8437195"}, + {file = "beautifulsoup4-4.14.2-py3-none-any.whl", hash = "sha256:5ef6fa3a8cbece8488d66985560f97ed091e22bbc4e9c2338508a9d5de6d4515"}, + {file = "beautifulsoup4-4.14.2.tar.gz", hash = "sha256:2a98ab9f944a11acee9cc848508ec28d9228abfd522ef0fad6a02a72e0ded69e"}, ] [package.dependencies] @@ -404,6 +449,7 @@ version = "24.10.0" description = "The uncompromising code formatter." optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "black-24.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e6668650ea4b685440857138e5fe40cde4d652633b1bdffc62933d0db4ed9812"}, {file = "black-24.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1c536fcf674217e87b8cc3657b81809d3c085d7bf3ef262ead700da345bfa6ea"}, @@ -444,13 +490,14 @@ uvloop = ["uvloop (>=0.15.2)"] [[package]] name = "bleach" -version = "6.2.0" +version = "6.3.0" description = "An easy safelist-based HTML-sanitizing tool." 
optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" +groups = ["dev"] files = [ - {file = "bleach-6.2.0-py3-none-any.whl", hash = "sha256:117d9c6097a7c3d22fd578fcd8d35ff1e125df6736f554da4e432fdd63f31e5e"}, - {file = "bleach-6.2.0.tar.gz", hash = "sha256:123e894118b8a599fd80d3ec1a6d4cc7ce4e5882b1317a7e1ba69b56e95f991f"}, + {file = "bleach-6.3.0-py3-none-any.whl", hash = "sha256:fe10ec77c93ddf3d13a73b035abaac7a9f5e436513864ccdad516693213c65d6"}, + {file = "bleach-6.3.0.tar.gz", hash = "sha256:6f3b91b1c0a02bb9a78b5a454c92506aa0fdf197e1d5e114d2e00c6f64306d22"}, ] [package.dependencies] @@ -466,6 +513,7 @@ version = "1.9.0" description = "Fast, simple object-to-object and broadcast signaling" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc"}, {file = "blinker-1.9.0.tar.gz", hash = "sha256:b4ce2265a7abece45e7cc896e98dbebe6cead56bcf805a3d23136d145f5445bf"}, @@ -473,104 +521,125 @@ files = [ [[package]] name = "cachetools" -version = "5.5.2" +version = "6.2.2" description = "Extensible memoizing collections and decorators" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" +groups = ["main", "dev"] files = [ - {file = "cachetools-5.5.2-py3-none-any.whl", hash = "sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a"}, - {file = "cachetools-5.5.2.tar.gz", hash = "sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4"}, + {file = "cachetools-6.2.2-py3-none-any.whl", hash = "sha256:6c09c98183bf58560c97b2abfcedcbaf6a896a490f534b031b661d3723b45ace"}, + {file = "cachetools-6.2.2.tar.gz", hash = "sha256:8e6d266b25e539df852251cfd6f990b4bc3a141db73b939058d809ebd2590fc6"}, ] [[package]] name = "certifi" -version = "2025.4.26" +version = "2025.11.12" description = "Python package for providing Mozilla's CA Bundle." optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" +groups = ["main", "dev"] files = [ - {file = "certifi-2025.4.26-py3-none-any.whl", hash = "sha256:30350364dfe371162649852c63336a15c70c6510c2ad5015b21c2345311805f3"}, - {file = "certifi-2025.4.26.tar.gz", hash = "sha256:0a816057ea3cdefcef70270d2c515e4506bbc954f417fa5ade2021213bb8f0c6"}, + {file = "certifi-2025.11.12-py3-none-any.whl", hash = "sha256:97de8790030bbd5c2d96b7ec782fc2f7820ef8dba6db909ccf95449f2d062d4b"}, + {file = "certifi-2025.11.12.tar.gz", hash = "sha256:d8ab5478f2ecd78af242878415affce761ca6bc54a22a27e026d7c25357c3316"}, ] [[package]] name = "cffi" -version = "1.17.1" +version = "2.0.0" description = "Foreign Function Interface for Python calling C code." 
optional = false -python-versions = ">=3.8" -files = [ - {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"}, - {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"}, - {file = "cffi-1.17.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edae79245293e15384b51f88b00613ba9f7198016a5948b5dddf4917d4d26382"}, - {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45398b671ac6d70e67da8e4224a065cec6a93541bb7aebe1b198a61b58c7b702"}, - {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ad9413ccdeda48c5afdae7e4fa2192157e991ff761e7ab8fdd8926f40b160cc3"}, - {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5da5719280082ac6bd9aa7becb3938dc9f9cbd57fac7d2871717b1feb0902ab6"}, - {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bb1a08b8008b281856e5971307cc386a8e9c5b625ac297e853d36da6efe9c17"}, - {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:045d61c734659cc045141be4bae381a41d89b741f795af1dd018bfb532fd0df8"}, - {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6883e737d7d9e4899a8a695e00ec36bd4e5e4f18fabe0aca0efe0a4b44cdb13e"}, - {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6b8b4a92e1c65048ff98cfe1f735ef8f1ceb72e3d5f0c25fdb12087a23da22be"}, - {file = "cffi-1.17.1-cp310-cp310-win32.whl", hash = "sha256:c9c3d058ebabb74db66e431095118094d06abf53284d9c81f27300d0e0d8bc7c"}, - {file = "cffi-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:0f048dcf80db46f0098ccac01132761580d28e28bc0f78ae0d58048063317e15"}, - {file = "cffi-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a45e3c6913c5b87b3ff120dcdc03f6131fa0065027d0ed7ee6190736a74cd401"}, - {file = "cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30c5e0cb5ae493c04c8b42916e52ca38079f1b235c2f8ae5f4527b963c401caf"}, - {file = "cffi-1.17.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f75c7ab1f9e4aca5414ed4d8e5c0e303a34f4421f8a0d47a4d019ceff0ab6af4"}, - {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1ed2dd2972641495a3ec98445e09766f077aee98a1c896dcb4ad0d303628e41"}, - {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:46bf43160c1a35f7ec506d254e5c890f3c03648a4dbac12d624e4490a7046cd1"}, - {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a24ed04c8ffd54b0729c07cee15a81d964e6fee0e3d4d342a27b020d22959dc6"}, - {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:610faea79c43e44c71e1ec53a554553fa22321b65fae24889706c0a84d4ad86d"}, - {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a9b15d491f3ad5d692e11f6b71f7857e7835eb677955c00cc0aefcd0669adaf6"}, - {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:de2ea4b5833625383e464549fec1bc395c1bdeeb5f25c4a3a82b5a8c756ec22f"}, - {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fc48c783f9c87e60831201f2cce7f3b2e4846bf4d8728eabe54d60700b318a0b"}, - {file = "cffi-1.17.1-cp311-cp311-win32.whl", hash = 
"sha256:85a950a4ac9c359340d5963966e3e0a94a676bd6245a4b55bc43949eee26a655"}, - {file = "cffi-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:caaf0640ef5f5517f49bc275eca1406b0ffa6aa184892812030f04c2abf589a0"}, - {file = "cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4"}, - {file = "cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c"}, - {file = "cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36"}, - {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5"}, - {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff"}, - {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99"}, - {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93"}, - {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3"}, - {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8"}, - {file = "cffi-1.17.1-cp312-cp312-win32.whl", hash = "sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65"}, - {file = "cffi-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903"}, - {file = "cffi-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e"}, - {file = "cffi-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2"}, - {file = "cffi-1.17.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3"}, - {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683"}, - {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5"}, - {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4"}, - {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd"}, - {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed"}, - {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9"}, - {file = "cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d"}, - {file = "cffi-1.17.1-cp313-cp313-win_amd64.whl", 
hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a"}, - {file = "cffi-1.17.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:636062ea65bd0195bc012fea9321aca499c0504409f413dc88af450b57ffd03b"}, - {file = "cffi-1.17.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7eac2ef9b63c79431bc4b25f1cd649d7f061a28808cbc6c47b534bd789ef964"}, - {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e221cf152cff04059d011ee126477f0d9588303eb57e88923578ace7baad17f9"}, - {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31000ec67d4221a71bd3f67df918b1f88f676f1c3b535a7eb473255fdc0b83fc"}, - {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f17be4345073b0a7b8ea599688f692ac3ef23ce28e5df79c04de519dbc4912c"}, - {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2b1fac190ae3ebfe37b979cc1ce69c81f4e4fe5746bb401dca63a9062cdaf1"}, - {file = "cffi-1.17.1-cp38-cp38-win32.whl", hash = "sha256:7596d6620d3fa590f677e9ee430df2958d2d6d6de2feeae5b20e82c00b76fbf8"}, - {file = "cffi-1.17.1-cp38-cp38-win_amd64.whl", hash = "sha256:78122be759c3f8a014ce010908ae03364d00a1f81ab5c7f4a7a5120607ea56e1"}, - {file = "cffi-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16"}, - {file = "cffi-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36"}, - {file = "cffi-1.17.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8"}, - {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca74b8dbe6e8e8263c0ffd60277de77dcee6c837a3d0881d8c1ead7268c9e576"}, - {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7f5baafcc48261359e14bcd6d9bff6d4b28d9103847c9e136694cb0501aef87"}, - {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98e3969bcff97cae1b2def8ba499ea3d6f31ddfdb7635374834cf89a1a08ecf0"}, - {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdf5ce3acdfd1661132f2a9c19cac174758dc2352bfe37d98aa7512c6b7178b3"}, - {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9755e4345d1ec879e3849e62222a18c7174d65a6a92d5b346b1863912168b595"}, - {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f1e22e8c4419538cb197e4dd60acc919d7696e5ef98ee4da4e01d3f8cfa4cc5a"}, - {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c03e868a0b3bc35839ba98e74211ed2b05d2119be4e8a0f224fba9384f1fe02e"}, - {file = "cffi-1.17.1-cp39-cp39-win32.whl", hash = "sha256:e31ae45bc2e29f6b2abd0de1cc3b9d5205aa847cafaecb8af1476a609a2f6eb7"}, - {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"}, - {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"}, -] +python-versions = ">=3.9" +groups = ["main", "dev"] +files = [ + {file = "cffi-2.0.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:0cf2d91ecc3fcc0625c2c530fe004f82c110405f101548512cce44322fa8ac44"}, + {file = 
"cffi-2.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f73b96c41e3b2adedc34a7356e64c8eb96e03a3782b535e043a986276ce12a49"}, + {file = "cffi-2.0.0-cp310-cp310-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:53f77cbe57044e88bbd5ed26ac1d0514d2acf0591dd6bb02a3ae37f76811b80c"}, + {file = "cffi-2.0.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3e837e369566884707ddaf85fc1744b47575005c0a229de3327f8f9a20f4efeb"}, + {file = "cffi-2.0.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:5eda85d6d1879e692d546a078b44251cdd08dd1cfb98dfb77b670c97cee49ea0"}, + {file = "cffi-2.0.0-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:9332088d75dc3241c702d852d4671613136d90fa6881da7d770a483fd05248b4"}, + {file = "cffi-2.0.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fc7de24befaeae77ba923797c7c87834c73648a05a4bde34b3b7e5588973a453"}, + {file = "cffi-2.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cf364028c016c03078a23b503f02058f1814320a56ad535686f90565636a9495"}, + {file = "cffi-2.0.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e11e82b744887154b182fd3e7e8512418446501191994dbf9c9fc1f32cc8efd5"}, + {file = "cffi-2.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8ea985900c5c95ce9db1745f7933eeef5d314f0565b27625d9a10ec9881e1bfb"}, + {file = "cffi-2.0.0-cp310-cp310-win32.whl", hash = "sha256:1f72fb8906754ac8a2cc3f9f5aaa298070652a0ffae577e0ea9bd480dc3c931a"}, + {file = "cffi-2.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:b18a3ed7d5b3bd8d9ef7a8cb226502c6bf8308df1525e1cc676c3680e7176739"}, + {file = "cffi-2.0.0-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:b4c854ef3adc177950a8dfc81a86f5115d2abd545751a304c5bcf2c2c7283cfe"}, + {file = "cffi-2.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2de9a304e27f7596cd03d16f1b7c72219bd944e99cc52b84d0145aefb07cbd3c"}, + {file = "cffi-2.0.0-cp311-cp311-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:baf5215e0ab74c16e2dd324e8ec067ef59e41125d3eade2b863d294fd5035c92"}, + {file = "cffi-2.0.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:730cacb21e1bdff3ce90babf007d0a0917cc3e6492f336c2f0134101e0944f93"}, + {file = "cffi-2.0.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:6824f87845e3396029f3820c206e459ccc91760e8fa24422f8b0c3d1731cbec5"}, + {file = "cffi-2.0.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:9de40a7b0323d889cf8d23d1ef214f565ab154443c42737dfe52ff82cf857664"}, + {file = "cffi-2.0.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8941aaadaf67246224cee8c3803777eed332a19d909b47e29c9842ef1e79ac26"}, + {file = "cffi-2.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a05d0c237b3349096d3981b727493e22147f934b20f6f125a3eba8f994bec4a9"}, + {file = "cffi-2.0.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:94698a9c5f91f9d138526b48fe26a199609544591f859c870d477351dc7b2414"}, + {file = "cffi-2.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5fed36fccc0612a53f1d4d9a816b50a36702c28a2aa880cb8a122b3466638743"}, + {file = "cffi-2.0.0-cp311-cp311-win32.whl", hash = "sha256:c649e3a33450ec82378822b3dad03cc228b8f5963c0c12fc3b1e0ab940f768a5"}, + {file = "cffi-2.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:66f011380d0e49ed280c789fbd08ff0d40968ee7b665575489afa95c98196ab5"}, + {file = 
"cffi-2.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:c6638687455baf640e37344fe26d37c404db8b80d037c3d29f58fe8d1c3b194d"}, + {file = "cffi-2.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d"}, + {file = "cffi-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c"}, + {file = "cffi-2.0.0-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe"}, + {file = "cffi-2.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b21e08af67b8a103c71a250401c78d5e0893beff75e28c53c98f4de42f774062"}, + {file = "cffi-2.0.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e"}, + {file = "cffi-2.0.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:81afed14892743bbe14dacb9e36d9e0e504cd204e0b165062c488942b9718037"}, + {file = "cffi-2.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba"}, + {file = "cffi-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94"}, + {file = "cffi-2.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187"}, + {file = "cffi-2.0.0-cp312-cp312-win32.whl", hash = "sha256:da902562c3e9c550df360bfa53c035b2f241fed6d9aef119048073680ace4a18"}, + {file = "cffi-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:da68248800ad6320861f129cd9c1bf96ca849a2771a59e0344e88681905916f5"}, + {file = "cffi-2.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6"}, + {file = "cffi-2.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb"}, + {file = "cffi-2.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca"}, + {file = "cffi-2.0.0-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b"}, + {file = "cffi-2.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b"}, + {file = "cffi-2.0.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f93fd8e5c8c0a4aa1f424d6173f14a892044054871c771f8566e4008eaa359d2"}, + {file = "cffi-2.0.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:dd4f05f54a52fb558f1ba9f528228066954fee3ebe629fc1660d874d040ae5a3"}, + {file = "cffi-2.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26"}, + {file = "cffi-2.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c"}, + {file = "cffi-2.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b"}, + {file = "cffi-2.0.0-cp313-cp313-win32.whl", hash = "sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27"}, + {file = 
"cffi-2.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75"}, + {file = "cffi-2.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91"}, + {file = "cffi-2.0.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fc33c5141b55ed366cfaad382df24fe7dcbc686de5be719b207bb248e3053dc5"}, + {file = "cffi-2.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c654de545946e0db659b3400168c9ad31b5d29593291482c43e3564effbcee13"}, + {file = "cffi-2.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b"}, + {file = "cffi-2.0.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c"}, + {file = "cffi-2.0.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef"}, + {file = "cffi-2.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775"}, + {file = "cffi-2.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205"}, + {file = "cffi-2.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1"}, + {file = "cffi-2.0.0-cp314-cp314-win32.whl", hash = "sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f"}, + {file = "cffi-2.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25"}, + {file = "cffi-2.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:dbd5c7a25a7cb98f5ca55d258b103a2054f859a46ae11aaf23134f9cc0d356ad"}, + {file = "cffi-2.0.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9"}, + {file = "cffi-2.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d"}, + {file = "cffi-2.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c"}, + {file = "cffi-2.0.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8"}, + {file = "cffi-2.0.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc"}, + {file = "cffi-2.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592"}, + {file = "cffi-2.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512"}, + {file = "cffi-2.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4"}, + {file = "cffi-2.0.0-cp314-cp314t-win32.whl", hash = "sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e"}, + {file = "cffi-2.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:d68b6cef7827e8641e8ef16f4494edda8b36104d79773a334beaa1e3521430f6"}, + {file = "cffi-2.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9"}, + {file = 
"cffi-2.0.0-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:fe562eb1a64e67dd297ccc4f5addea2501664954f2692b69a76449ec7913ecbf"}, + {file = "cffi-2.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:de8dad4425a6ca6e4e5e297b27b5c824ecc7581910bf9aee86cb6835e6812aa7"}, + {file = "cffi-2.0.0-cp39-cp39-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:4647afc2f90d1ddd33441e5b0e85b16b12ddec4fca55f0d9671fef036ecca27c"}, + {file = "cffi-2.0.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3f4d46d8b35698056ec29bca21546e1551a205058ae1a181d871e278b0b28165"}, + {file = "cffi-2.0.0-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:e6e73b9e02893c764e7e8d5bb5ce277f1a009cd5243f8228f75f842bf937c534"}, + {file = "cffi-2.0.0-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:cb527a79772e5ef98fb1d700678fe031e353e765d1ca2d409c92263c6d43e09f"}, + {file = "cffi-2.0.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:61d028e90346df14fedc3d1e5441df818d095f3b87d286825dfcbd6459b7ef63"}, + {file = "cffi-2.0.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0f6084a0ea23d05d20c3edcda20c3d006f9b6f3fefeac38f59262e10cef47ee2"}, + {file = "cffi-2.0.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1cd13c99ce269b3ed80b417dcd591415d3372bcac067009b6e0f59c7d4015e65"}, + {file = "cffi-2.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:89472c9762729b5ae1ad974b777416bfda4ac5642423fa93bd57a09204712322"}, + {file = "cffi-2.0.0-cp39-cp39-win32.whl", hash = "sha256:2081580ebb843f759b9f617314a24ed5738c51d2aee65d31e02f6f7a2b97707a"}, + {file = "cffi-2.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:b882b3df248017dba09d6b16defe9b5c407fe32fc7c65a9c69798e6175601be9"}, + {file = "cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529"}, +] +markers = {main = "platform_python_implementation != \"PyPy\""} [package.dependencies] -pycparser = "*" +pycparser = {version = "*", markers = "implementation_name != \"PyPy\""} [[package]] name = "cfgv" @@ -578,6 +647,7 @@ version = "3.4.0" description = "Validate configuration and produce human readable error messages." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"}, {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"}, @@ -589,6 +659,7 @@ version = "5.2.0" description = "Universal encoding detector for Python 3" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, @@ -596,114 +667,137 @@ files = [ [[package]] name = "charset-normalizer" -version = "3.4.2" +version = "3.4.4" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
optional = false python-versions = ">=3.7" -files = [ - {file = "charset_normalizer-3.4.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c48ed483eb946e6c04ccbe02c6b4d1d48e51944b6db70f697e089c193404941"}, - {file = "charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2d318c11350e10662026ad0eb71bb51c7812fc8590825304ae0bdd4ac283acd"}, - {file = "charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9cbfacf36cb0ec2897ce0ebc5d08ca44213af24265bd56eca54bee7923c48fd6"}, - {file = "charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18dd2e350387c87dabe711b86f83c9c78af772c748904d372ade190b5c7c9d4d"}, - {file = "charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8075c35cd58273fee266c58c0c9b670947c19df5fb98e7b66710e04ad4e9ff86"}, - {file = "charset_normalizer-3.4.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5bf4545e3b962767e5c06fe1738f951f77d27967cb2caa64c28be7c4563e162c"}, - {file = "charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:7a6ab32f7210554a96cd9e33abe3ddd86732beeafc7a28e9955cdf22ffadbab0"}, - {file = "charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:b33de11b92e9f75a2b545d6e9b6f37e398d86c3e9e9653c4864eb7e89c5773ef"}, - {file = "charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8755483f3c00d6c9a77f490c17e6ab0c8729e39e6390328e42521ef175380ae6"}, - {file = "charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:68a328e5f55ec37c57f19ebb1fdc56a248db2e3e9ad769919a58672958e8f366"}, - {file = "charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:21b2899062867b0e1fde9b724f8aecb1af14f2778d69aacd1a5a1853a597a5db"}, - {file = "charset_normalizer-3.4.2-cp310-cp310-win32.whl", hash = "sha256:e8082b26888e2f8b36a042a58307d5b917ef2b1cacab921ad3323ef91901c71a"}, - {file = "charset_normalizer-3.4.2-cp310-cp310-win_amd64.whl", hash = "sha256:f69a27e45c43520f5487f27627059b64aaf160415589230992cec34c5e18a509"}, - {file = "charset_normalizer-3.4.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:be1e352acbe3c78727a16a455126d9ff83ea2dfdcbc83148d2982305a04714c2"}, - {file = "charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa88ca0b1932e93f2d961bf3addbb2db902198dca337d88c89e1559e066e7645"}, - {file = "charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d524ba3f1581b35c03cb42beebab4a13e6cdad7b36246bd22541fa585a56cccd"}, - {file = "charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28a1005facc94196e1fb3e82a3d442a9d9110b8434fc1ded7a24a2983c9888d8"}, - {file = "charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdb20a30fe1175ecabed17cbf7812f7b804b8a315a25f24678bcdf120a90077f"}, - {file = "charset_normalizer-3.4.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0f5d9ed7f254402c9e7d35d2f5972c9bbea9040e99cd2861bd77dc68263277c7"}, - {file = "charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:efd387a49825780ff861998cd959767800d54f8308936b21025326de4b5a42b9"}, - {file = "charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_i686.whl", 
hash = "sha256:f0aa37f3c979cf2546b73e8222bbfa3dc07a641585340179d768068e3455e544"}, - {file = "charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e70e990b2137b29dc5564715de1e12701815dacc1d056308e2b17e9095372a82"}, - {file = "charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:0c8c57f84ccfc871a48a47321cfa49ae1df56cd1d965a09abe84066f6853b9c0"}, - {file = "charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6b66f92b17849b85cad91259efc341dce9c1af48e2173bf38a85c6329f1033e5"}, - {file = "charset_normalizer-3.4.2-cp311-cp311-win32.whl", hash = "sha256:daac4765328a919a805fa5e2720f3e94767abd632ae410a9062dff5412bae65a"}, - {file = "charset_normalizer-3.4.2-cp311-cp311-win_amd64.whl", hash = "sha256:e53efc7c7cee4c1e70661e2e112ca46a575f90ed9ae3fef200f2a25e954f4b28"}, - {file = "charset_normalizer-3.4.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0c29de6a1a95f24b9a1aa7aefd27d2487263f00dfd55a77719b530788f75cff7"}, - {file = "charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cddf7bd982eaa998934a91f69d182aec997c6c468898efe6679af88283b498d3"}, - {file = "charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcbe676a55d7445b22c10967bceaaf0ee69407fbe0ece4d032b6eb8d4565982a"}, - {file = "charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d41c4d287cfc69060fa91cae9683eacffad989f1a10811995fa309df656ec214"}, - {file = "charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e594135de17ab3866138f496755f302b72157d115086d100c3f19370839dd3a"}, - {file = "charset_normalizer-3.4.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cf713fe9a71ef6fd5adf7a79670135081cd4431c2943864757f0fa3a65b1fafd"}, - {file = "charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a370b3e078e418187da8c3674eddb9d983ec09445c99a3a263c2011993522981"}, - {file = "charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:a955b438e62efdf7e0b7b52a64dc5c3396e2634baa62471768a64bc2adb73d5c"}, - {file = "charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:7222ffd5e4de8e57e03ce2cef95a4c43c98fcb72ad86909abdfc2c17d227fc1b"}, - {file = "charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:bee093bf902e1d8fc0ac143c88902c3dfc8941f7ea1d6a8dd2bcb786d33db03d"}, - {file = "charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:dedb8adb91d11846ee08bec4c8236c8549ac721c245678282dcb06b221aab59f"}, - {file = "charset_normalizer-3.4.2-cp312-cp312-win32.whl", hash = "sha256:db4c7bf0e07fc3b7d89ac2a5880a6a8062056801b83ff56d8464b70f65482b6c"}, - {file = "charset_normalizer-3.4.2-cp312-cp312-win_amd64.whl", hash = "sha256:5a9979887252a82fefd3d3ed2a8e3b937a7a809f65dcb1e068b090e165bbe99e"}, - {file = "charset_normalizer-3.4.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:926ca93accd5d36ccdabd803392ddc3e03e6d4cd1cf17deff3b989ab8e9dbcf0"}, - {file = "charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eba9904b0f38a143592d9fc0e19e2df0fa2e41c3c3745554761c5f6447eedabf"}, - {file = "charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3fddb7e2c84ac87ac3a947cb4e66d143ca5863ef48e4a5ecb83bd48619e4634e"}, - {file = 
"charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98f862da73774290f251b9df8d11161b6cf25b599a66baf087c1ffe340e9bfd1"}, - {file = "charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c9379d65defcab82d07b2a9dfbfc2e95bc8fe0ebb1b176a3190230a3ef0e07c"}, - {file = "charset_normalizer-3.4.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e635b87f01ebc977342e2697d05b56632f5f879a4f15955dfe8cef2448b51691"}, - {file = "charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1c95a1e2902a8b722868587c0e1184ad5c55631de5afc0eb96bc4b0d738092c0"}, - {file = "charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ef8de666d6179b009dce7bcb2ad4c4a779f113f12caf8dc77f0162c29d20490b"}, - {file = "charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:32fc0341d72e0f73f80acb0a2c94216bd704f4f0bce10aedea38f30502b271ff"}, - {file = "charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:289200a18fa698949d2b39c671c2cc7a24d44096784e76614899a7ccf2574b7b"}, - {file = "charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4a476b06fbcf359ad25d34a057b7219281286ae2477cc5ff5e3f70a246971148"}, - {file = "charset_normalizer-3.4.2-cp313-cp313-win32.whl", hash = "sha256:aaeeb6a479c7667fbe1099af9617c83aaca22182d6cf8c53966491a0f1b7ffb7"}, - {file = "charset_normalizer-3.4.2-cp313-cp313-win_amd64.whl", hash = "sha256:aa6af9e7d59f9c12b33ae4e9450619cf2488e2bbe9b44030905877f0b2324980"}, - {file = "charset_normalizer-3.4.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cad5f45b3146325bb38d6855642f6fd609c3f7cad4dbaf75549bf3b904d3184"}, - {file = "charset_normalizer-3.4.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b2680962a4848b3c4f155dc2ee64505a9c57186d0d56b43123b17ca3de18f0fa"}, - {file = "charset_normalizer-3.4.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:36b31da18b8890a76ec181c3cf44326bf2c48e36d393ca1b72b3f484113ea344"}, - {file = "charset_normalizer-3.4.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f4074c5a429281bf056ddd4c5d3b740ebca4d43ffffe2ef4bf4d2d05114299da"}, - {file = "charset_normalizer-3.4.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c9e36a97bee9b86ef9a1cf7bb96747eb7a15c2f22bdb5b516434b00f2a599f02"}, - {file = "charset_normalizer-3.4.2-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:1b1bde144d98e446b056ef98e59c256e9294f6b74d7af6846bf5ffdafd687a7d"}, - {file = "charset_normalizer-3.4.2-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:915f3849a011c1f593ab99092f3cecfcb4d65d8feb4a64cf1bf2d22074dc0ec4"}, - {file = "charset_normalizer-3.4.2-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:fb707f3e15060adf5b7ada797624a6c6e0138e2a26baa089df64c68ee98e040f"}, - {file = "charset_normalizer-3.4.2-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:25a23ea5c7edc53e0f29bae2c44fcb5a1aa10591aae107f2a2b2583a9c5cbc64"}, - {file = "charset_normalizer-3.4.2-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:770cab594ecf99ae64c236bc9ee3439c3f46be49796e265ce0cc8bc17b10294f"}, - {file = "charset_normalizer-3.4.2-cp37-cp37m-win32.whl", hash = "sha256:6a0289e4589e8bdfef02a80478f1dfcb14f0ab696b5a00e1f4b8a14a307a3c58"}, - {file = "charset_normalizer-3.4.2-cp37-cp37m-win_amd64.whl", 
hash = "sha256:6fc1f5b51fa4cecaa18f2bd7a003f3dd039dd615cd69a2afd6d3b19aed6775f2"}, - {file = "charset_normalizer-3.4.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:76af085e67e56c8816c3ccf256ebd136def2ed9654525348cfa744b6802b69eb"}, - {file = "charset_normalizer-3.4.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e45ba65510e2647721e35323d6ef54c7974959f6081b58d4ef5d87c60c84919a"}, - {file = "charset_normalizer-3.4.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:046595208aae0120559a67693ecc65dd75d46f7bf687f159127046628178dc45"}, - {file = "charset_normalizer-3.4.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:75d10d37a47afee94919c4fab4c22b9bc2a8bf7d4f46f87363bcf0573f3ff4f5"}, - {file = "charset_normalizer-3.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6333b3aa5a12c26b2a4d4e7335a28f1475e0e5e17d69d55141ee3cab736f66d1"}, - {file = "charset_normalizer-3.4.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e8323a9b031aa0393768b87f04b4164a40037fb2a3c11ac06a03ffecd3618027"}, - {file = "charset_normalizer-3.4.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:24498ba8ed6c2e0b56d4acbf83f2d989720a93b41d712ebd4f4979660db4417b"}, - {file = "charset_normalizer-3.4.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:844da2b5728b5ce0e32d863af26f32b5ce61bc4273a9c720a9f3aa9df73b1455"}, - {file = "charset_normalizer-3.4.2-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:65c981bdbd3f57670af8b59777cbfae75364b483fa8a9f420f08094531d54a01"}, - {file = "charset_normalizer-3.4.2-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:3c21d4fca343c805a52c0c78edc01e3477f6dd1ad7c47653241cf2a206d4fc58"}, - {file = "charset_normalizer-3.4.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:dc7039885fa1baf9be153a0626e337aa7ec8bf96b0128605fb0d77788ddc1681"}, - {file = "charset_normalizer-3.4.2-cp38-cp38-win32.whl", hash = "sha256:8272b73e1c5603666618805fe821edba66892e2870058c94c53147602eab29c7"}, - {file = "charset_normalizer-3.4.2-cp38-cp38-win_amd64.whl", hash = "sha256:70f7172939fdf8790425ba31915bfbe8335030f05b9913d7ae00a87d4395620a"}, - {file = "charset_normalizer-3.4.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:005fa3432484527f9732ebd315da8da8001593e2cf46a3d817669f062c3d9ed4"}, - {file = "charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e92fca20c46e9f5e1bb485887d074918b13543b1c2a1185e69bb8d17ab6236a7"}, - {file = "charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:50bf98d5e563b83cc29471fa114366e6806bc06bc7a25fd59641e41445327836"}, - {file = "charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:721c76e84fe669be19c5791da68232ca2e05ba5185575086e384352e2c309597"}, - {file = "charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82d8fd25b7f4675d0c47cf95b594d4e7b158aca33b76aa63d07186e13c0e0ab7"}, - {file = "charset_normalizer-3.4.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3daeac64d5b371dea99714f08ffc2c208522ec6b06fbc7866a450dd446f5c0f"}, - {file = "charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:dccab8d5fa1ef9bfba0590ecf4d46df048d18ffe3eec01eeb73a42e0d9e7a8ba"}, - {file = "charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_i686.whl", hash = 
"sha256:aaf27faa992bfee0264dc1f03f4c75e9fcdda66a519db6b957a3f826e285cf12"}, - {file = "charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:eb30abc20df9ab0814b5a2524f23d75dcf83cde762c161917a2b4b7b55b1e518"}, - {file = "charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:c72fbbe68c6f32f251bdc08b8611c7b3060612236e960ef848e0a517ddbe76c5"}, - {file = "charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:982bb1e8b4ffda883b3d0a521e23abcd6fd17418f6d2c4118d257a10199c0ce3"}, - {file = "charset_normalizer-3.4.2-cp39-cp39-win32.whl", hash = "sha256:43e0933a0eff183ee85833f341ec567c0980dae57c464d8a508e1b2ceb336471"}, - {file = "charset_normalizer-3.4.2-cp39-cp39-win_amd64.whl", hash = "sha256:d11b54acf878eef558599658b0ffca78138c8c3655cf4f3a4a673c437e67732e"}, - {file = "charset_normalizer-3.4.2-py3-none-any.whl", hash = "sha256:7f56930ab0abd1c45cd15be65cc741c28b1c9a34876ce8c17a2fa107810c0af0"}, - {file = "charset_normalizer-3.4.2.tar.gz", hash = "sha256:5baececa9ecba31eff645232d59845c07aa030f0c81ee70184a90d35099a0e63"}, +groups = ["main", "dev"] +files = [ + {file = "charset_normalizer-3.4.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e824f1492727fa856dd6eda4f7cee25f8518a12f3c4a56a74e8095695089cf6d"}, + {file = "charset_normalizer-3.4.4-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4bd5d4137d500351a30687c2d3971758aac9a19208fc110ccb9d7188fbe709e8"}, + {file = "charset_normalizer-3.4.4-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:027f6de494925c0ab2a55eab46ae5129951638a49a34d87f4c3eda90f696b4ad"}, + {file = "charset_normalizer-3.4.4-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f820802628d2694cb7e56db99213f930856014862f3fd943d290ea8438d07ca8"}, + {file = "charset_normalizer-3.4.4-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:798d75d81754988d2565bff1b97ba5a44411867c0cf32b77a7e8f8d84796b10d"}, + {file = "charset_normalizer-3.4.4-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9d1bb833febdff5c8927f922386db610b49db6e0d4f4ee29601d71e7c2694313"}, + {file = "charset_normalizer-3.4.4-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:9cd98cdc06614a2f768d2b7286d66805f94c48cde050acdbbb7db2600ab3197e"}, + {file = "charset_normalizer-3.4.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:077fbb858e903c73f6c9db43374fd213b0b6a778106bc7032446a8e8b5b38b93"}, + {file = "charset_normalizer-3.4.4-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:244bfb999c71b35de57821b8ea746b24e863398194a4014e4c76adc2bbdfeff0"}, + {file = "charset_normalizer-3.4.4-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:64b55f9dce520635f018f907ff1b0df1fdc31f2795a922fb49dd14fbcdf48c84"}, + {file = "charset_normalizer-3.4.4-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:faa3a41b2b66b6e50f84ae4a68c64fcd0c44355741c6374813a800cd6695db9e"}, + {file = "charset_normalizer-3.4.4-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:6515f3182dbe4ea06ced2d9e8666d97b46ef4c75e326b79bb624110f122551db"}, + {file = "charset_normalizer-3.4.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cc00f04ed596e9dc0da42ed17ac5e596c6ccba999ba6bd92b0e0aef2f170f2d6"}, + {file = "charset_normalizer-3.4.4-cp310-cp310-win32.whl", hash = 
"sha256:f34be2938726fc13801220747472850852fe6b1ea75869a048d6f896838c896f"}, + {file = "charset_normalizer-3.4.4-cp310-cp310-win_amd64.whl", hash = "sha256:a61900df84c667873b292c3de315a786dd8dac506704dea57bc957bd31e22c7d"}, + {file = "charset_normalizer-3.4.4-cp310-cp310-win_arm64.whl", hash = "sha256:cead0978fc57397645f12578bfd2d5ea9138ea0fac82b2f63f7f7c6877986a69"}, + {file = "charset_normalizer-3.4.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6e1fcf0720908f200cd21aa4e6750a48ff6ce4afe7ff5a79a90d5ed8a08296f8"}, + {file = "charset_normalizer-3.4.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f819d5fe9234f9f82d75bdfa9aef3a3d72c4d24a6e57aeaebba32a704553aa0"}, + {file = "charset_normalizer-3.4.4-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:a59cb51917aa591b1c4e6a43c132f0cdc3c76dbad6155df4e28ee626cc77a0a3"}, + {file = "charset_normalizer-3.4.4-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8ef3c867360f88ac904fd3f5e1f902f13307af9052646963ee08ff4f131adafc"}, + {file = "charset_normalizer-3.4.4-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d9e45d7faa48ee908174d8fe84854479ef838fc6a705c9315372eacbc2f02897"}, + {file = "charset_normalizer-3.4.4-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:840c25fb618a231545cbab0564a799f101b63b9901f2569faecd6b222ac72381"}, + {file = "charset_normalizer-3.4.4-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:ca5862d5b3928c4940729dacc329aa9102900382fea192fc5e52eb69d6093815"}, + {file = "charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d9c7f57c3d666a53421049053eaacdd14bbd0a528e2186fcb2e672effd053bb0"}, + {file = "charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:277e970e750505ed74c832b4bf75dac7476262ee2a013f5574dd49075879e161"}, + {file = "charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:31fd66405eaf47bb62e8cd575dc621c56c668f27d46a61d975a249930dd5e2a4"}, + {file = "charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:0d3d8f15c07f86e9ff82319b3d9ef6f4bf907608f53fe9d92b28ea9ae3d1fd89"}, + {file = "charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:9f7fcd74d410a36883701fafa2482a6af2ff5ba96b9a620e9e0721e28ead5569"}, + {file = "charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ebf3e58c7ec8a8bed6d66a75d7fb37b55e5015b03ceae72a8e7c74495551e224"}, + {file = "charset_normalizer-3.4.4-cp311-cp311-win32.whl", hash = "sha256:eecbc200c7fd5ddb9a7f16c7decb07b566c29fa2161a16cf67b8d068bd21690a"}, + {file = "charset_normalizer-3.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:5ae497466c7901d54b639cf42d5b8c1b6a4fead55215500d2f486d34db48d016"}, + {file = "charset_normalizer-3.4.4-cp311-cp311-win_arm64.whl", hash = "sha256:65e2befcd84bc6f37095f5961e68a6f077bf44946771354a28ad434c2cce0ae1"}, + {file = "charset_normalizer-3.4.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0a98e6759f854bd25a58a73fa88833fba3b7c491169f86ce1180c948ab3fd394"}, + {file = "charset_normalizer-3.4.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b5b290ccc2a263e8d185130284f8501e3e36c5e02750fc6b6bdeb2e9e96f1e25"}, + {file = 
"charset_normalizer-3.4.4-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:74bb723680f9f7a6234dcf67aea57e708ec1fbdf5699fb91dfd6f511b0a320ef"}, + {file = "charset_normalizer-3.4.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f1e34719c6ed0b92f418c7c780480b26b5d9c50349e9a9af7d76bf757530350d"}, + {file = "charset_normalizer-3.4.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2437418e20515acec67d86e12bf70056a33abdacb5cb1655042f6538d6b085a8"}, + {file = "charset_normalizer-3.4.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:11d694519d7f29d6cd09f6ac70028dba10f92f6cdd059096db198c283794ac86"}, + {file = "charset_normalizer-3.4.4-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:ac1c4a689edcc530fc9d9aa11f5774b9e2f33f9a0c6a57864e90908f5208d30a"}, + {file = "charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:21d142cc6c0ec30d2efee5068ca36c128a30b0f2c53c1c07bd78cb6bc1d3be5f"}, + {file = "charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:5dbe56a36425d26d6cfb40ce79c314a2e4dd6211d51d6d2191c00bed34f354cc"}, + {file = "charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:5bfbb1b9acf3334612667b61bd3002196fe2a1eb4dd74d247e0f2a4d50ec9bbf"}, + {file = "charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:d055ec1e26e441f6187acf818b73564e6e6282709e9bcb5b63f5b23068356a15"}, + {file = "charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:af2d8c67d8e573d6de5bc30cdb27e9b95e49115cd9baad5ddbd1a6207aaa82a9"}, + {file = "charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:780236ac706e66881f3b7f2f32dfe90507a09e67d1d454c762cf642e6e1586e0"}, + {file = "charset_normalizer-3.4.4-cp312-cp312-win32.whl", hash = "sha256:5833d2c39d8896e4e19b689ffc198f08ea58116bee26dea51e362ecc7cd3ed26"}, + {file = "charset_normalizer-3.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:a79cfe37875f822425b89a82333404539ae63dbdddf97f84dcbc3d339aae9525"}, + {file = "charset_normalizer-3.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:376bec83a63b8021bb5c8ea75e21c4ccb86e7e45ca4eb81146091b56599b80c3"}, + {file = "charset_normalizer-3.4.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:e1f185f86a6f3403aa2420e815904c67b2f9ebc443f045edd0de921108345794"}, + {file = "charset_normalizer-3.4.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b39f987ae8ccdf0d2642338faf2abb1862340facc796048b604ef14919e55ed"}, + {file = "charset_normalizer-3.4.4-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3162d5d8ce1bb98dd51af660f2121c55d0fa541b46dff7bb9b9f86ea1d87de72"}, + {file = "charset_normalizer-3.4.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:81d5eb2a312700f4ecaa977a8235b634ce853200e828fbadf3a9c50bab278328"}, + {file = "charset_normalizer-3.4.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5bd2293095d766545ec1a8f612559f6b40abc0eb18bb2f5d1171872d34036ede"}, + {file = "charset_normalizer-3.4.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a8a8b89589086a25749f471e6a900d3f662d1d3b6e2e59dcecf787b1cc3a1894"}, + {file = 
"charset_normalizer-3.4.4-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc7637e2f80d8530ee4a78e878bce464f70087ce73cf7c1caf142416923b98f1"}, + {file = "charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f8bf04158c6b607d747e93949aa60618b61312fe647a6369f88ce2ff16043490"}, + {file = "charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:554af85e960429cf30784dd47447d5125aaa3b99a6f0683589dbd27e2f45da44"}, + {file = "charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:74018750915ee7ad843a774364e13a3db91682f26142baddf775342c3f5b1133"}, + {file = "charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:c0463276121fdee9c49b98908b3a89c39be45d86d1dbaa22957e38f6321d4ce3"}, + {file = "charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:362d61fd13843997c1c446760ef36f240cf81d3ebf74ac62652aebaf7838561e"}, + {file = "charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9a26f18905b8dd5d685d6d07b0cdf98a79f3c7a918906af7cc143ea2e164c8bc"}, + {file = "charset_normalizer-3.4.4-cp313-cp313-win32.whl", hash = "sha256:9b35f4c90079ff2e2edc5b26c0c77925e5d2d255c42c74fdb70fb49b172726ac"}, + {file = "charset_normalizer-3.4.4-cp313-cp313-win_amd64.whl", hash = "sha256:b435cba5f4f750aa6c0a0d92c541fb79f69a387c91e61f1795227e4ed9cece14"}, + {file = "charset_normalizer-3.4.4-cp313-cp313-win_arm64.whl", hash = "sha256:542d2cee80be6f80247095cc36c418f7bddd14f4a6de45af91dfad36d817bba2"}, + {file = "charset_normalizer-3.4.4-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:da3326d9e65ef63a817ecbcc0df6e94463713b754fe293eaa03da99befb9a5bd"}, + {file = "charset_normalizer-3.4.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8af65f14dc14a79b924524b1e7fffe304517b2bff5a58bf64f30b98bbc5079eb"}, + {file = "charset_normalizer-3.4.4-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:74664978bb272435107de04e36db5a9735e78232b85b77d45cfb38f758efd33e"}, + {file = "charset_normalizer-3.4.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:752944c7ffbfdd10c074dc58ec2d5a8a4cd9493b314d367c14d24c17684ddd14"}, + {file = "charset_normalizer-3.4.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d1f13550535ad8cff21b8d757a3257963e951d96e20ec82ab44bc64aeb62a191"}, + {file = "charset_normalizer-3.4.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ecaae4149d99b1c9e7b88bb03e3221956f68fd6d50be2ef061b2381b61d20838"}, + {file = "charset_normalizer-3.4.4-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:cb6254dc36b47a990e59e1068afacdcd02958bdcce30bb50cc1700a8b9d624a6"}, + {file = "charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c8ae8a0f02f57a6e61203a31428fa1d677cbe50c93622b4149d5c0f319c1d19e"}, + {file = "charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:47cc91b2f4dd2833fddaedd2893006b0106129d4b94fdb6af1f4ce5a9965577c"}, + {file = "charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:82004af6c302b5d3ab2cfc4cc5f29db16123b1a8417f2e25f9066f91d4411090"}, + {file = "charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:2b7d8f6c26245217bd2ad053761201e9f9680f8ce52f0fcd8d0755aeae5b2152"}, + 
{file = "charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:799a7a5e4fb2d5898c60b640fd4981d6a25f1c11790935a44ce38c54e985f828"}, + {file = "charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:99ae2cffebb06e6c22bdc25801d7b30f503cc87dbd283479e7b606f70aff57ec"}, + {file = "charset_normalizer-3.4.4-cp314-cp314-win32.whl", hash = "sha256:f9d332f8c2a2fcbffe1378594431458ddbef721c1769d78e2cbc06280d8155f9"}, + {file = "charset_normalizer-3.4.4-cp314-cp314-win_amd64.whl", hash = "sha256:8a6562c3700cce886c5be75ade4a5db4214fda19fede41d9792d100288d8f94c"}, + {file = "charset_normalizer-3.4.4-cp314-cp314-win_arm64.whl", hash = "sha256:de00632ca48df9daf77a2c65a484531649261ec9f25489917f09e455cb09ddb2"}, + {file = "charset_normalizer-3.4.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ce8a0633f41a967713a59c4139d29110c07e826d131a316b50ce11b1d79b4f84"}, + {file = "charset_normalizer-3.4.4-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eaabd426fe94daf8fd157c32e571c85cb12e66692f15516a83a03264b08d06c3"}, + {file = "charset_normalizer-3.4.4-cp38-cp38-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c4ef880e27901b6cc782f1b95f82da9313c0eb95c3af699103088fa0ac3ce9ac"}, + {file = "charset_normalizer-3.4.4-cp38-cp38-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2aaba3b0819274cc41757a1da876f810a3e4d7b6eb25699253a4effef9e8e4af"}, + {file = "charset_normalizer-3.4.4-cp38-cp38-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:778d2e08eda00f4256d7f672ca9fef386071c9202f5e4607920b86d7803387f2"}, + {file = "charset_normalizer-3.4.4-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f155a433c2ec037d4e8df17d18922c3a0d9b3232a396690f17175d2946f0218d"}, + {file = "charset_normalizer-3.4.4-cp38-cp38-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a8bf8d0f749c5757af2142fe7903a9df1d2e8aa3841559b2bad34b08d0e2bcf3"}, + {file = "charset_normalizer-3.4.4-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:194f08cbb32dc406d6e1aea671a68be0823673db2832b38405deba2fb0d88f63"}, + {file = "charset_normalizer-3.4.4-cp38-cp38-musllinux_1_2_armv7l.whl", hash = "sha256:6aee717dcfead04c6eb1ce3bd29ac1e22663cdea57f943c87d1eab9a025438d7"}, + {file = "charset_normalizer-3.4.4-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:cd4b7ca9984e5e7985c12bc60a6f173f3c958eae74f3ef6624bb6b26e2abbae4"}, + {file = "charset_normalizer-3.4.4-cp38-cp38-musllinux_1_2_riscv64.whl", hash = "sha256:b7cf1017d601aa35e6bb650b6ad28652c9cd78ee6caff19f3c28d03e1c80acbf"}, + {file = "charset_normalizer-3.4.4-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:e912091979546adf63357d7e2ccff9b44f026c075aeaf25a52d0e95ad2281074"}, + {file = "charset_normalizer-3.4.4-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:5cb4d72eea50c8868f5288b7f7f33ed276118325c1dfd3957089f6b519e1382a"}, + {file = "charset_normalizer-3.4.4-cp38-cp38-win32.whl", hash = "sha256:837c2ce8c5a65a2035be9b3569c684358dfbf109fd3b6969630a87535495ceaa"}, + {file = "charset_normalizer-3.4.4-cp38-cp38-win_amd64.whl", hash = "sha256:44c2a8734b333e0578090c4cd6b16f275e07aa6614ca8715e6c038e865e70576"}, + {file = "charset_normalizer-3.4.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a9768c477b9d7bd54bc0c86dbaebdec6f03306675526c9927c0e8a04e8f94af9"}, + {file = 
"charset_normalizer-3.4.4-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1bee1e43c28aa63cb16e5c14e582580546b08e535299b8b6158a7c9c768a1f3d"}, + {file = "charset_normalizer-3.4.4-cp39-cp39-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:fd44c878ea55ba351104cb93cc85e74916eb8fa440ca7903e57575e97394f608"}, + {file = "charset_normalizer-3.4.4-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:0f04b14ffe5fdc8c4933862d8306109a2c51e0704acfa35d51598eb45a1e89fc"}, + {file = "charset_normalizer-3.4.4-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:cd09d08005f958f370f539f186d10aec3377d55b9eeb0d796025d4886119d76e"}, + {file = "charset_normalizer-3.4.4-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4fe7859a4e3e8457458e2ff592f15ccb02f3da787fcd31e0183879c3ad4692a1"}, + {file = "charset_normalizer-3.4.4-cp39-cp39-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fa09f53c465e532f4d3db095e0c55b615f010ad81803d383195b6b5ca6cbf5f3"}, + {file = "charset_normalizer-3.4.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:7fa17817dc5625de8a027cb8b26d9fefa3ea28c8253929b8d6649e705d2835b6"}, + {file = "charset_normalizer-3.4.4-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:5947809c8a2417be3267efc979c47d76a079758166f7d43ef5ae8e9f92751f88"}, + {file = "charset_normalizer-3.4.4-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:4902828217069c3c5c71094537a8e623f5d097858ac6ca8252f7b4d10b7560f1"}, + {file = "charset_normalizer-3.4.4-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:7c308f7e26e4363d79df40ca5b2be1c6ba9f02bdbccfed5abddb7859a6ce72cf"}, + {file = "charset_normalizer-3.4.4-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:2c9d3c380143a1fedbff95a312aa798578371eb29da42106a29019368a475318"}, + {file = "charset_normalizer-3.4.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:cb01158d8b88ee68f15949894ccc6712278243d95f344770fa7593fa2d94410c"}, + {file = "charset_normalizer-3.4.4-cp39-cp39-win32.whl", hash = "sha256:2677acec1a2f8ef614c6888b5b4ae4060cc184174a938ed4e8ef690e15d3e505"}, + {file = "charset_normalizer-3.4.4-cp39-cp39-win_amd64.whl", hash = "sha256:f8e160feb2aed042cd657a72acc0b481212ed28b1b9a95c0cee1621b524e1966"}, + {file = "charset_normalizer-3.4.4-cp39-cp39-win_arm64.whl", hash = "sha256:b5d84d37db046c5ca74ee7bb47dd6cbc13f80665fdde3e8040bdd3fb015ecb50"}, + {file = "charset_normalizer-3.4.4-py3-none-any.whl", hash = "sha256:7a32c560861a02ff789ad905a2fe94e3f840803362c84fecf1851cb4cf3dc37f"}, + {file = "charset_normalizer-3.4.4.tar.gz", hash = "sha256:94537985111c35f28720e43603b8e7b43a6ecfb2ce1d3058bbe955b73404e21a"}, ] [[package]] name = "click" -version = "8.2.1" +version = "8.3.1" description = "Composable command line interface toolkit" optional = false python-versions = ">=3.10" +groups = ["main"] files = [ - {file = "click-8.2.1-py3-none-any.whl", hash = "sha256:61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b"}, - {file = "click-8.2.1.tar.gz", hash = "sha256:27c491cc05d968d271d5a1db13e3b5a184636d9d930f148c50b038f0d0646202"}, + {file = "click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6"}, + {file = "click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a"}, ] [package.dependencies] @@ -711,13 +805,14 @@ colorama = {version 
= "*", markers = "platform_system == \"Windows\""} [[package]] name = "cloudpickle" -version = "3.1.1" +version = "3.1.2" description = "Pickler class to extend the standard pickle.Pickler functionality" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "cloudpickle-3.1.1-py3-none-any.whl", hash = "sha256:c8c5a44295039331ee9dad40ba100a9c7297b6f988e50e87ccdf3765a668350e"}, - {file = "cloudpickle-3.1.1.tar.gz", hash = "sha256:b216fa8ae4019d5482a8ac3c95d8f6346115d8835911fd4aefd1a445e4242c64"}, + {file = "cloudpickle-3.1.2-py3-none-any.whl", hash = "sha256:9acb47f6afd73f60dc1df93bb801b472f05ff42fa6c84167d25cb206be1fbf4a"}, + {file = "cloudpickle-3.1.2.tar.gz", hash = "sha256:7fda9eb655c9c230dab534f1983763de5835249750e85fbcef43aaa30a9a2414"}, ] [[package]] @@ -726,182 +821,301 @@ version = "0.4.6" description = "Cross-platform colored terminal text." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["main", "dev"] files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +markers = {main = "(extra == \"chatui\" or extra == \"all\") and sys_platform == \"win32\" or platform_system == \"Windows\""} [[package]] name = "comm" -version = "0.2.2" +version = "0.2.3" description = "Jupyter Python Comm implementation, for usage in ipykernel, xeus-python etc." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ - {file = "comm-0.2.2-py3-none-any.whl", hash = "sha256:e6fb86cb70ff661ee8c9c14e7d36d6de3b4066f1441be4063df9c5009f0a64d3"}, - {file = "comm-0.2.2.tar.gz", hash = "sha256:3fd7a84065306e07bea1773df6eb8282de51ba82f77c72f9c85716ab11fe980e"}, + {file = "comm-0.2.3-py3-none-any.whl", hash = "sha256:c615d91d75f7f04f095b30d1c1711babd43bdc6419c1be9886a85f2f4e489417"}, + {file = "comm-0.2.3.tar.gz", hash = "sha256:2dc8048c10962d55d7ad693be1e7045d891b7ce8d999c97963a5e3e99c055971"}, ] -[package.dependencies] -traitlets = ">=4" - [package.extras] test = ["pytest"] [[package]] name = "contourpy" -version = "1.3.2" +version = "1.3.3" description = "Python library for calculating contours of 2D quadrilateral grids" optional = false -python-versions = ">=3.10" -files = [ - {file = "contourpy-1.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ba38e3f9f330af820c4b27ceb4b9c7feee5fe0493ea53a8720f4792667465934"}, - {file = "contourpy-1.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dc41ba0714aa2968d1f8674ec97504a8f7e334f48eeacebcaa6256213acb0989"}, - {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9be002b31c558d1ddf1b9b415b162c603405414bacd6932d031c5b5a8b757f0d"}, - {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8d2e74acbcba3bfdb6d9d8384cdc4f9260cae86ed9beee8bd5f54fee49a430b9"}, - {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e259bced5549ac64410162adc973c5e2fb77f04df4a439d00b478e57a0e65512"}, - {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad687a04bc802cbe8b9c399c07162a3c35e227e2daccf1668eb1f278cb698631"}, - {file = "contourpy-1.3.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cdd22595308f53ef2f891040ab2b93d79192513ffccbd7fe19be7aa773a5e09f"}, - {file = 
"contourpy-1.3.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b4f54d6a2defe9f257327b0f243612dd051cc43825587520b1bf74a31e2f6ef2"}, - {file = "contourpy-1.3.2-cp310-cp310-win32.whl", hash = "sha256:f939a054192ddc596e031e50bb13b657ce318cf13d264f095ce9db7dc6ae81c0"}, - {file = "contourpy-1.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:c440093bbc8fc21c637c03bafcbef95ccd963bc6e0514ad887932c18ca2a759a"}, - {file = "contourpy-1.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6a37a2fb93d4df3fc4c0e363ea4d16f83195fc09c891bc8ce072b9d084853445"}, - {file = "contourpy-1.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b7cd50c38f500bbcc9b6a46643a40e0913673f869315d8e70de0438817cb7773"}, - {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6658ccc7251a4433eebd89ed2672c2ed96fba367fd25ca9512aa92a4b46c4f1"}, - {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:70771a461aaeb335df14deb6c97439973d253ae70660ca085eec25241137ef43"}, - {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65a887a6e8c4cd0897507d814b14c54a8c2e2aa4ac9f7686292f9769fcf9a6ab"}, - {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3859783aefa2b8355697f16642695a5b9792e7a46ab86da1118a4a23a51a33d7"}, - {file = "contourpy-1.3.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:eab0f6db315fa4d70f1d8ab514e527f0366ec021ff853d7ed6a2d33605cf4b83"}, - {file = "contourpy-1.3.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d91a3ccc7fea94ca0acab82ceb77f396d50a1f67412efe4c526f5d20264e6ecd"}, - {file = "contourpy-1.3.2-cp311-cp311-win32.whl", hash = "sha256:1c48188778d4d2f3d48e4643fb15d8608b1d01e4b4d6b0548d9b336c28fc9b6f"}, - {file = "contourpy-1.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:5ebac872ba09cb8f2131c46b8739a7ff71de28a24c869bcad554477eb089a878"}, - {file = "contourpy-1.3.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4caf2bcd2969402bf77edc4cb6034c7dd7c0803213b3523f111eb7460a51b8d2"}, - {file = "contourpy-1.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:82199cb78276249796419fe36b7386bd8d2cc3f28b3bc19fe2454fe2e26c4c15"}, - {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:106fab697af11456fcba3e352ad50effe493a90f893fca6c2ca5c033820cea92"}, - {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d14f12932a8d620e307f715857107b1d1845cc44fdb5da2bc8e850f5ceba9f87"}, - {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:532fd26e715560721bb0d5fc7610fce279b3699b018600ab999d1be895b09415"}, - {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b383144cf2d2c29f01a1e8170f50dacf0eac02d64139dcd709a8ac4eb3cfe"}, - {file = "contourpy-1.3.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c49f73e61f1f774650a55d221803b101d966ca0c5a2d6d5e4320ec3997489441"}, - {file = "contourpy-1.3.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3d80b2c0300583228ac98d0a927a1ba6a2ba6b8a742463c564f1d419ee5b211e"}, - {file = "contourpy-1.3.2-cp312-cp312-win32.whl", hash = "sha256:90df94c89a91b7362e1142cbee7568f86514412ab8a2c0d0fca72d7e91b62912"}, - {file = "contourpy-1.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:8c942a01d9163e2e5cfb05cb66110121b8d07ad438a17f9e766317bcb62abf73"}, - {file = 
"contourpy-1.3.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:de39db2604ae755316cb5967728f4bea92685884b1e767b7c24e983ef5f771cb"}, - {file = "contourpy-1.3.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3f9e896f447c5c8618f1edb2bafa9a4030f22a575ec418ad70611450720b5b08"}, - {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71e2bd4a1c4188f5c2b8d274da78faab884b59df20df63c34f74aa1813c4427c"}, - {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de425af81b6cea33101ae95ece1f696af39446db9682a0b56daaa48cfc29f38f"}, - {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:977e98a0e0480d3fe292246417239d2d45435904afd6d7332d8455981c408b85"}, - {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:434f0adf84911c924519d2b08fc10491dd282b20bdd3fa8f60fd816ea0b48841"}, - {file = "contourpy-1.3.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c66c4906cdbc50e9cba65978823e6e00b45682eb09adbb78c9775b74eb222422"}, - {file = "contourpy-1.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8b7fc0cd78ba2f4695fd0a6ad81a19e7e3ab825c31b577f384aa9d7817dc3bef"}, - {file = "contourpy-1.3.2-cp313-cp313-win32.whl", hash = "sha256:15ce6ab60957ca74cff444fe66d9045c1fd3e92c8936894ebd1f3eef2fff075f"}, - {file = "contourpy-1.3.2-cp313-cp313-win_amd64.whl", hash = "sha256:e1578f7eafce927b168752ed7e22646dad6cd9bca673c60bff55889fa236ebf9"}, - {file = "contourpy-1.3.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0475b1f6604896bc7c53bb070e355e9321e1bc0d381735421a2d2068ec56531f"}, - {file = "contourpy-1.3.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:c85bb486e9be652314bb5b9e2e3b0d1b2e643d5eec4992c0fbe8ac71775da739"}, - {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:745b57db7758f3ffc05a10254edd3182a2a83402a89c00957a8e8a22f5582823"}, - {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:970e9173dbd7eba9b4e01aab19215a48ee5dd3f43cef736eebde064a171f89a5"}, - {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c6c4639a9c22230276b7bffb6a850dfc8258a2521305e1faefe804d006b2e532"}, - {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc829960f34ba36aad4302e78eabf3ef16a3a100863f0d4eeddf30e8a485a03b"}, - {file = "contourpy-1.3.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:d32530b534e986374fc19eaa77fcb87e8a99e5431499949b828312bdcd20ac52"}, - {file = "contourpy-1.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e298e7e70cf4eb179cc1077be1c725b5fd131ebc81181bf0c03525c8abc297fd"}, - {file = "contourpy-1.3.2-cp313-cp313t-win32.whl", hash = "sha256:d0e589ae0d55204991450bb5c23f571c64fe43adaa53f93fc902a84c96f52fe1"}, - {file = "contourpy-1.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:78e9253c3de756b3f6a5174d024c4835acd59eb3f8e2ca13e775dbffe1558f69"}, - {file = "contourpy-1.3.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:fd93cc7f3139b6dd7aab2f26a90dde0aa9fc264dbf70f6740d498a70b860b82c"}, - {file = "contourpy-1.3.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:107ba8a6a7eec58bb475329e6d3b95deba9440667c4d62b9b6063942b61d7f16"}, - {file = "contourpy-1.3.2-pp310-pypy310_pp73-win_amd64.whl", hash = 
"sha256:ded1706ed0c1049224531b81128efbd5084598f18d8a2d9efae833edbd2b40ad"}, - {file = "contourpy-1.3.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5f5964cdad279256c084b69c3f412b7801e15356b16efa9d78aa974041903da0"}, - {file = "contourpy-1.3.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49b65a95d642d4efa8f64ba12558fcb83407e58a2dfba9d796d77b63ccfcaff5"}, - {file = "contourpy-1.3.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8c5acb8dddb0752bf252e01a3035b21443158910ac16a3b0d20e7fed7d534ce5"}, - {file = "contourpy-1.3.2.tar.gz", hash = "sha256:b6945942715a034c671b7fc54f9588126b0b8bf23db2696e3ca8328f3ff0ab54"}, +python-versions = ">=3.11" +groups = ["main"] +files = [ + {file = "contourpy-1.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:709a48ef9a690e1343202916450bc48b9e51c049b089c7f79a267b46cffcdaa1"}, + {file = "contourpy-1.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:23416f38bfd74d5d28ab8429cc4d63fa67d5068bd711a85edb1c3fb0c3e2f381"}, + {file = "contourpy-1.3.3-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:929ddf8c4c7f348e4c0a5a3a714b5c8542ffaa8c22954862a46ca1813b667ee7"}, + {file = "contourpy-1.3.3-cp311-cp311-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9e999574eddae35f1312c2b4b717b7885d4edd6cb46700e04f7f02db454e67c1"}, + {file = "contourpy-1.3.3-cp311-cp311-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0bf67e0e3f482cb69779dd3061b534eb35ac9b17f163d851e2a547d56dba0a3a"}, + {file = "contourpy-1.3.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:51e79c1f7470158e838808d4a996fa9bac72c498e93d8ebe5119bc1e6becb0db"}, + {file = "contourpy-1.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:598c3aaece21c503615fd59c92a3598b428b2f01bfb4b8ca9c4edeecc2438620"}, + {file = "contourpy-1.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:322ab1c99b008dad206d406bb61d014cf0174df491ae9d9d0fac6a6fda4f977f"}, + {file = "contourpy-1.3.3-cp311-cp311-win32.whl", hash = "sha256:fd907ae12cd483cd83e414b12941c632a969171bf90fc937d0c9f268a31cafff"}, + {file = "contourpy-1.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:3519428f6be58431c56581f1694ba8e50626f2dd550af225f82fb5f5814d2a42"}, + {file = "contourpy-1.3.3-cp311-cp311-win_arm64.whl", hash = "sha256:15ff10bfada4bf92ec8b31c62bf7c1834c244019b4a33095a68000d7075df470"}, + {file = "contourpy-1.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b08a32ea2f8e42cf1d4be3169a98dd4be32bafe4f22b6c4cb4ba810fa9e5d2cb"}, + {file = "contourpy-1.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:556dba8fb6f5d8742f2923fe9457dbdd51e1049c4a43fd3986a0b14a1d815fc6"}, + {file = "contourpy-1.3.3-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92d9abc807cf7d0e047b95ca5d957cf4792fcd04e920ca70d48add15c1a90ea7"}, + {file = "contourpy-1.3.3-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b2e8faa0ed68cb29af51edd8e24798bb661eac3bd9f65420c1887b6ca89987c8"}, + {file = "contourpy-1.3.3-cp312-cp312-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:626d60935cf668e70a5ce6ff184fd713e9683fb458898e4249b63be9e28286ea"}, + {file = "contourpy-1.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4d00e655fcef08aba35ec9610536bfe90267d7ab5ba944f7032549c55a146da1"}, + {file = "contourpy-1.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:451e71b5a7d597379ef572de31eeb909a87246974d960049a9848c3bc6c41bf7"}, + {file = "contourpy-1.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:459c1f020cd59fcfe6650180678a9993932d80d44ccde1fa1868977438f0b411"}, + {file = "contourpy-1.3.3-cp312-cp312-win32.whl", hash = "sha256:023b44101dfe49d7d53932be418477dba359649246075c996866106da069af69"}, + {file = "contourpy-1.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:8153b8bfc11e1e4d75bcb0bff1db232f9e10b274e0929de9d608027e0d34ff8b"}, + {file = "contourpy-1.3.3-cp312-cp312-win_arm64.whl", hash = "sha256:07ce5ed73ecdc4a03ffe3e1b3e3c1166db35ae7584be76f65dbbe28a7791b0cc"}, + {file = "contourpy-1.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:177fb367556747a686509d6fef71d221a4b198a3905fe824430e5ea0fda54eb5"}, + {file = "contourpy-1.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d002b6f00d73d69333dac9d0b8d5e84d9724ff9ef044fd63c5986e62b7c9e1b1"}, + {file = "contourpy-1.3.3-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:348ac1f5d4f1d66d3322420f01d42e43122f43616e0f194fc1c9f5d830c5b286"}, + {file = "contourpy-1.3.3-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:655456777ff65c2c548b7c454af9c6f33f16c8884f11083244b5819cc214f1b5"}, + {file = "contourpy-1.3.3-cp313-cp313-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:644a6853d15b2512d67881586bd03f462c7ab755db95f16f14d7e238f2852c67"}, + {file = "contourpy-1.3.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4debd64f124ca62069f313a9cb86656ff087786016d76927ae2cf37846b006c9"}, + {file = "contourpy-1.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a15459b0f4615b00bbd1e91f1b9e19b7e63aea7483d03d804186f278c0af2659"}, + {file = "contourpy-1.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca0fdcd73925568ca027e0b17ab07aad764be4706d0a925b89227e447d9737b7"}, + {file = "contourpy-1.3.3-cp313-cp313-win32.whl", hash = "sha256:b20c7c9a3bf701366556e1b1984ed2d0cedf999903c51311417cf5f591d8c78d"}, + {file = "contourpy-1.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:1cadd8b8969f060ba45ed7c1b714fe69185812ab43bd6b86a9123fe8f99c3263"}, + {file = "contourpy-1.3.3-cp313-cp313-win_arm64.whl", hash = "sha256:fd914713266421b7536de2bfa8181aa8c699432b6763a0ea64195ebe28bff6a9"}, + {file = "contourpy-1.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:88df9880d507169449d434c293467418b9f6cbe82edd19284aa0409e7fdb933d"}, + {file = "contourpy-1.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:d06bb1f751ba5d417047db62bca3c8fde202b8c11fb50742ab3ab962c81e8216"}, + {file = "contourpy-1.3.3-cp313-cp313t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e4e6b05a45525357e382909a4c1600444e2a45b4795163d3b22669285591c1ae"}, + {file = "contourpy-1.3.3-cp313-cp313t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ab3074b48c4e2cf1a960e6bbeb7f04566bf36b1861d5c9d4d8ac04b82e38ba20"}, + {file = "contourpy-1.3.3-cp313-cp313t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6c3d53c796f8647d6deb1abe867daeb66dcc8a97e8455efa729516b997b8ed99"}, + {file = "contourpy-1.3.3-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:50ed930df7289ff2a8d7afeb9603f8289e5704755c7e5c3bbd929c90c817164b"}, + {file = "contourpy-1.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4feffb6537d64b84877da813a5c30f1422ea5739566abf0bd18065ac040e120a"}, + {file = "contourpy-1.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = 
"sha256:2b7e9480ffe2b0cd2e787e4df64270e3a0440d9db8dc823312e2c940c167df7e"}, + {file = "contourpy-1.3.3-cp313-cp313t-win32.whl", hash = "sha256:283edd842a01e3dcd435b1c5116798d661378d83d36d337b8dde1d16a5fc9ba3"}, + {file = "contourpy-1.3.3-cp313-cp313t-win_amd64.whl", hash = "sha256:87acf5963fc2b34825e5b6b048f40e3635dd547f590b04d2ab317c2619ef7ae8"}, + {file = "contourpy-1.3.3-cp313-cp313t-win_arm64.whl", hash = "sha256:3c30273eb2a55024ff31ba7d052dde990d7d8e5450f4bbb6e913558b3d6c2301"}, + {file = "contourpy-1.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fde6c716d51c04b1c25d0b90364d0be954624a0ee9d60e23e850e8d48353d07a"}, + {file = "contourpy-1.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:cbedb772ed74ff5be440fa8eee9bd49f64f6e3fc09436d9c7d8f1c287b121d77"}, + {file = "contourpy-1.3.3-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:22e9b1bd7a9b1d652cd77388465dc358dafcd2e217d35552424aa4f996f524f5"}, + {file = "contourpy-1.3.3-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a22738912262aa3e254e4f3cb079a95a67132fc5a063890e224393596902f5a4"}, + {file = "contourpy-1.3.3-cp314-cp314-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:afe5a512f31ee6bd7d0dda52ec9864c984ca3d66664444f2d72e0dc4eb832e36"}, + {file = "contourpy-1.3.3-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f64836de09927cba6f79dcd00fdd7d5329f3fccc633468507079c829ca4db4e3"}, + {file = "contourpy-1.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:1fd43c3be4c8e5fd6e4f2baeae35ae18176cf2e5cced681cca908addf1cdd53b"}, + {file = "contourpy-1.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6afc576f7b33cf00996e5c1102dc2a8f7cc89e39c0b55df93a0b78c1bd992b36"}, + {file = "contourpy-1.3.3-cp314-cp314-win32.whl", hash = "sha256:66c8a43a4f7b8df8b71ee1840e4211a3c8d93b214b213f590e18a1beca458f7d"}, + {file = "contourpy-1.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:cf9022ef053f2694e31d630feaacb21ea24224be1c3ad0520b13d844274614fd"}, + {file = "contourpy-1.3.3-cp314-cp314-win_arm64.whl", hash = "sha256:95b181891b4c71de4bb404c6621e7e2390745f887f2a026b2d99e92c17892339"}, + {file = "contourpy-1.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:33c82d0138c0a062380332c861387650c82e4cf1747aaa6938b9b6516762e772"}, + {file = "contourpy-1.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ea37e7b45949df430fe649e5de8351c423430046a2af20b1c1961cae3afcda77"}, + {file = "contourpy-1.3.3-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d304906ecc71672e9c89e87c4675dc5c2645e1f4269a5063b99b0bb29f232d13"}, + {file = "contourpy-1.3.3-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ca658cd1a680a5c9ea96dc61cdbae1e85c8f25849843aa799dfd3cb370ad4fbe"}, + {file = "contourpy-1.3.3-cp314-cp314t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ab2fd90904c503739a75b7c8c5c01160130ba67944a7b77bbf36ef8054576e7f"}, + {file = "contourpy-1.3.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b7301b89040075c30e5768810bc96a8e8d78085b47d8be6e4c3f5a0b4ed478a0"}, + {file = "contourpy-1.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2a2a8b627d5cc6b7c41a4beff6c5ad5eb848c88255fda4a8745f7e901b32d8e4"}, + {file = "contourpy-1.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:fd6ec6be509c787f1caf6b247f0b1ca598bef13f4ddeaa126b7658215529ba0f"}, + {file = "contourpy-1.3.3-cp314-cp314t-win32.whl", hash = 
"sha256:e74a9a0f5e3fff48fb5a7f2fd2b9b70a3fe014a67522f79b7cca4c0c7e43c9ae"}, + {file = "contourpy-1.3.3-cp314-cp314t-win_amd64.whl", hash = "sha256:13b68d6a62db8eafaebb8039218921399baf6e47bf85006fd8529f2a08ef33fc"}, + {file = "contourpy-1.3.3-cp314-cp314t-win_arm64.whl", hash = "sha256:b7448cb5a725bb1e35ce88771b86fba35ef418952474492cf7c764059933ff8b"}, + {file = "contourpy-1.3.3-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:cd5dfcaeb10f7b7f9dc8941717c6c2ade08f587be2226222c12b25f0483ed497"}, + {file = "contourpy-1.3.3-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:0c1fc238306b35f246d61a1d416a627348b5cf0648648a031e14bb8705fcdfe8"}, + {file = "contourpy-1.3.3-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:70f9aad7de812d6541d29d2bbf8feb22ff7e1c299523db288004e3157ff4674e"}, + {file = "contourpy-1.3.3-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5ed3657edf08512fc3fe81b510e35c2012fbd3081d2e26160f27ca28affec989"}, + {file = "contourpy-1.3.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:3d1a3799d62d45c18bafd41c5fa05120b96a28079f2393af559b843d1a966a77"}, + {file = "contourpy-1.3.3.tar.gz", hash = "sha256:083e12155b210502d0bca491432bb04d56dc3432f95a979b429f2848c3dbe880"}, ] [package.dependencies] -numpy = ">=1.23" +numpy = ">=1.25" [package.extras] bokeh = ["bokeh", "selenium"] docs = ["furo", "sphinx (>=7.2)", "sphinx-copybutton"] -mypy = ["bokeh", "contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.15.0)", "types-Pillow"] +mypy = ["bokeh", "contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.17.0)", "types-Pillow"] test = ["Pillow", "contourpy[test-no-images]", "matplotlib"] test-no-images = ["pytest", "pytest-cov", "pytest-rerunfailures", "pytest-xdist", "wurlitzer"] [[package]] name = "coverage" -version = "7.8.2" +version = "7.11.3" description = "Code coverage measurement for Python" optional = false -python-versions = ">=3.9" -files = [ - {file = "coverage-7.8.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bd8ec21e1443fd7a447881332f7ce9d35b8fbd2849e761bb290b584535636b0a"}, - {file = "coverage-7.8.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4c26c2396674816deaeae7ded0e2b42c26537280f8fe313335858ffff35019be"}, - {file = "coverage-7.8.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1aec326ed237e5880bfe69ad41616d333712c7937bcefc1343145e972938f9b3"}, - {file = "coverage-7.8.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5e818796f71702d7a13e50c70de2a1924f729228580bcba1607cccf32eea46e6"}, - {file = "coverage-7.8.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:546e537d9e24efc765c9c891328f30f826e3e4808e31f5d0f87c4ba12bbd1622"}, - {file = "coverage-7.8.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ab9b09a2349f58e73f8ebc06fac546dd623e23b063e5398343c5270072e3201c"}, - {file = "coverage-7.8.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:fd51355ab8a372d89fb0e6a31719e825cf8df8b6724bee942fb5b92c3f016ba3"}, - {file = "coverage-7.8.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0774df1e093acb6c9e4d58bce7f86656aeed6c132a16e2337692c12786b32404"}, - {file = "coverage-7.8.2-cp310-cp310-win32.whl", hash = "sha256:00f2e2f2e37f47e5f54423aeefd6c32a7dbcedc033fcd3928a4f4948e8b96af7"}, - {file = "coverage-7.8.2-cp310-cp310-win_amd64.whl", hash = 
"sha256:145b07bea229821d51811bf15eeab346c236d523838eda395ea969d120d13347"}, - {file = "coverage-7.8.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b99058eef42e6a8dcd135afb068b3d53aff3921ce699e127602efff9956457a9"}, - {file = "coverage-7.8.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5feb7f2c3e6ea94d3b877def0270dff0947b8d8c04cfa34a17be0a4dc1836879"}, - {file = "coverage-7.8.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:670a13249b957bb9050fab12d86acef7bf8f6a879b9d1a883799276e0d4c674a"}, - {file = "coverage-7.8.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0bdc8bf760459a4a4187b452213e04d039990211f98644c7292adf1e471162b5"}, - {file = "coverage-7.8.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07a989c867986c2a75f158f03fdb413128aad29aca9d4dbce5fc755672d96f11"}, - {file = "coverage-7.8.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2db10dedeb619a771ef0e2949ccba7b75e33905de959c2643a4607bef2f3fb3a"}, - {file = "coverage-7.8.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e6ea7dba4e92926b7b5f0990634b78ea02f208d04af520c73a7c876d5a8d36cb"}, - {file = "coverage-7.8.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ef2f22795a7aca99fc3c84393a55a53dd18ab8c93fb431004e4d8f0774150f54"}, - {file = "coverage-7.8.2-cp311-cp311-win32.whl", hash = "sha256:641988828bc18a6368fe72355df5f1703e44411adbe49bba5644b941ce6f2e3a"}, - {file = "coverage-7.8.2-cp311-cp311-win_amd64.whl", hash = "sha256:8ab4a51cb39dc1933ba627e0875046d150e88478dbe22ce145a68393e9652975"}, - {file = "coverage-7.8.2-cp311-cp311-win_arm64.whl", hash = "sha256:8966a821e2083c74d88cca5b7dcccc0a3a888a596a04c0b9668a891de3a0cc53"}, - {file = "coverage-7.8.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:e2f6fe3654468d061942591aef56686131335b7a8325684eda85dacdf311356c"}, - {file = "coverage-7.8.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:76090fab50610798cc05241bf83b603477c40ee87acd358b66196ab0ca44ffa1"}, - {file = "coverage-7.8.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd0a0a5054be160777a7920b731a0570284db5142abaaf81bcbb282b8d99279"}, - {file = "coverage-7.8.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:da23ce9a3d356d0affe9c7036030b5c8f14556bd970c9b224f9c8205505e3b99"}, - {file = "coverage-7.8.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c9392773cffeb8d7e042a7b15b82a414011e9d2b5fdbbd3f7e6a6b17d5e21b20"}, - {file = "coverage-7.8.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:876cbfd0b09ce09d81585d266c07a32657beb3eaec896f39484b631555be0fe2"}, - {file = "coverage-7.8.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3da9b771c98977a13fbc3830f6caa85cae6c9c83911d24cb2d218e9394259c57"}, - {file = "coverage-7.8.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:9a990f6510b3292686713bfef26d0049cd63b9c7bb17e0864f133cbfd2e6167f"}, - {file = "coverage-7.8.2-cp312-cp312-win32.whl", hash = "sha256:bf8111cddd0f2b54d34e96613e7fbdd59a673f0cf5574b61134ae75b6f5a33b8"}, - {file = "coverage-7.8.2-cp312-cp312-win_amd64.whl", hash = "sha256:86a323a275e9e44cdf228af9b71c5030861d4d2610886ab920d9945672a81223"}, - {file = "coverage-7.8.2-cp312-cp312-win_arm64.whl", hash = "sha256:820157de3a589e992689ffcda8639fbabb313b323d26388d02e154164c57b07f"}, - {file = 
"coverage-7.8.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ea561010914ec1c26ab4188aef8b1567272ef6de096312716f90e5baa79ef8ca"}, - {file = "coverage-7.8.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cb86337a4fcdd0e598ff2caeb513ac604d2f3da6d53df2c8e368e07ee38e277d"}, - {file = "coverage-7.8.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:26a4636ddb666971345541b59899e969f3b301143dd86b0ddbb570bd591f1e85"}, - {file = "coverage-7.8.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5040536cf9b13fb033f76bcb5e1e5cb3b57c4807fef37db9e0ed129c6a094257"}, - {file = "coverage-7.8.2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc67994df9bcd7e0150a47ef41278b9e0a0ea187caba72414b71dc590b99a108"}, - {file = "coverage-7.8.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e6c86888fd076d9e0fe848af0a2142bf606044dc5ceee0aa9eddb56e26895a0"}, - {file = "coverage-7.8.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:684ca9f58119b8e26bef860db33524ae0365601492e86ba0b71d513f525e7050"}, - {file = "coverage-7.8.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8165584ddedb49204c4e18da083913bdf6a982bfb558632a79bdaadcdafd0d48"}, - {file = "coverage-7.8.2-cp313-cp313-win32.whl", hash = "sha256:34759ee2c65362163699cc917bdb2a54114dd06d19bab860725f94ef45a3d9b7"}, - {file = "coverage-7.8.2-cp313-cp313-win_amd64.whl", hash = "sha256:2f9bc608fbafaee40eb60a9a53dbfb90f53cc66d3d32c2849dc27cf5638a21e3"}, - {file = "coverage-7.8.2-cp313-cp313-win_arm64.whl", hash = "sha256:9fe449ee461a3b0c7105690419d0b0aba1232f4ff6d120a9e241e58a556733f7"}, - {file = "coverage-7.8.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:8369a7c8ef66bded2b6484053749ff220dbf83cba84f3398c84c51a6f748a008"}, - {file = "coverage-7.8.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:159b81df53a5fcbc7d45dae3adad554fdbde9829a994e15227b3f9d816d00b36"}, - {file = "coverage-7.8.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e6fcbbd35a96192d042c691c9e0c49ef54bd7ed865846a3c9d624c30bb67ce46"}, - {file = "coverage-7.8.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:05364b9cc82f138cc86128dc4e2e1251c2981a2218bfcd556fe6b0fbaa3501be"}, - {file = "coverage-7.8.2-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46d532db4e5ff3979ce47d18e2fe8ecad283eeb7367726da0e5ef88e4fe64740"}, - {file = "coverage-7.8.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4000a31c34932e7e4fa0381a3d6deb43dc0c8f458e3e7ea6502e6238e10be625"}, - {file = "coverage-7.8.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:43ff5033d657cd51f83015c3b7a443287250dc14e69910577c3e03bd2e06f27b"}, - {file = "coverage-7.8.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:94316e13f0981cbbba132c1f9f365cac1d26716aaac130866ca812006f662199"}, - {file = "coverage-7.8.2-cp313-cp313t-win32.whl", hash = "sha256:3f5673888d3676d0a745c3d0e16da338c5eea300cb1f4ada9c872981265e76d8"}, - {file = "coverage-7.8.2-cp313-cp313t-win_amd64.whl", hash = "sha256:2c08b05ee8d7861e45dc5a2cc4195c8c66dca5ac613144eb6ebeaff2d502e73d"}, - {file = "coverage-7.8.2-cp313-cp313t-win_arm64.whl", hash = "sha256:1e1448bb72b387755e1ff3ef1268a06617afd94188164960dba8d0245a46004b"}, - {file = "coverage-7.8.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:496948261eaac5ac9cf43f5d0a9f6eb7a6d4cb3bedb2c5d294138142f5c18f2a"}, - {file = "coverage-7.8.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:eacd2de0d30871eff893bab0b67840a96445edcb3c8fd915e6b11ac4b2f3fa6d"}, - {file = "coverage-7.8.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b039ffddc99ad65d5078ef300e0c7eed08c270dc26570440e3ef18beb816c1ca"}, - {file = "coverage-7.8.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0e49824808d4375ede9dd84e9961a59c47f9113039f1a525e6be170aa4f5c34d"}, - {file = "coverage-7.8.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b069938961dfad881dc2f8d02b47645cd2f455d3809ba92a8a687bf513839787"}, - {file = "coverage-7.8.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:de77c3ba8bb686d1c411e78ee1b97e6e0b963fb98b1637658dd9ad2c875cf9d7"}, - {file = "coverage-7.8.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1676628065a498943bd3f64f099bb573e08cf1bc6088bbe33cf4424e0876f4b3"}, - {file = "coverage-7.8.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:8e1a26e7e50076e35f7afafde570ca2b4d7900a491174ca357d29dece5aacee7"}, - {file = "coverage-7.8.2-cp39-cp39-win32.whl", hash = "sha256:6782a12bf76fa61ad9350d5a6ef5f3f020b57f5e6305cbc663803f2ebd0f270a"}, - {file = "coverage-7.8.2-cp39-cp39-win_amd64.whl", hash = "sha256:1efa4166ba75ccefd647f2d78b64f53f14fb82622bc94c5a5cb0a622f50f1c9e"}, - {file = "coverage-7.8.2-pp39.pp310.pp311-none-any.whl", hash = "sha256:ec455eedf3ba0bbdf8f5a570012617eb305c63cb9f03428d39bf544cb2b94837"}, - {file = "coverage-7.8.2-py3-none-any.whl", hash = "sha256:726f32ee3713f7359696331a18daf0c3b3a70bb0ae71141b9d3c52be7c595e32"}, - {file = "coverage-7.8.2.tar.gz", hash = "sha256:a886d531373a1f6ff9fad2a2ba4a045b68467b779ae729ee0b3b10ac20033b27"}, -] - -[package.extras] -toml = ["tomli"] +python-versions = ">=3.10" +groups = ["dev"] +files = [ + {file = "coverage-7.11.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0c986537abca9b064510f3fd104ba33e98d3036608c7f2f5537f869bc10e1ee5"}, + {file = "coverage-7.11.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:28c5251b3ab1d23e66f1130ca0c419747edfbcb4690de19467cd616861507af7"}, + {file = "coverage-7.11.3-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:4f2bb4ee8dd40f9b2a80bb4adb2aecece9480ba1fa60d9382e8c8e0bd558e2eb"}, + {file = "coverage-7.11.3-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:e5f4bfac975a2138215a38bda599ef00162e4143541cf7dd186da10a7f8e69f1"}, + {file = "coverage-7.11.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8f4cbfff5cf01fa07464439a8510affc9df281535f41a1f5312fbd2b59b4ab5c"}, + {file = "coverage-7.11.3-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:31663572f20bf3406d7ac00d6981c7bbbcec302539d26b5ac596ca499664de31"}, + {file = "coverage-7.11.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9799bd6a910961cb666196b8583ed0ee125fa225c6fdee2cbf00232b861f29d2"}, + {file = "coverage-7.11.3-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:097acc18bedf2c6e3144eaf09b5f6034926c3c9bb9e10574ffd0942717232507"}, + {file = "coverage-7.11.3-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:6f033dec603eea88204589175782290a038b436105a8f3637a81c4359df27832"}, + {file = "coverage-7.11.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = 
"sha256:dd9ca2d44ed8018c90efb72f237a2a140325a4c3339971364d758e78b175f58e"}, + {file = "coverage-7.11.3-cp310-cp310-win32.whl", hash = "sha256:900580bc99c145e2561ea91a2d207e639171870d8a18756eb57db944a017d4bb"}, + {file = "coverage-7.11.3-cp310-cp310-win_amd64.whl", hash = "sha256:c8be5bfcdc7832011b2652db29ed7672ce9d353dd19bce5272ca33dbcf60aaa8"}, + {file = "coverage-7.11.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:200bb89fd2a8a07780eafcdff6463104dec459f3c838d980455cfa84f5e5e6e1"}, + {file = "coverage-7.11.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8d264402fc179776d43e557e1ca4a7d953020d3ee95f7ec19cc2c9d769277f06"}, + {file = "coverage-7.11.3-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:385977d94fc155f8731c895accdfcc3dd0d9dd9ef90d102969df95d3c637ab80"}, + {file = "coverage-7.11.3-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:0542ddf6107adbd2592f29da9f59f5d9cff7947b5bb4f734805085c327dcffaa"}, + {file = "coverage-7.11.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d60bf4d7f886989ddf80e121a7f4d140d9eac91f1d2385ce8eb6bda93d563297"}, + {file = "coverage-7.11.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c0a3b6e32457535df0d41d2d895da46434706dd85dbaf53fbc0d3bd7d914b362"}, + {file = "coverage-7.11.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:876a3ee7fd2613eb79602e4cdb39deb6b28c186e76124c3f29e580099ec21a87"}, + {file = "coverage-7.11.3-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:a730cd0824e8083989f304e97b3f884189efb48e2151e07f57e9e138ab104200"}, + {file = "coverage-7.11.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:b5cd111d3ab7390be0c07ad839235d5ad54d2ca497b5f5db86896098a77180a4"}, + {file = "coverage-7.11.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:074e6a5cd38e06671580b4d872c1a67955d4e69639e4b04e87fc03b494c1f060"}, + {file = "coverage-7.11.3-cp311-cp311-win32.whl", hash = "sha256:86d27d2dd7c7c5a44710565933c7dc9cd70e65ef97142e260d16d555667deef7"}, + {file = "coverage-7.11.3-cp311-cp311-win_amd64.whl", hash = "sha256:ca90ef33a152205fb6f2f0c1f3e55c50df4ef049bb0940ebba666edd4cdebc55"}, + {file = "coverage-7.11.3-cp311-cp311-win_arm64.whl", hash = "sha256:56f909a40d68947ef726ce6a34eb38f0ed241ffbe55c5007c64e616663bcbafc"}, + {file = "coverage-7.11.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5b771b59ac0dfb7f139f70c85b42717ef400a6790abb6475ebac1ecee8de782f"}, + {file = "coverage-7.11.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:603c4414125fc9ae9000f17912dcfd3d3eb677d4e360b85206539240c96ea76e"}, + {file = "coverage-7.11.3-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:77ffb3b7704eb7b9b3298a01fe4509cef70117a52d50bcba29cffc5f53dd326a"}, + {file = "coverage-7.11.3-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:4d4ca49f5ba432b0755ebb0fc3a56be944a19a16bb33802264bbc7311622c0d1"}, + {file = "coverage-7.11.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:05fd3fb6edff0c98874d752013588836f458261e5eba587afe4c547bba544afd"}, + {file = "coverage-7.11.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0e920567f8c3a3ce68ae5a42cf7c2dc4bb6cc389f18bff2235dd8c03fa405de5"}, + {file = "coverage-7.11.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:4bec8c7160688bd5a34e65c82984b25409563134d63285d8943d0599efbc448e"}, + {file = "coverage-7.11.3-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:adb9b7b42c802bd8cb3927de8c1c26368ce50c8fdaa83a9d8551384d77537044"}, + {file = "coverage-7.11.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:c8f563b245b4ddb591e99f28e3cd140b85f114b38b7f95b2e42542f0603eb7d7"}, + {file = "coverage-7.11.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e2a96fdc7643c9517a317553aca13b5cae9bad9a5f32f4654ce247ae4d321405"}, + {file = "coverage-7.11.3-cp312-cp312-win32.whl", hash = "sha256:e8feeb5e8705835f0622af0fe7ff8d5cb388948454647086494d6c41ec142c2e"}, + {file = "coverage-7.11.3-cp312-cp312-win_amd64.whl", hash = "sha256:abb903ffe46bd319d99979cdba350ae7016759bb69f47882242f7b93f3356055"}, + {file = "coverage-7.11.3-cp312-cp312-win_arm64.whl", hash = "sha256:1451464fd855d9bd000c19b71bb7dafea9ab815741fb0bd9e813d9b671462d6f"}, + {file = "coverage-7.11.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84b892e968164b7a0498ddc5746cdf4e985700b902128421bb5cec1080a6ee36"}, + {file = "coverage-7.11.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f761dbcf45e9416ec4698e1a7649248005f0064ce3523a47402d1bff4af2779e"}, + {file = "coverage-7.11.3-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:1410bac9e98afd9623f53876fae7d8a5db9f5a0ac1c9e7c5188463cb4b3212e2"}, + {file = "coverage-7.11.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:004cdcea3457c0ea3233622cd3464c1e32ebba9b41578421097402bee6461b63"}, + {file = "coverage-7.11.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8f067ada2c333609b52835ca4d4868645d3b63ac04fb2b9a658c55bba7f667d3"}, + {file = "coverage-7.11.3-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:07bc7745c945a6d95676953e86ba7cebb9f11de7773951c387f4c07dc76d03f5"}, + {file = "coverage-7.11.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8bba7e4743e37484ae17d5c3b8eb1ce78b564cb91b7ace2e2182b25f0f764cb5"}, + {file = "coverage-7.11.3-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:fbffc22d80d86fbe456af9abb17f7a7766e7b2101f7edaacc3535501691563f7"}, + {file = "coverage-7.11.3-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:0dba4da36730e384669e05b765a2c49f39514dd3012fcc0398dd66fba8d746d5"}, + {file = "coverage-7.11.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ae12fe90b00b71a71b69f513773310782ce01d5f58d2ceb2b7c595ab9d222094"}, + {file = "coverage-7.11.3-cp313-cp313-win32.whl", hash = "sha256:12d821de7408292530b0d241468b698bce18dd12ecaf45316149f53877885f8c"}, + {file = "coverage-7.11.3-cp313-cp313-win_amd64.whl", hash = "sha256:6bb599052a974bb6cedfa114f9778fedfad66854107cf81397ec87cb9b8fbcf2"}, + {file = "coverage-7.11.3-cp313-cp313-win_arm64.whl", hash = "sha256:bb9d7efdb063903b3fdf77caec7b77c3066885068bdc0d44bc1b0c171033f944"}, + {file = "coverage-7.11.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:fb58da65e3339b3dbe266b607bb936efb983d86b00b03eb04c4ad5b442c58428"}, + {file = "coverage-7.11.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:8d16bbe566e16a71d123cd66382c1315fcd520c7573652a8074a8fe281b38c6a"}, + {file = "coverage-7.11.3-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:a8258f10059b5ac837232c589a350a2df4a96406d6d5f2a09ec587cbdd539655"}, + {file = 
"coverage-7.11.3-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:4c5627429f7fbff4f4131cfdd6abd530734ef7761116811a707b88b7e205afd7"}, + {file = "coverage-7.11.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:465695268414e149bab754c54b0c45c8ceda73dd4a5c3ba255500da13984b16d"}, + {file = "coverage-7.11.3-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:4ebcddfcdfb4c614233cff6e9a3967a09484114a8b2e4f2c7a62dc83676ba13f"}, + {file = "coverage-7.11.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:13b2066303a1c1833c654d2af0455bb009b6e1727b3883c9964bc5c2f643c1d0"}, + {file = "coverage-7.11.3-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:d8750dd20362a1b80e3cf84f58013d4672f89663aee457ea59336df50fab6739"}, + {file = "coverage-7.11.3-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:ab6212e62ea0e1006531a2234e209607f360d98d18d532c2fa8e403c1afbdd71"}, + {file = "coverage-7.11.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:a6b17c2b5e0b9bb7702449200f93e2d04cb04b1414c41424c08aa1e5d352da76"}, + {file = "coverage-7.11.3-cp313-cp313t-win32.whl", hash = "sha256:426559f105f644b69290ea414e154a0d320c3ad8a2bb75e62884731f69cf8e2c"}, + {file = "coverage-7.11.3-cp313-cp313t-win_amd64.whl", hash = "sha256:90a96fcd824564eae6137ec2563bd061d49a32944858d4bdbae5c00fb10e76ac"}, + {file = "coverage-7.11.3-cp313-cp313t-win_arm64.whl", hash = "sha256:1e33d0bebf895c7a0905fcfaff2b07ab900885fc78bba2a12291a2cfbab014cc"}, + {file = "coverage-7.11.3-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:fdc5255eb4815babcdf236fa1a806ccb546724c8a9b129fd1ea4a5448a0bf07c"}, + {file = "coverage-7.11.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:fe3425dc6021f906c6325d3c415e048e7cdb955505a94f1eb774dafc779ba203"}, + {file = "coverage-7.11.3-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:4ca5f876bf41b24378ee67c41d688155f0e54cdc720de8ef9ad6544005899240"}, + {file = "coverage-7.11.3-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9061a3e3c92b27fd8036dafa26f25d95695b6aa2e4514ab16a254f297e664f83"}, + {file = "coverage-7.11.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:abcea3b5f0dc44e1d01c27090bc32ce6ffb7aa665f884f1890710454113ea902"}, + {file = "coverage-7.11.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:68c4eb92997dbaaf839ea13527be463178ac0ddd37a7ac636b8bc11a51af2428"}, + {file = "coverage-7.11.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:149eccc85d48c8f06547534068c41d69a1a35322deaa4d69ba1561e2e9127e75"}, + {file = "coverage-7.11.3-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:08c0bcf932e47795c49f0406054824b9d45671362dfc4269e0bc6e4bff010704"}, + {file = "coverage-7.11.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:39764c6167c82d68a2d8c97c33dba45ec0ad9172570860e12191416f4f8e6e1b"}, + {file = "coverage-7.11.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:3224c7baf34e923ffc78cb45e793925539d640d42c96646db62dbd61bbcfa131"}, + {file = "coverage-7.11.3-cp314-cp314-win32.whl", hash = "sha256:c713c1c528284d636cd37723b0b4c35c11190da6f932794e145fc40f8210a14a"}, + {file = "coverage-7.11.3-cp314-cp314-win_amd64.whl", hash = "sha256:c381a252317f63ca0179d2c7918e83b99a4ff3101e1b24849b999a00f9cd4f86"}, + {file = "coverage-7.11.3-cp314-cp314-win_arm64.whl", hash = 
"sha256:3e33a968672be1394eded257ec10d4acbb9af2ae263ba05a99ff901bb863557e"}, + {file = "coverage-7.11.3-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:f9c96a29c6d65bd36a91f5634fef800212dff69dacdb44345c4c9783943ab0df"}, + {file = "coverage-7.11.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2ec27a7a991d229213c8070d31e3ecf44d005d96a9edc30c78eaeafaa421c001"}, + {file = "coverage-7.11.3-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:72c8b494bd20ae1c58528b97c4a67d5cfeafcb3845c73542875ecd43924296de"}, + {file = "coverage-7.11.3-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:60ca149a446da255d56c2a7a813b51a80d9497a62250532598d249b3cdb1a926"}, + {file = "coverage-7.11.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb5069074db19a534de3859c43eec78e962d6d119f637c41c8e028c5ab3f59dd"}, + {file = "coverage-7.11.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:ac5d5329c9c942bbe6295f4251b135d860ed9f86acd912d418dce186de7c19ac"}, + {file = "coverage-7.11.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e22539b676fafba17f0a90ac725f029a309eb6e483f364c86dcadee060429d46"}, + {file = "coverage-7.11.3-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:2376e8a9c889016f25472c452389e98bc6e54a19570b107e27cde9d47f387b64"}, + {file = "coverage-7.11.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:4234914b8c67238a3c4af2bba648dc716aa029ca44d01f3d51536d44ac16854f"}, + {file = "coverage-7.11.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:f0b4101e2b3c6c352ff1f70b3a6fcc7c17c1ab1a91ccb7a33013cb0782af9820"}, + {file = "coverage-7.11.3-cp314-cp314t-win32.whl", hash = "sha256:305716afb19133762e8cf62745c46c4853ad6f9eeba54a593e373289e24ea237"}, + {file = "coverage-7.11.3-cp314-cp314t-win_amd64.whl", hash = "sha256:9245bd392572b9f799261c4c9e7216bafc9405537d0f4ce3ad93afe081a12dc9"}, + {file = "coverage-7.11.3-cp314-cp314t-win_arm64.whl", hash = "sha256:9a1d577c20b4334e5e814c3d5fe07fa4a8c3ae42a601945e8d7940bab811d0bd"}, + {file = "coverage-7.11.3-py3-none-any.whl", hash = "sha256:351511ae28e2509c8d8cae5311577ea7dd511ab8e746ffc8814a0896c3d33fbe"}, + {file = "coverage-7.11.3.tar.gz", hash = "sha256:0f59387f5e6edbbffec2281affb71cdc85e0776c1745150a3ab9b6c1d016106b"}, +] + +[package.extras] +toml = ["tomli ; python_full_version <= \"3.11.0a6\""] + +[[package]] +name = "cryptography" +version = "46.0.3" +description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
+optional = false +python-versions = "!=3.9.0,!=3.9.1,>=3.8" +groups = ["main"] +files = [ + {file = "cryptography-46.0.3-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:109d4ddfadf17e8e7779c39f9b18111a09efb969a301a31e987416a0191ed93a"}, + {file = "cryptography-46.0.3-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:09859af8466b69bc3c27bdf4f5d84a665e0f7ab5088412e9e2ec49758eca5cbc"}, + {file = "cryptography-46.0.3-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:01ca9ff2885f3acc98c29f1860552e37f6d7c7d013d7334ff2a9de43a449315d"}, + {file = "cryptography-46.0.3-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:6eae65d4c3d33da080cff9c4ab1f711b15c1d9760809dad6ea763f3812d254cb"}, + {file = "cryptography-46.0.3-cp311-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e5bf0ed4490068a2e72ac03d786693adeb909981cc596425d09032d372bcc849"}, + {file = "cryptography-46.0.3-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:5ecfccd2329e37e9b7112a888e76d9feca2347f12f37918facbb893d7bb88ee8"}, + {file = "cryptography-46.0.3-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a2c0cd47381a3229c403062f764160d57d4d175e022c1df84e168c6251a22eec"}, + {file = "cryptography-46.0.3-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:549e234ff32571b1f4076ac269fcce7a808d3bf98b76c8dd560e42dbc66d7d91"}, + {file = "cryptography-46.0.3-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:c0a7bb1a68a5d3471880e264621346c48665b3bf1c3759d682fc0864c540bd9e"}, + {file = "cryptography-46.0.3-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:10b01676fc208c3e6feeb25a8b83d81767e8059e1fe86e1dc62d10a3018fa926"}, + {file = "cryptography-46.0.3-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:0abf1ffd6e57c67e92af68330d05760b7b7efb243aab8377e583284dbab72c71"}, + {file = "cryptography-46.0.3-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a04bee9ab6a4da801eb9b51f1b708a1b5b5c9eb48c03f74198464c66f0d344ac"}, + {file = "cryptography-46.0.3-cp311-abi3-win32.whl", hash = "sha256:f260d0d41e9b4da1ed1e0f1ce571f97fe370b152ab18778e9e8f67d6af432018"}, + {file = "cryptography-46.0.3-cp311-abi3-win_amd64.whl", hash = "sha256:a9a3008438615669153eb86b26b61e09993921ebdd75385ddd748702c5adfddb"}, + {file = "cryptography-46.0.3-cp311-abi3-win_arm64.whl", hash = "sha256:5d7f93296ee28f68447397bf5198428c9aeeab45705a55d53a6343455dcb2c3c"}, + {file = "cryptography-46.0.3-cp314-cp314t-macosx_10_9_universal2.whl", hash = "sha256:00a5e7e87938e5ff9ff5447ab086a5706a957137e6e433841e9d24f38a065217"}, + {file = "cryptography-46.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c8daeb2d2174beb4575b77482320303f3d39b8e81153da4f0fb08eb5fe86a6c5"}, + {file = "cryptography-46.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:39b6755623145ad5eff1dab323f4eae2a32a77a7abef2c5089a04a3d04366715"}, + {file = "cryptography-46.0.3-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:db391fa7c66df6762ee3f00c95a89e6d428f4d60e7abc8328f4fe155b5ac6e54"}, + {file = "cryptography-46.0.3-cp314-cp314t-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:78a97cf6a8839a48c49271cdcbd5cf37ca2c1d6b7fdd86cc864f302b5e9bf459"}, + {file = "cryptography-46.0.3-cp314-cp314t-manylinux_2_28_ppc64le.whl", hash = "sha256:dfb781ff7eaa91a6f7fd41776ec37c5853c795d3b358d4896fdbb5df168af422"}, + {file = "cryptography-46.0.3-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = 
"sha256:6f61efb26e76c45c4a227835ddeae96d83624fb0d29eb5df5b96e14ed1a0afb7"}, + {file = "cryptography-46.0.3-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:23b1a8f26e43f47ceb6d6a43115f33a5a37d57df4ea0ca295b780ae8546e8044"}, + {file = "cryptography-46.0.3-cp314-cp314t-manylinux_2_34_ppc64le.whl", hash = "sha256:b419ae593c86b87014b9be7396b385491ad7f320bde96826d0dd174459e54665"}, + {file = "cryptography-46.0.3-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:50fc3343ac490c6b08c0cf0d704e881d0d660be923fd3076db3e932007e726e3"}, + {file = "cryptography-46.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:22d7e97932f511d6b0b04f2bfd818d73dcd5928db509460aaf48384778eb6d20"}, + {file = "cryptography-46.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d55f3dffadd674514ad19451161118fd010988540cee43d8bc20675e775925de"}, + {file = "cryptography-46.0.3-cp314-cp314t-win32.whl", hash = "sha256:8a6e050cb6164d3f830453754094c086ff2d0b2f3a897a1d9820f6139a1f0914"}, + {file = "cryptography-46.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:760f83faa07f8b64e9c33fc963d790a2edb24efb479e3520c14a45741cd9b2db"}, + {file = "cryptography-46.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:516ea134e703e9fe26bcd1277a4b59ad30586ea90c365a87781d7887a646fe21"}, + {file = "cryptography-46.0.3-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:cb3d760a6117f621261d662bccc8ef5bc32ca673e037c83fbe565324f5c46936"}, + {file = "cryptography-46.0.3-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4b7387121ac7d15e550f5cb4a43aef2559ed759c35df7336c402bb8275ac9683"}, + {file = "cryptography-46.0.3-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:15ab9b093e8f09daab0f2159bb7e47532596075139dd74365da52ecc9cb46c5d"}, + {file = "cryptography-46.0.3-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:46acf53b40ea38f9c6c229599a4a13f0d46a6c3fa9ef19fc1a124d62e338dfa0"}, + {file = "cryptography-46.0.3-cp38-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:10ca84c4668d066a9878890047f03546f3ae0a6b8b39b697457b7757aaf18dbc"}, + {file = "cryptography-46.0.3-cp38-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:36e627112085bb3b81b19fed209c05ce2a52ee8b15d161b7c643a7d5a88491f3"}, + {file = "cryptography-46.0.3-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:1000713389b75c449a6e979ffc7dcc8ac90b437048766cef052d4d30b8220971"}, + {file = "cryptography-46.0.3-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:b02cf04496f6576afffef5ddd04a0cb7d49cf6be16a9059d793a30b035f6b6ac"}, + {file = "cryptography-46.0.3-cp38-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:71e842ec9bc7abf543b47cf86b9a743baa95f4677d22baa4c7d5c69e49e9bc04"}, + {file = "cryptography-46.0.3-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:402b58fc32614f00980b66d6e56a5b4118e6cb362ae8f3fda141ba4689bd4506"}, + {file = "cryptography-46.0.3-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ef639cb3372f69ec44915fafcd6698b6cc78fbe0c2ea41be867f6ed612811963"}, + {file = "cryptography-46.0.3-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:3b51b8ca4f1c6453d8829e1eb7299499ca7f313900dd4d89a24b8b87c0a780d4"}, + {file = "cryptography-46.0.3-cp38-abi3-win32.whl", hash = "sha256:6276eb85ef938dc035d59b87c8a7dc559a232f954962520137529d77b18ff1df"}, + {file = "cryptography-46.0.3-cp38-abi3-win_amd64.whl", hash = "sha256:416260257577718c05135c55958b674000baef9a1c7d9e8f306ec60d71db850f"}, + {file = "cryptography-46.0.3-cp38-abi3-win_arm64.whl", hash = 
"sha256:d89c3468de4cdc4f08a57e214384d0471911a3830fcdaf7a8cc587e42a866372"}, + {file = "cryptography-46.0.3-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a23582810fedb8c0bc47524558fb6c56aac3fc252cb306072fd2815da2a47c32"}, + {file = "cryptography-46.0.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:e7aec276d68421f9574040c26e2a7c3771060bc0cff408bae1dcb19d3ab1e63c"}, + {file = "cryptography-46.0.3-pp311-pypy311_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7ce938a99998ed3c8aa7e7272dca1a610401ede816d36d0693907d863b10d9ea"}, + {file = "cryptography-46.0.3-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:191bb60a7be5e6f54e30ba16fdfae78ad3a342a0599eb4193ba88e3f3d6e185b"}, + {file = "cryptography-46.0.3-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c70cc23f12726be8f8bc72e41d5065d77e4515efae3690326764ea1b07845cfb"}, + {file = "cryptography-46.0.3-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:9394673a9f4de09e28b5356e7fff97d778f8abad85c9d5ac4a4b7e25a0de7717"}, + {file = "cryptography-46.0.3-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:94cd0549accc38d1494e1f8de71eca837d0509d0d44bf11d158524b0e12cebf9"}, + {file = "cryptography-46.0.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:6b5063083824e5509fdba180721d55909ffacccc8adbec85268b48439423d78c"}, + {file = "cryptography-46.0.3.tar.gz", hash = "sha256:a8b17438104fed022ce745b362294d9ce35b4c2e45c1d958ad4a4b019285f4a1"}, +] + +[package.dependencies] +cffi = {version = ">=2.0.0", markers = "python_full_version >= \"3.9.0\" and platform_python_implementation != \"PyPy\""} + +[package.extras] +docs = ["sphinx (>=5.3.0)", "sphinx-inline-tabs", "sphinx-rtd-theme (>=3.0.0)"] +docstest = ["pyenchant (>=3)", "readme-renderer (>=30.0)", "sphinxcontrib-spelling (>=7.3.1)"] +nox = ["nox[uv] (>=2024.4.15)"] +pep8test = ["check-sdist", "click (>=8.0.1)", "mypy (>=1.14)", "ruff (>=0.11.11)"] +sdist = ["build (>=1.0.0)"] +ssh = ["bcrypt (>=3.1.5)"] +test = ["certifi (>=2024)", "cryptography-vectors (==46.0.3)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"] +test-randomorder = ["pytest-randomly"] [[package]] name = "cycler" @@ -909,6 +1123,7 @@ version = "0.12.1" description = "Composable style cycles" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30"}, {file = "cycler-0.12.1.tar.gz", hash = "sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c"}, @@ -920,23 +1135,25 @@ tests = ["pytest", "pytest-cov", "pytest-xdist"] [[package]] name = "databricks-sdk" -version = "0.56.0" +version = "0.73.0" description = "Databricks SDK for Python (Beta)" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ - {file = "databricks_sdk-0.56.0-py3-none-any.whl", hash = "sha256:0b0036a3b48b59a70f07968658379ceecd47a7cd55646d519fe1a1c3cbc60c02"}, - {file = "databricks_sdk-0.56.0.tar.gz", hash = "sha256:3d701eba806f07a0203f97634789195495a1104f5061d2d7b76dc1669fe3c20c"}, + {file = "databricks_sdk-0.73.0-py3-none-any.whl", hash = "sha256:a4d3cfd19357a2b459d2dc3101454d7f0d1b62865ce099c35d0c342b66ac64ff"}, + {file = "databricks_sdk-0.73.0.tar.gz", hash = "sha256:db09eaaacd98e07dded78d3e7ab47d2f6c886e0380cb577977bd442bace8bd8d"}, ] [package.dependencies] google-auth = ">=2.0,<3.0" +protobuf = ">=4.25.8,<5.26.dev0 || >5.29.0,<5.29.1 || >5.29.1,<5.29.2 || >5.29.2,<5.29.3 
|| >5.29.3,<5.29.4 || >5.29.4,<6.30.0 || >6.30.0,<6.30.1 || >6.30.1,<6.31.0 || >6.31.0,<7.0" requests = ">=2.28.1,<3" [package.extras] -dev = ["autoflake", "black", "build", "databricks-connect", "httpx", "ipython", "ipywidgets", "isort", "langchain-openai", "openai", "pycodestyle", "pyfakefs", "pytest", "pytest-cov", "pytest-mock", "pytest-rerunfailures", "pytest-xdist", "requests-mock", "wheel"] +dev = ["autoflake", "black", "build", "databricks-connect", "httpx", "ipython", "ipywidgets", "isort", "langchain-openai ; python_version > \"3.7\"", "openai", "pycodestyle", "pyfakefs", "pytest", "pytest-cov", "pytest-mock", "pytest-rerunfailures", "pytest-xdist (>=3.6.1,<4.0)", "requests-mock", "wheel"] notebook = ["ipython (>=8,<10)", "ipywidgets (>=8,<9)"] -openai = ["httpx", "langchain-openai", "openai"] +openai = ["httpx", "langchain-openai ; python_version > \"3.7\"", "openai"] [[package]] name = "dataclasses-json" @@ -944,6 +1161,7 @@ version = "0.6.7" description = "Easily serialize dataclasses to and from JSON." optional = false python-versions = "<4.0,>=3.7" +groups = ["main"] files = [ {file = "dataclasses_json-0.6.7-py3-none-any.whl", hash = "sha256:0dbf33f26c8d5305befd61b39d2b3414e8a407bedc2834dea9b8d642666fb40a"}, {file = "dataclasses_json-0.6.7.tar.gz", hash = "sha256:b6b3e528266ea45b9535223bc53ca645f5208833c29229e847b3f26a1cc55fc0"}, @@ -955,37 +1173,42 @@ typing-inspect = ">=0.4.0,<1" [[package]] name = "debugpy" -version = "1.8.14" +version = "1.8.17" description = "An implementation of the Debug Adapter Protocol for Python" optional = false python-versions = ">=3.8" -files = [ - {file = "debugpy-1.8.14-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:93fee753097e85623cab1c0e6a68c76308cd9f13ffdf44127e6fab4fbf024339"}, - {file = "debugpy-1.8.14-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d937d93ae4fa51cdc94d3e865f535f185d5f9748efb41d0d49e33bf3365bd79"}, - {file = "debugpy-1.8.14-cp310-cp310-win32.whl", hash = "sha256:c442f20577b38cc7a9aafecffe1094f78f07fb8423c3dddb384e6b8f49fd2987"}, - {file = "debugpy-1.8.14-cp310-cp310-win_amd64.whl", hash = "sha256:f117dedda6d969c5c9483e23f573b38f4e39412845c7bc487b6f2648df30fe84"}, - {file = "debugpy-1.8.14-cp311-cp311-macosx_14_0_universal2.whl", hash = "sha256:1b2ac8c13b2645e0b1eaf30e816404990fbdb168e193322be8f545e8c01644a9"}, - {file = "debugpy-1.8.14-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cf431c343a99384ac7eab2f763980724834f933a271e90496944195318c619e2"}, - {file = "debugpy-1.8.14-cp311-cp311-win32.whl", hash = "sha256:c99295c76161ad8d507b413cd33422d7c542889fbb73035889420ac1fad354f2"}, - {file = "debugpy-1.8.14-cp311-cp311-win_amd64.whl", hash = "sha256:7816acea4a46d7e4e50ad8d09d963a680ecc814ae31cdef3622eb05ccacf7b01"}, - {file = "debugpy-1.8.14-cp312-cp312-macosx_14_0_universal2.whl", hash = "sha256:8899c17920d089cfa23e6005ad9f22582fd86f144b23acb9feeda59e84405b84"}, - {file = "debugpy-1.8.14-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6bb5c0dcf80ad5dbc7b7d6eac484e2af34bdacdf81df09b6a3e62792b722826"}, - {file = "debugpy-1.8.14-cp312-cp312-win32.whl", hash = "sha256:281d44d248a0e1791ad0eafdbbd2912ff0de9eec48022a5bfbc332957487ed3f"}, - {file = "debugpy-1.8.14-cp312-cp312-win_amd64.whl", hash = "sha256:5aa56ef8538893e4502a7d79047fe39b1dae08d9ae257074c6464a7b290b806f"}, - {file = 
"debugpy-1.8.14-cp313-cp313-macosx_14_0_universal2.whl", hash = "sha256:329a15d0660ee09fec6786acdb6e0443d595f64f5d096fc3e3ccf09a4259033f"}, - {file = "debugpy-1.8.14-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f920c7f9af409d90f5fd26e313e119d908b0dd2952c2393cd3247a462331f15"}, - {file = "debugpy-1.8.14-cp313-cp313-win32.whl", hash = "sha256:3784ec6e8600c66cbdd4ca2726c72d8ca781e94bce2f396cc606d458146f8f4e"}, - {file = "debugpy-1.8.14-cp313-cp313-win_amd64.whl", hash = "sha256:684eaf43c95a3ec39a96f1f5195a7ff3d4144e4a18d69bb66beeb1a6de605d6e"}, - {file = "debugpy-1.8.14-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:d5582bcbe42917bc6bbe5c12db1bffdf21f6bfc28d4554b738bf08d50dc0c8c3"}, - {file = "debugpy-1.8.14-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5349b7c3735b766a281873fbe32ca9cca343d4cc11ba4a743f84cb854339ff35"}, - {file = "debugpy-1.8.14-cp38-cp38-win32.whl", hash = "sha256:7118d462fe9724c887d355eef395fae68bc764fd862cdca94e70dcb9ade8a23d"}, - {file = "debugpy-1.8.14-cp38-cp38-win_amd64.whl", hash = "sha256:d235e4fa78af2de4e5609073972700523e372cf5601742449970110d565ca28c"}, - {file = "debugpy-1.8.14-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:413512d35ff52c2fb0fd2d65e69f373ffd24f0ecb1fac514c04a668599c5ce7f"}, - {file = "debugpy-1.8.14-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c9156f7524a0d70b7a7e22b2e311d8ba76a15496fb00730e46dcdeedb9e1eea"}, - {file = "debugpy-1.8.14-cp39-cp39-win32.whl", hash = "sha256:b44985f97cc3dd9d52c42eb59ee9d7ee0c4e7ecd62bca704891f997de4cef23d"}, - {file = "debugpy-1.8.14-cp39-cp39-win_amd64.whl", hash = "sha256:b1528cfee6c1b1c698eb10b6b096c598738a8238822d218173d21c3086de8123"}, - {file = "debugpy-1.8.14-py2.py3-none-any.whl", hash = "sha256:5cd9a579d553b6cb9759a7908a41988ee6280b961f24f63336835d9418216a20"}, - {file = "debugpy-1.8.14.tar.gz", hash = "sha256:7cd287184318416850aa8b60ac90105837bb1e59531898c07569d197d2ed5322"}, +groups = ["dev"] +files = [ + {file = "debugpy-1.8.17-cp310-cp310-macosx_15_0_x86_64.whl", hash = "sha256:c41d2ce8bbaddcc0009cc73f65318eedfa3dbc88a8298081deb05389f1ab5542"}, + {file = "debugpy-1.8.17-cp310-cp310-manylinux_2_34_x86_64.whl", hash = "sha256:1440fd514e1b815edd5861ca394786f90eb24960eb26d6f7200994333b1d79e3"}, + {file = "debugpy-1.8.17-cp310-cp310-win32.whl", hash = "sha256:3a32c0af575749083d7492dc79f6ab69f21b2d2ad4cd977a958a07d5865316e4"}, + {file = "debugpy-1.8.17-cp310-cp310-win_amd64.whl", hash = "sha256:a3aad0537cf4d9c1996434be68c6c9a6d233ac6f76c2a482c7803295b4e4f99a"}, + {file = "debugpy-1.8.17-cp311-cp311-macosx_15_0_universal2.whl", hash = "sha256:d3fce3f0e3de262a3b67e69916d001f3e767661c6e1ee42553009d445d1cd840"}, + {file = "debugpy-1.8.17-cp311-cp311-manylinux_2_34_x86_64.whl", hash = "sha256:c6bdf134457ae0cac6fb68205776be635d31174eeac9541e1d0c062165c6461f"}, + {file = "debugpy-1.8.17-cp311-cp311-win32.whl", hash = "sha256:e79a195f9e059edfe5d8bf6f3749b2599452d3e9380484cd261f6b7cd2c7c4da"}, + {file = "debugpy-1.8.17-cp311-cp311-win_amd64.whl", hash = "sha256:b532282ad4eca958b1b2d7dbcb2b7218e02cb934165859b918e3b6ba7772d3f4"}, + {file = "debugpy-1.8.17-cp312-cp312-macosx_15_0_universal2.whl", hash = "sha256:f14467edef672195c6f6b8e27ce5005313cb5d03c9239059bc7182b60c176e2d"}, + {file = "debugpy-1.8.17-cp312-cp312-manylinux_2_34_x86_64.whl", hash = 
"sha256:24693179ef9dfa20dca8605905a42b392be56d410c333af82f1c5dff807a64cc"}, + {file = "debugpy-1.8.17-cp312-cp312-win32.whl", hash = "sha256:6a4e9dacf2cbb60d2514ff7b04b4534b0139facbf2abdffe0639ddb6088e59cf"}, + {file = "debugpy-1.8.17-cp312-cp312-win_amd64.whl", hash = "sha256:e8f8f61c518952fb15f74a302e068b48d9c4691768ade433e4adeea961993464"}, + {file = "debugpy-1.8.17-cp313-cp313-macosx_15_0_universal2.whl", hash = "sha256:857c1dd5d70042502aef1c6d1c2801211f3ea7e56f75e9c335f434afb403e464"}, + {file = "debugpy-1.8.17-cp313-cp313-manylinux_2_34_x86_64.whl", hash = "sha256:3bea3b0b12f3946e098cce9b43c3c46e317b567f79570c3f43f0b96d00788088"}, + {file = "debugpy-1.8.17-cp313-cp313-win32.whl", hash = "sha256:e34ee844c2f17b18556b5bbe59e1e2ff4e86a00282d2a46edab73fd7f18f4a83"}, + {file = "debugpy-1.8.17-cp313-cp313-win_amd64.whl", hash = "sha256:6c5cd6f009ad4fca8e33e5238210dc1e5f42db07d4b6ab21ac7ffa904a196420"}, + {file = "debugpy-1.8.17-cp314-cp314-macosx_15_0_universal2.whl", hash = "sha256:045290c010bcd2d82bc97aa2daf6837443cd52f6328592698809b4549babcee1"}, + {file = "debugpy-1.8.17-cp314-cp314-manylinux_2_34_x86_64.whl", hash = "sha256:b69b6bd9dba6a03632534cdf67c760625760a215ae289f7489a452af1031fe1f"}, + {file = "debugpy-1.8.17-cp314-cp314-win32.whl", hash = "sha256:5c59b74aa5630f3a5194467100c3b3d1c77898f9ab27e3f7dc5d40fc2f122670"}, + {file = "debugpy-1.8.17-cp314-cp314-win_amd64.whl", hash = "sha256:893cba7bb0f55161de4365584b025f7064e1f88913551bcd23be3260b231429c"}, + {file = "debugpy-1.8.17-cp38-cp38-macosx_15_0_x86_64.whl", hash = "sha256:8deb4e31cd575c9f9370042876e078ca118117c1b5e1f22c32befcfbb6955f0c"}, + {file = "debugpy-1.8.17-cp38-cp38-manylinux_2_34_x86_64.whl", hash = "sha256:b75868b675949a96ab51abc114c7163f40ff0d8f7d6d5fd63f8932fd38e9c6d7"}, + {file = "debugpy-1.8.17-cp38-cp38-win32.whl", hash = "sha256:17e456da14848d618662354e1dccfd5e5fb75deec3d1d48dc0aa0baacda55860"}, + {file = "debugpy-1.8.17-cp38-cp38-win_amd64.whl", hash = "sha256:e851beb536a427b5df8aa7d0c7835b29a13812f41e46292ff80b2ef77327355a"}, + {file = "debugpy-1.8.17-cp39-cp39-macosx_15_0_x86_64.whl", hash = "sha256:f2ac8055a0c4a09b30b931100996ba49ef334c6947e7ae365cdd870416d7513e"}, + {file = "debugpy-1.8.17-cp39-cp39-manylinux_2_34_x86_64.whl", hash = "sha256:eaa85bce251feca8e4c87ce3b954aba84b8c645b90f0e6a515c00394a9f5c0e7"}, + {file = "debugpy-1.8.17-cp39-cp39-win32.whl", hash = "sha256:b13eea5587e44f27f6c48588b5ad56dcb74a4f3a5f89250443c94587f3eb2ea1"}, + {file = "debugpy-1.8.17-cp39-cp39-win_amd64.whl", hash = "sha256:bb1bbf92317e1f35afcf3ef0450219efb3afe00be79d8664b250ac0933b9015f"}, + {file = "debugpy-1.8.17-py2.py3-none-any.whl", hash = "sha256:60c7dca6571efe660ccb7a9508d73ca14b8796c4ed484c2002abba714226cfef"}, + {file = "debugpy-1.8.17.tar.gz", hash = "sha256:fd723b47a8c08892b1a16b2c6239a8b96637c62a59b94bb5dab4bac592a58a8e"}, ] [[package]] @@ -994,6 +1217,7 @@ version = "5.2.1" description = "Decorators for Humans" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a"}, {file = "decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360"}, @@ -1005,6 +1229,7 @@ version = "0.7.1" description = "XML bomb protection for Python stdlib modules" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["dev"] files = [ {file = "defusedxml-0.7.1-py2.py3-none-any.whl", hash = 
"sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61"}, {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"}, @@ -1012,30 +1237,32 @@ files = [ [[package]] name = "deprecated" -version = "1.2.18" +version = "1.3.1" description = "Python @deprecated decorator to deprecate old python classes, functions or methods." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" +groups = ["main"] files = [ - {file = "Deprecated-1.2.18-py2.py3-none-any.whl", hash = "sha256:bd5011788200372a32418f888e326a09ff80d0214bd961147cfed01b5c018eec"}, - {file = "deprecated-1.2.18.tar.gz", hash = "sha256:422b6f6d859da6f2ef57857761bfb392480502a64c3028ca9bbe86085d72115d"}, + {file = "deprecated-1.3.1-py2.py3-none-any.whl", hash = "sha256:597bfef186b6f60181535a29fbe44865ce137a5079f295b479886c82729d5f3f"}, + {file = "deprecated-1.3.1.tar.gz", hash = "sha256:b1b50e0ff0c1fddaa5708a2c6b0a6588bb09b892825ab2b214ac9ea9d92a5223"}, ] [package.dependencies] -wrapt = ">=1.10,<2" +wrapt = ">=1.10,<3" [package.extras] -dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "setuptools", "tox"] +dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "setuptools ; python_version >= \"3.12\"", "tox"] [[package]] name = "distlib" -version = "0.3.9" +version = "0.4.0" description = "Distribution utilities" optional = false python-versions = "*" +groups = ["dev"] files = [ - {file = "distlib-0.3.9-py2.py3-none-any.whl", hash = "sha256:47f8c22fd27c27e25a65601af709b38e4f0a45ea4fc2e710f65755fa8caaaf87"}, - {file = "distlib-0.3.9.tar.gz", hash = "sha256:a60f20dea646b8a33f3e7772f74dc0b2d0772d2837ee1342a00645c81edf9403"}, + {file = "distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16"}, + {file = "distlib-0.4.0.tar.gz", hash = "sha256:feec40075be03a04501a973d81f633735b4b69f98b05450592310c0f401a4e0d"}, ] [[package]] @@ -1044,6 +1271,7 @@ version = "1.9.0" description = "Distro - an OS platform information API" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"}, {file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"}, @@ -1055,6 +1283,7 @@ version = "7.1.0" description = "A Python library for the Docker Engine API." 
optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0"}, {file = "docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c"}, @@ -1071,29 +1300,85 @@ docs = ["myst-parser (==0.18.0)", "sphinx (==5.1.1)"] ssh = ["paramiko (>=2.4.3)"] websockets = ["websocket-client (>=1.3.0)"] +[[package]] +name = "duckdb" +version = "1.4.3" +description = "DuckDB in-process database" +optional = false +python-versions = ">=3.9.0" +groups = ["main"] +files = [ + {file = "duckdb-1.4.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:efa7f1191c59e34b688fcd4e588c1b903a4e4e1f4804945902cf0b20e08a9001"}, + {file = "duckdb-1.4.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4fef6a053a1c485292000bf0c338bba60f89d334f6a06fc76ba4085a5a322b76"}, + {file = "duckdb-1.4.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:702dabbc22b27dc5b73e7599c60deef3d8c59968527c36b391773efddd8f4cf1"}, + {file = "duckdb-1.4.3-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:854b79375fa618f6ffa8d84fb45cbc9db887f6c4834076ea10d20bc106f1fd90"}, + {file = "duckdb-1.4.3-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1bb8bd5a3dd205983726185b280a211eacc9f5bc0c4d4505bec8c87ac33a8ccb"}, + {file = "duckdb-1.4.3-cp310-cp310-win_amd64.whl", hash = "sha256:d0ff08388ef8b1d1a4c95c321d6c5fa11201b241036b1ee740f9d841df3d6ba2"}, + {file = "duckdb-1.4.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:366bf607088053dce845c9d24c202c04d78022436cc5d8e4c9f0492de04afbe7"}, + {file = "duckdb-1.4.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:8d080e8d1bf2d226423ec781f539c8f6b6ef3fd42a9a58a7160de0a00877a21f"}, + {file = "duckdb-1.4.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9dc049ba7e906cb49ca2b6d4fbf7b6615ec3883193e8abb93f0bef2652e42dda"}, + {file = "duckdb-1.4.3-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2b30245375ea94ab528c87c61fc3ab3e36331180b16af92ee3a37b810a745d24"}, + {file = "duckdb-1.4.3-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a7c864df027da1ee95f0c32def67e15d02cd4a906c9c1cbae82c09c5112f526b"}, + {file = "duckdb-1.4.3-cp311-cp311-win_amd64.whl", hash = "sha256:813f189039b46877b5517f1909c7b94a8fe01b4bde2640ab217537ea0fe9b59b"}, + {file = "duckdb-1.4.3-cp311-cp311-win_arm64.whl", hash = "sha256:fbc63ffdd03835f660155b37a1b6db2005bcd46e5ad398b8cac141eb305d2a3d"}, + {file = "duckdb-1.4.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:6302452e57aef29aae3977063810ed7b2927967b97912947b9cca45c1c21955f"}, + {file = "duckdb-1.4.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:deab351ac43b6282a3270e3d40e3d57b3b50f472d9fd8c30975d88a31be41231"}, + {file = "duckdb-1.4.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5634e40e1e2d972e4f75bced1fbdd9e9e90faa26445c1052b27de97ee546944a"}, + {file = "duckdb-1.4.3-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:274d4a31aba63115f23e7e7b401e3e3a937f3626dc9dea820a9c7d3073f450d2"}, + {file = "duckdb-1.4.3-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f868a7e6d9b37274a1aa34849ea92aa964e9bd59a5237d6c17e8540533a1e4f"}, + {file = "duckdb-1.4.3-cp312-cp312-win_amd64.whl", hash = "sha256:ef7ef15347ce97201b1b5182a5697682679b04c3374d5a01ac10ba31cf791b95"}, + {file = "duckdb-1.4.3-cp312-cp312-win_arm64.whl", hash 
= "sha256:1b9b445970fd18274d5ac07a0b24c032e228f967332fb5ebab3d7db27738c0e4"}, + {file = "duckdb-1.4.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:16952ac05bd7e7b39946695452bf450db1ebbe387e1e7178e10f593f2ea7b9a8"}, + {file = "duckdb-1.4.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:de984cd24a6cbefdd6d4a349f7b9a46e583ca3e58ce10d8def0b20a6e5fcbe78"}, + {file = "duckdb-1.4.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1e5457dda91b67258aae30fb1a0df84183a9f6cd27abac1d5536c0d876c6dfa1"}, + {file = "duckdb-1.4.3-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:006aca6a6d6736c441b02ff5c7600b099bb8b7f4de094b8b062137efddce42df"}, + {file = "duckdb-1.4.3-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a2813f4635f4d6681cc3304020374c46aca82758c6740d7edbc237fe3aae2744"}, + {file = "duckdb-1.4.3-cp313-cp313-win_amd64.whl", hash = "sha256:6db124f53a3edcb32b0a896ad3519e37477f7e67bf4811cb41ab60c1ef74e4c8"}, + {file = "duckdb-1.4.3-cp313-cp313-win_arm64.whl", hash = "sha256:a8b0a8764e1b5dd043d168c8f749314f7a1252b5a260fa415adaa26fa3b958fd"}, + {file = "duckdb-1.4.3-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:316711a9e852bcfe1ed6241a5f654983f67e909e290495f3562cccdf43be8180"}, + {file = "duckdb-1.4.3-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9e625b2b4d52bafa1fd0ebdb0990c3961dac8bb00e30d327185de95b68202131"}, + {file = "duckdb-1.4.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:130c6760f6c573f9c9fe9aba56adba0fab48811a4871b7b8fd667318b4a3e8da"}, + {file = "duckdb-1.4.3-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:20c88effaa557a11267706b01419c542fe42f893dee66e5a6daa5974ea2d4a46"}, + {file = "duckdb-1.4.3-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1b35491db98ccd11d151165497c084a9d29d3dc42fc80abea2715a6c861ca43d"}, + {file = "duckdb-1.4.3-cp314-cp314-win_amd64.whl", hash = "sha256:23b12854032c1a58d0452e2b212afa908d4ce64171862f3792ba9a596ba7c765"}, + {file = "duckdb-1.4.3-cp314-cp314-win_arm64.whl", hash = "sha256:90f241f25cffe7241bf9f376754a5845c74775e00e1c5731119dc88cd71e0cb2"}, + {file = "duckdb-1.4.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:aa26a7406205bc1426cee28bdfdf084f669a5686977dafa4c3ec65873989593c"}, + {file = "duckdb-1.4.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:caa2164c91f7e91befb1ffb081b3cd97a137117533aef7abe1538b03ad72e3a9"}, + {file = "duckdb-1.4.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8d53b217698a76c4957e2c807dd9295d409146f9d3d7932f372883201ba9d25a"}, + {file = "duckdb-1.4.3-cp39-cp39-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8afba22c370f06b7314aa46bfed052509269e482bcfb3f7b1ea0fa17ae49ce42"}, + {file = "duckdb-1.4.3-cp39-cp39-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2b195270ff1a661f22cbd547a215baff265b7d4469a76a215c8992b5994107c3"}, + {file = "duckdb-1.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:23a3a077821bed1768a84ac9cbf6b6487ead33e28e62cb118bda5fb8f9e53dea"}, + {file = "duckdb-1.4.3.tar.gz", hash = "sha256:fea43e03604c713e25a25211ada87d30cd2a044d8f27afab5deba26ac49e5268"}, +] + +[package.extras] +all = ["adbc-driver-manager", "fsspec", "ipython", "numpy", "pandas", "pyarrow"] + [[package]] name = "executing" -version = "2.2.0" +version = "2.2.1" description = "Get the currently executing AST node of a frame, and other information" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ - {file = 
"executing-2.2.0-py2.py3-none-any.whl", hash = "sha256:11387150cad388d62750327a53d3339fad4888b39a6fe233c3afbb54ecffd3aa"}, - {file = "executing-2.2.0.tar.gz", hash = "sha256:5d108c028108fe2551d1a7b2e8b713341e2cb4fc0aa7dcf966fa4327a5226755"}, + {file = "executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017"}, + {file = "executing-2.2.1.tar.gz", hash = "sha256:3632cc370565f6648cc328b32435bd120a1e4ebb20c77e3fdde9a13cd1e533c4"}, ] [package.extras] -tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"] +tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich ; python_version >= \"3.11\""] [[package]] name = "fastapi" -version = "0.115.12" +version = "0.115.14" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "fastapi-0.115.12-py3-none-any.whl", hash = "sha256:e94613d6c05e27be7ffebdd6ea5f388112e5e430c8f7d6494a9d1d88d43e814d"}, - {file = "fastapi-0.115.12.tar.gz", hash = "sha256:1e2c2a2646905f9e83d32f04a3f86aff4a286669c6c950ca95b5fd68c2602681"}, + {file = "fastapi-0.115.14-py3-none-any.whl", hash = "sha256:6c0c8bf9420bd58f565e585036d971872472b4f7d3f6c73b698e10cffdefb3ca"}, + {file = "fastapi-0.115.14.tar.gz", hash = "sha256:b1de15cdc1c499a4da47914db35d0e4ef8f1ce62b624e94e0e5824421df99739"}, ] [package.dependencies] @@ -1107,43 +1392,77 @@ standard = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.5)", "htt [[package]] name = "fastjsonschema" -version = "2.21.1" +version = "2.21.2" description = "Fastest Python implementation of JSON schema" optional = false python-versions = "*" +groups = ["dev"] files = [ - {file = "fastjsonschema-2.21.1-py3-none-any.whl", hash = "sha256:c9e5b7e908310918cf494a434eeb31384dd84a98b57a30bcb1f535015b554667"}, - {file = "fastjsonschema-2.21.1.tar.gz", hash = "sha256:794d4f0a58f848961ba16af7b9c85a3e88cd360df008c59aac6fc5ae9323b5d4"}, + {file = "fastjsonschema-2.21.2-py3-none-any.whl", hash = "sha256:1c797122d0a86c5cace2e54bf4e819c36223b552017172f32c5c024a6b77e463"}, + {file = "fastjsonschema-2.21.2.tar.gz", hash = "sha256:b1eb43748041c880796cd077f1a07c3d94e93ae84bba5ed36800a33554ae05de"}, ] [package.extras] devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"] [[package]] -name = "filelock" -version = "3.18.0" -description = "A platform independent file lock." 
+name = "featuretools" +version = "1.31.0" +description = "a framework for automated feature engineering" optional = false -python-versions = ">=3.9" +python-versions = "<4,>=3.9" +groups = ["main"] files = [ - {file = "filelock-3.18.0-py3-none-any.whl", hash = "sha256:c401f4f8377c4464e6db25fff06205fd89bdd83b65eb0488ed1b160f780e21de"}, - {file = "filelock-3.18.0.tar.gz", hash = "sha256:adbc88eabb99d2fec8c9c1b229b171f18afa655400173ddc653d5d01501fb9f2"}, + {file = "featuretools-1.31.0-py3-none-any.whl", hash = "sha256:87c94e9ae959c89acd83da96bd2583f3ef0f6daaa9639cbb6e46dbde2c742a18"}, + {file = "featuretools-1.31.0.tar.gz", hash = "sha256:01bfb17fcc1715b4c3623c7bc94a8982122c4a0fa03350ed478601bb81f90155"}, ] +[package.dependencies] +cloudpickle = ">=1.5.0" +holidays = ">=0.17" +numpy = ">=1.25.0" +packaging = ">=20.0" +pandas = ">=2.0.0" +psutil = ">=5.7.0" +scipy = ">=1.10.0" +tqdm = ">=4.66.3" +woodwork = ">=0.28.0" + [package.extras] -docs = ["furo (>=2024.8.6)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"] -testing = ["covdefaults (>=2.3)", "coverage (>=7.6.10)", "diff-cover (>=9.2.1)", "pytest (>=8.3.4)", "pytest-asyncio (>=0.25.2)", "pytest-cov (>=6)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.28.1)"] -typing = ["typing-extensions (>=4.12.2)"] +autonormalize = ["autonormalize (>=2.0.1)"] +complete = ["featuretools[dask,nlp,premium]"] +dask = ["dask[dataframe] (>=2023.2.0)", "distributed (>=2023.2.0)"] +dev = ["black[jupyter] (>=23.1.0)", "featuretools[dask,docs,test]", "pre-commit (>=2.20.0)", "ruff (>=0.1.6)"] +docs = ["Sphinx (==5.1.1)", "autonormalize (>=2.0.1)", "click (>=7.0.0)", "featuretools[dask,test]", "ipython (==8.4.0)", "jupyter (==1.0.0)", "jupyter-client (>=8.0.2)", "matplotlib (==3.7.2)", "myst-parser (==0.18.0)", "nbconvert (==6.5.0)", "nbsphinx (==0.8.9)", "pydata-sphinx-theme (==0.9.0)", "sphinx-copybutton (==0.5.0)", "sphinx-inline-tabs (==2022.1.2b11)"] +nlp = ["nlp-primitives (>=2.12.0)"] +premium = ["premium-primitives (>=0.0.3)"] +sklearn = ["featuretools-sklearn-transformer (>=1.0.0)"] +sql = ["featuretools-sql (>=0.0.1)", "psycopg2-binary (>=2.9.3)"] +test = ["boto3 (>=1.34.32)", "composeml (>=0.8.0)", "graphviz (>=0.8.4)", "moto[all] (>=5.0.0)", "pip (>=23.3.0)", "pyarrow (>=14.0.1)", "pympler (>=0.8)", "pytest (>=7.1.2)", "pytest-cov (>=3.0.0)", "pytest-timeout (>=2.1.0)", "pytest-xdist (>=2.5.0)", "smart-open (>=5.0.0)", "urllib3 (>=1.26.18)"] +tsfresh = ["featuretools-tsfresh-primitives (>=1.0.0)"] + +[[package]] +name = "filelock" +version = "3.20.0" +description = "A platform independent file lock." +optional = false +python-versions = ">=3.10" +groups = ["main", "dev"] +files = [ + {file = "filelock-3.20.0-py3-none-any.whl", hash = "sha256:339b4732ffda5cd79b13f4e2711a31b0365ce445d95d243bb996273d072546a2"}, + {file = "filelock-3.20.0.tar.gz", hash = "sha256:711e943b4ec6be42e1d4e6690b48dc175c822967466bb31c0c293f34334c13f4"}, +] [[package]] name = "flask" -version = "3.1.1" +version = "3.1.2" description = "A simple framework for building complex web applications." 
optional = false python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "flask-3.1.1-py3-none-any.whl", hash = "sha256:07aae2bb5eaf77993ef57e357491839f5fd9f4dc281593a81a9e4d79a24f295c"}, - {file = "flask-3.1.1.tar.gz", hash = "sha256:284c7b8f2f58cb737f0cf1c30fd7eaf0ccfcde196099d24ecede3fc2005aa59e"}, + {file = "flask-3.1.2-py3-none-any.whl", hash = "sha256:ca1d8112ec8a6158cc29ea4858963350011b5c846a414cdb7a954aa9e967d03c"}, + {file = "flask-3.1.2.tar.gz", hash = "sha256:bf656c15c80190ed628ad08cdfd3aaa35beb087855e2f494910aa3774cc4fd87"}, ] [package.dependencies] @@ -1158,70 +1477,102 @@ werkzeug = ">=3.1.0" async = ["asgiref (>=3.2)"] dotenv = ["python-dotenv"] +[[package]] +name = "flask-cors" +version = "6.0.1" +description = "A Flask extension simplifying CORS support" +optional = false +python-versions = "<4.0,>=3.9" +groups = ["main"] +files = [ + {file = "flask_cors-6.0.1-py3-none-any.whl", hash = "sha256:c7b2cbfb1a31aa0d2e5341eea03a6805349f7a61647daee1a15c46bbe981494c"}, + {file = "flask_cors-6.0.1.tar.gz", hash = "sha256:d81bcb31f07b0985be7f48406247e9243aced229b7747219160a0559edd678db"}, +] + +[package.dependencies] +flask = ">=0.9" +Werkzeug = ">=0.7" + [[package]] name = "fonttools" -version = "4.58.2" +version = "4.60.1" description = "Tools to manipulate font files" optional = false python-versions = ">=3.9" -files = [ - {file = "fonttools-4.58.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4baaf34f07013ba9c2c3d7a95d0c391fcbb30748cb86c36c094fab8f168e49bb"}, - {file = "fonttools-4.58.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2e26e4a4920d57f04bb2c3b6e9a68b099c7ef2d70881d4fee527896fa4f7b5aa"}, - {file = "fonttools-4.58.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c0bb956d9d01ea51368415515f664f58abf96557ba3c1aae4e26948ae7c86f29"}, - {file = "fonttools-4.58.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d40af8493c80ec17a1133ef429d42f1a97258dd9213b917daae9d8cafa6e0e6c"}, - {file = "fonttools-4.58.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:60b5cde1c76f6ded198da5608dddb1ee197faad7d2f0f6d3348ca0cda0c756c4"}, - {file = "fonttools-4.58.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f8df6dc80ecc9033ca25a944ee5db7564fecca28e96383043fd92d9df861a159"}, - {file = "fonttools-4.58.2-cp310-cp310-win32.whl", hash = "sha256:25728e980f5fbb67f52c5311b90fae4aaec08c3d3b78dce78ab564784df1129c"}, - {file = "fonttools-4.58.2-cp310-cp310-win_amd64.whl", hash = "sha256:d6997ee7c2909a904802faf44b0d0208797c4d751f7611836011ace165308165"}, - {file = "fonttools-4.58.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:024faaf20811296fd2f83ebdac7682276362e726ed5fea4062480dd36aff2fd9"}, - {file = "fonttools-4.58.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2faec6e7f2abd80cd9f2392dfa28c02cfd5b1125be966ea6eddd6ca684deaa40"}, - {file = "fonttools-4.58.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:520792629a938c14dd7fe185794b156cfc159c609d07b31bbb5f51af8dc7918a"}, - {file = "fonttools-4.58.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:12fbc6e0bf0c75ce475ef170f2c065be6abc9e06ad19a13b56b02ec2acf02427"}, - {file = "fonttools-4.58.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:44a39cf856d52109127d55576c7ec010206a8ba510161a7705021f70d1649831"}, - {file = "fonttools-4.58.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5390a67c55a835ad5a420da15b3d88b75412cbbd74450cb78c4916b0bd7f0a34"}, - {file = 
"fonttools-4.58.2-cp311-cp311-win32.whl", hash = "sha256:f7e10f4e7160bcf6a240d7560e9e299e8cb585baed96f6a616cef51180bf56cb"}, - {file = "fonttools-4.58.2-cp311-cp311-win_amd64.whl", hash = "sha256:29bdf52bfafdae362570d3f0d3119a3b10982e1ef8cb3a9d3ebb72da81cb8d5e"}, - {file = "fonttools-4.58.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:c6eeaed9c54c1d33c1db928eb92b4e180c7cb93b50b1ee3e79b2395cb01f25e9"}, - {file = "fonttools-4.58.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bbe1d9c72b7f981bed5c2a61443d5e3127c1b3aca28ca76386d1ad93268a803f"}, - {file = "fonttools-4.58.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85babe5b3ce2cbe57fc0d09c0ee92bbd4d594fd7ea46a65eb43510a74a4ce773"}, - {file = "fonttools-4.58.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:918a2854537fcdc662938057ad58b633bc9e0698f04a2f4894258213283a7932"}, - {file = "fonttools-4.58.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3b379cf05bf776c336a0205632596b1c7d7ab5f7135e3935f2ca2a0596d2d092"}, - {file = "fonttools-4.58.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:99ab3547a15a5d168c265e139e21756bbae1de04782ac9445c9ef61b8c0a32ce"}, - {file = "fonttools-4.58.2-cp312-cp312-win32.whl", hash = "sha256:6764e7a3188ce36eea37b477cdeca602ae62e63ae9fc768ebc176518072deb04"}, - {file = "fonttools-4.58.2-cp312-cp312-win_amd64.whl", hash = "sha256:41f02182a1d41b79bae93c1551855146868b04ec3e7f9c57d6fef41a124e6b29"}, - {file = "fonttools-4.58.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:829048ef29dbefec35d95cc6811014720371c95bdc6ceb0afd2f8e407c41697c"}, - {file = "fonttools-4.58.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:64998c5993431e45b474ed5f579f18555f45309dd1cf8008b594d2fe0a94be59"}, - {file = "fonttools-4.58.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b887a1cf9fbcb920980460ee4a489c8aba7e81341f6cdaeefa08c0ab6529591c"}, - {file = "fonttools-4.58.2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27d74b9f6970cefbcda33609a3bee1618e5e57176c8b972134c4e22461b9c791"}, - {file = "fonttools-4.58.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ec26784610056a770e15a60f9920cee26ae10d44d1e43271ea652dadf4e7a236"}, - {file = "fonttools-4.58.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ed0a71d57dd427c0fb89febd08cac9b925284d2a8888e982a6c04714b82698d7"}, - {file = "fonttools-4.58.2-cp313-cp313-win32.whl", hash = "sha256:994e362b01460aa863ef0cb41a29880bc1a498c546952df465deff7abf75587a"}, - {file = "fonttools-4.58.2-cp313-cp313-win_amd64.whl", hash = "sha256:f95dec862d7c395f2d4efe0535d9bdaf1e3811e51b86432fa2a77e73f8195756"}, - {file = "fonttools-4.58.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e6f6ca4337e37d287535fd0089b4520cedc5666023fe4176a74e3415f917b570"}, - {file = "fonttools-4.58.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b269c7a783ec3be40809dc0dc536230a3d2d2c08e3fb9538d4e0213872b1a762"}, - {file = "fonttools-4.58.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1902d9b2b84cc9485663f1a72882890cd240f4464e8443af93faa34b095a4444"}, - {file = "fonttools-4.58.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a94a00ffacbb044729c6a5b29e02bf6f0e80681e9275cd374a1d25db3061328"}, - {file = "fonttools-4.58.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = 
"sha256:25d22628f8b6b49b78666415f7cfa60c88138c24d66f3e5818d09ca001810cc5"}, - {file = "fonttools-4.58.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:4bacb925a045e964a44bdeb9790b8778ce659605c7a2a39ef4f12e06c323406b"}, - {file = "fonttools-4.58.2-cp39-cp39-win32.whl", hash = "sha256:eb4bc19a3ab45d2b4bb8f4f7c60e55bec53016e402af0b6ff4ef0c0129193671"}, - {file = "fonttools-4.58.2-cp39-cp39-win_amd64.whl", hash = "sha256:c8d16973f8ab02a5a960afe1cae4db72220ef628bf397499aba8e3caa0c10e33"}, - {file = "fonttools-4.58.2-py3-none-any.whl", hash = "sha256:84f4b0bcfa046254a65ee7117094b4907e22dc98097a220ef108030eb3c15596"}, - {file = "fonttools-4.58.2.tar.gz", hash = "sha256:4b491ddbfd50b856e84b0648b5f7941af918f6d32f938f18e62b58426a8d50e2"}, -] - -[package.extras] -all = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "fs (>=2.2.0,<3)", "lxml (>=4.0)", "lz4 (>=1.7.4.2)", "matplotlib", "munkres", "pycairo", "scipy", "skia-pathops (>=0.5.0)", "sympy", "uharfbuzz (>=0.23.0)", "unicodedata2 (>=15.1.0)", "xattr", "zopfli (>=0.1.4)"] +groups = ["main"] +files = [ + {file = "fonttools-4.60.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:9a52f254ce051e196b8fe2af4634c2d2f02c981756c6464dc192f1b6050b4e28"}, + {file = "fonttools-4.60.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c7420a2696a44650120cdd269a5d2e56a477e2bfa9d95e86229059beb1c19e15"}, + {file = "fonttools-4.60.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee0c0b3b35b34f782afc673d503167157094a16f442ace7c6c5e0ca80b08f50c"}, + {file = "fonttools-4.60.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:282dafa55f9659e8999110bd8ed422ebe1c8aecd0dc396550b038e6c9a08b8ea"}, + {file = "fonttools-4.60.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4ba4bd646e86de16160f0fb72e31c3b9b7d0721c3e5b26b9fa2fc931dfdb2652"}, + {file = "fonttools-4.60.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0b0835ed15dd5b40d726bb61c846a688f5b4ce2208ec68779bc81860adb5851a"}, + {file = "fonttools-4.60.1-cp310-cp310-win32.whl", hash = "sha256:1525796c3ffe27bb6268ed2a1bb0dcf214d561dfaf04728abf01489eb5339dce"}, + {file = "fonttools-4.60.1-cp310-cp310-win_amd64.whl", hash = "sha256:268ecda8ca6cb5c4f044b1fb9b3b376e8cd1b361cef275082429dc4174907038"}, + {file = "fonttools-4.60.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7b4c32e232a71f63a5d00259ca3d88345ce2a43295bb049d21061f338124246f"}, + {file = "fonttools-4.60.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3630e86c484263eaac71d117085d509cbcf7b18f677906824e4bace598fb70d2"}, + {file = "fonttools-4.60.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5c1015318e4fec75dd4943ad5f6a206d9727adf97410d58b7e32ab644a807914"}, + {file = "fonttools-4.60.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e6c58beb17380f7c2ea181ea11e7db8c0ceb474c9dd45f48e71e2cb577d146a1"}, + {file = "fonttools-4.60.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ec3681a0cb34c255d76dd9d865a55f260164adb9fa02628415cdc2d43ee2c05d"}, + {file = "fonttools-4.60.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f4b5c37a5f40e4d733d3bbaaef082149bee5a5ea3156a785ff64d949bd1353fa"}, + {file = "fonttools-4.60.1-cp311-cp311-win32.whl", hash = "sha256:398447f3d8c0c786cbf1209711e79080a40761eb44b27cdafffb48f52bcec258"}, + {file = "fonttools-4.60.1-cp311-cp311-win_amd64.whl", hash = "sha256:d066ea419f719ed87bc2c99a4a4bfd77c2e5949cb724588b9dd58f3fd90b92bf"}, + 
{file = "fonttools-4.60.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:7b0c6d57ab00dae9529f3faf187f2254ea0aa1e04215cf2f1a8ec277c96661bc"}, + {file = "fonttools-4.60.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:839565cbf14645952d933853e8ade66a463684ed6ed6c9345d0faf1f0e868877"}, + {file = "fonttools-4.60.1-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:8177ec9676ea6e1793c8a084a90b65a9f778771998eb919d05db6d4b1c0b114c"}, + {file = "fonttools-4.60.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:996a4d1834524adbb423385d5a629b868ef9d774670856c63c9a0408a3063401"}, + {file = "fonttools-4.60.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a46b2f450bc79e06ef3b6394f0c68660529ed51692606ad7f953fc2e448bc903"}, + {file = "fonttools-4.60.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6ec722ee589e89a89f5b7574f5c45604030aa6ae24cb2c751e2707193b466fed"}, + {file = "fonttools-4.60.1-cp312-cp312-win32.whl", hash = "sha256:b2cf105cee600d2de04ca3cfa1f74f1127f8455b71dbad02b9da6ec266e116d6"}, + {file = "fonttools-4.60.1-cp312-cp312-win_amd64.whl", hash = "sha256:992775c9fbe2cf794786fa0ffca7f09f564ba3499b8fe9f2f80bd7197db60383"}, + {file = "fonttools-4.60.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6f68576bb4bbf6060c7ab047b1574a1ebe5c50a17de62830079967b211059ebb"}, + {file = "fonttools-4.60.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:eedacb5c5d22b7097482fa834bda0dafa3d914a4e829ec83cdea2a01f8c813c4"}, + {file = "fonttools-4.60.1-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b33a7884fabd72bdf5f910d0cf46be50dce86a0362a65cfc746a4168c67eb96c"}, + {file = "fonttools-4.60.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2409d5fb7b55fd70f715e6d34e7a6e4f7511b8ad29a49d6df225ee76da76dd77"}, + {file = "fonttools-4.60.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c8651e0d4b3bdeda6602b85fdc2abbefc1b41e573ecb37b6779c4ca50753a199"}, + {file = "fonttools-4.60.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:145daa14bf24824b677b9357c5e44fd8895c2a8f53596e1b9ea3496081dc692c"}, + {file = "fonttools-4.60.1-cp313-cp313-win32.whl", hash = "sha256:2299df884c11162617a66b7c316957d74a18e3758c0274762d2cc87df7bc0272"}, + {file = "fonttools-4.60.1-cp313-cp313-win_amd64.whl", hash = "sha256:a3db56f153bd4c5c2b619ab02c5db5192e222150ce5a1bc10f16164714bc39ac"}, + {file = "fonttools-4.60.1-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:a884aef09d45ba1206712c7dbda5829562d3fea7726935d3289d343232ecb0d3"}, + {file = "fonttools-4.60.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8a44788d9d91df72d1a5eac49b31aeb887a5f4aab761b4cffc4196c74907ea85"}, + {file = "fonttools-4.60.1-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:e852d9dda9f93ad3651ae1e3bb770eac544ec93c3807888798eccddf84596537"}, + {file = "fonttools-4.60.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:154cb6ee417e417bf5f7c42fe25858c9140c26f647c7347c06f0cc2d47eff003"}, + {file = "fonttools-4.60.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:5664fd1a9ea7f244487ac8f10340c4e37664675e8667d6fee420766e0fb3cf08"}, + {file = "fonttools-4.60.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = 
"sha256:583b7f8e3c49486e4d489ad1deacfb8d5be54a8ef34d6df824f6a171f8511d99"}, + {file = "fonttools-4.60.1-cp314-cp314-win32.whl", hash = "sha256:66929e2ea2810c6533a5184f938502cfdaea4bc3efb7130d8cc02e1c1b4108d6"}, + {file = "fonttools-4.60.1-cp314-cp314-win_amd64.whl", hash = "sha256:f3d5be054c461d6a2268831f04091dc82753176f6ea06dc6047a5e168265a987"}, + {file = "fonttools-4.60.1-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:b6379e7546ba4ae4b18f8ae2b9bc5960936007a1c0e30b342f662577e8bc3299"}, + {file = "fonttools-4.60.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:9d0ced62b59e0430b3690dbc5373df1c2aa7585e9a8ce38eff87f0fd993c5b01"}, + {file = "fonttools-4.60.1-cp314-cp314t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:875cb7764708b3132637f6c5fb385b16eeba0f7ac9fa45a69d35e09b47045801"}, + {file = "fonttools-4.60.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a184b2ea57b13680ab6d5fbde99ccef152c95c06746cb7718c583abd8f945ccc"}, + {file = "fonttools-4.60.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:026290e4ec76583881763fac284aca67365e0be9f13a7fb137257096114cb3bc"}, + {file = "fonttools-4.60.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:f0e8817c7d1a0c2eedebf57ef9a9896f3ea23324769a9a2061a80fe8852705ed"}, + {file = "fonttools-4.60.1-cp314-cp314t-win32.whl", hash = "sha256:1410155d0e764a4615774e5c2c6fc516259fe3eca5882f034eb9bfdbee056259"}, + {file = "fonttools-4.60.1-cp314-cp314t-win_amd64.whl", hash = "sha256:022beaea4b73a70295b688f817ddc24ed3e3418b5036ffcd5658141184ef0d0c"}, + {file = "fonttools-4.60.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:122e1a8ada290423c493491d002f622b1992b1ab0b488c68e31c413390dc7eb2"}, + {file = "fonttools-4.60.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a140761c4ff63d0cb9256ac752f230460ee225ccef4ad8f68affc723c88e2036"}, + {file = "fonttools-4.60.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0eae96373e4b7c9e45d099d7a523444e3554360927225c1cdae221a58a45b856"}, + {file = "fonttools-4.60.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:596ecaca36367027d525b3b426d8a8208169d09edcf8c7506aceb3a38bfb55c7"}, + {file = "fonttools-4.60.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2ee06fc57512144d8b0445194c2da9f190f61ad51e230f14836286470c99f854"}, + {file = "fonttools-4.60.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b42d86938e8dda1cd9a1a87a6d82f1818eaf933348429653559a458d027446da"}, + {file = "fonttools-4.60.1-cp39-cp39-win32.whl", hash = "sha256:8b4eb332f9501cb1cd3d4d099374a1e1306783ff95489a1026bde9eb02ccc34a"}, + {file = "fonttools-4.60.1-cp39-cp39-win_amd64.whl", hash = "sha256:7473a8ed9ed09aeaa191301244a5a9dbe46fe0bf54f9d6cd21d83044c3321217"}, + {file = "fonttools-4.60.1-py3-none-any.whl", hash = "sha256:906306ac7afe2156fcf0042173d6ebbb05416af70f6b370967b47f8f00103bbb"}, + {file = "fonttools-4.60.1.tar.gz", hash = "sha256:ef00af0439ebfee806b25f24c8f92109157ff3fac5731dc7867957812e87b8d9"}, +] + +[package.extras] +all = ["brotli (>=1.0.1) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\"", "lxml (>=4.0)", "lz4 (>=1.7.4.2)", "matplotlib", "munkres ; platform_python_implementation == \"PyPy\"", "pycairo", "scipy ; platform_python_implementation != \"PyPy\"", "skia-pathops (>=0.5.0)", "sympy", "uharfbuzz (>=0.23.0)", "unicodedata2 (>=15.1.0) ; 
python_version <= \"3.12\"", "xattr ; sys_platform == \"darwin\"", "zopfli (>=0.1.4)"] graphite = ["lz4 (>=1.7.4.2)"] -interpolatable = ["munkres", "pycairo", "scipy"] +interpolatable = ["munkres ; platform_python_implementation == \"PyPy\"", "pycairo", "scipy ; platform_python_implementation != \"PyPy\""] lxml = ["lxml (>=4.0)"] pathops = ["skia-pathops (>=0.5.0)"] plot = ["matplotlib"] repacker = ["uharfbuzz (>=0.23.0)"] symfont = ["sympy"] -type1 = ["xattr"] -ufo = ["fs (>=2.2.0,<3)"] -unicode = ["unicodedata2 (>=15.1.0)"] -woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"] +type1 = ["xattr ; sys_platform == \"darwin\""] +unicode = ["unicodedata2 (>=15.1.0) ; python_version <= \"3.12\""] +woff = ["brotli (>=1.0.1) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\"", "zopfli (>=0.1.4)"] [[package]] name = "fqdn" @@ -1229,6 +1580,7 @@ version = "1.5.1" description = "Validates fully-qualified domain names against RFC 1123, so that they are acceptable to modern bowsers" optional = false python-versions = ">=2.7, !=3.0, !=3.1, !=3.2, !=3.3, !=3.4, <4" +groups = ["dev"] files = [ {file = "fqdn-1.5.1-py3-none-any.whl", hash = "sha256:3a179af3761e4df6eb2e026ff9e1a3033d3587bf980a0b1b2e1e5d08d7358014"}, {file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"}, @@ -1236,126 +1588,154 @@ files = [ [[package]] name = "frozenlist" -version = "1.6.2" +version = "1.8.0" description = "A list-like structure which implements collections.abc.MutableSequence" optional = false python-versions = ">=3.9" -files = [ - {file = "frozenlist-1.6.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:92836b9903e52f787f4f4bfc6cf3b03cf19de4cbc09f5969e58806f876d8647f"}, - {file = "frozenlist-1.6.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a3af419982432a13a997451e611ff7681a4fbf81dca04f70b08fc51106335ff0"}, - {file = "frozenlist-1.6.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1570ba58f0852a6e6158d4ad92de13b9aba3474677c3dee827ba18dcf439b1d8"}, - {file = "frozenlist-1.6.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0de575df0135949c4049ae42db714c43d1693c590732abc78c47a04228fc1efb"}, - {file = "frozenlist-1.6.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:2b6eaba27ec2b3c0af7845619a425eeae8d510d5cc83fb3ef80569129238153b"}, - {file = "frozenlist-1.6.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:af1ee5188d2f63b4f09b67cf0c60b8cdacbd1e8d24669eac238e247d8b157581"}, - {file = "frozenlist-1.6.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9179c5186eb996c0dd7e4c828858ade4d7a8d1d12dd67320675a6ae7401f2647"}, - {file = "frozenlist-1.6.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:38814ebc3c6bb01dc3bb4d6cffd0e64c19f4f2d03e649978aeae8e12b81bdf43"}, - {file = "frozenlist-1.6.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0dbcab0531318fc9ca58517865fae63a2fe786d5e2d8f3a56058c29831e49f13"}, - {file = "frozenlist-1.6.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:7472e477dc5d6a000945f45b6e38cbb1093fdec189dc1e98e57f8ab53f8aa246"}, - {file = "frozenlist-1.6.2-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:17c230586d47332774332af86cc1e69ee095731ec70c27e5698dfebb9db167a0"}, - {file = 
"frozenlist-1.6.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:946a41e095592cf1c88a1fcdd154c13d0ef6317b371b817dc2b19b3d93ca0811"}, - {file = "frozenlist-1.6.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:d90c9b36c669eb481de605d3c2da02ea98cba6a3f5e93b3fe5881303026b2f14"}, - {file = "frozenlist-1.6.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:8651dd2d762d6eefebe8450ec0696cf3706b0eb5e46463138931f70c667ba612"}, - {file = "frozenlist-1.6.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:48400e6a09e217346949c034105b0df516a1b3c5aa546913b70b71b646caa9f5"}, - {file = "frozenlist-1.6.2-cp310-cp310-win32.whl", hash = "sha256:56354f09082262217f837d91106f1cc204dd29ac895f9bbab33244e2fa948bd7"}, - {file = "frozenlist-1.6.2-cp310-cp310-win_amd64.whl", hash = "sha256:3016ff03a332cdd2800f0eed81ca40a2699b2f62f23626e8cf81a2993867978a"}, - {file = "frozenlist-1.6.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:eb66c5d48b89701b93d58c31a48eb64e15d6968315a9ccc7dfbb2d6dc2c62ab7"}, - {file = "frozenlist-1.6.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:8fb9aee4f7b495044b868d7e74fb110d8996e8fddc0bfe86409c7fc7bd5692f0"}, - {file = "frozenlist-1.6.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:48dde536fc4d8198fad4e211f977b1a5f070e6292801decf2d6bc77b805b0430"}, - {file = "frozenlist-1.6.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91dd2fb760f4a2c04b3330e0191787c3437283f9241f0b379017d4b13cea8f5e"}, - {file = "frozenlist-1.6.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:f01f34f8a5c7b4d74a1c65227678822e69801dcf68edd4c11417a7c83828ff6f"}, - {file = "frozenlist-1.6.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f43f872cc4cfc46d9805d0e71302e9c39c755d5ad7572198cd2ceb3a291176cc"}, - {file = "frozenlist-1.6.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3f96cc8ab3a73d42bcdb6d9d41c3dceffa8da8273ac54b71304b891e32de8b13"}, - {file = "frozenlist-1.6.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9c0b257123320832cce9bea9935c860e4fa625b0e58b10db49fdfef70087df81"}, - {file = "frozenlist-1.6.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23dc4def97ccc0232f491836050ae664d3d2352bb43ad4cd34cd3399ad8d1fc8"}, - {file = "frozenlist-1.6.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fcf3663463c040315f025bd6a5f88b3748082cfe111e90fd422f71668c65de52"}, - {file = "frozenlist-1.6.2-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:16b9e7b59ea6eef876a8a5fac084c95fd4bac687c790c4d48c0d53c6bcde54d1"}, - {file = "frozenlist-1.6.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:308b40d32a98a8d0d09bc28e4cbc13a0b803a0351041d4548564f28f6b148b05"}, - {file = "frozenlist-1.6.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:baf585d8968eaad6c1aae99456c40978a9fa822ccbdb36fd4746b581ef338192"}, - {file = "frozenlist-1.6.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:4dfdbdb671a6af6ea1a363b210373c8233df3925d9a7fb99beaa3824f6b99656"}, - {file = "frozenlist-1.6.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:94916e3acaeb8374d5aea9c37db777c9f0a2b9be46561f5de30064cbbbfae54a"}, - {file = "frozenlist-1.6.2-cp311-cp311-win32.whl", hash = "sha256:0453e3d2d12616949cb2581068942a0808c7255f2abab0676d2da7db30f9ea11"}, - {file = "frozenlist-1.6.2-cp311-cp311-win_amd64.whl", hash = 
"sha256:fb512753c4bbf0af03f6b9c7cc5ecc9bbac2e198a94f61aaabd26c3cf3229c8c"}, - {file = "frozenlist-1.6.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:48544d07404d7fcfccb6cc091922ae10de4d9e512c537c710c063ae8f5662b85"}, - {file = "frozenlist-1.6.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6ee0cf89e7638de515c0bb2e8be30e8e2e48f3be9b6c2f7127bca4a1f35dff45"}, - {file = "frozenlist-1.6.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e084d838693d73c0fe87d212b91af80c18068c95c3d877e294f165056cedfa58"}, - {file = "frozenlist-1.6.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84d918b01781c6ebb5b776c18a87dd3016ff979eb78626aaca928bae69a640c3"}, - {file = "frozenlist-1.6.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e2892d9ab060a847f20fab83fdb886404d0f213f648bdeaebbe76a6134f0973d"}, - {file = "frozenlist-1.6.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bbd2225d7218e7d386f4953d11484b0e38e5d134e85c91f0a6b0f30fb6ae25c4"}, - {file = "frozenlist-1.6.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b679187cba0a99f1162c7ec1b525e34bdc5ca246857544d16c1ed234562df80"}, - {file = "frozenlist-1.6.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bceb7bd48849d4b76eac070a6d508aa3a529963f5d9b0a6840fd41fb381d5a09"}, - {file = "frozenlist-1.6.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88b1b79ae86fdacc4bf842a4e0456540947abba64a84e61b5ae24c87adb089db"}, - {file = "frozenlist-1.6.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6c5c3c575148aa7308a38709906842039d7056bf225da6284b7a11cf9275ac5d"}, - {file = "frozenlist-1.6.2-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:16263bd677a31fe1a5dc2b803b564e349c96f804a81706a62b8698dd14dbba50"}, - {file = "frozenlist-1.6.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:2e51b2054886ff7db71caf68285c2cd936eb7a145a509965165a2aae715c92a7"}, - {file = "frozenlist-1.6.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:ae1785b76f641cce4efd7e6f49ca4ae456aa230383af5ab0d4d3922a7e37e763"}, - {file = "frozenlist-1.6.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:30155cc481f73f92f47ab1e858a7998f7b1207f9b5cf3b3cba90ec65a7f224f5"}, - {file = "frozenlist-1.6.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e1a1d82f2eb3d2875a8d139ae3f5026f7797f9de5dce44f53811ab0a883e85e7"}, - {file = "frozenlist-1.6.2-cp312-cp312-win32.whl", hash = "sha256:84105cb0f3479dfa20b85f459fb2db3b0ee52e2f84e86d447ea8b0de1fb7acdd"}, - {file = "frozenlist-1.6.2-cp312-cp312-win_amd64.whl", hash = "sha256:eecc861bd30bc5ee3b04a1e6ebf74ed0451f596d91606843f3edbd2f273e2fe3"}, - {file = "frozenlist-1.6.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2ad8851ae1f6695d735f8646bf1e68675871789756f7f7e8dc8224a74eabb9d0"}, - {file = "frozenlist-1.6.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:cd2d5abc0ccd99a2a5b437987f3b1e9c265c1044d2855a09ac68f09bbb8082ca"}, - {file = "frozenlist-1.6.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:15c33f665faa9b8f8e525b987eeaae6641816e0f6873e8a9c4d224338cebbb55"}, - {file = "frozenlist-1.6.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d3e6c0681783723bb472b6b8304e61ecfcb4c2b11cf7f243d923813c21ae5d2a"}, - {file = 
"frozenlist-1.6.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:61bae4d345a26550d0ed9f2c9910ea060f89dbfc642b7b96e9510a95c3a33b3c"}, - {file = "frozenlist-1.6.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:90e5a84016d0d2fb828f770ede085b5d89155fcb9629b8a3237c960c41c120c3"}, - {file = "frozenlist-1.6.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55dc289a064c04819d669e6e8a85a1c0416e6c601782093bdc749ae14a2f39da"}, - {file = "frozenlist-1.6.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b79bcf97ca03c95b044532a4fef6e5ae106a2dd863875b75fde64c553e3f4820"}, - {file = "frozenlist-1.6.2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e5e7564d232a782baa3089b25a0d979e2e4d6572d3c7231fcceacc5c22bf0f7"}, - {file = "frozenlist-1.6.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6fcd8d56880dccdd376afb18f483ab55a0e24036adc9a83c914d4b7bb5729d4e"}, - {file = "frozenlist-1.6.2-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:4fbce985c7fe7bafb4d9bf647c835dbe415b465a897b0c79d1bdf0f3fae5fe50"}, - {file = "frozenlist-1.6.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:3bd12d727cd616387d50fe283abebb2db93300c98f8ff1084b68460acd551926"}, - {file = "frozenlist-1.6.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:38544cae535ed697960891131731b33bb865b7d197ad62dc380d2dbb1bceff48"}, - {file = "frozenlist-1.6.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:47396898f98fae5c9b9bb409c3d2cf6106e409730f35a0926aad09dd7acf1ef5"}, - {file = "frozenlist-1.6.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d10d835f8ce8571fd555db42d3aef325af903535dad7e6faa7b9c8abe191bffc"}, - {file = "frozenlist-1.6.2-cp313-cp313-win32.whl", hash = "sha256:a400fe775a41b6d7a3fef00d88f10cbae4f0074c9804e282013d7797671ba58d"}, - {file = "frozenlist-1.6.2-cp313-cp313-win_amd64.whl", hash = "sha256:cc8b25b321863ed46992558a29bb09b766c41e25f31461666d501be0f893bada"}, - {file = "frozenlist-1.6.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:56de277a0e0ad26a1dcdc99802b4f5becd7fd890807b68e3ecff8ced01d58132"}, - {file = "frozenlist-1.6.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:9cb386dd69ae91be586aa15cb6f39a19b5f79ffc1511371eca8ff162721c4867"}, - {file = "frozenlist-1.6.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:53835d8a6929c2f16e02616f8b727bd140ce8bf0aeddeafdb290a67c136ca8ad"}, - {file = "frozenlist-1.6.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc49f2277e8173abf028d744f8b7d69fe8cc26bffc2de97d47a3b529599fbf50"}, - {file = "frozenlist-1.6.2-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:65eb9e8a973161bdac5fa06ea6bd261057947adc4f47a7a6ef3d6db30c78c5b4"}, - {file = "frozenlist-1.6.2-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:301eb2f898d863031f8c5a56c88a6c5d976ba11a4a08a1438b96ee3acb5aea80"}, - {file = "frozenlist-1.6.2-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:207f717fd5e65fddb77d33361ab8fa939f6d89195f11307e073066886b33f2b8"}, - {file = "frozenlist-1.6.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f83992722642ee0db0333b1dbf205b1a38f97d51a7382eb304ba414d8c3d1e05"}, - {file = 
"frozenlist-1.6.2-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:12af99e6023851b36578e5bcc60618b5b30f4650340e29e565cd1936326dbea7"}, - {file = "frozenlist-1.6.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6f01620444a674eaad900a3263574418e99c49e2a5d6e5330753857363b5d59f"}, - {file = "frozenlist-1.6.2-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:82b94c8948341512306ca8ccc702771600b442c6abe5f8ee017e00e452a209e8"}, - {file = "frozenlist-1.6.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:324a4cf4c220ddb3db1f46ade01e48432c63fa8c26812c710006e7f6cfba4a08"}, - {file = "frozenlist-1.6.2-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:695284e51458dabb89af7f7dc95c470aa51fd259207aba5378b187909297feef"}, - {file = "frozenlist-1.6.2-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:9ccbeb1c8dda4f42d0678076aa5cbde941a232be71c67b9d8ca89fbaf395807c"}, - {file = "frozenlist-1.6.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:cbbdf62fcc1864912c592a1ec748fee94f294c6b23215d5e8e9569becb7723ee"}, - {file = "frozenlist-1.6.2-cp313-cp313t-win32.whl", hash = "sha256:76857098ee17258df1a61f934f2bae052b8542c9ea6b187684a737b2e3383a65"}, - {file = "frozenlist-1.6.2-cp313-cp313t-win_amd64.whl", hash = "sha256:c06a88daba7e891add42f9278cdf7506a49bc04df9b1648be54da1bf1c79b4c6"}, - {file = "frozenlist-1.6.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:99119fa5ae292ac1d3e73336ecbe3301dbb2a7f5b4e6a4594d3a6b2e240c31c1"}, - {file = "frozenlist-1.6.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:af923dbcfd382554e960328133c2a8151706673d1280f55552b1bb914d276267"}, - {file = "frozenlist-1.6.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69e85175df4cc35f2cef8cb60a8bad6c5fc50e91524cd7018d73dd2fcbc70f5d"}, - {file = "frozenlist-1.6.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:97dcdffe18c0e35ce57b3d7c1352893a3608e7578b814abb3b2a3cc15907e682"}, - {file = "frozenlist-1.6.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:cc228faf4533327e5f1d153217ab598648a2cd5f6b1036d82e63034f079a5861"}, - {file = "frozenlist-1.6.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0ee53aba5d0768e2c5c6185ec56a94bab782ef002429f293497ec5c5a3b94bdf"}, - {file = "frozenlist-1.6.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d3214738024afd53434614ee52aa74353a562414cd48b1771fa82fd982cb1edb"}, - {file = "frozenlist-1.6.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5628e6a6f74ef1693adbe25c0bce312eb9aee82e58abe370d287794aff632d0f"}, - {file = "frozenlist-1.6.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad7678d3e32cb3884879f10c679804c08f768df55078436fb56668f3e13e2a5e"}, - {file = "frozenlist-1.6.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:b776ab5217e2bf99c84b2cbccf4d30407789c0653f72d1653b5f8af60403d28f"}, - {file = "frozenlist-1.6.2-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:b1e162a99405cb62d338f747b8625d6bd7b6794383e193335668295fb89b75fb"}, - {file = "frozenlist-1.6.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:2de1ddeb9dd8a07383f6939996217f0f1b2ce07f6a01d74c9adb1db89999d006"}, - {file = "frozenlist-1.6.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:2dcabe4e7aac889d41316c1698df0eb2565ed233b66fab6bc4a5c5b7769cad4c"}, - {file = 
"frozenlist-1.6.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:06e28cd2ac31797e12ec8c65aa462a89116323f045e8b1930127aba9486aab24"}, - {file = "frozenlist-1.6.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:86f908b70043c3517f862247bdc621bd91420d40c3e90ede1701a75f025fcd5f"}, - {file = "frozenlist-1.6.2-cp39-cp39-win32.whl", hash = "sha256:2647a3d11f10014a5f9f2ca38c7fadd0dd28f5b1b5e9ce9c9d194aa5d0351c7e"}, - {file = "frozenlist-1.6.2-cp39-cp39-win_amd64.whl", hash = "sha256:e2cbef30ba27a1d9f3e3c6aa84a60f53d907d955969cd0103b004056e28bca08"}, - {file = "frozenlist-1.6.2-py3-none-any.whl", hash = "sha256:947abfcc8c42a329bbda6df97a4b9c9cdb4e12c85153b3b57b9d2f02aa5877dc"}, - {file = "frozenlist-1.6.2.tar.gz", hash = "sha256:effc641518696471cf4962e8e32050133bc1f7b2851ae8fd0cb8797dd70dc202"}, +groups = ["main"] +files = [ + {file = "frozenlist-1.8.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b37f6d31b3dcea7deb5e9696e529a6aa4a898adc33db82da12e4c60a7c4d2011"}, + {file = "frozenlist-1.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ef2b7b394f208233e471abc541cc6991f907ffd47dc72584acee3147899d6565"}, + {file = "frozenlist-1.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a88f062f072d1589b7b46e951698950e7da00442fc1cacbe17e19e025dc327ad"}, + {file = "frozenlist-1.8.0-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:f57fb59d9f385710aa7060e89410aeb5058b99e62f4d16b08b91986b9a2140c2"}, + {file = "frozenlist-1.8.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:799345ab092bee59f01a915620b5d014698547afd011e691a208637312db9186"}, + {file = "frozenlist-1.8.0-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c23c3ff005322a6e16f71bf8692fcf4d5a304aaafe1e262c98c6d4adc7be863e"}, + {file = "frozenlist-1.8.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8a76ea0f0b9dfa06f254ee06053d93a600865b3274358ca48a352ce4f0798450"}, + {file = "frozenlist-1.8.0-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c7366fe1418a6133d5aa824ee53d406550110984de7637d65a178010f759c6ef"}, + {file = "frozenlist-1.8.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:13d23a45c4cebade99340c4165bd90eeb4a56c6d8a9d8aa49568cac19a6d0dc4"}, + {file = "frozenlist-1.8.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:e4a3408834f65da56c83528fb52ce7911484f0d1eaf7b761fc66001db1646eff"}, + {file = "frozenlist-1.8.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:42145cd2748ca39f32801dad54aeea10039da6f86e303659db90db1c4b614c8c"}, + {file = "frozenlist-1.8.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:e2de870d16a7a53901e41b64ffdf26f2fbb8917b3e6ebf398098d72c5b20bd7f"}, + {file = "frozenlist-1.8.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:20e63c9493d33ee48536600d1a5c95eefc870cd71e7ab037763d1fbb89cc51e7"}, + {file = "frozenlist-1.8.0-cp310-cp310-win32.whl", hash = "sha256:adbeebaebae3526afc3c96fad434367cafbfd1b25d72369a9e5858453b1bb71a"}, + {file = "frozenlist-1.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:667c3777ca571e5dbeb76f331562ff98b957431df140b54c85fd4d52eea8d8f6"}, + {file = "frozenlist-1.8.0-cp310-cp310-win_arm64.whl", hash = "sha256:80f85f0a7cc86e7a54c46d99c9e1318ff01f4687c172ede30fd52d19d1da1c8e"}, + {file = "frozenlist-1.8.0-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:09474e9831bc2b2199fad6da3c14c7b0fbdd377cce9d3d77131be28906cb7d84"}, + {file = "frozenlist-1.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:17c883ab0ab67200b5f964d2b9ed6b00971917d5d8a92df149dc2c9779208ee9"}, + {file = "frozenlist-1.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fa47e444b8ba08fffd1c18e8cdb9a75db1b6a27f17507522834ad13ed5922b93"}, + {file = "frozenlist-1.8.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2552f44204b744fba866e573be4c1f9048d6a324dfe14475103fd51613eb1d1f"}, + {file = "frozenlist-1.8.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:957e7c38f250991e48a9a73e6423db1bb9dd14e722a10f6b8bb8e16a0f55f695"}, + {file = "frozenlist-1.8.0-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:8585e3bb2cdea02fc88ffa245069c36555557ad3609e83be0ec71f54fd4abb52"}, + {file = "frozenlist-1.8.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:edee74874ce20a373d62dc28b0b18b93f645633c2943fd90ee9d898550770581"}, + {file = "frozenlist-1.8.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c9a63152fe95756b85f31186bddf42e4c02c6321207fd6601a1c89ebac4fe567"}, + {file = "frozenlist-1.8.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b6db2185db9be0a04fecf2f241c70b63b1a242e2805be291855078f2b404dd6b"}, + {file = "frozenlist-1.8.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:f4be2e3d8bc8aabd566f8d5b8ba7ecc09249d74ba3c9ed52e54dc23a293f0b92"}, + {file = "frozenlist-1.8.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:c8d1634419f39ea6f5c427ea2f90ca85126b54b50837f31497f3bf38266e853d"}, + {file = "frozenlist-1.8.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:1a7fa382a4a223773ed64242dbe1c9c326ec09457e6b8428efb4118c685c3dfd"}, + {file = "frozenlist-1.8.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:11847b53d722050808926e785df837353bd4d75f1d494377e59b23594d834967"}, + {file = "frozenlist-1.8.0-cp311-cp311-win32.whl", hash = "sha256:27c6e8077956cf73eadd514be8fb04d77fc946a7fe9f7fe167648b0b9085cc25"}, + {file = "frozenlist-1.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:ac913f8403b36a2c8610bbfd25b8013488533e71e62b4b4adce9c86c8cea905b"}, + {file = "frozenlist-1.8.0-cp311-cp311-win_arm64.whl", hash = "sha256:d4d3214a0f8394edfa3e303136d0575eece0745ff2b47bd2cb2e66dd92d4351a"}, + {file = "frozenlist-1.8.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:78f7b9e5d6f2fdb88cdde9440dc147259b62b9d3b019924def9f6478be254ac1"}, + {file = "frozenlist-1.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:229bf37d2e4acdaf808fd3f06e854a4a7a3661e871b10dc1f8f1896a3b05f18b"}, + {file = "frozenlist-1.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f833670942247a14eafbb675458b4e61c82e002a148f49e68257b79296e865c4"}, + {file = "frozenlist-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:494a5952b1c597ba44e0e78113a7266e656b9794eec897b19ead706bd7074383"}, + {file = "frozenlist-1.8.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96f423a119f4777a4a056b66ce11527366a8bb92f54e541ade21f2374433f6d4"}, + {file = "frozenlist-1.8.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3462dd9475af2025c31cc61be6652dfa25cbfb56cbbf52f4ccfe029f38decaf8"}, + {file = 
"frozenlist-1.8.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c4c800524c9cd9bac5166cd6f55285957fcfc907db323e193f2afcd4d9abd69b"}, + {file = "frozenlist-1.8.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d6a5df73acd3399d893dafc71663ad22534b5aa4f94e8a2fabfe856c3c1b6a52"}, + {file = "frozenlist-1.8.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:405e8fe955c2280ce66428b3ca55e12b3c4e9c336fb2103a4937e891c69a4a29"}, + {file = "frozenlist-1.8.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:908bd3f6439f2fef9e85031b59fd4f1297af54415fb60e4254a95f75b3cab3f3"}, + {file = "frozenlist-1.8.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:294e487f9ec720bd8ffcebc99d575f7eff3568a08a253d1ee1a0378754b74143"}, + {file = "frozenlist-1.8.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:74c51543498289c0c43656701be6b077f4b265868fa7f8a8859c197006efb608"}, + {file = "frozenlist-1.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:776f352e8329135506a1d6bf16ac3f87bc25b28e765949282dcc627af36123aa"}, + {file = "frozenlist-1.8.0-cp312-cp312-win32.whl", hash = "sha256:433403ae80709741ce34038da08511d4a77062aa924baf411ef73d1146e74faf"}, + {file = "frozenlist-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:34187385b08f866104f0c0617404c8eb08165ab1272e884abc89c112e9c00746"}, + {file = "frozenlist-1.8.0-cp312-cp312-win_arm64.whl", hash = "sha256:fe3c58d2f5db5fbd18c2987cba06d51b0529f52bc3a6cdc33d3f4eab725104bd"}, + {file = "frozenlist-1.8.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8d92f1a84bb12d9e56f818b3a746f3efba93c1b63c8387a73dde655e1e42282a"}, + {file = "frozenlist-1.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:96153e77a591c8adc2ee805756c61f59fef4cf4073a9275ee86fe8cba41241f7"}, + {file = "frozenlist-1.8.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f21f00a91358803399890ab167098c131ec2ddd5f8f5fd5fe9c9f2c6fcd91e40"}, + {file = "frozenlist-1.8.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fb30f9626572a76dfe4293c7194a09fb1fe93ba94c7d4f720dfae3b646b45027"}, + {file = "frozenlist-1.8.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eaa352d7047a31d87dafcacbabe89df0aa506abb5b1b85a2fb91bc3faa02d822"}, + {file = "frozenlist-1.8.0-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:03ae967b4e297f58f8c774c7eabcce57fe3c2434817d4385c50661845a058121"}, + {file = "frozenlist-1.8.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f6292f1de555ffcc675941d65fffffb0a5bcd992905015f85d0592201793e0e5"}, + {file = "frozenlist-1.8.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:29548f9b5b5e3460ce7378144c3010363d8035cea44bc0bf02d57f5a685e084e"}, + {file = "frozenlist-1.8.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ec3cc8c5d4084591b4237c0a272cc4f50a5b03396a47d9caaf76f5d7b38a4f11"}, + {file = "frozenlist-1.8.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:517279f58009d0b1f2e7c1b130b377a349405da3f7621ed6bfae50b10adf20c1"}, + {file = "frozenlist-1.8.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:db1e72ede2d0d7ccb213f218df6a078a9c09a7de257c2fe8fcef16d5925230b1"}, + {file = "frozenlist-1.8.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = 
"sha256:b4dec9482a65c54a5044486847b8a66bf10c9cb4926d42927ec4e8fd5db7fed8"}, + {file = "frozenlist-1.8.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:21900c48ae04d13d416f0e1e0c4d81f7931f73a9dfa0b7a8746fb2fe7dd970ed"}, + {file = "frozenlist-1.8.0-cp313-cp313-win32.whl", hash = "sha256:8b7b94a067d1c504ee0b16def57ad5738701e4ba10cec90529f13fa03c833496"}, + {file = "frozenlist-1.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:878be833caa6a3821caf85eb39c5ba92d28e85df26d57afb06b35b2efd937231"}, + {file = "frozenlist-1.8.0-cp313-cp313-win_arm64.whl", hash = "sha256:44389d135b3ff43ba8cc89ff7f51f5a0bb6b63d829c8300f79a2fe4fe61bcc62"}, + {file = "frozenlist-1.8.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:e25ac20a2ef37e91c1b39938b591457666a0fa835c7783c3a8f33ea42870db94"}, + {file = "frozenlist-1.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:07cdca25a91a4386d2e76ad992916a85038a9b97561bf7a3fd12d5d9ce31870c"}, + {file = "frozenlist-1.8.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:4e0c11f2cc6717e0a741f84a527c52616140741cd812a50422f83dc31749fb52"}, + {file = "frozenlist-1.8.0-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b3210649ee28062ea6099cfda39e147fa1bc039583c8ee4481cb7811e2448c51"}, + {file = "frozenlist-1.8.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:581ef5194c48035a7de2aefc72ac6539823bb71508189e5de01d60c9dcd5fa65"}, + {file = "frozenlist-1.8.0-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3ef2d026f16a2b1866e1d86fc4e1291e1ed8a387b2c333809419a2f8b3a77b82"}, + {file = "frozenlist-1.8.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:5500ef82073f599ac84d888e3a8c1f77ac831183244bfd7f11eaa0289fb30714"}, + {file = "frozenlist-1.8.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:50066c3997d0091c411a66e710f4e11752251e6d2d73d70d8d5d4c76442a199d"}, + {file = "frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:5c1c8e78426e59b3f8005e9b19f6ff46e5845895adbde20ece9218319eca6506"}, + {file = "frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:eefdba20de0d938cec6a89bd4d70f346a03108a19b9df4248d3cf0d88f1b0f51"}, + {file = "frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:cf253e0e1c3ceb4aaff6df637ce033ff6535fb8c70a764a8f46aafd3d6ab798e"}, + {file = "frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:032efa2674356903cd0261c4317a561a6850f3ac864a63fc1583147fb05a79b0"}, + {file = "frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6da155091429aeba16851ecb10a9104a108bcd32f6c1642867eadaee401c1c41"}, + {file = "frozenlist-1.8.0-cp313-cp313t-win32.whl", hash = "sha256:0f96534f8bfebc1a394209427d0f8a63d343c9779cda6fc25e8e121b5fd8555b"}, + {file = "frozenlist-1.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:5d63a068f978fc69421fb0e6eb91a9603187527c86b7cd3f534a5b77a592b888"}, + {file = "frozenlist-1.8.0-cp313-cp313t-win_arm64.whl", hash = "sha256:bf0a7e10b077bf5fb9380ad3ae8ce20ef919a6ad93b4552896419ac7e1d8e042"}, + {file = "frozenlist-1.8.0-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:cee686f1f4cadeb2136007ddedd0aaf928ab95216e7691c63e50a8ec066336d0"}, + {file = "frozenlist-1.8.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:119fb2a1bd47307e899c2fac7f28e85b9a543864df47aa7ec9d3c1b4545f096f"}, + {file = 
"frozenlist-1.8.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4970ece02dbc8c3a92fcc5228e36a3e933a01a999f7094ff7c23fbd2beeaa67c"}, + {file = "frozenlist-1.8.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:cba69cb73723c3f329622e34bdbf5ce1f80c21c290ff04256cff1cd3c2036ed2"}, + {file = "frozenlist-1.8.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:778a11b15673f6f1df23d9586f83c4846c471a8af693a22e066508b77d201ec8"}, + {file = "frozenlist-1.8.0-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:0325024fe97f94c41c08872db482cf8ac4800d80e79222c6b0b7b162d5b13686"}, + {file = "frozenlist-1.8.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:97260ff46b207a82a7567b581ab4190bd4dfa09f4db8a8b49d1a958f6aa4940e"}, + {file = "frozenlist-1.8.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:54b2077180eb7f83dd52c40b2750d0a9f175e06a42e3213ce047219de902717a"}, + {file = "frozenlist-1.8.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2f05983daecab868a31e1da44462873306d3cbfd76d1f0b5b69c473d21dbb128"}, + {file = "frozenlist-1.8.0-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:33f48f51a446114bc5d251fb2954ab0164d5be02ad3382abcbfe07e2531d650f"}, + {file = "frozenlist-1.8.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:154e55ec0655291b5dd1b8731c637ecdb50975a2ae70c606d100750a540082f7"}, + {file = "frozenlist-1.8.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:4314debad13beb564b708b4a496020e5306c7333fa9a3ab90374169a20ffab30"}, + {file = "frozenlist-1.8.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:073f8bf8becba60aa931eb3bc420b217bb7d5b8f4750e6f8b3be7f3da85d38b7"}, + {file = "frozenlist-1.8.0-cp314-cp314-win32.whl", hash = "sha256:bac9c42ba2ac65ddc115d930c78d24ab8d4f465fd3fc473cdedfccadb9429806"}, + {file = "frozenlist-1.8.0-cp314-cp314-win_amd64.whl", hash = "sha256:3e0761f4d1a44f1d1a47996511752cf3dcec5bbdd9cc2b4fe595caf97754b7a0"}, + {file = "frozenlist-1.8.0-cp314-cp314-win_arm64.whl", hash = "sha256:d1eaff1d00c7751b7c6662e9c5ba6eb2c17a2306ba5e2a37f24ddf3cc953402b"}, + {file = "frozenlist-1.8.0-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:d3bb933317c52d7ea5004a1c442eef86f426886fba134ef8cf4226ea6ee1821d"}, + {file = "frozenlist-1.8.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:8009897cdef112072f93a0efdce29cd819e717fd2f649ee3016efd3cd885a7ed"}, + {file = "frozenlist-1.8.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2c5dcbbc55383e5883246d11fd179782a9d07a986c40f49abe89ddf865913930"}, + {file = "frozenlist-1.8.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:39ecbc32f1390387d2aa4f5a995e465e9e2f79ba3adcac92d68e3e0afae6657c"}, + {file = "frozenlist-1.8.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92db2bf818d5cc8d9c1f1fc56b897662e24ea5adb36ad1f1d82875bd64e03c24"}, + {file = "frozenlist-1.8.0-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:2dc43a022e555de94c3b68a4ef0b11c4f747d12c024a520c7101709a2144fb37"}, + {file = "frozenlist-1.8.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cb89a7f2de3602cfed448095bab3f178399646ab7c61454315089787df07733a"}, + {file = 
"frozenlist-1.8.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:33139dc858c580ea50e7e60a1b0ea003efa1fd42e6ec7fdbad78fff65fad2fd2"}, + {file = "frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:168c0969a329b416119507ba30b9ea13688fafffac1b7822802537569a1cb0ef"}, + {file = "frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:28bd570e8e189d7f7b001966435f9dac6718324b5be2990ac496cf1ea9ddb7fe"}, + {file = "frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:b2a095d45c5d46e5e79ba1e5b9cb787f541a8dee0433836cea4b96a2c439dcd8"}, + {file = "frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:eab8145831a0d56ec9c4139b6c3e594c7a83c2c8be25d5bcf2d86136a532287a"}, + {file = "frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:974b28cf63cc99dfb2188d8d222bc6843656188164848c4f679e63dae4b0708e"}, + {file = "frozenlist-1.8.0-cp314-cp314t-win32.whl", hash = "sha256:342c97bf697ac5480c0a7ec73cd700ecfa5a8a40ac923bd035484616efecc2df"}, + {file = "frozenlist-1.8.0-cp314-cp314t-win_amd64.whl", hash = "sha256:06be8f67f39c8b1dc671f5d83aaefd3358ae5cdcf8314552c57e7ed3e6475bdd"}, + {file = "frozenlist-1.8.0-cp314-cp314t-win_arm64.whl", hash = "sha256:102e6314ca4da683dca92e3b1355490fed5f313b768500084fbe6371fddfdb79"}, + {file = "frozenlist-1.8.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:d8b7138e5cd0647e4523d6685b0eac5d4be9a184ae9634492f25c6eb38c12a47"}, + {file = "frozenlist-1.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a6483e309ca809f1efd154b4d37dc6d9f61037d6c6a81c2dc7a15cb22c8c5dca"}, + {file = "frozenlist-1.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1b9290cf81e95e93fdf90548ce9d3c1211cf574b8e3f4b3b7cb0537cf2227068"}, + {file = "frozenlist-1.8.0-cp39-cp39-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:59a6a5876ca59d1b63af8cd5e7ffffb024c3dc1e9cf9301b21a2e76286505c95"}, + {file = "frozenlist-1.8.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6dc4126390929823e2d2d9dc79ab4046ed74680360fc5f38b585c12c66cdf459"}, + {file = "frozenlist-1.8.0-cp39-cp39-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:332db6b2563333c5671fecacd085141b5800cb866be16d5e3eb15a2086476675"}, + {file = "frozenlist-1.8.0-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9ff15928d62a0b80bb875655c39bf517938c7d589554cbd2669be42d97c2cb61"}, + {file = "frozenlist-1.8.0-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:7bf6cdf8e07c8151fba6fe85735441240ec7f619f935a5205953d58009aef8c6"}, + {file = "frozenlist-1.8.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:48e6d3f4ec5c7273dfe83ff27c91083c6c9065af655dc2684d2c200c94308bb5"}, + {file = "frozenlist-1.8.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:1a7607e17ad33361677adcd1443edf6f5da0ce5e5377b798fba20fae194825f3"}, + {file = "frozenlist-1.8.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:5a3a935c3a4e89c733303a2d5a7c257ea44af3a56c8202df486b7f5de40f37e1"}, + {file = "frozenlist-1.8.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:940d4a017dbfed9daf46a3b086e1d2167e7012ee297fef9e1c545c4d022f5178"}, + {file = "frozenlist-1.8.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b9be22a69a014bc47e78072d0ecae716f5eb56c15238acca0f43d6eb8e4a5bda"}, + {file = "frozenlist-1.8.0-cp39-cp39-win32.whl", hash = 
"sha256:1aa77cb5697069af47472e39612976ed05343ff2e84a3dcf15437b232cbfd087"}, + {file = "frozenlist-1.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:7398c222d1d405e796970320036b1b563892b65809d9e5261487bb2c7f7b5c6a"}, + {file = "frozenlist-1.8.0-cp39-cp39-win_arm64.whl", hash = "sha256:b4f3b365f31c6cd4af24545ca0a244a53688cad8834e32f56831c4923b50a103"}, + {file = "frozenlist-1.8.0-py3-none-any.whl", hash = "sha256:0c18a16eab41e82c295618a77502e17b195883241c563b00f0aa5106fc4eaa0d"}, + {file = "frozenlist-1.8.0.tar.gz", hash = "sha256:3ede829ed8d842f6cd48fc7081d7a41001a56f1f38603f9d49bf3020d59a31ad"}, ] [[package]] name = "fsspec" -version = "2025.5.1" +version = "2025.10.0" description = "File-system specification" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "fsspec-2025.5.1-py3-none-any.whl", hash = "sha256:24d3a2e663d5fc735ab256263c4075f374a174c3410c0b25e5bd1970bceaa462"}, - {file = "fsspec-2025.5.1.tar.gz", hash = "sha256:2e55e47a540b91843b755e83ded97c6e897fa0942b11490113f09e9c443c2475"}, + {file = "fsspec-2025.10.0-py3-none-any.whl", hash = "sha256:7c7712353ae7d875407f97715f0e1ffcc21e33d5b24556cb1e090ae9409ec61d"}, + {file = "fsspec-2025.10.0.tar.gz", hash = "sha256:b6789427626f068f9a83ca4e8a3cc050850b6c0f71f99ddb4f542b8266a26a59"}, ] [package.extras] @@ -1363,7 +1743,7 @@ abfs = ["adlfs"] adl = ["adlfs"] arrow = ["pyarrow (>=1)"] dask = ["dask", "distributed"] -dev = ["pre-commit", "ruff"] +dev = ["pre-commit", "ruff (>=0.5)"] doc = ["numpydoc", "sphinx", "sphinx-design", "sphinx-rtd-theme", "yarl"] dropbox = ["dropbox", "dropboxdrivefs", "requests"] full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "dask", "distributed", "dropbox", "dropboxdrivefs", "fusepy", "gcsfs", "libarchive-c", "ocifs", "panel", "paramiko", "pyarrow (>=1)", "pygit2", "requests", "s3fs", "smbprotocol", "tqdm"] @@ -1383,7 +1763,7 @@ smb = ["smbprotocol"] ssh = ["paramiko"] test = ["aiohttp (!=4.0.0a0,!=4.0.0a1)", "numpy", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "requests"] test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask[dataframe,test]", "moto[server] (>4,<5)", "pytest-timeout", "xarray"] -test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard"] +test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard ; python_version < \"3.14\""] tqdm = ["tqdm"] [[package]] @@ -1392,6 +1772,7 @@ version = "4.0.12" description = "Git Object Database" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "gitdb-4.0.12-py3-none-any.whl", hash = 
"sha256:67073e15955400952c6565cc3e707c554a4eea2e428946f7a4c162fab9bd9bcf"}, {file = "gitdb-4.0.12.tar.gz", hash = "sha256:5ef71f855d191a3326fcfbc0d5da835f26b13fbcba60c32c21091c349ffdb571"}, @@ -1402,13 +1783,14 @@ smmap = ">=3.0.1,<6" [[package]] name = "gitpython" -version = "3.1.44" +version = "3.1.45" description = "GitPython is a Python library used to interact with Git repositories" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ - {file = "GitPython-3.1.44-py3-none-any.whl", hash = "sha256:9e0e10cda9bed1ee64bc9a6de50e7e38a9c9943241cd7f585f6df3ed28011110"}, - {file = "gitpython-3.1.44.tar.gz", hash = "sha256:c87e30b26253bf5418b01b0660f818967f3c503193838337fe5e573331249269"}, + {file = "gitpython-3.1.45-py3-none-any.whl", hash = "sha256:8908cb2e02fb3b93b7eb0f2827125cb699869470432cc885f019b8fd0fccff77"}, + {file = "gitpython-3.1.45.tar.gz", hash = "sha256:85b0ee964ceddf211c41b9f27a49086010a190fd8132a24e21f362a4b36a791c"}, ] [package.dependencies] @@ -1416,32 +1798,33 @@ gitdb = ">=4.0.1,<5" [package.extras] doc = ["sphinx (>=7.1.2,<7.2)", "sphinx-autodoc-typehints", "sphinx_rtd_theme"] -test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "typing-extensions"] +test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock ; python_version < \"3.8\"", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "typing-extensions ; python_version < \"3.11\""] [[package]] name = "google-auth" -version = "2.40.3" +version = "2.43.0" description = "Google Authentication Library" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ - {file = "google_auth-2.40.3-py2.py3-none-any.whl", hash = "sha256:1370d4593e86213563547f97a92752fc658456fe4514c809544f330fed45a7ca"}, - {file = "google_auth-2.40.3.tar.gz", hash = "sha256:500c3a29adedeb36ea9cf24b8d10858e152f2412e3ca37829b3fa18e33d63b77"}, + {file = "google_auth-2.43.0-py2.py3-none-any.whl", hash = "sha256:af628ba6fa493f75c7e9dbe9373d148ca9f4399b5ea29976519e0a3848eddd16"}, + {file = "google_auth-2.43.0.tar.gz", hash = "sha256:88228eee5fc21b62a1b5fe773ca15e67778cb07dc8363adcb4a8827b52d81483"}, ] [package.dependencies] -cachetools = ">=2.0.0,<6.0" +cachetools = ">=2.0.0,<7.0" pyasn1-modules = ">=0.2.1" rsa = ">=3.1.4,<5" [package.extras] aiohttp = ["aiohttp (>=3.6.2,<4.0.0)", "requests (>=2.20.0,<3.0.0)"] enterprise-cert = ["cryptography", "pyopenssl"] -pyjwt = ["cryptography (<39.0.0)", "cryptography (>=38.0.3)", "pyjwt (>=2.0)"] -pyopenssl = ["cryptography (<39.0.0)", "cryptography (>=38.0.3)", "pyopenssl (>=20.0.0)"] +pyjwt = ["cryptography (<39.0.0) ; python_version < \"3.8\"", "cryptography (>=38.0.3)", "pyjwt (>=2.0)"] +pyopenssl = ["cryptography (<39.0.0) ; python_version < \"3.8\"", "cryptography (>=38.0.3)", "pyopenssl (>=20.0.0)"] reauth = ["pyu2f (>=0.1.5)"] requests = ["requests (>=2.20.0,<3.0.0)"] -testing = ["aiohttp (<3.10.0)", "aiohttp (>=3.6.2,<4.0.0)", "aioresponses", "cryptography (<39.0.0)", "cryptography (>=38.0.3)", "flask", "freezegun", "grpcio", "mock", "oauth2client", "packaging", "pyjwt (>=2.0)", "pyopenssl (<24.3.0)", "pyopenssl (>=20.0.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-localserver", "pyu2f (>=0.1.5)", "requests (>=2.20.0,<3.0.0)", "responses", "urllib3"] +testing = ["aiohttp (<3.10.0)", "aiohttp (>=3.6.2,<4.0.0)", "aioresponses", "cryptography (<39.0.0) ; python_version < \"3.8\"", "cryptography (<39.0.0) ; 
python_version < \"3.8\"", "cryptography (>=38.0.3)", "cryptography (>=38.0.3)", "flask", "freezegun", "grpcio", "mock", "oauth2client", "packaging", "pyjwt (>=2.0)", "pyopenssl (<24.3.0)", "pyopenssl (>=20.0.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-localserver", "pyu2f (>=0.1.5)", "requests (>=2.20.0,<3.0.0)", "responses", "urllib3"] urllib3 = ["packaging", "urllib3"] [[package]] @@ -1450,6 +1833,7 @@ version = "3.4.3" description = "GraphQL Framework for Python" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "graphene-3.4.3-py2.py3-none-any.whl", hash = "sha256:820db6289754c181007a150db1f7fff544b94142b556d12e3ebc777a7bf36c71"}, {file = "graphene-3.4.3.tar.gz", hash = "sha256:2a3786948ce75fe7e078443d37f609cbe5bb36ad8d6b828740ad3b95ed1a0aaa"}, @@ -1467,13 +1851,14 @@ test = ["coveralls (>=3.3,<5)", "pytest (>=8,<9)", "pytest-asyncio (>=0.16,<2)", [[package]] name = "graphql-core" -version = "3.2.6" +version = "3.2.7" description = "GraphQL implementation for Python, a port of GraphQL.js, the JavaScript reference implementation for GraphQL." optional = false -python-versions = "<4,>=3.6" +python-versions = "<4,>=3.7" +groups = ["main"] files = [ - {file = "graphql_core-3.2.6-py3-none-any.whl", hash = "sha256:78b016718c161a6fb20a7d97bbf107f331cd1afe53e45566c59f776ed7f0b45f"}, - {file = "graphql_core-3.2.6.tar.gz", hash = "sha256:c08eec22f9e40f0bd61d805907e3b3b1b9a320bc606e23dc145eebca07c8fbab"}, + {file = "graphql_core-3.2.7-py3-none-any.whl", hash = "sha256:17fc8f3ca4a42913d8e24d9ac9f08deddf0a0b2483076575757f6c412ead2ec0"}, + {file = "graphql_core-3.2.7.tar.gz", hash = "sha256:27b6904bdd3b43f2a0556dad5d579bdfdeab1f38e8e8788e555bdcb586a6f62c"}, ] [[package]] @@ -1482,6 +1867,7 @@ version = "3.2.0" description = "Relay library for graphql-core" optional = false python-versions = ">=3.6,<4" +groups = ["main"] files = [ {file = "graphql-relay-3.2.0.tar.gz", hash = "sha256:1ff1c51298356e481a0be009ccdff249832ce53f30559c1338f22a0e0d17250c"}, {file = "graphql_relay-3.2.0-py3-none-any.whl", hash = "sha256:c9b22bd28b170ba1fe674c74384a8ff30a76c8e26f88ac3aa1584dd3179953e5"}, @@ -1492,70 +1878,84 @@ graphql-core = ">=3.2,<3.3" [[package]] name = "greenlet" -version = "3.2.3" +version = "3.2.4" description = "Lightweight in-process concurrent programming" optional = false python-versions = ">=3.9" -files = [ - {file = "greenlet-3.2.3-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:1afd685acd5597349ee6d7a88a8bec83ce13c106ac78c196ee9dde7c04fe87be"}, - {file = "greenlet-3.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:761917cac215c61e9dc7324b2606107b3b292a8349bdebb31503ab4de3f559ac"}, - {file = "greenlet-3.2.3-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:a433dbc54e4a37e4fff90ef34f25a8c00aed99b06856f0119dcf09fbafa16392"}, - {file = "greenlet-3.2.3-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:72e77ed69312bab0434d7292316d5afd6896192ac4327d44f3d613ecb85b037c"}, - {file = "greenlet-3.2.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:68671180e3849b963649254a882cd544a3c75bfcd2c527346ad8bb53494444db"}, - {file = "greenlet-3.2.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:49c8cfb18fb419b3d08e011228ef8a25882397f3a859b9fe1436946140b6756b"}, - {file = "greenlet-3.2.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:efc6dc8a792243c31f2f5674b670b3a95d46fa1c6a912b8e310d6f542e7b0712"}, - {file = 
"greenlet-3.2.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:731e154aba8e757aedd0781d4b240f1225b075b4409f1bb83b05ff410582cf00"}, - {file = "greenlet-3.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:96c20252c2f792defe9a115d3287e14811036d51e78b3aaddbee23b69b216302"}, - {file = "greenlet-3.2.3-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:784ae58bba89fa1fa5733d170d42486580cab9decda3484779f4759345b29822"}, - {file = "greenlet-3.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0921ac4ea42a5315d3446120ad48f90c3a6b9bb93dd9b3cf4e4d84a66e42de83"}, - {file = "greenlet-3.2.3-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:d2971d93bb99e05f8c2c0c2f4aa9484a18d98c4c3bd3c62b65b7e6ae33dfcfaf"}, - {file = "greenlet-3.2.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c667c0bf9d406b77a15c924ef3285e1e05250948001220368e039b6aa5b5034b"}, - {file = "greenlet-3.2.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:592c12fb1165be74592f5de0d70f82bc5ba552ac44800d632214b76089945147"}, - {file = "greenlet-3.2.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:29e184536ba333003540790ba29829ac14bb645514fbd7e32af331e8202a62a5"}, - {file = "greenlet-3.2.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:93c0bb79844a367782ec4f429d07589417052e621aa39a5ac1fb99c5aa308edc"}, - {file = "greenlet-3.2.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:751261fc5ad7b6705f5f76726567375bb2104a059454e0226e1eef6c756748ba"}, - {file = "greenlet-3.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:83a8761c75312361aa2b5b903b79da97f13f556164a7dd2d5448655425bd4c34"}, - {file = "greenlet-3.2.3-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:25ad29caed5783d4bd7a85c9251c651696164622494c00802a139c00d639242d"}, - {file = "greenlet-3.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:88cd97bf37fe24a6710ec6a3a7799f3f81d9cd33317dcf565ff9950c83f55e0b"}, - {file = "greenlet-3.2.3-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:baeedccca94880d2f5666b4fa16fc20ef50ba1ee353ee2d7092b383a243b0b0d"}, - {file = "greenlet-3.2.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:be52af4b6292baecfa0f397f3edb3c6092ce071b499dd6fe292c9ac9f2c8f264"}, - {file = "greenlet-3.2.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0cc73378150b8b78b0c9fe2ce56e166695e67478550769536a6742dca3651688"}, - {file = "greenlet-3.2.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:706d016a03e78df129f68c4c9b4c4f963f7d73534e48a24f5f5a7101ed13dbbb"}, - {file = "greenlet-3.2.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:419e60f80709510c343c57b4bb5a339d8767bf9aef9b8ce43f4f143240f88b7c"}, - {file = "greenlet-3.2.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:93d48533fade144203816783373f27a97e4193177ebaaf0fc396db19e5d61163"}, - {file = "greenlet-3.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:7454d37c740bb27bdeddfc3f358f26956a07d5220818ceb467a483197d84f849"}, - {file = "greenlet-3.2.3-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:500b8689aa9dd1ab26872a34084503aeddefcb438e2e7317b89b11eaea1901ad"}, - {file = "greenlet-3.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a07d3472c2a93117af3b0136f246b2833fdc0b542d4a9799ae5f41c28323faef"}, - {file = 
"greenlet-3.2.3-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:8704b3768d2f51150626962f4b9a9e4a17d2e37c8a8d9867bbd9fa4eb938d3b3"}, - {file = "greenlet-3.2.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:5035d77a27b7c62db6cf41cf786cfe2242644a7a337a0e155c80960598baab95"}, - {file = "greenlet-3.2.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2d8aa5423cd4a396792f6d4580f88bdc6efcb9205891c9d40d20f6e670992efb"}, - {file = "greenlet-3.2.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2c724620a101f8170065d7dded3f962a2aea7a7dae133a009cada42847e04a7b"}, - {file = "greenlet-3.2.3-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:873abe55f134c48e1f2a6f53f7d1419192a3d1a4e873bace00499a4e45ea6af0"}, - {file = "greenlet-3.2.3-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:024571bbce5f2c1cfff08bf3fbaa43bbc7444f580ae13b0099e95d0e6e67ed36"}, - {file = "greenlet-3.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:5195fb1e75e592dd04ce79881c8a22becdfa3e6f500e7feb059b1e6fdd54d3e3"}, - {file = "greenlet-3.2.3-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:3d04332dddb10b4a211b68111dabaee2e1a073663d117dc10247b5b1642bac86"}, - {file = "greenlet-3.2.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8186162dffde068a465deab08fc72c767196895c39db26ab1c17c0b77a6d8b97"}, - {file = "greenlet-3.2.3-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f4bfbaa6096b1b7a200024784217defedf46a07c2eee1a498e94a1b5f8ec5728"}, - {file = "greenlet-3.2.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:ed6cfa9200484d234d8394c70f5492f144b20d4533f69262d530a1a082f6ee9a"}, - {file = "greenlet-3.2.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:02b0df6f63cd15012bed5401b47829cfd2e97052dc89da3cfaf2c779124eb892"}, - {file = "greenlet-3.2.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:86c2d68e87107c1792e2e8d5399acec2487a4e993ab76c792408e59394d52141"}, - {file = "greenlet-3.2.3-cp314-cp314-win_amd64.whl", hash = "sha256:8c47aae8fbbfcf82cc13327ae802ba13c9c36753b67e760023fd116bc124a62a"}, - {file = "greenlet-3.2.3-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:42efc522c0bd75ffa11a71e09cd8a399d83fafe36db250a87cf1dacfaa15dc64"}, - {file = "greenlet-3.2.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d760f9bdfe79bff803bad32b4d8ffb2c1d2ce906313fc10a83976ffb73d64ca7"}, - {file = "greenlet-3.2.3-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:8324319cbd7b35b97990090808fdc99c27fe5338f87db50514959f8059999805"}, - {file = "greenlet-3.2.3-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:8c37ef5b3787567d322331d5250e44e42b58c8c713859b8a04c6065f27efbf72"}, - {file = "greenlet-3.2.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ce539fb52fb774d0802175d37fcff5c723e2c7d249c65916257f0a940cee8904"}, - {file = "greenlet-3.2.3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:003c930e0e074db83559edc8705f3a2d066d4aa8c2f198aff1e454946efd0f26"}, - {file = "greenlet-3.2.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7e70ea4384b81ef9e84192e8a77fb87573138aa5d4feee541d8014e452b434da"}, - {file = "greenlet-3.2.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:22eb5ba839c4b2156f18f76768233fe44b23a31decd9cc0d4cc8141c211fd1b4"}, - {file = 
"greenlet-3.2.3-cp39-cp39-win32.whl", hash = "sha256:4532f0d25df67f896d137431b13f4cdce89f7e3d4a96387a41290910df4d3a57"}, - {file = "greenlet-3.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:aaa7aae1e7f75eaa3ae400ad98f8644bb81e1dc6ba47ce8a93d3f17274e08322"}, - {file = "greenlet-3.2.3.tar.gz", hash = "sha256:8b0dd8ae4c0d6f5e54ee55ba935eeb3d735a9b58a8a1e5b5cbab64e01a39f365"}, +groups = ["main"] +markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\"" +files = [ + {file = "greenlet-3.2.4-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:8c68325b0d0acf8d91dde4e6f930967dd52a5302cd4062932a6b2e7c2969f47c"}, + {file = "greenlet-3.2.4-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:94385f101946790ae13da500603491f04a76b6e4c059dab271b3ce2e283b2590"}, + {file = "greenlet-3.2.4-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f10fd42b5ee276335863712fa3da6608e93f70629c631bf77145021600abc23c"}, + {file = "greenlet-3.2.4-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c8c9e331e58180d0d83c5b7999255721b725913ff6bc6cf39fa2a45841a4fd4b"}, + {file = "greenlet-3.2.4-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:58b97143c9cc7b86fc458f215bd0932f1757ce649e05b640fea2e79b54cedb31"}, + {file = "greenlet-3.2.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2ca18a03a8cfb5b25bc1cbe20f3d9a4c80d8c3b13ba3df49ac3961af0b1018d"}, + {file = "greenlet-3.2.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9fe0a28a7b952a21e2c062cd5756d34354117796c6d9215a87f55e38d15402c5"}, + {file = "greenlet-3.2.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8854167e06950ca75b898b104b63cc646573aa5fef1353d4508ecdd1ee76254f"}, + {file = "greenlet-3.2.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f47617f698838ba98f4ff4189aef02e7343952df3a615f847bb575c3feb177a7"}, + {file = "greenlet-3.2.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:af41be48a4f60429d5cad9d22175217805098a9ef7c40bfef44f7669fb9d74d8"}, + {file = "greenlet-3.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:73f49b5368b5359d04e18d15828eecc1806033db5233397748f4ca813ff1056c"}, + {file = "greenlet-3.2.4-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:96378df1de302bc38e99c3a9aa311967b7dc80ced1dcc6f171e99842987882a2"}, + {file = "greenlet-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1ee8fae0519a337f2329cb78bd7a8e128ec0f881073d43f023c7b8d4831d5246"}, + {file = "greenlet-3.2.4-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:94abf90142c2a18151632371140b3dba4dee031633fe614cb592dbb6c9e17bc3"}, + {file = "greenlet-3.2.4-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:4d1378601b85e2e5171b99be8d2dc85f594c79967599328f95c1dc1a40f1c633"}, + {file = "greenlet-3.2.4-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0db5594dce18db94f7d1650d7489909b57afde4c580806b8d9203b6e79cdc079"}, + {file = "greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8"}, + {file = "greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52"}, + {file = 
"greenlet-3.2.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:55e9c5affaa6775e2c6b67659f3a71684de4c549b3dd9afca3bc773533d284fa"}, + {file = "greenlet-3.2.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9c6de1940a7d828635fbd254d69db79e54619f165ee7ce32fda763a9cb6a58c"}, + {file = "greenlet-3.2.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03c5136e7be905045160b1b9fdca93dd6727b180feeafda6818e6496434ed8c5"}, + {file = "greenlet-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:9c40adce87eaa9ddb593ccb0fa6a07caf34015a29bf8d344811665b573138db9"}, + {file = "greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd"}, + {file = "greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb"}, + {file = "greenlet-3.2.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f28588772bb5fb869a8eb331374ec06f24a83a9c25bfa1f38b6993afe9c1e968"}, + {file = "greenlet-3.2.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:5c9320971821a7cb77cfab8d956fa8e39cd07ca44b6070db358ceb7f8797c8c9"}, + {file = "greenlet-3.2.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c60a6d84229b271d44b70fb6e5fa23781abb5d742af7b808ae3f6efd7c9c60f6"}, + {file = "greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0"}, + {file = "greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0"}, + {file = "greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f"}, + {file = "greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0"}, + {file = "greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d"}, + {file = "greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02"}, + {file = "greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31"}, + {file = "greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945"}, + {file = "greenlet-3.2.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:710638eb93b1fa52823aa91bf75326f9ecdfd5e0466f00789246a5280f4ba0fc"}, + {file = "greenlet-3.2.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c5111ccdc9c88f423426df3fd1811bfc40ed66264d35aa373420a34377efc98a"}, + {file = "greenlet-3.2.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d76383238584e9711e20ebe14db6c88ddcedc1829a9ad31a584389463b5aa504"}, + {file = "greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671"}, + {file = "greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b"}, + {file = "greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = 
"sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae"}, + {file = "greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b"}, + {file = "greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929"}, + {file = "greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b"}, + {file = "greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0"}, + {file = "greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f"}, + {file = "greenlet-3.2.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c17b6b34111ea72fc5a4e4beec9711d2226285f0386ea83477cbb97c30a3f3a5"}, + {file = "greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1"}, + {file = "greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735"}, + {file = "greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337"}, + {file = "greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269"}, + {file = "greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681"}, + {file = "greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01"}, + {file = "greenlet-3.2.4-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:b6a7c19cf0d2742d0809a4c05975db036fdff50cd294a93632d6a310bf9ac02c"}, + {file = "greenlet-3.2.4-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:27890167f55d2387576d1f41d9487ef171849ea0359ce1510ca6e06c8bece11d"}, + {file = "greenlet-3.2.4-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:18d9260df2b5fbf41ae5139e1be4e796d99655f023a636cd0e11e6406cca7d58"}, + {file = "greenlet-3.2.4-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:671df96c1f23c4a0d4077a325483c1503c96a1b7d9db26592ae770daa41233d4"}, + {file = "greenlet-3.2.4-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:16458c245a38991aa19676900d48bd1a6f2ce3e16595051a4db9d012154e8433"}, + {file = "greenlet-3.2.4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9913f1a30e4526f432991f89ae263459b1c64d1608c0d22a5c79c287b3c70df"}, + {file = "greenlet-3.2.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b90654e092f928f110e0007f572007c9727b5265f7632c2fa7415b4689351594"}, + {file = "greenlet-3.2.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:81701fd84f26330f0d5f4944d4e92e61afe6319dcd9775e39396e39d7c3e5f98"}, + {file = "greenlet-3.2.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:28a3c6b7cd72a96f61b0e4b2a36f681025b60ae4779cc73c1535eb5f29560b10"}, + {file = "greenlet-3.2.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:52206cd642670b0b320a1fd1cbfd95bca0e043179c1d8a045f2c6109dfe973be"}, + {file = 
"greenlet-3.2.4-cp39-cp39-win32.whl", hash = "sha256:65458b409c1ed459ea899e939f0e1cdb14f58dbc803f2f93c5eab5694d32671b"}, + {file = "greenlet-3.2.4-cp39-cp39-win_amd64.whl", hash = "sha256:d2e685ade4dafd447ede19c31277a224a239a0a1a4eca4e6390efedf20260cfb"}, + {file = "greenlet-3.2.4.tar.gz", hash = "sha256:0dca0d95ff849f9a364385f36ab49f50065d76964944638be9691e1832e9f86d"}, ] [package.extras] docs = ["Sphinx", "furo"] -test = ["objgraph", "psutil"] +test = ["objgraph", "psutil", "setuptools"] [[package]] name = "gunicorn" @@ -1563,6 +1963,8 @@ version = "23.0.0" description = "WSGI HTTP Server for UNIX" optional = false python-versions = ">=3.7" +groups = ["main"] +markers = "platform_system != \"Windows\"" files = [ {file = "gunicorn-23.0.0-py3-none-any.whl", hash = "sha256:ec400d38950de4dfd418cff8328b2c8faed0edb0d517d3394e457c317908ca4d"}, {file = "gunicorn-23.0.0.tar.gz", hash = "sha256:f014447a0101dc57e294f6c18ca6b40227a4c90e9bdb586042628030cba004ec"}, @@ -1584,6 +1986,7 @@ version = "0.16.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86"}, {file = "h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1"}, @@ -1591,30 +1994,62 @@ files = [ [[package]] name = "hf-xet" -version = "1.1.3" +version = "1.2.0" description = "Fast transfer of large files with the Hugging Face Hub." optional = false python-versions = ">=3.8" -files = [ - {file = "hf_xet-1.1.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:c3b508b5f583a75641aebf732853deb058953370ce8184f5dabc49f803b0819b"}, - {file = "hf_xet-1.1.3-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:b788a61977fbe6b5186e66239e2a329a3f0b7e7ff50dad38984c0c74f44aeca1"}, - {file = "hf_xet-1.1.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd2da210856444a34aad8ada2fc12f70dabed7cc20f37e90754d1d9b43bc0534"}, - {file = "hf_xet-1.1.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:8203f52827e3df65981984936654a5b390566336956f65765a8aa58c362bb841"}, - {file = "hf_xet-1.1.3-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:30c575a5306f8e6fda37edb866762140a435037365eba7a17ce7bd0bc0216a8b"}, - {file = "hf_xet-1.1.3-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:7c1a6aa6abed1f696f8099aa9796ca04c9ee778a58728a115607de9cc4638ff1"}, - {file = "hf_xet-1.1.3-cp37-abi3-win_amd64.whl", hash = "sha256:b578ae5ac9c056296bb0df9d018e597c8dc6390c5266f35b5c44696003cde9f3"}, - {file = "hf_xet-1.1.3.tar.gz", hash = "sha256:a5f09b1dd24e6ff6bcedb4b0ddab2d81824098bb002cf8b4ffa780545fa348c3"}, +groups = ["main"] +markers = "platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"arm64\" or platform_machine == \"aarch64\"" +files = [ + {file = "hf_xet-1.2.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:ceeefcd1b7aed4956ae8499e2199607765fbd1c60510752003b6cc0b8413b649"}, + {file = "hf_xet-1.2.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b70218dd548e9840224df5638fdc94bd033552963cfa97f9170829381179c813"}, + {file = "hf_xet-1.2.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d40b18769bb9a8bc82a9ede575ce1a44c75eb80e7375a01d76259089529b5dc"}, + {file = "hf_xet-1.2.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:cd3a6027d59cfb60177c12d6424e31f4b5ff13d8e3a1247b3a584bf8977e6df5"}, + {file = 
"hf_xet-1.2.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6de1fc44f58f6dd937956c8d304d8c2dea264c80680bcfa61ca4a15e7b76780f"}, + {file = "hf_xet-1.2.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f182f264ed2acd566c514e45da9f2119110e48a87a327ca271027904c70c5832"}, + {file = "hf_xet-1.2.0-cp313-cp313t-win_amd64.whl", hash = "sha256:293a7a3787e5c95d7be1857358a9130694a9c6021de3f27fa233f37267174382"}, + {file = "hf_xet-1.2.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:10bfab528b968c70e062607f663e21e34e2bba349e8038db546646875495179e"}, + {file = "hf_xet-1.2.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2a212e842647b02eb6a911187dc878e79c4aa0aa397e88dd3b26761676e8c1f8"}, + {file = "hf_xet-1.2.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30e06daccb3a7d4c065f34fc26c14c74f4653069bb2b194e7f18f17cbe9939c0"}, + {file = "hf_xet-1.2.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:29c8fc913a529ec0a91867ce3d119ac1aac966e098cf49501800c870328cc090"}, + {file = "hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e159cbfcfbb29f920db2c09ed8b660eb894640d284f102ada929b6e3dc410a"}, + {file = "hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9c91d5ae931510107f148874e9e2de8a16052b6f1b3ca3c1b12f15ccb491390f"}, + {file = "hf_xet-1.2.0-cp314-cp314t-win_amd64.whl", hash = "sha256:210d577732b519ac6ede149d2f2f34049d44e8622bf14eb3d63bbcd2d4b332dc"}, + {file = "hf_xet-1.2.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:46740d4ac024a7ca9b22bebf77460ff43332868b661186a8e46c227fdae01848"}, + {file = "hf_xet-1.2.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:27df617a076420d8845bea087f59303da8be17ed7ec0cd7ee3b9b9f579dff0e4"}, + {file = "hf_xet-1.2.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3651fd5bfe0281951b988c0facbe726aa5e347b103a675f49a3fa8144c7968fd"}, + {file = "hf_xet-1.2.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d06fa97c8562fb3ee7a378dd9b51e343bc5bc8190254202c9771029152f5e08c"}, + {file = "hf_xet-1.2.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:4c1428c9ae73ec0939410ec73023c4f842927f39db09b063b9482dac5a3bb737"}, + {file = "hf_xet-1.2.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a55558084c16b09b5ed32ab9ed38421e2d87cf3f1f89815764d1177081b99865"}, + {file = "hf_xet-1.2.0-cp37-abi3-win_amd64.whl", hash = "sha256:e6584a52253f72c9f52f9e549d5895ca7a471608495c4ecaa6cc73dba2b24d69"}, + {file = "hf_xet-1.2.0.tar.gz", hash = "sha256:a8c27070ca547293b6890c4bf389f713f80e8c478631432962bb7f4bc0bd7d7f"}, ] [package.extras] tests = ["pytest"] +[[package]] +name = "holidays" +version = "0.85" +description = "Open World Holidays Framework" +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "holidays-0.85-py3-none-any.whl", hash = "sha256:46445107ee3251c7e2daa23773a86921fa2e29d09f850038e7bfa2c75a434ad8"}, + {file = "holidays-0.85.tar.gz", hash = "sha256:4291155456c6abde885487a05bd16f523d4ef4136819a55833ff86f60ca3cc3b"}, +] + +[package.dependencies] +python-dateutil = "*" + [[package]] name = "httpcore" version = "1.0.9" description = "A minimal low-level HTTP client." 
optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55"}, {file = "httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8"}, @@ -1632,58 +2067,57 @@ trio = ["trio (>=0.22.0,<1.0)"] [[package]] name = "httptools" -version = "0.6.4" +version = "0.7.1" description = "A collection of framework independent HTTP protocol utils." optional = true -python-versions = ">=3.8.0" -files = [ - {file = "httptools-0.6.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3c73ce323711a6ffb0d247dcd5a550b8babf0f757e86a52558fe5b86d6fefcc0"}, - {file = "httptools-0.6.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:345c288418f0944a6fe67be8e6afa9262b18c7626c3ef3c28adc5eabc06a68da"}, - {file = "httptools-0.6.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:deee0e3343f98ee8047e9f4c5bc7cedbf69f5734454a94c38ee829fb2d5fa3c1"}, - {file = "httptools-0.6.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca80b7485c76f768a3bc83ea58373f8db7b015551117375e4918e2aa77ea9b50"}, - {file = "httptools-0.6.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:90d96a385fa941283ebd231464045187a31ad932ebfa541be8edf5b3c2328959"}, - {file = "httptools-0.6.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:59e724f8b332319e2875efd360e61ac07f33b492889284a3e05e6d13746876f4"}, - {file = "httptools-0.6.4-cp310-cp310-win_amd64.whl", hash = "sha256:c26f313951f6e26147833fc923f78f95604bbec812a43e5ee37f26dc9e5a686c"}, - {file = "httptools-0.6.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f47f8ed67cc0ff862b84a1189831d1d33c963fb3ce1ee0c65d3b0cbe7b711069"}, - {file = "httptools-0.6.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0614154d5454c21b6410fdf5262b4a3ddb0f53f1e1721cfd59d55f32138c578a"}, - {file = "httptools-0.6.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8787367fbdfccae38e35abf7641dafc5310310a5987b689f4c32cc8cc3ee975"}, - {file = "httptools-0.6.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40b0f7fe4fd38e6a507bdb751db0379df1e99120c65fbdc8ee6c1d044897a636"}, - {file = "httptools-0.6.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:40a5ec98d3f49904b9fe36827dcf1aadfef3b89e2bd05b0e35e94f97c2b14721"}, - {file = "httptools-0.6.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:dacdd3d10ea1b4ca9df97a0a303cbacafc04b5cd375fa98732678151643d4988"}, - {file = "httptools-0.6.4-cp311-cp311-win_amd64.whl", hash = "sha256:288cd628406cc53f9a541cfaf06041b4c71d751856bab45e3702191f931ccd17"}, - {file = "httptools-0.6.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:df017d6c780287d5c80601dafa31f17bddb170232d85c066604d8558683711a2"}, - {file = "httptools-0.6.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:85071a1e8c2d051b507161f6c3e26155b5c790e4e28d7f236422dbacc2a9cc44"}, - {file = "httptools-0.6.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69422b7f458c5af875922cdb5bd586cc1f1033295aa9ff63ee196a87519ac8e1"}, - {file = "httptools-0.6.4-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:16e603a3bff50db08cd578d54f07032ca1631450ceb972c2f834c2b860c28ea2"}, - {file = 
"httptools-0.6.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec4f178901fa1834d4a060320d2f3abc5c9e39766953d038f1458cb885f47e81"}, - {file = "httptools-0.6.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f9eb89ecf8b290f2e293325c646a211ff1c2493222798bb80a530c5e7502494f"}, - {file = "httptools-0.6.4-cp312-cp312-win_amd64.whl", hash = "sha256:db78cb9ca56b59b016e64b6031eda5653be0589dba2b1b43453f6e8b405a0970"}, - {file = "httptools-0.6.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ade273d7e767d5fae13fa637f4d53b6e961fb7fd93c7797562663f0171c26660"}, - {file = "httptools-0.6.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:856f4bc0478ae143bad54a4242fccb1f3f86a6e1be5548fecfd4102061b3a083"}, - {file = "httptools-0.6.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:322d20ea9cdd1fa98bd6a74b77e2ec5b818abdc3d36695ab402a0de8ef2865a3"}, - {file = "httptools-0.6.4-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4d87b29bd4486c0093fc64dea80231f7c7f7eb4dc70ae394d70a495ab8436071"}, - {file = "httptools-0.6.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:342dd6946aa6bda4b8f18c734576106b8a31f2fe31492881a9a160ec84ff4bd5"}, - {file = "httptools-0.6.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b36913ba52008249223042dca46e69967985fb4051951f94357ea681e1f5dc0"}, - {file = "httptools-0.6.4-cp313-cp313-win_amd64.whl", hash = "sha256:28908df1b9bb8187393d5b5db91435ccc9c8e891657f9cbb42a2541b44c82fc8"}, - {file = "httptools-0.6.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:d3f0d369e7ffbe59c4b6116a44d6a8eb4783aae027f2c0b366cf0aa964185dba"}, - {file = "httptools-0.6.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:94978a49b8f4569ad607cd4946b759d90b285e39c0d4640c6b36ca7a3ddf2efc"}, - {file = "httptools-0.6.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40dc6a8e399e15ea525305a2ddba998b0af5caa2566bcd79dcbe8948181eeaff"}, - {file = "httptools-0.6.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ab9ba8dcf59de5181f6be44a77458e45a578fc99c31510b8c65b7d5acc3cf490"}, - {file = "httptools-0.6.4-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:fc411e1c0a7dcd2f902c7c48cf079947a7e65b5485dea9decb82b9105ca71a43"}, - {file = "httptools-0.6.4-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:d54efd20338ac52ba31e7da78e4a72570cf729fac82bc31ff9199bedf1dc7440"}, - {file = "httptools-0.6.4-cp38-cp38-win_amd64.whl", hash = "sha256:df959752a0c2748a65ab5387d08287abf6779ae9165916fe053e68ae1fbdc47f"}, - {file = "httptools-0.6.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:85797e37e8eeaa5439d33e556662cc370e474445d5fab24dcadc65a8ffb04003"}, - {file = "httptools-0.6.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:db353d22843cf1028f43c3651581e4bb49374d85692a85f95f7b9a130e1b2cab"}, - {file = "httptools-0.6.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1ffd262a73d7c28424252381a5b854c19d9de5f56f075445d33919a637e3547"}, - {file = "httptools-0.6.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:703c346571fa50d2e9856a37d7cd9435a25e7fd15e236c397bf224afaa355fe9"}, - {file = "httptools-0.6.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:aafe0f1918ed07b67c1e838f950b1c1fabc683030477e60b335649b8020e1076"}, - {file = "httptools-0.6.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = 
"sha256:0e563e54979e97b6d13f1bbc05a96109923e76b901f786a5eae36e99c01237bd"}, - {file = "httptools-0.6.4-cp39-cp39-win_amd64.whl", hash = "sha256:b799de31416ecc589ad79dd85a0b2657a8fe39327944998dea368c1d4c9e55e6"}, - {file = "httptools-0.6.4.tar.gz", hash = "sha256:4e93eee4add6493b59a5c514da98c939b244fce4a0d8879cd3f466562f4b7d5c"}, -] - -[package.extras] -test = ["Cython (>=0.29.24)"] +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"chatui\" or extra == \"all\"" +files = [ + {file = "httptools-0.7.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:11d01b0ff1fe02c4c32d60af61a4d613b74fad069e47e06e9067758c01e9ac78"}, + {file = "httptools-0.7.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:84d86c1e5afdc479a6fdabf570be0d3eb791df0ae727e8dbc0259ed1249998d4"}, + {file = "httptools-0.7.1-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c8c751014e13d88d2be5f5f14fc8b89612fcfa92a9cc480f2bc1598357a23a05"}, + {file = "httptools-0.7.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:654968cb6b6c77e37b832a9be3d3ecabb243bbe7a0b8f65fbc5b6b04c8fcabed"}, + {file = "httptools-0.7.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b580968316348b474b020edf3988eecd5d6eec4634ee6561e72ae3a2a0e00a8a"}, + {file = "httptools-0.7.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d496e2f5245319da9d764296e86c5bb6fcf0cf7a8806d3d000717a889c8c0b7b"}, + {file = "httptools-0.7.1-cp310-cp310-win_amd64.whl", hash = "sha256:cbf8317bfccf0fed3b5680c559d3459cccf1abe9039bfa159e62e391c7270568"}, + {file = "httptools-0.7.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:474d3b7ab469fefcca3697a10d11a32ee2b9573250206ba1e50d5980910da657"}, + {file = "httptools-0.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3c3b7366bb6c7b96bd72d0dbe7f7d5eead261361f013be5f6d9590465ea1c70"}, + {file = "httptools-0.7.1-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:379b479408b8747f47f3b253326183d7c009a3936518cdb70db58cffd369d9df"}, + {file = "httptools-0.7.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cad6b591a682dcc6cf1397c3900527f9affef1e55a06c4547264796bbd17cf5e"}, + {file = "httptools-0.7.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:eb844698d11433d2139bbeeb56499102143beb582bd6c194e3ba69c22f25c274"}, + {file = "httptools-0.7.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f65744d7a8bdb4bda5e1fa23e4ba16832860606fcc09d674d56e425e991539ec"}, + {file = "httptools-0.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:135fbe974b3718eada677229312e97f3b31f8a9c8ffa3ae6f565bf808d5b6bcb"}, + {file = "httptools-0.7.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:38e0c83a2ea9746ebbd643bdfb521b9aa4a91703e2cd705c20443405d2fd16a5"}, + {file = "httptools-0.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f25bbaf1235e27704f1a7b86cd3304eabc04f569c828101d94a0e605ef7205a5"}, + {file = "httptools-0.7.1-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2c15f37ef679ab9ecc06bfc4e6e8628c32a8e4b305459de7cf6785acd57e4d03"}, + {file = "httptools-0.7.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7fe6e96090df46b36ccfaf746f03034e5ab723162bc51b0a4cf58305324036f2"}, + {file = "httptools-0.7.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:f72fdbae2dbc6e68b8239defb48e6a5937b12218e6ffc2c7846cc37befa84362"}, + {file = "httptools-0.7.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e99c7b90a29fd82fea9ef57943d501a16f3404d7b9ee81799d41639bdaae412c"}, + {file = "httptools-0.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:3e14f530fefa7499334a79b0cf7e7cd2992870eb893526fb097d51b4f2d0f321"}, + {file = "httptools-0.7.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6babce6cfa2a99545c60bfef8bee0cc0545413cb0018f617c8059a30ad985de3"}, + {file = "httptools-0.7.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:601b7628de7504077dd3dcb3791c6b8694bbd967148a6d1f01806509254fb1ca"}, + {file = "httptools-0.7.1-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:04c6c0e6c5fb0739c5b8a9eb046d298650a0ff38cf42537fc372b28dc7e4472c"}, + {file = "httptools-0.7.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:69d4f9705c405ae3ee83d6a12283dc9feba8cc6aaec671b412917e644ab4fa66"}, + {file = "httptools-0.7.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:44c8f4347d4b31269c8a9205d8a5ee2df5322b09bbbd30f8f862185bb6b05346"}, + {file = "httptools-0.7.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:465275d76db4d554918aba40bf1cbebe324670f3dfc979eaffaa5d108e2ed650"}, + {file = "httptools-0.7.1-cp313-cp313-win_amd64.whl", hash = "sha256:322d00c2068d125bd570f7bf78b2d367dad02b919d8581d7476d8b75b294e3e6"}, + {file = "httptools-0.7.1-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:c08fe65728b8d70b6923ce31e3956f859d5e1e8548e6f22ec520a962c6757270"}, + {file = "httptools-0.7.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:7aea2e3c3953521c3c51106ee11487a910d45586e351202474d45472db7d72d3"}, + {file = "httptools-0.7.1-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:0e68b8582f4ea9166be62926077a3334064d422cf08ab87d8b74664f8e9058e1"}, + {file = "httptools-0.7.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:df091cf961a3be783d6aebae963cc9b71e00d57fa6f149025075217bc6a55a7b"}, + {file = "httptools-0.7.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:f084813239e1eb403ddacd06a30de3d3e09a9b76e7894dcda2b22f8a726e9c60"}, + {file = "httptools-0.7.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7347714368fb2b335e9063bc2b96f2f87a9ceffcd9758ac295f8bbcd3ffbc0ca"}, + {file = "httptools-0.7.1-cp314-cp314-win_amd64.whl", hash = "sha256:cfabda2a5bb85aa2a904ce06d974a3f30fb36cc63d7feaddec05d2050acede96"}, + {file = "httptools-0.7.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ac50afa68945df63ec7a2707c506bd02239272288add34539a2ef527254626a4"}, + {file = "httptools-0.7.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:de987bb4e7ac95b99b805b99e0aae0ad51ae61df4263459d36e07cf4052d8b3a"}, + {file = "httptools-0.7.1-cp39-cp39-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:d169162803a24425eb5e4d51d79cbf429fd7a491b9e570a55f495ea55b26f0bf"}, + {file = "httptools-0.7.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:49794f9250188a57fa73c706b46cb21a313edb00d337ca4ce1a011fe3c760b28"}, + {file = "httptools-0.7.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:aeefa0648362bb97a7d6b5ff770bfb774930a327d7f65f8208394856862de517"}, + {file = "httptools-0.7.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = 
"sha256:0d92b10dbf0b3da4823cde6a96d18e6ae358a9daa741c71448975f6a2c339cad"}, + {file = "httptools-0.7.1-cp39-cp39-win_amd64.whl", hash = "sha256:5ddbd045cfcb073db2449563dd479057f2c2b681ebc232380e63ef15edc9c023"}, + {file = "httptools-0.7.1.tar.gz", hash = "sha256:abd72556974f8e7c74a259655924a717a2365b236c882c3f6f8a45fe94703ac9"}, +] [[package]] name = "httpx" @@ -1691,6 +2125,7 @@ version = "0.28.1" description = "The next generation HTTP client." optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad"}, {file = "httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc"}, @@ -1703,27 +2138,44 @@ httpcore = "==1.*" idna = "*" [package.extras] -brotli = ["brotli", "brotlicffi"] +brotli = ["brotli ; platform_python_implementation == \"CPython\"", "brotlicffi ; platform_python_implementation != \"CPython\""] cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] zstd = ["zstandard (>=0.18.0)"] +[[package]] +name = "huey" +version = "2.5.4" +description = "huey, a little task queue" +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "huey-2.5.4-py3-none-any.whl", hash = "sha256:0eac1fb2711f6366a1db003629354a0cea470a3db720d5bab0d140c28e993f9c"}, + {file = "huey-2.5.4.tar.gz", hash = "sha256:4b7fb217b640fbb46efc4f4681b446b40726593522f093e8ef27c4a8fcb6cfbb"}, +] + +[package.extras] +backends = ["redis (>=3.0.0)"] +redis = ["redis (>=3.0.0)"] + [[package]] name = "huggingface-hub" -version = "0.32.4" +version = "0.36.0" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.8.0" +groups = ["main"] files = [ - {file = "huggingface_hub-0.32.4-py3-none-any.whl", hash = "sha256:37abf8826b38d971f60d3625229221c36e53fe58060286db9baf619cfbf39767"}, - {file = "huggingface_hub-0.32.4.tar.gz", hash = "sha256:f61d45cd338736f59fb0e97550b74c24ee771bcc92c05ae0766b9116abe720be"}, + {file = "huggingface_hub-0.36.0-py3-none-any.whl", hash = "sha256:7bcc9ad17d5b3f07b57c78e79d527102d08313caa278a641993acddcb894548d"}, + {file = "huggingface_hub-0.36.0.tar.gz", hash = "sha256:47b3f0e2539c39bf5cde015d63b72ec49baff67b6931c3d97f3f84532e2b8d25"}, ] [package.dependencies] filelock = "*" fsspec = ">=2023.5.0" -hf-xet = {version = ">=1.1.2,<2.0.0", markers = "platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"arm64\" or platform_machine == \"aarch64\""} +hf-xet = {version = ">=1.1.3,<2.0.0", markers = "platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"arm64\" or platform_machine == \"aarch64\""} packaging = ">=20.9" pyyaml = ">=5.1" requests = "*" @@ -1731,64 +2183,65 @@ tqdm = ">=4.42.1" typing-extensions = ">=3.7.4.3" [package.extras] -all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "authlib (>=1.3.2)", "fastapi", "gradio (>=4.0.0)", "httpx", "itsdangerous", "jedi", "libcst (==1.4.0)", "mypy (==1.15.0)", "mypy (>=1.14.1,<1.15.0)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] +all = 
["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "authlib (>=1.3.2)", "fastapi", "gradio (>=4.0.0)", "httpx", "itsdangerous", "jedi", "libcst (>=1.4.0)", "mypy (==1.15.0) ; python_version >= \"3.9\"", "mypy (>=1.14.1,<1.15.0) ; python_version == \"3.8\"", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures (<16.0)", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "ty", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] cli = ["InquirerPy (==0.3.4)"] -dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "authlib (>=1.3.2)", "fastapi", "gradio (>=4.0.0)", "httpx", "itsdangerous", "jedi", "libcst (==1.4.0)", "mypy (==1.15.0)", "mypy (>=1.14.1,<1.15.0)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] +dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "authlib (>=1.3.2)", "fastapi", "gradio (>=4.0.0)", "httpx", "itsdangerous", "jedi", "libcst (>=1.4.0)", "mypy (==1.15.0) ; python_version >= \"3.9\"", "mypy (>=1.14.1,<1.15.0) ; python_version == \"3.8\"", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures (<16.0)", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "ty", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"] hf-transfer = ["hf-transfer (>=0.1.4)"] hf-xet = ["hf-xet (>=1.1.2,<2.0.0)"] inference = ["aiohttp"] mcp = ["aiohttp", "mcp (>=1.8.0)", "typer"] oauth = ["authlib (>=1.3.2)", "fastapi", "httpx", "itsdangerous"] -quality = ["libcst (==1.4.0)", "mypy (==1.15.0)", "mypy (>=1.14.1,<1.15.0)", "ruff (>=0.9.0)"] +quality = ["libcst (>=1.4.0)", "mypy (==1.15.0) ; python_version >= \"3.9\"", "mypy (>=1.14.1,<1.15.0) ; python_version == \"3.8\"", "ruff (>=0.9.0)", "ty"] tensorflow = ["graphviz", "pydot", "tensorflow"] tensorflow-testing = ["keras (<3.0)", "tensorflow"] -testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "authlib (>=1.3.2)", "fastapi", "gradio (>=4.0.0)", "httpx", "itsdangerous", "jedi", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] +testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "authlib (>=1.3.2)", "fastapi", "gradio (>=4.0.0)", "httpx", "itsdangerous", "jedi", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures (<16.0)", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] torch = ["safetensors[torch]", "torch"] typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"] [[package]] name = "hypothesis" -version = "6.135.1" -description = "A library for property-based testing" +version = "6.148.1" +description = "The property-based testing library for Python" optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" +groups = 
["main"] files = [ - {file = "hypothesis-6.135.1-py3-none-any.whl", hash = "sha256:14fab728bfe2409a3934e6e1ea6ae0a706d0bc78187137218a253aec7528b4c8"}, - {file = "hypothesis-6.135.1.tar.gz", hash = "sha256:36eea411ef5dde5612301fcd9a293b6f2a3a5ab96488be2e23e7c5799cbd7b33"}, + {file = "hypothesis-6.148.1-py3-none-any.whl", hash = "sha256:2cca1a20e8fcd230da86bb6d9f3e940a052caef0d4327c12b5694d164ea0f89e"}, + {file = "hypothesis-6.148.1.tar.gz", hash = "sha256:98851429bd89b0c8835e8ebca6705c28dec56517576e5606fc3b841ddb0dc1c7"}, ] [package.dependencies] -attrs = ">=22.2.0" sortedcontainers = ">=2.1.0,<3.0.0" [package.extras] -all = ["black (>=19.10b0)", "click (>=7.0)", "crosshair-tool (>=0.0.88)", "django (>=4.2)", "dpcontracts (>=0.4)", "hypothesis-crosshair (>=0.0.23)", "lark (>=0.10.1)", "libcst (>=0.3.16)", "numpy (>=1.19.3)", "pandas (>=1.1)", "pytest (>=4.6)", "python-dateutil (>=1.4)", "pytz (>=2014.1)", "redis (>=3.0.0)", "rich (>=9.0.0)", "tzdata (>=2025.2)", "watchdog (>=4.0.0)"] -cli = ["black (>=19.10b0)", "click (>=7.0)", "rich (>=9.0.0)"] +all = ["black (>=20.8b0)", "click (>=7.0)", "crosshair-tool (>=0.0.97)", "django (>=4.2)", "dpcontracts (>=0.4)", "hypothesis-crosshair (>=0.0.25)", "lark (>=0.10.1)", "libcst (>=0.3.16)", "numpy (>=1.21.6)", "pandas (>=1.1)", "pytest (>=4.6)", "python-dateutil (>=1.4)", "pytz (>=2014.1)", "redis (>=3.0.0)", "rich (>=9.0.0)", "tzdata (>=2025.2) ; sys_platform == \"win32\" or sys_platform == \"emscripten\"", "watchdog (>=4.0.0)"] +cli = ["black (>=20.8b0)", "click (>=7.0)", "rich (>=9.0.0)"] codemods = ["libcst (>=0.3.16)"] -crosshair = ["crosshair-tool (>=0.0.88)", "hypothesis-crosshair (>=0.0.23)"] +crosshair = ["crosshair-tool (>=0.0.97)", "hypothesis-crosshair (>=0.0.25)"] dateutil = ["python-dateutil (>=1.4)"] django = ["django (>=4.2)"] dpcontracts = ["dpcontracts (>=0.4)"] -ghostwriter = ["black (>=19.10b0)"] +ghostwriter = ["black (>=20.8b0)"] lark = ["lark (>=0.10.1)"] -numpy = ["numpy (>=1.19.3)"] +numpy = ["numpy (>=1.21.6)"] pandas = ["pandas (>=1.1)"] pytest = ["pytest (>=4.6)"] pytz = ["pytz (>=2014.1)"] redis = ["redis (>=3.0.0)"] watchdog = ["watchdog (>=4.0.0)"] -zoneinfo = ["tzdata (>=2025.2)"] +zoneinfo = ["tzdata (>=2025.2) ; sys_platform == \"win32\" or sys_platform == \"emscripten\""] [[package]] name = "identify" -version = "2.6.12" +version = "2.6.15" description = "File identification library for Python" optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ - {file = "identify-2.6.12-py2.py3-none-any.whl", hash = "sha256:ad9672d5a72e0d2ff7c5c8809b62dfa60458626352fb0eb7b55e69bdc45334a2"}, - {file = "identify-2.6.12.tar.gz", hash = "sha256:d8de45749f1efb108badef65ee8386f0f7bb19a7f26185f74de6367bffbaf0e6"}, + {file = "identify-2.6.15-py2.py3-none-any.whl", hash = "sha256:1181ef7608e00704db228516541eb83a88a9f94433a8c80bb9b5bd54b1d81757"}, + {file = "identify-2.6.15.tar.gz", hash = "sha256:e4f4864b96c6557ef2a1e1c951771838f4edc9df3a72ec7118b338801b11c7bf"}, ] [package.extras] @@ -1796,13 +2249,14 @@ license = ["ukkonen"] [[package]] name = "idna" -version = "3.10" +version = "3.11" description = "Internationalized Domain Names in Applications (IDNA)" optional = false -python-versions = ">=3.6" +python-versions = ">=3.8" +groups = ["main", "dev"] files = [ - {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, - {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, + {file = 
"idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea"}, + {file = "idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902"}, ] [package.extras] @@ -1814,6 +2268,7 @@ version = "0.12.4" description = "Toolbox for imbalanced dataset in machine learning." optional = false python-versions = "*" +groups = ["main"] files = [ {file = "imbalanced-learn-0.12.4.tar.gz", hash = "sha256:8153ba385d296b07d97e0901a2624a86c06b48c94c2f92da3a5354827697b7a3"}, {file = "imbalanced_learn-0.12.4-py3-none-any.whl", hash = "sha256:d47fc599160d3ea882e712a3a6b02bdd353c1a6436d8d68d41b1922e6ee4a703"}, @@ -1838,6 +2293,7 @@ version = "8.7.0" description = "Read metadata from Python packages" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "importlib_metadata-8.7.0-py3-none-any.whl", hash = "sha256:e5dd1551894c77868a30651cef00984d50e1002d06942a7101d34870c5f02afd"}, {file = "importlib_metadata-8.7.0.tar.gz", hash = "sha256:d13b81ad223b890aa16c5471f2ac3056cf76c5f10f82d6f9292f0b415f389000"}, @@ -1847,89 +2303,112 @@ files = [ zipp = ">=3.20" [package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] enabler = ["pytest-enabler (>=2.2)"] perf = ["ipython"] -test = ["flufl.flake8", "importlib_resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] +test = ["flufl.flake8", "importlib_resources (>=1.3) ; python_version < \"3.9\"", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] +type = ["pytest-mypy"] + +[[package]] +name = "importlib-resources" +version = "6.5.2" +description = "Read resources from Python packages" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "importlib_resources-6.5.2-py3-none-any.whl", hash = "sha256:789cfdc3ed28c78b67a06acb8126751ced69a3d5f79c095a98298cd8a760ccec"}, + {file = "importlib_resources-6.5.2.tar.gz", hash = "sha256:185f87adef5bcc288449d98fb4fba07cea78bc036455dd44c5fc4a2fe78fed2c"}, +] + +[package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["jaraco.test (>=5.4)", "pytest (>=6,!=8.1.*)", "zipp (>=3.17)"] type = ["pytest-mypy"] [[package]] name = "iniconfig" -version = "2.1.0" +version = "2.3.0" description = "brain-dead simple config-ini parsing" optional = false -python-versions = ">=3.8" +python-versions = ">=3.10" +groups = ["dev"] files = [ - {file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"}, - {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"}, + {file = "iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12"}, + {file = "iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730"}, ] [[package]] name = "ipykernel" -version = "6.29.5" +version = "7.1.0" description = 
"IPython Kernel for Jupyter" optional = false -python-versions = ">=3.8" +python-versions = ">=3.10" +groups = ["dev"] files = [ - {file = "ipykernel-6.29.5-py3-none-any.whl", hash = "sha256:afdb66ba5aa354b09b91379bac28ae4afebbb30e8b39510c9690afb7a10421b5"}, - {file = "ipykernel-6.29.5.tar.gz", hash = "sha256:f093a22c4a40f8828f8e330a9c297cb93dcab13bd9678ded6de8e5cf81c56215"}, + {file = "ipykernel-7.1.0-py3-none-any.whl", hash = "sha256:763b5ec6c5b7776f6a8d7ce09b267693b4e5ce75cb50ae696aaefb3c85e1ea4c"}, + {file = "ipykernel-7.1.0.tar.gz", hash = "sha256:58a3fc88533d5930c3546dc7eac66c6d288acde4f801e2001e65edc5dc9cf0db"}, ] [package.dependencies] -appnope = {version = "*", markers = "platform_system == \"Darwin\""} +appnope = {version = ">=0.1.2", markers = "platform_system == \"Darwin\""} comm = ">=0.1.1" debugpy = ">=1.6.5" ipython = ">=7.23.1" -jupyter-client = ">=6.1.12" +jupyter-client = ">=8.0.0" jupyter-core = ">=4.12,<5.0.dev0 || >=5.1.dev0" matplotlib-inline = ">=0.1" -nest-asyncio = "*" -packaging = "*" -psutil = "*" -pyzmq = ">=24" -tornado = ">=6.1" +nest-asyncio = ">=1.4" +packaging = ">=22" +psutil = ">=5.7" +pyzmq = ">=25" +tornado = ">=6.2" traitlets = ">=5.4.0" [package.extras] -cov = ["coverage[toml]", "curio", "matplotlib", "pytest-cov", "trio"] -docs = ["myst-parser", "pydata-sphinx-theme", "sphinx", "sphinx-autodoc-typehints", "sphinxcontrib-github-alt", "sphinxcontrib-spelling", "trio"] +cov = ["coverage[toml]", "matplotlib", "pytest-cov", "trio"] +docs = ["intersphinx-registry", "myst-parser", "pydata-sphinx-theme", "sphinx (<8.2.0)", "sphinx-autodoc-typehints", "sphinxcontrib-github-alt", "sphinxcontrib-spelling", "trio"] pyqt5 = ["pyqt5"] pyside6 = ["pyside6"] -test = ["flaky", "ipyparallel", "pre-commit", "pytest (>=7.0)", "pytest-asyncio (>=0.23.5)", "pytest-cov", "pytest-timeout"] +test = ["flaky", "ipyparallel", "pre-commit", "pytest (>=7.0,<9)", "pytest-asyncio (>=0.23.5)", "pytest-cov", "pytest-timeout"] [[package]] name = "ipython" -version = "9.3.0" +version = "9.7.0" description = "IPython: Productive Interactive Computing" optional = false python-versions = ">=3.11" +groups = ["dev"] files = [ - {file = "ipython-9.3.0-py3-none-any.whl", hash = "sha256:1a0b6dd9221a1f5dddf725b57ac0cb6fddc7b5f470576231ae9162b9b3455a04"}, - {file = "ipython-9.3.0.tar.gz", hash = "sha256:79eb896f9f23f50ad16c3bc205f686f6e030ad246cc309c6279a242b14afe9d8"}, + {file = "ipython-9.7.0-py3-none-any.whl", hash = "sha256:bce8ac85eb9521adc94e1845b4c03d88365fd6ac2f4908ec4ed1eb1b0a065f9f"}, + {file = "ipython-9.7.0.tar.gz", hash = "sha256:5f6de88c905a566c6a9d6c400a8fed54a638e1f7543d17aae2551133216b1e4e"}, ] [package.dependencies] -colorama = {version = "*", markers = "sys_platform == \"win32\""} -decorator = "*" -ipython-pygments-lexers = "*" -jedi = ">=0.16" -matplotlib-inline = "*" +colorama = {version = ">=0.4.4", markers = "sys_platform == \"win32\""} +decorator = ">=4.3.2" +ipython-pygments-lexers = ">=1.0.0" +jedi = ">=0.18.1" +matplotlib-inline = ">=0.1.5" pexpect = {version = ">4.3", markers = "sys_platform != \"win32\" and sys_platform != \"emscripten\""} prompt_toolkit = ">=3.0.41,<3.1.0" -pygments = ">=2.4.0" -stack_data = "*" +pygments = ">=2.11.0" +stack_data = ">=0.6.0" traitlets = ">=5.13.0" typing_extensions = {version = ">=4.6", markers = "python_version < \"3.12\""} [package.extras] all = ["ipython[doc,matplotlib,test,test-extra]"] black = ["black"] -doc = ["docrepr", "exceptiongroup", "intersphinx_registry", "ipykernel", "ipython[test]", "matplotlib", "setuptools 
(>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", "sphinx_toml (==0.0.4)", "typing_extensions"] -matplotlib = ["matplotlib"] -test = ["packaging", "pytest", "pytest-asyncio (<0.22)", "testpath"] -test-extra = ["curio", "ipykernel", "ipython[test]", "jupyter_ai", "matplotlib (!=3.2.0)", "nbclient", "nbformat", "numpy (>=1.23)", "pandas", "trio"] +doc = ["docrepr", "exceptiongroup", "intersphinx_registry", "ipykernel", "ipython[matplotlib,test]", "setuptools (>=70.0)", "sphinx (>=8.0)", "sphinx-rtd-theme (>=0.1.8)", "sphinx_toml (==0.0.4)", "typing_extensions"] +matplotlib = ["matplotlib (>3.9)"] +test = ["packaging (>=20.1.0)", "pytest (>=7.0.0)", "pytest-asyncio (>=1.0.0)", "setuptools (>=61.2)", "testpath (>=0.2)"] +test-extra = ["curio", "ipykernel (>6.30)", "ipython[matplotlib]", "ipython[test]", "jupyter_ai", "nbclient", "nbformat", "numpy (>=1.27)", "pandas (>2.1)", "trio (>=0.1.0)"] [[package]] name = "ipython-pygments-lexers" @@ -1937,6 +2416,7 @@ version = "1.1.1" description = "Defines a variety of Pygments lexers for highlighting IPython code." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c"}, {file = "ipython_pygments_lexers-1.1.1.tar.gz", hash = "sha256:09c0138009e56b6854f9535736f4171d855c8c08a563a0dcd8022f78355c7e81"}, @@ -1951,6 +2431,7 @@ version = "20.11.0" description = "Operations with ISO 8601 durations" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "isoduration-20.11.0-py3-none-any.whl", hash = "sha256:b2904c2a4228c3d44f409c8ae8e2370eb21a26f7ac2ec5446df141dde3452042"}, {file = "isoduration-20.11.0.tar.gz", hash = "sha256:ac2f9015137935279eac671f94f89eb00584f940f5dc49462a0c4ee692ba1bd9"}, @@ -1965,6 +2446,7 @@ version = "2.2.0" description = "Safely pass data to untrusted environments and back." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef"}, {file = "itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173"}, @@ -1976,6 +2458,7 @@ version = "0.19.2" description = "An autocompletion tool for Python that can be used for text editors." optional = false python-versions = ">=3.6" +groups = ["dev"] files = [ {file = "jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9"}, {file = "jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0"}, @@ -1995,6 +2478,7 @@ version = "3.1.6" description = "A very fast and expressive template engine." optional = false python-versions = ">=3.7" +groups = ["main", "dev"] files = [ {file = "jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67"}, {file = "jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d"}, @@ -2008,114 +2492,142 @@ i18n = ["Babel (>=2.7)"] [[package]] name = "jiter" -version = "0.10.0" +version = "0.12.0" description = "Fast iterable JSON parser." 
optional = false python-versions = ">=3.9" -files = [ - {file = "jiter-0.10.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:cd2fb72b02478f06a900a5782de2ef47e0396b3e1f7d5aba30daeb1fce66f303"}, - {file = "jiter-0.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:32bb468e3af278f095d3fa5b90314728a6916d89ba3d0ffb726dd9bf7367285e"}, - {file = "jiter-0.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa8b3e0068c26ddedc7abc6fac37da2d0af16b921e288a5a613f4b86f050354f"}, - {file = "jiter-0.10.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:286299b74cc49e25cd42eea19b72aa82c515d2f2ee12d11392c56d8701f52224"}, - {file = "jiter-0.10.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6ed5649ceeaeffc28d87fb012d25a4cd356dcd53eff5acff1f0466b831dda2a7"}, - {file = "jiter-0.10.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2ab0051160cb758a70716448908ef14ad476c3774bd03ddce075f3c1f90a3d6"}, - {file = "jiter-0.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03997d2f37f6b67d2f5c475da4412be584e1cec273c1cfc03d642c46db43f8cf"}, - {file = "jiter-0.10.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c404a99352d839fed80d6afd6c1d66071f3bacaaa5c4268983fc10f769112e90"}, - {file = "jiter-0.10.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:66e989410b6666d3ddb27a74c7e50d0829704ede652fd4c858e91f8d64b403d0"}, - {file = "jiter-0.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b532d3af9ef4f6374609a3bcb5e05a1951d3bf6190dc6b176fdb277c9bbf15ee"}, - {file = "jiter-0.10.0-cp310-cp310-win32.whl", hash = "sha256:da9be20b333970e28b72edc4dff63d4fec3398e05770fb3205f7fb460eb48dd4"}, - {file = "jiter-0.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:f59e533afed0c5b0ac3eba20d2548c4a550336d8282ee69eb07b37ea526ee4e5"}, - {file = "jiter-0.10.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:3bebe0c558e19902c96e99217e0b8e8b17d570906e72ed8a87170bc290b1e978"}, - {file = "jiter-0.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:558cc7e44fd8e507a236bee6a02fa17199ba752874400a0ca6cd6e2196cdb7dc"}, - {file = "jiter-0.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d613e4b379a07d7c8453c5712ce7014e86c6ac93d990a0b8e7377e18505e98d"}, - {file = "jiter-0.10.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f62cf8ba0618eda841b9bf61797f21c5ebd15a7a1e19daab76e4e4b498d515b2"}, - {file = "jiter-0.10.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:919d139cdfa8ae8945112398511cb7fca58a77382617d279556b344867a37e61"}, - {file = "jiter-0.10.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:13ddbc6ae311175a3b03bd8994881bc4635c923754932918e18da841632349db"}, - {file = "jiter-0.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c440ea003ad10927a30521a9062ce10b5479592e8a70da27f21eeb457b4a9c5"}, - {file = "jiter-0.10.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dc347c87944983481e138dea467c0551080c86b9d21de6ea9306efb12ca8f606"}, - {file = "jiter-0.10.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:13252b58c1f4d8c5b63ab103c03d909e8e1e7842d302473f482915d95fefd605"}, - {file = "jiter-0.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7d1bbf3c465de4a24ab12fb7766a0003f6f9bce48b8b6a886158c4d569452dc5"}, - {file = "jiter-0.10.0-cp311-cp311-win32.whl", hash = 
"sha256:db16e4848b7e826edca4ccdd5b145939758dadf0dc06e7007ad0e9cfb5928ae7"}, - {file = "jiter-0.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:9c9c1d5f10e18909e993f9641f12fe1c77b3e9b533ee94ffa970acc14ded3812"}, - {file = "jiter-0.10.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:1e274728e4a5345a6dde2d343c8da018b9d4bd4350f5a472fa91f66fda44911b"}, - {file = "jiter-0.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7202ae396446c988cb2a5feb33a543ab2165b786ac97f53b59aafb803fef0744"}, - {file = "jiter-0.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23ba7722d6748b6920ed02a8f1726fb4b33e0fd2f3f621816a8b486c66410ab2"}, - {file = "jiter-0.10.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:371eab43c0a288537d30e1f0b193bc4eca90439fc08a022dd83e5e07500ed026"}, - {file = "jiter-0.10.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6c675736059020365cebc845a820214765162728b51ab1e03a1b7b3abb70f74c"}, - {file = "jiter-0.10.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c5867d40ab716e4684858e4887489685968a47e3ba222e44cde6e4a2154f959"}, - {file = "jiter-0.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:395bb9a26111b60141757d874d27fdea01b17e8fac958b91c20128ba8f4acc8a"}, - {file = "jiter-0.10.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6842184aed5cdb07e0c7e20e5bdcfafe33515ee1741a6835353bb45fe5d1bd95"}, - {file = "jiter-0.10.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:62755d1bcea9876770d4df713d82606c8c1a3dca88ff39046b85a048566d56ea"}, - {file = "jiter-0.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:533efbce2cacec78d5ba73a41756beff8431dfa1694b6346ce7af3a12c42202b"}, - {file = "jiter-0.10.0-cp312-cp312-win32.whl", hash = "sha256:8be921f0cadd245e981b964dfbcd6fd4bc4e254cdc069490416dd7a2632ecc01"}, - {file = "jiter-0.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:a7c7d785ae9dda68c2678532a5a1581347e9c15362ae9f6e68f3fdbfb64f2e49"}, - {file = "jiter-0.10.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:e0588107ec8e11b6f5ef0e0d656fb2803ac6cf94a96b2b9fc675c0e3ab5e8644"}, - {file = "jiter-0.10.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cafc4628b616dc32530c20ee53d71589816cf385dd9449633e910d596b1f5c8a"}, - {file = "jiter-0.10.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:520ef6d981172693786a49ff5b09eda72a42e539f14788124a07530f785c3ad6"}, - {file = "jiter-0.10.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:554dedfd05937f8fc45d17ebdf298fe7e0c77458232bcb73d9fbbf4c6455f5b3"}, - {file = "jiter-0.10.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5bc299da7789deacf95f64052d97f75c16d4fc8c4c214a22bf8d859a4288a1c2"}, - {file = "jiter-0.10.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5161e201172de298a8a1baad95eb85db4fb90e902353b1f6a41d64ea64644e25"}, - {file = "jiter-0.10.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e2227db6ba93cb3e2bf67c87e594adde0609f146344e8207e8730364db27041"}, - {file = "jiter-0.10.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:15acb267ea5e2c64515574b06a8bf393fbfee6a50eb1673614aa45f4613c0cca"}, - {file = "jiter-0.10.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:901b92f2e2947dc6dfcb52fd624453862e16665ea909a08398dde19c0731b7f4"}, - {file = 
"jiter-0.10.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d0cb9a125d5a3ec971a094a845eadde2db0de85b33c9f13eb94a0c63d463879e"}, - {file = "jiter-0.10.0-cp313-cp313-win32.whl", hash = "sha256:48a403277ad1ee208fb930bdf91745e4d2d6e47253eedc96e2559d1e6527006d"}, - {file = "jiter-0.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:75f9eb72ecb640619c29bf714e78c9c46c9c4eaafd644bf78577ede459f330d4"}, - {file = "jiter-0.10.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:28ed2a4c05a1f32ef0e1d24c2611330219fed727dae01789f4a335617634b1ca"}, - {file = "jiter-0.10.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14a4c418b1ec86a195f1ca69da8b23e8926c752b685af665ce30777233dfe070"}, - {file = "jiter-0.10.0-cp313-cp313t-win_amd64.whl", hash = "sha256:d7bfed2fe1fe0e4dda6ef682cee888ba444b21e7a6553e03252e4feb6cf0adca"}, - {file = "jiter-0.10.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:5e9251a5e83fab8d87799d3e1a46cb4b7f2919b895c6f4483629ed2446f66522"}, - {file = "jiter-0.10.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:023aa0204126fe5b87ccbcd75c8a0d0261b9abdbbf46d55e7ae9f8e22424eeb8"}, - {file = "jiter-0.10.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c189c4f1779c05f75fc17c0c1267594ed918996a231593a21a5ca5438445216"}, - {file = "jiter-0.10.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:15720084d90d1098ca0229352607cd68256c76991f6b374af96f36920eae13c4"}, - {file = "jiter-0.10.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e4f2fb68e5f1cfee30e2b2a09549a00683e0fde4c6a2ab88c94072fc33cb7426"}, - {file = "jiter-0.10.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ce541693355fc6da424c08b7edf39a2895f58d6ea17d92cc2b168d20907dee12"}, - {file = "jiter-0.10.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31c50c40272e189d50006ad5c73883caabb73d4e9748a688b216e85a9a9ca3b9"}, - {file = "jiter-0.10.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fa3402a2ff9815960e0372a47b75c76979d74402448509ccd49a275fa983ef8a"}, - {file = "jiter-0.10.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:1956f934dca32d7bb647ea21d06d93ca40868b505c228556d3373cbd255ce853"}, - {file = "jiter-0.10.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:fcedb049bdfc555e261d6f65a6abe1d5ad68825b7202ccb9692636c70fcced86"}, - {file = "jiter-0.10.0-cp314-cp314-win32.whl", hash = "sha256:ac509f7eccca54b2a29daeb516fb95b6f0bd0d0d8084efaf8ed5dfc7b9f0b357"}, - {file = "jiter-0.10.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5ed975b83a2b8639356151cef5c0d597c68376fc4922b45d0eb384ac058cfa00"}, - {file = "jiter-0.10.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3aa96f2abba33dc77f79b4cf791840230375f9534e5fac927ccceb58c5e604a5"}, - {file = "jiter-0.10.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:bd6292a43c0fc09ce7c154ec0fa646a536b877d1e8f2f96c19707f65355b5a4d"}, - {file = "jiter-0.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:39de429dcaeb6808d75ffe9effefe96a4903c6a4b376b2f6d08d77c1aaee2f18"}, - {file = "jiter-0.10.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:52ce124f13a7a616fad3bb723f2bfb537d78239d1f7f219566dc52b6f2a9e48d"}, - {file = "jiter-0.10.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:166f3606f11920f9a1746b2eea84fa2c0a5d50fd313c38bdea4edc072000b0af"}, - {file = 
"jiter-0.10.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:28dcecbb4ba402916034fc14eba7709f250c4d24b0c43fc94d187ee0580af181"}, - {file = "jiter-0.10.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:86c5aa6910f9bebcc7bc4f8bc461aff68504388b43bfe5e5c0bd21efa33b52f4"}, - {file = "jiter-0.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ceeb52d242b315d7f1f74b441b6a167f78cea801ad7c11c36da77ff2d42e8a28"}, - {file = "jiter-0.10.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ff76d8887c8c8ee1e772274fcf8cc1071c2c58590d13e33bd12d02dc9a560397"}, - {file = "jiter-0.10.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a9be4d0fa2b79f7222a88aa488bd89e2ae0a0a5b189462a12def6ece2faa45f1"}, - {file = "jiter-0.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9ab7fd8738094139b6c1ab1822d6f2000ebe41515c537235fd45dabe13ec9324"}, - {file = "jiter-0.10.0-cp39-cp39-win32.whl", hash = "sha256:5f51e048540dd27f204ff4a87f5d79294ea0aa3aa552aca34934588cf27023cf"}, - {file = "jiter-0.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:1b28302349dc65703a9e4ead16f163b1c339efffbe1049c30a44b001a2a4fff9"}, - {file = "jiter-0.10.0.tar.gz", hash = "sha256:07a7142c38aacc85194391108dc91b5b57093c978a9932bd86a36862759d9500"}, +groups = ["main"] +files = [ + {file = "jiter-0.12.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:e7acbaba9703d5de82a2c98ae6a0f59ab9770ab5af5fa35e43a303aee962cf65"}, + {file = "jiter-0.12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:364f1a7294c91281260364222f535bc427f56d4de1d8ffd718162d21fbbd602e"}, + {file = "jiter-0.12.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85ee4d25805d4fb23f0a5167a962ef8e002dbfb29c0989378488e32cf2744b62"}, + {file = "jiter-0.12.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:796f466b7942107eb889c08433b6e31b9a7ed31daceaecf8af1be26fb26c0ca8"}, + {file = "jiter-0.12.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35506cb71f47dba416694e67af996bbdefb8e3608f1f78799c2e1f9058b01ceb"}, + {file = "jiter-0.12.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:726c764a90c9218ec9e4f99a33d6bf5ec169163f2ca0fc21b654e88c2abc0abc"}, + {file = "jiter-0.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa47810c5565274810b726b0dc86d18dce5fd17b190ebdc3890851d7b2a0e74"}, + {file = "jiter-0.12.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f8ec0259d3f26c62aed4d73b198c53e316ae11f0f69c8fbe6682c6dcfa0fcce2"}, + {file = "jiter-0.12.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:79307d74ea83465b0152fa23e5e297149506435535282f979f18b9033c0bb025"}, + {file = "jiter-0.12.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cf6e6dd18927121fec86739f1a8906944703941d000f0639f3eb6281cc601dca"}, + {file = "jiter-0.12.0-cp310-cp310-win32.whl", hash = "sha256:b6ae2aec8217327d872cbfb2c1694489057b9433afce447955763e6ab015b4c4"}, + {file = "jiter-0.12.0-cp310-cp310-win_amd64.whl", hash = "sha256:c7f49ce90a71e44f7e1aa9e7ec415b9686bbc6a5961e57eab511015e6759bc11"}, + {file = "jiter-0.12.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d8f8a7e317190b2c2d60eb2e8aa835270b008139562d70fe732e1c0020ec53c9"}, + {file = "jiter-0.12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2218228a077e784c6c8f1a8e5d6b8cb1dea62ce25811c356364848554b2056cd"}, + {file = 
"jiter-0.12.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9354ccaa2982bf2188fd5f57f79f800ef622ec67beb8329903abf6b10da7d423"}, + {file = "jiter-0.12.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8f2607185ea89b4af9a604d4c7ec40e45d3ad03ee66998b031134bc510232bb7"}, + {file = "jiter-0.12.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3a585a5e42d25f2e71db5f10b171f5e5ea641d3aa44f7df745aa965606111cc2"}, + {file = "jiter-0.12.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd9e21d34edff5a663c631f850edcb786719c960ce887a5661e9c828a53a95d9"}, + {file = "jiter-0.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a612534770470686cd5431478dc5a1b660eceb410abade6b1b74e320ca98de6"}, + {file = "jiter-0.12.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3985aea37d40a908f887b34d05111e0aae822943796ebf8338877fee2ab67725"}, + {file = "jiter-0.12.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b1207af186495f48f72529f8d86671903c8c10127cac6381b11dddc4aaa52df6"}, + {file = "jiter-0.12.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ef2fb241de583934c9915a33120ecc06d94aa3381a134570f59eed784e87001e"}, + {file = "jiter-0.12.0-cp311-cp311-win32.whl", hash = "sha256:453b6035672fecce8007465896a25b28a6b59cfe8fbc974b2563a92f5a92a67c"}, + {file = "jiter-0.12.0-cp311-cp311-win_amd64.whl", hash = "sha256:ca264b9603973c2ad9435c71a8ec8b49f8f715ab5ba421c85a51cde9887e421f"}, + {file = "jiter-0.12.0-cp311-cp311-win_arm64.whl", hash = "sha256:cb00ef392e7d684f2754598c02c409f376ddcef857aae796d559e6cacc2d78a5"}, + {file = "jiter-0.12.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:305e061fa82f4680607a775b2e8e0bcb071cd2205ac38e6ef48c8dd5ebe1cf37"}, + {file = "jiter-0.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5c1860627048e302a528333c9307c818c547f214d8659b0705d2195e1a94b274"}, + {file = "jiter-0.12.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:df37577a4f8408f7e0ec3205d2a8f87672af8f17008358063a4d6425b6081ce3"}, + {file = "jiter-0.12.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:75fdd787356c1c13a4f40b43c2156276ef7a71eb487d98472476476d803fb2cf"}, + {file = "jiter-0.12.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1eb5db8d9c65b112aacf14fcd0faae9913d07a8afea5ed06ccdd12b724e966a1"}, + {file = "jiter-0.12.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:73c568cc27c473f82480abc15d1301adf333a7ea4f2e813d6a2c7d8b6ba8d0df"}, + {file = "jiter-0.12.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4321e8a3d868919bcb1abb1db550d41f2b5b326f72df29e53b2df8b006eb9403"}, + {file = "jiter-0.12.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0a51bad79f8cc9cac2b4b705039f814049142e0050f30d91695a2d9a6611f126"}, + {file = "jiter-0.12.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2a67b678f6a5f1dd6c36d642d7db83e456bc8b104788262aaefc11a22339f5a9"}, + {file = "jiter-0.12.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efe1a211fe1fd14762adea941e3cfd6c611a136e28da6c39272dbb7a1bbe6a86"}, + {file = "jiter-0.12.0-cp312-cp312-win32.whl", hash = "sha256:d779d97c834b4278276ec703dc3fc1735fca50af63eb7262f05bdb4e62203d44"}, + {file = "jiter-0.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:e8269062060212b373316fe69236096aaf4c49022d267c6736eebd66bbbc60bb"}, + {file = 
"jiter-0.12.0-cp312-cp312-win_arm64.whl", hash = "sha256:06cb970936c65de926d648af0ed3d21857f026b1cf5525cb2947aa5e01e05789"}, + {file = "jiter-0.12.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:6cc49d5130a14b732e0612bc76ae8db3b49898732223ef8b7599aa8d9810683e"}, + {file = "jiter-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:37f27a32ce36364d2fa4f7fdc507279db604d27d239ea2e044c8f148410defe1"}, + {file = "jiter-0.12.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bbc0944aa3d4b4773e348cda635252824a78f4ba44328e042ef1ff3f6080d1cf"}, + {file = "jiter-0.12.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:da25c62d4ee1ffbacb97fac6dfe4dcd6759ebdc9015991e92a6eae5816287f44"}, + {file = "jiter-0.12.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:048485c654b838140b007390b8182ba9774621103bd4d77c9c3f6f117474ba45"}, + {file = "jiter-0.12.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:635e737fbb7315bef0037c19b88b799143d2d7d3507e61a76751025226b3ac87"}, + {file = "jiter-0.12.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e017c417b1ebda911bd13b1e40612704b1f5420e30695112efdbed8a4b389ed"}, + {file = "jiter-0.12.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:89b0bfb8b2bf2351fba36bb211ef8bfceba73ef58e7f0c68fb67b5a2795ca2f9"}, + {file = "jiter-0.12.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:f5aa5427a629a824a543672778c9ce0c5e556550d1569bb6ea28a85015287626"}, + {file = "jiter-0.12.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ed53b3d6acbcb0fd0b90f20c7cb3b24c357fe82a3518934d4edfa8c6898e498c"}, + {file = "jiter-0.12.0-cp313-cp313-win32.whl", hash = "sha256:4747de73d6b8c78f2e253a2787930f4fffc68da7fa319739f57437f95963c4de"}, + {file = "jiter-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:e25012eb0c456fcc13354255d0338cd5397cce26c77b2832b3c4e2e255ea5d9a"}, + {file = "jiter-0.12.0-cp313-cp313-win_arm64.whl", hash = "sha256:c97b92c54fe6110138c872add030a1f99aea2401ddcdaa21edf74705a646dd60"}, + {file = "jiter-0.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:53839b35a38f56b8be26a7851a48b89bc47e5d88e900929df10ed93b95fea3d6"}, + {file = "jiter-0.12.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94f669548e55c91ab47fef8bddd9c954dab1938644e715ea49d7e117015110a4"}, + {file = "jiter-0.12.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:351d54f2b09a41600ffea43d081522d792e81dcfb915f6d2d242744c1cc48beb"}, + {file = "jiter-0.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2a5e90604620f94bf62264e7c2c038704d38217b7465b863896c6d7c902b06c7"}, + {file = "jiter-0.12.0-cp313-cp313t-win_arm64.whl", hash = "sha256:88ef757017e78d2860f96250f9393b7b577b06a956ad102c29c8237554380db3"}, + {file = "jiter-0.12.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:c46d927acd09c67a9fb1416df45c5a04c27e83aae969267e98fba35b74e99525"}, + {file = "jiter-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:774ff60b27a84a85b27b88cd5583899c59940bcc126caca97eb2a9df6aa00c49"}, + {file = "jiter-0.12.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5433fab222fb072237df3f637d01b81f040a07dcac1cb4a5c75c7aa9ed0bef1"}, + {file = "jiter-0.12.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f8c593c6e71c07866ec6bfb790e202a833eeec885022296aff6b9e0b92d6a70e"}, + {file = 
"jiter-0.12.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:90d32894d4c6877a87ae00c6b915b609406819dce8bc0d4e962e4de2784e567e"}, + {file = "jiter-0.12.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:798e46eed9eb10c3adbbacbd3bdb5ecd4cf7064e453d00dbef08802dae6937ff"}, + {file = "jiter-0.12.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3f1368f0a6719ea80013a4eb90ba72e75d7ea67cfc7846db2ca504f3df0169a"}, + {file = "jiter-0.12.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:65f04a9d0b4406f7e51279710b27484af411896246200e461d80d3ba0caa901a"}, + {file = "jiter-0.12.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:fd990541982a24281d12b67a335e44f117e4c6cbad3c3b75c7dea68bf4ce3a67"}, + {file = "jiter-0.12.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:b111b0e9152fa7df870ecaebb0bd30240d9f7fff1f2003bcb4ed0f519941820b"}, + {file = "jiter-0.12.0-cp314-cp314-win32.whl", hash = "sha256:a78befb9cc0a45b5a5a0d537b06f8544c2ebb60d19d02c41ff15da28a9e22d42"}, + {file = "jiter-0.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:e1fe01c082f6aafbe5c8faf0ff074f38dfb911d53f07ec333ca03f8f6226debf"}, + {file = "jiter-0.12.0-cp314-cp314-win_arm64.whl", hash = "sha256:d72f3b5a432a4c546ea4bedc84cce0c3404874f1d1676260b9c7f048a9855451"}, + {file = "jiter-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e6ded41aeba3603f9728ed2b6196e4df875348ab97b28fc8afff115ed42ba7a7"}, + {file = "jiter-0.12.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a947920902420a6ada6ad51892082521978e9dd44a802663b001436e4b771684"}, + {file = "jiter-0.12.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:add5e227e0554d3a52cf390a7635edaffdf4f8fce4fdbcef3cc2055bb396a30c"}, + {file = "jiter-0.12.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f9b1cda8fcb736250d7e8711d4580ebf004a46771432be0ae4796944b5dfa5d"}, + {file = "jiter-0.12.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:deeb12a2223fe0135c7ff1356a143d57f95bbf1f4a66584f1fc74df21d86b993"}, + {file = "jiter-0.12.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c596cc0f4cb574877550ce4ecd51f8037469146addd676d7c1a30ebe6391923f"}, + {file = "jiter-0.12.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5ab4c823b216a4aeab3fdbf579c5843165756bd9ad87cc6b1c65919c4715f783"}, + {file = "jiter-0.12.0-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:e427eee51149edf962203ff8db75a7514ab89be5cb623fb9cea1f20b54f1107b"}, + {file = "jiter-0.12.0-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:edb868841f84c111255ba5e80339d386d937ec1fdce419518ce1bd9370fac5b6"}, + {file = "jiter-0.12.0-cp314-cp314t-win32.whl", hash = "sha256:8bbcfe2791dfdb7c5e48baf646d37a6a3dcb5a97a032017741dea9f817dca183"}, + {file = "jiter-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2fa940963bf02e1d8226027ef461e36af472dea85d36054ff835aeed944dd873"}, + {file = "jiter-0.12.0-cp314-cp314t-win_arm64.whl", hash = "sha256:506c9708dd29b27288f9f8f1140c3cb0e3d8ddb045956d7757b1fa0e0f39a473"}, + {file = "jiter-0.12.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:c9d28b218d5f9e5f69a0787a196322a5056540cb378cac8ff542b4fa7219966c"}, + {file = "jiter-0.12.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d0ee12028daf8cfcf880dd492349a122a64f42c059b6c62a2b0c96a83a8da820"}, + {file = 
"jiter-0.12.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b135ebe757a82d67ed2821526e72d0acf87dd61f6013e20d3c45b8048af927b"}, + {file = "jiter-0.12.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:15d7fafb81af8a9e3039fc305529a61cd933eecee33b4251878a1c89859552a3"}, + {file = "jiter-0.12.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:92d1f41211d8a8fe412faad962d424d334764c01dac6691c44691c2e4d3eedaf"}, + {file = "jiter-0.12.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3a64a48d7c917b8f32f25c176df8749ecf08cec17c466114727efe7441e17f6d"}, + {file = "jiter-0.12.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:122046f3b3710b85de99d9aa2f3f0492a8233a2f54a64902b096efc27ea747b5"}, + {file = "jiter-0.12.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:27ec39225e03c32c6b863ba879deb427882f243ae46f0d82d68b695fa5b48b40"}, + {file = "jiter-0.12.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:26b9e155ddc132225a39b1995b3b9f0fe0f79a6d5cbbeacf103271e7d309b404"}, + {file = "jiter-0.12.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9ab05b7c58e29bb9e60b70c2e0094c98df79a1e42e397b9bb6eaa989b7a66dd0"}, + {file = "jiter-0.12.0-cp39-cp39-win32.whl", hash = "sha256:59f9f9df87ed499136db1c2b6c9efb902f964bed42a582ab7af413b6a293e7b0"}, + {file = "jiter-0.12.0-cp39-cp39-win_amd64.whl", hash = "sha256:d3719596a1ebe7a48a498e8d5d0c4bf7553321d4c3eee1d620628d51351a3928"}, + {file = "jiter-0.12.0-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:4739a4657179ebf08f85914ce50332495811004cc1747852e8b2041ed2aab9b8"}, + {file = "jiter-0.12.0-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:41da8def934bf7bec16cb24bd33c0ca62126d2d45d81d17b864bd5ad721393c3"}, + {file = "jiter-0.12.0-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c44ee814f499c082e69872d426b624987dbc5943ab06e9bbaa4f81989fdb79e"}, + {file = "jiter-0.12.0-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd2097de91cf03eaa27b3cbdb969addf83f0179c6afc41bbc4513705e013c65d"}, + {file = "jiter-0.12.0-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:e8547883d7b96ef2e5fe22b88f8a4c8725a56e7f4abafff20fd5272d634c7ecb"}, + {file = "jiter-0.12.0-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:89163163c0934854a668ed783a2546a0617f71706a2551a4a0666d91ab365d6b"}, + {file = "jiter-0.12.0-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d96b264ab7d34bbb2312dedc47ce07cd53f06835eacbc16dde3761f47c3a9e7f"}, + {file = "jiter-0.12.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c24e864cb30ab82311c6425655b0cdab0a98c5d973b065c66a3f020740c2324c"}, + {file = "jiter-0.12.0.tar.gz", hash = "sha256:64dfcd7d5c168b38d3f9f8bba7fc639edb3418abcc74f22fdbe6b8938293f30b"}, ] [[package]] name = "joblib" -version = "1.5.1" +version = "1.5.2" description = "Lightweight pipelining with Python functions" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "joblib-1.5.1-py3-none-any.whl", hash = "sha256:4719a31f054c7d766948dcd83e9613686b27114f190f717cec7eaa2084f8a74a"}, - {file = "joblib-1.5.1.tar.gz", hash = "sha256:f4f86e351f39fe3d0d32a9f2c3d8af1ee4cec285aafcb27003dda5205576b444"}, + {file = "joblib-1.5.2-py3-none-any.whl", 
hash = "sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241"}, + {file = "joblib-1.5.2.tar.gz", hash = "sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55"}, ] [[package]] name = "json5" -version = "0.12.0" +version = "0.12.1" description = "A Python implementation of the JSON5 data format." optional = false python-versions = ">=3.8.0" +groups = ["dev"] files = [ - {file = "json5-0.12.0-py3-none-any.whl", hash = "sha256:6d37aa6c08b0609f16e1ec5ff94697e2cbbfbad5ac112afa05794da9ab7810db"}, - {file = "json5-0.12.0.tar.gz", hash = "sha256:0b4b6ff56801a1c7dc817b0241bca4ce474a0e6a163bfef3fc594d3fd263ff3a"}, + {file = "json5-0.12.1-py3-none-any.whl", hash = "sha256:d9c9b3bc34a5f54d43c35e11ef7cb87d8bdd098c6ace87117a7b7e83e705c1d5"}, + {file = "json5-0.12.1.tar.gz", hash = "sha256:b2743e77b3242f8d03c143dd975a6ec7c52e2f2afe76ed934e53503dd4ad4990"}, ] [package.extras] -dev = ["build (==1.2.2.post1)", "coverage (==7.5.4)", "coverage (==7.8.0)", "mypy (==1.14.1)", "mypy (==1.15.0)", "pip (==25.0.1)", "pylint (==3.2.7)", "pylint (==3.3.6)", "ruff (==0.11.2)", "twine (==6.1.0)", "uv (==0.6.11)"] +dev = ["build (==1.2.2.post1)", "coverage (==7.5.4) ; python_version < \"3.9\"", "coverage (==7.8.0) ; python_version >= \"3.9\"", "mypy (==1.14.1) ; python_version < \"3.9\"", "mypy (==1.15.0) ; python_version >= \"3.9\"", "pip (==25.0.1)", "pylint (==3.2.7) ; python_version < \"3.9\"", "pylint (==3.3.6) ; python_version >= \"3.9\"", "ruff (==0.11.2)", "twine (==6.1.0)", "uv (==0.6.11)"] [[package]] name = "jsonpointer" @@ -2123,6 +2635,7 @@ version = "3.0.0" description = "Identify specific nodes in a JSON document (RFC 6901)" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "jsonpointer-3.0.0-py2.py3-none-any.whl", hash = "sha256:13e088adc14fca8b6aa8177c044e12701e6ad4b28ff10e65f2267a90109c9942"}, {file = "jsonpointer-3.0.0.tar.gz", hash = "sha256:2b2d729f2091522d61c3b31f82e11870f60b68f43fbc705cb76bf4b832af59ef"}, @@ -2130,13 +2643,14 @@ files = [ [[package]] name = "jsonschema" -version = "4.24.0" +version = "4.25.1" description = "An implementation of JSON Schema validation for Python" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ - {file = "jsonschema-4.24.0-py3-none-any.whl", hash = "sha256:a462455f19f5faf404a7902952b6f0e3ce868f3ee09a359b05eca6673bd8412d"}, - {file = "jsonschema-4.24.0.tar.gz", hash = "sha256:0b4e8069eb12aedfa881333004bccaec24ecef5a8a6a4b6df142b2cc9599d196"}, + {file = "jsonschema-4.25.1-py3-none-any.whl", hash = "sha256:3fba0169e345c7175110351d456342c364814cfcf3b964ba4587f22915230a63"}, + {file = "jsonschema-4.25.1.tar.gz", hash = "sha256:e4a9655ce0da0c0b67a085847e00a3a51449e1157f4f75e9fb5aa545e122eb85"}, ] [package.dependencies] @@ -2149,23 +2663,25 @@ jsonschema-specifications = ">=2023.03.6" referencing = ">=0.28.4" rfc3339-validator = {version = "*", optional = true, markers = "extra == \"format-nongpl\""} rfc3986-validator = {version = ">0.1.0", optional = true, markers = "extra == \"format-nongpl\""} +rfc3987-syntax = {version = ">=1.1.0", optional = true, markers = "extra == \"format-nongpl\""} rpds-py = ">=0.7.1" uri-template = {version = "*", optional = true, markers = "extra == \"format-nongpl\""} webcolors = {version = ">=24.6.0", optional = true, markers = "extra == \"format-nongpl\""} [package.extras] format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] -format-nongpl = ["fqdn", "idna", 
"isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=24.6.0)"] +format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "rfc3987-syntax (>=1.1.0)", "uri-template", "webcolors (>=24.6.0)"] [[package]] name = "jsonschema-specifications" -version = "2025.4.1" +version = "2025.9.1" description = "The JSON Schema meta-schemas and vocabularies, exposed as a Registry" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ - {file = "jsonschema_specifications-2025.4.1-py3-none-any.whl", hash = "sha256:4653bffbd6584f7de83a67e0d620ef16900b390ddc7939d56684d6c81e33f1af"}, - {file = "jsonschema_specifications-2025.4.1.tar.gz", hash = "sha256:630159c9f4dbea161a6a2205c3011cc4f18ff381b189fff48bb39b9bf26ae608"}, + {file = "jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe"}, + {file = "jsonschema_specifications-2025.9.1.tar.gz", hash = "sha256:b540987f239e745613c7a9176f3edb72b832a4ac465cf02712288397832b5e8d"}, ] [package.dependencies] @@ -2177,6 +2693,7 @@ version = "8.6.3" description = "Jupyter protocol implementation and client libraries" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "jupyter_client-8.6.3-py3-none-any.whl", hash = "sha256:e8a19cc986cc45905ac3362915f410f3af85424b4c0905e94fa5f2cb08e8f23f"}, {file = "jupyter_client-8.6.3.tar.gz", hash = "sha256:35b3a0947c4a6e9d589eb97d7d4cd5e90f910ee73101611f01283732bd6d9419"}, @@ -2191,22 +2708,22 @@ traitlets = ">=5.3" [package.extras] docs = ["ipykernel", "myst-parser", "pydata-sphinx-theme", "sphinx (>=4)", "sphinx-autodoc-typehints", "sphinxcontrib-github-alt", "sphinxcontrib-spelling"] -test = ["coverage", "ipykernel (>=6.14)", "mypy", "paramiko", "pre-commit", "pytest (<8.2.0)", "pytest-cov", "pytest-jupyter[client] (>=0.4.1)", "pytest-timeout"] +test = ["coverage", "ipykernel (>=6.14)", "mypy", "paramiko ; sys_platform == \"win32\"", "pre-commit", "pytest (<8.2.0)", "pytest-cov", "pytest-jupyter[client] (>=0.4.1)", "pytest-timeout"] [[package]] name = "jupyter-core" -version = "5.8.1" +version = "5.9.1" description = "Jupyter core package. A base package on which Jupyter projects rely." 
optional = false -python-versions = ">=3.8" +python-versions = ">=3.10" +groups = ["dev"] files = [ - {file = "jupyter_core-5.8.1-py3-none-any.whl", hash = "sha256:c28d268fc90fb53f1338ded2eb410704c5449a358406e8a948b75706e24863d0"}, - {file = "jupyter_core-5.8.1.tar.gz", hash = "sha256:0a5f9706f70e64786b75acba995988915ebd4601c8a52e534a40b51c95f59941"}, + {file = "jupyter_core-5.9.1-py3-none-any.whl", hash = "sha256:ebf87fdc6073d142e114c72c9e29a9d7ca03fad818c5d300ce2adc1fb0743407"}, + {file = "jupyter_core-5.9.1.tar.gz", hash = "sha256:4d09aaff303b9566c3ce657f580bd089ff5c91f5f89cf7d8846c3cdf465b5508"}, ] [package.dependencies] platformdirs = ">=2.5" -pywin32 = {version = ">=300", markers = "sys_platform == \"win32\" and platform_python_implementation != \"PyPy\""} traitlets = ">=5.3" [package.extras] @@ -2219,6 +2736,7 @@ version = "0.12.0" description = "Jupyter Event System library" optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ {file = "jupyter_events-0.12.0-py3-none-any.whl", hash = "sha256:6464b2fa5ad10451c3d35fabc75eab39556ae1e2853ad0c0cc31b656731a97fb"}, {file = "jupyter_events-0.12.0.tar.gz", hash = "sha256:fc3fce98865f6784c9cd0a56a20644fc6098f21c8c33834a8d9fe383c17e554b"}, @@ -2241,27 +2759,29 @@ test = ["click", "pre-commit", "pytest (>=7.0)", "pytest-asyncio (>=0.19.0)", "p [[package]] name = "jupyter-lsp" -version = "2.2.5" +version = "2.3.0" description = "Multi-Language Server WebSocket proxy for Jupyter Notebook/Lab server" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ - {file = "jupyter-lsp-2.2.5.tar.gz", hash = "sha256:793147a05ad446f809fd53ef1cd19a9f5256fd0a2d6b7ce943a982cb4f545001"}, - {file = "jupyter_lsp-2.2.5-py3-none-any.whl", hash = "sha256:45fbddbd505f3fbfb0b6cb2f1bc5e15e83ab7c79cd6e89416b248cb3c00c11da"}, + {file = "jupyter_lsp-2.3.0-py3-none-any.whl", hash = "sha256:e914a3cb2addf48b1c7710914771aaf1819d46b2e5a79b0f917b5478ec93f34f"}, + {file = "jupyter_lsp-2.3.0.tar.gz", hash = "sha256:458aa59339dc868fb784d73364f17dbce8836e906cd75fd471a325cba02e0245"}, ] [package.dependencies] -jupyter-server = ">=1.1.2" +jupyter_server = ">=1.1.2" [[package]] name = "jupyter-server" -version = "2.16.0" +version = "2.17.0" description = "The backend—i.e. core services, APIs, and REST endpoints—to Jupyter web applications." optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ - {file = "jupyter_server-2.16.0-py3-none-any.whl", hash = "sha256:3d8db5be3bc64403b1c65b400a1d7f4647a5ce743f3b20dbdefe8ddb7b55af9e"}, - {file = "jupyter_server-2.16.0.tar.gz", hash = "sha256:65d4b44fdf2dcbbdfe0aa1ace4a842d4aaf746a2b7b168134d5aaed35621b7f6"}, + {file = "jupyter_server-2.17.0-py3-none-any.whl", hash = "sha256:e8cb9c7db4251f51ed307e329b81b72ccf2056ff82d50524debde1ee1870e13f"}, + {file = "jupyter_server-2.17.0.tar.gz", hash = "sha256:c38ea898566964c888b4772ae1ed58eca84592e88251d2cfc4d171f81f7e99d5"}, ] [package.dependencies] @@ -2274,7 +2794,7 @@ jupyter-events = ">=0.11.0" jupyter-server-terminals = ">=0.4.4" nbconvert = ">=6.4.4" nbformat = ">=5.3.0" -overrides = ">=5.0" +overrides = {version = ">=5.0", markers = "python_version < \"3.12\""} packaging = ">=22.0" prometheus-client = ">=0.9" pywinpty = {version = ">=2.0.1", markers = "os_name == \"nt\""} @@ -2295,6 +2815,7 @@ version = "0.5.3" description = "A Jupyter Server Extension Providing Terminals." 
optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "jupyter_server_terminals-0.5.3-py3-none-any.whl", hash = "sha256:41ee0d7dc0ebf2809c668e0fc726dfaf258fcd3e769568996ca731b6194ae9aa"}, {file = "jupyter_server_terminals-0.5.3.tar.gz", hash = "sha256:5ae0295167220e9ace0edcfdb212afd2b01ee8d179fe6f23c899590e9b8a5269"}, @@ -2310,19 +2831,20 @@ test = ["jupyter-server (>=2.0.0)", "pytest (>=7.0)", "pytest-jupyter[server] (> [[package]] name = "jupyterlab" -version = "4.4.3" +version = "4.4.10" description = "JupyterLab computational environment" optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ - {file = "jupyterlab-4.4.3-py3-none-any.whl", hash = "sha256:164302f6d4b6c44773dfc38d585665a4db401a16e5296c37df5cba63904fbdea"}, - {file = "jupyterlab-4.4.3.tar.gz", hash = "sha256:a94c32fd7f8b93e82a49dc70a6ec45a5c18281ca2a7228d12765e4e210e5bca2"}, + {file = "jupyterlab-4.4.10-py3-none-any.whl", hash = "sha256:65939ab4c8dcd0c42185c2d0d1a9d60b254dc8c46fc4fdb286b63c51e9358e07"}, + {file = "jupyterlab-4.4.10.tar.gz", hash = "sha256:521c017508af4e1d6d9d8a9d90f47a11c61197ad63b2178342489de42540a615"}, ] [package.dependencies] async-lru = ">=1.0.0" -httpx = ">=0.25.0" -ipykernel = ">=6.5.0" +httpx = ">=0.25.0,<1" +ipykernel = ">=6.5.0,<6.30.0 || >6.30.0" jinja2 = ">=3.0.3" jupyter-core = "*" jupyter-lsp = ">=2.0.0" @@ -2347,6 +2869,7 @@ version = "0.3.0" description = "Pygments theme using JupyterLab CSS variables" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "jupyterlab_pygments-0.3.0-py3-none-any.whl", hash = "sha256:841a89020971da1d8693f1a99997aefc5dc424bb1b251fd6322462a1b8842780"}, {file = "jupyterlab_pygments-0.3.0.tar.gz", hash = "sha256:721aca4d9029252b11cfa9d185e5b5af4d54772bb8072f9b7036f4170054d35d"}, @@ -2354,13 +2877,14 @@ files = [ [[package]] name = "jupyterlab-server" -version = "2.27.3" +version = "2.28.0" description = "A set of server components for JupyterLab and JupyterLab like applications." 
optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ - {file = "jupyterlab_server-2.27.3-py3-none-any.whl", hash = "sha256:e697488f66c3db49df675158a77b3b017520d772c6e1548c7d9bcc5df7944ee4"}, - {file = "jupyterlab_server-2.27.3.tar.gz", hash = "sha256:eb36caca59e74471988f0ae25c77945610b887f777255aa21f8065def9e51ed4"}, + {file = "jupyterlab_server-2.28.0-py3-none-any.whl", hash = "sha256:e4355b148fdcf34d312bbbc80f22467d6d20460e8b8736bf235577dd18506968"}, + {file = "jupyterlab_server-2.28.0.tar.gz", hash = "sha256:35baa81898b15f93573e2deca50d11ac0ae407ebb688299d3a5213265033712c"}, ] [package.dependencies] @@ -2383,6 +2907,7 @@ version = "1.6.17" description = "Kaggle API" optional = false python-versions = "*" +groups = ["dev"] files = [ {file = "kaggle-1.6.17.tar.gz", hash = "sha256:439a7dea1d5039f320fd6ad5ec21b688dcfa70d405cb42095b81f41edc401b81"}, ] @@ -2399,99 +2924,140 @@ urllib3 = "*" [[package]] name = "kiwisolver" -version = "1.4.8" +version = "1.4.9" description = "A fast implementation of the Cassowary constraint solver" optional = false python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "kiwisolver-1.4.9-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b4b4d74bda2b8ebf4da5bd42af11d02d04428b2c32846e4c2c93219df8a7987b"}, + {file = "kiwisolver-1.4.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fb3b8132019ea572f4611d770991000d7f58127560c4889729248eb5852a102f"}, + {file = "kiwisolver-1.4.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:84fd60810829c27ae375114cd379da1fa65e6918e1da405f356a775d49a62bcf"}, + {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b78efa4c6e804ecdf727e580dbb9cba85624d2e1c6b5cb059c66290063bd99a9"}, + {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d4efec7bcf21671db6a3294ff301d2fc861c31faa3c8740d1a94689234d1b415"}, + {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:90f47e70293fc3688b71271100a1a5453aa9944a81d27ff779c108372cf5567b"}, + {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8fdca1def57a2e88ef339de1737a1449d6dbf5fab184c54a1fca01d541317154"}, + {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9cf554f21be770f5111a1690d42313e140355e687e05cf82cb23d0a721a64a48"}, + {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fc1795ac5cd0510207482c3d1d3ed781143383b8cfd36f5c645f3897ce066220"}, + {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:ccd09f20ccdbbd341b21a67ab50a119b64a403b09288c27481575105283c1586"}, + {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:540c7c72324d864406a009d72f5d6856f49693db95d1fbb46cf86febef873634"}, + {file = "kiwisolver-1.4.9-cp310-cp310-win_amd64.whl", hash = "sha256:ede8c6d533bc6601a47ad4046080d36b8fc99f81e6f1c17b0ac3c2dc91ac7611"}, + {file = "kiwisolver-1.4.9-cp310-cp310-win_arm64.whl", hash = "sha256:7b4da0d01ac866a57dd61ac258c5607b4cd677f63abaec7b148354d2b2cdd536"}, + {file = "kiwisolver-1.4.9-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:eb14a5da6dc7642b0f3a18f13654847cd8b7a2550e2645a5bda677862b03ba16"}, + {file = "kiwisolver-1.4.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:39a219e1c81ae3b103643d2aedb90f1ef22650deb266ff12a19e7773f3e5f089"}, + {file = "kiwisolver-1.4.9-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:2405a7d98604b87f3fc28b1716783534b1b4b8510d8142adca34ee0bc3c87543"}, + {file = "kiwisolver-1.4.9-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:dc1ae486f9abcef254b5618dfb4113dd49f94c68e3e027d03cf0143f3f772b61"}, + {file = "kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8a1f570ce4d62d718dce3f179ee78dac3b545ac16c0c04bb363b7607a949c0d1"}, + {file = "kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cb27e7b78d716c591e88e0a09a2139c6577865d7f2e152488c2cc6257f460872"}, + {file = "kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:15163165efc2f627eb9687ea5f3a28137217d217ac4024893d753f46bce9de26"}, + {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bdee92c56a71d2b24c33a7d4c2856bd6419d017e08caa7802d2963870e315028"}, + {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:412f287c55a6f54b0650bd9b6dce5aceddb95864a1a90c87af16979d37c89771"}, + {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2c93f00dcba2eea70af2be5f11a830a742fe6b579a1d4e00f47760ef13be247a"}, + {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f117e1a089d9411663a3207ba874f31be9ac8eaa5b533787024dc07aeb74f464"}, + {file = "kiwisolver-1.4.9-cp311-cp311-win_amd64.whl", hash = "sha256:be6a04e6c79819c9a8c2373317d19a96048e5a3f90bec587787e86a1153883c2"}, + {file = "kiwisolver-1.4.9-cp311-cp311-win_arm64.whl", hash = "sha256:0ae37737256ba2de764ddc12aed4956460277f00c4996d51a197e72f62f5eec7"}, + {file = "kiwisolver-1.4.9-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ac5a486ac389dddcc5bef4f365b6ae3ffff2c433324fb38dd35e3fab7c957999"}, + {file = "kiwisolver-1.4.9-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f2ba92255faa7309d06fe44c3a4a97efe1c8d640c2a79a5ef728b685762a6fd2"}, + {file = "kiwisolver-1.4.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a2899935e724dd1074cb568ce7ac0dce28b2cd6ab539c8e001a8578eb106d14"}, + {file = "kiwisolver-1.4.9-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f6008a4919fdbc0b0097089f67a1eb55d950ed7e90ce2cc3e640abadd2757a04"}, + {file = "kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:67bb8b474b4181770f926f7b7d2f8c0248cbcb78b660fdd41a47054b28d2a752"}, + {file = "kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2327a4a30d3ee07d2fbe2e7933e8a37c591663b96ce42a00bc67461a87d7df77"}, + {file = "kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:7a08b491ec91b1d5053ac177afe5290adacf1f0f6307d771ccac5de30592d198"}, + {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d8fc5c867c22b828001b6a38d2eaeb88160bf5783c6cb4a5e440efc981ce286d"}, + {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:3b3115b2581ea35bb6d1f24a4c90af37e5d9b49dcff267eeed14c3893c5b86ab"}, + {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:858e4c22fb075920b96a291928cb7dea5644e94c0ee4fcd5af7e865655e4ccf2"}, + {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ed0fecd28cc62c54b262e3736f8bb2512d8dcfdc2bcf08be5f47f96bf405b145"}, + {file = "kiwisolver-1.4.9-cp312-cp312-win_amd64.whl", hash = "sha256:f68208a520c3d86ea51acf688a3e3002615a7f0238002cccc17affecc86a8a54"}, + {file = 
"kiwisolver-1.4.9-cp312-cp312-win_arm64.whl", hash = "sha256:2c1a4f57df73965f3f14df20b80ee29e6a7930a57d2d9e8491a25f676e197c60"}, + {file = "kiwisolver-1.4.9-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5d0432ccf1c7ab14f9949eec60c5d1f924f17c037e9f8b33352fa05799359b8"}, + {file = "kiwisolver-1.4.9-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efb3a45b35622bb6c16dbfab491a8f5a391fe0e9d45ef32f4df85658232ca0e2"}, + {file = "kiwisolver-1.4.9-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1a12cf6398e8a0a001a059747a1cbf24705e18fe413bc22de7b3d15c67cffe3f"}, + {file = "kiwisolver-1.4.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b67e6efbf68e077dd71d1a6b37e43e1a99d0bff1a3d51867d45ee8908b931098"}, + {file = "kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5656aa670507437af0207645273ccdfee4f14bacd7f7c67a4306d0dcaeaf6eed"}, + {file = "kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:bfc08add558155345129c7803b3671cf195e6a56e7a12f3dde7c57d9b417f525"}, + {file = "kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:40092754720b174e6ccf9e845d0d8c7d8e12c3d71e7fc35f55f3813e96376f78"}, + {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:497d05f29a1300d14e02e6441cf0f5ee81c1ff5a304b0d9fb77423974684e08b"}, + {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:bdd1a81a1860476eb41ac4bc1e07b3f07259e6d55bbf739b79c8aaedcf512799"}, + {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e6b93f13371d341afee3be9f7c5964e3fe61d5fa30f6a30eb49856935dfe4fc3"}, + {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d75aa530ccfaa593da12834b86a0724f58bff12706659baa9227c2ccaa06264c"}, + {file = "kiwisolver-1.4.9-cp313-cp313-win_amd64.whl", hash = "sha256:dd0a578400839256df88c16abddf9ba14813ec5f21362e1fe65022e00c883d4d"}, + {file = "kiwisolver-1.4.9-cp313-cp313-win_arm64.whl", hash = "sha256:d4188e73af84ca82468f09cadc5ac4db578109e52acb4518d8154698d3a87ca2"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:5a0f2724dfd4e3b3ac5a82436a8e6fd16baa7d507117e4279b660fe8ca38a3a1"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:1b11d6a633e4ed84fc0ddafd4ebfd8ea49b3f25082c04ad12b8315c11d504dc1"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61874cdb0a36016354853593cffc38e56fc9ca5aa97d2c05d3dcf6922cd55a11"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:60c439763a969a6af93b4881db0eed8fadf93ee98e18cbc35bc8da868d0c4f0c"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92a2f997387a1b79a75e7803aa7ded2cfbe2823852ccf1ba3bcf613b62ae3197"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a31d512c812daea6d8b3be3b2bfcbeb091dbb09177706569bcfc6240dcf8b41c"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:52a15b0f35dad39862d376df10c5230155243a2c1a436e39eb55623ccbd68185"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a30fd6fdef1430fd9e1ba7b3398b5ee4e2887783917a687d86ba69985fb08748"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = 
"sha256:cc9617b46837c6468197b5945e196ee9ca43057bb7d9d1ae688101e4e1dddf64"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:0ab74e19f6a2b027ea4f845a78827969af45ce790e6cb3e1ebab71bdf9f215ff"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dba5ee5d3981160c28d5490f0d1b7ed730c22470ff7f6cc26cfcfaacb9896a07"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-win_arm64.whl", hash = "sha256:0749fd8f4218ad2e851e11cc4dc05c7cbc0cbc4267bdfdb31782e65aace4ee9c"}, + {file = "kiwisolver-1.4.9-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:9928fe1eb816d11ae170885a74d074f57af3a0d65777ca47e9aeb854a1fba386"}, + {file = "kiwisolver-1.4.9-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d0005b053977e7b43388ddec89fa567f43d4f6d5c2c0affe57de5ebf290dc552"}, + {file = "kiwisolver-1.4.9-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:2635d352d67458b66fd0667c14cb1d4145e9560d503219034a18a87e971ce4f3"}, + {file = "kiwisolver-1.4.9-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:767c23ad1c58c9e827b649a9ab7809fd5fd9db266a9cf02b0e926ddc2c680d58"}, + {file = "kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72d0eb9fba308b8311685c2268cf7d0a0639a6cd027d8128659f72bdd8a024b4"}, + {file = "kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f68e4f3eeca8fb22cc3d731f9715a13b652795ef657a13df1ad0c7dc0e9731df"}, + {file = "kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d84cd4061ae292d8ac367b2c3fa3aad11cb8625a95d135fe93f286f914f3f5a6"}, + {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:a60ea74330b91bd22a29638940d115df9dc00af5035a9a2a6ad9399ffb4ceca5"}, + {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:ce6a3a4e106cf35c2d9c4fa17c05ce0b180db622736845d4315519397a77beaf"}, + {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:77937e5e2a38a7b48eef0585114fe7930346993a88060d0bf886086d2aa49ef5"}, + {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:24c175051354f4a28c5d6a31c93906dc653e2bf234e8a4bbfb964892078898ce"}, + {file = "kiwisolver-1.4.9-cp314-cp314-win_amd64.whl", hash = "sha256:0763515d4df10edf6d06a3c19734e2566368980d21ebec439f33f9eb936c07b7"}, + {file = "kiwisolver-1.4.9-cp314-cp314-win_arm64.whl", hash = "sha256:0e4e2bf29574a6a7b7f6cb5fa69293b9f96c928949ac4a53ba3f525dffb87f9c"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:d976bbb382b202f71c67f77b0ac11244021cfa3f7dfd9e562eefcea2df711548"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2489e4e5d7ef9a1c300a5e0196e43d9c739f066ef23270607d45aba368b91f2d"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e2ea9f7ab7fbf18fffb1b5434ce7c69a07582f7acc7717720f1d69f3e806f90c"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b34e51affded8faee0dfdb705416153819d8ea9250bbbf7ea1b249bdeb5f1122"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8aacd3d4b33b772542b2e01beb50187536967b514b00003bdda7589722d2a64"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7cf974dd4e35fa315563ac99d6287a1024e4dc2077b8a7d7cd3d2fb65d283134"}, + {file = 
"kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:85bd218b5ecfbee8c8a82e121802dcb519a86044c9c3b2e4aef02fa05c6da370"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:0856e241c2d3df4efef7c04a1e46b1936b6120c9bcf36dd216e3acd84bc4fb21"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:9af39d6551f97d31a4deebeac6f45b156f9755ddc59c07b402c148f5dbb6482a"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:bb4ae2b57fc1d8cbd1cf7b1d9913803681ffa903e7488012be5b76dedf49297f"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:aedff62918805fb62d43a4aa2ecd4482c380dc76cd31bd7c8878588a61bd0369"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-win_amd64.whl", hash = "sha256:1fa333e8b2ce4d9660f2cda9c0e1b6bafcfb2457a9d259faa82289e73ec24891"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-win_arm64.whl", hash = "sha256:4a48a2ce79d65d363597ef7b567ce3d14d68783d2b2263d98db3d9477805ba32"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:4d1d9e582ad4d63062d34077a9a1e9f3c34088a2ec5135b1f7190c07cf366527"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:deed0c7258ceb4c44ad5ec7d9918f9f14fd05b2be86378d86cf50e63d1e7b771"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0a590506f303f512dff6b7f75fd2fd18e16943efee932008fe7140e5fa91d80e"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e09c2279a4d01f099f52d5c4b3d9e208e91edcbd1a175c9662a8b16e000fece9"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c9e7cdf45d594ee04d5be1b24dd9d49f3d1590959b2271fb30b5ca2b262c00fb"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:720e05574713db64c356e86732c0f3c5252818d05f9df320f0ad8380641acea5"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:17680d737d5335b552994a2008fab4c851bcd7de33094a82067ef3a576ff02fa"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:85b5352f94e490c028926ea567fc569c52ec79ce131dadb968d3853e809518c2"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:464415881e4801295659462c49461a24fb107c140de781d55518c4b80cb6790f"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:fb940820c63a9590d31d88b815e7a3aa5915cad3ce735ab45f0c730b39547de1"}, + {file = "kiwisolver-1.4.9.tar.gz", hash = "sha256:c3b22c26c6fd6811b0ae8363b95ca8ce4ea3c202d3d0975b2914310ceb1bcc4d"}, +] + +[[package]] +name = "lark" +version = "1.3.1" +description = "a modern parsing library" +optional = false +python-versions = ">=3.8" +groups = ["dev"] files = [ - {file = "kiwisolver-1.4.8-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:88c6f252f6816a73b1f8c904f7bbe02fd67c09a69f7cb8a0eecdbf5ce78e63db"}, - {file = "kiwisolver-1.4.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c72941acb7b67138f35b879bbe85be0f6c6a70cab78fe3ef6db9c024d9223e5b"}, - {file = "kiwisolver-1.4.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ce2cf1e5688edcb727fdf7cd1bbd0b6416758996826a8be1d958f91880d0809d"}, - {file = "kiwisolver-1.4.8-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = 
"sha256:c8bf637892dc6e6aad2bc6d4d69d08764166e5e3f69d469e55427b6ac001b19d"}, - {file = "kiwisolver-1.4.8-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:034d2c891f76bd3edbdb3ea11140d8510dca675443da7304205a2eaa45d8334c"}, - {file = "kiwisolver-1.4.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d47b28d1dfe0793d5e96bce90835e17edf9a499b53969b03c6c47ea5985844c3"}, - {file = "kiwisolver-1.4.8-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:eb158fe28ca0c29f2260cca8c43005329ad58452c36f0edf298204de32a9a3ed"}, - {file = "kiwisolver-1.4.8-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5536185fce131780ebd809f8e623bf4030ce1b161353166c49a3c74c287897f"}, - {file = "kiwisolver-1.4.8-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:369b75d40abedc1da2c1f4de13f3482cb99e3237b38726710f4a793432b1c5ff"}, - {file = "kiwisolver-1.4.8-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:641f2ddf9358c80faa22e22eb4c9f54bd3f0e442e038728f500e3b978d00aa7d"}, - {file = "kiwisolver-1.4.8-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:d561d2d8883e0819445cfe58d7ddd673e4015c3c57261d7bdcd3710d0d14005c"}, - {file = "kiwisolver-1.4.8-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:1732e065704b47c9afca7ffa272f845300a4eb959276bf6970dc07265e73b605"}, - {file = "kiwisolver-1.4.8-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:bcb1ebc3547619c3b58a39e2448af089ea2ef44b37988caf432447374941574e"}, - {file = "kiwisolver-1.4.8-cp310-cp310-win_amd64.whl", hash = "sha256:89c107041f7b27844179ea9c85d6da275aa55ecf28413e87624d033cf1f6b751"}, - {file = "kiwisolver-1.4.8-cp310-cp310-win_arm64.whl", hash = "sha256:b5773efa2be9eb9fcf5415ea3ab70fc785d598729fd6057bea38d539ead28271"}, - {file = "kiwisolver-1.4.8-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a4d3601908c560bdf880f07d94f31d734afd1bb71e96585cace0e38ef44c6d84"}, - {file = "kiwisolver-1.4.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:856b269c4d28a5c0d5e6c1955ec36ebfd1651ac00e1ce0afa3e28da95293b561"}, - {file = "kiwisolver-1.4.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c2b9a96e0f326205af81a15718a9073328df1173a2619a68553decb7097fd5d7"}, - {file = "kiwisolver-1.4.8-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c5020c83e8553f770cb3b5fc13faac40f17e0b205bd237aebd21d53d733adb03"}, - {file = "kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dace81d28c787956bfbfbbfd72fdcef014f37d9b48830829e488fdb32b49d954"}, - {file = "kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:11e1022b524bd48ae56c9b4f9296bce77e15a2e42a502cceba602f804b32bb79"}, - {file = "kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b9b4d2892fefc886f30301cdd80debd8bb01ecdf165a449eb6e78f79f0fabd6"}, - {file = "kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a96c0e790ee875d65e340ab383700e2b4891677b7fcd30a699146f9384a2bb0"}, - {file = "kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:23454ff084b07ac54ca8be535f4174170c1094a4cff78fbae4f73a4bcc0d4dab"}, - {file = "kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:87b287251ad6488e95b4f0b4a79a6d04d3ea35fde6340eb38fbd1ca9cd35bbbc"}, - {file = "kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = 
"sha256:b21dbe165081142b1232a240fc6383fd32cdd877ca6cc89eab93e5f5883e1c25"}, - {file = "kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:768cade2c2df13db52475bd28d3a3fac8c9eff04b0e9e2fda0f3760f20b3f7fc"}, - {file = "kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d47cfb2650f0e103d4bf68b0b5804c68da97272c84bb12850d877a95c056bd67"}, - {file = "kiwisolver-1.4.8-cp311-cp311-win_amd64.whl", hash = "sha256:ed33ca2002a779a2e20eeb06aea7721b6e47f2d4b8a8ece979d8ba9e2a167e34"}, - {file = "kiwisolver-1.4.8-cp311-cp311-win_arm64.whl", hash = "sha256:16523b40aab60426ffdebe33ac374457cf62863e330a90a0383639ce14bf44b2"}, - {file = "kiwisolver-1.4.8-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:d6af5e8815fd02997cb6ad9bbed0ee1e60014438ee1a5c2444c96f87b8843502"}, - {file = "kiwisolver-1.4.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bade438f86e21d91e0cf5dd7c0ed00cda0f77c8c1616bd83f9fc157fa6760d31"}, - {file = "kiwisolver-1.4.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b83dc6769ddbc57613280118fb4ce3cd08899cc3369f7d0e0fab518a7cf37fdb"}, - {file = "kiwisolver-1.4.8-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:111793b232842991be367ed828076b03d96202c19221b5ebab421ce8bcad016f"}, - {file = "kiwisolver-1.4.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:257af1622860e51b1a9d0ce387bf5c2c4f36a90594cb9514f55b074bcc787cfc"}, - {file = "kiwisolver-1.4.8-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69b5637c3f316cab1ec1c9a12b8c5f4750a4c4b71af9157645bf32830e39c03a"}, - {file = "kiwisolver-1.4.8-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:782bb86f245ec18009890e7cb8d13a5ef54dcf2ebe18ed65f795e635a96a1c6a"}, - {file = "kiwisolver-1.4.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc978a80a0db3a66d25767b03688f1147a69e6237175c0f4ffffaaedf744055a"}, - {file = "kiwisolver-1.4.8-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:36dbbfd34838500a31f52c9786990d00150860e46cd5041386f217101350f0d3"}, - {file = "kiwisolver-1.4.8-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:eaa973f1e05131de5ff3569bbba7f5fd07ea0595d3870ed4a526d486fe57fa1b"}, - {file = "kiwisolver-1.4.8-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a66f60f8d0c87ab7f59b6fb80e642ebb29fec354a4dfad687ca4092ae69d04f4"}, - {file = "kiwisolver-1.4.8-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:858416b7fb777a53f0c59ca08190ce24e9abbd3cffa18886a5781b8e3e26f65d"}, - {file = "kiwisolver-1.4.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:085940635c62697391baafaaeabdf3dd7a6c3643577dde337f4d66eba021b2b8"}, - {file = "kiwisolver-1.4.8-cp312-cp312-win_amd64.whl", hash = "sha256:01c3d31902c7db5fb6182832713d3b4122ad9317c2c5877d0539227d96bb2e50"}, - {file = "kiwisolver-1.4.8-cp312-cp312-win_arm64.whl", hash = "sha256:a3c44cb68861de93f0c4a8175fbaa691f0aa22550c331fefef02b618a9dcb476"}, - {file = "kiwisolver-1.4.8-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:1c8ceb754339793c24aee1c9fb2485b5b1f5bb1c2c214ff13368431e51fc9a09"}, - {file = "kiwisolver-1.4.8-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:54a62808ac74b5e55a04a408cda6156f986cefbcf0ada13572696b507cc92fa1"}, - {file = "kiwisolver-1.4.8-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:68269e60ee4929893aad82666821aaacbd455284124817af45c11e50a4b42e3c"}, - {file = 
"kiwisolver-1.4.8-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:34d142fba9c464bc3bbfeff15c96eab0e7310343d6aefb62a79d51421fcc5f1b"}, - {file = "kiwisolver-1.4.8-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3ddc373e0eef45b59197de815b1b28ef89ae3955e7722cc9710fb91cd77b7f47"}, - {file = "kiwisolver-1.4.8-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:77e6f57a20b9bd4e1e2cedda4d0b986ebd0216236f0106e55c28aea3d3d69b16"}, - {file = "kiwisolver-1.4.8-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:08e77738ed7538f036cd1170cbed942ef749137b1311fa2bbe2a7fda2f6bf3cc"}, - {file = "kiwisolver-1.4.8-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5ce1e481a74b44dd5e92ff03ea0cb371ae7a0268318e202be06c8f04f4f1246"}, - {file = "kiwisolver-1.4.8-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:fc2ace710ba7c1dfd1a3b42530b62b9ceed115f19a1656adefce7b1782a37794"}, - {file = "kiwisolver-1.4.8-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:3452046c37c7692bd52b0e752b87954ef86ee2224e624ef7ce6cb21e8c41cc1b"}, - {file = "kiwisolver-1.4.8-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:7e9a60b50fe8b2ec6f448fe8d81b07e40141bfced7f896309df271a0b92f80f3"}, - {file = "kiwisolver-1.4.8-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:918139571133f366e8362fa4a297aeba86c7816b7ecf0bc79168080e2bd79957"}, - {file = "kiwisolver-1.4.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e063ef9f89885a1d68dd8b2e18f5ead48653176d10a0e324e3b0030e3a69adeb"}, - {file = "kiwisolver-1.4.8-cp313-cp313-win_amd64.whl", hash = "sha256:a17b7c4f5b2c51bb68ed379defd608a03954a1845dfed7cc0117f1cc8a9b7fd2"}, - {file = "kiwisolver-1.4.8-cp313-cp313-win_arm64.whl", hash = "sha256:3cd3bc628b25f74aedc6d374d5babf0166a92ff1317f46267f12d2ed54bc1d30"}, - {file = "kiwisolver-1.4.8-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:370fd2df41660ed4e26b8c9d6bbcad668fbe2560462cba151a721d49e5b6628c"}, - {file = "kiwisolver-1.4.8-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:84a2f830d42707de1d191b9490ac186bf7997a9495d4e9072210a1296345f7dc"}, - {file = "kiwisolver-1.4.8-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:7a3ad337add5148cf51ce0b55642dc551c0b9d6248458a757f98796ca7348712"}, - {file = "kiwisolver-1.4.8-cp313-cp313t-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7506488470f41169b86d8c9aeff587293f530a23a23a49d6bc64dab66bedc71e"}, - {file = "kiwisolver-1.4.8-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f0121b07b356a22fb0414cec4666bbe36fd6d0d759db3d37228f496ed67c880"}, - {file = "kiwisolver-1.4.8-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d6d6bd87df62c27d4185de7c511c6248040afae67028a8a22012b010bc7ad062"}, - {file = "kiwisolver-1.4.8-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:291331973c64bb9cce50bbe871fb2e675c4331dab4f31abe89f175ad7679a4d7"}, - {file = "kiwisolver-1.4.8-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:893f5525bb92d3d735878ec00f781b2de998333659507d29ea4466208df37bed"}, - {file = "kiwisolver-1.4.8-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:b47a465040146981dc9db8647981b8cb96366fbc8d452b031e4f8fdffec3f26d"}, - {file = "kiwisolver-1.4.8-cp313-cp313t-musllinux_1_2_i686.whl", hash = 
"sha256:99cea8b9dd34ff80c521aef46a1dddb0dcc0283cf18bde6d756f1e6f31772165"}, - {file = "kiwisolver-1.4.8-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:151dffc4865e5fe6dafce5480fab84f950d14566c480c08a53c663a0020504b6"}, - {file = "kiwisolver-1.4.8-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:577facaa411c10421314598b50413aa1ebcf5126f704f1e5d72d7e4e9f020d90"}, - {file = "kiwisolver-1.4.8-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:be4816dc51c8a471749d664161b434912eee82f2ea66bd7628bd14583a833e85"}, - {file = "kiwisolver-1.4.8-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:e7a019419b7b510f0f7c9dceff8c5eae2392037eae483a7f9162625233802b0a"}, - {file = "kiwisolver-1.4.8-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:286b18e86682fd2217a48fc6be6b0f20c1d0ed10958d8dc53453ad58d7be0bf8"}, - {file = "kiwisolver-1.4.8-pp310-pypy310_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4191ee8dfd0be1c3666ccbac178c5a05d5f8d689bbe3fc92f3c4abec817f8fe0"}, - {file = "kiwisolver-1.4.8-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cd2785b9391f2873ad46088ed7599a6a71e762e1ea33e87514b1a441ed1da1c"}, - {file = "kiwisolver-1.4.8-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c07b29089b7ba090b6f1a669f1411f27221c3662b3a1b7010e67b59bb5a6f10b"}, - {file = "kiwisolver-1.4.8-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:65ea09a5a3faadd59c2ce96dc7bf0f364986a315949dc6374f04396b0d60e09b"}, - {file = "kiwisolver-1.4.8.tar.gz", hash = "sha256:23d5f023bdc8c7e54eb65f03ca5d5bb25b601eac4d7f1a042888a1f45237987e"}, + {file = "lark-1.3.1-py3-none-any.whl", hash = "sha256:c629b661023a014c37da873b4ff58a817398d12635d3bbb2c5a03be7fe5d1e12"}, + {file = "lark-1.3.1.tar.gz", hash = "sha256:b426a7a6d6d53189d318f2b6236ab5d6429eaf09259f1ca33eb716eed10d2905"}, ] +[package.extras] +atomic-cache = ["atomicwrites"] +interegular = ["interegular (>=0.3.1,<0.4.0)"] +nearley = ["js2py"] +regex = ["regex"] + [[package]] name = "litellm" version = "1.65.8" description = "Library to easily interface with LLM API providers" optional = false python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,>=3.8" +groups = ["main"] files = [ {file = "litellm-1.65.8-py3-none-any.whl", hash = "sha256:53323d8477bc6bf08e0744363a743f45197b6af789136a3e95099b3c5cb36b25"}, {file = "litellm-1.65.8.tar.gz", hash = "sha256:b486359953aaf97c754c30bd865981208f545cb4454932378595988f3314e7b7"}, @@ -2511,8 +3077,8 @@ tiktoken = ">=0.7.0" tokenizers = "*" [package.extras] -extra-proxy = ["azure-identity (>=1.15.0,<2.0.0)", "azure-keyvault-secrets (>=4.8.0,<5.0.0)", "google-cloud-kms (>=2.21.3,<3.0.0)", "prisma (==0.11.0)", "redisvl (>=0.4.1,<0.5.0)", "resend (>=0.8.0,<0.9.0)"] -proxy = ["PyJWT (>=2.8.0,<3.0.0)", "apscheduler (>=3.10.4,<4.0.0)", "backoff", "boto3 (==1.34.34)", "cryptography (>=43.0.1,<44.0.0)", "fastapi (>=0.115.5,<0.116.0)", "fastapi-sso (>=0.16.0,<0.17.0)", "gunicorn (>=23.0.0,<24.0.0)", "litellm-proxy-extras (==0.1.3)", "mcp (==1.5.0)", "orjson (>=3.9.7,<4.0.0)", "pynacl (>=1.5.0,<2.0.0)", "python-multipart (>=0.0.18,<0.0.19)", "pyyaml (>=6.0.1,<7.0.0)", "rq", "uvicorn (>=0.29.0,<0.30.0)", "uvloop (>=0.21.0,<0.22.0)", "websockets (>=13.1.0,<14.0.0)"] +extra-proxy = ["azure-identity (>=1.15.0,<2.0.0)", "azure-keyvault-secrets (>=4.8.0,<5.0.0)", "google-cloud-kms (>=2.21.3,<3.0.0)", "prisma (==0.11.0)", "redisvl (>=0.4.1,<0.5.0) ; python_version >= 
\"3.9\" and python_version < \"3.14\"", "resend (>=0.8.0,<0.9.0)"] +proxy = ["PyJWT (>=2.8.0,<3.0.0)", "apscheduler (>=3.10.4,<4.0.0)", "backoff", "boto3 (==1.34.34)", "cryptography (>=43.0.1,<44.0.0)", "fastapi (>=0.115.5,<0.116.0)", "fastapi-sso (>=0.16.0,<0.17.0)", "gunicorn (>=23.0.0,<24.0.0)", "litellm-proxy-extras (==0.1.3)", "mcp (==1.5.0) ; python_version >= \"3.10\"", "orjson (>=3.9.7,<4.0.0)", "pynacl (>=1.5.0,<2.0.0)", "python-multipart (>=0.0.18,<0.0.19)", "pyyaml (>=6.0.1,<7.0.0)", "rq", "uvicorn (>=0.29.0,<0.30.0)", "uvloop (>=0.21.0,<0.22.0)", "websockets (>=13.1.0,<14.0.0)"] [[package]] name = "mako" @@ -2520,6 +3086,7 @@ version = "1.3.10" description = "A super-fast templating language that borrows the best ideas from the existing templating languages." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "mako-1.3.10-py3-none-any.whl", hash = "sha256:baef24a52fc4fc514a0887ac600f9f1cff3d82c61d4d700a1fa84d597b88db59"}, {file = "mako-1.3.10.tar.gz", hash = "sha256:99579a6f39583fa7e5630a28c3c1f440e4e97a414b80372649c0ce338da2ea28"}, @@ -2535,13 +3102,14 @@ testing = ["pytest"] [[package]] name = "markdown-it-py" -version = "3.0.0" +version = "4.0.0" description = "Python port of markdown-it. Markdown parsing, done right!" optional = false -python-versions = ">=3.8" +python-versions = ">=3.10" +groups = ["main"] files = [ - {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, - {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, + {file = "markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147"}, + {file = "markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3"}, ] [package.dependencies] @@ -2549,82 +3117,110 @@ mdurl = ">=0.1,<1.0" [package.extras] benchmarking = ["psutil", "pytest", "pytest-benchmark"] -code-style = ["pre-commit (>=3.0,<4.0)"] -compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] +compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "markdown-it-pyrs", "mistletoe (>=1.0,<2.0)", "mistune (>=3.0,<4.0)", "panflute (>=2.3,<3.0)"] linkify = ["linkify-it-py (>=1,<3)"] -plugins = ["mdit-py-plugins"] +plugins = ["mdit-py-plugins (>=0.5.0)"] profiling = ["gprof2dot"] -rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] -testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] +rtd = ["ipykernel", "jupyter_sphinx", "mdit-py-plugins (>=0.5.0)", "myst-parser", "pyyaml", "sphinx", "sphinx-book-theme (>=1.0,<2.0)", "sphinx-copybutton", "sphinx-design"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions", "requests"] [[package]] name = "markupsafe" -version = "3.0.2" +version = "3.0.3" description = "Safely add untrusted strings to HTML/XML markup." 
optional = false python-versions = ">=3.9" -files = [ - {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8"}, - {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158"}, - {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38a9ef736c01fccdd6600705b09dc574584b89bea478200c5fbf112a6b0d5579"}, - {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbcb445fa71794da8f178f0f6d66789a28d7319071af7a496d4d507ed566270d"}, - {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57cb5a3cf367aeb1d316576250f65edec5bb3be939e9247ae594b4bcbc317dfb"}, - {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3809ede931876f5b2ec92eef964286840ed3540dadf803dd570c3b7e13141a3b"}, - {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e07c3764494e3776c602c1e78e298937c3315ccc9043ead7e685b7f2b8d47b3c"}, - {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b424c77b206d63d500bcb69fa55ed8d0e6a3774056bdc4839fc9298a7edca171"}, - {file = "MarkupSafe-3.0.2-cp310-cp310-win32.whl", hash = "sha256:fcabf5ff6eea076f859677f5f0b6b5c1a51e70a376b0579e0eadef8db48c6b50"}, - {file = "MarkupSafe-3.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:6af100e168aa82a50e186c82875a5893c5597a0c1ccdb0d8b40240b1f28b969a"}, - {file = "MarkupSafe-3.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d"}, - {file = "MarkupSafe-3.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93"}, - {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832"}, - {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84"}, - {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca"}, - {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798"}, - {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e"}, - {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4"}, - {file = "MarkupSafe-3.0.2-cp311-cp311-win32.whl", hash = "sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d"}, - {file = "MarkupSafe-3.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b"}, - {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf"}, - {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225"}, - {file = 
"MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028"}, - {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8"}, - {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c"}, - {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557"}, - {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22"}, - {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48"}, - {file = "MarkupSafe-3.0.2-cp312-cp312-win32.whl", hash = "sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30"}, - {file = "MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87"}, - {file = "MarkupSafe-3.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd"}, - {file = "MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430"}, - {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094"}, - {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396"}, - {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79"}, - {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a"}, - {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca"}, - {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c"}, - {file = "MarkupSafe-3.0.2-cp313-cp313-win32.whl", hash = "sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1"}, - {file = "MarkupSafe-3.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f"}, - {file = "MarkupSafe-3.0.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c"}, - {file = "MarkupSafe-3.0.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb"}, - {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c"}, - {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d"}, - {file = 
"MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe"}, - {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5"}, - {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a"}, - {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9"}, - {file = "MarkupSafe-3.0.2-cp313-cp313t-win32.whl", hash = "sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6"}, - {file = "MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eaa0a10b7f72326f1372a713e73c3f739b524b3af41feb43e4921cb529f5929a"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:48032821bbdf20f5799ff537c7ac3d1fba0ba032cfc06194faffa8cda8b560ff"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a9d3f5f0901fdec14d8d2f66ef7d035f2157240a433441719ac9a3fba440b13"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88b49a3b9ff31e19998750c38e030fc7bb937398b1f78cfa599aaef92d693144"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cfad01eed2c2e0c01fd0ecd2ef42c492f7f93902e39a42fc9ee1692961443a29"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1225beacc926f536dc82e45f8a4d68502949dc67eea90eab715dea3a21c1b5f0"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:3169b1eefae027567d1ce6ee7cae382c57fe26e82775f460f0b2778beaad66c0"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:eb7972a85c54febfb25b5c4b4f3af4dcc731994c7da0d8a0b4a6eb0640e1d178"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-win32.whl", hash = "sha256:8c4e8c3ce11e1f92f6536ff07154f9d49677ebaaafc32db9db4620bc11ed480f"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a"}, - {file = "markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0"}, +groups = ["main", "dev"] +files = [ + {file = "markupsafe-3.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2f981d352f04553a7171b8e44369f2af4055f888dfb147d55e42d29e29e74559"}, + {file = "markupsafe-3.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e1c1493fb6e50ab01d20a22826e57520f1284df32f2d8601fdd90b6304601419"}, + {file = "markupsafe-3.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1ba88449deb3de88bd40044603fafffb7bc2b055d626a330323a9ed736661695"}, + {file = "markupsafe-3.0.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f42d0984e947b8adf7dd6dde396e720934d12c506ce84eea8476409563607591"}, + {file = "markupsafe-3.0.3-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c0c0b3ade1c0b13b936d7970b1d37a57acde9199dc2aecc4c336773e1d86049c"}, + {file = 
"markupsafe-3.0.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:0303439a41979d9e74d18ff5e2dd8c43ed6c6001fd40e5bf2e43f7bd9bbc523f"}, + {file = "markupsafe-3.0.3-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:d2ee202e79d8ed691ceebae8e0486bd9a2cd4794cec4824e1c99b6f5009502f6"}, + {file = "markupsafe-3.0.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:177b5253b2834fe3678cb4a5f0059808258584c559193998be2601324fdeafb1"}, + {file = "markupsafe-3.0.3-cp310-cp310-win32.whl", hash = "sha256:2a15a08b17dd94c53a1da0438822d70ebcd13f8c3a95abe3a9ef9f11a94830aa"}, + {file = "markupsafe-3.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:c4ffb7ebf07cfe8931028e3e4c85f0357459a3f9f9490886198848f4fa002ec8"}, + {file = "markupsafe-3.0.3-cp310-cp310-win_arm64.whl", hash = "sha256:e2103a929dfa2fcaf9bb4e7c091983a49c9ac3b19c9061b6d5427dd7d14d81a1"}, + {file = "markupsafe-3.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1cc7ea17a6824959616c525620e387f6dd30fec8cb44f649e31712db02123dad"}, + {file = "markupsafe-3.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4bd4cd07944443f5a265608cc6aab442e4f74dff8088b0dfc8238647b8f6ae9a"}, + {file = "markupsafe-3.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b5420a1d9450023228968e7e6a9ce57f65d148ab56d2313fcd589eee96a7a50"}, + {file = "markupsafe-3.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0bf2a864d67e76e5c9a34dc26ec616a66b9888e25e7b9460e1c76d3293bd9dbf"}, + {file = "markupsafe-3.0.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc51efed119bc9cfdf792cdeaa4d67e8f6fcccab66ed4bfdd6bde3e59bfcbb2f"}, + {file = "markupsafe-3.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:068f375c472b3e7acbe2d5318dea141359e6900156b5b2ba06a30b169086b91a"}, + {file = "markupsafe-3.0.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7be7b61bb172e1ed687f1754f8e7484f1c8019780f6f6b0786e76bb01c2ae115"}, + {file = "markupsafe-3.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9e130248f4462aaa8e2552d547f36ddadbeaa573879158d721bbd33dfe4743a"}, + {file = "markupsafe-3.0.3-cp311-cp311-win32.whl", hash = "sha256:0db14f5dafddbb6d9208827849fad01f1a2609380add406671a26386cdf15a19"}, + {file = "markupsafe-3.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:de8a88e63464af587c950061a5e6a67d3632e36df62b986892331d4620a35c01"}, + {file = "markupsafe-3.0.3-cp311-cp311-win_arm64.whl", hash = "sha256:3b562dd9e9ea93f13d53989d23a7e775fdfd1066c33494ff43f5418bc8c58a5c"}, + {file = "markupsafe-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d53197da72cc091b024dd97249dfc7794d6a56530370992a5e1a08983ad9230e"}, + {file = "markupsafe-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1872df69a4de6aead3491198eaf13810b565bdbeec3ae2dc8780f14458ec73ce"}, + {file = "markupsafe-3.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d"}, + {file = "markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d"}, + {file = "markupsafe-3.0.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a"}, + {file = "markupsafe-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b"}, + {file = "markupsafe-3.0.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f"}, + {file = "markupsafe-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b"}, + {file = "markupsafe-3.0.3-cp312-cp312-win32.whl", hash = "sha256:d88b440e37a16e651bda4c7c2b930eb586fd15ca7406cb39e211fcff3bf3017d"}, + {file = "markupsafe-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:26a5784ded40c9e318cfc2bdb30fe164bdb8665ded9cd64d500a34fb42067b1c"}, + {file = "markupsafe-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f"}, + {file = "markupsafe-3.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e1cf1972137e83c5d4c136c43ced9ac51d0e124706ee1c8aa8532c1287fa8795"}, + {file = "markupsafe-3.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:116bb52f642a37c115f517494ea5feb03889e04df47eeff5b130b1808ce7c219"}, + {file = "markupsafe-3.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:133a43e73a802c5562be9bbcd03d090aa5a1fe899db609c29e8c8d815c5f6de6"}, + {file = "markupsafe-3.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccfcd093f13f0f0b7fdd0f198b90053bf7b2f02a3927a30e63f3ccc9df56b676"}, + {file = "markupsafe-3.0.3-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:509fa21c6deb7a7a273d629cf5ec029bc209d1a51178615ddf718f5918992ab9"}, + {file = "markupsafe-3.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4afe79fb3de0b7097d81da19090f4df4f8d3a2b3adaa8764138aac2e44f3af1"}, + {file = "markupsafe-3.0.3-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:795e7751525cae078558e679d646ae45574b47ed6e7771863fcc079a6171a0fc"}, + {file = "markupsafe-3.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8485f406a96febb5140bfeca44a73e3ce5116b2501ac54fe953e488fb1d03b12"}, + {file = "markupsafe-3.0.3-cp313-cp313-win32.whl", hash = "sha256:bdd37121970bfd8be76c5fb069c7751683bdf373db1ed6c010162b2a130248ed"}, + {file = "markupsafe-3.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:9a1abfdc021a164803f4d485104931fb8f8c1efd55bc6b748d2f5774e78b62c5"}, + {file = "markupsafe-3.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:7e68f88e5b8799aa49c85cd116c932a1ac15caaa3f5db09087854d218359e485"}, + {file = "markupsafe-3.0.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:218551f6df4868a8d527e3062d0fb968682fe92054e89978594c28e642c43a73"}, + {file = "markupsafe-3.0.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3524b778fe5cfb3452a09d31e7b5adefeea8c5be1d43c4f810ba09f2ceb29d37"}, + {file = "markupsafe-3.0.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4e885a3d1efa2eadc93c894a21770e4bc67899e3543680313b09f139e149ab19"}, + {file = "markupsafe-3.0.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8709b08f4a89aa7586de0aadc8da56180242ee0ada3999749b183aa23df95025"}, + {file = "markupsafe-3.0.3-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b8512a91625c9b3da6f127803b166b629725e68af71f8184ae7e7d54686a56d6"}, + {file = "markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9b79b7a16f7fedff2495d684f2b59b0457c3b493778c9eed31111be64d58279f"}, + {file = 
"markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:12c63dfb4a98206f045aa9563db46507995f7ef6d83b2f68eda65c307c6829eb"}, + {file = "markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8f71bc33915be5186016f675cd83a1e08523649b0e33efdb898db577ef5bb009"}, + {file = "markupsafe-3.0.3-cp313-cp313t-win32.whl", hash = "sha256:69c0b73548bc525c8cb9a251cddf1931d1db4d2258e9599c28c07ef3580ef354"}, + {file = "markupsafe-3.0.3-cp313-cp313t-win_amd64.whl", hash = "sha256:1b4b79e8ebf6b55351f0d91fe80f893b4743f104bff22e90697db1590e47a218"}, + {file = "markupsafe-3.0.3-cp313-cp313t-win_arm64.whl", hash = "sha256:ad2cf8aa28b8c020ab2fc8287b0f823d0a7d8630784c31e9ee5edea20f406287"}, + {file = "markupsafe-3.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:eaa9599de571d72e2daf60164784109f19978b327a3910d3e9de8c97b5b70cfe"}, + {file = "markupsafe-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c47a551199eb8eb2121d4f0f15ae0f923d31350ab9280078d1e5f12b249e0026"}, + {file = "markupsafe-3.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f34c41761022dd093b4b6896d4810782ffbabe30f2d443ff5f083e0cbbb8c737"}, + {file = "markupsafe-3.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:457a69a9577064c05a97c41f4e65148652db078a3a509039e64d3467b9e7ef97"}, + {file = "markupsafe-3.0.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e8afc3f2ccfa24215f8cb28dcf43f0113ac3c37c2f0f0806d8c70e4228c5cf4d"}, + {file = "markupsafe-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ec15a59cf5af7be74194f7ab02d0f59a62bdcf1a537677ce67a2537c9b87fcda"}, + {file = "markupsafe-3.0.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:0eb9ff8191e8498cca014656ae6b8d61f39da5f95b488805da4bb029cccbfbaf"}, + {file = "markupsafe-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2713baf880df847f2bece4230d4d094280f4e67b1e813eec43b4c0e144a34ffe"}, + {file = "markupsafe-3.0.3-cp314-cp314-win32.whl", hash = "sha256:729586769a26dbceff69f7a7dbbf59ab6572b99d94576a5592625d5b411576b9"}, + {file = "markupsafe-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:bdc919ead48f234740ad807933cdf545180bfbe9342c2bb451556db2ed958581"}, + {file = "markupsafe-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:5a7d5dc5140555cf21a6fefbdbf8723f06fcd2f63ef108f2854de715e4422cb4"}, + {file = "markupsafe-3.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1353ef0c1b138e1907ae78e2f6c63ff67501122006b0f9abad68fda5f4ffc6ab"}, + {file = "markupsafe-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1085e7fbddd3be5f89cc898938f42c0b3c711fdcb37d75221de2666af647c175"}, + {file = "markupsafe-3.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1b52b4fb9df4eb9ae465f8d0c228a00624de2334f216f178a995ccdcf82c4634"}, + {file = "markupsafe-3.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed51ac40f757d41b7c48425901843666a6677e3e8eb0abcff09e4ba6e664f50"}, + {file = "markupsafe-3.0.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f190daf01f13c72eac4efd5c430a8de82489d9cff23c364c3ea822545032993e"}, + {file = "markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e56b7d45a839a697b5eb268c82a71bd8c7f6c94d6fd50c3d577fa39a9f1409f5"}, + {file = "markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = 
"sha256:f3e98bb3798ead92273dc0e5fd0f31ade220f59a266ffd8a4f6065e0a3ce0523"}, + {file = "markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5678211cb9333a6468fb8d8be0305520aa073f50d17f089b5b4b477ea6e67fdc"}, + {file = "markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d"}, + {file = "markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9"}, + {file = "markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa"}, + {file = "markupsafe-3.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:15d939a21d546304880945ca1ecb8a039db6b4dc49b2c5a400387cdae6a62e26"}, + {file = "markupsafe-3.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f71a396b3bf33ecaa1626c255855702aca4d3d9fea5e051b41ac59a9c1c41edc"}, + {file = "markupsafe-3.0.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f4b68347f8c5eab4a13419215bdfd7f8c9b19f2b25520968adfad23eb0ce60c"}, + {file = "markupsafe-3.0.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8fc20152abba6b83724d7ff268c249fa196d8259ff481f3b1476383f8f24e42"}, + {file = "markupsafe-3.0.3-cp39-cp39-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:949b8d66bc381ee8b007cd945914c721d9aba8e27f71959d750a46f7c282b20b"}, + {file = "markupsafe-3.0.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:3537e01efc9d4dccdf77221fb1cb3b8e1a38d5428920e0657ce299b20324d758"}, + {file = "markupsafe-3.0.3-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:591ae9f2a647529ca990bc681daebdd52c8791ff06c2bfa05b65163e28102ef2"}, + {file = "markupsafe-3.0.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a320721ab5a1aba0a233739394eb907f8c8da5c98c9181d1161e77a0c8e36f2d"}, + {file = "markupsafe-3.0.3-cp39-cp39-win32.whl", hash = "sha256:df2449253ef108a379b8b5d6b43f4b1a8e81a061d6537becd5582fba5f9196d7"}, + {file = "markupsafe-3.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:7c3fb7d25180895632e5d3148dbdc29ea38ccb7fd210aa27acbd1201a1902c6e"}, + {file = "markupsafe-3.0.3-cp39-cp39-win_arm64.whl", hash = "sha256:38664109c14ffc9e7437e86b4dceb442b0096dfe3541d7864d9cbe1da4cf36c8"}, + {file = "markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698"}, ] [[package]] @@ -2633,6 +3229,7 @@ version = "3.26.1" description = "A lightweight library for converting complex datatypes to and from native Python datatypes." 
optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "marshmallow-3.26.1-py3-none-any.whl", hash = "sha256:3350409f20a70a7e4e11a27661187b77cdcaeb20abca41c1454fe33636bea09c"}, {file = "marshmallow-3.26.1.tar.gz", hash = "sha256:e6d8affb6cb61d39d26402096dc0aee12d5a26d490a121f118d2e81dc0719dc6"}, @@ -2648,45 +3245,67 @@ tests = ["pytest", "simplejson"] [[package]] name = "matplotlib" -version = "3.10.3" +version = "3.10.7" description = "Python plotting package" optional = false python-versions = ">=3.10" -files = [ - {file = "matplotlib-3.10.3-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:213fadd6348d106ca7db99e113f1bea1e65e383c3ba76e8556ba4a3054b65ae7"}, - {file = "matplotlib-3.10.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d3bec61cb8221f0ca6313889308326e7bb303d0d302c5cc9e523b2f2e6c73deb"}, - {file = "matplotlib-3.10.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c21ae75651c0231b3ba014b6d5e08fb969c40cdb5a011e33e99ed0c9ea86ecb"}, - {file = "matplotlib-3.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a49e39755580b08e30e3620efc659330eac5d6534ab7eae50fa5e31f53ee4e30"}, - {file = "matplotlib-3.10.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cf4636203e1190871d3a73664dea03d26fb019b66692cbfd642faafdad6208e8"}, - {file = "matplotlib-3.10.3-cp310-cp310-win_amd64.whl", hash = "sha256:fd5641a9bb9d55f4dd2afe897a53b537c834b9012684c8444cc105895c8c16fd"}, - {file = "matplotlib-3.10.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:0ef061f74cd488586f552d0c336b2f078d43bc00dc473d2c3e7bfee2272f3fa8"}, - {file = "matplotlib-3.10.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d96985d14dc5f4a736bbea4b9de9afaa735f8a0fc2ca75be2fa9e96b2097369d"}, - {file = "matplotlib-3.10.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c5f0283da91e9522bdba4d6583ed9d5521566f63729ffb68334f86d0bb98049"}, - {file = "matplotlib-3.10.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdfa07c0ec58035242bc8b2c8aae37037c9a886370eef6850703d7583e19964b"}, - {file = "matplotlib-3.10.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c0b9849a17bce080a16ebcb80a7b714b5677d0ec32161a2cc0a8e5a6030ae220"}, - {file = "matplotlib-3.10.3-cp311-cp311-win_amd64.whl", hash = "sha256:eef6ed6c03717083bc6d69c2d7ee8624205c29a8e6ea5a31cd3492ecdbaee1e1"}, - {file = "matplotlib-3.10.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:0ab1affc11d1f495ab9e6362b8174a25afc19c081ba5b0775ef00533a4236eea"}, - {file = "matplotlib-3.10.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2a818d8bdcafa7ed2eed74487fdb071c09c1ae24152d403952adad11fa3c65b4"}, - {file = "matplotlib-3.10.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:748ebc3470c253e770b17d8b0557f0aa85cf8c63fd52f1a61af5b27ec0b7ffee"}, - {file = "matplotlib-3.10.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed70453fd99733293ace1aec568255bc51c6361cb0da94fa5ebf0649fdb2150a"}, - {file = "matplotlib-3.10.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:dbed9917b44070e55640bd13419de83b4c918e52d97561544814ba463811cbc7"}, - {file = "matplotlib-3.10.3-cp312-cp312-win_amd64.whl", hash = "sha256:cf37d8c6ef1a48829443e8ba5227b44236d7fcaf7647caa3178a4ff9f7a5be05"}, - {file = "matplotlib-3.10.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9f2efccc8dcf2b86fc4ee849eea5dcaecedd0773b30f47980dc0cbeabf26ec84"}, - {file = 
"matplotlib-3.10.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3ddbba06a6c126e3301c3d272a99dcbe7f6c24c14024e80307ff03791a5f294e"}, - {file = "matplotlib-3.10.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:748302b33ae9326995b238f606e9ed840bf5886ebafcb233775d946aa8107a15"}, - {file = "matplotlib-3.10.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a80fcccbef63302c0efd78042ea3c2436104c5b1a4d3ae20f864593696364ac7"}, - {file = "matplotlib-3.10.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:55e46cbfe1f8586adb34f7587c3e4f7dedc59d5226719faf6cb54fc24f2fd52d"}, - {file = "matplotlib-3.10.3-cp313-cp313-win_amd64.whl", hash = "sha256:151d89cb8d33cb23345cd12490c76fd5d18a56581a16d950b48c6ff19bb2ab93"}, - {file = "matplotlib-3.10.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:c26dd9834e74d164d06433dc7be5d75a1e9890b926b3e57e74fa446e1a62c3e2"}, - {file = "matplotlib-3.10.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:24853dad5b8c84c8c2390fc31ce4858b6df504156893292ce8092d190ef8151d"}, - {file = "matplotlib-3.10.3-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68f7878214d369d7d4215e2a9075fef743be38fa401d32e6020bab2dfabaa566"}, - {file = "matplotlib-3.10.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6929fc618cb6db9cb75086f73b3219bbb25920cb24cee2ea7a12b04971a4158"}, - {file = "matplotlib-3.10.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6c7818292a5cc372a2dc4c795e5c356942eb8350b98ef913f7fda51fe175ac5d"}, - {file = "matplotlib-3.10.3-cp313-cp313t-win_amd64.whl", hash = "sha256:4f23ffe95c5667ef8a2b56eea9b53db7f43910fa4a2d5472ae0f72b64deab4d5"}, - {file = "matplotlib-3.10.3-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:86ab63d66bbc83fdb6733471d3bff40897c1e9921cba112accd748eee4bce5e4"}, - {file = "matplotlib-3.10.3-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:a48f9c08bf7444b5d2391a83e75edb464ccda3c380384b36532a0962593a1751"}, - {file = "matplotlib-3.10.3-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb73d8aa75a237457988f9765e4dfe1c0d2453c5ca4eabc897d4309672c8e014"}, - {file = "matplotlib-3.10.3.tar.gz", hash = "sha256:2f82d2c5bb7ae93aaaa4cd42aca65d76ce6376f83304fa3a630b569aca274df0"}, +groups = ["main"] +files = [ + {file = "matplotlib-3.10.7-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:7ac81eee3b7c266dd92cee1cd658407b16c57eed08c7421fa354ed68234de380"}, + {file = "matplotlib-3.10.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:667ecd5d8d37813a845053d8f5bf110b534c3c9f30e69ebd25d4701385935a6d"}, + {file = "matplotlib-3.10.7-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cc1c51b846aca49a5a8b44fbba6a92d583a35c64590ad9e1e950dc88940a4297"}, + {file = "matplotlib-3.10.7-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4a11c2e9e72e7de09b7b72e62f3df23317c888299c875e2b778abf1eda8c0a42"}, + {file = "matplotlib-3.10.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f19410b486fdd139885ace124e57f938c1e6a3210ea13dd29cab58f5d4bc12c7"}, + {file = "matplotlib-3.10.7-cp310-cp310-win_amd64.whl", hash = "sha256:b498e9e4022f93de2d5a37615200ca01297ceebbb56fe4c833f46862a490f9e3"}, + {file = "matplotlib-3.10.7-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:53b492410a6cd66c7a471de6c924f6ede976e963c0f3097a3b7abfadddc67d0a"}, + {file = "matplotlib-3.10.7-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:d9749313deb729f08207718d29c86246beb2ea3fdba753595b55901dee5d2fd6"}, + {file = "matplotlib-3.10.7-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2222c7ba2cbde7fe63032769f6eb7e83ab3227f47d997a8453377709b7fe3a5a"}, + {file = "matplotlib-3.10.7-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e91f61a064c92c307c5a9dc8c05dc9f8a68f0a3be199d9a002a0622e13f874a1"}, + {file = "matplotlib-3.10.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6f1851eab59ca082c95df5a500106bad73672645625e04538b3ad0f69471ffcc"}, + {file = "matplotlib-3.10.7-cp311-cp311-win_amd64.whl", hash = "sha256:6516ce375109c60ceec579e699524e9d504cd7578506f01150f7a6bc174a775e"}, + {file = "matplotlib-3.10.7-cp311-cp311-win_arm64.whl", hash = "sha256:b172db79759f5f9bc13ef1c3ef8b9ee7b37b0247f987fbbbdaa15e4f87fd46a9"}, + {file = "matplotlib-3.10.7-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7a0edb7209e21840e8361e91ea84ea676658aa93edd5f8762793dec77a4a6748"}, + {file = "matplotlib-3.10.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c380371d3c23e0eadf8ebff114445b9f970aff2010198d498d4ab4c3b41eea4f"}, + {file = "matplotlib-3.10.7-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d5f256d49fea31f40f166a5e3131235a5d2f4b7f44520b1cf0baf1ce568ccff0"}, + {file = "matplotlib-3.10.7-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:11ae579ac83cdf3fb72573bb89f70e0534de05266728740d478f0f818983c695"}, + {file = "matplotlib-3.10.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4c14b6acd16cddc3569a2d515cfdd81c7a68ac5639b76548cfc1a9e48b20eb65"}, + {file = "matplotlib-3.10.7-cp312-cp312-win_amd64.whl", hash = "sha256:0d8c32b7ea6fb80b1aeff5a2ceb3fb9778e2759e899d9beff75584714afcc5ee"}, + {file = "matplotlib-3.10.7-cp312-cp312-win_arm64.whl", hash = "sha256:5f3f6d315dcc176ba7ca6e74c7768fb7e4cf566c49cb143f6bc257b62e634ed8"}, + {file = "matplotlib-3.10.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1d9d3713a237970569156cfb4de7533b7c4eacdd61789726f444f96a0d28f57f"}, + {file = "matplotlib-3.10.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:37a1fea41153dd6ee061d21ab69c9cf2cf543160b1b85d89cd3d2e2a7902ca4c"}, + {file = "matplotlib-3.10.7-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b3c4ea4948d93c9c29dc01c0c23eef66f2101bf75158c291b88de6525c55c3d1"}, + {file = "matplotlib-3.10.7-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:22df30ffaa89f6643206cf13877191c63a50e8f800b038bc39bee9d2d4957632"}, + {file = "matplotlib-3.10.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b69676845a0a66f9da30e87f48be36734d6748024b525ec4710be40194282c84"}, + {file = "matplotlib-3.10.7-cp313-cp313-win_amd64.whl", hash = "sha256:744991e0cc863dd669c8dc9136ca4e6e0082be2070b9d793cbd64bec872a6815"}, + {file = "matplotlib-3.10.7-cp313-cp313-win_arm64.whl", hash = "sha256:fba2974df0bf8ce3c995fa84b79cde38326e0f7b5409e7a3a481c1141340bcf7"}, + {file = "matplotlib-3.10.7-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:932c55d1fa7af4423422cb6a492a31cbcbdbe68fd1a9a3f545aa5e7a143b5355"}, + {file = "matplotlib-3.10.7-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5e38c2d581d62ee729a6e144c47a71b3f42fb4187508dbbf4fe71d5612c3433b"}, + {file = "matplotlib-3.10.7-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:786656bb13c237bbcebcd402f65f44dd61ead60ee3deb045af429d889c8dbc67"}, + {file = 
"matplotlib-3.10.7-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:09d7945a70ea43bf9248f4b6582734c2fe726723204a76eca233f24cffc7ef67"}, + {file = "matplotlib-3.10.7-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:d0b181e9fa8daf1d9f2d4c547527b167cb8838fc587deabca7b5c01f97199e84"}, + {file = "matplotlib-3.10.7-cp313-cp313t-win_amd64.whl", hash = "sha256:31963603041634ce1a96053047b40961f7a29eb8f9a62e80cc2c0427aa1d22a2"}, + {file = "matplotlib-3.10.7-cp313-cp313t-win_arm64.whl", hash = "sha256:aebed7b50aa6ac698c90f60f854b47e48cd2252b30510e7a1feddaf5a3f72cbf"}, + {file = "matplotlib-3.10.7-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d883460c43e8c6b173fef244a2341f7f7c0e9725c7fe68306e8e44ed9c8fb100"}, + {file = "matplotlib-3.10.7-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:07124afcf7a6504eafcb8ce94091c5898bbdd351519a1beb5c45f7a38c67e77f"}, + {file = "matplotlib-3.10.7-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c17398b709a6cce3d9fdb1595c33e356d91c098cd9486cb2cc21ea2ea418e715"}, + {file = "matplotlib-3.10.7-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7146d64f561498764561e9cd0ed64fcf582e570fc519e6f521e2d0cfd43365e1"}, + {file = "matplotlib-3.10.7-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:90ad854c0a435da3104c01e2c6f0028d7e719b690998a2333d7218db80950722"}, + {file = "matplotlib-3.10.7-cp314-cp314-win_amd64.whl", hash = "sha256:4645fc5d9d20ffa3a39361fcdbcec731382763b623b72627806bf251b6388866"}, + {file = "matplotlib-3.10.7-cp314-cp314-win_arm64.whl", hash = "sha256:9257be2f2a03415f9105c486d304a321168e61ad450f6153d77c69504ad764bb"}, + {file = "matplotlib-3.10.7-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1e4bbad66c177a8fdfa53972e5ef8be72a5f27e6a607cec0d8579abd0f3102b1"}, + {file = "matplotlib-3.10.7-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:d8eb7194b084b12feb19142262165832fc6ee879b945491d1c3d4660748020c4"}, + {file = "matplotlib-3.10.7-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b4d41379b05528091f00e1728004f9a8d7191260f3862178b88e8fd770206318"}, + {file = "matplotlib-3.10.7-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4a74f79fafb2e177f240579bc83f0b60f82cc47d2f1d260f422a0627207008ca"}, + {file = "matplotlib-3.10.7-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:702590829c30aada1e8cef0568ddbffa77ca747b4d6e36c6d173f66e301f89cc"}, + {file = "matplotlib-3.10.7-cp314-cp314t-win_amd64.whl", hash = "sha256:f79d5de970fc90cd5591f60053aecfce1fcd736e0303d9f0bf86be649fa68fb8"}, + {file = "matplotlib-3.10.7-cp314-cp314t-win_arm64.whl", hash = "sha256:cb783436e47fcf82064baca52ce748af71725d0352e1d31564cbe9c95df92b9c"}, + {file = "matplotlib-3.10.7-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5c09cf8f2793f81368f49f118b6f9f937456362bee282eac575cca7f84cda537"}, + {file = "matplotlib-3.10.7-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:de66744b2bb88d5cd27e80dfc2ec9f0517d0a46d204ff98fe9e5f2864eb67657"}, + {file = "matplotlib-3.10.7-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:53cc80662dd197ece414dd5b66e07370201515a3eaf52e7c518c68c16814773b"}, + {file = "matplotlib-3.10.7-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:15112bcbaef211bd663fa935ec33313b948e214454d949b723998a43357b17b0"}, + {file = "matplotlib-3.10.7-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = 
"sha256:d2a959c640cdeecdd2ec3136e8ea0441da59bcaf58d67e9c590740addba2cb68"}, + {file = "matplotlib-3.10.7-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3886e47f64611046bc1db523a09dd0a0a6bed6081e6f90e13806dd1d1d1b5e91"}, + {file = "matplotlib-3.10.7.tar.gz", hash = "sha256:a06ba7e2a2ef9131c79c49e63dad355d2d878413a0376c1727c8b9335ff731c7"}, ] [package.dependencies] @@ -2697,7 +3316,7 @@ kiwisolver = ">=1.3.1" numpy = ">=1.23" packaging = ">=20.0" pillow = ">=8" -pyparsing = ">=2.3.1" +pyparsing = ">=3" python-dateutil = ">=2.7" [package.extras] @@ -2705,24 +3324,29 @@ dev = ["meson-python (>=0.13.1,<0.17.0)", "pybind11 (>=2.13.2,!=2.13.3)", "setup [[package]] name = "matplotlib-inline" -version = "0.1.7" +version = "0.2.1" description = "Inline Matplotlib backend for Jupyter" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" +groups = ["dev"] files = [ - {file = "matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca"}, - {file = "matplotlib_inline-0.1.7.tar.gz", hash = "sha256:8423b23ec666be3d16e16b60bdd8ac4e86e840ebd1dd11a30b9f117f2fa0ab90"}, + {file = "matplotlib_inline-0.2.1-py3-none-any.whl", hash = "sha256:d56ce5156ba6085e00a9d54fead6ed29a9c47e215cd1bba2e976ef39f5710a76"}, + {file = "matplotlib_inline-0.2.1.tar.gz", hash = "sha256:e1ee949c340d771fc39e241ea75683deb94762c8fa5f2927ec57c83c4dffa9fe"}, ] [package.dependencies] traitlets = "*" +[package.extras] +test = ["flake8", "nbdime", "nbval", "notebook", "pytest"] + [[package]] name = "mdurl" version = "0.1.2" description = "Markdown URL utilities" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, @@ -2730,37 +3354,43 @@ files = [ [[package]] name = "mistune" -version = "3.1.3" +version = "3.1.4" description = "A sane and fast Markdown parser with useful plugins and renderers" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ - {file = "mistune-3.1.3-py3-none-any.whl", hash = "sha256:1a32314113cff28aa6432e99e522677c8587fd83e3d51c29b82a52409c842bd9"}, - {file = "mistune-3.1.3.tar.gz", hash = "sha256:a7035c21782b2becb6be62f8f25d3df81ccb4d6fa477a6525b15af06539f02a0"}, + {file = "mistune-3.1.4-py3-none-any.whl", hash = "sha256:93691da911e5d9d2e23bc54472892aff676df27a75274962ff9edc210364266d"}, + {file = "mistune-3.1.4.tar.gz", hash = "sha256:b5a7f801d389f724ec702840c11d8fc48f2b33519102fc7ee739e8177b672164"}, ] [[package]] name = "mlflow" -version = "3.1.0rc0" +version = "3.6.0" description = "MLflow is an open source platform for the complete machine learning lifecycle" optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" +groups = ["main"] files = [ - {file = "mlflow-3.1.0rc0-py3-none-any.whl", hash = "sha256:5a45aa64e03e9036bf6af169f62c60a13de5cc6bbf6201381a2ca18e6d3f316e"}, - {file = "mlflow-3.1.0rc0.tar.gz", hash = "sha256:f7b2b7476b0f483b6ababb7e110d7c274b19136ee86dbdfcc60d43043eaff497"}, + {file = "mlflow-3.6.0-py3-none-any.whl", hash = "sha256:04d1691facd412be8e61b963fad859286cfeb2dbcafaea294e6aa0b83a15fc04"}, + {file = "mlflow-3.6.0.tar.gz", hash = "sha256:d945d259b5c6b551a9f26846db8979fd84c78114a027b77ada3298f821a9b0e1"}, ] [package.dependencies] alembic = "<1.10.0 || >1.10.0,<2" +cryptography = ">=43.0.0,<47" 
docker = ">=4.0.0,<8" Flask = "<4" +Flask-CORS = "<7" graphene = "<4" gunicorn = {version = "<24", markers = "platform_system != \"Windows\""} +huey = ">=2.5.0,<3" matplotlib = "<4" -mlflow-skinny = "3.1.0rc0" +mlflow-skinny = "3.6.0" +mlflow-tracing = "3.6.0" numpy = "<3" pandas = "<3" -pyarrow = ">=4.0.0,<21" +pyarrow = ">=4.0.0,<23" scikit-learn = "<2" scipy = "<2" sqlalchemy = ">=1.4.0,<3" @@ -2769,29 +3399,30 @@ waitress = {version = "<4", markers = "platform_system == \"Windows\""} [package.extras] aliyun-oss = ["aliyunstoreplugin"] auth = ["Flask-WTF (<2)"] -databricks = ["azure-storage-file-datalake (>12)", "boto3 (>1)", "botocore", "google-cloud-storage (>=1.30.0)"] +databricks = ["azure-storage-file-datalake (>12)", "boto3 (>1)", "botocore", "databricks-agents (>=1.2.0,<2.0)", "google-cloud-storage (>=1.30.0)"] extras = ["azureml-core (>=1.2.0)", "boto3", "botocore", "google-cloud-storage (>=1.30.0)", "kubernetes", "prometheus-flask-exporter", "pyarrow", "pysftp", "requests-auth-aws-sigv4", "virtualenv"] gateway = ["aiohttp (<4)", "boto3 (>=1.28.56,<2)", "fastapi (<1)", "slowapi (>=0.1.9,<1)", "tiktoken (<1)", "uvicorn[standard] (<1)", "watchfiles (<2)"] genai = ["aiohttp (<4)", "boto3 (>=1.28.56,<2)", "fastapi (<1)", "slowapi (>=0.1.9,<1)", "tiktoken (<1)", "uvicorn[standard] (<1)", "watchfiles (<2)"] jfrog = ["mlflow-jfrog-plugin"] -langchain = ["langchain (>=0.1.0,<=0.3.25)"] -mlserver = ["mlserver (>=1.2.0,!=1.3.1)", "mlserver-mlflow (>=1.2.0,!=1.3.1)"] +langchain = ["langchain (>=0.3.7,<=0.3.27)"] +mcp = ["fastmcp (>=2.0.0,<3)"] +mlserver = ["mlserver (>=1.2.0,!=1.3.1,<2.0.0)", "mlserver-mlflow (>=1.2.0,!=1.3.1,<2.0.0)"] sqlserver = ["mlflow-dbstore"] -xethub = ["mlflow-xethub"] [[package]] name = "mlflow-skinny" -version = "3.1.0rc0" +version = "3.6.0" description = "MLflow is an open source platform for the complete machine learning lifecycle" optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" +groups = ["main"] files = [ - {file = "mlflow_skinny-3.1.0rc0-py3-none-any.whl", hash = "sha256:d80fbc3cd70715c76033d9efb30a050ef993ba40575ca18a8ffcd938fd2be2ab"}, - {file = "mlflow_skinny-3.1.0rc0.tar.gz", hash = "sha256:f6f24a5ce7aa27ef19176a9c2cc93397cc0b8a57f6b190dc407bd9981c2e6dc6"}, + {file = "mlflow_skinny-3.6.0-py3-none-any.whl", hash = "sha256:c83b34fce592acb2cc6bddcb507587a6d9ef3f590d9e7a8658c85e0980596d78"}, + {file = "mlflow_skinny-3.6.0.tar.gz", hash = "sha256:cc04706b5b6faace9faf95302a6e04119485e1bfe98ddc9b85b81984e80944b6"}, ] [package.dependencies] -cachetools = ">=5.0.0,<6" +cachetools = ">=5.0.0,<7" click = ">=7.0,<9" cloudpickle = "<4" databricks-sdk = ">=0.20.0,<1" @@ -2799,10 +3430,12 @@ fastapi = "<1" gitpython = ">=3.1.9,<4" importlib_metadata = ">=3.7.0,<4.7.0 || >4.7.0,<9" opentelemetry-api = ">=1.9.0,<3" +opentelemetry-proto = ">=1.9.0,<3" opentelemetry-sdk = ">=1.9.0,<3" packaging = "<26" protobuf = ">=3.12.0,<7" -pydantic = ">=1.10.8,<3" +pydantic = ">=2.0.0,<3" +python-dotenv = ">=0.19.0,<2" pyyaml = ">=5.1,<7" requests = ">=2.17.3,<3" sqlparse = ">=0.4.0,<1" @@ -2812,15 +3445,37 @@ uvicorn = "<1" [package.extras] aliyun-oss = ["aliyunstoreplugin"] auth = ["Flask-WTF (<2)"] -databricks = ["azure-storage-file-datalake (>12)", "boto3 (>1)", "botocore", "google-cloud-storage (>=1.30.0)"] +databricks = ["azure-storage-file-datalake (>12)", "boto3 (>1)", "botocore", "databricks-agents (>=1.2.0,<2.0)", "google-cloud-storage (>=1.30.0)"] extras = ["azureml-core (>=1.2.0)", "boto3", "botocore", "google-cloud-storage (>=1.30.0)", 
"kubernetes", "prometheus-flask-exporter", "pyarrow", "pysftp", "requests-auth-aws-sigv4", "virtualenv"] gateway = ["aiohttp (<4)", "boto3 (>=1.28.56,<2)", "fastapi (<1)", "slowapi (>=0.1.9,<1)", "tiktoken (<1)", "uvicorn[standard] (<1)", "watchfiles (<2)"] genai = ["aiohttp (<4)", "boto3 (>=1.28.56,<2)", "fastapi (<1)", "slowapi (>=0.1.9,<1)", "tiktoken (<1)", "uvicorn[standard] (<1)", "watchfiles (<2)"] jfrog = ["mlflow-jfrog-plugin"] -langchain = ["langchain (>=0.1.0,<=0.3.25)"] -mlserver = ["mlserver (>=1.2.0,!=1.3.1)", "mlserver-mlflow (>=1.2.0,!=1.3.1)"] +langchain = ["langchain (>=0.3.7,<=0.3.27)"] +mcp = ["fastmcp (>=2.0.0,<3)"] +mlserver = ["mlserver (>=1.2.0,!=1.3.1,<2.0.0)", "mlserver-mlflow (>=1.2.0,!=1.3.1,<2.0.0)"] sqlserver = ["mlflow-dbstore"] -xethub = ["mlflow-xethub"] + +[[package]] +name = "mlflow-tracing" +version = "3.6.0" +description = "MLflow Tracing SDK is an open-source, lightweight Python package that only includes the minimum set of dependencies and functionality to instrument your code/models/agents with MLflow Tracing." +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "mlflow_tracing-3.6.0-py3-none-any.whl", hash = "sha256:a68ff03ba5129c67dc98e6871e0d5ef512dd3ee66d01e1c1a0c946c08a6d4755"}, + {file = "mlflow_tracing-3.6.0.tar.gz", hash = "sha256:ccff80b3aad6caa18233c98ba69922a91a6f914e0a13d12e1977af7523523d4c"}, +] + +[package.dependencies] +cachetools = ">=5.0.0,<7" +databricks-sdk = ">=0.20.0,<1" +opentelemetry-api = ">=1.9.0,<3" +opentelemetry-proto = ">=1.9.0,<3" +opentelemetry-sdk = ">=1.9.0,<3" +packaging = "<26" +protobuf = ">=3.12.0,<7" +pydantic = ">=2.0.0,<3" [[package]] name = "mlxtend" @@ -2828,6 +3483,7 @@ version = "0.23.4" description = "Machine Learning Library Extensions" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "mlxtend-0.23.4-py3-none-any.whl", hash = "sha256:8675456e2b71841116e5317f6d7aa568848ea2546865eb5eca7192e9b7f395f4"}, {file = "mlxtend-0.23.4.tar.gz", hash = "sha256:ba8c8427cbe65d462c8487511331cae0cdcf9252d92f8687f0a53d311128770f"}, @@ -2851,6 +3507,7 @@ version = "1.3.0" description = "Python library for arbitrary-precision floating-point arithmetic" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, @@ -2859,193 +3516,235 @@ files = [ [package.extras] develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] docs = ["sphinx"] -gmpy = ["gmpy2 (>=2.1.0a4)"] +gmpy = ["gmpy2 (>=2.1.0a4) ; platform_python_implementation != \"PyPy\""] tests = ["pytest (>=4.6)"] [[package]] name = "msgpack" -version = "1.1.0" +version = "1.1.2" description = "MessagePack serializer" optional = false -python-versions = ">=3.8" -files = [ - {file = "msgpack-1.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7ad442d527a7e358a469faf43fda45aaf4ac3249c8310a82f0ccff9164e5dccd"}, - {file = "msgpack-1.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:74bed8f63f8f14d75eec75cf3d04ad581da6b914001b474a5d3cd3372c8cc27d"}, - {file = "msgpack-1.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:914571a2a5b4e7606997e169f64ce53a8b1e06f2cf2c3a7273aa106236d43dd5"}, - {file = "msgpack-1.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:c921af52214dcbb75e6bdf6a661b23c3e6417f00c603dd2070bccb5c3ef499f5"}, - {file = "msgpack-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d8ce0b22b890be5d252de90d0e0d119f363012027cf256185fc3d474c44b1b9e"}, - {file = "msgpack-1.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:73322a6cc57fcee3c0c57c4463d828e9428275fb85a27aa2aa1a92fdc42afd7b"}, - {file = "msgpack-1.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e1f3c3d21f7cf67bcf2da8e494d30a75e4cf60041d98b3f79875afb5b96f3a3f"}, - {file = "msgpack-1.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:64fc9068d701233effd61b19efb1485587560b66fe57b3e50d29c5d78e7fef68"}, - {file = "msgpack-1.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:42f754515e0f683f9c79210a5d1cad631ec3d06cea5172214d2176a42e67e19b"}, - {file = "msgpack-1.1.0-cp310-cp310-win32.whl", hash = "sha256:3df7e6b05571b3814361e8464f9304c42d2196808e0119f55d0d3e62cd5ea044"}, - {file = "msgpack-1.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:685ec345eefc757a7c8af44a3032734a739f8c45d1b0ac45efc5d8977aa4720f"}, - {file = "msgpack-1.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3d364a55082fb2a7416f6c63ae383fbd903adb5a6cf78c5b96cc6316dc1cedc7"}, - {file = "msgpack-1.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:79ec007767b9b56860e0372085f8504db5d06bd6a327a335449508bbee9648fa"}, - {file = "msgpack-1.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6ad622bf7756d5a497d5b6836e7fc3752e2dd6f4c648e24b1803f6048596f701"}, - {file = "msgpack-1.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e59bca908d9ca0de3dc8684f21ebf9a690fe47b6be93236eb40b99af28b6ea6"}, - {file = "msgpack-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e1da8f11a3dd397f0a32c76165cf0c4eb95b31013a94f6ecc0b280c05c91b59"}, - {file = "msgpack-1.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:452aff037287acb1d70a804ffd022b21fa2bb7c46bee884dbc864cc9024128a0"}, - {file = "msgpack-1.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8da4bf6d54ceed70e8861f833f83ce0814a2b72102e890cbdfe4b34764cdd66e"}, - {file = "msgpack-1.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:41c991beebf175faf352fb940bf2af9ad1fb77fd25f38d9142053914947cdbf6"}, - {file = "msgpack-1.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a52a1f3a5af7ba1c9ace055b659189f6c669cf3657095b50f9602af3a3ba0fe5"}, - {file = "msgpack-1.1.0-cp311-cp311-win32.whl", hash = "sha256:58638690ebd0a06427c5fe1a227bb6b8b9fdc2bd07701bec13c2335c82131a88"}, - {file = "msgpack-1.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:fd2906780f25c8ed5d7b323379f6138524ba793428db5d0e9d226d3fa6aa1788"}, - {file = "msgpack-1.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:d46cf9e3705ea9485687aa4001a76e44748b609d260af21c4ceea7f2212a501d"}, - {file = "msgpack-1.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5dbad74103df937e1325cc4bfeaf57713be0b4f15e1c2da43ccdd836393e2ea2"}, - {file = "msgpack-1.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:58dfc47f8b102da61e8949708b3eafc3504509a5728f8b4ddef84bd9e16ad420"}, - {file = "msgpack-1.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4676e5be1b472909b2ee6356ff425ebedf5142427842aa06b4dfd5117d1ca8a2"}, - {file = "msgpack-1.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:17fb65dd0bec285907f68b15734a993ad3fc94332b5bb21b0435846228de1f39"}, - {file = "msgpack-1.1.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a51abd48c6d8ac89e0cfd4fe177c61481aca2d5e7ba42044fd218cfd8ea9899f"}, - {file = "msgpack-1.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2137773500afa5494a61b1208619e3871f75f27b03bcfca7b3a7023284140247"}, - {file = "msgpack-1.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:398b713459fea610861c8a7b62a6fec1882759f308ae0795b5413ff6a160cf3c"}, - {file = "msgpack-1.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:06f5fd2f6bb2a7914922d935d3b8bb4a7fff3a9a91cfce6d06c13bc42bec975b"}, - {file = "msgpack-1.1.0-cp312-cp312-win32.whl", hash = "sha256:ad33e8400e4ec17ba782f7b9cf868977d867ed784a1f5f2ab46e7ba53b6e1e1b"}, - {file = "msgpack-1.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:115a7af8ee9e8cddc10f87636767857e7e3717b7a2e97379dc2054712693e90f"}, - {file = "msgpack-1.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:071603e2f0771c45ad9bc65719291c568d4edf120b44eb36324dcb02a13bfddf"}, - {file = "msgpack-1.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0f92a83b84e7c0749e3f12821949d79485971f087604178026085f60ce109330"}, - {file = "msgpack-1.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4a1964df7b81285d00a84da4e70cb1383f2e665e0f1f2a7027e683956d04b734"}, - {file = "msgpack-1.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:59caf6a4ed0d164055ccff8fe31eddc0ebc07cf7326a2aaa0dbf7a4001cd823e"}, - {file = "msgpack-1.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0907e1a7119b337971a689153665764adc34e89175f9a34793307d9def08e6ca"}, - {file = "msgpack-1.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:65553c9b6da8166e819a6aa90ad15288599b340f91d18f60b2061f402b9a4915"}, - {file = "msgpack-1.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7a946a8992941fea80ed4beae6bff74ffd7ee129a90b4dd5cf9c476a30e9708d"}, - {file = "msgpack-1.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:4b51405e36e075193bc051315dbf29168d6141ae2500ba8cd80a522964e31434"}, - {file = "msgpack-1.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b4c01941fd2ff87c2a934ee6055bda4ed353a7846b8d4f341c428109e9fcde8c"}, - {file = "msgpack-1.1.0-cp313-cp313-win32.whl", hash = "sha256:7c9a35ce2c2573bada929e0b7b3576de647b0defbd25f5139dcdaba0ae35a4cc"}, - {file = "msgpack-1.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:bce7d9e614a04d0883af0b3d4d501171fbfca038f12c77fa838d9f198147a23f"}, - {file = "msgpack-1.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c40ffa9a15d74e05ba1fe2681ea33b9caffd886675412612d93ab17b58ea2fec"}, - {file = "msgpack-1.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1ba6136e650898082d9d5a5217d5906d1e138024f836ff48691784bbe1adf96"}, - {file = "msgpack-1.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e0856a2b7e8dcb874be44fea031d22e5b3a19121be92a1e098f46068a11b0870"}, - {file = "msgpack-1.1.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:471e27a5787a2e3f974ba023f9e265a8c7cfd373632247deb225617e3100a3c7"}, - {file = "msgpack-1.1.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:646afc8102935a388ffc3914b336d22d1c2d6209c773f3eb5dd4d6d3b6f8c1cb"}, - {file = 
"msgpack-1.1.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:13599f8829cfbe0158f6456374e9eea9f44eee08076291771d8ae93eda56607f"}, - {file = "msgpack-1.1.0-cp38-cp38-win32.whl", hash = "sha256:8a84efb768fb968381e525eeeb3d92857e4985aacc39f3c47ffd00eb4509315b"}, - {file = "msgpack-1.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:879a7b7b0ad82481c52d3c7eb99bf6f0645dbdec5134a4bddbd16f3506947feb"}, - {file = "msgpack-1.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:53258eeb7a80fc46f62fd59c876957a2d0e15e6449a9e71842b6d24419d88ca1"}, - {file = "msgpack-1.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7e7b853bbc44fb03fbdba34feb4bd414322180135e2cb5164f20ce1c9795ee48"}, - {file = "msgpack-1.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f3e9b4936df53b970513eac1758f3882c88658a220b58dcc1e39606dccaaf01c"}, - {file = "msgpack-1.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:46c34e99110762a76e3911fc923222472c9d681f1094096ac4102c18319e6468"}, - {file = "msgpack-1.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a706d1e74dd3dea05cb54580d9bd8b2880e9264856ce5068027eed09680aa74"}, - {file = "msgpack-1.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:534480ee5690ab3cbed89d4c8971a5c631b69a8c0883ecfea96c19118510c846"}, - {file = "msgpack-1.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8cf9e8c3a2153934a23ac160cc4cba0ec035f6867c8013cc6077a79823370346"}, - {file = "msgpack-1.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:3180065ec2abbe13a4ad37688b61b99d7f9e012a535b930e0e683ad6bc30155b"}, - {file = "msgpack-1.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c5a91481a3cc573ac8c0d9aace09345d989dc4a0202b7fcb312c88c26d4e71a8"}, - {file = "msgpack-1.1.0-cp39-cp39-win32.whl", hash = "sha256:f80bc7d47f76089633763f952e67f8214cb7b3ee6bfa489b3cb6a84cfac114cd"}, - {file = "msgpack-1.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:4d1b7ff2d6146e16e8bd665ac726a89c74163ef8cd39fa8c1087d4e52d3a2325"}, - {file = "msgpack-1.1.0.tar.gz", hash = "sha256:dd432ccc2c72b914e4cb77afce64aab761c1137cc698be3984eee260bcb2896e"}, +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "msgpack-1.1.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0051fffef5a37ca2cd16978ae4f0aef92f164df86823871b5162812bebecd8e2"}, + {file = "msgpack-1.1.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a605409040f2da88676e9c9e5853b3449ba8011973616189ea5ee55ddbc5bc87"}, + {file = "msgpack-1.1.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b696e83c9f1532b4af884045ba7f3aa741a63b2bc22617293a2c6a7c645f251"}, + {file = "msgpack-1.1.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:365c0bbe981a27d8932da71af63ef86acc59ed5c01ad929e09a0b88c6294e28a"}, + {file = "msgpack-1.1.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:41d1a5d875680166d3ac5c38573896453bbbea7092936d2e107214daf43b1d4f"}, + {file = "msgpack-1.1.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:354e81bcdebaab427c3df4281187edc765d5d76bfb3a7c125af9da7a27e8458f"}, + {file = "msgpack-1.1.2-cp310-cp310-win32.whl", hash = "sha256:e64c8d2f5e5d5fda7b842f55dec6133260ea8f53c4257d64494c534f306bf7a9"}, + {file = "msgpack-1.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:db6192777d943bdaaafb6ba66d44bf65aa0e9c5616fa1d2da9bb08828c6b39aa"}, + {file = "msgpack-1.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:2e86a607e558d22985d856948c12a3fa7b42efad264dca8a3ebbcfa2735d786c"}, + {file = "msgpack-1.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:283ae72fc89da59aa004ba147e8fc2f766647b1251500182fac0350d8af299c0"}, + {file = "msgpack-1.1.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:61c8aa3bd513d87c72ed0b37b53dd5c5a0f58f2ff9f26e1555d3bd7948fb7296"}, + {file = "msgpack-1.1.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:454e29e186285d2ebe65be34629fa0e8605202c60fbc7c4c650ccd41870896ef"}, + {file = "msgpack-1.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7bc8813f88417599564fafa59fd6f95be417179f76b40325b500b3c98409757c"}, + {file = "msgpack-1.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bafca952dc13907bdfdedfc6a5f579bf4f292bdd506fadb38389afa3ac5b208e"}, + {file = "msgpack-1.1.2-cp311-cp311-win32.whl", hash = "sha256:602b6740e95ffc55bfb078172d279de3773d7b7db1f703b2f1323566b878b90e"}, + {file = "msgpack-1.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:d198d275222dc54244bf3327eb8cbe00307d220241d9cec4d306d49a44e85f68"}, + {file = "msgpack-1.1.2-cp311-cp311-win_arm64.whl", hash = "sha256:86f8136dfa5c116365a8a651a7d7484b65b13339731dd6faebb9a0242151c406"}, + {file = "msgpack-1.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:70a0dff9d1f8da25179ffcf880e10cf1aad55fdb63cd59c9a49a1b82290062aa"}, + {file = "msgpack-1.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:446abdd8b94b55c800ac34b102dffd2f6aa0ce643c55dfc017ad89347db3dbdb"}, + {file = "msgpack-1.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c63eea553c69ab05b6747901b97d620bb2a690633c77f23feb0c6a947a8a7b8f"}, + {file = "msgpack-1.1.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:372839311ccf6bdaf39b00b61288e0557916c3729529b301c52c2d88842add42"}, + {file = "msgpack-1.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2929af52106ca73fcb28576218476ffbb531a036c2adbcf54a3664de124303e9"}, + {file = "msgpack-1.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:be52a8fc79e45b0364210eef5234a7cf8d330836d0a64dfbb878efa903d84620"}, + {file = "msgpack-1.1.2-cp312-cp312-win32.whl", hash = "sha256:1fff3d825d7859ac888b0fbda39a42d59193543920eda9d9bea44d958a878029"}, + {file = "msgpack-1.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:1de460f0403172cff81169a30b9a92b260cb809c4cb7e2fc79ae8d0510c78b6b"}, + {file = "msgpack-1.1.2-cp312-cp312-win_arm64.whl", hash = "sha256:be5980f3ee0e6bd44f3a9e9dea01054f175b50c3e6cdb692bc9424c0bbb8bf69"}, + {file = "msgpack-1.1.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4efd7b5979ccb539c221a4c4e16aac1a533efc97f3b759bb5a5ac9f6d10383bf"}, + {file = "msgpack-1.1.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:42eefe2c3e2af97ed470eec850facbe1b5ad1d6eacdbadc42ec98e7dcf68b4b7"}, + {file = "msgpack-1.1.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1fdf7d83102bf09e7ce3357de96c59b627395352a4024f6e2458501f158bf999"}, + {file = "msgpack-1.1.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fac4be746328f90caa3cd4bc67e6fe36ca2bf61d5c6eb6d895b6527e3f05071e"}, + {file = "msgpack-1.1.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:fffee09044073e69f2bad787071aeec727183e7580443dfeb8556cbf1978d162"}, + {file = 
"msgpack-1.1.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5928604de9b032bc17f5099496417f113c45bc6bc21b5c6920caf34b3c428794"}, + {file = "msgpack-1.1.2-cp313-cp313-win32.whl", hash = "sha256:a7787d353595c7c7e145e2331abf8b7ff1e6673a6b974ded96e6d4ec09f00c8c"}, + {file = "msgpack-1.1.2-cp313-cp313-win_amd64.whl", hash = "sha256:a465f0dceb8e13a487e54c07d04ae3ba131c7c5b95e2612596eafde1dccf64a9"}, + {file = "msgpack-1.1.2-cp313-cp313-win_arm64.whl", hash = "sha256:e69b39f8c0aa5ec24b57737ebee40be647035158f14ed4b40e6f150077e21a84"}, + {file = "msgpack-1.1.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:e23ce8d5f7aa6ea6d2a2b326b4ba46c985dbb204523759984430db7114f8aa00"}, + {file = "msgpack-1.1.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:6c15b7d74c939ebe620dd8e559384be806204d73b4f9356320632d783d1f7939"}, + {file = "msgpack-1.1.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:99e2cb7b9031568a2a5c73aa077180f93dd2e95b4f8d3b8e14a73ae94a9e667e"}, + {file = "msgpack-1.1.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:180759d89a057eab503cf62eeec0aa61c4ea1200dee709f3a8e9397dbb3b6931"}, + {file = "msgpack-1.1.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:04fb995247a6e83830b62f0b07bf36540c213f6eac8e851166d8d86d83cbd014"}, + {file = "msgpack-1.1.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8e22ab046fa7ede9e36eeb4cfad44d46450f37bb05d5ec482b02868f451c95e2"}, + {file = "msgpack-1.1.2-cp314-cp314-win32.whl", hash = "sha256:80a0ff7d4abf5fecb995fcf235d4064b9a9a8a40a3ab80999e6ac1e30b702717"}, + {file = "msgpack-1.1.2-cp314-cp314-win_amd64.whl", hash = "sha256:9ade919fac6a3e7260b7f64cea89df6bec59104987cbea34d34a2fa15d74310b"}, + {file = "msgpack-1.1.2-cp314-cp314-win_arm64.whl", hash = "sha256:59415c6076b1e30e563eb732e23b994a61c159cec44deaf584e5cc1dd662f2af"}, + {file = "msgpack-1.1.2-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:897c478140877e5307760b0ea66e0932738879e7aa68144d9b78ea4c8302a84a"}, + {file = "msgpack-1.1.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:a668204fa43e6d02f89dbe79a30b0d67238d9ec4c5bd8a940fc3a004a47b721b"}, + {file = "msgpack-1.1.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5559d03930d3aa0f3aacb4c42c776af1a2ace2611871c84a75afe436695e6245"}, + {file = "msgpack-1.1.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:70c5a7a9fea7f036b716191c29047374c10721c389c21e9ffafad04df8c52c90"}, + {file = "msgpack-1.1.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f2cb069d8b981abc72b41aea1c580ce92d57c673ec61af4c500153a626cb9e20"}, + {file = "msgpack-1.1.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d62ce1f483f355f61adb5433ebfd8868c5f078d1a52d042b0a998682b4fa8c27"}, + {file = "msgpack-1.1.2-cp314-cp314t-win32.whl", hash = "sha256:1d1418482b1ee984625d88aa9585db570180c286d942da463533b238b98b812b"}, + {file = "msgpack-1.1.2-cp314-cp314t-win_amd64.whl", hash = "sha256:5a46bf7e831d09470ad92dff02b8b1ac92175ca36b087f904a0519857c6be3ff"}, + {file = "msgpack-1.1.2-cp314-cp314t-win_arm64.whl", hash = "sha256:d99ef64f349d5ec3293688e91486c5fdb925ed03807f64d98d205d2713c60b46"}, + {file = "msgpack-1.1.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ea5405c46e690122a76531ab97a079e184c0daf491e588592d6a23d3e32af99e"}, + {file = "msgpack-1.1.2-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:9fba231af7a933400238cb357ecccf8ab5d51535ea95d94fc35b7806218ff844"}, + {file = "msgpack-1.1.2-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a8f6e7d30253714751aa0b0c84ae28948e852ee7fb0524082e6716769124bc23"}, + {file = "msgpack-1.1.2-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:94fd7dc7d8cb0a54432f296f2246bc39474e017204ca6f4ff345941d4ed285a7"}, + {file = "msgpack-1.1.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:350ad5353a467d9e3b126d8d1b90fe05ad081e2e1cef5753f8c345217c37e7b8"}, + {file = "msgpack-1.1.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6bde749afe671dc44893f8d08e83bf475a1a14570d67c4bb5cec5573463c8833"}, + {file = "msgpack-1.1.2-cp39-cp39-win32.whl", hash = "sha256:ad09b984828d6b7bb52d1d1d0c9be68ad781fa004ca39216c8a1e63c0f34ba3c"}, + {file = "msgpack-1.1.2-cp39-cp39-win_amd64.whl", hash = "sha256:67016ae8c8965124fdede9d3769528ad8284f14d635337ffa6a713a580f6c030"}, + {file = "msgpack-1.1.2.tar.gz", hash = "sha256:3b60763c1373dd60f398488069bcdc703cd08a711477b5d480eecc9f9626f47e"}, ] [[package]] name = "multidict" -version = "6.4.4" +version = "6.7.0" description = "multidict implementation" optional = false python-versions = ">=3.9" -files = [ - {file = "multidict-6.4.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8adee3ac041145ffe4488ea73fa0a622b464cc25340d98be76924d0cda8545ff"}, - {file = "multidict-6.4.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b61e98c3e2a861035aaccd207da585bdcacef65fe01d7a0d07478efac005e028"}, - {file = "multidict-6.4.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:75493f28dbadecdbb59130e74fe935288813301a8554dc32f0c631b6bdcdf8b0"}, - {file = "multidict-6.4.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ffc3c6a37e048b5395ee235e4a2a0d639c2349dffa32d9367a42fc20d399772"}, - {file = "multidict-6.4.4-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:87cb72263946b301570b0f63855569a24ee8758aaae2cd182aae7d95fbc92ca7"}, - {file = "multidict-6.4.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9bbf7bd39822fd07e3609b6b4467af4c404dd2b88ee314837ad1830a7f4a8299"}, - {file = "multidict-6.4.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1f7cbd4f1f44ddf5fd86a8675b7679176eae770f2fc88115d6dddb6cefb59bc"}, - {file = "multidict-6.4.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb5ac9e5bfce0e6282e7f59ff7b7b9a74aa8e5c60d38186a4637f5aa764046ad"}, - {file = "multidict-6.4.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4efc31dfef8c4eeb95b6b17d799eedad88c4902daba39ce637e23a17ea078915"}, - {file = "multidict-6.4.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9fcad2945b1b91c29ef2b4050f590bfcb68d8ac8e0995a74e659aa57e8d78e01"}, - {file = "multidict-6.4.4-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:d877447e7368c7320832acb7159557e49b21ea10ffeb135c1077dbbc0816b598"}, - {file = "multidict-6.4.4-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:33a12ebac9f380714c298cbfd3e5b9c0c4e89c75fe612ae496512ee51028915f"}, - {file = "multidict-6.4.4-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:0f14ea68d29b43a9bf37953881b1e3eb75b2739e896ba4a6aa4ad4c5b9ffa145"}, - {file = "multidict-6.4.4-cp310-cp310-musllinux_1_2_s390x.whl", hash = 
"sha256:0327ad2c747a6600e4797d115d3c38a220fdb28e54983abe8964fd17e95ae83c"}, - {file = "multidict-6.4.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d1a20707492db9719a05fc62ee215fd2c29b22b47c1b1ba347f9abc831e26683"}, - {file = "multidict-6.4.4-cp310-cp310-win32.whl", hash = "sha256:d83f18315b9fca5db2452d1881ef20f79593c4aa824095b62cb280019ef7aa3d"}, - {file = "multidict-6.4.4-cp310-cp310-win_amd64.whl", hash = "sha256:9c17341ee04545fd962ae07330cb5a39977294c883485c8d74634669b1f7fe04"}, - {file = "multidict-6.4.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4f5f29794ac0e73d2a06ac03fd18870adc0135a9d384f4a306a951188ed02f95"}, - {file = "multidict-6.4.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c04157266344158ebd57b7120d9b0b35812285d26d0e78193e17ef57bfe2979a"}, - {file = "multidict-6.4.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bb61ffd3ab8310d93427e460f565322c44ef12769f51f77277b4abad7b6f7223"}, - {file = "multidict-6.4.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e0ba18a9afd495f17c351d08ebbc4284e9c9f7971d715f196b79636a4d0de44"}, - {file = "multidict-6.4.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:9faf1b1dcaadf9f900d23a0e6d6c8eadd6a95795a0e57fcca73acce0eb912065"}, - {file = "multidict-6.4.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a4d1cb1327c6082c4fce4e2a438483390964c02213bc6b8d782cf782c9b1471f"}, - {file = "multidict-6.4.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:941f1bec2f5dbd51feeb40aea654c2747f811ab01bdd3422a48a4e4576b7d76a"}, - {file = "multidict-6.4.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5f8a146184da7ea12910a4cec51ef85e44f6268467fb489c3caf0cd512f29c2"}, - {file = "multidict-6.4.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:232b7237e57ec3c09be97206bfb83a0aa1c5d7d377faa019c68a210fa35831f1"}, - {file = "multidict-6.4.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:55ae0721c1513e5e3210bca4fc98456b980b0c2c016679d3d723119b6b202c42"}, - {file = "multidict-6.4.4-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:51d662c072579f63137919d7bb8fc250655ce79f00c82ecf11cab678f335062e"}, - {file = "multidict-6.4.4-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:0e05c39962baa0bb19a6b210e9b1422c35c093b651d64246b6c2e1a7e242d9fd"}, - {file = "multidict-6.4.4-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:d5b1cc3ab8c31d9ebf0faa6e3540fb91257590da330ffe6d2393d4208e638925"}, - {file = "multidict-6.4.4-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:93ec84488a384cd7b8a29c2c7f467137d8a73f6fe38bb810ecf29d1ade011a7c"}, - {file = "multidict-6.4.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b308402608493638763abc95f9dc0030bbd6ac6aff784512e8ac3da73a88af08"}, - {file = "multidict-6.4.4-cp311-cp311-win32.whl", hash = "sha256:343892a27d1a04d6ae455ecece12904d242d299ada01633d94c4f431d68a8c49"}, - {file = "multidict-6.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:73484a94f55359780c0f458bbd3c39cb9cf9c182552177d2136e828269dee529"}, - {file = "multidict-6.4.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:dc388f75a1c00000824bf28b7633e40854f4127ede80512b44c3cfeeea1839a2"}, - {file = "multidict-6.4.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:98af87593a666f739d9dba5d0ae86e01b0e1a9cfcd2e30d2d361fbbbd1a9162d"}, - {file = "multidict-6.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash 
= "sha256:aff4cafea2d120327d55eadd6b7f1136a8e5a0ecf6fb3b6863e8aca32cd8e50a"}, - {file = "multidict-6.4.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:169c4ba7858176b797fe551d6e99040c531c775d2d57b31bcf4de6d7a669847f"}, - {file = "multidict-6.4.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b9eb4c59c54421a32b3273d4239865cb14ead53a606db066d7130ac80cc8ec93"}, - {file = "multidict-6.4.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7cf3bd54c56aa16fdb40028d545eaa8d051402b61533c21e84046e05513d5780"}, - {file = "multidict-6.4.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f682c42003c7264134bfe886376299db4cc0c6cd06a3295b41b347044bcb5482"}, - {file = "multidict-6.4.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a920f9cf2abdf6e493c519492d892c362007f113c94da4c239ae88429835bad1"}, - {file = "multidict-6.4.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:530d86827a2df6504526106b4c104ba19044594f8722d3e87714e847c74a0275"}, - {file = "multidict-6.4.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ecde56ea2439b96ed8a8d826b50c57364612ddac0438c39e473fafad7ae1c23b"}, - {file = "multidict-6.4.4-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:dc8c9736d8574b560634775ac0def6bdc1661fc63fa27ffdfc7264c565bcb4f2"}, - {file = "multidict-6.4.4-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:7f3d3b3c34867579ea47cbd6c1f2ce23fbfd20a273b6f9e3177e256584f1eacc"}, - {file = "multidict-6.4.4-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:87a728af265e08f96b6318ebe3c0f68b9335131f461efab2fc64cc84a44aa6ed"}, - {file = "multidict-6.4.4-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9f193eeda1857f8e8d3079a4abd258f42ef4a4bc87388452ed1e1c4d2b0c8740"}, - {file = "multidict-6.4.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:be06e73c06415199200e9a2324a11252a3d62030319919cde5e6950ffeccf72e"}, - {file = "multidict-6.4.4-cp312-cp312-win32.whl", hash = "sha256:622f26ea6a7e19b7c48dd9228071f571b2fbbd57a8cd71c061e848f281550e6b"}, - {file = "multidict-6.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:5e2bcda30d5009996ff439e02a9f2b5c3d64a20151d34898c000a6281faa3781"}, - {file = "multidict-6.4.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:82ffabefc8d84c2742ad19c37f02cde5ec2a1ee172d19944d380f920a340e4b9"}, - {file = "multidict-6.4.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:6a2f58a66fe2c22615ad26156354005391e26a2f3721c3621504cd87c1ea87bf"}, - {file = "multidict-6.4.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5883d6ee0fd9d8a48e9174df47540b7545909841ac82354c7ae4cbe9952603bd"}, - {file = "multidict-6.4.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9abcf56a9511653fa1d052bfc55fbe53dbee8f34e68bd6a5a038731b0ca42d15"}, - {file = "multidict-6.4.4-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6ed5ae5605d4ad5a049fad2a28bb7193400700ce2f4ae484ab702d1e3749c3f9"}, - {file = "multidict-6.4.4-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bbfcb60396f9bcfa63e017a180c3105b8c123a63e9d1428a36544e7d37ca9e20"}, - {file = "multidict-6.4.4-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b0f1987787f5f1e2076b59692352ab29a955b09ccc433c1f6b8e8e18666f608b"}, - {file = 
"multidict-6.4.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d0121ccce8c812047d8d43d691a1ad7641f72c4f730474878a5aeae1b8ead8c"}, - {file = "multidict-6.4.4-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:83ec4967114295b8afd120a8eec579920c882831a3e4c3331d591a8e5bfbbc0f"}, - {file = "multidict-6.4.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:995f985e2e268deaf17867801b859a282e0448633f1310e3704b30616d269d69"}, - {file = "multidict-6.4.4-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:d832c608f94b9f92a0ec8b7e949be7792a642b6e535fcf32f3e28fab69eeb046"}, - {file = "multidict-6.4.4-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:d21c1212171cf7da703c5b0b7a0e85be23b720818aef502ad187d627316d5645"}, - {file = "multidict-6.4.4-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:cbebaa076aaecad3d4bb4c008ecc73b09274c952cf6a1b78ccfd689e51f5a5b0"}, - {file = "multidict-6.4.4-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:c93a6fb06cc8e5d3628b2b5fda215a5db01e8f08fc15fadd65662d9b857acbe4"}, - {file = "multidict-6.4.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8cd8f81f1310182362fb0c7898145ea9c9b08a71081c5963b40ee3e3cac589b1"}, - {file = "multidict-6.4.4-cp313-cp313-win32.whl", hash = "sha256:3e9f1cd61a0ab857154205fb0b1f3d3ace88d27ebd1409ab7af5096e409614cd"}, - {file = "multidict-6.4.4-cp313-cp313-win_amd64.whl", hash = "sha256:8ffb40b74400e4455785c2fa37eba434269149ec525fc8329858c862e4b35373"}, - {file = "multidict-6.4.4-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:6a602151dbf177be2450ef38966f4be3467d41a86c6a845070d12e17c858a156"}, - {file = "multidict-6.4.4-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0d2b9712211b860d123815a80b859075d86a4d54787e247d7fbee9db6832cf1c"}, - {file = "multidict-6.4.4-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:d2fa86af59f8fc1972e121ade052145f6da22758f6996a197d69bb52f8204e7e"}, - {file = "multidict-6.4.4-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50855d03e9e4d66eab6947ba688ffb714616f985838077bc4b490e769e48da51"}, - {file = "multidict-6.4.4-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:5bce06b83be23225be1905dcdb6b789064fae92499fbc458f59a8c0e68718601"}, - {file = "multidict-6.4.4-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:66ed0731f8e5dfd8369a883b6e564aca085fb9289aacabd9decd70568b9a30de"}, - {file = "multidict-6.4.4-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:329ae97fc2f56f44d91bc47fe0972b1f52d21c4b7a2ac97040da02577e2daca2"}, - {file = "multidict-6.4.4-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c27e5dcf520923d6474d98b96749e6805f7677e93aaaf62656005b8643f907ab"}, - {file = "multidict-6.4.4-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:058cc59b9e9b143cc56715e59e22941a5d868c322242278d28123a5d09cdf6b0"}, - {file = "multidict-6.4.4-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:69133376bc9a03f8c47343d33f91f74a99c339e8b58cea90433d8e24bb298031"}, - {file = "multidict-6.4.4-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:d6b15c55721b1b115c5ba178c77104123745b1417527ad9641a4c5e2047450f0"}, - {file = "multidict-6.4.4-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:a887b77f51d3d41e6e1a63cf3bc7ddf24de5939d9ff69441387dfefa58ac2e26"}, - {file = 
"multidict-6.4.4-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:632a3bf8f1787f7ef7d3c2f68a7bde5be2f702906f8b5842ad6da9d974d0aab3"}, - {file = "multidict-6.4.4-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:a145c550900deb7540973c5cdb183b0d24bed6b80bf7bddf33ed8f569082535e"}, - {file = "multidict-6.4.4-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:cc5d83c6619ca5c9672cb78b39ed8542f1975a803dee2cda114ff73cbb076edd"}, - {file = "multidict-6.4.4-cp313-cp313t-win32.whl", hash = "sha256:3312f63261b9df49be9d57aaa6abf53a6ad96d93b24f9cc16cf979956355ce6e"}, - {file = "multidict-6.4.4-cp313-cp313t-win_amd64.whl", hash = "sha256:ba852168d814b2c73333073e1c7116d9395bea69575a01b0b3c89d2d5a87c8fb"}, - {file = "multidict-6.4.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:603f39bd1cf85705c6c1ba59644b480dfe495e6ee2b877908de93322705ad7cf"}, - {file = "multidict-6.4.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fc60f91c02e11dfbe3ff4e1219c085695c339af72d1641800fe6075b91850c8f"}, - {file = "multidict-6.4.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:496bcf01c76a70a31c3d746fd39383aad8d685ce6331e4c709e9af4ced5fa221"}, - {file = "multidict-6.4.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4219390fb5bf8e548e77b428bb36a21d9382960db5321b74d9d9987148074d6b"}, - {file = "multidict-6.4.4-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3ef4e9096ff86dfdcbd4a78253090ba13b1d183daa11b973e842465d94ae1772"}, - {file = "multidict-6.4.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:49a29d7133b1fc214e818bbe025a77cc6025ed9a4f407d2850373ddde07fd04a"}, - {file = "multidict-6.4.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e32053d6d3a8b0dfe49fde05b496731a0e6099a4df92154641c00aa76786aef5"}, - {file = "multidict-6.4.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cc403092a49509e8ef2d2fd636a8ecefc4698cc57bbe894606b14579bc2a955"}, - {file = "multidict-6.4.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5363f9b2a7f3910e5c87d8b1855c478c05a2dc559ac57308117424dfaad6805c"}, - {file = "multidict-6.4.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2e543a40e4946cf70a88a3be87837a3ae0aebd9058ba49e91cacb0b2cd631e2b"}, - {file = "multidict-6.4.4-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:60d849912350da557fe7de20aa8cf394aada6980d0052cc829eeda4a0db1c1db"}, - {file = "multidict-6.4.4-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:19d08b4f22eae45bb018b9f06e2838c1e4b853c67628ef8ae126d99de0da6395"}, - {file = "multidict-6.4.4-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:d693307856d1ef08041e8b6ff01d5b4618715007d288490ce2c7e29013c12b9a"}, - {file = "multidict-6.4.4-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:fad6daaed41021934917f4fb03ca2db8d8a4d79bf89b17ebe77228eb6710c003"}, - {file = "multidict-6.4.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c10d17371bff801af0daf8b073c30b6cf14215784dc08cd5c43ab5b7b8029bbc"}, - {file = "multidict-6.4.4-cp39-cp39-win32.whl", hash = "sha256:7e23f2f841fcb3ebd4724a40032d32e0892fbba4143e43d2a9e7695c5e50e6bd"}, - {file = "multidict-6.4.4-cp39-cp39-win_amd64.whl", hash = "sha256:4d7b50b673ffb4ff4366e7ab43cf1f0aef4bd3608735c5fbdf0bdb6f690da411"}, - {file = "multidict-6.4.4-py3-none-any.whl", hash = "sha256:bd4557071b561a8b3b6075c3ce93cf9bfb6182cb241805c3d66ced3b75eff4ac"}, - {file = "multidict-6.4.4.tar.gz", hash = 
"sha256:69ee9e6ba214b5245031b76233dd95408a0fd57fdb019ddcc1ead4790932a8e8"}, +groups = ["main"] +files = [ + {file = "multidict-6.7.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:9f474ad5acda359c8758c8accc22032c6abe6dc87a8be2440d097785e27a9349"}, + {file = "multidict-6.7.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4b7a9db5a870f780220e931d0002bbfd88fb53aceb6293251e2c839415c1b20e"}, + {file = "multidict-6.7.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:03ca744319864e92721195fa28c7a3b2bc7b686246b35e4078c1e4d0eb5466d3"}, + {file = "multidict-6.7.0-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:f0e77e3c0008bc9316e662624535b88d360c3a5d3f81e15cf12c139a75250046"}, + {file = "multidict-6.7.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:08325c9e5367aa379a3496aa9a022fe8837ff22e00b94db256d3a1378c76ab32"}, + {file = "multidict-6.7.0-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e2862408c99f84aa571ab462d25236ef9cb12a602ea959ba9c9009a54902fc73"}, + {file = "multidict-6.7.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4d72a9a2d885f5c208b0cb91ff2ed43636bb7e345ec839ff64708e04f69a13cc"}, + {file = "multidict-6.7.0-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:478cc36476687bac1514d651cbbaa94b86b0732fb6855c60c673794c7dd2da62"}, + {file = "multidict-6.7.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6843b28b0364dc605f21481c90fadb5f60d9123b442eb8a726bb74feef588a84"}, + {file = "multidict-6.7.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:23bfeee5316266e5ee2d625df2d2c602b829435fc3a235c2ba2131495706e4a0"}, + {file = "multidict-6.7.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:680878b9f3d45c31e1f730eef731f9b0bc1da456155688c6745ee84eb818e90e"}, + {file = "multidict-6.7.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:eb866162ef2f45063acc7a53a88ef6fe8bf121d45c30ea3c9cd87ce7e191a8d4"}, + {file = "multidict-6.7.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:df0e3bf7993bdbeca5ac25aa859cf40d39019e015c9c91809ba7093967f7a648"}, + {file = "multidict-6.7.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:661709cdcd919a2ece2234f9bae7174e5220c80b034585d7d8a755632d3e2111"}, + {file = "multidict-6.7.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:096f52730c3fb8ed419db2d44391932b63891b2c5ed14850a7e215c0ba9ade36"}, + {file = "multidict-6.7.0-cp310-cp310-win32.whl", hash = "sha256:afa8a2978ec65d2336305550535c9c4ff50ee527914328c8677b3973ade52b85"}, + {file = "multidict-6.7.0-cp310-cp310-win_amd64.whl", hash = "sha256:b15b3afff74f707b9275d5ba6a91ae8f6429c3ffb29bbfd216b0b375a56f13d7"}, + {file = "multidict-6.7.0-cp310-cp310-win_arm64.whl", hash = "sha256:4b73189894398d59131a66ff157837b1fafea9974be486d036bb3d32331fdbf0"}, + {file = "multidict-6.7.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4d409aa42a94c0b3fa617708ef5276dfe81012ba6753a0370fcc9d0195d0a1fc"}, + {file = "multidict-6.7.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:14c9e076eede3b54c636f8ce1c9c252b5f057c62131211f0ceeec273810c9721"}, + {file = "multidict-6.7.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4c09703000a9d0fa3c3404b27041e574cc7f4df4c6563873246d0e11812a94b6"}, + {file = "multidict-6.7.0-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = 
"sha256:a265acbb7bb33a3a2d626afbe756371dce0279e7b17f4f4eda406459c2b5ff1c"}, + {file = "multidict-6.7.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:51cb455de290ae462593e5b1cb1118c5c22ea7f0d3620d9940bf695cea5a4bd7"}, + {file = "multidict-6.7.0-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:db99677b4457c7a5c5a949353e125ba72d62b35f74e26da141530fbb012218a7"}, + {file = "multidict-6.7.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f470f68adc395e0183b92a2f4689264d1ea4b40504a24d9882c27375e6662bb9"}, + {file = "multidict-6.7.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0db4956f82723cc1c270de9c6e799b4c341d327762ec78ef82bb962f79cc07d8"}, + {file = "multidict-6.7.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3e56d780c238f9e1ae66a22d2adf8d16f485381878250db8d496623cd38b22bd"}, + {file = "multidict-6.7.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:9d14baca2ee12c1a64740d4531356ba50b82543017f3ad6de0deb943c5979abb"}, + {file = "multidict-6.7.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:295a92a76188917c7f99cda95858c822f9e4aae5824246bba9b6b44004ddd0a6"}, + {file = "multidict-6.7.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:39f1719f57adbb767ef592a50ae5ebb794220d1188f9ca93de471336401c34d2"}, + {file = "multidict-6.7.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:0a13fb8e748dfc94749f622de065dd5c1def7e0d2216dba72b1d8069a389c6ff"}, + {file = "multidict-6.7.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:e3aa16de190d29a0ea1b48253c57d99a68492c8dd8948638073ab9e74dc9410b"}, + {file = "multidict-6.7.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a048ce45dcdaaf1defb76b2e684f997fb5abf74437b6cb7b22ddad934a964e34"}, + {file = "multidict-6.7.0-cp311-cp311-win32.whl", hash = "sha256:a90af66facec4cebe4181b9e62a68be65e45ac9b52b67de9eec118701856e7ff"}, + {file = "multidict-6.7.0-cp311-cp311-win_amd64.whl", hash = "sha256:95b5ffa4349df2887518bb839409bcf22caa72d82beec453216802f475b23c81"}, + {file = "multidict-6.7.0-cp311-cp311-win_arm64.whl", hash = "sha256:329aa225b085b6f004a4955271a7ba9f1087e39dcb7e65f6284a988264a63912"}, + {file = "multidict-6.7.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:8a3862568a36d26e650a19bb5cbbba14b71789032aebc0423f8cc5f150730184"}, + {file = "multidict-6.7.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:960c60b5849b9b4f9dcc9bea6e3626143c252c74113df2c1540aebce70209b45"}, + {file = "multidict-6.7.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2049be98fb57a31b4ccf870bf377af2504d4ae35646a19037ec271e4c07998aa"}, + {file = "multidict-6.7.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:0934f3843a1860dd465d38895c17fce1f1cb37295149ab05cd1b9a03afacb2a7"}, + {file = "multidict-6.7.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b3e34f3a1b8131ba06f1a73adab24f30934d148afcd5f5de9a73565a4404384e"}, + {file = "multidict-6.7.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:efbb54e98446892590dc2458c19c10344ee9a883a79b5cec4bc34d6656e8d546"}, + {file = "multidict-6.7.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = 
"sha256:a35c5fc61d4f51eb045061e7967cfe3123d622cd500e8868e7c0c592a09fedc4"}, + {file = "multidict-6.7.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:29fe6740ebccba4175af1b9b87bf553e9c15cd5868ee967e010efcf94e4fd0f1"}, + {file = "multidict-6.7.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:123e2a72e20537add2f33a79e605f6191fba2afda4cbb876e35c1a7074298a7d"}, + {file = "multidict-6.7.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b284e319754366c1aee2267a2036248b24eeb17ecd5dc16022095e747f2f4304"}, + {file = "multidict-6.7.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:803d685de7be4303b5a657b76e2f6d1240e7e0a8aa2968ad5811fa2285553a12"}, + {file = "multidict-6.7.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c04a328260dfd5db8c39538f999f02779012268f54614902d0afc775d44e0a62"}, + {file = "multidict-6.7.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:8a19cdb57cd3df4cd865849d93ee14920fb97224300c88501f16ecfa2604b4e0"}, + {file = "multidict-6.7.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9b2fd74c52accced7e75de26023b7dccee62511a600e62311b918ec5c168fc2a"}, + {file = "multidict-6.7.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3e8bfdd0e487acf992407a140d2589fe598238eaeffa3da8448d63a63cd363f8"}, + {file = "multidict-6.7.0-cp312-cp312-win32.whl", hash = "sha256:dd32a49400a2c3d52088e120ee00c1e3576cbff7e10b98467962c74fdb762ed4"}, + {file = "multidict-6.7.0-cp312-cp312-win_amd64.whl", hash = "sha256:92abb658ef2d7ef22ac9f8bb88e8b6c3e571671534e029359b6d9e845923eb1b"}, + {file = "multidict-6.7.0-cp312-cp312-win_arm64.whl", hash = "sha256:490dab541a6a642ce1a9d61a4781656b346a55c13038f0b1244653828e3a83ec"}, + {file = "multidict-6.7.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:bee7c0588aa0076ce77c0ea5d19a68d76ad81fcd9fe8501003b9a24f9d4000f6"}, + {file = "multidict-6.7.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7ef6b61cad77091056ce0e7ce69814ef72afacb150b7ac6a3e9470def2198159"}, + {file = "multidict-6.7.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9c0359b1ec12b1d6849c59f9d319610b7f20ef990a6d454ab151aa0e3b9f78ca"}, + {file = "multidict-6.7.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cd240939f71c64bd658f186330603aac1a9a81bf6273f523fca63673cb7378a8"}, + {file = "multidict-6.7.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a60a4d75718a5efa473ebd5ab685786ba0c67b8381f781d1be14da49f1a2dc60"}, + {file = "multidict-6.7.0-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:53a42d364f323275126aff81fb67c5ca1b7a04fda0546245730a55c8c5f24bc4"}, + {file = "multidict-6.7.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3b29b980d0ddbecb736735ee5bef69bb2ddca56eff603c86f3f29a1128299b4f"}, + {file = "multidict-6.7.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f8a93b1c0ed2d04b97a5e9336fd2d33371b9a6e29ab7dd6503d63407c20ffbaf"}, + {file = "multidict-6.7.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ff96e8815eecacc6645da76c413eb3b3d34cfca256c70b16b286a687d013c32"}, + {file = "multidict-6.7.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7516c579652f6a6be0e266aec0acd0db80829ca305c3d771ed898538804c2036"}, + {file = 
"multidict-6.7.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:040f393368e63fb0f3330e70c26bfd336656bed925e5cbe17c9da839a6ab13ec"}, + {file = "multidict-6.7.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b3bc26a951007b1057a1c543af845f1c7e3e71cc240ed1ace7bf4484aa99196e"}, + {file = "multidict-6.7.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:7b022717c748dd1992a83e219587aabe45980d88969f01b316e78683e6285f64"}, + {file = "multidict-6.7.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:9600082733859f00d79dee64effc7aef1beb26adb297416a4ad2116fd61374bd"}, + {file = "multidict-6.7.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:94218fcec4d72bc61df51c198d098ce2b378e0ccbac41ddbed5ef44092913288"}, + {file = "multidict-6.7.0-cp313-cp313-win32.whl", hash = "sha256:a37bd74c3fa9d00be2d7b8eca074dc56bd8077ddd2917a839bd989612671ed17"}, + {file = "multidict-6.7.0-cp313-cp313-win_amd64.whl", hash = "sha256:30d193c6cc6d559db42b6bcec8a5d395d34d60c9877a0b71ecd7c204fcf15390"}, + {file = "multidict-6.7.0-cp313-cp313-win_arm64.whl", hash = "sha256:ea3334cabe4d41b7ccd01e4d349828678794edbc2d3ae97fc162a3312095092e"}, + {file = "multidict-6.7.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:ad9ce259f50abd98a1ca0aa6e490b58c316a0fce0617f609723e40804add2c00"}, + {file = "multidict-6.7.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:07f5594ac6d084cbb5de2df218d78baf55ef150b91f0ff8a21cc7a2e3a5a58eb"}, + {file = "multidict-6.7.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:0591b48acf279821a579282444814a2d8d0af624ae0bc600aa4d1b920b6e924b"}, + {file = "multidict-6.7.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:749a72584761531d2b9467cfbdfd29487ee21124c304c4b6cb760d8777b27f9c"}, + {file = "multidict-6.7.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b4c3d199f953acd5b446bf7c0de1fe25d94e09e79086f8dc2f48a11a129cdf1"}, + {file = "multidict-6.7.0-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:9fb0211dfc3b51efea2f349ec92c114d7754dd62c01f81c3e32b765b70c45c9b"}, + {file = "multidict-6.7.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a027ec240fe73a8d6281872690b988eed307cd7d91b23998ff35ff577ca688b5"}, + {file = "multidict-6.7.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d1d964afecdf3a8288789df2f5751dc0a8261138c3768d9af117ed384e538fad"}, + {file = "multidict-6.7.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:caf53b15b1b7df9fbd0709aa01409000a2b4dd03a5f6f5cc548183c7c8f8b63c"}, + {file = "multidict-6.7.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:654030da3197d927f05a536a66186070e98765aa5142794c9904555d3a9d8fb5"}, + {file = "multidict-6.7.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:2090d3718829d1e484706a2f525e50c892237b2bf9b17a79b059cb98cddc2f10"}, + {file = "multidict-6.7.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:2d2cfeec3f6f45651b3d408c4acec0ebf3daa9bc8a112a084206f5db5d05b754"}, + {file = "multidict-6.7.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:4ef089f985b8c194d341eb2c24ae6e7408c9a0e2e5658699c92f497437d88c3c"}, + {file = "multidict-6.7.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:e93a0617cd16998784bf4414c7e40f17a35d2350e5c6f0bd900d3a8e02bd3762"}, + {file = 
"multidict-6.7.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f0feece2ef8ebc42ed9e2e8c78fc4aa3cf455733b507c09ef7406364c94376c6"}, + {file = "multidict-6.7.0-cp313-cp313t-win32.whl", hash = "sha256:19a1d55338ec1be74ef62440ca9e04a2f001a04d0cc49a4983dc320ff0f3212d"}, + {file = "multidict-6.7.0-cp313-cp313t-win_amd64.whl", hash = "sha256:3da4fb467498df97e986af166b12d01f05d2e04f978a9c1c680ea1988e0bc4b6"}, + {file = "multidict-6.7.0-cp313-cp313t-win_arm64.whl", hash = "sha256:b4121773c49a0776461f4a904cdf6264c88e42218aaa8407e803ca8025872792"}, + {file = "multidict-6.7.0-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:3bab1e4aff7adaa34410f93b1f8e57c4b36b9af0426a76003f441ee1d3c7e842"}, + {file = "multidict-6.7.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:b8512bac933afc3e45fb2b18da8e59b78d4f408399a960339598374d4ae3b56b"}, + {file = "multidict-6.7.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:79dcf9e477bc65414ebfea98ffd013cb39552b5ecd62908752e0e413d6d06e38"}, + {file = "multidict-6.7.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:31bae522710064b5cbeddaf2e9f32b1abab70ac6ac91d42572502299e9953128"}, + {file = "multidict-6.7.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4a0df7ff02397bb63e2fd22af2c87dfa39e8c7f12947bc524dbdc528282c7e34"}, + {file = "multidict-6.7.0-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:7a0222514e8e4c514660e182d5156a415c13ef0aabbd71682fc714e327b95e99"}, + {file = "multidict-6.7.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2397ab4daaf2698eb51a76721e98db21ce4f52339e535725de03ea962b5a3202"}, + {file = "multidict-6.7.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8891681594162635948a636c9fe0ff21746aeb3dd5463f6e25d9bea3a8a39ca1"}, + {file = "multidict-6.7.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18706cc31dbf402a7945916dd5cddf160251b6dab8a2c5f3d6d5a55949f676b3"}, + {file = "multidict-6.7.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:f844a1bbf1d207dd311a56f383f7eda2d0e134921d45751842d8235e7778965d"}, + {file = "multidict-6.7.0-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:d4393e3581e84e5645506923816b9cc81f5609a778c7e7534054091acc64d1c6"}, + {file = "multidict-6.7.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:fbd18dc82d7bf274b37aa48d664534330af744e03bccf696d6f4c6042e7d19e7"}, + {file = "multidict-6.7.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:b6234e14f9314731ec45c42fc4554b88133ad53a09092cc48a88e771c125dadb"}, + {file = "multidict-6.7.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:08d4379f9744d8f78d98c8673c06e202ffa88296f009c71bbafe8a6bf847d01f"}, + {file = "multidict-6.7.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:9fe04da3f79387f450fd0061d4dd2e45a72749d31bf634aecc9e27f24fdc4b3f"}, + {file = "multidict-6.7.0-cp314-cp314-win32.whl", hash = "sha256:fbafe31d191dfa7c4c51f7a6149c9fb7e914dcf9ffead27dcfd9f1ae382b3885"}, + {file = "multidict-6.7.0-cp314-cp314-win_amd64.whl", hash = "sha256:2f67396ec0310764b9222a1728ced1ab638f61aadc6226f17a71dd9324f9a99c"}, + {file = "multidict-6.7.0-cp314-cp314-win_arm64.whl", hash = "sha256:ba672b26069957ee369cfa7fc180dde1fc6f176eaf1e6beaf61fbebbd3d9c000"}, + {file = "multidict-6.7.0-cp314-cp314t-macosx_10_13_universal2.whl", hash = 
"sha256:c1dcc7524066fa918c6a27d61444d4ee7900ec635779058571f70d042d86ed63"}, + {file = "multidict-6.7.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:27e0b36c2d388dc7b6ced3406671b401e84ad7eb0656b8f3a2f46ed0ce483718"}, + {file = "multidict-6.7.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2a7baa46a22e77f0988e3b23d4ede5513ebec1929e34ee9495be535662c0dfe2"}, + {file = "multidict-6.7.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:7bf77f54997a9166a2f5675d1201520586439424c2511723a7312bdb4bcc034e"}, + {file = "multidict-6.7.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e011555abada53f1578d63389610ac8a5400fc70ce71156b0aa30d326f1a5064"}, + {file = "multidict-6.7.0-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:28b37063541b897fd6a318007373930a75ca6d6ac7c940dbe14731ffdd8d498e"}, + {file = "multidict-6.7.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:05047ada7a2fde2631a0ed706f1fd68b169a681dfe5e4cf0f8e4cb6618bbc2cd"}, + {file = "multidict-6.7.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:716133f7d1d946a4e1b91b1756b23c088881e70ff180c24e864c26192ad7534a"}, + {file = "multidict-6.7.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d1bed1b467ef657f2a0ae62844a607909ef1c6889562de5e1d505f74457d0b96"}, + {file = "multidict-6.7.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:ca43bdfa5d37bd6aee89d85e1d0831fb86e25541be7e9d376ead1b28974f8e5e"}, + {file = "multidict-6.7.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:44b546bd3eb645fd26fb949e43c02a25a2e632e2ca21a35e2e132c8105dc8599"}, + {file = "multidict-6.7.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:a6ef16328011d3f468e7ebc326f24c1445f001ca1dec335b2f8e66bed3006394"}, + {file = "multidict-6.7.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:5aa873cbc8e593d361ae65c68f85faadd755c3295ea2c12040ee146802f23b38"}, + {file = "multidict-6.7.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:3d7b6ccce016e29df4b7ca819659f516f0bc7a4b3efa3bb2012ba06431b044f9"}, + {file = "multidict-6.7.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:171b73bd4ee683d307599b66793ac80981b06f069b62eea1c9e29c9241aa66b0"}, + {file = "multidict-6.7.0-cp314-cp314t-win32.whl", hash = "sha256:b2d7f80c4e1fd010b07cb26820aae86b7e73b681ee4889684fb8d2d4537aab13"}, + {file = "multidict-6.7.0-cp314-cp314t-win_amd64.whl", hash = "sha256:09929cab6fcb68122776d575e03c6cc64ee0b8fca48d17e135474b042ce515cd"}, + {file = "multidict-6.7.0-cp314-cp314t-win_arm64.whl", hash = "sha256:cc41db090ed742f32bd2d2c721861725e6109681eddf835d0a82bd3a5c382827"}, + {file = "multidict-6.7.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:363eb68a0a59bd2303216d2346e6c441ba10d36d1f9969fcb6f1ba700de7bb5c"}, + {file = "multidict-6.7.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d874eb056410ca05fed180b6642e680373688efafc7f077b2a2f61811e873a40"}, + {file = "multidict-6.7.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8b55d5497b51afdfde55925e04a022f1de14d4f4f25cdfd4f5d9b0aa96166851"}, + {file = "multidict-6.7.0-cp39-cp39-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:f8e5c0031b90ca9ce555e2e8fd5c3b02a25f14989cbc310701823832c99eb687"}, + {file = 
"multidict-6.7.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9cf41880c991716f3c7cec48e2f19ae4045fc9db5fc9cff27347ada24d710bb5"}, + {file = "multidict-6.7.0-cp39-cp39-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:8cfc12a8630a29d601f48d47787bd7eb730e475e83edb5d6c5084317463373eb"}, + {file = "multidict-6.7.0-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3996b50c3237c4aec17459217c1e7bbdead9a22a0fcd3c365564fbd16439dde6"}, + {file = "multidict-6.7.0-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:7f5170993a0dd3ab871c74f45c0a21a4e2c37a2f2b01b5f722a2ad9c6650469e"}, + {file = "multidict-6.7.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ec81878ddf0e98817def1e77d4f50dae5ef5b0e4fe796fae3bd674304172416e"}, + {file = "multidict-6.7.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9281bf5b34f59afbc6b1e477a372e9526b66ca446f4bf62592839c195a718b32"}, + {file = "multidict-6.7.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:68af405971779d8b37198726f2b6fe3955db846fee42db7a4286fc542203934c"}, + {file = "multidict-6.7.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:3ba3ef510467abb0667421a286dc906e30eb08569365f5cdb131d7aff7c2dd84"}, + {file = "multidict-6.7.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b61189b29081a20c7e4e0b49b44d5d44bb0dc92be3c6d06a11cc043f81bf9329"}, + {file = "multidict-6.7.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:fb287618b9c7aa3bf8d825f02d9201b2f13078a5ed3b293c8f4d953917d84d5e"}, + {file = "multidict-6.7.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:521f33e377ff64b96c4c556b81c55d0cfffb96a11c194fd0c3f1e56f3d8dd5a4"}, + {file = "multidict-6.7.0-cp39-cp39-win32.whl", hash = "sha256:ce8fdc2dca699f8dbf055a61d73eaa10482569ad20ee3c36ef9641f69afa8c91"}, + {file = "multidict-6.7.0-cp39-cp39-win_amd64.whl", hash = "sha256:7e73299c99939f089dd9b2120a04a516b95cdf8c1cd2b18c53ebf0de80b1f18f"}, + {file = "multidict-6.7.0-cp39-cp39-win_arm64.whl", hash = "sha256:6bdce131e14b04fd34a809b6380dbfd826065c3e2fe8a50dbae659fa0c390546"}, + {file = "multidict-6.7.0-py3-none-any.whl", hash = "sha256:394fc5c42a333c9ffc3e421a4c85e08580d990e08b99f6bf35b4132114c5dcb3"}, + {file = "multidict-6.7.0.tar.gz", hash = "sha256:c6e99d9a65ca282e578dfea819cfa9c0a62b2499d8677392e09feaf305e9e6f5"}, ] [[package]] @@ -3054,6 +3753,7 @@ version = "1.1.0" description = "Type system extensions for programs checked with the mypy type checker." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505"}, {file = "mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558"}, @@ -3065,6 +3765,7 @@ version = "0.10.2" description = "A client library for executing notebooks. Formerly nbconvert's ExecutePreprocessor." optional = false python-versions = ">=3.9.0" +groups = ["dev"] files = [ {file = "nbclient-0.10.2-py3-none-any.whl", hash = "sha256:4ffee11e788b4a27fabeb7955547e4318a5298f34342a4bfd01f2e1faaeadc3d"}, {file = "nbclient-0.10.2.tar.gz", hash = "sha256:90b7fc6b810630db87a6d0c2250b1f0ab4cf4d3c27a299b0cde78a4ed3fd9193"}, @@ -3087,6 +3788,7 @@ version = "7.16.6" description = "Converting Jupyter Notebooks (.ipynb files) to other formats. 
Output formats include asciidoc, html, latex, markdown, pdf, py, rst, script. nbconvert can be used both as a Python library (`import nbconvert`) or as a command line tool (invoked as `jupyter nbconvert ...`)." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "nbconvert-7.16.6-py3-none-any.whl", hash = "sha256:1375a7b67e0c2883678c48e506dc320febb57685e5ee67faa51b18a90f3a712b"}, {file = "nbconvert-7.16.6.tar.gz", hash = "sha256:576a7e37c6480da7b8465eefa66c17844243816ce1ccc372633c6b71c3c0f582"}, @@ -3123,6 +3825,7 @@ version = "5.10.4" description = "The Jupyter Notebook format" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "nbformat-5.10.4-py3-none-any.whl", hash = "sha256:3b48d6c8fbca4b299bf3982ea7db1af21580e4fec269ad087b9e81588891200b"}, {file = "nbformat-5.10.4.tar.gz", hash = "sha256:322168b14f937a5d11362988ecac2a4952d3d8e3a2cbeb2319584631226d5b3a"}, @@ -3144,6 +3847,7 @@ version = "1.6.0" description = "Patch asyncio to allow nested event loops" optional = false python-versions = ">=3.5" +groups = ["dev"] files = [ {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"}, {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"}, @@ -3155,6 +3859,7 @@ version = "3.5" description = "Python package for creating and manipulating graphs and networks" optional = false python-versions = ">=3.11" +groups = ["main"] files = [ {file = "networkx-3.5-py3-none-any.whl", hash = "sha256:0030d386a9a06dee3565298b4a734b68589749a544acbb6c412dc9e2489ec6ec"}, {file = "networkx-3.5.tar.gz", hash = "sha256:d4c6f9cf81f52d69230866796b82afbccdec3db7ae4fbd1b65ea750feed50037"}, @@ -3175,6 +3880,7 @@ version = "1.9.1" description = "Node.js virtual environment builder" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["dev"] files = [ {file = "nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9"}, {file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"}, @@ -3186,6 +3892,7 @@ version = "0.2.4" description = "A shim layer for notebook traits and config" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "notebook_shim-0.2.4-py3-none-any.whl", hash = "sha256:411a5be4e9dc882a074ccbcae671eda64cceb068767e9a3419096986560e1cef"}, {file = "notebook_shim-0.2.4.tar.gz", hash = "sha256:b4b2cfa1b65d98307ca24361f5b30fe785b53c3fd07b7a47e89acb5e6ac638cb"}, @@ -3203,6 +3910,7 @@ version = "1.26.4" description = "Fundamental package for array computing in Python" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"}, {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"}, @@ -3244,66 +3952,72 @@ files = [ [[package]] name = "nvidia-cublas-cu12" -version = "12.6.4.1" +version = "12.8.4.1" description = "CUBLAS native runtime libraries" optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_cublas_cu12-12.6.4.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:08ed2686e9875d01b58e3cb379c6896df8e76c75e0d4a7f7dace3d7b6d9ef8eb"}, - {file = "nvidia_cublas_cu12-12.6.4.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:235f728d6e2a409eddf1df58d5b0921cf80cfa9e72b9f2775ccb7b4a87984668"}, - {file = "nvidia_cublas_cu12-12.6.4.1-py3-none-win_amd64.whl", hash = "sha256:9e4fa264f4d8a4eb0cdbd34beadc029f453b3bafae02401e999cf3d5a5af75f8"}, + {file = "nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:b86f6dd8935884615a0683b663891d43781b819ac4f2ba2b0c9604676af346d0"}, + {file = "nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:8ac4e771d5a348c551b2a426eda6193c19aa630236b418086020df5ba9667142"}, + {file = "nvidia_cublas_cu12-12.8.4.1-py3-none-win_amd64.whl", hash = "sha256:47e9b82132fa8d2b4944e708049229601448aaad7e6f296f630f2d1a32de35af"}, ] [[package]] name = "nvidia-cuda-cupti-cu12" -version = "12.6.80" +version = "12.8.90" description = "CUDA profiling tools runtime libs." optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:166ee35a3ff1587f2490364f90eeeb8da06cd867bd5b701bf7f9a02b78bc63fc"}, - {file = "nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_aarch64.whl", hash = "sha256:358b4a1d35370353d52e12f0a7d1769fc01ff74a191689d3870b2123156184c4"}, - {file = "nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6768bad6cab4f19e8292125e5f1ac8aa7d1718704012a0e3272a6f61c4bce132"}, - {file = "nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a3eff6cdfcc6a4c35db968a06fcadb061cbc7d6dde548609a941ff8701b98b73"}, - {file = "nvidia_cuda_cupti_cu12-12.6.80-py3-none-win_amd64.whl", hash = "sha256:bbe6ae76e83ce5251b56e8c8e61a964f757175682bbad058b170b136266ab00a"}, + {file = "nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4412396548808ddfed3f17a467b104ba7751e6b58678a4b840675c56d21cf7ed"}, + {file = "nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ea0cb07ebda26bb9b29ba82cda34849e73c166c18162d3913575b0c9db9a6182"}, + {file = "nvidia_cuda_cupti_cu12-12.8.90-py3-none-win_amd64.whl", hash = "sha256:bb479dcdf7e6d4f8b0b01b115260399bf34154a1a2e9fe11c85c517d87efd98e"}, ] [[package]] name = "nvidia-cuda-nvrtc-cu12" -version = "12.6.77" +version = "12.8.93" description = "NVRTC native runtime libraries" optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5847f1d6e5b757f1d2b3991a01082a44aad6f10ab3c5c0213fa3e25bddc25a13"}, - {file = "nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:35b0cc6ee3a9636d5409133e79273ce1f3fd087abb0532d2d2e8fff1fe9efc53"}, - {file = "nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-win_amd64.whl", hash = "sha256:f7007dbd914c56bd80ea31bc43e8e149da38f68158f423ba845fc3292684e45a"}, + {file = "nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:a7756528852ef889772a84c6cd89d41dfa74667e24cca16bb31f8f061e3e9994"}, + {file = "nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = 
"sha256:fc1fec1e1637854b4c0a65fb9a8346b51dd9ee69e61ebaccc82058441f15bce8"}, + {file = "nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-win_amd64.whl", hash = "sha256:7a4b6b2904850fe78e0bd179c4b655c404d4bb799ef03ddc60804247099ae909"}, ] [[package]] name = "nvidia-cuda-runtime-cu12" -version = "12.6.77" +version = "12.8.90" description = "CUDA Runtime native Libraries" optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6116fad3e049e04791c0256a9778c16237837c08b27ed8c8401e2e45de8d60cd"}, - {file = "nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d461264ecb429c84c8879a7153499ddc7b19b5f8d84c204307491989a365588e"}, - {file = "nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ba3b56a4f896141e25e19ab287cd71e52a6a0f4b29d0d31609f60e3b4d5219b7"}, - {file = "nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a84d15d5e1da416dd4774cb42edf5e954a3e60cc945698dc1d5be02321c44dc8"}, - {file = "nvidia_cuda_runtime_cu12-12.6.77-py3-none-win_amd64.whl", hash = "sha256:86c58044c824bf3c173c49a2dbc7a6c8b53cb4e4dca50068be0bf64e9dab3f7f"}, + {file = "nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:52bf7bbee900262ffefe5e9d5a2a69a30d97e2bc5bb6cc866688caa976966e3d"}, + {file = "nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adade8dcbd0edf427b7204d480d6066d33902cab2a4707dcfc48a2d0fd44ab90"}, + {file = "nvidia_cuda_runtime_cu12-12.8.90-py3-none-win_amd64.whl", hash = "sha256:c0c6027f01505bfed6c3b21ec546f69c687689aad5f1a377554bc6ca4aa993a8"}, ] [[package]] name = "nvidia-cudnn-cu12" -version = "9.5.1.17" +version = "9.10.2.21" description = "cuDNN runtime libraries" optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_cudnn_cu12-9.5.1.17-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:9fd4584468533c61873e5fda8ca41bac3a38bcb2d12350830c69b0a96a7e4def"}, - {file = "nvidia_cudnn_cu12-9.5.1.17-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:30ac3869f6db17d170e0e556dd6cc5eee02647abc31ca856634d5a40f82c15b2"}, - {file = "nvidia_cudnn_cu12-9.5.1.17-py3-none-win_amd64.whl", hash = "sha256:d7af0f8a4f3b4b9dbb3122f2ef553b45694ed9c384d5a75bab197b8eefb79ab8"}, + {file = "nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:c9132cc3f8958447b4910a1720036d9eff5928cc3179b0a51fb6d167c6cc87d8"}, + {file = "nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8"}, + {file = "nvidia_cudnn_cu12-9.10.2.21-py3-none-win_amd64.whl", hash = "sha256:c6288de7d63e6cf62988f0923f96dc339cea362decb1bf5b3141883392a7d65e"}, ] [package.dependencies] @@ -3311,16 +4025,16 @@ nvidia-cublas-cu12 = "*" [[package]] name = "nvidia-cufft-cu12" -version = "11.3.0.4" +version = "11.3.3.83" description = "CUFFT native runtime libraries" optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = 
"sha256:d16079550df460376455cba121db6564089176d9bac9e4f360493ca4741b22a6"}, - {file = "nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8510990de9f96c803a051822618d42bf6cb8f069ff3f48d93a8486efdacb48fb"}, - {file = "nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ccba62eb9cef5559abd5e0d54ceed2d9934030f51163df018532142a8ec533e5"}, - {file = "nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_x86_64.whl", hash = "sha256:768160ac89f6f7b459bee747e8d175dbf53619cfe74b2a5636264163138013ca"}, - {file = "nvidia_cufft_cu12-11.3.0.4-py3-none-win_amd64.whl", hash = "sha256:6048ebddfb90d09d2707efb1fd78d4e3a77cb3ae4dc60e19aab6be0ece2ae464"}, + {file = "nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:848ef7224d6305cdb2a4df928759dca7b1201874787083b6e7550dd6765ce69a"}, + {file = "nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74"}, + {file = "nvidia_cufft_cu12-11.3.3.83-py3-none-win_amd64.whl", hash = "sha256:7a64a98ef2a7c47f905aaf8931b69a3a43f27c55530c698bb2ed7c75c0b42cb7"}, ] [package.dependencies] @@ -3328,41 +4042,43 @@ nvidia-nvjitlink-cu12 = "*" [[package]] name = "nvidia-cufile-cu12" -version = "1.11.1.6" +version = "1.13.1.3" description = "cuFile GPUDirect libraries" optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_cufile_cu12-1.11.1.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cc23469d1c7e52ce6c1d55253273d32c565dd22068647f3aa59b3c6b005bf159"}, - {file = "nvidia_cufile_cu12-1.11.1.6-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:8f57a0051dcf2543f6dc2b98a98cb2719c37d3cee1baba8965d57f3bbc90d4db"}, + {file = "nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d069003be650e131b21c932ec3d8969c1715379251f8d23a1860554b1cb24fc"}, + {file = "nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:4beb6d4cce47c1a0f1013d72e02b0994730359e17801d395bdcbf20cfb3bb00a"}, ] [[package]] name = "nvidia-curand-cu12" -version = "10.3.7.77" +version = "10.3.9.90" description = "CURAND native runtime libraries" optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:6e82df077060ea28e37f48a3ec442a8f47690c7499bff392a5938614b56c98d8"}, - {file = "nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a42cd1344297f70b9e39a1e4f467a4e1c10f1da54ff7a85c12197f6c652c8bdf"}, - {file = "nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:99f1a32f1ac2bd134897fc7a203f779303261268a65762a623bf30cc9fe79117"}, - {file = "nvidia_curand_cu12-10.3.7.77-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:7b2ed8e95595c3591d984ea3603dd66fe6ce6812b886d59049988a712ed06b6e"}, - {file = "nvidia_curand_cu12-10.3.7.77-py3-none-win_amd64.whl", hash = "sha256:6d6d935ffba0f3d439b7cd968192ff068fafd9018dbf1b85b37261b13cfc9905"}, + {file = "nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:dfab99248034673b779bc6decafdc3404a8a6f502462201f2f31f11354204acd"}, + {file = 
"nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:b32331d4f4df5d6eefa0554c565b626c7216f87a06a4f56fab27c3b68a830ec9"}, + {file = "nvidia_curand_cu12-10.3.9.90-py3-none-win_amd64.whl", hash = "sha256:f149a8ca457277da854f89cf282d6ef43176861926c7ac85b2a0fbd237c587ec"}, ] [[package]] name = "nvidia-cusolver-cu12" -version = "11.7.1.2" +version = "11.7.3.90" description = "CUDA solver native runtime libraries" optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0ce237ef60acde1efc457335a2ddadfd7610b892d94efee7b776c64bb1cac9e0"}, - {file = "nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e9e49843a7707e42022babb9bcfa33c29857a93b88020c4e4434656a655b698c"}, - {file = "nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:6cf28f17f64107a0c4d7802be5ff5537b2130bfc112f25d5a30df227058ca0e6"}, - {file = "nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:dbbe4fc38ec1289c7e5230e16248365e375c3673c9c8bac5796e2e20db07f56e"}, - {file = "nvidia_cusolver_cu12-11.7.1.2-py3-none-win_amd64.whl", hash = "sha256:6813f9d8073f555444a8705f3ab0296d3e1cb37a16d694c5fc8b862a0d8706d7"}, + {file = "nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:db9ed69dbef9715071232caa9b69c52ac7de3a95773c2db65bdba85916e4e5c0"}, + {file = "nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450"}, + {file = "nvidia_cusolver_cu12-11.7.3.90-py3-none-win_amd64.whl", hash = "sha256:4a550db115fcabc4d495eb7d39ac8b58d4ab5d8e63274d3754df1c0ad6a22d34"}, ] [package.dependencies] @@ -3372,16 +4088,16 @@ nvidia-nvjitlink-cu12 = "*" [[package]] name = "nvidia-cusparse-cu12" -version = "12.5.4.2" +version = "12.5.8.93" description = "CUSPARSE native runtime libraries" optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d25b62fb18751758fe3c93a4a08eff08effedfe4edf1c6bb5afd0890fe88f887"}, - {file = "nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7aa32fa5470cf754f72d1116c7cbc300b4e638d3ae5304cfa4a638a5b87161b1"}, - {file = "nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7556d9eca156e18184b94947ade0fba5bb47d69cec46bf8660fd2c71a4b48b73"}, - {file = "nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:23749a6571191a215cb74d1cdbff4a86e7b19f1200c071b3fcf844a5bea23a2f"}, - {file = "nvidia_cusparse_cu12-12.5.4.2-py3-none-win_amd64.whl", hash = "sha256:4acb8c08855a26d737398cba8fb6f8f5045d93f82612b4cfd84645a2332ccf20"}, + {file = "nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b6c161cb130be1a07a27ea6923df8141f3c295852f4b260c65f18f3e0a091dc"}, + {file = "nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b"}, + {file = "nvidia_cusparse_cu12-12.5.8.93-py3-none-win_amd64.whl", hash = 
"sha256:9a33604331cb2cac199f2e7f5104dfbb8a5a898c367a53dfda9ff2acb6b6b4dd"}, ] [package.dependencies] @@ -3389,123 +4105,162 @@ nvidia-nvjitlink-cu12 = "*" [[package]] name = "nvidia-cusparselt-cu12" -version = "0.6.3" +version = "0.7.1" description = "NVIDIA cuSPARSELt" optional = false python-versions = "*" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_cusparselt_cu12-0.6.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8371549623ba601a06322af2133c4a44350575f5a3108fb75f3ef20b822ad5f1"}, - {file = "nvidia_cusparselt_cu12-0.6.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:e5c8a26c36445dd2e6812f1177978a24e2d37cacce7e090f297a688d1ec44f46"}, - {file = "nvidia_cusparselt_cu12-0.6.3-py3-none-win_amd64.whl", hash = "sha256:3b325bcbd9b754ba43df5a311488fca11a6b5dc3d11df4d190c000cf1a0765c7"}, + {file = "nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8878dce784d0fac90131b6817b607e803c36e629ba34dc5b433471382196b6a5"}, + {file = "nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1bb701d6b930d5a7cea44c19ceb973311500847f81b634d802b7b539dc55623"}, + {file = "nvidia_cusparselt_cu12-0.7.1-py3-none-win_amd64.whl", hash = "sha256:f67fbb5831940ec829c9117b7f33807db9f9678dc2a617fbe781cac17b4e1075"}, ] [[package]] name = "nvidia-nccl-cu12" -version = "2.26.2" +version = "2.27.5" description = "NVIDIA Collective Communication Library (NCCL) Runtime" optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine != \"aarch64\"" files = [ - {file = "nvidia_nccl_cu12-2.26.2-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5c196e95e832ad30fbbb50381eb3cbd1fadd5675e587a548563993609af19522"}, - {file = "nvidia_nccl_cu12-2.26.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:694cf3879a206553cc9d7dbda76b13efaf610fdb70a50cba303de1b0d1530ac6"}, + {file = "nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:31432ad4d1fb1004eb0c56203dc9bc2178a1ba69d1d9e02d64a6938ab5e40e7a"}, + {file = "nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457"}, ] [[package]] name = "nvidia-nvjitlink-cu12" -version = "12.6.85" +version = "12.8.93" description = "Nvidia JIT LTO Library" optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" +files = [ + {file = "nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:81ff63371a7ebd6e6451970684f916be2eab07321b73c9d244dc2b4da7f73b88"}, + {file = "nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:adccd7161ace7261e01bb91e44e88da350895c270d23f744f0820c818b7229e7"}, + {file = "nvidia_nvjitlink_cu12-12.8.93-py3-none-win_amd64.whl", hash = "sha256:bd93fbeeee850917903583587f4fc3a4eafa022e34572251368238ab5e6bd67f"}, +] + +[[package]] +name = "nvidia-nvshmem-cu12" +version = "3.3.20" +description = "NVSHMEM creates a global address space that provides efficient and scalable communication for NVIDIA GPU clusters." 
+optional = false +python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:eedc36df9e88b682efe4309aa16b5b4e78c2407eac59e8c10a6a47535164369a"}, - {file = "nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cf4eaa7d4b6b543ffd69d6abfb11efdeb2db48270d94dfd3a452c24150829e41"}, - {file = "nvidia_nvjitlink_cu12-12.6.85-py3-none-win_amd64.whl", hash = "sha256:e61120e52ed675747825cdd16febc6a0730537451d867ee58bee3853b1b13d1c"}, + {file = "nvidia_nvshmem_cu12-3.3.20-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b0b960da3842212758e4fa4696b94f129090b30e5122fea3c5345916545cff0"}, + {file = "nvidia_nvshmem_cu12-3.3.20-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d00f26d3f9b2e3c3065be895e3059d6479ea5c638a3f38c9fec49b1b9dd7c1e5"}, ] [[package]] name = "nvidia-nvtx-cu12" -version = "12.6.77" +version = "12.8.90" description = "NVIDIA Tools Extension" optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f44f8d86bb7d5629988d61c8d3ae61dddb2015dee142740536bc7481b022fe4b"}, - {file = "nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:adcaabb9d436c9761fca2b13959a2d237c5f9fd406c8e4b723c695409ff88059"}, - {file = "nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b90bed3df379fa79afbd21be8e04a0314336b8ae16768b58f2d34cb1d04cd7d2"}, - {file = "nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:6574241a3ec5fdc9334353ab8c479fe75841dbe8f4532a8fc97ce63503330ba1"}, - {file = "nvidia_nvtx_cu12-12.6.77-py3-none-win_amd64.whl", hash = "sha256:2fb11a4af04a5e6c84073e6404d26588a34afd35379f0855a99797897efa75c0"}, + {file = "nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d7ad891da111ebafbf7e015d34879f7112832fc239ff0d7d776b6cb685274615"}, + {file = "nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f"}, + {file = "nvidia_nvtx_cu12-12.8.90-py3-none-win_amd64.whl", hash = "sha256:619c8304aedc69f02ea82dd244541a83c3d9d40993381b3b590f1adaed3db41e"}, ] [[package]] name = "openai" -version = "1.84.0" +version = "2.8.0" description = "The official Python library for the openai API" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "openai-1.84.0-py3-none-any.whl", hash = "sha256:7ec4436c3c933d68dc0f5a0cef0cb3dbc0864a54d62bddaf2ed5f3d521844711"}, - {file = "openai-1.84.0.tar.gz", hash = "sha256:4caa43bdab262cc75680ce1a2322cfc01626204074f7e8d9939ab372acf61698"}, + {file = "openai-2.8.0-py3-none-any.whl", hash = "sha256:ba975e347f6add2fe13529ccb94d54a578280e960765e5224c34b08d7e029ddf"}, + {file = "openai-2.8.0.tar.gz", hash = "sha256:4851908f6d6fcacbd47ba659c5ac084f7725b752b6bfa1e948b6fbfc111a6bad"}, ] [package.dependencies] anyio = ">=3.5.0,<5" distro = ">=1.7.0,<2" httpx = ">=0.23.0,<1" -jiter = ">=0.4.0,<1" +jiter = ">=0.10.0,<1" pydantic = ">=1.9.0,<3" sniffio = "*" tqdm = ">4" typing-extensions = ">=4.11,<5" [package.extras] 
+aiohttp = ["aiohttp", "httpx-aiohttp (>=0.1.9)"] datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] realtime = ["websockets (>=13,<16)"] voice-helpers = ["numpy (>=2.0.2)", "sounddevice (>=0.5.1)"] [[package]] name = "opentelemetry-api" -version = "1.34.0" +version = "1.38.0" description = "OpenTelemetry Python API" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "opentelemetry_api-1.34.0-py3-none-any.whl", hash = "sha256:390b81984affe4453180820ca518de55e3be051111e70cc241bb3b0071ca3a2c"}, - {file = "opentelemetry_api-1.34.0.tar.gz", hash = "sha256:48d167589134799093005b7f7f347c69cc67859c693b17787f334fbe8871279f"}, + {file = "opentelemetry_api-1.38.0-py3-none-any.whl", hash = "sha256:2891b0197f47124454ab9f0cf58f3be33faca394457ac3e09daba13ff50aa582"}, + {file = "opentelemetry_api-1.38.0.tar.gz", hash = "sha256:f4c193b5e8acb0912b06ac5b16321908dd0843d75049c091487322284a3eea12"}, ] [package.dependencies] importlib-metadata = ">=6.0,<8.8.0" typing-extensions = ">=4.5.0" +[[package]] +name = "opentelemetry-proto" +version = "1.38.0" +description = "OpenTelemetry Python Proto" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "opentelemetry_proto-1.38.0-py3-none-any.whl", hash = "sha256:b6ebe54d3217c42e45462e2a1ae28c3e2bf2ec5a5645236a490f55f45f1a0a18"}, + {file = "opentelemetry_proto-1.38.0.tar.gz", hash = "sha256:88b161e89d9d372ce723da289b7da74c3a8354a8e5359992be813942969ed468"}, +] + +[package.dependencies] +protobuf = ">=5.0,<7.0" + [[package]] name = "opentelemetry-sdk" -version = "1.34.0" +version = "1.38.0" description = "OpenTelemetry Python SDK" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "opentelemetry_sdk-1.34.0-py3-none-any.whl", hash = "sha256:7850bcd5b5c95f9aae48603d6592bdad5c7bdef50c03e06393f8f457d891fe32"}, - {file = "opentelemetry_sdk-1.34.0.tar.gz", hash = "sha256:719559622afcd515c2aec462ccb749ba2e70075a01df45837623643814d33716"}, + {file = "opentelemetry_sdk-1.38.0-py3-none-any.whl", hash = "sha256:1c66af6564ecc1553d72d811a01df063ff097cdc82ce188da9951f93b8d10f6b"}, + {file = "opentelemetry_sdk-1.38.0.tar.gz", hash = "sha256:93df5d4d871ed09cb4272305be4d996236eedb232253e3ab864c8620f051cebe"}, ] [package.dependencies] -opentelemetry-api = "1.34.0" -opentelemetry-semantic-conventions = "0.55b0" +opentelemetry-api = "1.38.0" +opentelemetry-semantic-conventions = "0.59b0" typing-extensions = ">=4.5.0" [[package]] name = "opentelemetry-semantic-conventions" -version = "0.55b0" +version = "0.59b0" description = "OpenTelemetry Semantic Conventions" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "opentelemetry_semantic_conventions-0.55b0-py3-none-any.whl", hash = "sha256:63bb15b67377700e51c422d0d24092ca6ce9f3a4cb6f032375aa8af1fc2aab65"}, - {file = "opentelemetry_semantic_conventions-0.55b0.tar.gz", hash = "sha256:933d2e20c2dbc0f9b2f4f52138282875b4b14c66c491f5273bcdef1781368e9c"}, + {file = "opentelemetry_semantic_conventions-0.59b0-py3-none-any.whl", hash = "sha256:35d3b8833ef97d614136e253c1da9342b4c3c083bbaf29ce31d572a1c3825eed"}, + {file = "opentelemetry_semantic_conventions-0.59b0.tar.gz", hash = "sha256:7a6db3f30d70202d5bf9fa4b69bc866ca6a30437287de6c510fb594878aed6b0"}, ] [package.dependencies] -opentelemetry-api = "1.34.0" +opentelemetry-api = "1.38.0" typing-extensions = ">=4.5.0" [[package]] @@ -3514,6 +4269,8 @@ version = "7.7.0" description = "A decorator to automatically detect mismatch when overriding a method." 
optional = false python-versions = ">=3.6" +groups = ["dev"] +markers = "python_version == \"3.11\"" files = [ {file = "overrides-7.7.0-py3-none-any.whl", hash = "sha256:c7ed9d062f78b8e4c1a7b70bd8796b35ead4d9f510227ef9c5dc7626c60d7e49"}, {file = "overrides-7.7.0.tar.gz", hash = "sha256:55158fa3d93b98cc75299b1e67078ad9003ca27945c76162c1c0766d6f91820a"}, @@ -3525,6 +4282,7 @@ version = "25.0" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484"}, {file = "packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f"}, @@ -3536,6 +4294,7 @@ version = "2.2.0" description = "Powerful data structures for data analysis, time series, and statistics" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "pandas-2.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8108ee1712bb4fa2c16981fba7e68b3f6ea330277f5ca34fa8d557e986a11670"}, {file = "pandas-2.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:736da9ad4033aeab51d067fc3bd69a0ba36f5a60f66a527b3d72e2030e63280a"}, @@ -3607,6 +4366,7 @@ version = "1.5.1" description = "Utilities for writing pandoc filters in python" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["dev"] files = [ {file = "pandocfilters-1.5.1-py2.py3-none-any.whl", hash = "sha256:93be382804a9cdb0a7267585f157e5d1731bbe5545a85b268d6f5fe6232de2bc"}, {file = "pandocfilters-1.5.1.tar.gz", hash = "sha256:002b4a555ee4ebc03f8b66307e287fa492e4a77b4ea14d3f934328297bb4939e"}, @@ -3614,13 +4374,14 @@ files = [ [[package]] name = "parso" -version = "0.8.4" +version = "0.8.5" description = "A Python Parser" optional = false python-versions = ">=3.6" +groups = ["dev"] files = [ - {file = "parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18"}, - {file = "parso-0.8.4.tar.gz", hash = "sha256:eb3a7b58240fb99099a345571deecc0f9540ea5f4dd2fe14c2a99d6b281ab92d"}, + {file = "parso-0.8.5-py2.py3-none-any.whl", hash = "sha256:646204b5ee239c396d040b90f9e272e9a8017c630092bf59980beb62fd033887"}, + {file = "parso-0.8.5.tar.gz", hash = "sha256:034d7354a9a018bdce352f48b2a8a450f05e9d6ee85db84764e9b6bd96dafe5a"}, ] [package.extras] @@ -3633,6 +4394,7 @@ version = "0.12.1" description = "Utility library for gitignore style pattern matching of file paths." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"}, {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, @@ -3640,13 +4402,14 @@ files = [ [[package]] name = "patsy" -version = "1.0.1" +version = "1.0.2" description = "A Python package for describing statistical models and for building design matrices." 
optional = false python-versions = ">=3.6" +groups = ["main"] files = [ - {file = "patsy-1.0.1-py2.py3-none-any.whl", hash = "sha256:751fb38f9e97e62312e921a1954b81e1bb2bcda4f5eeabaf94db251ee791509c"}, - {file = "patsy-1.0.1.tar.gz", hash = "sha256:e786a9391eec818c054e359b737bbce692f051aee4c661f4141cc88fb459c0c4"}, + {file = "patsy-1.0.2-py2.py3-none-any.whl", hash = "sha256:37bfddbc58fcf0362febb5f54f10743f8b21dd2aa73dec7e7ef59d1b02ae668a"}, + {file = "patsy-1.0.2.tar.gz", hash = "sha256:cdc995455f6233e90e22de72c37fcadb344e7586fb83f06696f54d92f8ce74c0"}, ] [package.dependencies] @@ -3655,26 +4418,14 @@ numpy = ">=1.4" [package.extras] test = ["pytest", "pytest-cov", "scipy"] -[[package]] -name = "pbr" -version = "6.1.1" -description = "Python Build Reasonableness" -optional = false -python-versions = ">=2.6" -files = [ - {file = "pbr-6.1.1-py2.py3-none-any.whl", hash = "sha256:38d4daea5d9fa63b3f626131b9d34947fd0c8be9b05a29276870580050a25a76"}, - {file = "pbr-6.1.1.tar.gz", hash = "sha256:93ea72ce6989eb2eed99d0f75721474f69ad88128afdef5ac377eb797c4bf76b"}, -] - -[package.dependencies] -setuptools = "*" - [[package]] name = "pexpect" version = "4.9.0" description = "Pexpect allows easy control of interactive console applications." optional = false python-versions = "*" +groups = ["dev"] +markers = "sys_platform != \"win32\" and sys_platform != \"emscripten\"" files = [ {file = "pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523"}, {file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"}, @@ -3685,118 +4436,129 @@ ptyprocess = ">=0.5" [[package]] name = "pillow" -version = "11.2.1" -description = "Python Imaging Library (Fork)" +version = "12.0.0" +description = "Python Imaging Library (fork)" optional = false -python-versions = ">=3.9" -files = [ - {file = "pillow-11.2.1-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:d57a75d53922fc20c165016a20d9c44f73305e67c351bbc60d1adaf662e74047"}, - {file = "pillow-11.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:127bf6ac4a5b58b3d32fc8289656f77f80567d65660bc46f72c0d77e6600cc95"}, - {file = "pillow-11.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b4ba4be812c7a40280629e55ae0b14a0aafa150dd6451297562e1764808bbe61"}, - {file = "pillow-11.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8bd62331e5032bc396a93609982a9ab6b411c05078a52f5fe3cc59234a3abd1"}, - {file = "pillow-11.2.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:562d11134c97a62fe3af29581f083033179f7ff435f78392565a1ad2d1c2c45c"}, - {file = "pillow-11.2.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:c97209e85b5be259994eb5b69ff50c5d20cca0f458ef9abd835e262d9d88b39d"}, - {file = "pillow-11.2.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:0c3e6d0f59171dfa2e25d7116217543310908dfa2770aa64b8f87605f8cacc97"}, - {file = "pillow-11.2.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cc1c3bc53befb6096b84165956e886b1729634a799e9d6329a0c512ab651e579"}, - {file = "pillow-11.2.1-cp310-cp310-win32.whl", hash = "sha256:312c77b7f07ab2139924d2639860e084ec2a13e72af54d4f08ac843a5fc9c79d"}, - {file = "pillow-11.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:9bc7ae48b8057a611e5fe9f853baa88093b9a76303937449397899385da06fad"}, - {file = "pillow-11.2.1-cp310-cp310-win_arm64.whl", hash = "sha256:2728567e249cdd939f6cc3d1f049595c66e4187f3c34078cbc0a7d21c47482d2"}, - {file = 
"pillow-11.2.1-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:35ca289f712ccfc699508c4658a1d14652e8033e9b69839edf83cbdd0ba39e70"}, - {file = "pillow-11.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e0409af9f829f87a2dfb7e259f78f317a5351f2045158be321fd135973fff7bf"}, - {file = "pillow-11.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d4e5c5edee874dce4f653dbe59db7c73a600119fbea8d31f53423586ee2aafd7"}, - {file = "pillow-11.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b93a07e76d13bff9444f1a029e0af2964e654bfc2e2c2d46bfd080df5ad5f3d8"}, - {file = "pillow-11.2.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:e6def7eed9e7fa90fde255afaf08060dc4b343bbe524a8f69bdd2a2f0018f600"}, - {file = "pillow-11.2.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:8f4f3724c068be008c08257207210c138d5f3731af6c155a81c2b09a9eb3a788"}, - {file = "pillow-11.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a0a6709b47019dff32e678bc12c63008311b82b9327613f534e496dacaefb71e"}, - {file = "pillow-11.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f6b0c664ccb879109ee3ca702a9272d877f4fcd21e5eb63c26422fd6e415365e"}, - {file = "pillow-11.2.1-cp311-cp311-win32.whl", hash = "sha256:cc5d875d56e49f112b6def6813c4e3d3036d269c008bf8aef72cd08d20ca6df6"}, - {file = "pillow-11.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:0f5c7eda47bf8e3c8a283762cab94e496ba977a420868cb819159980b6709193"}, - {file = "pillow-11.2.1-cp311-cp311-win_arm64.whl", hash = "sha256:4d375eb838755f2528ac8cbc926c3e31cc49ca4ad0cf79cff48b20e30634a4a7"}, - {file = "pillow-11.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:78afba22027b4accef10dbd5eed84425930ba41b3ea0a86fa8d20baaf19d807f"}, - {file = "pillow-11.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:78092232a4ab376a35d68c4e6d5e00dfd73454bd12b230420025fbe178ee3b0b"}, - {file = "pillow-11.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25a5f306095c6780c52e6bbb6109624b95c5b18e40aab1c3041da3e9e0cd3e2d"}, - {file = "pillow-11.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c7b29dbd4281923a2bfe562acb734cee96bbb129e96e6972d315ed9f232bef4"}, - {file = "pillow-11.2.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:3e645b020f3209a0181a418bffe7b4a93171eef6c4ef6cc20980b30bebf17b7d"}, - {file = "pillow-11.2.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b2dbea1012ccb784a65349f57bbc93730b96e85b42e9bf7b01ef40443db720b4"}, - {file = "pillow-11.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:da3104c57bbd72948d75f6a9389e6727d2ab6333c3617f0a89d72d4940aa0443"}, - {file = "pillow-11.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:598174aef4589af795f66f9caab87ba4ff860ce08cd5bb447c6fc553ffee603c"}, - {file = "pillow-11.2.1-cp312-cp312-win32.whl", hash = "sha256:1d535df14716e7f8776b9e7fee118576d65572b4aad3ed639be9e4fa88a1cad3"}, - {file = "pillow-11.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:14e33b28bf17c7a38eede290f77db7c664e4eb01f7869e37fa98a5aa95978941"}, - {file = "pillow-11.2.1-cp312-cp312-win_arm64.whl", hash = "sha256:21e1470ac9e5739ff880c211fc3af01e3ae505859392bf65458c224d0bf283eb"}, - {file = "pillow-11.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:fdec757fea0b793056419bca3e9932eb2b0ceec90ef4813ea4c1e072c389eb28"}, - {file = "pillow-11.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:b0e130705d568e2f43a17bcbe74d90958e8a16263868a12c3e0d9c8162690830"}, - {file = 
"pillow-11.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bdb5e09068332578214cadd9c05e3d64d99e0e87591be22a324bdbc18925be0"}, - {file = "pillow-11.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d189ba1bebfbc0c0e529159631ec72bb9e9bc041f01ec6d3233d6d82eb823bc1"}, - {file = "pillow-11.2.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:191955c55d8a712fab8934a42bfefbf99dd0b5875078240943f913bb66d46d9f"}, - {file = "pillow-11.2.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:ad275964d52e2243430472fc5d2c2334b4fc3ff9c16cb0a19254e25efa03a155"}, - {file = "pillow-11.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:750f96efe0597382660d8b53e90dd1dd44568a8edb51cb7f9d5d918b80d4de14"}, - {file = "pillow-11.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fe15238d3798788d00716637b3d4e7bb6bde18b26e5d08335a96e88564a36b6b"}, - {file = "pillow-11.2.1-cp313-cp313-win32.whl", hash = "sha256:3fe735ced9a607fee4f481423a9c36701a39719252a9bb251679635f99d0f7d2"}, - {file = "pillow-11.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:74ee3d7ecb3f3c05459ba95eed5efa28d6092d751ce9bf20e3e253a4e497e691"}, - {file = "pillow-11.2.1-cp313-cp313-win_arm64.whl", hash = "sha256:5119225c622403afb4b44bad4c1ca6c1f98eed79db8d3bc6e4e160fc6339d66c"}, - {file = "pillow-11.2.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:8ce2e8411c7aaef53e6bb29fe98f28cd4fbd9a1d9be2eeea434331aac0536b22"}, - {file = "pillow-11.2.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:9ee66787e095127116d91dea2143db65c7bb1e232f617aa5957c0d9d2a3f23a7"}, - {file = "pillow-11.2.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9622e3b6c1d8b551b6e6f21873bdcc55762b4b2126633014cea1803368a9aa16"}, - {file = "pillow-11.2.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63b5dff3a68f371ea06025a1a6966c9a1e1ee452fc8020c2cd0ea41b83e9037b"}, - {file = "pillow-11.2.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:31df6e2d3d8fc99f993fd253e97fae451a8db2e7207acf97859732273e108406"}, - {file = "pillow-11.2.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:062b7a42d672c45a70fa1f8b43d1d38ff76b63421cbbe7f88146b39e8a558d91"}, - {file = "pillow-11.2.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4eb92eca2711ef8be42fd3f67533765d9fd043b8c80db204f16c8ea62ee1a751"}, - {file = "pillow-11.2.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f91ebf30830a48c825590aede79376cb40f110b387c17ee9bd59932c961044f9"}, - {file = "pillow-11.2.1-cp313-cp313t-win32.whl", hash = "sha256:e0b55f27f584ed623221cfe995c912c61606be8513bfa0e07d2c674b4516d9dd"}, - {file = "pillow-11.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:36d6b82164c39ce5482f649b437382c0fb2395eabc1e2b1702a6deb8ad647d6e"}, - {file = "pillow-11.2.1-cp313-cp313t-win_arm64.whl", hash = "sha256:225c832a13326e34f212d2072982bb1adb210e0cc0b153e688743018c94a2681"}, - {file = "pillow-11.2.1-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:7491cf8a79b8eb867d419648fff2f83cb0b3891c8b36da92cc7f1931d46108c8"}, - {file = "pillow-11.2.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8b02d8f9cb83c52578a0b4beadba92e37d83a4ef11570a8688bbf43f4ca50909"}, - {file = "pillow-11.2.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:014ca0050c85003620526b0ac1ac53f56fc93af128f7546623cc8e31875ab928"}, - {file = "pillow-11.2.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:3692b68c87096ac6308296d96354eddd25f98740c9d2ab54e1549d6c8aea9d79"}, - {file = "pillow-11.2.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:f781dcb0bc9929adc77bad571b8621ecb1e4cdef86e940fe2e5b5ee24fd33b35"}, - {file = "pillow-11.2.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:2b490402c96f907a166615e9a5afacf2519e28295f157ec3a2bb9bd57de638cb"}, - {file = "pillow-11.2.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:dd6b20b93b3ccc9c1b597999209e4bc5cf2853f9ee66e3fc9a400a78733ffc9a"}, - {file = "pillow-11.2.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:4b835d89c08a6c2ee7781b8dd0a30209a8012b5f09c0a665b65b0eb3560b6f36"}, - {file = "pillow-11.2.1-cp39-cp39-win32.whl", hash = "sha256:b10428b3416d4f9c61f94b494681280be7686bda15898a3a9e08eb66a6d92d67"}, - {file = "pillow-11.2.1-cp39-cp39-win_amd64.whl", hash = "sha256:6ebce70c3f486acf7591a3d73431fa504a4e18a9b97ff27f5f47b7368e4b9dd1"}, - {file = "pillow-11.2.1-cp39-cp39-win_arm64.whl", hash = "sha256:c27476257b2fdcd7872d54cfd119b3a9ce4610fb85c8e32b70b42e3680a29a1e"}, - {file = "pillow-11.2.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:9b7b0d4fd2635f54ad82785d56bc0d94f147096493a79985d0ab57aedd563156"}, - {file = "pillow-11.2.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:aa442755e31c64037aa7c1cb186e0b369f8416c567381852c63444dd666fb772"}, - {file = "pillow-11.2.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0d3348c95b766f54b76116d53d4cb171b52992a1027e7ca50c81b43b9d9e363"}, - {file = "pillow-11.2.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85d27ea4c889342f7e35f6d56e7e1cb345632ad592e8c51b693d7b7556043ce0"}, - {file = "pillow-11.2.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:bf2c33d6791c598142f00c9c4c7d47f6476731c31081331664eb26d6ab583e01"}, - {file = "pillow-11.2.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e616e7154c37669fc1dfc14584f11e284e05d1c650e1c0f972f281c4ccc53193"}, - {file = "pillow-11.2.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:39ad2e0f424394e3aebc40168845fee52df1394a4673a6ee512d840d14ab3013"}, - {file = "pillow-11.2.1-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:80f1df8dbe9572b4b7abdfa17eb5d78dd620b1d55d9e25f834efdbee872d3aed"}, - {file = "pillow-11.2.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:ea926cfbc3957090becbcbbb65ad177161a2ff2ad578b5a6ec9bb1e1cd78753c"}, - {file = "pillow-11.2.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:738db0e0941ca0376804d4de6a782c005245264edaa253ffce24e5a15cbdc7bd"}, - {file = "pillow-11.2.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9db98ab6565c69082ec9b0d4e40dd9f6181dab0dd236d26f7a50b8b9bfbd5076"}, - {file = "pillow-11.2.1-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:036e53f4170e270ddb8797d4c590e6dd14d28e15c7da375c18978045f7e6c37b"}, - {file = "pillow-11.2.1-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:14f73f7c291279bd65fda51ee87affd7c1e097709f7fdd0188957a16c264601f"}, - {file = "pillow-11.2.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:208653868d5c9ecc2b327f9b9ef34e0e42a4cdd172c2988fd81d62d2bc9bc044"}, - {file = "pillow-11.2.1.tar.gz", hash = "sha256:a64dd61998416367b7ef979b73d3a85853ba9bec4c2925f74e588879a58716b6"}, -] - -[package.extras] -docs = ["furo", "olefile", "sphinx (>=8.2)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"] +python-versions = 
">=3.10" +groups = ["main"] +files = [ + {file = "pillow-12.0.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:3adfb466bbc544b926d50fe8f4a4e6abd8c6bffd28a26177594e6e9b2b76572b"}, + {file = "pillow-12.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1ac11e8ea4f611c3c0147424eae514028b5e9077dd99ab91e1bd7bc33ff145e1"}, + {file = "pillow-12.0.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d49e2314c373f4c2b39446fb1a45ed333c850e09d0c59ac79b72eb3b95397363"}, + {file = "pillow-12.0.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c7b2a63fd6d5246349f3d3f37b14430d73ee7e8173154461785e43036ffa96ca"}, + {file = "pillow-12.0.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d64317d2587c70324b79861babb9c09f71fbb780bad212018874b2c013d8600e"}, + {file = "pillow-12.0.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d77153e14b709fd8b8af6f66a3afbb9ed6e9fc5ccf0b6b7e1ced7b036a228782"}, + {file = "pillow-12.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:32ed80ea8a90ee3e6fa08c21e2e091bba6eda8eccc83dbc34c95169507a91f10"}, + {file = "pillow-12.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c828a1ae702fc712978bda0320ba1b9893d99be0badf2647f693cc01cf0f04fa"}, + {file = "pillow-12.0.0-cp310-cp310-win32.whl", hash = "sha256:bd87e140e45399c818fac4247880b9ce719e4783d767e030a883a970be632275"}, + {file = "pillow-12.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:455247ac8a4cfb7b9bc45b7e432d10421aea9fc2e74d285ba4072688a74c2e9d"}, + {file = "pillow-12.0.0-cp310-cp310-win_arm64.whl", hash = "sha256:6ace95230bfb7cd79ef66caa064bbe2f2a1e63d93471c3a2e1f1348d9f22d6b7"}, + {file = "pillow-12.0.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:0fd00cac9c03256c8b2ff58f162ebcd2587ad3e1f2e397eab718c47e24d231cc"}, + {file = "pillow-12.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3475b96f5908b3b16c47533daaa87380c491357d197564e0ba34ae75c0f3257"}, + {file = "pillow-12.0.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:110486b79f2d112cf6add83b28b627e369219388f64ef2f960fef9ebaf54c642"}, + {file = "pillow-12.0.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5269cc1caeedb67e6f7269a42014f381f45e2e7cd42d834ede3c703a1d915fe3"}, + {file = "pillow-12.0.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aa5129de4e174daccbc59d0a3b6d20eaf24417d59851c07ebb37aeb02947987c"}, + {file = "pillow-12.0.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bee2a6db3a7242ea309aa7ee8e2780726fed67ff4e5b40169f2c940e7eb09227"}, + {file = "pillow-12.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:90387104ee8400a7b4598253b4c406f8958f59fcf983a6cea2b50d59f7d63d0b"}, + {file = "pillow-12.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bc91a56697869546d1b8f0a3ff35224557ae7f881050e99f615e0119bf934b4e"}, + {file = "pillow-12.0.0-cp311-cp311-win32.whl", hash = "sha256:27f95b12453d165099c84f8a8bfdfd46b9e4bda9e0e4b65f0635430027f55739"}, + {file = "pillow-12.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:b583dc9070312190192631373c6c8ed277254aa6e6084b74bdd0a6d3b221608e"}, + {file = "pillow-12.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:759de84a33be3b178a64c8ba28ad5c135900359e85fb662bc6e403ad4407791d"}, + {file = "pillow-12.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:53561a4ddc36facb432fae7a9d8afbfaf94795414f5cdc5fc52f28c1dca90371"}, + {file = 
"pillow-12.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:71db6b4c1653045dacc1585c1b0d184004f0d7e694c7b34ac165ca70c0838082"}, + {file = "pillow-12.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2fa5f0b6716fc88f11380b88b31fe591a06c6315e955c096c35715788b339e3f"}, + {file = "pillow-12.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:82240051c6ca513c616f7f9da06e871f61bfd7805f566275841af15015b8f98d"}, + {file = "pillow-12.0.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:55f818bd74fe2f11d4d7cbc65880a843c4075e0ac7226bc1a23261dbea531953"}, + {file = "pillow-12.0.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b87843e225e74576437fd5b6a4c2205d422754f84a06942cfaf1dc32243e45a8"}, + {file = "pillow-12.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c607c90ba67533e1b2355b821fef6764d1dd2cbe26b8c1005ae84f7aea25ff79"}, + {file = "pillow-12.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:21f241bdd5080a15bc86d3466a9f6074a9c2c2b314100dd896ac81ee6db2f1ba"}, + {file = "pillow-12.0.0-cp312-cp312-win32.whl", hash = "sha256:dd333073e0cacdc3089525c7df7d39b211bcdf31fc2824e49d01c6b6187b07d0"}, + {file = "pillow-12.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:9fe611163f6303d1619bbcb653540a4d60f9e55e622d60a3108be0d5b441017a"}, + {file = "pillow-12.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:7dfb439562f234f7d57b1ac6bc8fe7f838a4bd49c79230e0f6a1da93e82f1fad"}, + {file = "pillow-12.0.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:0869154a2d0546545cde61d1789a6524319fc1897d9ee31218eae7a60ccc5643"}, + {file = "pillow-12.0.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:a7921c5a6d31b3d756ec980f2f47c0cfdbce0fc48c22a39347a895f41f4a6ea4"}, + {file = "pillow-12.0.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:1ee80a59f6ce048ae13cda1abf7fbd2a34ab9ee7d401c46be3ca685d1999a399"}, + {file = "pillow-12.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c50f36a62a22d350c96e49ad02d0da41dbd17ddc2e29750dbdba4323f85eb4a5"}, + {file = "pillow-12.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5193fde9a5f23c331ea26d0cf171fbf67e3f247585f50c08b3e205c7aeb4589b"}, + {file = "pillow-12.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bde737cff1a975b70652b62d626f7785e0480918dece11e8fef3c0cf057351c3"}, + {file = "pillow-12.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a6597ff2b61d121172f5844b53f21467f7082f5fb385a9a29c01414463f93b07"}, + {file = "pillow-12.0.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b817e7035ea7f6b942c13aa03bb554fc44fea70838ea21f8eb31c638326584e"}, + {file = "pillow-12.0.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f4f1231b7dec408e8670264ce63e9c71409d9583dd21d32c163e25213ee2a344"}, + {file = "pillow-12.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e51b71417049ad6ab14c49608b4a24d8fb3fe605e5dfabfe523b58064dc3d27"}, + {file = "pillow-12.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d120c38a42c234dc9a8c5de7ceaaf899cf33561956acb4941653f8bdc657aa79"}, + {file = "pillow-12.0.0-cp313-cp313-win32.whl", hash = "sha256:4cc6b3b2efff105c6a1656cfe59da4fdde2cda9af1c5e0b58529b24525d0a098"}, + {file = "pillow-12.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:4cf7fed4b4580601c4345ceb5d4cbf5a980d030fd5ad07c4d2ec589f95f09905"}, + {file = 
"pillow-12.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:9f0b04c6b8584c2c193babcccc908b38ed29524b29dd464bc8801bf10d746a3a"}, + {file = "pillow-12.0.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:7fa22993bac7b77b78cae22bad1e2a987ddf0d9015c63358032f84a53f23cdc3"}, + {file = "pillow-12.0.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:f135c702ac42262573fe9714dfe99c944b4ba307af5eb507abef1667e2cbbced"}, + {file = "pillow-12.0.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c85de1136429c524e55cfa4e033b4a7940ac5c8ee4d9401cc2d1bf48154bbc7b"}, + {file = "pillow-12.0.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:38df9b4bfd3db902c9c2bd369bcacaf9d935b2fff73709429d95cc41554f7b3d"}, + {file = "pillow-12.0.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7d87ef5795da03d742bf49439f9ca4d027cde49c82c5371ba52464aee266699a"}, + {file = "pillow-12.0.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:aff9e4d82d082ff9513bdd6acd4f5bd359f5b2c870907d2b0a9c5e10d40c88fe"}, + {file = "pillow-12.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:8d8ca2b210ada074d57fcee40c30446c9562e542fc46aedc19baf758a93532ee"}, + {file = "pillow-12.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:99a7f72fb6249302aa62245680754862a44179b545ded638cf1fef59befb57ef"}, + {file = "pillow-12.0.0-cp313-cp313t-win32.whl", hash = "sha256:4078242472387600b2ce8d93ade8899c12bf33fa89e55ec89fe126e9d6d5d9e9"}, + {file = "pillow-12.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2c54c1a783d6d60595d3514f0efe9b37c8808746a66920315bfd34a938d7994b"}, + {file = "pillow-12.0.0-cp313-cp313t-win_arm64.whl", hash = "sha256:26d9f7d2b604cd23aba3e9faf795787456ac25634d82cd060556998e39c6fa47"}, + {file = "pillow-12.0.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:beeae3f27f62308f1ddbcfb0690bf44b10732f2ef43758f169d5e9303165d3f9"}, + {file = "pillow-12.0.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:d4827615da15cd59784ce39d3388275ec093ae3ee8d7f0c089b76fa87af756c2"}, + {file = "pillow-12.0.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:3e42edad50b6909089750e65c91aa09aaf1e0a71310d383f11321b27c224ed8a"}, + {file = "pillow-12.0.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:e5d8efac84c9afcb40914ab49ba063d94f5dbdf5066db4482c66a992f47a3a3b"}, + {file = "pillow-12.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:266cd5f2b63ff316d5a1bba46268e603c9caf5606d44f38c2873c380950576ad"}, + {file = "pillow-12.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:58eea5ebe51504057dd95c5b77d21700b77615ab0243d8152793dc00eb4faf01"}, + {file = "pillow-12.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f13711b1a5ba512d647a0e4ba79280d3a9a045aaf7e0cc6fbe96b91d4cdf6b0c"}, + {file = "pillow-12.0.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6846bd2d116ff42cba6b646edf5bf61d37e5cbd256425fa089fee4ff5c07a99e"}, + {file = "pillow-12.0.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c98fa880d695de164b4135a52fd2e9cd7b7c90a9d8ac5e9e443a24a95ef9248e"}, + {file = "pillow-12.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:fa3ed2a29a9e9d2d488b4da81dcb54720ac3104a20bf0bd273f1e4648aff5af9"}, + {file = "pillow-12.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d034140032870024e6b9892c692fe2968493790dd57208b2c37e3fb35f6df3ab"}, + {file = 
"pillow-12.0.0-cp314-cp314-win32.whl", hash = "sha256:1b1b133e6e16105f524a8dec491e0586d072948ce15c9b914e41cdadd209052b"}, + {file = "pillow-12.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:8dc232e39d409036af549c86f24aed8273a40ffa459981146829a324e0848b4b"}, + {file = "pillow-12.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:d52610d51e265a51518692045e372a4c363056130d922a7351429ac9f27e70b0"}, + {file = "pillow-12.0.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:1979f4566bb96c1e50a62d9831e2ea2d1211761e5662afc545fa766f996632f6"}, + {file = "pillow-12.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b2e4b27a6e15b04832fe9bf292b94b5ca156016bbc1ea9c2c20098a0320d6cf6"}, + {file = "pillow-12.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fb3096c30df99fd01c7bf8e544f392103d0795b9f98ba71a8054bcbf56b255f1"}, + {file = "pillow-12.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7438839e9e053ef79f7112c881cef684013855016f928b168b81ed5835f3e75e"}, + {file = "pillow-12.0.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d5c411a8eaa2299322b647cd932586b1427367fd3184ffbb8f7a219ea2041ca"}, + {file = "pillow-12.0.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d7e091d464ac59d2c7ad8e7e08105eaf9dafbc3883fd7265ffccc2baad6ac925"}, + {file = "pillow-12.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:792a2c0be4dcc18af9d4a2dfd8a11a17d5e25274a1062b0ec1c2d79c76f3e7f8"}, + {file = "pillow-12.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:afbefa430092f71a9593a99ab6a4e7538bc9eabbf7bf94f91510d3503943edc4"}, + {file = "pillow-12.0.0-cp314-cp314t-win32.whl", hash = "sha256:3830c769decf88f1289680a59d4f4c46c72573446352e2befec9a8512104fa52"}, + {file = "pillow-12.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:905b0365b210c73afb0ebe9101a32572152dfd1c144c7e28968a331b9217b94a"}, + {file = "pillow-12.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:99353a06902c2e43b43e8ff74ee65a7d90307d82370604746738a1e0661ccca7"}, + {file = "pillow-12.0.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:b22bd8c974942477156be55a768f7aa37c46904c175be4e158b6a86e3a6b7ca8"}, + {file = "pillow-12.0.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:805ebf596939e48dbb2e4922a1d3852cfc25c38160751ce02da93058b48d252a"}, + {file = "pillow-12.0.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cae81479f77420d217def5f54b5b9d279804d17e982e0f2fa19b1d1e14ab5197"}, + {file = "pillow-12.0.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:aeaefa96c768fc66818730b952a862235d68825c178f1b3ffd4efd7ad2edcb7c"}, + {file = "pillow-12.0.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:09f2d0abef9e4e2f349305a4f8cc784a8a6c2f58a8c4892eea13b10a943bd26e"}, + {file = "pillow-12.0.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bdee52571a343d721fb2eb3b090a82d959ff37fc631e3f70422e0c2e029f3e76"}, + {file = "pillow-12.0.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:b290fd8aa38422444d4b50d579de197557f182ef1068b75f5aa8558638b8d0a5"}, + {file = "pillow-12.0.0.tar.gz", hash = "sha256:87d4f8125c9988bfbed67af47dd7a953e2fc7b0cc1e7800ec6d2080d490bb353"}, +] + +[package.extras] +docs = ["furo", "olefile", "sphinx (>=8.2)", "sphinx-autobuild", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"] fpx = ["olefile"] mic = ["olefile"] -test-arrow = 
["pyarrow"] -tests = ["check-manifest", "coverage (>=7.4.2)", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout", "trove-classifiers (>=2024.10.12)"] -typing = ["typing-extensions"] +test-arrow = ["arro3-compute", "arro3-core", "nanoarrow", "pyarrow"] +tests = ["check-manifest", "coverage (>=7.4.2)", "defusedxml", "markdown2", "olefile", "packaging", "pyroma (>=5)", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "trove-classifiers (>=2024.10.12)"] xmp = ["defusedxml"] [[package]] name = "platformdirs" -version = "4.3.8" +version = "4.5.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" +groups = ["main", "dev"] files = [ - {file = "platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4"}, - {file = "platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc"}, + {file = "platformdirs-4.5.0-py3-none-any.whl", hash = "sha256:e578a81bb873cbb89a41fcc904c7ef523cc18284b7e3b3ccf06aca1403b7ebd3"}, + {file = "platformdirs-4.5.0.tar.gz", hash = "sha256:70ddccdd7c99fc5942e9fc25636a8b34d04c24b335100223152c2803e4063312"}, ] [package.extras] -docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"] -test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.4)", "pytest-cov (>=6)", "pytest-mock (>=3.14)"] -type = ["mypy (>=1.14.1)"] +docs = ["furo (>=2025.9.25)", "proselint (>=0.14)", "sphinx (>=8.2.3)", "sphinx-autodoc-typehints (>=3.2)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.4.2)", "pytest-cov (>=7)", "pytest-mock (>=3.15.1)"] +type = ["mypy (>=1.18.2)"] [[package]] name = "pluggy" @@ -3804,6 +4566,7 @@ version = "1.6.0" description = "plugin and hook calling mechanisms for python" optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ {file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"}, {file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"}, @@ -3813,15 +4576,38 @@ files = [ dev = ["pre-commit", "tox"] testing = ["coverage", "pytest", "pytest-benchmark"] +[[package]] +name = "pooch" +version = "1.8.2" +description = "A friend to fetch your data files" +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "pooch-1.8.2-py3-none-any.whl", hash = "sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47"}, + {file = "pooch-1.8.2.tar.gz", hash = "sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10"}, +] + +[package.dependencies] +packaging = ">=20.0" +platformdirs = ">=2.5.0" +requests = ">=2.19.0" + +[package.extras] +progress = ["tqdm (>=4.41.0,<5.0.0)"] +sftp = ["paramiko (>=2.7.0)"] +xxhash = ["xxhash (>=1.4.3)"] + [[package]] name = "pre-commit" -version = "4.2.0" +version = "4.4.0" description = "A framework for managing and maintaining multi-language pre-commit hooks." 
optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" +groups = ["dev"] files = [ - {file = "pre_commit-4.2.0-py2.py3-none-any.whl", hash = "sha256:a009ca7205f1eb497d10b845e52c838a98b6cdd2102a6c8e4540e94ee75c58bd"}, - {file = "pre_commit-4.2.0.tar.gz", hash = "sha256:601283b9757afd87d40c4c4a9b2b5de9637a8ea02eaff7adc2d0fb4e04841146"}, + {file = "pre_commit-4.4.0-py2.py3-none-any.whl", hash = "sha256:b35ea52957cbf83dcc5d8ee636cbead8624e3a15fbfa61a370e42158ac8a5813"}, + {file = "pre_commit-4.4.0.tar.gz", hash = "sha256:f0233ebab440e9f17cabbb558706eb173d19ace965c68cdce2c081042b4fab15"}, ] [package.dependencies] @@ -3833,13 +4619,14 @@ virtualenv = ">=20.10.0" [[package]] name = "prometheus-client" -version = "0.22.1" +version = "0.23.1" description = "Python client for the Prometheus monitoring system." optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ - {file = "prometheus_client-0.22.1-py3-none-any.whl", hash = "sha256:cca895342e308174341b2cbf99a56bef291fbc0ef7b9e5412a0f26d653ba7094"}, - {file = "prometheus_client-0.22.1.tar.gz", hash = "sha256:190f1331e783cf21eb60bca559354e0a4d4378facecf78f5428c39b675d20d28"}, + {file = "prometheus_client-0.23.1-py3-none-any.whl", hash = "sha256:dd1913e6e76b59cfe44e7a4b83e01afc9873c1bdfd2ed8739f1e76aeca115f99"}, + {file = "prometheus_client-0.23.1.tar.gz", hash = "sha256:6ae8f9081eaaaf153a2e959d2e6c4f4fb57b12ef76c8c7980202f1e57b48b2ce"}, ] [package.extras] @@ -3847,13 +4634,14 @@ twisted = ["twisted"] [[package]] name = "prompt-toolkit" -version = "3.0.51" +version = "3.0.52" description = "Library for building powerful interactive command lines in Python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ - {file = "prompt_toolkit-3.0.51-py3-none-any.whl", hash = "sha256:52742911fde84e2d423e2f9a4cf1de7d7ac4e51958f648d9540e0fb8db077b07"}, - {file = "prompt_toolkit-3.0.51.tar.gz", hash = "sha256:931a162e3b27fc90c86f1b48bb1fb2c528c2761475e57c9c06de13311c7b54ed"}, + {file = "prompt_toolkit-3.0.52-py3-none-any.whl", hash = "sha256:9aac639a3bbd33284347de5ad8d68ecc044b91a762dc39b7c21095fcd6a19955"}, + {file = "prompt_toolkit-3.0.52.tar.gz", hash = "sha256:28cde192929c8e7321de85de1ddbe736f1375148b02f2e17edd840042b1be855"}, ] [package.dependencies] @@ -3861,151 +4649,265 @@ wcwidth = "*" [[package]] name = "propcache" -version = "0.3.1" +version = "0.4.1" description = "Accelerated property cache" optional = false python-versions = ">=3.9" -files = [ - {file = "propcache-0.3.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f27785888d2fdd918bc36de8b8739f2d6c791399552333721b58193f68ea3e98"}, - {file = "propcache-0.3.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4e89cde74154c7b5957f87a355bb9c8ec929c167b59c83d90654ea36aeb6180"}, - {file = "propcache-0.3.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:730178f476ef03d3d4d255f0c9fa186cb1d13fd33ffe89d39f2cda4da90ceb71"}, - {file = "propcache-0.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:967a8eec513dbe08330f10137eacb427b2ca52118769e82ebcfcab0fba92a649"}, - {file = "propcache-0.3.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5b9145c35cc87313b5fd480144f8078716007656093d23059e8993d3a8fa730f"}, - {file = "propcache-0.3.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9e64e948ab41411958670f1093c0a57acfdc3bee5cf5b935671bbd5313bcf229"}, - {file = "propcache-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:319fa8765bfd6a265e5fa661547556da381e53274bc05094fc9ea50da51bfd46"}, - {file = "propcache-0.3.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c66d8ccbc902ad548312b96ed8d5d266d0d2c6d006fd0f66323e9d8f2dd49be7"}, - {file = "propcache-0.3.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:2d219b0dbabe75e15e581fc1ae796109b07c8ba7d25b9ae8d650da582bed01b0"}, - {file = "propcache-0.3.1-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:cd6a55f65241c551eb53f8cf4d2f4af33512c39da5d9777694e9d9c60872f519"}, - {file = "propcache-0.3.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:9979643ffc69b799d50d3a7b72b5164a2e97e117009d7af6dfdd2ab906cb72cd"}, - {file = "propcache-0.3.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:4cf9e93a81979f1424f1a3d155213dc928f1069d697e4353edb8a5eba67c6259"}, - {file = "propcache-0.3.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:2fce1df66915909ff6c824bbb5eb403d2d15f98f1518e583074671a30fe0c21e"}, - {file = "propcache-0.3.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:4d0dfdd9a2ebc77b869a0b04423591ea8823f791293b527dc1bb896c1d6f1136"}, - {file = "propcache-0.3.1-cp310-cp310-win32.whl", hash = "sha256:1f6cc0ad7b4560e5637eb2c994e97b4fa41ba8226069c9277eb5ea7101845b42"}, - {file = "propcache-0.3.1-cp310-cp310-win_amd64.whl", hash = "sha256:47ef24aa6511e388e9894ec16f0fbf3313a53ee68402bc428744a367ec55b833"}, - {file = "propcache-0.3.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7f30241577d2fef2602113b70ef7231bf4c69a97e04693bde08ddab913ba0ce5"}, - {file = "propcache-0.3.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:43593c6772aa12abc3af7784bff4a41ffa921608dd38b77cf1dfd7f5c4e71371"}, - {file = "propcache-0.3.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a75801768bbe65499495660b777e018cbe90c7980f07f8aa57d6be79ea6f71da"}, - {file = "propcache-0.3.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f6f1324db48f001c2ca26a25fa25af60711e09b9aaf4b28488602776f4f9a744"}, - {file = "propcache-0.3.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cdb0f3e1eb6dfc9965d19734d8f9c481b294b5274337a8cb5cb01b462dcb7e0"}, - {file = "propcache-0.3.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1eb34d90aac9bfbced9a58b266f8946cb5935869ff01b164573a7634d39fbcb5"}, - {file = "propcache-0.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f35c7070eeec2cdaac6fd3fe245226ed2a6292d3ee8c938e5bb645b434c5f256"}, - {file = "propcache-0.3.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b23c11c2c9e6d4e7300c92e022046ad09b91fd00e36e83c44483df4afa990073"}, - {file = "propcache-0.3.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3e19ea4ea0bf46179f8a3652ac1426e6dcbaf577ce4b4f65be581e237340420d"}, - {file = "propcache-0.3.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:bd39c92e4c8f6cbf5f08257d6360123af72af9f4da75a690bef50da77362d25f"}, - {file = "propcache-0.3.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:b0313e8b923b3814d1c4a524c93dfecea5f39fa95601f6a9b1ac96cd66f89ea0"}, - {file = "propcache-0.3.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e861ad82892408487be144906a368ddbe2dc6297074ade2d892341b35c59844a"}, - {file = "propcache-0.3.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:61014615c1274df8da5991a1e5da85a3ccb00c2d4701ac6f3383afd3ca47ab0a"}, - {file = 
"propcache-0.3.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:71ebe3fe42656a2328ab08933d420df5f3ab121772eef78f2dc63624157f0ed9"}, - {file = "propcache-0.3.1-cp311-cp311-win32.whl", hash = "sha256:58aa11f4ca8b60113d4b8e32d37e7e78bd8af4d1a5b5cb4979ed856a45e62005"}, - {file = "propcache-0.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:9532ea0b26a401264b1365146c440a6d78269ed41f83f23818d4b79497aeabe7"}, - {file = "propcache-0.3.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f78eb8422acc93d7b69964012ad7048764bb45a54ba7a39bb9e146c72ea29723"}, - {file = "propcache-0.3.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:89498dd49c2f9a026ee057965cdf8192e5ae070ce7d7a7bd4b66a8e257d0c976"}, - {file = "propcache-0.3.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:09400e98545c998d57d10035ff623266927cb784d13dd2b31fd33b8a5316b85b"}, - {file = "propcache-0.3.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa8efd8c5adc5a2c9d3b952815ff8f7710cefdcaf5f2c36d26aff51aeca2f12f"}, - {file = "propcache-0.3.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c2fe5c910f6007e716a06d269608d307b4f36e7babee5f36533722660e8c4a70"}, - {file = "propcache-0.3.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a0ab8cf8cdd2194f8ff979a43ab43049b1df0b37aa64ab7eca04ac14429baeb7"}, - {file = "propcache-0.3.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:563f9d8c03ad645597b8d010ef4e9eab359faeb11a0a2ac9f7b4bc8c28ebef25"}, - {file = "propcache-0.3.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fb6e0faf8cb6b4beea5d6ed7b5a578254c6d7df54c36ccd3d8b3eb00d6770277"}, - {file = "propcache-0.3.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1c5c7ab7f2bb3f573d1cb921993006ba2d39e8621019dffb1c5bc94cdbae81e8"}, - {file = "propcache-0.3.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:050b571b2e96ec942898f8eb46ea4bfbb19bd5502424747e83badc2d4a99a44e"}, - {file = "propcache-0.3.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:e1c4d24b804b3a87e9350f79e2371a705a188d292fd310e663483af6ee6718ee"}, - {file = "propcache-0.3.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:e4fe2a6d5ce975c117a6bb1e8ccda772d1e7029c1cca1acd209f91d30fa72815"}, - {file = "propcache-0.3.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:feccd282de1f6322f56f6845bf1207a537227812f0a9bf5571df52bb418d79d5"}, - {file = "propcache-0.3.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ec314cde7314d2dd0510c6787326bbffcbdc317ecee6b7401ce218b3099075a7"}, - {file = "propcache-0.3.1-cp312-cp312-win32.whl", hash = "sha256:7d2d5a0028d920738372630870e7d9644ce437142197f8c827194fca404bf03b"}, - {file = "propcache-0.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:88c423efef9d7a59dae0614eaed718449c09a5ac79a5f224a8b9664d603f04a3"}, - {file = "propcache-0.3.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:f1528ec4374617a7a753f90f20e2f551121bb558fcb35926f99e3c42367164b8"}, - {file = "propcache-0.3.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dc1915ec523b3b494933b5424980831b636fe483d7d543f7afb7b3bf00f0c10f"}, - {file = "propcache-0.3.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a110205022d077da24e60b3df8bcee73971be9575dec5573dd17ae5d81751111"}, - {file = "propcache-0.3.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d249609e547c04d190e820d0d4c8ca03ed4582bcf8e4e160a6969ddfb57b62e5"}, - {file = 
"propcache-0.3.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ced33d827625d0a589e831126ccb4f5c29dfdf6766cac441d23995a65825dcb"}, - {file = "propcache-0.3.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4114c4ada8f3181af20808bedb250da6bae56660e4b8dfd9cd95d4549c0962f7"}, - {file = "propcache-0.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:975af16f406ce48f1333ec5e912fe11064605d5c5b3f6746969077cc3adeb120"}, - {file = "propcache-0.3.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a34aa3a1abc50740be6ac0ab9d594e274f59960d3ad253cd318af76b996dd654"}, - {file = "propcache-0.3.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9cec3239c85ed15bfaded997773fdad9fb5662b0a7cbc854a43f291eb183179e"}, - {file = "propcache-0.3.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:05543250deac8e61084234d5fc54f8ebd254e8f2b39a16b1dce48904f45b744b"}, - {file = "propcache-0.3.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:5cb5918253912e088edbf023788de539219718d3b10aef334476b62d2b53de53"}, - {file = "propcache-0.3.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f3bbecd2f34d0e6d3c543fdb3b15d6b60dd69970c2b4c822379e5ec8f6f621d5"}, - {file = "propcache-0.3.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:aca63103895c7d960a5b9b044a83f544b233c95e0dcff114389d64d762017af7"}, - {file = "propcache-0.3.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5a0a9898fdb99bf11786265468571e628ba60af80dc3f6eb89a3545540c6b0ef"}, - {file = "propcache-0.3.1-cp313-cp313-win32.whl", hash = "sha256:3a02a28095b5e63128bcae98eb59025924f121f048a62393db682f049bf4ac24"}, - {file = "propcache-0.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:813fbb8b6aea2fc9659815e585e548fe706d6f663fa73dff59a1677d4595a037"}, - {file = "propcache-0.3.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:a444192f20f5ce8a5e52761a031b90f5ea6288b1eef42ad4c7e64fef33540b8f"}, - {file = "propcache-0.3.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0fbe94666e62ebe36cd652f5fc012abfbc2342de99b523f8267a678e4dfdee3c"}, - {file = "propcache-0.3.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:f011f104db880f4e2166bcdcf7f58250f7a465bc6b068dc84c824a3d4a5c94dc"}, - {file = "propcache-0.3.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e584b6d388aeb0001d6d5c2bd86b26304adde6d9bb9bfa9c4889805021b96de"}, - {file = "propcache-0.3.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8a17583515a04358b034e241f952f1715243482fc2c2945fd99a1b03a0bd77d6"}, - {file = "propcache-0.3.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5aed8d8308215089c0734a2af4f2e95eeb360660184ad3912686c181e500b2e7"}, - {file = "propcache-0.3.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d8e309ff9a0503ef70dc9a0ebd3e69cf7b3894c9ae2ae81fc10943c37762458"}, - {file = "propcache-0.3.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b655032b202028a582d27aeedc2e813299f82cb232f969f87a4fde491a233f11"}, - {file = "propcache-0.3.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9f64d91b751df77931336b5ff7bafbe8845c5770b06630e27acd5dbb71e1931c"}, - {file = "propcache-0.3.1-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:19a06db789a4bd896ee91ebc50d059e23b3639c25d58eb35be3ca1cbe967c3bf"}, - {file = 
"propcache-0.3.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:bef100c88d8692864651b5f98e871fb090bd65c8a41a1cb0ff2322db39c96c27"}, - {file = "propcache-0.3.1-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:87380fb1f3089d2a0b8b00f006ed12bd41bd858fabfa7330c954c70f50ed8757"}, - {file = "propcache-0.3.1-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:e474fc718e73ba5ec5180358aa07f6aded0ff5f2abe700e3115c37d75c947e18"}, - {file = "propcache-0.3.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:17d1c688a443355234f3c031349da69444be052613483f3e4158eef751abcd8a"}, - {file = "propcache-0.3.1-cp313-cp313t-win32.whl", hash = "sha256:359e81a949a7619802eb601d66d37072b79b79c2505e6d3fd8b945538411400d"}, - {file = "propcache-0.3.1-cp313-cp313t-win_amd64.whl", hash = "sha256:e7fb9a84c9abbf2b2683fa3e7b0d7da4d8ecf139a1c635732a8bda29c5214b0e"}, - {file = "propcache-0.3.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ed5f6d2edbf349bd8d630e81f474d33d6ae5d07760c44d33cd808e2f5c8f4ae6"}, - {file = "propcache-0.3.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:668ddddc9f3075af019f784456267eb504cb77c2c4bd46cc8402d723b4d200bf"}, - {file = "propcache-0.3.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0c86e7ceea56376216eba345aa1fc6a8a6b27ac236181f840d1d7e6a1ea9ba5c"}, - {file = "propcache-0.3.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83be47aa4e35b87c106fc0c84c0fc069d3f9b9b06d3c494cd404ec6747544894"}, - {file = "propcache-0.3.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:27c6ac6aa9fc7bc662f594ef380707494cb42c22786a558d95fcdedb9aa5d035"}, - {file = "propcache-0.3.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:64a956dff37080b352c1c40b2966b09defb014347043e740d420ca1eb7c9b908"}, - {file = "propcache-0.3.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82de5da8c8893056603ac2d6a89eb8b4df49abf1a7c19d536984c8dd63f481d5"}, - {file = "propcache-0.3.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c3c3a203c375b08fd06a20da3cf7aac293b834b6f4f4db71190e8422750cca5"}, - {file = "propcache-0.3.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:b303b194c2e6f171cfddf8b8ba30baefccf03d36a4d9cab7fd0bb68ba476a3d7"}, - {file = "propcache-0.3.1-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:916cd229b0150129d645ec51614d38129ee74c03293a9f3f17537be0029a9641"}, - {file = "propcache-0.3.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:a461959ead5b38e2581998700b26346b78cd98540b5524796c175722f18b0294"}, - {file = "propcache-0.3.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:069e7212890b0bcf9b2be0a03afb0c2d5161d91e1bf51569a64f629acc7defbf"}, - {file = "propcache-0.3.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:ef2e4e91fb3945769e14ce82ed53007195e616a63aa43b40fb7ebaaf907c8d4c"}, - {file = "propcache-0.3.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:8638f99dca15b9dff328fb6273e09f03d1c50d9b6512f3b65a4154588a7595fe"}, - {file = "propcache-0.3.1-cp39-cp39-win32.whl", hash = "sha256:6f173bbfe976105aaa890b712d1759de339d8a7cef2fc0a1714cc1a1e1c47f64"}, - {file = "propcache-0.3.1-cp39-cp39-win_amd64.whl", hash = "sha256:603f1fe4144420374f1a69b907494c3acbc867a581c2d49d4175b0de7cc64566"}, - {file = "propcache-0.3.1-py3-none-any.whl", hash = "sha256:9a8ecf38de50a7f518c21568c80f985e776397b902f1ce0b01f799aba1608b40"}, - {file = "propcache-0.3.1.tar.gz", hash = 
"sha256:40d980c33765359098837527e18eddefc9a24cea5b45e078a7f3bb5b032c6ecf"}, +groups = ["main"] +files = [ + {file = "propcache-0.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c2d1fa3201efaf55d730400d945b5b3ab6e672e100ba0f9a409d950ab25d7db"}, + {file = "propcache-0.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1eb2994229cc8ce7fe9b3db88f5465f5fd8651672840b2e426b88cdb1a30aac8"}, + {file = "propcache-0.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:66c1f011f45a3b33d7bcb22daed4b29c0c9e2224758b6be00686731e1b46f925"}, + {file = "propcache-0.4.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9a52009f2adffe195d0b605c25ec929d26b36ef986ba85244891dee3b294df21"}, + {file = "propcache-0.4.1-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:5d4e2366a9c7b837555cf02fb9be2e3167d333aff716332ef1b7c3a142ec40c5"}, + {file = "propcache-0.4.1-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:9d2b6caef873b4f09e26ea7e33d65f42b944837563a47a94719cc3544319a0db"}, + {file = "propcache-0.4.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2b16ec437a8c8a965ecf95739448dd938b5c7f56e67ea009f4300d8df05f32b7"}, + {file = "propcache-0.4.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:296f4c8ed03ca7476813fe666c9ea97869a8d7aec972618671b33a38a5182ef4"}, + {file = "propcache-0.4.1-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:1f0978529a418ebd1f49dad413a2b68af33f85d5c5ca5c6ca2a3bed375a7ac60"}, + {file = "propcache-0.4.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fd138803047fb4c062b1c1dd95462f5209456bfab55c734458f15d11da288f8f"}, + {file = "propcache-0.4.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:8c9b3cbe4584636d72ff556d9036e0c9317fa27b3ac1f0f558e7e84d1c9c5900"}, + {file = "propcache-0.4.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f93243fdc5657247533273ac4f86ae106cc6445a0efacb9a1bfe982fcfefd90c"}, + {file = "propcache-0.4.1-cp310-cp310-win32.whl", hash = "sha256:a0ee98db9c5f80785b266eb805016e36058ac72c51a064040f2bc43b61101cdb"}, + {file = "propcache-0.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:1cdb7988c4e5ac7f6d175a28a9aa0c94cb6f2ebe52756a3c0cda98d2809a9e37"}, + {file = "propcache-0.4.1-cp310-cp310-win_arm64.whl", hash = "sha256:d82ad62b19645419fe79dd63b3f9253e15b30e955c0170e5cebc350c1844e581"}, + {file = "propcache-0.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:60a8fda9644b7dfd5dece8c61d8a85e271cb958075bfc4e01083c148b61a7caf"}, + {file = "propcache-0.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c30b53e7e6bda1d547cabb47c825f3843a0a1a42b0496087bb58d8fedf9f41b5"}, + {file = "propcache-0.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6918ecbd897443087a3b7cd978d56546a812517dcaaca51b49526720571fa93e"}, + {file = "propcache-0.4.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3d902a36df4e5989763425a8ab9e98cd8ad5c52c823b34ee7ef307fd50582566"}, + {file = "propcache-0.4.1-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a9695397f85973bb40427dedddf70d8dc4a44b22f1650dd4af9eedf443d45165"}, + {file = "propcache-0.4.1-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2bb07ffd7eaad486576430c89f9b215f9e4be68c4866a96e97db9e97fead85dc"}, + {file = 
"propcache-0.4.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fd6f30fdcf9ae2a70abd34da54f18da086160e4d7d9251f81f3da0ff84fc5a48"}, + {file = "propcache-0.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fc38cba02d1acba4e2869eef1a57a43dfbd3d49a59bf90dda7444ec2be6a5570"}, + {file = "propcache-0.4.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:67fad6162281e80e882fb3ec355398cf72864a54069d060321f6cd0ade95fe85"}, + {file = "propcache-0.4.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f10207adf04d08bec185bae14d9606a1444715bc99180f9331c9c02093e1959e"}, + {file = "propcache-0.4.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:e9b0d8d0845bbc4cfcdcbcdbf5086886bc8157aa963c31c777ceff7846c77757"}, + {file = "propcache-0.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:981333cb2f4c1896a12f4ab92a9cc8f09ea664e9b7dbdc4eff74627af3a11c0f"}, + {file = "propcache-0.4.1-cp311-cp311-win32.whl", hash = "sha256:f1d2f90aeec838a52f1c1a32fe9a619fefd5e411721a9117fbf82aea638fe8a1"}, + {file = "propcache-0.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:364426a62660f3f699949ac8c621aad6977be7126c5807ce48c0aeb8e7333ea6"}, + {file = "propcache-0.4.1-cp311-cp311-win_arm64.whl", hash = "sha256:e53f3a38d3510c11953f3e6a33f205c6d1b001129f972805ca9b42fc308bc239"}, + {file = "propcache-0.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e153e9cd40cc8945138822807139367f256f89c6810c2634a4f6902b52d3b4e2"}, + {file = "propcache-0.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:cd547953428f7abb73c5ad82cbb32109566204260d98e41e5dfdc682eb7f8403"}, + {file = "propcache-0.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f048da1b4f243fc44f205dfd320933a951b8d89e0afd4c7cacc762a8b9165207"}, + {file = "propcache-0.4.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ec17c65562a827bba85e3872ead335f95405ea1674860d96483a02f5c698fa72"}, + {file = "propcache-0.4.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:405aac25c6394ef275dee4c709be43745d36674b223ba4eb7144bf4d691b7367"}, + {file = "propcache-0.4.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0013cb6f8dde4b2a2f66903b8ba740bdfe378c943c4377a200551ceb27f379e4"}, + {file = "propcache-0.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15932ab57837c3368b024473a525e25d316d8353016e7cc0e5ba9eb343fbb1cf"}, + {file = "propcache-0.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:031dce78b9dc099f4c29785d9cf5577a3faf9ebf74ecbd3c856a7b92768c3df3"}, + {file = "propcache-0.4.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:ab08df6c9a035bee56e31af99be621526bd237bea9f32def431c656b29e41778"}, + {file = "propcache-0.4.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4d7af63f9f93fe593afbf104c21b3b15868efb2c21d07d8732c0c4287e66b6a6"}, + {file = "propcache-0.4.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:cfc27c945f422e8b5071b6e93169679e4eb5bf73bbcbf1ba3ae3a83d2f78ebd9"}, + {file = "propcache-0.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:35c3277624a080cc6ec6f847cbbbb5b49affa3598c4535a0a4682a697aaa5c75"}, + {file = "propcache-0.4.1-cp312-cp312-win32.whl", hash = "sha256:671538c2262dadb5ba6395e26c1731e1d52534bfe9ae56d0b5573ce539266aa8"}, + {file = "propcache-0.4.1-cp312-cp312-win_amd64.whl", hash = 
"sha256:cb2d222e72399fcf5890d1d5cc1060857b9b236adff2792ff48ca2dfd46c81db"}, + {file = "propcache-0.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:204483131fb222bdaaeeea9f9e6c6ed0cac32731f75dfc1d4a567fc1926477c1"}, + {file = "propcache-0.4.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:43eedf29202c08550aac1d14e0ee619b0430aaef78f85864c1a892294fbc28cf"}, + {file = "propcache-0.4.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d62cdfcfd89ccb8de04e0eda998535c406bf5e060ffd56be6c586cbcc05b3311"}, + {file = "propcache-0.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cae65ad55793da34db5f54e4029b89d3b9b9490d8abe1b4c7ab5d4b8ec7ebf74"}, + {file = "propcache-0.4.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:333ddb9031d2704a301ee3e506dc46b1fe5f294ec198ed6435ad5b6a085facfe"}, + {file = "propcache-0.4.1-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:fd0858c20f078a32cf55f7e81473d96dcf3b93fd2ccdb3d40fdf54b8573df3af"}, + {file = "propcache-0.4.1-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:678ae89ebc632c5c204c794f8dab2837c5f159aeb59e6ed0539500400577298c"}, + {file = "propcache-0.4.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d472aeb4fbf9865e0c6d622d7f4d54a4e101a89715d8904282bb5f9a2f476c3f"}, + {file = "propcache-0.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4d3df5fa7e36b3225954fba85589da77a0fe6a53e3976de39caf04a0db4c36f1"}, + {file = "propcache-0.4.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:ee17f18d2498f2673e432faaa71698032b0127ebf23ae5974eeaf806c279df24"}, + {file = "propcache-0.4.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:580e97762b950f993ae618e167e7be9256b8353c2dcd8b99ec100eb50f5286aa"}, + {file = "propcache-0.4.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:501d20b891688eb8e7aa903021f0b72d5a55db40ffaab27edefd1027caaafa61"}, + {file = "propcache-0.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9a0bd56e5b100aef69bd8562b74b46254e7c8812918d3baa700c8a8009b0af66"}, + {file = "propcache-0.4.1-cp313-cp313-win32.whl", hash = "sha256:bcc9aaa5d80322bc2fb24bb7accb4a30f81e90ab8d6ba187aec0744bc302ad81"}, + {file = "propcache-0.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:381914df18634f5494334d201e98245c0596067504b9372d8cf93f4bb23e025e"}, + {file = "propcache-0.4.1-cp313-cp313-win_arm64.whl", hash = "sha256:8873eb4460fd55333ea49b7d189749ecf6e55bf85080f11b1c4530ed3034cba1"}, + {file = "propcache-0.4.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:92d1935ee1f8d7442da9c0c4fa7ac20d07e94064184811b685f5c4fada64553b"}, + {file = "propcache-0.4.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:473c61b39e1460d386479b9b2f337da492042447c9b685f28be4f74d3529e566"}, + {file = "propcache-0.4.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:c0ef0aaafc66fbd87842a3fe3902fd889825646bc21149eafe47be6072725835"}, + {file = "propcache-0.4.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f95393b4d66bfae908c3ca8d169d5f79cd65636ae15b5e7a4f6e67af675adb0e"}, + {file = "propcache-0.4.1-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c07fda85708bc48578467e85099645167a955ba093be0a2dcba962195676e859"}, + {file = "propcache-0.4.1-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:af223b406d6d000830c6f65f1e6431783fc3f713ba3e6cc8c024d5ee96170a4b"}, + {file = "propcache-0.4.1-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a78372c932c90ee474559c5ddfffd718238e8673c340dc21fe45c5b8b54559a0"}, + {file = "propcache-0.4.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:564d9f0d4d9509e1a870c920a89b2fec951b44bf5ba7d537a9e7c1ccec2c18af"}, + {file = "propcache-0.4.1-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:17612831fda0138059cc5546f4d12a2aacfb9e47068c06af35c400ba58ba7393"}, + {file = "propcache-0.4.1-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:41a89040cb10bd345b3c1a873b2bf36413d48da1def52f268a055f7398514874"}, + {file = "propcache-0.4.1-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:e35b88984e7fa64aacecea39236cee32dd9bd8c55f57ba8a75cf2399553f9bd7"}, + {file = "propcache-0.4.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6f8b465489f927b0df505cbe26ffbeed4d6d8a2bbc61ce90eb074ff129ef0ab1"}, + {file = "propcache-0.4.1-cp313-cp313t-win32.whl", hash = "sha256:2ad890caa1d928c7c2965b48f3a3815c853180831d0e5503d35cf00c472f4717"}, + {file = "propcache-0.4.1-cp313-cp313t-win_amd64.whl", hash = "sha256:f7ee0e597f495cf415bcbd3da3caa3bd7e816b74d0d52b8145954c5e6fd3ff37"}, + {file = "propcache-0.4.1-cp313-cp313t-win_arm64.whl", hash = "sha256:929d7cbe1f01bb7baffb33dc14eb5691c95831450a26354cd210a8155170c93a"}, + {file = "propcache-0.4.1-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:3f7124c9d820ba5548d431afb4632301acf965db49e666aa21c305cbe8c6de12"}, + {file = "propcache-0.4.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:c0d4b719b7da33599dfe3b22d3db1ef789210a0597bc650b7cee9c77c2be8c5c"}, + {file = "propcache-0.4.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:9f302f4783709a78240ebc311b793f123328716a60911d667e0c036bc5dcbded"}, + {file = "propcache-0.4.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c80ee5802e3fb9ea37938e7eecc307fb984837091d5fd262bb37238b1ae97641"}, + {file = "propcache-0.4.1-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ed5a841e8bb29a55fb8159ed526b26adc5bdd7e8bd7bf793ce647cb08656cdf4"}, + {file = "propcache-0.4.1-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:55c72fd6ea2da4c318e74ffdf93c4fe4e926051133657459131a95c846d16d44"}, + {file = "propcache-0.4.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8326e144341460402713f91df60ade3c999d601e7eb5ff8f6f7862d54de0610d"}, + {file = "propcache-0.4.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:060b16ae65bc098da7f6d25bf359f1f31f688384858204fe5d652979e0015e5b"}, + {file = "propcache-0.4.1-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:89eb3fa9524f7bec9de6e83cf3faed9d79bffa560672c118a96a171a6f55831e"}, + {file = "propcache-0.4.1-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:dee69d7015dc235f526fe80a9c90d65eb0039103fe565776250881731f06349f"}, + {file = "propcache-0.4.1-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:5558992a00dfd54ccbc64a32726a3357ec93825a418a401f5cc67df0ac5d9e49"}, + {file = "propcache-0.4.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:c9b822a577f560fbd9554812526831712c1436d2c046cedee4c3796d3543b144"}, + {file = "propcache-0.4.1-cp314-cp314-win32.whl", hash = "sha256:ab4c29b49d560fe48b696cdcb127dd36e0bc2472548f3bf56cc5cb3da2b2984f"}, + {file = 
"propcache-0.4.1-cp314-cp314-win_amd64.whl", hash = "sha256:5a103c3eb905fcea0ab98be99c3a9a5ab2de60228aa5aceedc614c0281cf6153"}, + {file = "propcache-0.4.1-cp314-cp314-win_arm64.whl", hash = "sha256:74c1fb26515153e482e00177a1ad654721bf9207da8a494a0c05e797ad27b992"}, + {file = "propcache-0.4.1-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:824e908bce90fb2743bd6b59db36eb4f45cd350a39637c9f73b1c1ea66f5b75f"}, + {file = "propcache-0.4.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c2b5e7db5328427c57c8e8831abda175421b709672f6cfc3d630c3b7e2146393"}, + {file = "propcache-0.4.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:6f6ff873ed40292cd4969ef5310179afd5db59fdf055897e282485043fc80ad0"}, + {file = "propcache-0.4.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:49a2dc67c154db2c1463013594c458881a069fcf98940e61a0569016a583020a"}, + {file = "propcache-0.4.1-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:005f08e6a0529984491e37d8dbc3dd86f84bd78a8ceb5fa9a021f4c48d4984be"}, + {file = "propcache-0.4.1-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5c3310452e0d31390da9035c348633b43d7e7feb2e37be252be6da45abd1abcc"}, + {file = "propcache-0.4.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4c3c70630930447f9ef1caac7728c8ad1c56bc5015338b20fed0d08ea2480b3a"}, + {file = "propcache-0.4.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8e57061305815dfc910a3634dcf584f08168a8836e6999983569f51a8544cd89"}, + {file = "propcache-0.4.1-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:521a463429ef54143092c11a77e04056dd00636f72e8c45b70aaa3140d639726"}, + {file = "propcache-0.4.1-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:120c964da3fdc75e3731aa392527136d4ad35868cc556fd09bb6d09172d9a367"}, + {file = "propcache-0.4.1-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:d8f353eb14ee3441ee844ade4277d560cdd68288838673273b978e3d6d2c8f36"}, + {file = "propcache-0.4.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:ab2943be7c652f09638800905ee1bab2c544e537edb57d527997a24c13dc1455"}, + {file = "propcache-0.4.1-cp314-cp314t-win32.whl", hash = "sha256:05674a162469f31358c30bcaa8883cb7829fa3110bf9c0991fe27d7896c42d85"}, + {file = "propcache-0.4.1-cp314-cp314t-win_amd64.whl", hash = "sha256:990f6b3e2a27d683cb7602ed6c86f15ee6b43b1194736f9baaeb93d0016633b1"}, + {file = "propcache-0.4.1-cp314-cp314t-win_arm64.whl", hash = "sha256:ecef2343af4cc68e05131e45024ba34f6095821988a9d0a02aa7c73fcc448aa9"}, + {file = "propcache-0.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:3d233076ccf9e450c8b3bc6720af226b898ef5d051a2d145f7d765e6e9f9bcff"}, + {file = "propcache-0.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:357f5bb5c377a82e105e44bd3d52ba22b616f7b9773714bff93573988ef0a5fb"}, + {file = "propcache-0.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:cbc3b6dfc728105b2a57c06791eb07a94229202ea75c59db644d7d496b698cac"}, + {file = "propcache-0.4.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:182b51b421f0501952d938dc0b0eb45246a5b5153c50d42b495ad5fb7517c888"}, + {file = "propcache-0.4.1-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4b536b39c5199b96fc6245eb5fb796c497381d3942f169e44e8e392b29c9ebcc"}, + {file = 
"propcache-0.4.1-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:db65d2af507bbfbdcedb254a11149f894169d90488dd3e7190f7cdcb2d6cd57a"}, + {file = "propcache-0.4.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fd2dbc472da1f772a4dae4fa24be938a6c544671a912e30529984dd80400cd88"}, + {file = "propcache-0.4.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:daede9cd44e0f8bdd9e6cc9a607fc81feb80fae7a5fc6cecaff0e0bb32e42d00"}, + {file = "propcache-0.4.1-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:71b749281b816793678ae7f3d0d84bd36e694953822eaad408d682efc5ca18e0"}, + {file = "propcache-0.4.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:0002004213ee1f36cfb3f9a42b5066100c44276b9b72b4e1504cddd3d692e86e"}, + {file = "propcache-0.4.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:fe49d0a85038f36ba9e3ffafa1103e61170b28e95b16622e11be0a0ea07c6781"}, + {file = "propcache-0.4.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:99d43339c83aaf4d32bda60928231848eee470c6bda8d02599cc4cebe872d183"}, + {file = "propcache-0.4.1-cp39-cp39-win32.whl", hash = "sha256:a129e76735bc792794d5177069691c3217898b9f5cee2b2661471e52ffe13f19"}, + {file = "propcache-0.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:948dab269721ae9a87fd16c514a0a2c2a1bdb23a9a61b969b0f9d9ee2968546f"}, + {file = "propcache-0.4.1-cp39-cp39-win_arm64.whl", hash = "sha256:5fd37c406dd6dc85aa743e214cef35dc54bbdd1419baac4f6ae5e5b1a2976938"}, + {file = "propcache-0.4.1-py3-none-any.whl", hash = "sha256:af2a6052aeb6cf17d3e46ee169099044fd8224cbaf75c76a2ef596e8163e2237"}, + {file = "propcache-0.4.1.tar.gz", hash = "sha256:f48107a8c637e80362555f37ecf49abe20370e557cc4ab374f04ec4423c97c3d"}, ] [[package]] name = "protobuf" -version = "6.31.1" +version = "6.33.1" description = "" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "protobuf-6.31.1-cp310-abi3-win32.whl", hash = "sha256:7fa17d5a29c2e04b7d90e5e32388b8bfd0e7107cd8e616feef7ed3fa6bdab5c9"}, - {file = "protobuf-6.31.1-cp310-abi3-win_amd64.whl", hash = "sha256:426f59d2964864a1a366254fa703b8632dcec0790d8862d30034d8245e1cd447"}, - {file = "protobuf-6.31.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:6f1227473dc43d44ed644425268eb7c2e488ae245d51c6866d19fe158e207402"}, - {file = "protobuf-6.31.1-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:a40fc12b84c154884d7d4c4ebd675d5b3b5283e155f324049ae396b95ddebc39"}, - {file = "protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:4ee898bf66f7a8b0bd21bce523814e6fbd8c6add948045ce958b73af7e8878c6"}, - {file = "protobuf-6.31.1-cp39-cp39-win32.whl", hash = "sha256:0414e3aa5a5f3ff423828e1e6a6e907d6c65c1d5b7e6e975793d5590bdeecc16"}, - {file = "protobuf-6.31.1-cp39-cp39-win_amd64.whl", hash = "sha256:8764cf4587791e7564051b35524b72844f845ad0bb011704c3736cce762d8fe9"}, - {file = "protobuf-6.31.1-py3-none-any.whl", hash = "sha256:720a6c7e6b77288b85063569baae8536671b39f15cc22037ec7045658d80489e"}, - {file = "protobuf-6.31.1.tar.gz", hash = "sha256:d8cac4c982f0b957a4dc73a80e2ea24fab08e679c0de9deb835f4a12d69aca9a"}, + {file = "protobuf-6.33.1-cp310-abi3-win32.whl", hash = "sha256:f8d3fdbc966aaab1d05046d0240dd94d40f2a8c62856d41eaa141ff64a79de6b"}, + {file = "protobuf-6.33.1-cp310-abi3-win_amd64.whl", hash = "sha256:923aa6d27a92bf44394f6abf7ea0500f38769d4b07f4be41cb52bd8b1123b9ed"}, + {file = "protobuf-6.33.1-cp39-abi3-macosx_10_9_universal2.whl", hash = 
"sha256:fe34575f2bdde76ac429ec7b570235bf0c788883e70aee90068e9981806f2490"}, + {file = "protobuf-6.33.1-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:f8adba2e44cde2d7618996b3fc02341f03f5bc3f2748be72dc7b063319276178"}, + {file = "protobuf-6.33.1-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:0f4cf01222c0d959c2b399142deb526de420be8236f22c71356e2a544e153c53"}, + {file = "protobuf-6.33.1-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:8fd7d5e0eb08cd5b87fd3df49bc193f5cfd778701f47e11d127d0afc6c39f1d1"}, + {file = "protobuf-6.33.1-cp39-cp39-win32.whl", hash = "sha256:023af8449482fa884d88b4563d85e83accab54138ae098924a985bcbb734a213"}, + {file = "protobuf-6.33.1-cp39-cp39-win_amd64.whl", hash = "sha256:df051de4fd7e5e4371334e234c62ba43763f15ab605579e04c7008c05735cd82"}, + {file = "protobuf-6.33.1-py3-none-any.whl", hash = "sha256:d595a9fd694fdeb061a62fbe10eb039cc1e444df81ec9bb70c7fc59ebcb1eafa"}, + {file = "protobuf-6.33.1.tar.gz", hash = "sha256:97f65757e8d09870de6fd973aeddb92f85435607235d20b2dfed93405d00c85b"}, ] [[package]] name = "psutil" -version = "7.0.0" -description = "Cross-platform lib for process and system monitoring in Python. NOTE: the syntax of this script MUST be kept compatible with Python 2.7." +version = "7.1.3" +description = "Cross-platform lib for process and system monitoring." optional = false python-versions = ">=3.6" -files = [ - {file = "psutil-7.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:101d71dc322e3cffd7cea0650b09b3d08b8e7c4109dd6809fe452dfd00e58b25"}, - {file = "psutil-7.0.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:39db632f6bb862eeccf56660871433e111b6ea58f2caea825571951d4b6aa3da"}, - {file = "psutil-7.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fcee592b4c6f146991ca55919ea3d1f8926497a713ed7faaf8225e174581e91"}, - {file = "psutil-7.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b1388a4f6875d7e2aff5c4ca1cc16c545ed41dd8bb596cefea80111db353a34"}, - {file = "psutil-7.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5f098451abc2828f7dc6b58d44b532b22f2088f4999a937557b603ce72b1993"}, - {file = "psutil-7.0.0-cp36-cp36m-win32.whl", hash = "sha256:84df4eb63e16849689f76b1ffcb36db7b8de703d1bc1fe41773db487621b6c17"}, - {file = "psutil-7.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:1e744154a6580bc968a0195fd25e80432d3afec619daf145b9e5ba16cc1d688e"}, - {file = "psutil-7.0.0-cp37-abi3-win32.whl", hash = "sha256:ba3fcef7523064a6c9da440fc4d6bd07da93ac726b5733c29027d7dc95b39d99"}, - {file = "psutil-7.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:4cf3d4eb1aa9b348dec30105c55cd9b7d4629285735a102beb4441e38db90553"}, - {file = "psutil-7.0.0.tar.gz", hash = "sha256:7be9c3eba38beccb6495ea33afd982a44074b78f28c434a1f51cc07fd315c456"}, +groups = ["main", "dev"] +files = [ + {file = "psutil-7.1.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0005da714eee687b4b8decd3d6cc7c6db36215c9e74e5ad2264b90c3df7d92dc"}, + {file = "psutil-7.1.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:19644c85dcb987e35eeeaefdc3915d059dac7bd1167cdcdbf27e0ce2df0c08c0"}, + {file = "psutil-7.1.3-cp313-cp313t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:95ef04cf2e5ba0ab9eaafc4a11eaae91b44f4ef5541acd2ee91d9108d00d59a7"}, + {file = "psutil-7.1.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:1068c303be3a72f8e18e412c5b2a8f6d31750fb152f9cb106b54090296c9d251"}, + {file = "psutil-7.1.3-cp313-cp313t-win_amd64.whl", hash = "sha256:18349c5c24b06ac5612c0428ec2a0331c26443d259e2a0144a9b24b4395b58fa"}, + {file = "psutil-7.1.3-cp313-cp313t-win_arm64.whl", hash = "sha256:c525ffa774fe4496282fb0b1187725793de3e7c6b29e41562733cae9ada151ee"}, + {file = "psutil-7.1.3-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:b403da1df4d6d43973dc004d19cee3b848e998ae3154cc8097d139b77156c353"}, + {file = "psutil-7.1.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ad81425efc5e75da3f39b3e636293360ad8d0b49bed7df824c79764fb4ba9b8b"}, + {file = "psutil-7.1.3-cp314-cp314t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8f33a3702e167783a9213db10ad29650ebf383946e91bc77f28a5eb083496bc9"}, + {file = "psutil-7.1.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fac9cd332c67f4422504297889da5ab7e05fd11e3c4392140f7370f4208ded1f"}, + {file = "psutil-7.1.3-cp314-cp314t-win_amd64.whl", hash = "sha256:3792983e23b69843aea49c8f5b8f115572c5ab64c153bada5270086a2123c7e7"}, + {file = "psutil-7.1.3-cp314-cp314t-win_arm64.whl", hash = "sha256:31d77fcedb7529f27bb3a0472bea9334349f9a04160e8e6e5020f22c59893264"}, + {file = "psutil-7.1.3-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2bdbcd0e58ca14996a42adf3621a6244f1bb2e2e528886959c72cf1e326677ab"}, + {file = "psutil-7.1.3-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:bc31fa00f1fbc3c3802141eede66f3a2d51d89716a194bf2cd6fc68310a19880"}, + {file = "psutil-7.1.3-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3bb428f9f05c1225a558f53e30ccbad9930b11c3fc206836242de1091d3e7dd3"}, + {file = "psutil-7.1.3-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:56d974e02ca2c8eb4812c3f76c30e28836fffc311d55d979f1465c1feeb2b68b"}, + {file = "psutil-7.1.3-cp37-abi3-win_amd64.whl", hash = "sha256:f39c2c19fe824b47484b96f9692932248a54c43799a84282cfe58d05a6449efd"}, + {file = "psutil-7.1.3-cp37-abi3-win_arm64.whl", hash = "sha256:bd0d69cee829226a761e92f28140bec9a5ee9d5b4fb4b0cc589068dbfff559b1"}, + {file = "psutil-7.1.3.tar.gz", hash = "sha256:6c86281738d77335af7aec228328e944b30930899ea760ecf33a4dba66be5e74"}, ] [package.extras] -dev = ["abi3audit", "black (==24.10.0)", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest", "pytest-cov", "pytest-xdist", "requests", "rstcheck", "ruff", "setuptools", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "vulture", "wheel"] -test = ["pytest", "pytest-xdist", "setuptools"] +dev = ["abi3audit", "black", "check-manifest", "colorama ; os_name == \"nt\"", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pyreadline ; os_name == \"nt\"", "pytest", "pytest-cov", "pytest-instafail", "pytest-subtests", "pytest-xdist", "pywin32 ; os_name == \"nt\" and platform_python_implementation != \"PyPy\"", "requests", "rstcheck", "ruff", "setuptools", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "validate-pyproject[all]", "virtualenv", "vulture", "wheel", "wheel ; os_name == \"nt\" and platform_python_implementation != \"PyPy\"", "wmi ; os_name == \"nt\" and platform_python_implementation != \"PyPy\""] +test = ["pytest", "pytest-instafail", "pytest-subtests", "pytest-xdist", "pywin32 ; os_name == \"nt\" and platform_python_implementation != \"PyPy\"", "setuptools", "wheel ; os_name == \"nt\" and 
platform_python_implementation != \"PyPy\"", "wmi ; os_name == \"nt\" and platform_python_implementation != \"PyPy\""] + +[[package]] +name = "psycopg2-binary" +version = "2.9.11" +description = "psycopg2 - Python-PostgreSQL Database Adapter" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "psycopg2-binary-2.9.11.tar.gz", hash = "sha256:b6aed9e096bf63f9e75edf2581aa9a7e7186d97ab5c177aa6c87797cd591236c"}, + {file = "psycopg2_binary-2.9.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d6fe6b47d0b42ce1c9f1fa3e35bb365011ca22e39db37074458f27921dca40f2"}, + {file = "psycopg2_binary-2.9.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a6c0e4262e089516603a09474ee13eabf09cb65c332277e39af68f6233911087"}, + {file = "psycopg2_binary-2.9.11-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c47676e5b485393f069b4d7a811267d3168ce46f988fa602658b8bb901e9e64d"}, + {file = "psycopg2_binary-2.9.11-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:a28d8c01a7b27a1e3265b11250ba7557e5f72b5ee9e5f3a2fa8d2949c29bf5d2"}, + {file = "psycopg2_binary-2.9.11-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5f3f2732cf504a1aa9e9609d02f79bea1067d99edf844ab92c247bbca143303b"}, + {file = "psycopg2_binary-2.9.11-cp310-cp310-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:865f9945ed1b3950d968ec4690ce68c55019d79e4497366d36e090327ce7db14"}, + {file = "psycopg2_binary-2.9.11-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:91537a8df2bde69b1c1db01d6d944c831ca793952e4f57892600e96cee95f2cd"}, + {file = "psycopg2_binary-2.9.11-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:4dca1f356a67ecb68c81a7bc7809f1569ad9e152ce7fd02c2f2036862ca9f66b"}, + {file = "psycopg2_binary-2.9.11-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:0da4de5c1ac69d94ed4364b6cbe7190c1a70d325f112ba783d83f8440285f152"}, + {file = "psycopg2_binary-2.9.11-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37d8412565a7267f7d79e29ab66876e55cb5e8e7b3bbf94f8206f6795f8f7e7e"}, + {file = "psycopg2_binary-2.9.11-cp310-cp310-win_amd64.whl", hash = "sha256:c665f01ec8ab273a61c62beeb8cce3014c214429ced8a308ca1fc410ecac3a39"}, + {file = "psycopg2_binary-2.9.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0e8480afd62362d0a6a27dd09e4ca2def6fa50ed3a4e7c09165266106b2ffa10"}, + {file = "psycopg2_binary-2.9.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:763c93ef1df3da6d1a90f86ea7f3f806dc06b21c198fa87c3c25504abec9404a"}, + {file = "psycopg2_binary-2.9.11-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2e164359396576a3cc701ba8af4751ae68a07235d7a380c631184a611220d9a4"}, + {file = "psycopg2_binary-2.9.11-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:d57c9c387660b8893093459738b6abddbb30a7eab058b77b0d0d1c7d521ddfd7"}, + {file = "psycopg2_binary-2.9.11-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2c226ef95eb2250974bf6fa7a842082b31f68385c4f3268370e3f3870e7859ee"}, + {file = "psycopg2_binary-2.9.11-cp311-cp311-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a311f1edc9967723d3511ea7d2708e2c3592e3405677bf53d5c7246753591fbb"}, + {file = "psycopg2_binary-2.9.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ebb415404821b6d1c47353ebe9c8645967a5235e6d88f914147e7fd411419e6f"}, + {file = "psycopg2_binary-2.9.11-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = 
"sha256:f07c9c4a5093258a03b28fab9b4f151aa376989e7f35f855088234e656ee6a94"}, + {file = "psycopg2_binary-2.9.11-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:00ce1830d971f43b667abe4a56e42c1e2d594b32da4802e44a73bacacb25535f"}, + {file = "psycopg2_binary-2.9.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:cffe9d7697ae7456649617e8bb8d7a45afb71cd13f7ab22af3e5c61f04840908"}, + {file = "psycopg2_binary-2.9.11-cp311-cp311-win_amd64.whl", hash = "sha256:304fd7b7f97eef30e91b8f7e720b3db75fee010b520e434ea35ed1ff22501d03"}, + {file = "psycopg2_binary-2.9.11-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:be9b840ac0525a283a96b556616f5b4820e0526addb8dcf6525a0fa162730be4"}, + {file = "psycopg2_binary-2.9.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f090b7ddd13ca842ebfe301cd587a76a4cf0913b1e429eb92c1be5dbeb1a19bc"}, + {file = "psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ab8905b5dcb05bf3fb22e0cf90e10f469563486ffb6a96569e51f897c750a76a"}, + {file = "psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:bf940cd7e7fec19181fdbc29d76911741153d51cab52e5c21165f3262125685e"}, + {file = "psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fa0f693d3c68ae925966f0b14b8edda71696608039f4ed61b1fe9ffa468d16db"}, + {file = "psycopg2_binary-2.9.11-cp312-cp312-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a1cf393f1cdaf6a9b57c0a719a1068ba1069f022a59b8b1fe44b006745b59757"}, + {file = "psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ef7a6beb4beaa62f88592ccc65df20328029d721db309cb3250b0aae0fa146c3"}, + {file = "psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:31b32c457a6025e74d233957cc9736742ac5a6cb196c6b68499f6bb51390bd6a"}, + {file = "psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:edcb3aeb11cb4bf13a2af3c53a15b3d612edeb6409047ea0b5d6a21a9d744b34"}, + {file = "psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:62b6d93d7c0b61a1dd6197d208ab613eb7dcfdcca0a49c42ceb082257991de9d"}, + {file = "psycopg2_binary-2.9.11-cp312-cp312-win_amd64.whl", hash = "sha256:b33fabeb1fde21180479b2d4667e994de7bbf0eec22832ba5d9b5e4cf65b6c6d"}, + {file = "psycopg2_binary-2.9.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b8fb3db325435d34235b044b199e56cdf9ff41223a4b9752e8576465170bb38c"}, + {file = "psycopg2_binary-2.9.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:366df99e710a2acd90efed3764bb1e28df6c675d33a7fb40df9b7281694432ee"}, + {file = "psycopg2_binary-2.9.11-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8c55b385daa2f92cb64b12ec4536c66954ac53654c7f15a203578da4e78105c0"}, + {file = "psycopg2_binary-2.9.11-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c0377174bf1dd416993d16edc15357f6eb17ac998244cca19bc67cdc0e2e5766"}, + {file = "psycopg2_binary-2.9.11-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5c6ff3335ce08c75afaed19e08699e8aacf95d4a260b495a4a8545244fe2ceb3"}, + {file = "psycopg2_binary-2.9.11-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:84011ba3109e06ac412f95399b704d3d6950e386b7994475b231cf61eec2fc1f"}, + {file = "psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ba34475ceb08cccbdd98f6b46916917ae6eeb92b5ae111df10b544c3a4621dc4"}, + {file = 
"psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:b31e90fdd0f968c2de3b26ab014314fe814225b6c324f770952f7d38abf17e3c"}, + {file = "psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:d526864e0f67f74937a8fce859bd56c979f5e2ec57ca7c627f5f1071ef7fee60"}, + {file = "psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04195548662fa544626c8ea0f06561eb6203f1984ba5b4562764fbeb4c3d14b1"}, + {file = "psycopg2_binary-2.9.11-cp313-cp313-win_amd64.whl", hash = "sha256:efff12b432179443f54e230fdf60de1f6cc726b6c832db8701227d089310e8aa"}, + {file = "psycopg2_binary-2.9.11-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:92e3b669236327083a2e33ccfa0d320dd01b9803b3e14dd986a4fc54aa00f4e1"}, + {file = "psycopg2_binary-2.9.11-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:e0deeb03da539fa3577fcb0b3f2554a97f7e5477c246098dbb18091a4a01c16f"}, + {file = "psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b52a3f9bb540a3e4ec0f6ba6d31339727b2950c9772850d6545b7eae0b9d7c5"}, + {file = "psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:db4fd476874ccfdbb630a54426964959e58da4c61c9feba73e6094d51303d7d8"}, + {file = "psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:47f212c1d3be608a12937cc131bd85502954398aaa1320cb4c14421a0ffccf4c"}, + {file = "psycopg2_binary-2.9.11-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e35b7abae2b0adab776add56111df1735ccc71406e56203515e228a8dc07089f"}, + {file = "psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:fcf21be3ce5f5659daefd2b3b3b6e4727b028221ddc94e6c1523425579664747"}, + {file = "psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:9bd81e64e8de111237737b29d68039b9c813bdf520156af36d26819c9a979e5f"}, + {file = "psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:32770a4d666fbdafab017086655bcddab791d7cb260a16679cc5a7338b64343b"}, + {file = "psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:c3cb3a676873d7506825221045bd70e0427c905b9c8ee8d6acd70cfcbd6e576d"}, + {file = "psycopg2_binary-2.9.11-cp314-cp314-win_amd64.whl", hash = "sha256:4012c9c954dfaccd28f94e84ab9f94e12df76b4afb22331b1f0d3154893a6316"}, + {file = "psycopg2_binary-2.9.11-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:20e7fb94e20b03dcc783f76c0865f9da39559dcc0c28dd1a3fce0d01902a6b9c"}, + {file = "psycopg2_binary-2.9.11-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4bdab48575b6f870f465b397c38f1b415520e9879fdf10a53ee4f49dcbdf8a21"}, + {file = "psycopg2_binary-2.9.11-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9d3a9edcfbe77a3ed4bc72836d466dfce4174beb79eda79ea155cc77237ed9e8"}, + {file = "psycopg2_binary-2.9.11-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:44fc5c2b8fa871ce7f0023f619f1349a0aa03a0857f2c96fbc01c657dcbbdb49"}, + {file = "psycopg2_binary-2.9.11-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9c55460033867b4622cda1b6872edf445809535144152e5d14941ef591980edf"}, + {file = "psycopg2_binary-2.9.11-cp39-cp39-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:2d11098a83cca92deaeaed3d58cfd150d49b3b06ee0d0852be466bf87596899e"}, + {file = "psycopg2_binary-2.9.11-cp39-cp39-musllinux_1_2_aarch64.whl", hash = 
"sha256:691c807d94aecfbc76a14e1408847d59ff5b5906a04a23e12a89007672b9e819"}, + {file = "psycopg2_binary-2.9.11-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:8b81627b691f29c4c30a8f322546ad039c40c328373b11dff7490a3e1b517855"}, + {file = "psycopg2_binary-2.9.11-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:b637d6d941209e8d96a072d7977238eea128046effbf37d1d8b2c0764750017d"}, + {file = "psycopg2_binary-2.9.11-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:41360b01c140c2a03d346cec3280cf8a71aa07d94f3b1509fa0161c366af66b4"}, + {file = "psycopg2_binary-2.9.11-cp39-cp39-win_amd64.whl", hash = "sha256:875039274f8a2361e5207857899706da840768e2a775bf8c65e82f60b197df02"}, +] [[package]] name = "ptyprocess" @@ -4013,6 +4915,8 @@ version = "0.7.0" description = "Run a subprocess in a pseudo terminal" optional = false python-versions = "*" +groups = ["dev"] +markers = "sys_platform != \"win32\" and sys_platform != \"emscripten\" or os_name != \"nt\"" files = [ {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, @@ -4024,6 +4928,7 @@ version = "0.2.3" description = "Safely evaluate AST nodes without side effects" optional = false python-versions = "*" +groups = ["dev"] files = [ {file = "pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0"}, {file = "pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42"}, @@ -4038,6 +4943,7 @@ version = "19.0.1" description = "Python library for Apache Arrow" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "pyarrow-19.0.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:fc28912a2dc924dddc2087679cc8b7263accc71b9ff025a1362b004711661a69"}, {file = "pyarrow-19.0.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:fca15aabbe9b8355800d923cc2e82c8ef514af321e18b437c3d782aa884eaeec"}, @@ -4092,6 +4998,7 @@ version = "0.6.1" description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "pyasn1-0.6.1-py3-none-any.whl", hash = "sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629"}, {file = "pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034"}, @@ -4103,6 +5010,7 @@ version = "0.4.2" description = "A collection of ASN.1-based protocols modules" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a"}, {file = "pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6"}, @@ -4113,156 +5021,183 @@ pyasn1 = ">=0.6.1,<0.7.0" [[package]] name = "pycparser" -version = "2.22" +version = "2.23" description = "C parser in Python" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ - {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, - {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, + {file = "pycparser-2.23-py3-none-any.whl", hash = 
"sha256:e5c6e8d3fbad53479cab09ac03729e0a9faf2bee3db8208a550daf5af81a5934"}, + {file = "pycparser-2.23.tar.gz", hash = "sha256:78816d4f24add8f10a06d6f05b4d424ad9e96cfebf68a4ddc99c65c0720d00c2"}, ] +markers = {main = "platform_python_implementation != \"PyPy\" and implementation_name != \"PyPy\"", dev = "implementation_name != \"PyPy\""} [[package]] name = "pydantic" -version = "2.11.5" +version = "2.12.4" description = "Data validation using Python type hints" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "pydantic-2.11.5-py3-none-any.whl", hash = "sha256:f9c26ba06f9747749ca1e5c94d6a85cb84254577553c8785576fd38fa64dc0f7"}, - {file = "pydantic-2.11.5.tar.gz", hash = "sha256:7f853db3d0ce78ce8bbb148c401c2cdd6431b3473c0cdff2755c7690952a7b7a"}, + {file = "pydantic-2.12.4-py3-none-any.whl", hash = "sha256:92d3d202a745d46f9be6df459ac5a064fdaa3c1c4cd8adcfa332ccf3c05f871e"}, + {file = "pydantic-2.12.4.tar.gz", hash = "sha256:0f8cb9555000a4b5b617f66bfd2566264c4984b27589d3b845685983e8ea85ac"}, ] [package.dependencies] annotated-types = ">=0.6.0" -pydantic-core = "2.33.2" -typing-extensions = ">=4.12.2" -typing-inspection = ">=0.4.0" +pydantic-core = "2.41.5" +typing-extensions = ">=4.14.1" +typing-inspection = ">=0.4.2" [package.extras] email = ["email-validator (>=2.0.0)"] -timezone = ["tzdata"] +timezone = ["tzdata ; python_version >= \"3.9\" and platform_system == \"Windows\""] [[package]] name = "pydantic-core" -version = "2.33.2" +version = "2.41.5" description = "Core functionality for Pydantic validation and serialization" optional = false python-versions = ">=3.9" -files = [ - {file = "pydantic_core-2.33.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8"}, - {file = "pydantic_core-2.33.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d"}, - {file = "pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d"}, - {file = "pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572"}, - {file = "pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02"}, - {file = "pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b"}, - {file = "pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2"}, - {file = "pydantic_core-2.33.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a"}, - {file = "pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac"}, - {file = "pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a"}, - {file = "pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b"}, - {file = "pydantic_core-2.33.2-cp310-cp310-win32.whl", hash = 
"sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22"}, - {file = "pydantic_core-2.33.2-cp310-cp310-win_amd64.whl", hash = "sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640"}, - {file = "pydantic_core-2.33.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7"}, - {file = "pydantic_core-2.33.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246"}, - {file = "pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f"}, - {file = "pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc"}, - {file = "pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de"}, - {file = "pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a"}, - {file = "pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef"}, - {file = "pydantic_core-2.33.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e"}, - {file = "pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d"}, - {file = "pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30"}, - {file = "pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf"}, - {file = "pydantic_core-2.33.2-cp311-cp311-win32.whl", hash = "sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51"}, - {file = "pydantic_core-2.33.2-cp311-cp311-win_amd64.whl", hash = "sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab"}, - {file = "pydantic_core-2.33.2-cp311-cp311-win_arm64.whl", hash = "sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65"}, - {file = "pydantic_core-2.33.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc"}, - {file = "pydantic_core-2.33.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7"}, - {file = "pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025"}, - {file = "pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011"}, - {file = "pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f"}, - {file = "pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88"}, - {file = 
"pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1"}, - {file = "pydantic_core-2.33.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b"}, - {file = "pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1"}, - {file = "pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6"}, - {file = "pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea"}, - {file = "pydantic_core-2.33.2-cp312-cp312-win32.whl", hash = "sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290"}, - {file = "pydantic_core-2.33.2-cp312-cp312-win_amd64.whl", hash = "sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2"}, - {file = "pydantic_core-2.33.2-cp312-cp312-win_arm64.whl", hash = "sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab"}, - {file = "pydantic_core-2.33.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f"}, - {file = "pydantic_core-2.33.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6"}, - {file = "pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef"}, - {file = "pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a"}, - {file = "pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916"}, - {file = "pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a"}, - {file = "pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d"}, - {file = "pydantic_core-2.33.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56"}, - {file = "pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5"}, - {file = "pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e"}, - {file = "pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162"}, - {file = "pydantic_core-2.33.2-cp313-cp313-win32.whl", hash = "sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849"}, - {file = "pydantic_core-2.33.2-cp313-cp313-win_amd64.whl", hash = "sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9"}, - {file = "pydantic_core-2.33.2-cp313-cp313-win_arm64.whl", hash = "sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9"}, - {file = "pydantic_core-2.33.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = 
"sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac"}, - {file = "pydantic_core-2.33.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5"}, - {file = "pydantic_core-2.33.2-cp313-cp313t-win_amd64.whl", hash = "sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9"}, - {file = "pydantic_core-2.33.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:a2b911a5b90e0374d03813674bf0a5fbbb7741570dcd4b4e85a2e48d17def29d"}, - {file = "pydantic_core-2.33.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6fa6dfc3e4d1f734a34710f391ae822e0a8eb8559a85c6979e14e65ee6ba2954"}, - {file = "pydantic_core-2.33.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c54c939ee22dc8e2d545da79fc5381f1c020d6d3141d3bd747eab59164dc89fb"}, - {file = "pydantic_core-2.33.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:53a57d2ed685940a504248187d5685e49eb5eef0f696853647bf37c418c538f7"}, - {file = "pydantic_core-2.33.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:09fb9dd6571aacd023fe6aaca316bd01cf60ab27240d7eb39ebd66a3a15293b4"}, - {file = "pydantic_core-2.33.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0e6116757f7959a712db11f3e9c0a99ade00a5bbedae83cb801985aa154f071b"}, - {file = "pydantic_core-2.33.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d55ab81c57b8ff8548c3e4947f119551253f4e3787a7bbc0b6b3ca47498a9d3"}, - {file = "pydantic_core-2.33.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c20c462aa4434b33a2661701b861604913f912254e441ab8d78d30485736115a"}, - {file = "pydantic_core-2.33.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:44857c3227d3fb5e753d5fe4a3420d6376fa594b07b621e220cd93703fe21782"}, - {file = "pydantic_core-2.33.2-cp39-cp39-musllinux_1_1_armv7l.whl", hash = "sha256:eb9b459ca4df0e5c87deb59d37377461a538852765293f9e6ee834f0435a93b9"}, - {file = "pydantic_core-2.33.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9fcd347d2cc5c23b06de6d3b7b8275be558a0c90549495c699e379a80bf8379e"}, - {file = "pydantic_core-2.33.2-cp39-cp39-win32.whl", hash = "sha256:83aa99b1285bc8f038941ddf598501a86f1536789740991d7d8756e34f1e74d9"}, - {file = "pydantic_core-2.33.2-cp39-cp39-win_amd64.whl", hash = "sha256:f481959862f57f29601ccced557cc2e817bce7533ab8e01a797a48b49c9692b3"}, - {file = "pydantic_core-2.33.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa"}, - {file = "pydantic_core-2.33.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29"}, - {file = "pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d"}, - {file = "pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e"}, - {file = "pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c"}, - {file = "pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec"}, - {file = 
"pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052"}, - {file = "pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c"}, - {file = "pydantic_core-2.33.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808"}, - {file = "pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8"}, - {file = "pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593"}, - {file = "pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612"}, - {file = "pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7"}, - {file = "pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e"}, - {file = "pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8"}, - {file = "pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf"}, - {file = "pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb"}, - {file = "pydantic_core-2.33.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1"}, - {file = "pydantic_core-2.33.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:87acbfcf8e90ca885206e98359d7dca4bcbb35abdc0ff66672a293e1d7a19101"}, - {file = "pydantic_core-2.33.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:7f92c15cd1e97d4b12acd1cc9004fa092578acfa57b67ad5e43a197175d01a64"}, - {file = "pydantic_core-2.33.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d3f26877a748dc4251cfcfda9dfb5f13fcb034f5308388066bcfe9031b63ae7d"}, - {file = "pydantic_core-2.33.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dac89aea9af8cd672fa7b510e7b8c33b0bba9a43186680550ccf23020f32d535"}, - {file = "pydantic_core-2.33.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:970919794d126ba8645f3837ab6046fb4e72bbc057b3709144066204c19a455d"}, - {file = "pydantic_core-2.33.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:3eb3fe62804e8f859c49ed20a8451342de53ed764150cb14ca71357c765dc2a6"}, - {file = "pydantic_core-2.33.2-pp39-pypy39_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:3abcd9392a36025e3bd55f9bd38d908bd17962cc49bc6da8e7e96285336e2bca"}, - {file = "pydantic_core-2.33.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:3a1c81334778f9e3af2f8aeb7a960736e5cab1dfebfb26aabca09afd2906c039"}, - {file = "pydantic_core-2.33.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2807668ba86cb38c6817ad9bc66215ab8584d1d304030ce4f0887336f28a5e27"}, - {file = "pydantic_core-2.33.2.tar.gz", hash = 
"sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc"}, +groups = ["main"] +files = [ + {file = "pydantic_core-2.41.5-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:77b63866ca88d804225eaa4af3e664c5faf3568cea95360d21f4725ab6e07146"}, + {file = "pydantic_core-2.41.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dfa8a0c812ac681395907e71e1274819dec685fec28273a28905df579ef137e2"}, + {file = "pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5921a4d3ca3aee735d9fd163808f5e8dd6c6972101e4adbda9a4667908849b97"}, + {file = "pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e25c479382d26a2a41b7ebea1043564a937db462816ea07afa8a44c0866d52f9"}, + {file = "pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f547144f2966e1e16ae626d8ce72b4cfa0caedc7fa28052001c94fb2fcaa1c52"}, + {file = "pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f52298fbd394f9ed112d56f3d11aabd0d5bd27beb3084cc3d8ad069483b8941"}, + {file = "pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:100baa204bb412b74fe285fb0f3a385256dad1d1879f0a5cb1499ed2e83d132a"}, + {file = "pydantic_core-2.41.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:05a2c8852530ad2812cb7914dc61a1125dc4e06252ee98e5638a12da6cc6fb6c"}, + {file = "pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:29452c56df2ed968d18d7e21f4ab0ac55e71dc59524872f6fc57dcf4a3249ed2"}, + {file = "pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:d5160812ea7a8a2ffbe233d8da666880cad0cbaf5d4de74ae15c313213d62556"}, + {file = "pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:df3959765b553b9440adfd3c795617c352154e497a4eaf3752555cfb5da8fc49"}, + {file = "pydantic_core-2.41.5-cp310-cp310-win32.whl", hash = "sha256:1f8d33a7f4d5a7889e60dc39856d76d09333d8a6ed0f5f1190635cbec70ec4ba"}, + {file = "pydantic_core-2.41.5-cp310-cp310-win_amd64.whl", hash = "sha256:62de39db01b8d593e45871af2af9e497295db8d73b085f6bfd0b18c83c70a8f9"}, + {file = "pydantic_core-2.41.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a3a52f6156e73e7ccb0f8cced536adccb7042be67cb45f9562e12b319c119da6"}, + {file = "pydantic_core-2.41.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7f3bf998340c6d4b0c9a2f02d6a400e51f123b59565d74dc60d252ce888c260b"}, + {file = "pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:378bec5c66998815d224c9ca994f1e14c0c21cb95d2f52b6021cc0b2a58f2a5a"}, + {file = "pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e7b576130c69225432866fe2f4a469a85a54ade141d96fd396dffcf607b558f8"}, + {file = "pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6cb58b9c66f7e4179a2d5e0f849c48eff5c1fca560994d6eb6543abf955a149e"}, + {file = "pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:88942d3a3dff3afc8288c21e565e476fc278902ae4d6d134f1eeda118cc830b1"}, + {file = "pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f31d95a179f8d64d90f6831d71fa93290893a33148d890ba15de25642c5d075b"}, + {file = "pydantic_core-2.41.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:c1df3d34aced70add6f867a8cf413e299177e0c22660cc767218373d0779487b"}, + {file = "pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4009935984bd36bd2c774e13f9a09563ce8de4abaa7226f5108262fa3e637284"}, + {file = "pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:34a64bc3441dc1213096a20fe27e8e128bd3ff89921706e83c0b1ac971276594"}, + {file = "pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c9e19dd6e28fdcaa5a1de679aec4141f691023916427ef9bae8584f9c2fb3b0e"}, + {file = "pydantic_core-2.41.5-cp311-cp311-win32.whl", hash = "sha256:2c010c6ded393148374c0f6f0bf89d206bf3217f201faa0635dcd56bd1520f6b"}, + {file = "pydantic_core-2.41.5-cp311-cp311-win_amd64.whl", hash = "sha256:76ee27c6e9c7f16f47db7a94157112a2f3a00e958bc626e2f4ee8bec5c328fbe"}, + {file = "pydantic_core-2.41.5-cp311-cp311-win_arm64.whl", hash = "sha256:4bc36bbc0b7584de96561184ad7f012478987882ebf9f9c389b23f432ea3d90f"}, + {file = "pydantic_core-2.41.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f41a7489d32336dbf2199c8c0a215390a751c5b014c2c1c5366e817202e9cdf7"}, + {file = "pydantic_core-2.41.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:070259a8818988b9a84a449a2a7337c7f430a22acc0859c6b110aa7212a6d9c0"}, + {file = "pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e96cea19e34778f8d59fe40775a7a574d95816eb150850a85a7a4c8f4b94ac69"}, + {file = "pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed2e99c456e3fadd05c991f8f437ef902e00eedf34320ba2b0842bd1c3ca3a75"}, + {file = "pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:65840751b72fbfd82c3c640cff9284545342a4f1eb1586ad0636955b261b0b05"}, + {file = "pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e536c98a7626a98feb2d3eaf75944ef6f3dbee447e1f841eae16f2f0a72d8ddc"}, + {file = "pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eceb81a8d74f9267ef4081e246ffd6d129da5d87e37a77c9bde550cb04870c1c"}, + {file = "pydantic_core-2.41.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d38548150c39b74aeeb0ce8ee1d8e82696f4a4e16ddc6de7b1d8823f7de4b9b5"}, + {file = "pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c23e27686783f60290e36827f9c626e63154b82b116d7fe9adba1fda36da706c"}, + {file = "pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:482c982f814460eabe1d3bb0adfdc583387bd4691ef00b90575ca0d2b6fe2294"}, + {file = "pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:bfea2a5f0b4d8d43adf9d7b8bf019fb46fdd10a2e5cde477fbcb9d1fa08c68e1"}, + {file = "pydantic_core-2.41.5-cp312-cp312-win32.whl", hash = "sha256:b74557b16e390ec12dca509bce9264c3bbd128f8a2c376eaa68003d7f327276d"}, + {file = "pydantic_core-2.41.5-cp312-cp312-win_amd64.whl", hash = "sha256:1962293292865bca8e54702b08a4f26da73adc83dd1fcf26fbc875b35d81c815"}, + {file = "pydantic_core-2.41.5-cp312-cp312-win_arm64.whl", hash = "sha256:1746d4a3d9a794cacae06a5eaaccb4b8643a131d45fbc9af23e353dc0a5ba5c3"}, + {file = "pydantic_core-2.41.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:941103c9be18ac8daf7b7adca8228f8ed6bb7a1849020f643b3a14d15b1924d9"}, + {file = "pydantic_core-2.41.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:112e305c3314f40c93998e567879e887a3160bb8689ef3d2c04b6cc62c33ac34"}, + {file = 
"pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cbaad15cb0c90aa221d43c00e77bb33c93e8d36e0bf74760cd00e732d10a6a0"}, + {file = "pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:03ca43e12fab6023fc79d28ca6b39b05f794ad08ec2feccc59a339b02f2b3d33"}, + {file = "pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dc799088c08fa04e43144b164feb0c13f9a0bc40503f8df3e9fde58a3c0c101e"}, + {file = "pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:97aeba56665b4c3235a0e52b2c2f5ae9cd071b8a8310ad27bddb3f7fb30e9aa2"}, + {file = "pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:406bf18d345822d6c21366031003612b9c77b3e29ffdb0f612367352aab7d586"}, + {file = "pydantic_core-2.41.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b93590ae81f7010dbe380cdeab6f515902ebcbefe0b9327cc4804d74e93ae69d"}, + {file = "pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:01a3d0ab748ee531f4ea6c3e48ad9dac84ddba4b0d82291f87248f2f9de8d740"}, + {file = "pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:6561e94ba9dacc9c61bce40e2d6bdc3bfaa0259d3ff36ace3b1e6901936d2e3e"}, + {file = "pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:915c3d10f81bec3a74fbd4faebe8391013ba61e5a1a8d48c4455b923bdda7858"}, + {file = "pydantic_core-2.41.5-cp313-cp313-win32.whl", hash = "sha256:650ae77860b45cfa6e2cdafc42618ceafab3a2d9a3811fcfbd3bbf8ac3c40d36"}, + {file = "pydantic_core-2.41.5-cp313-cp313-win_amd64.whl", hash = "sha256:79ec52ec461e99e13791ec6508c722742ad745571f234ea6255bed38c6480f11"}, + {file = "pydantic_core-2.41.5-cp313-cp313-win_arm64.whl", hash = "sha256:3f84d5c1b4ab906093bdc1ff10484838aca54ef08de4afa9de0f5f14d69639cd"}, + {file = "pydantic_core-2.41.5-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:3f37a19d7ebcdd20b96485056ba9e8b304e27d9904d233d7b1015db320e51f0a"}, + {file = "pydantic_core-2.41.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1d1d9764366c73f996edd17abb6d9d7649a7eb690006ab6adbda117717099b14"}, + {file = "pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25e1c2af0fce638d5f1988b686f3b3ea8cd7de5f244ca147c777769e798a9cd1"}, + {file = "pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:506d766a8727beef16b7adaeb8ee6217c64fc813646b424d0804d67c16eddb66"}, + {file = "pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4819fa52133c9aa3c387b3328f25c1facc356491e6135b459f1de698ff64d869"}, + {file = "pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b761d210c9ea91feda40d25b4efe82a1707da2ef62901466a42492c028553a2"}, + {file = "pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22f0fb8c1c583a3b6f24df2470833b40207e907b90c928cc8d3594b76f874375"}, + {file = "pydantic_core-2.41.5-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2782c870e99878c634505236d81e5443092fba820f0373997ff75f90f68cd553"}, + {file = "pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:0177272f88ab8312479336e1d777f6b124537d47f2123f89cb37e0accea97f90"}, + {file = "pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_armv7l.whl", hash = 
"sha256:63510af5e38f8955b8ee5687740d6ebf7c2a0886d15a6d65c32814613681bc07"}, + {file = "pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:e56ba91f47764cc14f1daacd723e3e82d1a89d783f0f5afe9c364b8bb491ccdb"}, + {file = "pydantic_core-2.41.5-cp314-cp314-win32.whl", hash = "sha256:aec5cf2fd867b4ff45b9959f8b20ea3993fc93e63c7363fe6851424c8a7e7c23"}, + {file = "pydantic_core-2.41.5-cp314-cp314-win_amd64.whl", hash = "sha256:8e7c86f27c585ef37c35e56a96363ab8de4e549a95512445b85c96d3e2f7c1bf"}, + {file = "pydantic_core-2.41.5-cp314-cp314-win_arm64.whl", hash = "sha256:e672ba74fbc2dc8eea59fb6d4aed6845e6905fc2a8afe93175d94a83ba2a01a0"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:8566def80554c3faa0e65ac30ab0932b9e3a5cd7f8323764303d468e5c37595a"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b80aa5095cd3109962a298ce14110ae16b8c1aece8b72f9dafe81cf597ad80b3"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3006c3dd9ba34b0c094c544c6006cc79e87d8612999f1a5d43b769b89181f23c"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:72f6c8b11857a856bcfa48c86f5368439f74453563f951e473514579d44aa612"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cb1b2f9742240e4bb26b652a5aeb840aa4b417c7748b6f8387927bc6e45e40d"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd3d54f38609ff308209bd43acea66061494157703364ae40c951f83ba99a1a9"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ff4321e56e879ee8d2a879501c8e469414d948f4aba74a2d4593184eb326660"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0d2568a8c11bf8225044aa94409e21da0cb09dcdafe9ecd10250b2baad531a9"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:a39455728aabd58ceabb03c90e12f71fd30fa69615760a075b9fec596456ccc3"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:239edca560d05757817c13dc17c50766136d21f7cd0fac50295499ae24f90fdf"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:2a5e06546e19f24c6a96a129142a75cee553cc018ffee48a460059b1185f4470"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c"}, + {file = "pydantic_core-2.41.5-cp314-cp314t-win_arm64.whl", hash = "sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008"}, + {file = "pydantic_core-2.41.5-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:8bfeaf8735be79f225f3fefab7f941c712aaca36f1128c9d7e2352ee1aa87bdf"}, + {file = "pydantic_core-2.41.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:346285d28e4c8017da95144c7f3acd42740d637ff41946af5ce6e5e420502dd5"}, + {file = "pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a75dafbf87d6276ddc5b2bf6fae5254e3d0876b626eb24969a574fff9149ee5d"}, + {file = "pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:7b93a4d08587e2b7e7882de461e82b6ed76d9026ce91ca7915e740ecc7855f60"}, + {file = "pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e8465ab91a4bd96d36dde3263f06caa6a8a6019e4113f24dc753d79a8b3a3f82"}, + {file = "pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:299e0a22e7ae2b85c1a57f104538b2656e8ab1873511fd718a1c1c6f149b77b5"}, + {file = "pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:707625ef0983fcfb461acfaf14de2067c5942c6bb0f3b4c99158bed6fedd3cf3"}, + {file = "pydantic_core-2.41.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f41eb9797986d6ebac5e8edff36d5cef9de40def462311b3eb3eeded1431e425"}, + {file = "pydantic_core-2.41.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0384e2e1021894b1ff5a786dbf94771e2986ebe2869533874d7e43bc79c6f504"}, + {file = "pydantic_core-2.41.5-cp39-cp39-musllinux_1_1_armv7l.whl", hash = "sha256:f0cd744688278965817fd0839c4a4116add48d23890d468bc436f78beb28abf5"}, + {file = "pydantic_core-2.41.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:753e230374206729bf0a807954bcc6c150d3743928a73faffee51ac6557a03c3"}, + {file = "pydantic_core-2.41.5-cp39-cp39-win32.whl", hash = "sha256:873e0d5b4fb9b89ef7c2d2a963ea7d02879d9da0da8d9d4933dee8ee86a8b460"}, + {file = "pydantic_core-2.41.5-cp39-cp39-win_amd64.whl", hash = "sha256:e4f4a984405e91527a0d62649ee21138f8e3d0ef103be488c1dc11a80d7f184b"}, + {file = "pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:b96d5f26b05d03cc60f11a7761a5ded1741da411e7fe0909e27a5e6a0cb7b034"}, + {file = "pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:634e8609e89ceecea15e2d61bc9ac3718caaaa71963717bf3c8f38bfde64242c"}, + {file = "pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93e8740d7503eb008aa2df04d3b9735f845d43ae845e6dcd2be0b55a2da43cd2"}, + {file = "pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f15489ba13d61f670dcc96772e733aad1a6f9c429cc27574c6cdaed82d0146ad"}, + {file = "pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:7da7087d756b19037bc2c06edc6c170eeef3c3bafcb8f532ff17d64dc427adfd"}, + {file = "pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:aabf5777b5c8ca26f7824cb4a120a740c9588ed58df9b2d196ce92fba42ff8dc"}, + {file = "pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c007fe8a43d43b3969e8469004e9845944f1a80e6acd47c150856bb87f230c56"}, + {file = "pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76d0819de158cd855d1cbb8fcafdf6f5cf1eb8e470abe056d5d161106e38062b"}, + {file = "pydantic_core-2.41.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b5819cd790dbf0c5eb9f82c73c16b39a65dd6dd4d1439dcdea7816ec9adddab8"}, + {file = "pydantic_core-2.41.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:5a4e67afbc95fa5c34cf27d9089bca7fcab4e51e57278d710320a70b956d1b9a"}, + {file = "pydantic_core-2.41.5-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ece5c59f0ce7d001e017643d8d24da587ea1f74f6993467d85ae8a5ef9d4f42b"}, + {file = 
"pydantic_core-2.41.5-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:16f80f7abe3351f8ea6858914ddc8c77e02578544a0ebc15b4c2e1a0e813b0b2"}, + {file = "pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:33cb885e759a705b426baada1fe68cbb0a2e68e34c5d0d0289a364cf01709093"}, + {file = "pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:c8d8b4eb992936023be7dee581270af5c6e0697a8559895f527f5b7105ecd36a"}, + {file = "pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:242a206cd0318f95cd21bdacff3fcc3aab23e79bba5cac3db5a841c9ef9c6963"}, + {file = "pydantic_core-2.41.5-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d3a978c4f57a597908b7e697229d996d77a6d3c94901e9edee593adada95ce1a"}, + {file = "pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b2379fa7ed44ddecb5bfe4e48577d752db9fc10be00a6b7446e9663ba143de26"}, + {file = "pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:266fb4cbf5e3cbd0b53669a6d1b039c45e3ce651fd5442eff4d07c2cc8d66808"}, + {file = "pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58133647260ea01e4d0500089a8c4f07bd7aa6ce109682b1426394988d8aaacc"}, + {file = "pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:287dad91cfb551c363dc62899a80e9e14da1f0e2b6ebde82c806612ca2a13ef1"}, + {file = "pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:03b77d184b9eb40240ae9fd676ca364ce1085f203e1b1256f8ab9984dca80a84"}, + {file = "pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:a668ce24de96165bb239160b3d854943128f4334822900534f2fe947930e5770"}, + {file = "pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f14f8f046c14563f8eb3f45f499cc658ab8d10072961e07225e507adb700e93f"}, + {file = "pydantic_core-2.41.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:56121965f7a4dc965bff783d70b907ddf3d57f6eba29b6d2e5dabfaf07799c51"}, + {file = "pydantic_core-2.41.5.tar.gz", hash = "sha256:08daa51ea16ad373ffd5e7606252cc32f07bc72b28284b6bc9c6df804816476e"}, ] [package.dependencies] -typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" +typing-extensions = ">=4.14.1" [[package]] name = "pygments" -version = "2.19.1" +version = "2.19.2" description = "Pygments is a syntax highlighting package written in Python." 
optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ - {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"}, - {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"}, + {file = "pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b"}, + {file = "pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887"}, ] [package.extras] @@ -4270,13 +5205,14 @@ windows-terminal = ["colorama (>=0.4.6)"] [[package]] name = "pyparsing" -version = "3.2.3" -description = "pyparsing module - Classes and methods to define and execute parsing grammars" +version = "3.2.5" +description = "pyparsing - Classes and methods to define and execute parsing grammars" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "pyparsing-3.2.3-py3-none-any.whl", hash = "sha256:a749938e02d6fd0b59b356ca504a24982314bb090c383e3cf201c95ef7e2bfcf"}, - {file = "pyparsing-3.2.3.tar.gz", hash = "sha256:b9c13f1ab8b3b542f72e28f634bad4de758ab3ce4546e4301970ad6fa77c38be"}, + {file = "pyparsing-3.2.5-py3-none-any.whl", hash = "sha256:e38a4f02064cf41fe6593d328d0512495ad1f3d8a91c4f73fc401b3079a59a5e"}, + {file = "pyparsing-3.2.5.tar.gz", hash = "sha256:2df8d5b7b2802ef88e8d016a2eb9c7aeaa923529cd251ed0fe4608275d4105b6"}, ] [package.extras] @@ -4284,31 +5220,33 @@ diagrams = ["jinja2", "railroad-diagrams"] [[package]] name = "pyproject-api" -version = "1.9.1" +version = "1.10.0" description = "API to interact with the python pyproject.toml based projects" optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" +groups = ["dev"] files = [ - {file = "pyproject_api-1.9.1-py3-none-any.whl", hash = "sha256:7d6238d92f8962773dd75b5f0c4a6a27cce092a14b623b811dba656f3b628948"}, - {file = "pyproject_api-1.9.1.tar.gz", hash = "sha256:43c9918f49daab37e302038fc1aed54a8c7a91a9fa935d00b9a485f37e0f5335"}, + {file = "pyproject_api-1.10.0-py3-none-any.whl", hash = "sha256:8757c41a79c0f4ab71b99abed52b97ecf66bd20b04fa59da43b5840bac105a09"}, + {file = "pyproject_api-1.10.0.tar.gz", hash = "sha256:40c6f2d82eebdc4afee61c773ed208c04c19db4c4a60d97f8d7be3ebc0bbb330"}, ] [package.dependencies] packaging = ">=25" [package.extras] -docs = ["furo (>=2024.8.6)", "sphinx-autodoc-typehints (>=3.2)"] -testing = ["covdefaults (>=2.3)", "pytest (>=8.3.5)", "pytest-cov (>=6.1.1)", "pytest-mock (>=3.14)", "setuptools (>=80.3.1)"] +docs = ["furo (>=2025.9.25)", "sphinx-autodoc-typehints (>=3.5.1)"] +testing = ["covdefaults (>=2.3)", "pytest (>=8.4.2)", "pytest-cov (>=7)", "pytest-mock (>=3.15.1)", "setuptools (>=80.9)"] [[package]] name = "pytest" -version = "8.4.0" +version = "8.4.2" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ - {file = "pytest-8.4.0-py3-none-any.whl", hash = "sha256:f40f825768ad76c0977cbacdf1fd37c6f7a468e460ea6a0636078f8972d4517e"}, - {file = "pytest-8.4.0.tar.gz", hash = "sha256:14d920b48472ea0dbf68e45b96cd1ffda4705f33307dcc86c676c1b5104838a6"}, + {file = "pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79"}, + {file = "pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01"}, ] [package.dependencies] @@ -4323,18 +5261,20 @@ dev = ["argcomplete", "attrs (>=19.2)", 
"hypothesis (>=3.56)", "mock", "requests [[package]] name = "pytest-cov" -version = "6.1.1" +version = "6.3.0" description = "Pytest plugin for measuring coverage." optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ - {file = "pytest_cov-6.1.1-py3-none-any.whl", hash = "sha256:bddf29ed2d0ab6f4df17b4c55b0a657287db8684af9c42ea546b21b1041b3dde"}, - {file = "pytest_cov-6.1.1.tar.gz", hash = "sha256:46935f7aaefba760e716c2ebfbe1c216240b9592966e7da99ea8292d4d3e2a0a"}, + {file = "pytest_cov-6.3.0-py3-none-any.whl", hash = "sha256:440db28156d2468cafc0415b4f8e50856a0d11faefa38f30906048fe490f1749"}, + {file = "pytest_cov-6.3.0.tar.gz", hash = "sha256:35c580e7800f87ce892e687461166e1ac2bcb8fb9e13aea79032518d6e503ff2"}, ] [package.dependencies] coverage = {version = ">=7.5", extras = ["toml"]} -pytest = ">=4.6" +pluggy = ">=1.2" +pytest = ">=6.2.5" [package.extras] testing = ["fields", "hunter", "process-tests", "pytest-xdist", "virtualenv"] @@ -4345,6 +5285,7 @@ version = "2.9.0.post0" description = "Extensions to the standard Python datetime module" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["main", "dev"] files = [ {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, @@ -4355,13 +5296,14 @@ six = ">=1.5" [[package]] name = "python-dotenv" -version = "1.1.0" +version = "1.2.1" description = "Read key-value pairs from a .env file and set them as environment variables" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "python_dotenv-1.1.0-py3-none-any.whl", hash = "sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d"}, - {file = "python_dotenv-1.1.0.tar.gz", hash = "sha256:41f90bc6f5f177fb41f53e87666db362025010eb28f60a01c9143bfa33a2b2d5"}, + {file = "python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61"}, + {file = "python_dotenv-1.2.1.tar.gz", hash = "sha256:42667e897e16ab0d66954af0e60a9caa94f0fd4ecf3aaf6d2d260eec1aa36ad6"}, ] [package.extras] @@ -4369,17 +5311,30 @@ cli = ["click (>=5.0)"] [[package]] name = "python-json-logger" -version = "3.3.0" +version = "4.0.0" description = "JSON Log Formatter for the Python Logging Package" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ - {file = "python_json_logger-3.3.0-py3-none-any.whl", hash = "sha256:dd980fae8cffb24c13caf6e158d3d61c0d6d22342f932cb6e9deedab3d35eec7"}, - {file = "python_json_logger-3.3.0.tar.gz", hash = "sha256:12b7e74b17775e7d565129296105bbe3910842d9d0eb083fc83a6a617aa8df84"}, + {file = "python_json_logger-4.0.0-py3-none-any.whl", hash = "sha256:af09c9daf6a813aa4cc7180395f50f2a9e5fa056034c9953aec92e381c5ba1e2"}, + {file = "python_json_logger-4.0.0.tar.gz", hash = "sha256:f58e68eb46e1faed27e0f574a55a0455eecd7b8a5b88b85a784519ba3cff047f"}, ] [package.extras] -dev = ["backports.zoneinfo", "black", "build", "freezegun", "mdx_truly_sane_lists", "mike", "mkdocs", "mkdocs-awesome-pages-plugin", "mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-material (>=8.5)", "mkdocstrings[python]", "msgspec", "mypy", "orjson", "pylint", "pytest", "tzdata", "validate-pyproject[all]"] +dev = ["backports.zoneinfo ; python_version < \"3.9\"", "black", "build", "freezegun", "mdx_truly_sane_lists", "mike", "mkdocs", "mkdocs-awesome-pages-plugin", 
"mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-material (>=8.5)", "mkdocstrings[python]", "msgspec ; implementation_name != \"pypy\"", "mypy", "orjson ; implementation_name != \"pypy\"", "pylint", "pytest", "tzdata", "validate-pyproject[all]"] + +[[package]] +name = "python-multipart" +version = "0.0.16" +description = "A streaming multipart parser for Python" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "python_multipart-0.0.16-py3-none-any.whl", hash = "sha256:c2759b7b976ef3937214dfb592446b59dfaa5f04682a076f78b117c94776d87a"}, + {file = "python_multipart-0.0.16.tar.gz", hash = "sha256:8dee37b88dab9b59922ca173c35acb627cc12ec74019f5cd4578369c6df36554"}, +] [[package]] name = "python-slugify" @@ -4387,6 +5342,7 @@ version = "8.0.4" description = "A Python slugify application that also handles Unicode" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "python-slugify-8.0.4.tar.gz", hash = "sha256:59202371d1d05b54a9e7720c5e038f928f45daaffe41dd10822f3907b937c856"}, {file = "python_slugify-8.0.4-py2.py3-none-any.whl", hash = "sha256:276540b79961052b66b7d116620b36518847f52d5fd9e3a70164fc8c50faa6b8"}, @@ -4398,12 +5354,38 @@ text-unidecode = ">=1.3" [package.extras] unidecode = ["Unidecode (>=1.1.1)"] +[[package]] +name = "pytorch-frame" +version = "0.3.0" +description = "Tabular Deep Learning Library for PyTorch" +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "pytorch_frame-0.3.0-py3-none-any.whl", hash = "sha256:88916f53865230e4fe04c912ed774ac65f566b5b3424b43a80fe88beac63589f"}, + {file = "pytorch_frame-0.3.0.tar.gz", hash = "sha256:412b42002e1a21a09fc8b0c6e4f0d048bd4e5ebef2f7139bbbbae82d81040152"}, +] + +[package.dependencies] +numpy = "*" +pandas = "*" +Pillow = "*" +pyarrow = "*" +torch = "*" +tqdm = "*" + +[package.extras] +dev = ["pre-commit", "pytorch-frame[test]"] +full = ["catboost", "datasets", "lightgbm", "mpmath (==1.3.0)", "optuna (>=3.0.0)", "optuna-integration", "scikit-learn", "torchmetrics", "xgboost (>=1.7.0,<2.0.0)"] +test = ["mypy", "pytest", "pytest-cov"] + [[package]] name = "pytz" version = "2025.2" description = "World timezone definitions, modern and historical" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00"}, {file = "pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3"}, @@ -4411,207 +5393,238 @@ files = [ [[package]] name = "pywin32" -version = "310" +version = "311" description = "Python for Window Extensions" optional = false python-versions = "*" -files = [ - {file = "pywin32-310-cp310-cp310-win32.whl", hash = "sha256:6dd97011efc8bf51d6793a82292419eba2c71cf8e7250cfac03bba284454abc1"}, - {file = "pywin32-310-cp310-cp310-win_amd64.whl", hash = "sha256:c3e78706e4229b915a0821941a84e7ef420bf2b77e08c9dae3c76fd03fd2ae3d"}, - {file = "pywin32-310-cp310-cp310-win_arm64.whl", hash = "sha256:33babed0cf0c92a6f94cc6cc13546ab24ee13e3e800e61ed87609ab91e4c8213"}, - {file = "pywin32-310-cp311-cp311-win32.whl", hash = "sha256:1e765f9564e83011a63321bb9d27ec456a0ed90d3732c4b2e312b855365ed8bd"}, - {file = "pywin32-310-cp311-cp311-win_amd64.whl", hash = "sha256:126298077a9d7c95c53823934f000599f66ec9296b09167810eb24875f32689c"}, - {file = "pywin32-310-cp311-cp311-win_arm64.whl", hash = "sha256:19ec5fc9b1d51c4350be7bb00760ffce46e6c95eaf2f0b2f1150657b1a43c582"}, - {file = 
"pywin32-310-cp312-cp312-win32.whl", hash = "sha256:8a75a5cc3893e83a108c05d82198880704c44bbaee4d06e442e471d3c9ea4f3d"}, - {file = "pywin32-310-cp312-cp312-win_amd64.whl", hash = "sha256:bf5c397c9a9a19a6f62f3fb821fbf36cac08f03770056711f765ec1503972060"}, - {file = "pywin32-310-cp312-cp312-win_arm64.whl", hash = "sha256:2349cc906eae872d0663d4d6290d13b90621eaf78964bb1578632ff20e152966"}, - {file = "pywin32-310-cp313-cp313-win32.whl", hash = "sha256:5d241a659c496ada3253cd01cfaa779b048e90ce4b2b38cd44168ad555ce74ab"}, - {file = "pywin32-310-cp313-cp313-win_amd64.whl", hash = "sha256:667827eb3a90208ddbdcc9e860c81bde63a135710e21e4cb3348968e4bd5249e"}, - {file = "pywin32-310-cp313-cp313-win_arm64.whl", hash = "sha256:e308f831de771482b7cf692a1f308f8fca701b2d8f9dde6cc440c7da17e47b33"}, - {file = "pywin32-310-cp38-cp38-win32.whl", hash = "sha256:0867beb8addefa2e3979d4084352e4ac6e991ca45373390775f7084cc0209b9c"}, - {file = "pywin32-310-cp38-cp38-win_amd64.whl", hash = "sha256:30f0a9b3138fb5e07eb4973b7077e1883f558e40c578c6925acc7a94c34eaa36"}, - {file = "pywin32-310-cp39-cp39-win32.whl", hash = "sha256:851c8d927af0d879221e616ae1f66145253537bbdd321a77e8ef701b443a9a1a"}, - {file = "pywin32-310-cp39-cp39-win_amd64.whl", hash = "sha256:96867217335559ac619f00ad70e513c0fcf84b8a3af9fc2bba3b59b97da70475"}, +groups = ["main"] +markers = "sys_platform == \"win32\"" +files = [ + {file = "pywin32-311-cp310-cp310-win32.whl", hash = "sha256:d03ff496d2a0cd4a5893504789d4a15399133fe82517455e78bad62efbb7f0a3"}, + {file = "pywin32-311-cp310-cp310-win_amd64.whl", hash = "sha256:797c2772017851984b97180b0bebe4b620bb86328e8a884bb626156295a63b3b"}, + {file = "pywin32-311-cp310-cp310-win_arm64.whl", hash = "sha256:0502d1facf1fed4839a9a51ccbcc63d952cf318f78ffc00a7e78528ac27d7a2b"}, + {file = "pywin32-311-cp311-cp311-win32.whl", hash = "sha256:184eb5e436dea364dcd3d2316d577d625c0351bf237c4e9a5fabbcfa5a58b151"}, + {file = "pywin32-311-cp311-cp311-win_amd64.whl", hash = "sha256:3ce80b34b22b17ccbd937a6e78e7225d80c52f5ab9940fe0506a1a16f3dab503"}, + {file = "pywin32-311-cp311-cp311-win_arm64.whl", hash = "sha256:a733f1388e1a842abb67ffa8e7aad0e70ac519e09b0f6a784e65a136ec7cefd2"}, + {file = "pywin32-311-cp312-cp312-win32.whl", hash = "sha256:750ec6e621af2b948540032557b10a2d43b0cee2ae9758c54154d711cc852d31"}, + {file = "pywin32-311-cp312-cp312-win_amd64.whl", hash = "sha256:b8c095edad5c211ff31c05223658e71bf7116daa0ecf3ad85f3201ea3190d067"}, + {file = "pywin32-311-cp312-cp312-win_arm64.whl", hash = "sha256:e286f46a9a39c4a18b319c28f59b61de793654af2f395c102b4f819e584b5852"}, + {file = "pywin32-311-cp313-cp313-win32.whl", hash = "sha256:f95ba5a847cba10dd8c4d8fefa9f2a6cf283b8b88ed6178fa8a6c1ab16054d0d"}, + {file = "pywin32-311-cp313-cp313-win_amd64.whl", hash = "sha256:718a38f7e5b058e76aee1c56ddd06908116d35147e133427e59a3983f703a20d"}, + {file = "pywin32-311-cp313-cp313-win_arm64.whl", hash = "sha256:7b4075d959648406202d92a2310cb990fea19b535c7f4a78d3f5e10b926eeb8a"}, + {file = "pywin32-311-cp314-cp314-win32.whl", hash = "sha256:b7a2c10b93f8986666d0c803ee19b5990885872a7de910fc460f9b0c2fbf92ee"}, + {file = "pywin32-311-cp314-cp314-win_amd64.whl", hash = "sha256:3aca44c046bd2ed8c90de9cb8427f581c479e594e99b5c0bb19b29c10fd6cb87"}, + {file = "pywin32-311-cp314-cp314-win_arm64.whl", hash = "sha256:a508e2d9025764a8270f93111a970e1d0fbfc33f4153b388bb649b7eec4f9b42"}, + {file = "pywin32-311-cp38-cp38-win32.whl", hash = "sha256:6c6f2969607b5023b0d9ce2541f8d2cbb01c4f46bc87456017cf63b73f1e2d8c"}, + {file = "pywin32-311-cp38-cp38-win_amd64.whl", hash = 
"sha256:c8015b09fb9a5e188f83b7b04de91ddca4658cee2ae6f3bc483f0b21a77ef6cd"}, + {file = "pywin32-311-cp39-cp39-win32.whl", hash = "sha256:aba8f82d551a942cb20d4a83413ccbac30790b50efb89a75e4f586ac0bb8056b"}, + {file = "pywin32-311-cp39-cp39-win_amd64.whl", hash = "sha256:e0c4cfb0621281fe40387df582097fd796e80430597cb9944f0ae70447bacd91"}, + {file = "pywin32-311-cp39-cp39-win_arm64.whl", hash = "sha256:62ea666235135fee79bb154e695f3ff67370afefd71bd7fea7512fc70ef31e3d"}, ] [[package]] name = "pywinpty" -version = "2.0.15" +version = "3.0.2" description = "Pseudo terminal support for Windows from Python." optional = false python-versions = ">=3.9" +groups = ["dev"] +markers = "os_name == \"nt\"" files = [ - {file = "pywinpty-2.0.15-cp310-cp310-win_amd64.whl", hash = "sha256:8e7f5de756a615a38b96cd86fa3cd65f901ce54ce147a3179c45907fa11b4c4e"}, - {file = "pywinpty-2.0.15-cp311-cp311-win_amd64.whl", hash = "sha256:9a6bcec2df2707aaa9d08b86071970ee32c5026e10bcc3cc5f6f391d85baf7ca"}, - {file = "pywinpty-2.0.15-cp312-cp312-win_amd64.whl", hash = "sha256:83a8f20b430bbc5d8957249f875341a60219a4e971580f2ba694fbfb54a45ebc"}, - {file = "pywinpty-2.0.15-cp313-cp313-win_amd64.whl", hash = "sha256:ab5920877dd632c124b4ed17bc6dd6ef3b9f86cd492b963ffdb1a67b85b0f408"}, - {file = "pywinpty-2.0.15-cp313-cp313t-win_amd64.whl", hash = "sha256:a4560ad8c01e537708d2790dbe7da7d986791de805d89dd0d3697ca59e9e4901"}, - {file = "pywinpty-2.0.15-cp39-cp39-win_amd64.whl", hash = "sha256:d261cd88fcd358cfb48a7ca0700db3e1c088c9c10403c9ebc0d8a8b57aa6a117"}, - {file = "pywinpty-2.0.15.tar.gz", hash = "sha256:312cf39153a8736c617d45ce8b6ad6cd2107de121df91c455b10ce6bba7a39b2"}, + {file = "pywinpty-3.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:65db57fd3387d71e8372b6a54269cbcd0f6dfa6d4616a29e0af749ec19f5c558"}, + {file = "pywinpty-3.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:327790d70e4c841ebd9d0f295a780177149aeb405bca44c7115a3de5c2054b23"}, + {file = "pywinpty-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:99fdd9b455f0ad6419aba6731a7a0d2f88ced83c3c94a80ff9533d95fa8d8a9e"}, + {file = "pywinpty-3.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:18f78b81e4cfee6aabe7ea8688441d30247b73e52cd9657138015c5f4ee13a51"}, + {file = "pywinpty-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:663383ecfab7fc382cc97ea5c4f7f0bb32c2f889259855df6ea34e5df42d305b"}, + {file = "pywinpty-3.0.2-cp314-cp314-win_amd64.whl", hash = "sha256:28297cecc37bee9f24d8889e47231972d6e9e84f7b668909de54f36ca785029a"}, + {file = "pywinpty-3.0.2-cp314-cp314t-win_amd64.whl", hash = "sha256:34b55ae9a1b671fe3eae071d86618110538e8eaad18fcb1531c0830b91a82767"}, + {file = "pywinpty-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:3962daf801bc38dd4de872108c424b5338c9a46c6efca5761854cd66370a9022"}, + {file = "pywinpty-3.0.2.tar.gz", hash = "sha256:1505cc4cb248af42cb6285a65c9c2086ee9e7e574078ee60933d5d7fa86fb004"}, ] [[package]] name = "pyyaml" -version = "6.0.2" +version = "6.0.3" description = "YAML parser and emitter for Python" optional = false python-versions = ">=3.8" -files = [ - {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, - {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, - {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237"}, - {file = 
"PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b"}, - {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed"}, - {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180"}, - {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68"}, - {file = "PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99"}, - {file = "PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e"}, - {file = "PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774"}, - {file = "PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee"}, - {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c"}, - {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317"}, - {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85"}, - {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4"}, - {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e"}, - {file = "PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5"}, - {file = "PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44"}, - {file = "PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab"}, - {file = "PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725"}, - {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5"}, - {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425"}, - {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476"}, - {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48"}, - {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b"}, - {file = "PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4"}, - {file = "PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = 
"sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8"}, - {file = "PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba"}, - {file = "PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1"}, - {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133"}, - {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484"}, - {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5"}, - {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc"}, - {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"}, - {file = "PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"}, - {file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"}, - {file = "PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a"}, - {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5"}, - {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d"}, - {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083"}, - {file = "PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706"}, - {file = "PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a"}, - {file = "PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff"}, - {file = "PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d"}, - {file = "PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f"}, - {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290"}, - {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12"}, - {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19"}, - {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e"}, - {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725"}, - {file = 
"PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631"}, - {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"}, - {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, +groups = ["main", "dev"] +files = [ + {file = "PyYAML-6.0.3-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:c2514fceb77bc5e7a2f7adfaa1feb2fb311607c9cb518dbc378688ec73d8292f"}, + {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c57bb8c96f6d1808c030b1687b9b5fb476abaa47f0db9c0101f5e9f394e97f4"}, + {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:efd7b85f94a6f21e4932043973a7ba2613b059c4a000551892ac9f1d11f5baf3"}, + {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:22ba7cfcad58ef3ecddc7ed1db3409af68d023b7f940da23c6c2a1890976eda6"}, + {file = "PyYAML-6.0.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:6344df0d5755a2c9a276d4473ae6b90647e216ab4757f8426893b5dd2ac3f369"}, + {file = "PyYAML-6.0.3-cp38-cp38-win32.whl", hash = "sha256:3ff07ec89bae51176c0549bc4c63aa6202991da2d9a6129d7aef7f1407d3f295"}, + {file = "PyYAML-6.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:5cf4e27da7e3fbed4d6c3d8e797387aaad68102272f8f9752883bc32d61cb87b"}, + {file = "pyyaml-6.0.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b"}, + {file = "pyyaml-6.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02ea2dfa234451bbb8772601d7b8e426c2bfa197136796224e50e35a78777956"}, + {file = "pyyaml-6.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b30236e45cf30d2b8e7b3e85881719e98507abed1011bf463a8fa23e9c3e98a8"}, + {file = "pyyaml-6.0.3-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:66291b10affd76d76f54fad28e22e51719ef9ba22b29e1d7d03d6777a9174198"}, + {file = "pyyaml-6.0.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9c7708761fccb9397fe64bbc0395abcae8c4bf7b0eac081e12b809bf47700d0b"}, + {file = "pyyaml-6.0.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:418cf3f2111bc80e0933b2cd8cd04f286338bb88bdc7bc8e6dd775ebde60b5e0"}, + {file = "pyyaml-6.0.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5e0b74767e5f8c593e8c9b5912019159ed0533c70051e9cce3e8b6aa699fcd69"}, + {file = "pyyaml-6.0.3-cp310-cp310-win32.whl", hash = "sha256:28c8d926f98f432f88adc23edf2e6d4921ac26fb084b028c733d01868d19007e"}, + {file = "pyyaml-6.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:bdb2c67c6c1390b63c6ff89f210c8fd09d9a1217a465701eac7316313c915e4c"}, + {file = "pyyaml-6.0.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:44edc647873928551a01e7a563d7452ccdebee747728c1080d881d68af7b997e"}, + {file = "pyyaml-6.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:652cb6edd41e718550aad172851962662ff2681490a8a711af6a4d288dd96824"}, + {file = "pyyaml-6.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:10892704fc220243f5305762e276552a0395f7beb4dbf9b14ec8fd43b57f126c"}, + {file = "pyyaml-6.0.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:850774a7879607d3a6f50d36d04f00ee69e7fc816450e5f7e58d7f17f1ae5c00"}, + {file = "pyyaml-6.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8bb0864c5a28024fac8a632c443c87c5aa6f215c0b126c449ae1a150412f31d"}, + {file = "pyyaml-6.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37d57ad971609cf3c53ba6a7e365e40660e3be0e5175fa9f2365a379d6095a"}, + {file = "pyyaml-6.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37503bfbfc9d2c40b344d06b2199cf0e96e97957ab1c1b546fd4f87e53e5d3e4"}, + {file = "pyyaml-6.0.3-cp311-cp311-win32.whl", hash = "sha256:8098f252adfa6c80ab48096053f512f2321f0b998f98150cea9bd23d83e1467b"}, + {file = "pyyaml-6.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:9f3bfb4965eb874431221a3ff3fdcddc7e74e3b07799e0e84ca4a0f867d449bf"}, + {file = "pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196"}, + {file = "pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0"}, + {file = "pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28"}, + {file = "pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c"}, + {file = "pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc"}, + {file = "pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e"}, + {file = "pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea"}, + {file = "pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5"}, + {file = "pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b"}, + {file = "pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd"}, + {file = "pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8"}, + {file = "pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1"}, + {file = "pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c"}, + {file = "pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5"}, + {file = "pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6"}, + {file = "pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6"}, + {file = "pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be"}, + {file = 
"pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26"}, + {file = "pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c"}, + {file = "pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb"}, + {file = "pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac"}, + {file = "pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310"}, + {file = "pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7"}, + {file = "pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788"}, + {file = "pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5"}, + {file = "pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764"}, + {file = "pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35"}, + {file = "pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac"}, + {file = "pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3"}, + {file = "pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3"}, + {file = "pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba"}, + {file = "pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c"}, + {file = "pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702"}, + {file = "pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c"}, + {file = "pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065"}, + {file = "pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65"}, + {file = "pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9"}, + {file = "pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b"}, + {file = "pyyaml-6.0.3-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:b865addae83924361678b652338317d1bd7e79b1f4596f96b96c77a5a34b34da"}, + {file = "pyyaml-6.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c3355370a2c156cffb25e876646f149d5d68f5e0a3ce86a5084dd0b64a994917"}, + {file = 
"pyyaml-6.0.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3c5677e12444c15717b902a5798264fa7909e41153cdf9ef7ad571b704a63dd9"}, + {file = "pyyaml-6.0.3-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5ed875a24292240029e4483f9d4a4b8a1ae08843b9c54f43fcc11e404532a8a5"}, + {file = "pyyaml-6.0.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0150219816b6a1fa26fb4699fb7daa9caf09eb1999f3b70fb6e786805e80375a"}, + {file = "pyyaml-6.0.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fa160448684b4e94d80416c0fa4aac48967a969efe22931448d853ada8baf926"}, + {file = "pyyaml-6.0.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:27c0abcb4a5dac13684a37f76e701e054692a9b2d3064b70f5e4eb54810553d7"}, + {file = "pyyaml-6.0.3-cp39-cp39-win32.whl", hash = "sha256:1ebe39cb5fc479422b83de611d14e2c0d3bb2a18bbcb01f229ab3cfbd8fee7a0"}, + {file = "pyyaml-6.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:2e71d11abed7344e42a8849600193d15b6def118602c4c176f748e4583246007"}, + {file = "pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f"}, ] [[package]] name = "pyzmq" -version = "26.4.0" +version = "27.1.0" description = "Python bindings for 0MQ" optional = false python-versions = ">=3.8" -files = [ - {file = "pyzmq-26.4.0-cp310-cp310-macosx_10_15_universal2.whl", hash = "sha256:0329bdf83e170ac133f44a233fc651f6ed66ef8e66693b5af7d54f45d1ef5918"}, - {file = "pyzmq-26.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:398a825d2dea96227cf6460ce0a174cf7657d6f6827807d4d1ae9d0f9ae64315"}, - {file = "pyzmq-26.4.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6d52d62edc96787f5c1dfa6c6ccff9b581cfae5a70d94ec4c8da157656c73b5b"}, - {file = "pyzmq-26.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1410c3a3705db68d11eb2424d75894d41cff2f64d948ffe245dd97a9debfebf4"}, - {file = "pyzmq-26.4.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:7dacb06a9c83b007cc01e8e5277f94c95c453c5851aac5e83efe93e72226353f"}, - {file = "pyzmq-26.4.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6bab961c8c9b3a4dc94d26e9b2cdf84de9918931d01d6ff38c721a83ab3c0ef5"}, - {file = "pyzmq-26.4.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7a5c09413b924d96af2aa8b57e76b9b0058284d60e2fc3730ce0f979031d162a"}, - {file = "pyzmq-26.4.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7d489ac234d38e57f458fdbd12a996bfe990ac028feaf6f3c1e81ff766513d3b"}, - {file = "pyzmq-26.4.0-cp310-cp310-win32.whl", hash = "sha256:dea1c8db78fb1b4b7dc9f8e213d0af3fc8ecd2c51a1d5a3ca1cde1bda034a980"}, - {file = "pyzmq-26.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:fa59e1f5a224b5e04dc6c101d7186058efa68288c2d714aa12d27603ae93318b"}, - {file = "pyzmq-26.4.0-cp310-cp310-win_arm64.whl", hash = "sha256:a651fe2f447672f4a815e22e74630b6b1ec3a1ab670c95e5e5e28dcd4e69bbb5"}, - {file = "pyzmq-26.4.0-cp311-cp311-macosx_10_15_universal2.whl", hash = "sha256:bfcf82644c9b45ddd7cd2a041f3ff8dce4a0904429b74d73a439e8cab1bd9e54"}, - {file = "pyzmq-26.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e9bcae3979b2654d5289d3490742378b2f3ce804b0b5fd42036074e2bf35b030"}, - {file = "pyzmq-26.4.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ccdff8ac4246b6fb60dcf3982dfaeeff5dd04f36051fe0632748fc0aa0679c01"}, - {file = 
"pyzmq-26.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4550af385b442dc2d55ab7717837812799d3674cb12f9a3aa897611839c18e9e"}, - {file = "pyzmq-26.4.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:2f9f7ffe9db1187a253fca95191854b3fda24696f086e8789d1d449308a34b88"}, - {file = "pyzmq-26.4.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:3709c9ff7ba61589b7372923fd82b99a81932b592a5c7f1a24147c91da9a68d6"}, - {file = "pyzmq-26.4.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:f8f3c30fb2d26ae5ce36b59768ba60fb72507ea9efc72f8f69fa088450cff1df"}, - {file = "pyzmq-26.4.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:382a4a48c8080e273427fc692037e3f7d2851959ffe40864f2db32646eeb3cef"}, - {file = "pyzmq-26.4.0-cp311-cp311-win32.whl", hash = "sha256:d56aad0517d4c09e3b4f15adebba8f6372c5102c27742a5bdbfc74a7dceb8fca"}, - {file = "pyzmq-26.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:963977ac8baed7058c1e126014f3fe58b3773f45c78cce7af5c26c09b6823896"}, - {file = "pyzmq-26.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:c0c8e8cadc81e44cc5088fcd53b9b3b4ce9344815f6c4a03aec653509296fae3"}, - {file = "pyzmq-26.4.0-cp312-cp312-macosx_10_15_universal2.whl", hash = "sha256:5227cb8da4b6f68acfd48d20c588197fd67745c278827d5238c707daf579227b"}, - {file = "pyzmq-26.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e1c07a7fa7f7ba86554a2b1bef198c9fed570c08ee062fd2fd6a4dcacd45f905"}, - {file = "pyzmq-26.4.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae775fa83f52f52de73183f7ef5395186f7105d5ed65b1ae65ba27cb1260de2b"}, - {file = "pyzmq-26.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66c760d0226ebd52f1e6b644a9e839b5db1e107a23f2fcd46ec0569a4fdd4e63"}, - {file = "pyzmq-26.4.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:ef8c6ecc1d520debc147173eaa3765d53f06cd8dbe7bd377064cdbc53ab456f5"}, - {file = "pyzmq-26.4.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3150ef4084e163dec29ae667b10d96aad309b668fac6810c9e8c27cf543d6e0b"}, - {file = "pyzmq-26.4.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:4448c9e55bf8329fa1dcedd32f661bf611214fa70c8e02fee4347bc589d39a84"}, - {file = "pyzmq-26.4.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:e07dde3647afb084d985310d067a3efa6efad0621ee10826f2cb2f9a31b89d2f"}, - {file = "pyzmq-26.4.0-cp312-cp312-win32.whl", hash = "sha256:ba034a32ecf9af72adfa5ee383ad0fd4f4e38cdb62b13624278ef768fe5b5b44"}, - {file = "pyzmq-26.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:056a97aab4064f526ecb32f4343917a4022a5d9efb6b9df990ff72e1879e40be"}, - {file = "pyzmq-26.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:2f23c750e485ce1eb639dbd576d27d168595908aa2d60b149e2d9e34c9df40e0"}, - {file = "pyzmq-26.4.0-cp313-cp313-macosx_10_15_universal2.whl", hash = "sha256:c43fac689880f5174d6fc864857d1247fe5cfa22b09ed058a344ca92bf5301e3"}, - {file = "pyzmq-26.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:902aca7eba477657c5fb81c808318460328758e8367ecdd1964b6330c73cae43"}, - {file = "pyzmq-26.4.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5e48a830bfd152fe17fbdeaf99ac5271aa4122521bf0d275b6b24e52ef35eb6"}, - {file = "pyzmq-26.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31be2b6de98c824c06f5574331f805707c667dc8f60cb18580b7de078479891e"}, - {file = "pyzmq-26.4.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = 
"sha256:6332452034be001bbf3206ac59c0d2a7713de5f25bb38b06519fc6967b7cf771"}, - {file = "pyzmq-26.4.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:da8c0f5dd352136853e6a09b1b986ee5278dfddfebd30515e16eae425c872b30"}, - {file = "pyzmq-26.4.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:f4ccc1a0a2c9806dda2a2dd118a3b7b681e448f3bb354056cad44a65169f6d86"}, - {file = "pyzmq-26.4.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:1c0b5fceadbab461578daf8d1dcc918ebe7ddd2952f748cf30c7cf2de5d51101"}, - {file = "pyzmq-26.4.0-cp313-cp313-win32.whl", hash = "sha256:28e2b0ff5ba4b3dd11062d905682bad33385cfa3cc03e81abd7f0822263e6637"}, - {file = "pyzmq-26.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:23ecc9d241004c10e8b4f49d12ac064cd7000e1643343944a10df98e57bc544b"}, - {file = "pyzmq-26.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:1edb0385c7f025045d6e0f759d4d3afe43c17a3d898914ec6582e6f464203c08"}, - {file = "pyzmq-26.4.0-cp313-cp313t-macosx_10_15_universal2.whl", hash = "sha256:93a29e882b2ba1db86ba5dd5e88e18e0ac6b627026c5cfbec9983422011b82d4"}, - {file = "pyzmq-26.4.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb45684f276f57110bb89e4300c00f1233ca631f08f5f42528a5c408a79efc4a"}, - {file = "pyzmq-26.4.0-cp313-cp313t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f72073e75260cb301aad4258ad6150fa7f57c719b3f498cb91e31df16784d89b"}, - {file = "pyzmq-26.4.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be37e24b13026cfedd233bcbbccd8c0bcd2fdd186216094d095f60076201538d"}, - {file = "pyzmq-26.4.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:237b283044934d26f1eeff4075f751b05d2f3ed42a257fc44386d00df6a270cf"}, - {file = "pyzmq-26.4.0-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:b30f862f6768b17040929a68432c8a8be77780317f45a353cb17e423127d250c"}, - {file = "pyzmq-26.4.0-cp313-cp313t-musllinux_1_1_i686.whl", hash = "sha256:c80fcd3504232f13617c6ab501124d373e4895424e65de8b72042333316f64a8"}, - {file = "pyzmq-26.4.0-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:26a2a7451606b87f67cdeca2c2789d86f605da08b4bd616b1a9981605ca3a364"}, - {file = "pyzmq-26.4.0-cp38-cp38-macosx_10_15_universal2.whl", hash = "sha256:831cc53bf6068d46d942af52fa8b0b9d128fb39bcf1f80d468dc9a3ae1da5bfb"}, - {file = "pyzmq-26.4.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:51d18be6193c25bd229524cfac21e39887c8d5e0217b1857998dfbef57c070a4"}, - {file = "pyzmq-26.4.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:445c97854204119ae2232503585ebb4fa7517142f71092cb129e5ee547957a1f"}, - {file = "pyzmq-26.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:807b8f4ad3e6084412c0f3df0613269f552110fa6fb91743e3e306223dbf11a6"}, - {file = "pyzmq-26.4.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:c01d109dd675ac47fa15c0a79d256878d898f90bc10589f808b62d021d2e653c"}, - {file = "pyzmq-26.4.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0a294026e28679a8dd64c922e59411cb586dad307661b4d8a5c49e7bbca37621"}, - {file = "pyzmq-26.4.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:22c8dd677274af8dfb1efd05006d6f68fb2f054b17066e308ae20cb3f61028cf"}, - {file = "pyzmq-26.4.0-cp38-cp38-win32.whl", hash = "sha256:14fc678b696bc42c14e2d7f86ac4e97889d5e6b94d366ebcb637a768d2ad01af"}, - {file = "pyzmq-26.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:d1ef0a536662bbbdc8525f7e2ef19e74123ec9c4578e0582ecd41aedc414a169"}, - {file = 
"pyzmq-26.4.0-cp39-cp39-macosx_10_15_universal2.whl", hash = "sha256:a88643de8abd000ce99ca72056a1a2ae15881ee365ecb24dd1d9111e43d57842"}, - {file = "pyzmq-26.4.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0a744ce209ecb557406fb928f3c8c55ce79b16c3eeb682da38ef5059a9af0848"}, - {file = "pyzmq-26.4.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:9434540f333332224ecb02ee6278b6c6f11ea1266b48526e73c903119b2f420f"}, - {file = "pyzmq-26.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e6c6f0a23e55cd38d27d4c89add963294ea091ebcb104d7fdab0f093bc5abb1c"}, - {file = "pyzmq-26.4.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:6145df55dc2309f6ef72d70576dcd5aabb0fd373311613fe85a5e547c722b780"}, - {file = "pyzmq-26.4.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2ea81823840ef8c56e5d2f9918e4d571236294fea4d1842b302aebffb9e40997"}, - {file = "pyzmq-26.4.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cc2abc385dc37835445abe206524fbc0c9e3fce87631dfaa90918a1ba8f425eb"}, - {file = "pyzmq-26.4.0-cp39-cp39-win32.whl", hash = "sha256:41a2508fe7bed4c76b4cf55aacfb8733926f59d440d9ae2b81ee8220633b4d12"}, - {file = "pyzmq-26.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:d4000e8255d6cbce38982e5622ebb90823f3409b7ffe8aeae4337ef7d6d2612a"}, - {file = "pyzmq-26.4.0-cp39-cp39-win_arm64.whl", hash = "sha256:b4f6919d9c120488246bdc2a2f96662fa80d67b35bd6d66218f457e722b3ff64"}, - {file = "pyzmq-26.4.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:98d948288ce893a2edc5ec3c438fe8de2daa5bbbd6e2e865ec5f966e237084ba"}, - {file = "pyzmq-26.4.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9f34f5c9e0203ece706a1003f1492a56c06c0632d86cb77bcfe77b56aacf27b"}, - {file = "pyzmq-26.4.0-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:80c9b48aef586ff8b698359ce22f9508937c799cc1d2c9c2f7c95996f2300c94"}, - {file = "pyzmq-26.4.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3f2a5b74009fd50b53b26f65daff23e9853e79aa86e0aa08a53a7628d92d44a"}, - {file = "pyzmq-26.4.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:61c5f93d7622d84cb3092d7f6398ffc77654c346545313a3737e266fc11a3beb"}, - {file = "pyzmq-26.4.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:4478b14cb54a805088299c25a79f27eaf530564a7a4f72bf432a040042b554eb"}, - {file = "pyzmq-26.4.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a28ac29c60e4ba84b5f58605ace8ad495414a724fe7aceb7cf06cd0598d04e1"}, - {file = "pyzmq-26.4.0-pp311-pypy311_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:43b03c1ceea27c6520124f4fb2ba9c647409b9abdf9a62388117148a90419494"}, - {file = "pyzmq-26.4.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7731abd23a782851426d4e37deb2057bf9410848a4459b5ede4fe89342e687a9"}, - {file = "pyzmq-26.4.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:a222ad02fbe80166b0526c038776e8042cd4e5f0dec1489a006a1df47e9040e0"}, - {file = "pyzmq-26.4.0-pp38-pypy38_pp73-macosx_10_15_x86_64.whl", hash = "sha256:91c3ffaea475ec8bb1a32d77ebc441dcdd13cd3c4c284a6672b92a0f5ade1917"}, - {file = "pyzmq-26.4.0-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:d9a78a52668bf5c9e7b0da36aa5760a9fc3680144e1445d68e98df78a25082ed"}, - {file = "pyzmq-26.4.0-pp38-pypy38_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = 
"sha256:b70cab356ff8c860118b89dc86cd910c73ce2127eb986dada4fbac399ef644cf"}, - {file = "pyzmq-26.4.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:acae207d4387780838192326b32d373bb286da0b299e733860e96f80728eb0af"}, - {file = "pyzmq-26.4.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:f928eafd15794aa4be75463d537348b35503c1e014c5b663f206504ec1a90fe4"}, - {file = "pyzmq-26.4.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:552b0d2e39987733e1e9e948a0ced6ff75e0ea39ab1a1db2fc36eb60fd8760db"}, - {file = "pyzmq-26.4.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd670a8aa843f2ee637039bbd412e0d7294a5e588e1ecc9ad98b0cdc050259a4"}, - {file = "pyzmq-26.4.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d367b7b775a0e1e54a59a2ba3ed4d5e0a31566af97cc9154e34262777dab95ed"}, - {file = "pyzmq-26.4.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8112af16c406e4a93df2caef49f884f4c2bb2b558b0b5577ef0b2465d15c1abc"}, - {file = "pyzmq-26.4.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c76c298683f82669cab0b6da59071f55238c039738297c69f187a542c6d40099"}, - {file = "pyzmq-26.4.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:49b6ca2e625b46f499fb081aaf7819a177f41eeb555acb05758aa97f4f95d147"}, - {file = "pyzmq-26.4.0.tar.gz", hash = "sha256:4bd13f85f80962f91a651a7356fe0472791a5f7a92f227822b5acf44795c626d"}, +groups = ["dev"] +files = [ + {file = "pyzmq-27.1.0-cp310-cp310-macosx_10_15_universal2.whl", hash = "sha256:508e23ec9bc44c0005c4946ea013d9317ae00ac67778bd47519fdf5a0e930ff4"}, + {file = "pyzmq-27.1.0-cp310-cp310-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:507b6f430bdcf0ee48c0d30e734ea89ce5567fd7b8a0f0044a369c176aa44556"}, + {file = "pyzmq-27.1.0-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bf7b38f9fd7b81cb6d9391b2946382c8237fd814075c6aa9c3b746d53076023b"}, + {file = "pyzmq-27.1.0-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:03ff0b279b40d687691a6217c12242ee71f0fba28bf8626ff50e3ef0f4410e1e"}, + {file = "pyzmq-27.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:677e744fee605753eac48198b15a2124016c009a11056f93807000ab11ce6526"}, + {file = "pyzmq-27.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:dd2fec2b13137416a1c5648b7009499bcc8fea78154cd888855fa32514f3dad1"}, + {file = "pyzmq-27.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:08e90bb4b57603b84eab1d0ca05b3bbb10f60c1839dc471fc1c9e1507bef3386"}, + {file = "pyzmq-27.1.0-cp310-cp310-win32.whl", hash = "sha256:a5b42d7a0658b515319148875fcb782bbf118dd41c671b62dae33666c2213bda"}, + {file = "pyzmq-27.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:c0bb87227430ee3aefcc0ade2088100e528d5d3298a0a715a64f3d04c60ba02f"}, + {file = "pyzmq-27.1.0-cp310-cp310-win_arm64.whl", hash = "sha256:9a916f76c2ab8d045b19f2286851a38e9ac94ea91faf65bd64735924522a8b32"}, + {file = "pyzmq-27.1.0-cp311-cp311-macosx_10_15_universal2.whl", hash = "sha256:226b091818d461a3bef763805e75685e478ac17e9008f49fce2d3e52b3d58b86"}, + {file = "pyzmq-27.1.0-cp311-cp311-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:0790a0161c281ca9723f804871b4027f2e8b5a528d357c8952d08cd1a9c15581"}, + {file = "pyzmq-27.1.0-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c895a6f35476b0c3a54e3eb6ccf41bf3018de937016e6e18748317f25d4e925f"}, + {file = 
"pyzmq-27.1.0-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5bbf8d3630bf96550b3be8e1fc0fea5cbdc8d5466c1192887bd94869da17a63e"}, + {file = "pyzmq-27.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:15c8bd0fe0dabf808e2d7a681398c4e5ded70a551ab47482067a572c054c8e2e"}, + {file = "pyzmq-27.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:bafcb3dd171b4ae9f19ee6380dfc71ce0390fefaf26b504c0e5f628d7c8c54f2"}, + {file = "pyzmq-27.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e829529fcaa09937189178115c49c504e69289abd39967cd8a4c215761373394"}, + {file = "pyzmq-27.1.0-cp311-cp311-win32.whl", hash = "sha256:6df079c47d5902af6db298ec92151db82ecb557af663098b92f2508c398bb54f"}, + {file = "pyzmq-27.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:190cbf120fbc0fc4957b56866830def56628934a9d112aec0e2507aa6a032b97"}, + {file = "pyzmq-27.1.0-cp311-cp311-win_arm64.whl", hash = "sha256:eca6b47df11a132d1745eb3b5b5e557a7dae2c303277aa0e69c6ba91b8736e07"}, + {file = "pyzmq-27.1.0-cp312-abi3-macosx_10_15_universal2.whl", hash = "sha256:452631b640340c928fa343801b0d07eb0c3789a5ffa843f6e1a9cee0ba4eb4fc"}, + {file = "pyzmq-27.1.0-cp312-abi3-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:1c179799b118e554b66da67d88ed66cd37a169f1f23b5d9f0a231b4e8d44a113"}, + {file = "pyzmq-27.1.0-cp312-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3837439b7f99e60312f0c926a6ad437b067356dc2bc2ec96eb395fd0fe804233"}, + {file = "pyzmq-27.1.0-cp312-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:43ad9a73e3da1fab5b0e7e13402f0b2fb934ae1c876c51d0afff0e7c052eca31"}, + {file = "pyzmq-27.1.0-cp312-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:0de3028d69d4cdc475bfe47a6128eb38d8bc0e8f4d69646adfbcd840facbac28"}, + {file = "pyzmq-27.1.0-cp312-abi3-musllinux_1_2_i686.whl", hash = "sha256:cf44a7763aea9298c0aa7dbf859f87ed7012de8bda0f3977b6fb1d96745df856"}, + {file = "pyzmq-27.1.0-cp312-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f30f395a9e6fbca195400ce833c731e7b64c3919aa481af4d88c3759e0cb7496"}, + {file = "pyzmq-27.1.0-cp312-abi3-win32.whl", hash = "sha256:250e5436a4ba13885494412b3da5d518cd0d3a278a1ae640e113c073a5f88edd"}, + {file = "pyzmq-27.1.0-cp312-abi3-win_amd64.whl", hash = "sha256:9ce490cf1d2ca2ad84733aa1d69ce6855372cb5ce9223802450c9b2a7cba0ccf"}, + {file = "pyzmq-27.1.0-cp312-abi3-win_arm64.whl", hash = "sha256:75a2f36223f0d535a0c919e23615fc85a1e23b71f40c7eb43d7b1dedb4d8f15f"}, + {file = "pyzmq-27.1.0-cp313-cp313-android_24_arm64_v8a.whl", hash = "sha256:93ad4b0855a664229559e45c8d23797ceac03183c7b6f5b4428152a6b06684a5"}, + {file = "pyzmq-27.1.0-cp313-cp313-android_24_x86_64.whl", hash = "sha256:fbb4f2400bfda24f12f009cba62ad5734148569ff4949b1b6ec3b519444342e6"}, + {file = "pyzmq-27.1.0-cp313-cp313t-macosx_10_15_universal2.whl", hash = "sha256:e343d067f7b151cfe4eb3bb796a7752c9d369eed007b91231e817071d2c2fec7"}, + {file = "pyzmq-27.1.0-cp313-cp313t-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:08363b2011dec81c354d694bdecaef4770e0ae96b9afea70b3f47b973655cc05"}, + {file = "pyzmq-27.1.0-cp313-cp313t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d54530c8c8b5b8ddb3318f481297441af102517602b569146185fa10b63f4fa9"}, + {file = "pyzmq-27.1.0-cp313-cp313t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6f3afa12c392f0a44a2414056d730eebc33ec0926aae92b5ad5cf26ebb6cc128"}, + {file = "pyzmq-27.1.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = 
"sha256:c65047adafe573ff023b3187bb93faa583151627bc9c51fc4fb2c561ed689d39"}, + {file = "pyzmq-27.1.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:90e6e9441c946a8b0a667356f7078d96411391a3b8f80980315455574177ec97"}, + {file = "pyzmq-27.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:add071b2d25f84e8189aaf0882d39a285b42fa3853016ebab234a5e78c7a43db"}, + {file = "pyzmq-27.1.0-cp313-cp313t-win32.whl", hash = "sha256:7ccc0700cfdf7bd487bea8d850ec38f204478681ea02a582a8da8171b7f90a1c"}, + {file = "pyzmq-27.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:8085a9fba668216b9b4323be338ee5437a235fe275b9d1610e422ccc279733e2"}, + {file = "pyzmq-27.1.0-cp313-cp313t-win_arm64.whl", hash = "sha256:6bb54ca21bcfe361e445256c15eedf083f153811c37be87e0514934d6913061e"}, + {file = "pyzmq-27.1.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:ce980af330231615756acd5154f29813d553ea555485ae712c491cd483df6b7a"}, + {file = "pyzmq-27.1.0-cp314-cp314t-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:1779be8c549e54a1c38f805e56d2a2e5c009d26de10921d7d51cfd1c8d4632ea"}, + {file = "pyzmq-27.1.0-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7200bb0f03345515df50d99d3db206a0a6bee1955fbb8c453c76f5bf0e08fb96"}, + {file = "pyzmq-27.1.0-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01c0e07d558b06a60773744ea6251f769cd79a41a97d11b8bf4ab8f034b0424d"}, + {file = "pyzmq-27.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:80d834abee71f65253c91540445d37c4c561e293ba6e741b992f20a105d69146"}, + {file = "pyzmq-27.1.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:544b4e3b7198dde4a62b8ff6685e9802a9a1ebf47e77478a5eb88eca2a82f2fd"}, + {file = "pyzmq-27.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:cedc4c68178e59a4046f97eca31b148ddcf51e88677de1ef4e78cf06c5376c9a"}, + {file = "pyzmq-27.1.0-cp314-cp314t-win32.whl", hash = "sha256:1f0b2a577fd770aa6f053211a55d1c47901f4d537389a034c690291485e5fe92"}, + {file = "pyzmq-27.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:19c9468ae0437f8074af379e986c5d3d7d7bfe033506af442e8c879732bedbe0"}, + {file = "pyzmq-27.1.0-cp314-cp314t-win_arm64.whl", hash = "sha256:dc5dbf68a7857b59473f7df42650c621d7e8923fb03fa74a526890f4d33cc4d7"}, + {file = "pyzmq-27.1.0-cp38-cp38-macosx_10_15_universal2.whl", hash = "sha256:18339186c0ed0ce5835f2656cdfb32203125917711af64da64dbaa3d949e5a1b"}, + {file = "pyzmq-27.1.0-cp38-cp38-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:753d56fba8f70962cd8295fb3edb40b9b16deaa882dd2b5a3a2039f9ff7625aa"}, + {file = "pyzmq-27.1.0-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b721c05d932e5ad9ff9344f708c96b9e1a485418c6618d765fca95d4daacfbef"}, + {file = "pyzmq-27.1.0-cp38-cp38-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7be883ff3d722e6085ee3f4afc057a50f7f2e0c72d289fd54df5706b4e3d3a50"}, + {file = "pyzmq-27.1.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:b2e592db3a93128daf567de9650a2f3859017b3f7a66bc4ed6e4779d6034976f"}, + {file = "pyzmq-27.1.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:ad68808a61cbfbbae7ba26d6233f2a4aa3b221de379ce9ee468aa7a83b9c36b0"}, + {file = "pyzmq-27.1.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:e2687c2d230e8d8584fbea433c24382edfeda0c60627aca3446aa5e58d5d1831"}, + {file = "pyzmq-27.1.0-cp38-cp38-win32.whl", hash = "sha256:a1aa0ee920fb3825d6c825ae3f6c508403b905b698b6460408ebd5bb04bbb312"}, + {file = "pyzmq-27.1.0-cp38-cp38-win_amd64.whl", hash = 
"sha256:df7cd397ece96cf20a76fae705d40efbab217d217897a5053267cd88a700c266"}, + {file = "pyzmq-27.1.0-cp39-cp39-macosx_10_15_universal2.whl", hash = "sha256:96c71c32fff75957db6ae33cd961439f386505c6e6b377370af9b24a1ef9eafb"}, + {file = "pyzmq-27.1.0-cp39-cp39-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:49d3980544447f6bd2968b6ac913ab963a49dcaa2d4a2990041f16057b04c429"}, + {file = "pyzmq-27.1.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:849ca054d81aa1c175c49484afaaa5db0622092b5eccb2055f9f3bb8f703782d"}, + {file = "pyzmq-27.1.0-cp39-cp39-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3970778e74cb7f85934d2b926b9900e92bfe597e62267d7499acc39c9c28e345"}, + {file = "pyzmq-27.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:da96ecdcf7d3919c3be2de91a8c513c186f6762aa6cf7c01087ed74fad7f0968"}, + {file = "pyzmq-27.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:9541c444cfe1b1c0156c5c86ece2bb926c7079a18e7b47b0b1b3b1b875e5d098"}, + {file = "pyzmq-27.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:e30a74a39b93e2e1591b58eb1acef4902be27c957a8720b0e368f579b82dc22f"}, + {file = "pyzmq-27.1.0-cp39-cp39-win32.whl", hash = "sha256:b1267823d72d1e40701dcba7edc45fd17f71be1285557b7fe668887150a14b78"}, + {file = "pyzmq-27.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:0c996ded912812a2fcd7ab6574f4ad3edc27cb6510349431e4930d4196ade7db"}, + {file = "pyzmq-27.1.0-cp39-cp39-win_arm64.whl", hash = "sha256:346e9ba4198177a07e7706050f35d733e08c1c1f8ceacd5eb6389d653579ffbc"}, + {file = "pyzmq-27.1.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:c17e03cbc9312bee223864f1a2b13a99522e0dc9f7c5df0177cd45210ac286e6"}, + {file = "pyzmq-27.1.0-pp310-pypy310_pp73-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:f328d01128373cb6763823b2b4e7f73bdf767834268c565151eacb3b7a392f90"}, + {file = "pyzmq-27.1.0-pp310-pypy310_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c1790386614232e1b3a40a958454bdd42c6d1811837b15ddbb052a032a43f62"}, + {file = "pyzmq-27.1.0-pp310-pypy310_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:448f9cb54eb0cee4732b46584f2710c8bc178b0e5371d9e4fc8125201e413a74"}, + {file = "pyzmq-27.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:05b12f2d32112bf8c95ef2e74ec4f1d4beb01f8b5e703b38537f8849f92cb9ba"}, + {file = "pyzmq-27.1.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:18770c8d3563715387139060d37859c02ce40718d1faf299abddcdcc6a649066"}, + {file = "pyzmq-27.1.0-pp311-pypy311_pp73-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:ac25465d42f92e990f8d8b0546b01c391ad431c3bf447683fdc40565941d0604"}, + {file = "pyzmq-27.1.0-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:53b40f8ae006f2734ee7608d59ed661419f087521edbfc2149c3932e9c14808c"}, + {file = "pyzmq-27.1.0-pp311-pypy311_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f605d884e7c8be8fe1aa94e0a783bf3f591b84c24e4bc4f3e7564c82ac25e271"}, + {file = "pyzmq-27.1.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c9f7f6e13dff2e44a6afeaf2cf54cee5929ad64afaf4d40b50f93c58fc687355"}, + {file = "pyzmq-27.1.0-pp38-pypy38_pp73-macosx_10_15_x86_64.whl", hash = "sha256:50081a4e98472ba9f5a02850014b4c9b629da6710f8f14f3b15897c666a28f1b"}, + {file = "pyzmq-27.1.0-pp38-pypy38_pp73-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:510869f9df36ab97f89f4cff9d002a89ac554c7ac9cadd87d444aa4cf66abd27"}, + {file = 
"pyzmq-27.1.0-pp38-pypy38_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1f8426a01b1c4098a750973c37131cf585f61c7911d735f729935a0c701b68d3"}, + {file = "pyzmq-27.1.0-pp38-pypy38_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:726b6a502f2e34c6d2ada5e702929586d3ac948a4dbbb7fed9854ec8c0466027"}, + {file = "pyzmq-27.1.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:bd67e7c8f4654bef471c0b1ca6614af0b5202a790723a58b79d9584dc8022a78"}, + {file = "pyzmq-27.1.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:722ea791aa233ac0a819fc2c475e1292c76930b31f1d828cb61073e2fe5e208f"}, + {file = "pyzmq-27.1.0-pp39-pypy39_pp73-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:01f9437501886d3a1dd4b02ef59fb8cc384fa718ce066d52f175ee49dd5b7ed8"}, + {file = "pyzmq-27.1.0-pp39-pypy39_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4a19387a3dddcc762bfd2f570d14e2395b2c9701329b266f83dd87a2b3cbd381"}, + {file = "pyzmq-27.1.0-pp39-pypy39_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c618fbcd069e3a29dcd221739cacde52edcc681f041907867e0f5cc7e85f172"}, + {file = "pyzmq-27.1.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:ff8d114d14ac671d88c89b9224c63d6c4e5a613fe8acd5594ce53d752a3aafe9"}, + {file = "pyzmq-27.1.0.tar.gz", hash = "sha256:ac0765e3d44455adb6ddbf4417dcce460fc40a05978c08efdf2948072f6db540"}, ] [package.dependencies] @@ -4619,74 +5632,72 @@ cffi = {version = "*", markers = "implementation_name == \"pypy\""} [[package]] name = "ray" -version = "2.46.0" +version = "2.51.1" description = "Ray provides a simple, universal API for building distributed applications." optional = false python-versions = ">=3.9" -files = [ - {file = "ray-2.46.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:719244b84df79502e5f09497f256618d94d78d66fbaf229422008a0568d3a0ff"}, - {file = "ray-2.46.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4378a86919e6643238a1094f711b87fa8dc1a18b998d4190f69ab33c64a22a8c"}, - {file = "ray-2.46.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:396b912a4dbf64966e2fdfca9facbcafe57b792ca4842ac5ae17507fdbdfe89f"}, - {file = "ray-2.46.0-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:c12850608c57c8afd9613a9f757d77663c50d4bd4e77ba2f181425052520c01a"}, - {file = "ray-2.46.0-cp310-cp310-win_amd64.whl", hash = "sha256:bc953aa4879c7a77893f921905df5cf65227cafd94fbc8273bec65ea393eacdd"}, - {file = "ray-2.46.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:942ba51de6f9cd7fb2ed17618181af48ce6b9517743d3235d846ec32295eca76"}, - {file = "ray-2.46.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:af84f3ed0854bb6de28192ca9e0a3bfa1eb34d69f118ae6348522198896480c8"}, - {file = "ray-2.46.0-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:81c8ce8b7ba33cb607ec78f5eb2555470e3046bb317732d8282e8189bb58ccbd"}, - {file = "ray-2.46.0-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:d4ddedc3f4d48df564bcee7b131c98c9f898fef0a57483f4ba335f47f951a62f"}, - {file = "ray-2.46.0-cp311-cp311-win_amd64.whl", hash = "sha256:130415c4d231830156f37ce70acbdb5fdee10f6886adc4e85bdc4533d51c24c6"}, - {file = "ray-2.46.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:d1f37ead29299637144726f809c2e0ff958dd9c0e75930ef614156d6a0a3a57f"}, - {file = "ray-2.46.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b7a064acfeee7f0677d9e3f25daef9c59593559faea764b44a3e2c5331d5d832"}, - {file = "ray-2.46.0-cp312-cp312-manylinux2014_aarch64.whl", hash = 
"sha256:006cbe1a8fdc37664114aa218773100ee891399785e256c202e48958d2dac167"}, - {file = "ray-2.46.0-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:5cec1edda93f618ffd2301f81d5398037f03fa9b16825e7e4d8a00ae7a9a4381"}, - {file = "ray-2.46.0-cp312-cp312-win_amd64.whl", hash = "sha256:7d3160f8d187baaea91a86d16a9fd81136cf8607419c94b7a74d66fce774b5c2"}, - {file = "ray-2.46.0-cp313-cp313-macosx_10_15_x86_64.whl", hash = "sha256:b2fc2c43ea0a37521193c61ef9a27b6fca8dbab116a58a52fd44344cd73e1ece"}, - {file = "ray-2.46.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4296dd8c0174256a04ee4b54abe013b6802a45fb85fb7cfdb1375231965d6d4d"}, - {file = "ray-2.46.0-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:808daece1f12bd8924b9c6382a0f98da6f5c6886cfb271ed8d89407a89413cd5"}, - {file = "ray-2.46.0-cp313-cp313-manylinux2014_x86_64.whl", hash = "sha256:a5a28c0a311d2c3221dcf729c40898a6df82466bb5af21e81be0453e09856adf"}, - {file = "ray-2.46.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:e0ec198c16d0e9af7f03242ef7ad7d548eee37a918193917278a124ddd57410a"}, - {file = "ray-2.46.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e31568818973efa4f8ce18b82bce03089395a62ac9fe639e94d755959f607fe9"}, - {file = "ray-2.46.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:7c44a98cb24f4905e898d05b787cbe9f267a9f66c1e1f8cda50814f8b3673be2"}, - {file = "ray-2.46.0-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:91ea998a49578b1450cbef60705f6ece8622a262a3d764d5c99ba89b741de5d0"}, - {file = "ray-2.46.0-cp39-cp39-win_amd64.whl", hash = "sha256:018e98c9745eae53b53ad14fef1ca1c43bb64c39c3cceb9e6d4517729396003b"}, +groups = ["main"] +files = [ + {file = "ray-2.51.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e8ce218c85e9f4043c37136fc90b41343bdb844fcdc9520f21c000d1d8d49f89"}, + {file = "ray-2.51.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:36feb519f31c52d3b4dbcd68ffb2baf93195ceec06ea711e21559096bab95fed"}, + {file = "ray-2.51.1-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:8a21f5914baa3deefcb4fa5f3878e03b589c190b864fe1b80e6dc0cbfba26004"}, + {file = "ray-2.51.1-cp310-cp310-win_amd64.whl", hash = "sha256:a82417b89260ed751a76e9cfaef6d11392ab0da464cde1a9d07a0bb7dc272a7b"}, + {file = "ray-2.51.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:bd8211fc033be1bce9c039e474e97a9077be593020978fdcfba1d770bdc40ba5"}, + {file = "ray-2.51.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:d2d7c8af45441ff50bc002352d31e0afec5c85dd5075bf527027178931497bce"}, + {file = "ray-2.51.1-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:dd353010d2548bc345e46c45795f70291bb460c236aa6a3393b51a9cd861b56f"}, + {file = "ray-2.51.1-cp311-cp311-win_amd64.whl", hash = "sha256:606c6e0733eb18fc307c9645ea84ccbd1aad8a5ba8bad764bed54b94e926d33c"}, + {file = "ray-2.51.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:ef847b025ca758baee4571a1ca001d973897cad772f8e95d7f303d24c38b649e"}, + {file = "ray-2.51.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:0bed9408712bad1511e65683a455302f88d94e5e5cb6a58cc4a154b61d8a0b4a"}, + {file = "ray-2.51.1-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:4e786da7862cf73664977d0212a505d6d5a585beadf63e7dc1e1c129259bee20"}, + {file = "ray-2.51.1-cp312-cp312-win_amd64.whl", hash = "sha256:198fda93074a6863555f4003e9013bb2ba0cd50b59b18c02affdc294b28a2eef"}, + {file = "ray-2.51.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:d81547886435142dbd79bff1d4e4edf578a5f20e3b11bbd4ced49cfafbd37d27"}, + {file = "ray-2.51.1-cp313-cp313-manylinux2014_aarch64.whl", hash = 
"sha256:3f2bd2acf9b7f4738c17d08592caaad26eafb7a4fc380ad9ab42d5f0a78f73ad"}, + {file = "ray-2.51.1-cp313-cp313-manylinux2014_x86_64.whl", hash = "sha256:265ecd6fd6d4a695b09c686e17d58fca0c09e7198c073628ae7bf4974b03e9ca"}, + {file = "ray-2.51.1-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:4b5ff43147e8ece5b8bea17403050265761545095691f76664e508818bc30811"}, + {file = "ray-2.51.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:9a0c726f018acc08db07231e48ee457a16fd7a1a960434eed332e751190875be"}, + {file = "ray-2.51.1-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:251539200042478f24c25a804dc96cb1a78fcef2ffa5dddf100688bd173722ed"}, + {file = "ray-2.51.1-cp39-cp39-win_amd64.whl", hash = "sha256:ec205696c3a7420ba10a29eeaadc107807c8979f0b7b787326ca743069a2d068"}, ] [package.dependencies] -click = ">=7.0" +click = ">=7.0,<8.3.0 || >8.3.0" filelock = "*" jsonschema = "*" msgpack = ">=1.0.0,<2.0.0" packaging = "*" -protobuf = ">=3.15.3,<3.19.5 || >3.19.5" +protobuf = ">=3.20.3" pyyaml = "*" requests = "*" [package.extras] -adag = ["cupy-cuda12x"] -air = ["aiohttp (>=3.7)", "aiohttp-cors", "colorful", "fastapi", "fsspec", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "numpy (>=1.20)", "opencensus", "pandas", "pandas (>=1.3)", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "py-spy (>=0.4.0)", "pyarrow (<18)", "pyarrow (>=9.0.0)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "requests", "smart-open", "starlette", "tensorboardX (>=1.9)", "uvicorn[standard]", "virtualenv (>=20.0.24,!=20.21.1)", "watchfiles"] -all = ["aiohttp (>=3.7)", "aiohttp-cors", "colorful", "cupy-cuda12x", "dm-tree", "fastapi", "fsspec", "grpcio", "grpcio (!=1.56.0)", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "gymnasium (==1.0.0)", "lz4", "memray", "numpy (>=1.20)", "opencensus", "opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk", "ormsgpack (==1.7.0)", "pandas", "pandas (>=1.3)", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "py-spy (>=0.4.0)", "pyOpenSSL", "pyarrow (<18)", "pyarrow (>=9.0.0)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "pyyaml", "requests", "scipy", "smart-open", "starlette", "tensorboardX (>=1.9)", "uvicorn[standard]", "virtualenv (>=20.0.24,!=20.21.1)", "watchfiles"] -all-cpp = ["aiohttp (>=3.7)", "aiohttp-cors", "colorful", "cupy-cuda12x", "dm-tree", "fastapi", "fsspec", "grpcio", "grpcio (!=1.56.0)", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "gymnasium (==1.0.0)", "lz4", "memray", "numpy (>=1.20)", "opencensus", "opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk", "ormsgpack (==1.7.0)", "pandas", "pandas (>=1.3)", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "py-spy (>=0.4.0)", "pyOpenSSL", "pyarrow (<18)", "pyarrow (>=9.0.0)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "pyyaml", "ray-cpp (==2.46.0)", "requests", "scipy", "smart-open", "starlette", "tensorboardX (>=1.9)", "uvicorn[standard]", "virtualenv (>=20.0.24,!=20.21.1)", "watchfiles"] -cgraph = ["cupy-cuda12x"] -client = ["grpcio", "grpcio (!=1.56.0)"] -cpp = ["ray-cpp (==2.46.0)"] -data = ["fsspec", "numpy (>=1.20)", "pandas (>=1.3)", "pyarrow (<18)", "pyarrow (>=9.0.0)"] -default = ["aiohttp (>=3.7)", "aiohttp-cors", "colorful", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "opencensus", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "py-spy (>=0.4.0)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "requests", "smart-open", "virtualenv (>=20.0.24,!=20.21.1)"] -llm = ["aiohttp (>=3.7)", "aiohttp-cors", "async-timeout", "colorful", "fastapi", "fsspec", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "jsonref 
(>=1.1.0)", "jsonschema", "ninja", "numpy (>=1.20)", "opencensus", "pandas (>=1.3)", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "py-spy (>=0.4.0)", "pyarrow (<18)", "pyarrow (>=9.0.0)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "requests", "smart-open", "starlette", "typer", "uvicorn[standard]", "virtualenv (>=20.0.24,!=20.21.1)", "vllm (>=0.8.5)", "watchfiles"] -observability = ["memray", "opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk"] -rllib = ["dm-tree", "fsspec", "gymnasium (==1.0.0)", "lz4", "ormsgpack (==1.7.0)", "pandas", "pyarrow (<18)", "pyarrow (>=9.0.0)", "pyyaml", "requests", "scipy", "tensorboardX (>=1.9)"] -serve = ["aiohttp (>=3.7)", "aiohttp-cors", "colorful", "fastapi", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "opencensus", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "py-spy (>=0.4.0)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "requests", "smart-open", "starlette", "uvicorn[standard]", "virtualenv (>=20.0.24,!=20.21.1)", "watchfiles"] -serve-grpc = ["aiohttp (>=3.7)", "aiohttp-cors", "colorful", "fastapi", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "opencensus", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "py-spy (>=0.4.0)", "pyOpenSSL", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "requests", "smart-open", "starlette", "uvicorn[standard]", "virtualenv (>=20.0.24,!=20.21.1)", "watchfiles"] -train = ["fsspec", "pandas", "pyarrow (<18)", "pyarrow (>=9.0.0)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "requests", "tensorboardX (>=1.9)"] -tune = ["fsspec", "pandas", "pyarrow (<18)", "pyarrow (>=9.0.0)", "requests", "tensorboardX (>=1.9)"] +adag = ["cupy-cuda12x ; sys_platform != \"darwin\""] +air = ["aiohttp (>=3.7)", "aiohttp_cors", "colorful", "fastapi", "fsspec", "grpcio (>=1.32.0) ; python_version < \"3.10\"", "grpcio (>=1.42.0) ; python_version >= \"3.10\"", "numpy (>=1.20)", "opencensus", "opentelemetry-exporter-prometheus", "opentelemetry-proto", "opentelemetry-sdk (>=1.30.0)", "pandas", "pandas (>=1.3)", "prometheus_client (>=0.7.1)", "py-spy (>=0.2.0) ; python_version < \"3.12\"", "py-spy (>=0.4.0) ; python_version >= \"3.12\"", "pyarrow (>=9.0.0)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "requests", "smart_open", "starlette", "tensorboardX (>=1.9)", "uvicorn[standard]", "virtualenv (>=20.0.24,!=20.21.1)", "watchfiles"] +all = ["aiohttp (>=3.7)", "aiohttp_cors", "celery", "colorful", "cupy-cuda12x ; sys_platform != \"darwin\"", "dm_tree", "fastapi", "fsspec", "grpcio", "grpcio (!=1.56.0) ; sys_platform == \"darwin\"", "grpcio (>=1.32.0) ; python_version < \"3.10\"", "grpcio (>=1.42.0) ; python_version >= \"3.10\"", "gymnasium (==1.1.1)", "lz4", "memray ; sys_platform != \"win32\"", "numpy (>=1.20)", "opencensus", "opentelemetry-exporter-prometheus", "opentelemetry-proto", "opentelemetry-sdk (>=1.30.0)", "ormsgpack (==1.7.0)", "pandas", "pandas (>=1.3)", "prometheus_client (>=0.7.1)", "py-spy (>=0.2.0) ; python_version < \"3.12\"", "py-spy (>=0.4.0) ; python_version >= \"3.12\"", "pyOpenSSL", "pyarrow (>=9.0.0)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "pyyaml", "requests", "scipy", "smart_open", "starlette", "tensorboardX (>=1.9)", "uvicorn[standard]", "virtualenv (>=20.0.24,!=20.21.1)", "watchfiles"] +all-cpp = ["aiohttp (>=3.7)", "aiohttp_cors", "celery", "colorful", "cupy-cuda12x ; sys_platform != \"darwin\"", "dm_tree", "fastapi", "fsspec", "grpcio", "grpcio (!=1.56.0) ; sys_platform == \"darwin\"", "grpcio (>=1.32.0) ; python_version < \"3.10\"", "grpcio (>=1.42.0) ; python_version >= \"3.10\"", "gymnasium (==1.1.1)", 
"lz4", "memray ; sys_platform != \"win32\"", "numpy (>=1.20)", "opencensus", "opentelemetry-exporter-prometheus", "opentelemetry-proto", "opentelemetry-sdk (>=1.30.0)", "ormsgpack (==1.7.0)", "pandas", "pandas (>=1.3)", "prometheus_client (>=0.7.1)", "py-spy (>=0.2.0) ; python_version < \"3.12\"", "py-spy (>=0.4.0) ; python_version >= \"3.12\"", "pyOpenSSL", "pyarrow (>=9.0.0)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "pyyaml", "ray-cpp (==2.51.1)", "requests", "scipy", "smart_open", "starlette", "tensorboardX (>=1.9)", "uvicorn[standard]", "virtualenv (>=20.0.24,!=20.21.1)", "watchfiles"] +cgraph = ["cupy-cuda12x ; sys_platform != \"darwin\""] +client = ["grpcio", "grpcio (!=1.56.0) ; sys_platform == \"darwin\""] +cpp = ["ray-cpp (==2.51.1)"] +data = ["fsspec", "numpy (>=1.20)", "pandas (>=1.3)", "pyarrow (>=9.0.0)"] +default = ["aiohttp (>=3.7)", "aiohttp_cors", "colorful", "grpcio (>=1.32.0) ; python_version < \"3.10\"", "grpcio (>=1.42.0) ; python_version >= \"3.10\"", "opencensus", "opentelemetry-exporter-prometheus", "opentelemetry-proto", "opentelemetry-sdk (>=1.30.0)", "prometheus_client (>=0.7.1)", "py-spy (>=0.2.0) ; python_version < \"3.12\"", "py-spy (>=0.4.0) ; python_version >= \"3.12\"", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "requests", "smart_open", "virtualenv (>=20.0.24,!=20.21.1)"] +llm = ["aiohttp (>=3.7)", "aiohttp_cors", "async-timeout ; python_version < \"3.11\"", "colorful", "fastapi", "fsspec", "grpcio (>=1.32.0) ; python_version < \"3.10\"", "grpcio (>=1.42.0) ; python_version >= \"3.10\"", "hf_transfer", "jsonref (>=1.1.0)", "jsonschema", "ninja", "nixl (>=0.6.1)", "numpy (>=1.20)", "opencensus", "opentelemetry-exporter-prometheus", "opentelemetry-proto", "opentelemetry-sdk (>=1.30.0)", "pandas (>=1.3)", "prometheus_client (>=0.7.1)", "py-spy (>=0.2.0) ; python_version < \"3.12\"", "py-spy (>=0.4.0) ; python_version >= \"3.12\"", "pyarrow (>=9.0.0)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "requests", "smart_open", "starlette", "typer", "uvicorn[standard]", "virtualenv (>=20.0.24,!=20.21.1)", "vllm (>=0.11.0)", "watchfiles"] +observability = ["memray ; sys_platform != \"win32\""] +rllib = ["dm_tree", "fsspec", "gymnasium (==1.1.1)", "lz4", "ormsgpack (==1.7.0)", "pandas", "pyarrow (>=9.0.0)", "pyyaml", "requests", "scipy", "tensorboardX (>=1.9)"] +serve = ["aiohttp (>=3.7)", "aiohttp_cors", "colorful", "fastapi", "grpcio (>=1.32.0) ; python_version < \"3.10\"", "grpcio (>=1.42.0) ; python_version >= \"3.10\"", "opencensus", "opentelemetry-exporter-prometheus", "opentelemetry-proto", "opentelemetry-sdk (>=1.30.0)", "prometheus_client (>=0.7.1)", "py-spy (>=0.2.0) ; python_version < \"3.12\"", "py-spy (>=0.4.0) ; python_version >= \"3.12\"", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "requests", "smart_open", "starlette", "uvicorn[standard]", "virtualenv (>=20.0.24,!=20.21.1)", "watchfiles"] +serve-async-inference = ["aiohttp (>=3.7)", "aiohttp_cors", "celery", "colorful", "fastapi", "grpcio (>=1.32.0) ; python_version < \"3.10\"", "grpcio (>=1.42.0) ; python_version >= \"3.10\"", "opencensus", "opentelemetry-exporter-prometheus", "opentelemetry-proto", "opentelemetry-sdk (>=1.30.0)", "prometheus_client (>=0.7.1)", "py-spy (>=0.2.0) ; python_version < \"3.12\"", "py-spy (>=0.4.0) ; python_version >= \"3.12\"", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "requests", "smart_open", "starlette", "uvicorn[standard]", "virtualenv (>=20.0.24,!=20.21.1)", "watchfiles"] +serve-grpc = ["aiohttp (>=3.7)", "aiohttp_cors", "colorful", "fastapi", "grpcio (>=1.32.0) ; 
python_version < \"3.10\"", "grpcio (>=1.42.0) ; python_version >= \"3.10\"", "opencensus", "opentelemetry-exporter-prometheus", "opentelemetry-proto", "opentelemetry-sdk (>=1.30.0)", "prometheus_client (>=0.7.1)", "py-spy (>=0.2.0) ; python_version < \"3.12\"", "py-spy (>=0.4.0) ; python_version >= \"3.12\"", "pyOpenSSL", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "requests", "smart_open", "starlette", "uvicorn[standard]", "virtualenv (>=20.0.24,!=20.21.1)", "watchfiles"] +train = ["fsspec", "pandas", "pyarrow (>=9.0.0)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "requests", "tensorboardX (>=1.9)"] +tune = ["fsspec", "pandas", "pyarrow (>=9.0.0)", "requests", "tensorboardX (>=1.9)"] [[package]] name = "referencing" -version = "0.36.2" +version = "0.37.0" description = "JSON Referencing + Python" optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" +groups = ["main", "dev"] files = [ - {file = "referencing-0.36.2-py3-none-any.whl", hash = "sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0"}, - {file = "referencing-0.36.2.tar.gz", hash = "sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa"}, + {file = "referencing-0.37.0-py3-none-any.whl", hash = "sha256:381329a9f99628c9069361716891d34ad94af76e461dcb0335825aecc7692231"}, + {file = "referencing-0.37.0.tar.gz", hash = "sha256:44aefc3142c5b842538163acb373e24cce6632bd54bdb01b21ad5863489f50d8"}, ] [package.dependencies] @@ -4696,121 +5707,144 @@ typing-extensions = {version = ">=4.4.0", markers = "python_version < \"3.13\""} [[package]] name = "regex" -version = "2024.11.6" +version = "2025.11.3" description = "Alternative regular expression module, to replace re." optional = false -python-versions = ">=3.8" -files = [ - {file = "regex-2024.11.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ff590880083d60acc0433f9c3f713c51f7ac6ebb9adf889c79a261ecf541aa91"}, - {file = "regex-2024.11.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:658f90550f38270639e83ce492f27d2c8d2cd63805c65a13a14d36ca126753f0"}, - {file = "regex-2024.11.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:164d8b7b3b4bcb2068b97428060b2a53be050085ef94eca7f240e7947f1b080e"}, - {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d3660c82f209655a06b587d55e723f0b813d3a7db2e32e5e7dc64ac2a9e86fde"}, - {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d22326fcdef5e08c154280b71163ced384b428343ae16a5ab2b3354aed12436e"}, - {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f1ac758ef6aebfc8943560194e9fd0fa18bcb34d89fd8bd2af18183afd8da3a2"}, - {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:997d6a487ff00807ba810e0f8332c18b4eb8d29463cfb7c820dc4b6e7562d0cf"}, - {file = "regex-2024.11.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:02a02d2bb04fec86ad61f3ea7f49c015a0681bf76abb9857f945d26159d2968c"}, - {file = "regex-2024.11.6-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f02f93b92358ee3f78660e43b4b0091229260c5d5c408d17d60bf26b6c900e86"}, - {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:06eb1be98df10e81ebaded73fcd51989dcf534e3c753466e4b60c4697a003b67"}, - {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_i686.whl", hash = 
"sha256:040df6fe1a5504eb0f04f048e6d09cd7c7110fef851d7c567a6b6e09942feb7d"}, - {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fdabbfc59f2c6edba2a6622c647b716e34e8e3867e0ab975412c5c2f79b82da2"}, - {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:8447d2d39b5abe381419319f942de20b7ecd60ce86f16a23b0698f22e1b70008"}, - {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:da8f5fc57d1933de22a9e23eec290a0d8a5927a5370d24bda9a6abe50683fe62"}, - {file = "regex-2024.11.6-cp310-cp310-win32.whl", hash = "sha256:b489578720afb782f6ccf2840920f3a32e31ba28a4b162e13900c3e6bd3f930e"}, - {file = "regex-2024.11.6-cp310-cp310-win_amd64.whl", hash = "sha256:5071b2093e793357c9d8b2929dfc13ac5f0a6c650559503bb81189d0a3814519"}, - {file = "regex-2024.11.6-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5478c6962ad548b54a591778e93cd7c456a7a29f8eca9c49e4f9a806dcc5d638"}, - {file = "regex-2024.11.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2c89a8cc122b25ce6945f0423dc1352cb9593c68abd19223eebbd4e56612c5b7"}, - {file = "regex-2024.11.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:94d87b689cdd831934fa3ce16cc15cd65748e6d689f5d2b8f4f4df2065c9fa20"}, - {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1062b39a0a2b75a9c694f7a08e7183a80c63c0d62b301418ffd9c35f55aaa114"}, - {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:167ed4852351d8a750da48712c3930b031f6efdaa0f22fa1933716bfcd6bf4a3"}, - {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2d548dafee61f06ebdb584080621f3e0c23fff312f0de1afc776e2a2ba99a74f"}, - {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2a19f302cd1ce5dd01a9099aaa19cae6173306d1302a43b627f62e21cf18ac0"}, - {file = "regex-2024.11.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bec9931dfb61ddd8ef2ebc05646293812cb6b16b60cf7c9511a832b6f1854b55"}, - {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:9714398225f299aa85267fd222f7142fcb5c769e73d7733344efc46f2ef5cf89"}, - {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:202eb32e89f60fc147a41e55cb086db2a3f8cb82f9a9a88440dcfc5d37faae8d"}, - {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:4181b814e56078e9b00427ca358ec44333765f5ca1b45597ec7446d3a1ef6e34"}, - {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:068376da5a7e4da51968ce4c122a7cd31afaaec4fccc7856c92f63876e57b51d"}, - {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ac10f2c4184420d881a3475fb2c6f4d95d53a8d50209a2500723d831036f7c45"}, - {file = "regex-2024.11.6-cp311-cp311-win32.whl", hash = "sha256:c36f9b6f5f8649bb251a5f3f66564438977b7ef8386a52460ae77e6070d309d9"}, - {file = "regex-2024.11.6-cp311-cp311-win_amd64.whl", hash = "sha256:02e28184be537f0e75c1f9b2f8847dc51e08e6e171c6bde130b2687e0c33cf60"}, - {file = "regex-2024.11.6-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:52fb28f528778f184f870b7cf8f225f5eef0a8f6e3778529bdd40c7b3920796a"}, - {file = "regex-2024.11.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fdd6028445d2460f33136c55eeb1f601ab06d74cb3347132e1c24250187500d9"}, - {file = "regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:805e6b60c54bf766b251e94526ebad60b7de0c70f70a4e6210ee2891acb70bf2"}, - {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b85c2530be953a890eaffde05485238f07029600e8f098cdf1848d414a8b45e4"}, - {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bb26437975da7dc36b7efad18aa9dd4ea569d2357ae6b783bf1118dabd9ea577"}, - {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abfa5080c374a76a251ba60683242bc17eeb2c9818d0d30117b4486be10c59d3"}, - {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b7fa6606c2881c1db9479b0eaa11ed5dfa11c8d60a474ff0e095099f39d98e"}, - {file = "regex-2024.11.6-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c32f75920cf99fe6b6c539c399a4a128452eaf1af27f39bce8909c9a3fd8cbe"}, - {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:982e6d21414e78e1f51cf595d7f321dcd14de1f2881c5dc6a6e23bbbbd68435e"}, - {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:a7c2155f790e2fb448faed6dd241386719802296ec588a8b9051c1f5c481bc29"}, - {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:149f5008d286636e48cd0b1dd65018548944e495b0265b45e1bffecce1ef7f39"}, - {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:e5364a4502efca094731680e80009632ad6624084aff9a23ce8c8c6820de3e51"}, - {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0a86e7eeca091c09e021db8eb72d54751e527fa47b8d5787caf96d9831bd02ad"}, - {file = "regex-2024.11.6-cp312-cp312-win32.whl", hash = "sha256:32f9a4c643baad4efa81d549c2aadefaeba12249b2adc5af541759237eee1c54"}, - {file = "regex-2024.11.6-cp312-cp312-win_amd64.whl", hash = "sha256:a93c194e2df18f7d264092dc8539b8ffb86b45b899ab976aa15d48214138e81b"}, - {file = "regex-2024.11.6-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a6ba92c0bcdf96cbf43a12c717eae4bc98325ca3730f6b130ffa2e3c3c723d84"}, - {file = "regex-2024.11.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:525eab0b789891ac3be914d36893bdf972d483fe66551f79d3e27146191a37d4"}, - {file = "regex-2024.11.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:086a27a0b4ca227941700e0b31425e7a28ef1ae8e5e05a33826e17e47fbfdba0"}, - {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bde01f35767c4a7899b7eb6e823b125a64de314a8ee9791367c9a34d56af18d0"}, - {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b583904576650166b3d920d2bcce13971f6f9e9a396c673187f49811b2769dc7"}, - {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1c4de13f06a0d54fa0d5ab1b7138bfa0d883220965a29616e3ea61b35d5f5fc7"}, - {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3cde6e9f2580eb1665965ce9bf17ff4952f34f5b126beb509fee8f4e994f143c"}, - {file = "regex-2024.11.6-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0d7f453dca13f40a02b79636a339c5b62b670141e63efd511d3f8f73fba162b3"}, - {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:59dfe1ed21aea057a65c6b586afd2a945de04fc7db3de0a6e3ed5397ad491b07"}, - {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_i686.whl", hash = 
"sha256:b97c1e0bd37c5cd7902e65f410779d39eeda155800b65fc4d04cc432efa9bc6e"}, - {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f9d1e379028e0fc2ae3654bac3cbbef81bf3fd571272a42d56c24007979bafb6"}, - {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:13291b39131e2d002a7940fb176e120bec5145f3aeb7621be6534e46251912c4"}, - {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4f51f88c126370dcec4908576c5a627220da6c09d0bff31cfa89f2523843316d"}, - {file = "regex-2024.11.6-cp313-cp313-win32.whl", hash = "sha256:63b13cfd72e9601125027202cad74995ab26921d8cd935c25f09c630436348ff"}, - {file = "regex-2024.11.6-cp313-cp313-win_amd64.whl", hash = "sha256:2b3361af3198667e99927da8b84c1b010752fa4b1115ee30beaa332cabc3ef1a"}, - {file = "regex-2024.11.6-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:3a51ccc315653ba012774efca4f23d1d2a8a8f278a6072e29c7147eee7da446b"}, - {file = "regex-2024.11.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ad182d02e40de7459b73155deb8996bbd8e96852267879396fb274e8700190e3"}, - {file = "regex-2024.11.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ba9b72e5643641b7d41fa1f6d5abda2c9a263ae835b917348fc3c928182ad467"}, - {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40291b1b89ca6ad8d3f2b82782cc33807f1406cf68c8d440861da6304d8ffbbd"}, - {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cdf58d0e516ee426a48f7b2c03a332a4114420716d55769ff7108c37a09951bf"}, - {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a36fdf2af13c2b14738f6e973aba563623cb77d753bbbd8d414d18bfaa3105dd"}, - {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1cee317bfc014c2419a76bcc87f071405e3966da434e03e13beb45f8aced1a6"}, - {file = "regex-2024.11.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:50153825ee016b91549962f970d6a4442fa106832e14c918acd1c8e479916c4f"}, - {file = "regex-2024.11.6-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ea1bfda2f7162605f6e8178223576856b3d791109f15ea99a9f95c16a7636fb5"}, - {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:df951c5f4a1b1910f1a99ff42c473ff60f8225baa1cdd3539fe2819d9543e9df"}, - {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:072623554418a9911446278f16ecb398fb3b540147a7828c06e2011fa531e773"}, - {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:f654882311409afb1d780b940234208a252322c24a93b442ca714d119e68086c"}, - {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:89d75e7293d2b3e674db7d4d9b1bee7f8f3d1609428e293771d1a962617150cc"}, - {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:f65557897fc977a44ab205ea871b690adaef6b9da6afda4790a2484b04293a5f"}, - {file = "regex-2024.11.6-cp38-cp38-win32.whl", hash = "sha256:6f44ec28b1f858c98d3036ad5d7d0bfc568bdd7a74f9c24e25f41ef1ebfd81a4"}, - {file = "regex-2024.11.6-cp38-cp38-win_amd64.whl", hash = "sha256:bb8f74f2f10dbf13a0be8de623ba4f9491faf58c24064f32b65679b021ed0001"}, - {file = "regex-2024.11.6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5704e174f8ccab2026bd2f1ab6c510345ae8eac818b613d7d73e785f1310f839"}, - {file = "regex-2024.11.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:220902c3c5cc6af55d4fe19ead504de80eb91f786dc102fbd74894b1551f095e"}, - {file = "regex-2024.11.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5e7e351589da0850c125f1600a4c4ba3c722efefe16b297de54300f08d734fbf"}, - {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5056b185ca113c88e18223183aa1a50e66507769c9640a6ff75859619d73957b"}, - {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e34b51b650b23ed3354b5a07aab37034d9f923db2a40519139af34f485f77d0"}, - {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5670bce7b200273eee1840ef307bfa07cda90b38ae56e9a6ebcc9f50da9c469b"}, - {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:08986dce1339bc932923e7d1232ce9881499a0e02925f7402fb7c982515419ef"}, - {file = "regex-2024.11.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:93c0b12d3d3bc25af4ebbf38f9ee780a487e8bf6954c115b9f015822d3bb8e48"}, - {file = "regex-2024.11.6-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:764e71f22ab3b305e7f4c21f1a97e1526a25ebdd22513e251cf376760213da13"}, - {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:f056bf21105c2515c32372bbc057f43eb02aae2fda61052e2f7622c801f0b4e2"}, - {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:69ab78f848845569401469da20df3e081e6b5a11cb086de3eed1d48f5ed57c95"}, - {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:86fddba590aad9208e2fa8b43b4c098bb0ec74f15718bb6a704e3c63e2cef3e9"}, - {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:684d7a212682996d21ca12ef3c17353c021fe9de6049e19ac8481ec35574a70f"}, - {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a03e02f48cd1abbd9f3b7e3586d97c8f7a9721c436f51a5245b3b9483044480b"}, - {file = "regex-2024.11.6-cp39-cp39-win32.whl", hash = "sha256:41758407fc32d5c3c5de163888068cfee69cb4c2be844e7ac517a52770f9af57"}, - {file = "regex-2024.11.6-cp39-cp39-win_amd64.whl", hash = "sha256:b2837718570f95dd41675328e111345f9b7095d821bac435aac173ac80b19983"}, - {file = "regex-2024.11.6.tar.gz", hash = "sha256:7ab159b063c52a0333c884e4679f8d7a85112ee3078fe3d9004b2dd875585519"}, +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "regex-2025.11.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2b441a4ae2c8049106e8b39973bfbddfb25a179dda2bdb99b0eeb60c40a6a3af"}, + {file = "regex-2025.11.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2fa2eed3f76677777345d2f81ee89f5de2f5745910e805f7af7386a920fa7313"}, + {file = "regex-2025.11.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d8b4a27eebd684319bdf473d39f1d79eed36bf2cd34bd4465cdb4618d82b3d56"}, + {file = "regex-2025.11.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5cf77eac15bd264986c4a2c63353212c095b40f3affb2bc6b4ef80c4776c1a28"}, + {file = "regex-2025.11.3-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b7f9ee819f94c6abfa56ec7b1dbab586f41ebbdc0a57e6524bd5e7f487a878c7"}, + {file = "regex-2025.11.3-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:838441333bc90b829406d4a03cb4b8bf7656231b84358628b0406d803931ef32"}, + {file = 
"regex-2025.11.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cfe6d3f0c9e3b7e8c0c694b24d25e677776f5ca26dce46fd6b0489f9c8339391"}, + {file = "regex-2025.11.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2ab815eb8a96379a27c3b6157fcb127c8f59c36f043c1678110cea492868f1d5"}, + {file = "regex-2025.11.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:728a9d2d173a65b62bdc380b7932dd8e74ed4295279a8fe1021204ce210803e7"}, + {file = "regex-2025.11.3-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:509dc827f89c15c66a0c216331260d777dd6c81e9a4e4f830e662b0bb296c313"}, + {file = "regex-2025.11.3-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:849202cd789e5f3cf5dcc7822c34b502181b4824a65ff20ce82da5524e45e8e9"}, + {file = "regex-2025.11.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b6f78f98741dcc89607c16b1e9426ee46ce4bf31ac5e6b0d40e81c89f3481ea5"}, + {file = "regex-2025.11.3-cp310-cp310-win32.whl", hash = "sha256:149eb0bba95231fb4f6d37c8f760ec9fa6fabf65bab555e128dde5f2475193ec"}, + {file = "regex-2025.11.3-cp310-cp310-win_amd64.whl", hash = "sha256:ee3a83ce492074c35a74cc76cf8235d49e77b757193a5365ff86e3f2f93db9fd"}, + {file = "regex-2025.11.3-cp310-cp310-win_arm64.whl", hash = "sha256:38af559ad934a7b35147716655d4a2f79fcef2d695ddfe06a06ba40ae631fa7e"}, + {file = "regex-2025.11.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:eadade04221641516fa25139273505a1c19f9bf97589a05bc4cfcd8b4a618031"}, + {file = "regex-2025.11.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:feff9e54ec0dd3833d659257f5c3f5322a12eee58ffa360984b716f8b92983f4"}, + {file = "regex-2025.11.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3b30bc921d50365775c09a7ed446359e5c0179e9e2512beec4a60cbcef6ddd50"}, + {file = "regex-2025.11.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f99be08cfead2020c7ca6e396c13543baea32343b7a9a5780c462e323bd8872f"}, + {file = "regex-2025.11.3-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6dd329a1b61c0ee95ba95385fb0c07ea0d3fe1a21e1349fa2bec272636217118"}, + {file = "regex-2025.11.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4c5238d32f3c5269d9e87be0cf096437b7622b6920f5eac4fd202468aaeb34d2"}, + {file = "regex-2025.11.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:10483eefbfb0adb18ee9474498c9a32fcf4e594fbca0543bb94c48bac6183e2e"}, + {file = "regex-2025.11.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:78c2d02bb6e1da0720eedc0bad578049cad3f71050ef8cd065ecc87691bed2b0"}, + {file = "regex-2025.11.3-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e6b49cd2aad93a1790ce9cffb18964f6d3a4b0b3dbdbd5de094b65296fce6e58"}, + {file = "regex-2025.11.3-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:885b26aa3ee56433b630502dc3d36ba78d186a00cc535d3806e6bfd9ed3c70ab"}, + {file = "regex-2025.11.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ddd76a9f58e6a00f8772e72cff8ebcff78e022be95edf018766707c730593e1e"}, + {file = "regex-2025.11.3-cp311-cp311-win32.whl", hash = "sha256:3e816cc9aac1cd3cc9a4ec4d860f06d40f994b5c7b4d03b93345f44e08cc68bf"}, + {file = "regex-2025.11.3-cp311-cp311-win_amd64.whl", hash = "sha256:087511f5c8b7dfbe3a03f5d5ad0c2a33861b1fc387f21f6f60825a44865a385a"}, + {file = "regex-2025.11.3-cp311-cp311-win_arm64.whl", hash = 
"sha256:1ff0d190c7f68ae7769cd0313fe45820ba07ffebfddfaa89cc1eb70827ba0ddc"}, + {file = "regex-2025.11.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:bc8ab71e2e31b16e40868a40a69007bc305e1109bd4658eb6cad007e0bf67c41"}, + {file = "regex-2025.11.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:22b29dda7e1f7062a52359fca6e58e548e28c6686f205e780b02ad8ef710de36"}, + {file = "regex-2025.11.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3a91e4a29938bc1a082cc28fdea44be420bf2bebe2665343029723892eb073e1"}, + {file = "regex-2025.11.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:08b884f4226602ad40c5d55f52bf91a9df30f513864e0054bad40c0e9cf1afb7"}, + {file = "regex-2025.11.3-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3e0b11b2b2433d1c39c7c7a30e3f3d0aeeea44c2a8d0bae28f6b95f639927a69"}, + {file = "regex-2025.11.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:87eb52a81ef58c7ba4d45c3ca74e12aa4b4e77816f72ca25258a85b3ea96cb48"}, + {file = "regex-2025.11.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a12ab1f5c29b4e93db518f5e3872116b7e9b1646c9f9f426f777b50d44a09e8c"}, + {file = "regex-2025.11.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7521684c8c7c4f6e88e35ec89680ee1aa8358d3f09d27dfbdf62c446f5d4c695"}, + {file = "regex-2025.11.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:7fe6e5440584e94cc4b3f5f4d98a25e29ca12dccf8873679a635638349831b98"}, + {file = "regex-2025.11.3-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:8e026094aa12b43f4fd74576714e987803a315c76edb6b098b9809db5de58f74"}, + {file = "regex-2025.11.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:435bbad13e57eb5606a68443af62bed3556de2f46deb9f7d4237bc2f1c9fb3a0"}, + {file = "regex-2025.11.3-cp312-cp312-win32.whl", hash = "sha256:3839967cf4dc4b985e1570fd8d91078f0c519f30491c60f9ac42a8db039be204"}, + {file = "regex-2025.11.3-cp312-cp312-win_amd64.whl", hash = "sha256:e721d1b46e25c481dc5ded6f4b3f66c897c58d2e8cfdf77bbced84339108b0b9"}, + {file = "regex-2025.11.3-cp312-cp312-win_arm64.whl", hash = "sha256:64350685ff08b1d3a6fff33f45a9ca183dc1d58bbfe4981604e70ec9801bbc26"}, + {file = "regex-2025.11.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:c1e448051717a334891f2b9a620fe36776ebf3dd8ec46a0b877c8ae69575feb4"}, + {file = "regex-2025.11.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9b5aca4d5dfd7fbfbfbdaf44850fcc7709a01146a797536a8f84952e940cca76"}, + {file = "regex-2025.11.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:04d2765516395cf7dda331a244a3282c0f5ae96075f728629287dfa6f76ba70a"}, + {file = "regex-2025.11.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d9903ca42bfeec4cebedba8022a7c97ad2aab22e09573ce9976ba01b65e4361"}, + {file = "regex-2025.11.3-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:639431bdc89d6429f6721625e8129413980ccd62e9d3f496be618a41d205f160"}, + {file = "regex-2025.11.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f117efad42068f9715677c8523ed2be1518116d1c49b1dd17987716695181efe"}, + {file = "regex-2025.11.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4aecb6f461316adf9f1f0f6a4a1a3d79e045f9b71ec76055a791affa3b285850"}, + {file = 
"regex-2025.11.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:3b3a5f320136873cc5561098dfab677eea139521cb9a9e8db98b7e64aef44cbc"}, + {file = "regex-2025.11.3-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:75fa6f0056e7efb1f42a1c34e58be24072cb9e61a601340cc1196ae92326a4f9"}, + {file = "regex-2025.11.3-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:dbe6095001465294f13f1adcd3311e50dd84e5a71525f20a10bd16689c61ce0b"}, + {file = "regex-2025.11.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:454d9b4ae7881afbc25015b8627c16d88a597479b9dea82b8c6e7e2e07240dc7"}, + {file = "regex-2025.11.3-cp313-cp313-win32.whl", hash = "sha256:28ba4d69171fc6e9896337d4fc63a43660002b7da53fc15ac992abcf3410917c"}, + {file = "regex-2025.11.3-cp313-cp313-win_amd64.whl", hash = "sha256:bac4200befe50c670c405dc33af26dad5a3b6b255dd6c000d92fe4629f9ed6a5"}, + {file = "regex-2025.11.3-cp313-cp313-win_arm64.whl", hash = "sha256:2292cd5a90dab247f9abe892ac584cb24f0f54680c73fcb4a7493c66c2bf2467"}, + {file = "regex-2025.11.3-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:1eb1ebf6822b756c723e09f5186473d93236c06c579d2cc0671a722d2ab14281"}, + {file = "regex-2025.11.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:1e00ec2970aab10dc5db34af535f21fcf32b4a31d99e34963419636e2f85ae39"}, + {file = "regex-2025.11.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a4cb042b615245d5ff9b3794f56be4138b5adc35a4166014d31d1814744148c7"}, + {file = "regex-2025.11.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:44f264d4bf02f3176467d90b294d59bf1db9fe53c141ff772f27a8b456b2a9ed"}, + {file = "regex-2025.11.3-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7be0277469bf3bd7a34a9c57c1b6a724532a0d235cd0dc4e7f4316f982c28b19"}, + {file = "regex-2025.11.3-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0d31e08426ff4b5b650f68839f5af51a92a5b51abd8554a60c2fbc7c71f25d0b"}, + {file = "regex-2025.11.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e43586ce5bd28f9f285a6e729466841368c4a0353f6fd08d4ce4630843d3648a"}, + {file = "regex-2025.11.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:0f9397d561a4c16829d4e6ff75202c1c08b68a3bdbfe29dbfcdb31c9830907c6"}, + {file = "regex-2025.11.3-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:dd16e78eb18ffdb25ee33a0682d17912e8cc8a770e885aeee95020046128f1ce"}, + {file = "regex-2025.11.3-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:ffcca5b9efe948ba0661e9df0fa50d2bc4b097c70b9810212d6b62f05d83b2dd"}, + {file = "regex-2025.11.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c56b4d162ca2b43318ac671c65bd4d563e841a694ac70e1a976ac38fcf4ca1d2"}, + {file = "regex-2025.11.3-cp313-cp313t-win32.whl", hash = "sha256:9ddc42e68114e161e51e272f667d640f97e84a2b9ef14b7477c53aac20c2d59a"}, + {file = "regex-2025.11.3-cp313-cp313t-win_amd64.whl", hash = "sha256:7a7c7fdf755032ffdd72c77e3d8096bdcb0eb92e89e17571a196f03d88b11b3c"}, + {file = "regex-2025.11.3-cp313-cp313t-win_arm64.whl", hash = "sha256:df9eb838c44f570283712e7cff14c16329a9f0fb19ca492d21d4b7528ee6821e"}, + {file = "regex-2025.11.3-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:9697a52e57576c83139d7c6f213d64485d3df5bf84807c35fa409e6c970801c6"}, + {file = "regex-2025.11.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:e18bc3f73bd41243c9b38a6d9f2366cd0e0137a9aebe2d8ff76c5b67d4c0a3f4"}, + {file = 
"regex-2025.11.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:61a08bcb0ec14ff4e0ed2044aad948d0659604f824cbd50b55e30b0ec6f09c73"}, + {file = "regex-2025.11.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c9c30003b9347c24bcc210958c5d167b9e4f9be786cb380a7d32f14f9b84674f"}, + {file = "regex-2025.11.3-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4e1e592789704459900728d88d41a46fe3969b82ab62945560a31732ffc19a6d"}, + {file = "regex-2025.11.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6538241f45eb5a25aa575dbba1069ad786f68a4f2773a29a2bd3dd1f9de787be"}, + {file = "regex-2025.11.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bce22519c989bb72a7e6b36a199384c53db7722fe669ba891da75907fe3587db"}, + {file = "regex-2025.11.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:66d559b21d3640203ab9075797a55165d79017520685fb407b9234d72ab63c62"}, + {file = "regex-2025.11.3-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:669dcfb2e38f9e8c69507bace46f4889e3abbfd9b0c29719202883c0a603598f"}, + {file = "regex-2025.11.3-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:32f74f35ff0f25a5021373ac61442edcb150731fbaa28286bbc8bb1582c89d02"}, + {file = "regex-2025.11.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e6c7a21dffba883234baefe91bc3388e629779582038f75d2a5be918e250f0ed"}, + {file = "regex-2025.11.3-cp314-cp314-win32.whl", hash = "sha256:795ea137b1d809eb6836b43748b12634291c0ed55ad50a7d72d21edf1cd565c4"}, + {file = "regex-2025.11.3-cp314-cp314-win_amd64.whl", hash = "sha256:9f95fbaa0ee1610ec0fc6b26668e9917a582ba80c52cc6d9ada15e30aa9ab9ad"}, + {file = "regex-2025.11.3-cp314-cp314-win_arm64.whl", hash = "sha256:dfec44d532be4c07088c3de2876130ff0fbeeacaa89a137decbbb5f665855a0f"}, + {file = "regex-2025.11.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:ba0d8a5d7f04f73ee7d01d974d47c5834f8a1b0224390e4fe7c12a3a92a78ecc"}, + {file = "regex-2025.11.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:442d86cf1cfe4faabf97db7d901ef58347efd004934da045c745e7b5bd57ac49"}, + {file = "regex-2025.11.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:fd0a5e563c756de210bb964789b5abe4f114dacae9104a47e1a649b910361536"}, + {file = "regex-2025.11.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bf3490bcbb985a1ae97b2ce9ad1c0f06a852d5b19dde9b07bdf25bf224248c95"}, + {file = "regex-2025.11.3-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3809988f0a8b8c9dcc0f92478d6501fac7200b9ec56aecf0ec21f4a2ec4b6009"}, + {file = "regex-2025.11.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f4ff94e58e84aedb9c9fce66d4ef9f27a190285b451420f297c9a09f2b9abee9"}, + {file = "regex-2025.11.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7eb542fd347ce61e1321b0a6b945d5701528dca0cd9759c2e3bb8bd57e47964d"}, + {file = "regex-2025.11.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d6c2d5919075a1f2e413c00b056ea0c2f065b3f5fe83c3d07d325ab92dce51d6"}, + {file = "regex-2025.11.3-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:3f8bf11a4827cc7ce5a53d4ef6cddd5ad25595d3c1435ef08f76825851343154"}, + {file = "regex-2025.11.3-cp314-cp314t-musllinux_1_2_s390x.whl", hash = 
"sha256:22c12d837298651e5550ac1d964e4ff57c3f56965fc1812c90c9fb2028eaf267"}, + {file = "regex-2025.11.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:62ba394a3dda9ad41c7c780f60f6e4a70988741415ae96f6d1bf6c239cf01379"}, + {file = "regex-2025.11.3-cp314-cp314t-win32.whl", hash = "sha256:4bf146dca15cdd53224a1bf46d628bd7590e4a07fbb69e720d561aea43a32b38"}, + {file = "regex-2025.11.3-cp314-cp314t-win_amd64.whl", hash = "sha256:adad1a1bcf1c9e76346e091d22d23ac54ef28e1365117d99521631078dfec9de"}, + {file = "regex-2025.11.3-cp314-cp314t-win_arm64.whl", hash = "sha256:c54f768482cef41e219720013cd05933b6f971d9562544d691c68699bf2b6801"}, + {file = "regex-2025.11.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:81519e25707fc076978c6143b81ea3dc853f176895af05bf7ec51effe818aeec"}, + {file = "regex-2025.11.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3bf28b1873a8af8bbb58c26cc56ea6e534d80053b41fb511a35795b6de507e6a"}, + {file = "regex-2025.11.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:856a25c73b697f2ce2a24e7968285579e62577a048526161a2c0f53090bea9f9"}, + {file = "regex-2025.11.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8a3d571bd95fade53c86c0517f859477ff3a93c3fde10c9e669086f038e0f207"}, + {file = "regex-2025.11.3-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:732aea6de26051af97b94bc98ed86448821f839d058e5d259c72bf6d73ad0fc0"}, + {file = "regex-2025.11.3-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:51c1c1847128238f54930edb8805b660305dca164645a9fd29243f5610beea34"}, + {file = "regex-2025.11.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:22dd622a402aad4558277305350699b2be14bc59f64d64ae1d928ce7d072dced"}, + {file = "regex-2025.11.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f3b5a391c7597ffa96b41bd5cbd2ed0305f515fcbb367dfa72735679d5502364"}, + {file = "regex-2025.11.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:cc4076a5b4f36d849fd709284b4a3b112326652f3b0466f04002a6c15a0c96c1"}, + {file = "regex-2025.11.3-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:a295ca2bba5c1c885826ce3125fa0b9f702a1be547d821c01d65f199e10c01e2"}, + {file = "regex-2025.11.3-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:b4774ff32f18e0504bfc4e59a3e71e18d83bc1e171a3c8ed75013958a03b2f14"}, + {file = "regex-2025.11.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:22e7d1cdfa88ef33a2ae6aa0d707f9255eb286ffbd90045f1088246833223aee"}, + {file = "regex-2025.11.3-cp39-cp39-win32.whl", hash = "sha256:74d04244852ff73b32eeede4f76f51c5bcf44bc3c207bc3e6cf1c5c45b890708"}, + {file = "regex-2025.11.3-cp39-cp39-win_amd64.whl", hash = "sha256:7a50cd39f73faa34ec18d6720ee25ef10c4c1839514186fcda658a06c06057a2"}, + {file = "regex-2025.11.3-cp39-cp39-win_arm64.whl", hash = "sha256:43b4fb020e779ca81c1b5255015fe2b82816c76ec982354534ad9ec09ad7c9e3"}, + {file = "regex-2025.11.3.tar.gz", hash = "sha256:1fedc720f9bb2494ce31a58a1631f9c82df6a09b49c19517ea5cc280b4541e01"}, ] [[package]] name = "requests" -version = "2.32.3" +version = "2.32.5" description = "Python HTTP for Humans." 
optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" +groups = ["main", "dev"] files = [ - {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, - {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, + {file = "requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6"}, + {file = "requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf"}, ] [package.dependencies] certifi = ">=2017.4.17" -charset-normalizer = ">=2,<4" +charset_normalizer = ">=2,<4" idna = ">=2.5,<4" urllib3 = ">=1.21.1,<3" @@ -4824,6 +5858,7 @@ version = "0.1.4" description = "A pure python RFC3339 validator" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["dev"] files = [ {file = "rfc3339_validator-0.1.4-py2.py3-none-any.whl", hash = "sha256:24f6ec1eda14ef823da9e36ec7113124b39c04d50a4d3d3a3c2859577e7791fa"}, {file = "rfc3339_validator-0.1.4.tar.gz", hash = "sha256:138a2abdf93304ad60530167e51d2dfb9549521a836871b88d7f4695d0022f6b"}, @@ -4838,17 +5873,37 @@ version = "0.1.1" description = "Pure python rfc3986 validator" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["dev"] files = [ {file = "rfc3986_validator-0.1.1-py2.py3-none-any.whl", hash = "sha256:2f235c432ef459970b4306369336b9d5dbdda31b510ca1e327636e01f528bfa9"}, {file = "rfc3986_validator-0.1.1.tar.gz", hash = "sha256:3d44bde7921b3b9ec3ae4e3adca370438eccebc676456449b145d533b240d055"}, ] +[[package]] +name = "rfc3987-syntax" +version = "1.1.0" +description = "Helper functions to syntactically validate strings according to RFC 3987." 
+optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "rfc3987_syntax-1.1.0-py3-none-any.whl", hash = "sha256:6c3d97604e4c5ce9f714898e05401a0445a641cfa276432b0a648c80856f6a3f"}, + {file = "rfc3987_syntax-1.1.0.tar.gz", hash = "sha256:717a62cbf33cffdd16dfa3a497d81ce48a660ea691b1ddd7be710c22f00b4a0d"}, +] + +[package.dependencies] +lark = ">=1.2.2" + +[package.extras] +testing = ["pytest (>=8.3.5)"] + [[package]] name = "rich" version = "13.9.4" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" optional = false python-versions = ">=3.8.0" +groups = ["main"] files = [ {file = "rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90"}, {file = "rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098"}, @@ -4863,128 +5918,127 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"] [[package]] name = "rpds-py" -version = "0.25.1" +version = "0.28.0" description = "Python bindings to Rust's persistent data structures (rpds)" optional = false -python-versions = ">=3.9" -files = [ - {file = "rpds_py-0.25.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:f4ad628b5174d5315761b67f212774a32f5bad5e61396d38108bd801c0a8f5d9"}, - {file = "rpds_py-0.25.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8c742af695f7525e559c16f1562cf2323db0e3f0fbdcabdf6865b095256b2d40"}, - {file = "rpds_py-0.25.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:605ffe7769e24b1800b4d024d24034405d9404f0bc2f55b6db3362cd34145a6f"}, - {file = "rpds_py-0.25.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ccc6f3ddef93243538be76f8e47045b4aad7a66a212cd3a0f23e34469473d36b"}, - {file = "rpds_py-0.25.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f70316f760174ca04492b5ab01be631a8ae30cadab1d1081035136ba12738cfa"}, - {file = "rpds_py-0.25.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e1dafef8df605fdb46edcc0bf1573dea0d6d7b01ba87f85cd04dc855b2b4479e"}, - {file = "rpds_py-0.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0701942049095741a8aeb298a31b203e735d1c61f4423511d2b1a41dcd8a16da"}, - {file = "rpds_py-0.25.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e87798852ae0b37c88babb7f7bbbb3e3fecc562a1c340195b44c7e24d403e380"}, - {file = "rpds_py-0.25.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3bcce0edc1488906c2d4c75c94c70a0417e83920dd4c88fec1078c94843a6ce9"}, - {file = "rpds_py-0.25.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e2f6a2347d3440ae789505693a02836383426249d5293541cd712e07e7aecf54"}, - {file = "rpds_py-0.25.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:4fd52d3455a0aa997734f3835cbc4c9f32571345143960e7d7ebfe7b5fbfa3b2"}, - {file = "rpds_py-0.25.1-cp310-cp310-win32.whl", hash = "sha256:3f0b1798cae2bbbc9b9db44ee068c556d4737911ad53a4e5093d09d04b3bbc24"}, - {file = "rpds_py-0.25.1-cp310-cp310-win_amd64.whl", hash = "sha256:3ebd879ab996537fc510a2be58c59915b5dd63bccb06d1ef514fee787e05984a"}, - {file = "rpds_py-0.25.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:5f048bbf18b1f9120685c6d6bb70cc1a52c8cc11bdd04e643d28d3be0baf666d"}, - {file = "rpds_py-0.25.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4fbb0dbba559959fcb5d0735a0f87cdbca9e95dac87982e9b95c0f8f7ad10255"}, - {file = 
"rpds_py-0.25.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d4ca54b9cf9d80b4016a67a0193ebe0bcf29f6b0a96f09db942087e294d3d4c2"}, - {file = "rpds_py-0.25.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1ee3e26eb83d39b886d2cb6e06ea701bba82ef30a0de044d34626ede51ec98b0"}, - {file = "rpds_py-0.25.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:89706d0683c73a26f76a5315d893c051324d771196ae8b13e6ffa1ffaf5e574f"}, - {file = "rpds_py-0.25.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c2013ee878c76269c7b557a9a9c042335d732e89d482606990b70a839635feb7"}, - {file = "rpds_py-0.25.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45e484db65e5380804afbec784522de84fa95e6bb92ef1bd3325d33d13efaebd"}, - {file = "rpds_py-0.25.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:48d64155d02127c249695abb87d39f0faf410733428d499867606be138161d65"}, - {file = "rpds_py-0.25.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:048893e902132fd6548a2e661fb38bf4896a89eea95ac5816cf443524a85556f"}, - {file = "rpds_py-0.25.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:0317177b1e8691ab5879f4f33f4b6dc55ad3b344399e23df2e499de7b10a548d"}, - {file = "rpds_py-0.25.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bffcf57826d77a4151962bf1701374e0fc87f536e56ec46f1abdd6a903354042"}, - {file = "rpds_py-0.25.1-cp311-cp311-win32.whl", hash = "sha256:cda776f1967cb304816173b30994faaf2fd5bcb37e73118a47964a02c348e1bc"}, - {file = "rpds_py-0.25.1-cp311-cp311-win_amd64.whl", hash = "sha256:dc3c1ff0abc91444cd20ec643d0f805df9a3661fcacf9c95000329f3ddf268a4"}, - {file = "rpds_py-0.25.1-cp311-cp311-win_arm64.whl", hash = "sha256:5a3ddb74b0985c4387719fc536faced33cadf2172769540c62e2a94b7b9be1c4"}, - {file = "rpds_py-0.25.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:b5ffe453cde61f73fea9430223c81d29e2fbf412a6073951102146c84e19e34c"}, - {file = "rpds_py-0.25.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:115874ae5e2fdcfc16b2aedc95b5eef4aebe91b28e7e21951eda8a5dc0d3461b"}, - {file = "rpds_py-0.25.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a714bf6e5e81b0e570d01f56e0c89c6375101b8463999ead3a93a5d2a4af91fa"}, - {file = "rpds_py-0.25.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:35634369325906bcd01577da4c19e3b9541a15e99f31e91a02d010816b49bfda"}, - {file = "rpds_py-0.25.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d4cb2b3ddc16710548801c6fcc0cfcdeeff9dafbc983f77265877793f2660309"}, - {file = "rpds_py-0.25.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9ceca1cf097ed77e1a51f1dbc8d174d10cb5931c188a4505ff9f3e119dfe519b"}, - {file = "rpds_py-0.25.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c2cd1a4b0c2b8c5e31ffff50d09f39906fe351389ba143c195566056c13a7ea"}, - {file = "rpds_py-0.25.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1de336a4b164c9188cb23f3703adb74a7623ab32d20090d0e9bf499a2203ad65"}, - {file = "rpds_py-0.25.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9fca84a15333e925dd59ce01da0ffe2ffe0d6e5d29a9eeba2148916d1824948c"}, - {file = "rpds_py-0.25.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:88ec04afe0c59fa64e2f6ea0dd9657e04fc83e38de90f6de201954b4d4eb59bd"}, - {file = "rpds_py-0.25.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = 
"sha256:a8bd2f19e312ce3e1d2c635618e8a8d8132892bb746a7cf74780a489f0f6cdcb"}, - {file = "rpds_py-0.25.1-cp312-cp312-win32.whl", hash = "sha256:e5e2f7280d8d0d3ef06f3ec1b4fd598d386cc6f0721e54f09109a8132182fbfe"}, - {file = "rpds_py-0.25.1-cp312-cp312-win_amd64.whl", hash = "sha256:db58483f71c5db67d643857404da360dce3573031586034b7d59f245144cc192"}, - {file = "rpds_py-0.25.1-cp312-cp312-win_arm64.whl", hash = "sha256:6d50841c425d16faf3206ddbba44c21aa3310a0cebc3c1cdfc3e3f4f9f6f5728"}, - {file = "rpds_py-0.25.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:659d87430a8c8c704d52d094f5ba6fa72ef13b4d385b7e542a08fc240cb4a559"}, - {file = "rpds_py-0.25.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:68f6f060f0bbdfb0245267da014d3a6da9be127fe3e8cc4a68c6f833f8a23bb1"}, - {file = "rpds_py-0.25.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:083a9513a33e0b92cf6e7a6366036c6bb43ea595332c1ab5c8ae329e4bcc0a9c"}, - {file = "rpds_py-0.25.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:816568614ecb22b18a010c7a12559c19f6fe993526af88e95a76d5a60b8b75fb"}, - {file = "rpds_py-0.25.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3c6564c0947a7f52e4792983f8e6cf9bac140438ebf81f527a21d944f2fd0a40"}, - {file = "rpds_py-0.25.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c4a128527fe415d73cf1f70a9a688d06130d5810be69f3b553bf7b45e8acf79"}, - {file = "rpds_py-0.25.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a49e1d7a4978ed554f095430b89ecc23f42014a50ac385eb0c4d163ce213c325"}, - {file = "rpds_py-0.25.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d74ec9bc0e2feb81d3f16946b005748119c0f52a153f6db6a29e8cd68636f295"}, - {file = "rpds_py-0.25.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:3af5b4cc10fa41e5bc64e5c198a1b2d2864337f8fcbb9a67e747e34002ce812b"}, - {file = "rpds_py-0.25.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:79dc317a5f1c51fd9c6a0c4f48209c6b8526d0524a6904fc1076476e79b00f98"}, - {file = "rpds_py-0.25.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1521031351865e0181bc585147624d66b3b00a84109b57fcb7a779c3ec3772cd"}, - {file = "rpds_py-0.25.1-cp313-cp313-win32.whl", hash = "sha256:5d473be2b13600b93a5675d78f59e63b51b1ba2d0476893415dfbb5477e65b31"}, - {file = "rpds_py-0.25.1-cp313-cp313-win_amd64.whl", hash = "sha256:a7b74e92a3b212390bdce1d93da9f6488c3878c1d434c5e751cbc202c5e09500"}, - {file = "rpds_py-0.25.1-cp313-cp313-win_arm64.whl", hash = "sha256:dd326a81afe332ede08eb39ab75b301d5676802cdffd3a8f287a5f0b694dc3f5"}, - {file = "rpds_py-0.25.1-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:a58d1ed49a94d4183483a3ce0af22f20318d4a1434acee255d683ad90bf78129"}, - {file = "rpds_py-0.25.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:f251bf23deb8332823aef1da169d5d89fa84c89f67bdfb566c49dea1fccfd50d"}, - {file = "rpds_py-0.25.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8dbd586bfa270c1103ece2109314dd423df1fa3d9719928b5d09e4840cec0d72"}, - {file = "rpds_py-0.25.1-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6d273f136e912aa101a9274c3145dcbddbe4bac560e77e6d5b3c9f6e0ed06d34"}, - {file = "rpds_py-0.25.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:666fa7b1bd0a3810a7f18f6d3a25ccd8866291fbbc3c9b912b917a6715874bb9"}, - {file = "rpds_py-0.25.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:921954d7fbf3fccc7de8f717799304b14b6d9a45bbeec5a8d7408ccbf531faf5"}, - {file = "rpds_py-0.25.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3d86373ff19ca0441ebeb696ef64cb58b8b5cbacffcda5a0ec2f3911732a194"}, - {file = "rpds_py-0.25.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c8980cde3bb8575e7c956a530f2c217c1d6aac453474bf3ea0f9c89868b531b6"}, - {file = "rpds_py-0.25.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:8eb8c84ecea987a2523e057c0d950bcb3f789696c0499290b8d7b3107a719d78"}, - {file = "rpds_py-0.25.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:e43a005671a9ed5a650f3bc39e4dbccd6d4326b24fb5ea8be5f3a43a6f576c72"}, - {file = "rpds_py-0.25.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:58f77c60956501a4a627749a6dcb78dac522f249dd96b5c9f1c6af29bfacfb66"}, - {file = "rpds_py-0.25.1-cp313-cp313t-win32.whl", hash = "sha256:2cb9e5b5e26fc02c8a4345048cd9998c2aca7c2712bd1b36da0c72ee969a3523"}, - {file = "rpds_py-0.25.1-cp313-cp313t-win_amd64.whl", hash = "sha256:401ca1c4a20cc0510d3435d89c069fe0a9ae2ee6495135ac46bdd49ec0495763"}, - {file = "rpds_py-0.25.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:ce4c8e485a3c59593f1a6f683cf0ea5ab1c1dc94d11eea5619e4fb5228b40fbd"}, - {file = "rpds_py-0.25.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d8222acdb51a22929c3b2ddb236b69c59c72af4019d2cba961e2f9add9b6e634"}, - {file = "rpds_py-0.25.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4593c4eae9b27d22df41cde518b4b9e4464d139e4322e2127daa9b5b981b76be"}, - {file = "rpds_py-0.25.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bd035756830c712b64725a76327ce80e82ed12ebab361d3a1cdc0f51ea21acb0"}, - {file = "rpds_py-0.25.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:114a07e85f32b125404f28f2ed0ba431685151c037a26032b213c882f26eb908"}, - {file = "rpds_py-0.25.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dec21e02e6cc932538b5203d3a8bd6aa1480c98c4914cb88eea064ecdbc6396a"}, - {file = "rpds_py-0.25.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:09eab132f41bf792c7a0ea1578e55df3f3e7f61888e340779b06050a9a3f16e9"}, - {file = "rpds_py-0.25.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c98f126c4fc697b84c423e387337d5b07e4a61e9feac494362a59fd7a2d9ed80"}, - {file = "rpds_py-0.25.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0e6a327af8ebf6baba1c10fadd04964c1965d375d318f4435d5f3f9651550f4a"}, - {file = "rpds_py-0.25.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:bc120d1132cff853ff617754196d0ac0ae63befe7c8498bd67731ba368abe451"}, - {file = "rpds_py-0.25.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:140f61d9bed7839446bdd44852e30195c8e520f81329b4201ceead4d64eb3a9f"}, - {file = "rpds_py-0.25.1-cp39-cp39-win32.whl", hash = "sha256:9c006f3aadeda131b438c3092124bd196b66312f0caa5823ef09585a669cf449"}, - {file = "rpds_py-0.25.1-cp39-cp39-win_amd64.whl", hash = "sha256:a61d0b2c7c9a0ae45732a77844917b427ff16ad5464b4d4f5e4adb955f582890"}, - {file = "rpds_py-0.25.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b24bf3cd93d5b6ecfbedec73b15f143596c88ee249fa98cefa9a9dc9d92c6f28"}, - {file = "rpds_py-0.25.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:0eb90e94f43e5085623932b68840b6f379f26db7b5c2e6bcef3179bd83c9330f"}, - {file = "rpds_py-0.25.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:d50e4864498a9ab639d6d8854b25e80642bd362ff104312d9770b05d66e5fb13"}, - {file = "rpds_py-0.25.1-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7c9409b47ba0650544b0bb3c188243b83654dfe55dcc173a86832314e1a6a35d"}, - {file = "rpds_py-0.25.1-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:796ad874c89127c91970652a4ee8b00d56368b7e00d3477f4415fe78164c8000"}, - {file = "rpds_py-0.25.1-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:85608eb70a659bf4c1142b2781083d4b7c0c4e2c90eff11856a9754e965b2540"}, - {file = "rpds_py-0.25.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c4feb9211d15d9160bc85fa72fed46432cdc143eb9cf6d5ca377335a921ac37b"}, - {file = "rpds_py-0.25.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ccfa689b9246c48947d31dd9d8b16d89a0ecc8e0e26ea5253068efb6c542b76e"}, - {file = "rpds_py-0.25.1-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:3c5b317ecbd8226887994852e85de562f7177add602514d4ac40f87de3ae45a8"}, - {file = "rpds_py-0.25.1-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:454601988aab2c6e8fd49e7634c65476b2b919647626208e376afcd22019eeb8"}, - {file = "rpds_py-0.25.1-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:1c0c434a53714358532d13539272db75a5ed9df75a4a090a753ac7173ec14e11"}, - {file = "rpds_py-0.25.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f73ce1512e04fbe2bc97836e89830d6b4314c171587a99688082d090f934d20a"}, - {file = "rpds_py-0.25.1-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:ee86d81551ec68a5c25373c5643d343150cc54672b5e9a0cafc93c1870a53954"}, - {file = "rpds_py-0.25.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:89c24300cd4a8e4a51e55c31a8ff3918e6651b241ee8876a42cc2b2a078533ba"}, - {file = "rpds_py-0.25.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:771c16060ff4e79584dc48902a91ba79fd93eade3aa3a12d6d2a4aadaf7d542b"}, - {file = "rpds_py-0.25.1-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:785ffacd0ee61c3e60bdfde93baa6d7c10d86f15655bd706c89da08068dc5038"}, - {file = "rpds_py-0.25.1-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2a40046a529cc15cef88ac5ab589f83f739e2d332cb4d7399072242400ed68c9"}, - {file = "rpds_py-0.25.1-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:85fc223d9c76cabe5d0bff82214459189720dc135db45f9f66aa7cffbf9ff6c1"}, - {file = "rpds_py-0.25.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b0be9965f93c222fb9b4cc254235b3b2b215796c03ef5ee64f995b1b69af0762"}, - {file = "rpds_py-0.25.1-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8378fa4a940f3fb509c081e06cb7f7f2adae8cf46ef258b0e0ed7519facd573e"}, - {file = "rpds_py-0.25.1-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:33358883a4490287e67a2c391dfaea4d9359860281db3292b6886bf0be3d8692"}, - {file = "rpds_py-0.25.1-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:1d1fadd539298e70cac2f2cb36f5b8a65f742b9b9f1014dd4ea1f7785e2470bf"}, - {file = "rpds_py-0.25.1-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:9a46c2fb2545e21181445515960006e85d22025bd2fe6db23e76daec6eb689fe"}, - {file = "rpds_py-0.25.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:50f2c501a89c9a5f4e454b126193c5495b9fb441a75b298c60591d8a2eb92e1b"}, - {file = 
"rpds_py-0.25.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:7d779b325cc8238227c47fbc53964c8cc9a941d5dbae87aa007a1f08f2f77b23"}, - {file = "rpds_py-0.25.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:036ded36bedb727beeabc16dc1dad7cb154b3fa444e936a03b67a86dc6a5066e"}, - {file = "rpds_py-0.25.1-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:245550f5a1ac98504147cba96ffec8fabc22b610742e9150138e5d60774686d7"}, - {file = "rpds_py-0.25.1-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ff7c23ba0a88cb7b104281a99476cccadf29de2a0ef5ce864959a52675b1ca83"}, - {file = "rpds_py-0.25.1-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e37caa8cdb3b7cf24786451a0bdb853f6347b8b92005eeb64225ae1db54d1c2b"}, - {file = "rpds_py-0.25.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f2f48ab00181600ee266a095fe815134eb456163f7d6699f525dee471f312cf"}, - {file = "rpds_py-0.25.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9e5fc7484fa7dce57e25063b0ec9638ff02a908304f861d81ea49273e43838c1"}, - {file = "rpds_py-0.25.1-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:d3c10228d6cf6fe2b63d2e7985e94f6916fa46940df46b70449e9ff9297bd3d1"}, - {file = "rpds_py-0.25.1-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:5d9e40f32745db28c1ef7aad23f6fc458dc1e29945bd6781060f0d15628b8ddf"}, - {file = "rpds_py-0.25.1-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:35a8d1a24b5936b35c5003313bc177403d8bdef0f8b24f28b1c4a255f94ea992"}, - {file = "rpds_py-0.25.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:6099263f526efff9cf3883dfef505518730f7a7a93049b1d90d42e50a22b4793"}, - {file = "rpds_py-0.25.1.tar.gz", hash = "sha256:8960b6dac09b62dac26e75d7e2c4a22efb835d827a7278c34f72b2b84fa160e3"}, +python-versions = ">=3.10" +groups = ["main", "dev"] +files = [ + {file = "rpds_py-0.28.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:7b6013db815417eeb56b2d9d7324e64fcd4fa289caeee6e7a78b2e11fc9b438a"}, + {file = "rpds_py-0.28.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a4c6b05c685c0c03f80dabaeb73e74218c49deea965ca63f76a752807397207"}, + {file = "rpds_py-0.28.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4794c6c3fbe8f9ac87699b131a1f26e7b4abcf6d828da46a3a52648c7930eba"}, + {file = "rpds_py-0.28.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2e8456b6ee5527112ff2354dd9087b030e3429e43a74f480d4a5ca79d269fd85"}, + {file = "rpds_py-0.28.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:beb880a9ca0a117415f241f66d56025c02037f7c4efc6fe59b5b8454f1eaa50d"}, + {file = "rpds_py-0.28.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6897bebb118c44b38c9cb62a178e09f1593c949391b9a1a6fe777ccab5934ee7"}, + {file = "rpds_py-0.28.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1b553dd06e875249fd43efd727785efb57a53180e0fde321468222eabbeaafa"}, + {file = "rpds_py-0.28.0-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:f0b2044fdddeea5b05df832e50d2a06fe61023acb44d76978e1b060206a8a476"}, + {file = "rpds_py-0.28.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:05cf1e74900e8da73fa08cc76c74a03345e5a3e37691d07cfe2092d7d8e27b04"}, + {file = "rpds_py-0.28.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = 
"sha256:efd489fec7c311dae25e94fe7eeda4b3d06be71c68f2cf2e8ef990ffcd2cd7e8"}, + {file = "rpds_py-0.28.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:ada7754a10faacd4f26067e62de52d6af93b6d9542f0df73c57b9771eb3ba9c4"}, + {file = "rpds_py-0.28.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c2a34fd26588949e1e7977cfcbb17a9a42c948c100cab890c6d8d823f0586457"}, + {file = "rpds_py-0.28.0-cp310-cp310-win32.whl", hash = "sha256:f9174471d6920cbc5e82a7822de8dfd4dcea86eb828b04fc8c6519a77b0ee51e"}, + {file = "rpds_py-0.28.0-cp310-cp310-win_amd64.whl", hash = "sha256:6e32dd207e2c4f8475257a3540ab8a93eff997abfa0a3fdb287cae0d6cd874b8"}, + {file = "rpds_py-0.28.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:03065002fd2e287725d95fbc69688e0c6daf6c6314ba38bdbaa3895418e09296"}, + {file = "rpds_py-0.28.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:28ea02215f262b6d078daec0b45344c89e161eab9526b0d898221d96fdda5f27"}, + {file = "rpds_py-0.28.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25dbade8fbf30bcc551cb352376c0ad64b067e4fc56f90e22ba70c3ce205988c"}, + {file = "rpds_py-0.28.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3c03002f54cc855860bfdc3442928ffdca9081e73b5b382ed0b9e8efe6e5e205"}, + {file = "rpds_py-0.28.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b9699fa7990368b22032baf2b2dce1f634388e4ffc03dfefaaac79f4695edc95"}, + {file = "rpds_py-0.28.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b9b06fe1a75e05e0713f06ea0c89ecb6452210fd60e2f1b6ddc1067b990e08d9"}, + {file = "rpds_py-0.28.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac9f83e7b326a3f9ec3ef84cda98fb0a74c7159f33e692032233046e7fd15da2"}, + {file = "rpds_py-0.28.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:0d3259ea9ad8743a75a43eb7819324cdab393263c91be86e2d1901ee65c314e0"}, + {file = "rpds_py-0.28.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9a7548b345f66f6695943b4ef6afe33ccd3f1b638bd9afd0f730dd255c249c9e"}, + {file = "rpds_py-0.28.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9a40040aa388b037eb39416710fbcce9443498d2eaab0b9b45ae988b53f5c67"}, + {file = "rpds_py-0.28.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8f60c7ea34e78c199acd0d3cda37a99be2c861dd2b8cf67399784f70c9f8e57d"}, + {file = "rpds_py-0.28.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1571ae4292649100d743b26d5f9c63503bb1fedf538a8f29a98dce2d5ba6b4e6"}, + {file = "rpds_py-0.28.0-cp311-cp311-win32.whl", hash = "sha256:5cfa9af45e7c1140af7321fa0bef25b386ee9faa8928c80dc3a5360971a29e8c"}, + {file = "rpds_py-0.28.0-cp311-cp311-win_amd64.whl", hash = "sha256:dd8d86b5d29d1b74100982424ba53e56033dc47720a6de9ba0259cf81d7cecaa"}, + {file = "rpds_py-0.28.0-cp311-cp311-win_arm64.whl", hash = "sha256:4e27d3a5709cc2b3e013bf93679a849213c79ae0573f9b894b284b55e729e120"}, + {file = "rpds_py-0.28.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:6b4f28583a4f247ff60cd7bdda83db8c3f5b05a7a82ff20dd4b078571747708f"}, + {file = "rpds_py-0.28.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d678e91b610c29c4b3d52a2c148b641df2b4676ffe47c59f6388d58b99cdc424"}, + {file = "rpds_py-0.28.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e819e0e37a44a78e1383bf1970076e2ccc4dc8c2bbaa2f9bd1dc987e9afff628"}, + {file = "rpds_py-0.28.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:5ee514e0f0523db5d3fb171f397c54875dbbd69760a414dccf9d4d7ad628b5bd"}, + {file = "rpds_py-0.28.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5f3fa06d27fdcee47f07a39e02862da0100cb4982508f5ead53ec533cd5fe55e"}, + {file = "rpds_py-0.28.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:46959ef2e64f9e4a41fc89aa20dbca2b85531f9a72c21099a3360f35d10b0d5a"}, + {file = "rpds_py-0.28.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8455933b4bcd6e83fde3fefc987a023389c4b13f9a58c8d23e4b3f6d13f78c84"}, + {file = "rpds_py-0.28.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:ad50614a02c8c2962feebe6012b52f9802deec4263946cddea37aaf28dd25a66"}, + {file = "rpds_py-0.28.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e5deca01b271492553fdb6c7fd974659dce736a15bae5dad7ab8b93555bceb28"}, + {file = "rpds_py-0.28.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:735f8495a13159ce6a0d533f01e8674cec0c57038c920495f87dcb20b3ddb48a"}, + {file = "rpds_py-0.28.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:961ca621ff10d198bbe6ba4957decca61aa2a0c56695384c1d6b79bf61436df5"}, + {file = "rpds_py-0.28.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2374e16cc9131022e7d9a8f8d65d261d9ba55048c78f3b6e017971a4f5e6353c"}, + {file = "rpds_py-0.28.0-cp312-cp312-win32.whl", hash = "sha256:d15431e334fba488b081d47f30f091e5d03c18527c325386091f31718952fe08"}, + {file = "rpds_py-0.28.0-cp312-cp312-win_amd64.whl", hash = "sha256:a410542d61fc54710f750d3764380b53bf09e8c4edbf2f9141a82aa774a04f7c"}, + {file = "rpds_py-0.28.0-cp312-cp312-win_arm64.whl", hash = "sha256:1f0cfd1c69e2d14f8c892b893997fa9a60d890a0c8a603e88dca4955f26d1edd"}, + {file = "rpds_py-0.28.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:e9e184408a0297086f880556b6168fa927d677716f83d3472ea333b42171ee3b"}, + {file = "rpds_py-0.28.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:edd267266a9b0448f33dc465a97cfc5d467594b600fe28e7fa2f36450e03053a"}, + {file = "rpds_py-0.28.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85beb8b3f45e4e32f6802fb6cd6b17f615ef6c6a52f265371fb916fae02814aa"}, + {file = "rpds_py-0.28.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d2412be8d00a1b895f8ad827cc2116455196e20ed994bb704bf138fe91a42724"}, + {file = "rpds_py-0.28.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cf128350d384b777da0e68796afdcebc2e9f63f0e9f242217754e647f6d32491"}, + {file = "rpds_py-0.28.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a2036d09b363aa36695d1cc1a97b36865597f4478470b0697b5ee9403f4fe399"}, + {file = "rpds_py-0.28.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8e1e9be4fa6305a16be628959188e4fd5cd6f1b0e724d63c6d8b2a8adf74ea6"}, + {file = "rpds_py-0.28.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:0a403460c9dd91a7f23fc3188de6d8977f1d9603a351d5db6cf20aaea95b538d"}, + {file = "rpds_py-0.28.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d7366b6553cdc805abcc512b849a519167db8f5e5c3472010cd1228b224265cb"}, + {file = "rpds_py-0.28.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5b43c6a3726efd50f18d8120ec0551241c38785b68952d240c45ea553912ac41"}, + {file = "rpds_py-0.28.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:0cb7203c7bc69d7c1585ebb33a2e6074492d2fc21ad28a7b9d40457ac2a51ab7"}, + {file = 
"rpds_py-0.28.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7a52a5169c664dfb495882adc75c304ae1d50df552fbd68e100fdc719dee4ff9"}, + {file = "rpds_py-0.28.0-cp313-cp313-win32.whl", hash = "sha256:2e42456917b6687215b3e606ab46aa6bca040c77af7df9a08a6dcfe8a4d10ca5"}, + {file = "rpds_py-0.28.0-cp313-cp313-win_amd64.whl", hash = "sha256:e0a0311caedc8069d68fc2bf4c9019b58a2d5ce3cd7cb656c845f1615b577e1e"}, + {file = "rpds_py-0.28.0-cp313-cp313-win_arm64.whl", hash = "sha256:04c1b207ab8b581108801528d59ad80aa83bb170b35b0ddffb29c20e411acdc1"}, + {file = "rpds_py-0.28.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:f296ea3054e11fc58ad42e850e8b75c62d9a93a9f981ad04b2e5ae7d2186ff9c"}, + {file = "rpds_py-0.28.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5a7306c19b19005ad98468fcefeb7100b19c79fc23a5f24a12e06d91181193fa"}, + {file = "rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5d9b86aa501fed9862a443c5c3116f6ead8bc9296185f369277c42542bd646b"}, + {file = "rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e5bbc701eff140ba0e872691d573b3d5d30059ea26e5785acba9132d10c8c31d"}, + {file = "rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9a5690671cd672a45aa8616d7374fdf334a1b9c04a0cac3c854b1136e92374fe"}, + {file = "rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9f1d92ecea4fa12f978a367c32a5375a1982834649cdb96539dcdc12e609ab1a"}, + {file = "rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d252db6b1a78d0a3928b6190156042d54c93660ce4d98290d7b16b5296fb7cc"}, + {file = "rpds_py-0.28.0-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:d61b355c3275acb825f8777d6c4505f42b5007e357af500939d4a35b19177259"}, + {file = "rpds_py-0.28.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:acbe5e8b1026c0c580d0321c8aae4b0a1e1676861d48d6e8c6586625055b606a"}, + {file = "rpds_py-0.28.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:8aa23b6f0fc59b85b4c7d89ba2965af274346f738e8d9fc2455763602e62fd5f"}, + {file = "rpds_py-0.28.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:7b14b0c680286958817c22d76fcbca4800ddacef6f678f3a7c79a1fe7067fe37"}, + {file = "rpds_py-0.28.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:bcf1d210dfee61a6c86551d67ee1031899c0fdbae88b2d44a569995d43797712"}, + {file = "rpds_py-0.28.0-cp313-cp313t-win32.whl", hash = "sha256:3aa4dc0fdab4a7029ac63959a3ccf4ed605fee048ba67ce89ca3168da34a1342"}, + {file = "rpds_py-0.28.0-cp313-cp313t-win_amd64.whl", hash = "sha256:7b7d9d83c942855e4fdcfa75d4f96f6b9e272d42fffcb72cd4bb2577db2e2907"}, + {file = "rpds_py-0.28.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:dcdcb890b3ada98a03f9f2bb108489cdc7580176cb73b4f2d789e9a1dac1d472"}, + {file = "rpds_py-0.28.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:f274f56a926ba2dc02976ca5b11c32855cbd5925534e57cfe1fda64e04d1add2"}, + {file = "rpds_py-0.28.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fe0438ac4a29a520ea94c8c7f1754cdd8feb1bc490dfda1bfd990072363d527"}, + {file = "rpds_py-0.28.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8a358a32dd3ae50e933347889b6af9a1bdf207ba5d1a3f34e1a38cd3540e6733"}, + {file = "rpds_py-0.28.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e80848a71c78aa328fefaba9c244d588a342c8e03bda518447b624ea64d1ff56"}, + {file = 
"rpds_py-0.28.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f586db2e209d54fe177e58e0bc4946bea5fb0102f150b1b2f13de03e1f0976f8"}, + {file = "rpds_py-0.28.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ae8ee156d6b586e4292491e885d41483136ab994e719a13458055bec14cf370"}, + {file = "rpds_py-0.28.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:a805e9b3973f7e27f7cab63a6b4f61d90f2e5557cff73b6e97cd5b8540276d3d"}, + {file = "rpds_py-0.28.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5d3fd16b6dc89c73a4da0b4ac8b12a7ecc75b2864b95c9e5afed8003cb50a728"}, + {file = "rpds_py-0.28.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:6796079e5d24fdaba6d49bda28e2c47347e89834678f2bc2c1b4fc1489c0fb01"}, + {file = "rpds_py-0.28.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:76500820c2af232435cbe215e3324c75b950a027134e044423f59f5b9a1ba515"}, + {file = "rpds_py-0.28.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:bbdc5640900a7dbf9dd707fe6388972f5bbd883633eb68b76591044cfe346f7e"}, + {file = "rpds_py-0.28.0-cp314-cp314-win32.whl", hash = "sha256:adc8aa88486857d2b35d75f0640b949759f79dc105f50aa2c27816b2e0dd749f"}, + {file = "rpds_py-0.28.0-cp314-cp314-win_amd64.whl", hash = "sha256:66e6fa8e075b58946e76a78e69e1a124a21d9a48a5b4766d15ba5b06869d1fa1"}, + {file = "rpds_py-0.28.0-cp314-cp314-win_arm64.whl", hash = "sha256:a6fe887c2c5c59413353b7c0caff25d0e566623501ccfff88957fa438a69377d"}, + {file = "rpds_py-0.28.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:7a69df082db13c7070f7b8b1f155fa9e687f1d6aefb7b0e3f7231653b79a067b"}, + {file = "rpds_py-0.28.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b1cde22f2c30ebb049a9e74c5374994157b9b70a16147d332f89c99c5960737a"}, + {file = "rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5338742f6ba7a51012ea470bd4dc600a8c713c0c72adaa0977a1b1f4327d6592"}, + {file = "rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e1460ebde1bcf6d496d80b191d854adedcc619f84ff17dc1c6d550f58c9efbba"}, + {file = "rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e3eb248f2feba84c692579257a043a7699e28a77d86c77b032c1d9fbb3f0219c"}, + {file = "rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd3bbba5def70b16cd1c1d7255666aad3b290fbf8d0fe7f9f91abafb73611a91"}, + {file = "rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3114f4db69ac5a1f32e7e4d1cbbe7c8f9cf8217f78e6e002cedf2d54c2a548ed"}, + {file = "rpds_py-0.28.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:4b0cb8a906b1a0196b863d460c0222fb8ad0f34041568da5620f9799b83ccf0b"}, + {file = "rpds_py-0.28.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cf681ac76a60b667106141e11a92a3330890257e6f559ca995fbb5265160b56e"}, + {file = "rpds_py-0.28.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:1e8ee6413cfc677ce8898d9cde18cc3a60fc2ba756b0dec5b71eb6eb21c49fa1"}, + {file = "rpds_py-0.28.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:b3072b16904d0b5572a15eb9d31c1954e0d3227a585fc1351aa9878729099d6c"}, + {file = "rpds_py-0.28.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:b670c30fd87a6aec281c3c9896d3bae4b205fd75d79d06dc87c2503717e46092"}, + {file = "rpds_py-0.28.0-cp314-cp314t-win32.whl", hash = "sha256:8014045a15b4d2b3476f0a287fcc93d4f823472d7d1308d47884ecac9e612be3"}, + {file = 
"rpds_py-0.28.0-cp314-cp314t-win_amd64.whl", hash = "sha256:7a4e59c90d9c27c561eb3160323634a9ff50b04e4f7820600a2beb0ac90db578"}, + {file = "rpds_py-0.28.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f5e7101145427087e493b9c9b959da68d357c28c562792300dd21a095118ed16"}, + {file = "rpds_py-0.28.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:31eb671150b9c62409a888850aaa8e6533635704fe2b78335f9aaf7ff81eec4d"}, + {file = "rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48b55c1f64482f7d8bd39942f376bfdf2f6aec637ee8c805b5041e14eeb771db"}, + {file = "rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:24743a7b372e9a76171f6b69c01aedf927e8ac3e16c474d9fe20d552a8cb45c7"}, + {file = "rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:389c29045ee8bbb1627ea190b4976a310a295559eaf9f1464a1a6f2bf84dde78"}, + {file = "rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:23690b5827e643150cf7b49569679ec13fe9a610a15949ed48b85eb7f98f34ec"}, + {file = "rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f0c9266c26580e7243ad0d72fc3e01d6b33866cfab5084a6da7576bcf1c4f72"}, + {file = "rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:4c6c4db5d73d179746951486df97fd25e92396be07fc29ee8ff9a8f5afbdfb27"}, + {file = "rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a3b695a8fa799dd2cfdb4804b37096c5f6dba1ac7f48a7fbf6d0485bcd060316"}, + {file = "rpds_py-0.28.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:6aa1bfce3f83baf00d9c5fcdbba93a3ab79958b4c7d7d1f55e7fe68c20e63912"}, + {file = "rpds_py-0.28.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:7b0f9dceb221792b3ee6acb5438eb1f02b0cb2c247796a72b016dcc92c6de829"}, + {file = "rpds_py-0.28.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:5d0145edba8abd3db0ab22b5300c99dc152f5c9021fab861be0f0544dc3cbc5f"}, + {file = "rpds_py-0.28.0.tar.gz", hash = "sha256:abd4df20485a0983e2ca334a216249b6186d6e3c1627e106651943dbdb791aea"}, ] [[package]] @@ -4993,6 +6047,7 @@ version = "4.9.1" description = "Pure-Python RSA implementation" optional = false python-versions = "<4,>=3.6" +groups = ["main"] files = [ {file = "rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762"}, {file = "rsa-4.9.1.tar.gz", hash = "sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75"}, @@ -5007,6 +6062,7 @@ version = "0.9.10" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "ruff-0.9.10-py3-none-linux_armv6l.whl", hash = "sha256:eb4d25532cfd9fe461acc83498361ec2e2252795b4f40b17e80692814329e42d"}, {file = "ruff-0.9.10-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:188a6638dab1aa9bb6228a7302387b2c9954e455fb25d6b4470cb0641d16759d"}, @@ -5034,6 +6090,8 @@ version = "0.4.5" description = "" optional = true python-versions = ">=3.7" +groups = ["main"] +markers = "extra == \"transformers\" or extra == \"all\"" files = [ {file = "safetensors-0.4.5-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:a63eaccd22243c67e4f2b1c3e258b257effc4acd78f3b9d397edc8cf8f1298a7"}, {file = "safetensors-0.4.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:23fc9b4ec7b602915cbb4ec1a7c1ad96d2743c322f20ab709e2c35d1b66dad27"}, @@ -5166,6 +6224,7 @@ version = "1.6.1" description = "A set of python modules for machine learning and data mining" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "scikit_learn-1.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d056391530ccd1e501056160e3c9673b4da4805eb67eb2bdf4e983e1f9c9204e"}, {file = "scikit_learn-1.6.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:0c8d036eb937dbb568c6242fa598d551d88fb4399c0344d95c001980ec1c7d36"}, @@ -5216,66 +6275,82 @@ tests = ["black (>=24.3.0)", "matplotlib (>=3.3.4)", "mypy (>=1.9)", "numpydoc ( [[package]] name = "scipy" -version = "1.15.3" +version = "1.16.3" description = "Fundamental algorithms for scientific computing in Python" optional = false -python-versions = ">=3.10" -files = [ - {file = "scipy-1.15.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:a345928c86d535060c9c2b25e71e87c39ab2f22fc96e9636bd74d1dbf9de448c"}, - {file = "scipy-1.15.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:ad3432cb0f9ed87477a8d97f03b763fd1d57709f1bbde3c9369b1dff5503b253"}, - {file = "scipy-1.15.3-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:aef683a9ae6eb00728a542b796f52a5477b78252edede72b8327a886ab63293f"}, - {file = "scipy-1.15.3-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:1c832e1bd78dea67d5c16f786681b28dd695a8cb1fb90af2e27580d3d0967e92"}, - {file = "scipy-1.15.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:263961f658ce2165bbd7b99fa5135195c3a12d9bef045345016b8b50c315cb82"}, - {file = "scipy-1.15.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e2abc762b0811e09a0d3258abee2d98e0c703eee49464ce0069590846f31d40"}, - {file = "scipy-1.15.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ed7284b21a7a0c8f1b6e5977ac05396c0d008b89e05498c8b7e8f4a1423bba0e"}, - {file = "scipy-1.15.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5380741e53df2c566f4d234b100a484b420af85deb39ea35a1cc1be84ff53a5c"}, - {file = "scipy-1.15.3-cp310-cp310-win_amd64.whl", hash = "sha256:9d61e97b186a57350f6d6fd72640f9e99d5a4a2b8fbf4b9ee9a841eab327dc13"}, - {file = "scipy-1.15.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:993439ce220d25e3696d1b23b233dd010169b62f6456488567e830654ee37a6b"}, - {file = "scipy-1.15.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:34716e281f181a02341ddeaad584205bd2fd3c242063bd3423d61ac259ca7eba"}, - {file = "scipy-1.15.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3b0334816afb8b91dab859281b1b9786934392aa3d527cd847e41bb6f45bee65"}, - {file = "scipy-1.15.3-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:6db907c7368e3092e24919b5e31c76998b0ce1684d51a90943cb0ed1b4ffd6c1"}, - {file = 
"scipy-1.15.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:721d6b4ef5dc82ca8968c25b111e307083d7ca9091bc38163fb89243e85e3889"}, - {file = "scipy-1.15.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39cb9c62e471b1bb3750066ecc3a3f3052b37751c7c3dfd0fd7e48900ed52982"}, - {file = "scipy-1.15.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:795c46999bae845966368a3c013e0e00947932d68e235702b5c3f6ea799aa8c9"}, - {file = "scipy-1.15.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:18aaacb735ab38b38db42cb01f6b92a2d0d4b6aabefeb07f02849e47f8fb3594"}, - {file = "scipy-1.15.3-cp311-cp311-win_amd64.whl", hash = "sha256:ae48a786a28412d744c62fd7816a4118ef97e5be0bee968ce8f0a2fba7acf3bb"}, - {file = "scipy-1.15.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6ac6310fdbfb7aa6612408bd2f07295bcbd3fda00d2d702178434751fe48e019"}, - {file = "scipy-1.15.3-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:185cd3d6d05ca4b44a8f1595af87f9c372bb6acf9c808e99aa3e9aa03bd98cf6"}, - {file = "scipy-1.15.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:05dc6abcd105e1a29f95eada46d4a3f251743cfd7d3ae8ddb4088047f24ea477"}, - {file = "scipy-1.15.3-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:06efcba926324df1696931a57a176c80848ccd67ce6ad020c810736bfd58eb1c"}, - {file = "scipy-1.15.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05045d8b9bfd807ee1b9f38761993297b10b245f012b11b13b91ba8945f7e45"}, - {file = "scipy-1.15.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:271e3713e645149ea5ea3e97b57fdab61ce61333f97cfae392c28ba786f9bb49"}, - {file = "scipy-1.15.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6cfd56fc1a8e53f6e89ba3a7a7251f7396412d655bca2aa5611c8ec9a6784a1e"}, - {file = "scipy-1.15.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0ff17c0bb1cb32952c09217d8d1eed9b53d1463e5f1dd6052c7857f83127d539"}, - {file = "scipy-1.15.3-cp312-cp312-win_amd64.whl", hash = "sha256:52092bc0472cfd17df49ff17e70624345efece4e1a12b23783a1ac59a1b728ed"}, - {file = "scipy-1.15.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2c620736bcc334782e24d173c0fdbb7590a0a436d2fdf39310a8902505008759"}, - {file = "scipy-1.15.3-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:7e11270a000969409d37ed399585ee530b9ef6aa99d50c019de4cb01e8e54e62"}, - {file = "scipy-1.15.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:8c9ed3ba2c8a2ce098163a9bdb26f891746d02136995df25227a20e71c396ebb"}, - {file = "scipy-1.15.3-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:0bdd905264c0c9cfa74a4772cdb2070171790381a5c4d312c973382fc6eaf730"}, - {file = "scipy-1.15.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79167bba085c31f38603e11a267d862957cbb3ce018d8b38f79ac043bc92d825"}, - {file = "scipy-1.15.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c9deabd6d547aee2c9a81dee6cc96c6d7e9a9b1953f74850c179f91fdc729cb7"}, - {file = "scipy-1.15.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:dde4fc32993071ac0c7dd2d82569e544f0bdaff66269cb475e0f369adad13f11"}, - {file = "scipy-1.15.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f77f853d584e72e874d87357ad70f44b437331507d1c311457bed8ed2b956126"}, - {file = "scipy-1.15.3-cp313-cp313-win_amd64.whl", hash = "sha256:b90ab29d0c37ec9bf55424c064312930ca5f4bde15ee8619ee44e69319aab163"}, - {file = "scipy-1.15.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = 
"sha256:3ac07623267feb3ae308487c260ac684b32ea35fd81e12845039952f558047b8"}, - {file = "scipy-1.15.3-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:6487aa99c2a3d509a5227d9a5e889ff05830a06b2ce08ec30df6d79db5fcd5c5"}, - {file = "scipy-1.15.3-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:50f9e62461c95d933d5c5ef4a1f2ebf9a2b4e83b0db374cb3f1de104d935922e"}, - {file = "scipy-1.15.3-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:14ed70039d182f411ffc74789a16df3835e05dc469b898233a245cdfd7f162cb"}, - {file = "scipy-1.15.3-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a769105537aa07a69468a0eefcd121be52006db61cdd8cac8a0e68980bbb723"}, - {file = "scipy-1.15.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9db984639887e3dffb3928d118145ffe40eff2fa40cb241a306ec57c219ebbbb"}, - {file = "scipy-1.15.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:40e54d5c7e7ebf1aa596c374c49fa3135f04648a0caabcb66c52884b943f02b4"}, - {file = "scipy-1.15.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5e721fed53187e71d0ccf382b6bf977644c533e506c4d33c3fb24de89f5c3ed5"}, - {file = "scipy-1.15.3-cp313-cp313t-win_amd64.whl", hash = "sha256:76ad1fb5f8752eabf0fa02e4cc0336b4e8f021e2d5f061ed37d6d264db35e3ca"}, - {file = "scipy-1.15.3.tar.gz", hash = "sha256:eae3cf522bc7df64b42cad3925c876e1b0b6c35c1337c93e12c0f366f55b0eaf"}, +python-versions = ">=3.11" +groups = ["main"] +files = [ + {file = "scipy-1.16.3-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:40be6cf99e68b6c4321e9f8782e7d5ff8265af28ef2cd56e9c9b2638fa08ad97"}, + {file = "scipy-1.16.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:8be1ca9170fcb6223cc7c27f4305d680ded114a1567c0bd2bfcbf947d1b17511"}, + {file = "scipy-1.16.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:bea0a62734d20d67608660f69dcda23e7f90fb4ca20974ab80b6ed40df87a005"}, + {file = "scipy-1.16.3-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:2a207a6ce9c24f1951241f4693ede2d393f59c07abc159b2cb2be980820e01fb"}, + {file = "scipy-1.16.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:532fb5ad6a87e9e9cd9c959b106b73145a03f04c7d57ea3e6f6bb60b86ab0876"}, + {file = "scipy-1.16.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0151a0749efeaaab78711c78422d413c583b8cdd2011a3c1d6c794938ee9fdb2"}, + {file = "scipy-1.16.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b7180967113560cca57418a7bc719e30366b47959dd845a93206fbed693c867e"}, + {file = "scipy-1.16.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:deb3841c925eeddb6afc1e4e4a45e418d19ec7b87c5df177695224078e8ec733"}, + {file = "scipy-1.16.3-cp311-cp311-win_amd64.whl", hash = "sha256:53c3844d527213631e886621df5695d35e4f6a75f620dca412bcd292f6b87d78"}, + {file = "scipy-1.16.3-cp311-cp311-win_arm64.whl", hash = "sha256:9452781bd879b14b6f055b26643703551320aa8d79ae064a71df55c00286a184"}, + {file = "scipy-1.16.3-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:81fc5827606858cf71446a5e98715ba0e11f0dbc83d71c7409d05486592a45d6"}, + {file = "scipy-1.16.3-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:c97176013d404c7346bf57874eaac5187d969293bf40497140b0a2b2b7482e07"}, + {file = "scipy-1.16.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:2b71d93c8a9936046866acebc915e2af2e292b883ed6e2cbe5c34beb094b82d9"}, + {file = "scipy-1.16.3-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:3d4a07a8e785d80289dfe66b7c27d8634a773020742ec7187b85ccc4b0e7b686"}, + {file = 
"scipy-1.16.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0553371015692a898e1aa858fed67a3576c34edefa6b7ebdb4e9dde49ce5c203"}, + {file = "scipy-1.16.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:72d1717fd3b5e6ec747327ce9bda32d5463f472c9dce9f54499e81fbd50245a1"}, + {file = "scipy-1.16.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1fb2472e72e24d1530debe6ae078db70fb1605350c88a3d14bc401d6306dbffe"}, + {file = "scipy-1.16.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c5192722cffe15f9329a3948c4b1db789fbb1f05c97899187dcf009b283aea70"}, + {file = "scipy-1.16.3-cp312-cp312-win_amd64.whl", hash = "sha256:56edc65510d1331dae01ef9b658d428e33ed48b4f77b1d51caf479a0253f96dc"}, + {file = "scipy-1.16.3-cp312-cp312-win_arm64.whl", hash = "sha256:a8a26c78ef223d3e30920ef759e25625a0ecdd0d60e5a8818b7513c3e5384cf2"}, + {file = "scipy-1.16.3-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:d2ec56337675e61b312179a1ad124f5f570c00f920cc75e1000025451b88241c"}, + {file = "scipy-1.16.3-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:16b8bc35a4cc24db80a0ec836a9286d0e31b2503cb2fd7ff7fb0e0374a97081d"}, + {file = "scipy-1.16.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:5803c5fadd29de0cf27fa08ccbfe7a9e5d741bf63e4ab1085437266f12460ff9"}, + {file = "scipy-1.16.3-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:b81c27fc41954319a943d43b20e07c40bdcd3ff7cf013f4fb86286faefe546c4"}, + {file = "scipy-1.16.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0c3b4dd3d9b08dbce0f3440032c52e9e2ab9f96ade2d3943313dfe51a7056959"}, + {file = "scipy-1.16.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7dc1360c06535ea6116a2220f760ae572db9f661aba2d88074fe30ec2aa1ff88"}, + {file = "scipy-1.16.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:663b8d66a8748051c3ee9c96465fb417509315b99c71550fda2591d7dd634234"}, + {file = "scipy-1.16.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eab43fae33a0c39006a88096cd7b4f4ef545ea0447d250d5ac18202d40b6611d"}, + {file = "scipy-1.16.3-cp313-cp313-win_amd64.whl", hash = "sha256:062246acacbe9f8210de8e751b16fc37458213f124bef161a5a02c7a39284304"}, + {file = "scipy-1.16.3-cp313-cp313-win_arm64.whl", hash = "sha256:50a3dbf286dbc7d84f176f9a1574c705f277cb6565069f88f60db9eafdbe3ee2"}, + {file = "scipy-1.16.3-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:fb4b29f4cf8cc5a8d628bc8d8e26d12d7278cd1f219f22698a378c3d67db5e4b"}, + {file = "scipy-1.16.3-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:8d09d72dc92742988b0e7750bddb8060b0c7079606c0d24a8cc8e9c9c11f9079"}, + {file = "scipy-1.16.3-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:03192a35e661470197556de24e7cb1330d84b35b94ead65c46ad6f16f6b28f2a"}, + {file = "scipy-1.16.3-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:57d01cb6f85e34f0946b33caa66e892aae072b64b034183f3d87c4025802a119"}, + {file = "scipy-1.16.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:96491a6a54e995f00a28a3c3badfff58fd093bf26cd5fb34a2188c8c756a3a2c"}, + {file = "scipy-1.16.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cd13e354df9938598af2be05822c323e97132d5e6306b83a3b4ee6724c6e522e"}, + {file = "scipy-1.16.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:63d3cdacb8a824a295191a723ee5e4ea7768ca5ca5f2838532d9f2e2b3ce2135"}, + {file = "scipy-1.16.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = 
"sha256:e7efa2681ea410b10dde31a52b18b0154d66f2485328830e45fdf183af5aefc6"}, + {file = "scipy-1.16.3-cp313-cp313t-win_amd64.whl", hash = "sha256:2d1ae2cf0c350e7705168ff2429962a89ad90c2d49d1dd300686d8b2a5af22fc"}, + {file = "scipy-1.16.3-cp313-cp313t-win_arm64.whl", hash = "sha256:0c623a54f7b79dd88ef56da19bc2873afec9673a48f3b85b18e4d402bdd29a5a"}, + {file = "scipy-1.16.3-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:875555ce62743e1d54f06cdf22c1e0bc47b91130ac40fe5d783b6dfa114beeb6"}, + {file = "scipy-1.16.3-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:bb61878c18a470021fb515a843dc7a76961a8daceaaaa8bad1332f1bf4b54657"}, + {file = "scipy-1.16.3-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:f2622206f5559784fa5c4b53a950c3c7c1cf3e84ca1b9c4b6c03f062f289ca26"}, + {file = "scipy-1.16.3-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:7f68154688c515cdb541a31ef8eb66d8cd1050605be9dcd74199cbd22ac739bc"}, + {file = "scipy-1.16.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8b3c820ddb80029fe9f43d61b81d8b488d3ef8ca010d15122b152db77dc94c22"}, + {file = "scipy-1.16.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d3837938ae715fc0fe3c39c0202de3a8853aff22ca66781ddc2ade7554b7e2cc"}, + {file = "scipy-1.16.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:aadd23f98f9cb069b3bd64ddc900c4d277778242e961751f77a8cb5c4b946fb0"}, + {file = "scipy-1.16.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b7c5f1bda1354d6a19bc6af73a649f8285ca63ac6b52e64e658a5a11d4d69800"}, + {file = "scipy-1.16.3-cp314-cp314-win_amd64.whl", hash = "sha256:e5d42a9472e7579e473879a1990327830493a7047506d58d73fc429b84c1d49d"}, + {file = "scipy-1.16.3-cp314-cp314-win_arm64.whl", hash = "sha256:6020470b9d00245926f2d5bb93b119ca0340f0d564eb6fbaad843eaebf9d690f"}, + {file = "scipy-1.16.3-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:e1d27cbcb4602680a49d787d90664fa4974063ac9d4134813332a8c53dbe667c"}, + {file = "scipy-1.16.3-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:9b9c9c07b6d56a35777a1b4cc8966118fb16cfd8daf6743867d17d36cfad2d40"}, + {file = "scipy-1.16.3-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:3a4c460301fb2cffb7f88528f30b3127742cff583603aa7dc964a52c463b385d"}, + {file = "scipy-1.16.3-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:f667a4542cc8917af1db06366d3f78a5c8e83badd56409f94d1eac8d8d9133fa"}, + {file = "scipy-1.16.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f379b54b77a597aa7ee5e697df0d66903e41b9c85a6dd7946159e356319158e8"}, + {file = "scipy-1.16.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4aff59800a3b7f786b70bfd6ab551001cb553244988d7d6b8299cb1ea653b353"}, + {file = "scipy-1.16.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:da7763f55885045036fabcebd80144b757d3db06ab0861415d1c3b7c69042146"}, + {file = "scipy-1.16.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:ffa6eea95283b2b8079b821dc11f50a17d0571c92b43e2b5b12764dc5f9b285d"}, + {file = "scipy-1.16.3-cp314-cp314t-win_amd64.whl", hash = "sha256:d9f48cafc7ce94cf9b15c6bffdc443a81a27bf7075cf2dcd5c8b40f85d10c4e7"}, + {file = "scipy-1.16.3-cp314-cp314t-win_arm64.whl", hash = "sha256:21d9d6b197227a12dcbf9633320a4e34c6b0e51c57268df255a0942983bac562"}, + {file = "scipy-1.16.3.tar.gz", hash = "sha256:01e87659402762f43bd2fee13370553a17ada367d42e7487800bf2916535aecb"}, ] [package.dependencies] -numpy = ">=1.23.5,<2.5" +numpy = ">=1.25.2,<2.6" [package.extras] dev = ["cython-lint (>=0.12.2)", 
"doit (>=0.36.0)", "mypy (==1.10.0)", "pycodestyle", "pydevtool", "rich-click", "ruff (>=0.0.292)", "types-psutil", "typing_extensions"] -doc = ["intersphinx_registry", "jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.19.1)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0,<8.0.0)", "sphinx-copybutton", "sphinx-design (>=0.4.0)"] -test = ["Cython", "array-api-strict (>=2.0,<2.1.1)", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] +doc = ["intersphinx_registry", "jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.19.1)", "jupytext", "linkify-it-py", "matplotlib (>=3.5)", "myst-nb (>=1.2.0)", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0,<8.2.0)", "sphinx-copybutton", "sphinx-design (>=0.4.0)"] +test = ["Cython", "array-api-strict (>=2.3.1)", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja ; sys_platform != \"emscripten\"", "pooch", "pytest (>=8.0.0)", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] [[package]] name = "seaborn" @@ -5283,6 +6358,7 @@ version = "0.12.2" description = "Statistical data visualization" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "seaborn-0.12.2-py3-none-any.whl", hash = "sha256:ebf15355a4dba46037dfd65b7350f014ceb1f13c05e814eda2c9f5fd731afc08"}, {file = "seaborn-0.12.2.tar.gz", hash = "sha256:374645f36509d0dcab895cba5b47daf0586f77bfe3b36c97c607db7da5be0139"}, @@ -5304,15 +6380,16 @@ version = "1.8.3" description = "Send file to trash natively under Mac OS X, Windows and Linux" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +groups = ["dev"] files = [ {file = "Send2Trash-1.8.3-py3-none-any.whl", hash = "sha256:0c31227e0bd08961c7665474a3d1ef7193929fedda4233843689baa056be46c9"}, {file = "Send2Trash-1.8.3.tar.gz", hash = "sha256:b18e7a3966d99871aefeb00cfbcfdced55ce4871194810fc71f4aa484b953abf"}, ] [package.extras] -nativelib = ["pyobjc-framework-Cocoa", "pywin32"] -objc = ["pyobjc-framework-Cocoa"] -win32 = ["pywin32"] +nativelib = ["pyobjc-framework-Cocoa ; sys_platform == \"darwin\"", "pywin32 ; sys_platform == \"win32\""] +objc = ["pyobjc-framework-Cocoa ; sys_platform == \"darwin\""] +win32 = ["pywin32 ; sys_platform == \"win32\""] [[package]] name = "setuptools" @@ -5320,19 +6397,21 @@ version = "80.9.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922"}, {file = "setuptools-80.9.0.tar.gz", hash = "sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c"}, ] +markers = {main = "python_version == \"3.12\""} [package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)", "ruff (>=0.8.0)"] -core = ["importlib_metadata (>=6)", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.8.0) ; sys_platform != \"cygwin\""] +core = ["importlib_metadata (>=6) ; python_version < \"3.10\"", "jaraco.functools (>=4)", "jaraco.text 
(>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1) ; python_version < \"3.11\"", "wheel (>=0.43.0)"] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] enabler = ["pytest-enabler (>=2.2)"] -test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] -type = ["importlib_metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.14.*)", "pytest-mypy"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] +type = ["importlib_metadata (>=7.0.2) ; python_version < \"3.10\"", "jaraco.develop (>=7.21) ; sys_platform != \"cygwin\"", "mypy (==1.14.*)", "pytest-mypy"] [[package]] name = "six" @@ -5340,6 +6419,7 @@ version = "1.17.0" description = "Python 2 and 3 compatibility utilities" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["main", "dev"] files = [ {file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"}, {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, @@ -5351,6 +6431,7 @@ version = "5.0.2" description = "A pure Python implementation of a sliding window memory map manager" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "smmap-5.0.2-py3-none-any.whl", hash = "sha256:b30115f0def7d7531d22a0fb6502488d879e75b260a9db4d0819cfb25403af5e"}, {file = "smmap-5.0.2.tar.gz", hash = "sha256:26ea65a03958fa0c8a1c7e8c7a58fdc77221b8910f6be2131affade476898ad5"}, @@ -5358,13 +6439,14 @@ files = [ [[package]] name = "smolagents" -version = "1.17.0" +version = "1.22.0" description = "🤗 smolagents: a barebones library for agents. Agents write python code to call tools or orchestrate other agents." 
optional = false python-versions = ">=3.10" +groups = ["main"] files = [ - {file = "smolagents-1.17.0-py3-none-any.whl", hash = "sha256:b6b7853d454c24c949cb306858523e97792310b9ab422a61cba5ccbab48f01c1"}, - {file = "smolagents-1.17.0.tar.gz", hash = "sha256:8d4ec4ccb759986560299e5489eab530282c68a4110820919d13a69e642f2b5b"}, + {file = "smolagents-1.22.0-py3-none-any.whl", hash = "sha256:5334adb4e7e5814cd814f1d9ad7efa806ef57f53db40635a29d2bd727774c5f5"}, + {file = "smolagents-1.22.0.tar.gz", hash = "sha256:5fb66f48e3b3ab5e8defcef577a89d5b6dfa8fcb55fc98a58e156cb3c59eb68f"}, ] [package.dependencies] @@ -5376,7 +6458,7 @@ requests = ">=2.32.3" rich = ">=13.9.4" [package.extras] -all = ["smolagents[audio,bedrock,docker,e2b,gradio,litellm,mcp,mlx-lm,openai,telemetry,toolkit,transformers,vision]"] +all = ["smolagents[audio,bedrock,docker,e2b,gradio,litellm,mcp,mlx-lm,modal,openai,telemetry,toolkit,transformers,vision]"] audio = ["smolagents[torch]", "soundfile"] bedrock = ["boto3 (>=1.36.18)"] dev = ["smolagents[quality,test]", "sqlalchemy"] @@ -5384,13 +6466,14 @@ docker = ["docker (>=7.1.0)", "websocket-client"] e2b = ["e2b-code-interpreter (>=1.0.3)", "python-dotenv (>=1.0.1)"] gradio = ["gradio (>=5.14.0)"] litellm = ["litellm (>=1.60.2)"] -mcp = ["mcp", "mcpadapt (>=0.1.8)"] +mcp = ["mcp", "mcpadapt (>=0.1.13)"] mlx-lm = ["mlx-lm"] +modal = ["modal (>=1.1.3)", "websocket-client"] openai = ["openai (>=1.58.1)"] quality = ["ruff (>=0.9.0)"] -telemetry = ["arize-phoenix", "openinference-instrumentation-smolagents (>=0.1.4)", "opentelemetry-exporter-otlp", "opentelemetry-sdk"] -test = ["Wikipedia-API (>=0.8.1)", "ipython (>=8.31.0)", "pandas (>=2.2.3)", "pytest (>=8.1.0)", "pytest-datadir", "python-dotenv (>=1.0.1)", "rank-bm25", "smolagents[all]"] -toolkit = ["duckduckgo-search (>=6.3.7)", "markdownify (>=0.14.1)"] +telemetry = ["arize-phoenix", "openinference-instrumentation-smolagents (>=0.1.15)", "opentelemetry-exporter-otlp", "opentelemetry-sdk"] +test = ["Wikipedia-API (>=0.8.1)", "ipython (>=8.31.0)", "mlx[cpu]", "pandas (>=2.2.3)", "pytest (>=8.1.0)", "pytest-datadir", "pytest-timeout", "python-dotenv (>=1.0.1)", "rank-bm25", "smolagents[all]"] +toolkit = ["ddgs (>=9.0.0)", "markdownify (>=0.14.1)"] torch = ["numpy (>=1.21.2)", "torch", "torchvision"] transformers = ["accelerate", "smolagents[torch]", "transformers (>=4.0.0)"] vision = ["helium", "selenium"] @@ -5402,6 +6485,7 @@ version = "1.3.1" description = "Sniff out which async library your code is running under" optional = false python-versions = ">=3.7" +groups = ["main", "dev"] files = [ {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, @@ -5413,6 +6497,7 @@ version = "2.4.0" description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, @@ -5420,83 +6505,85 @@ files = [ [[package]] name = "soupsieve" -version = "2.7" +version = "2.8" description = "A modern CSS selector implementation for Beautiful Soup." 
optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" +groups = ["dev"] files = [ - {file = "soupsieve-2.7-py3-none-any.whl", hash = "sha256:6e60cc5c1ffaf1cebcc12e8188320b72071e922c2e897f737cadce79ad5d30c4"}, - {file = "soupsieve-2.7.tar.gz", hash = "sha256:ad282f9b6926286d2ead4750552c8a6142bc4c783fd66b0293547c8fe6ae126a"}, + {file = "soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c"}, + {file = "soupsieve-2.8.tar.gz", hash = "sha256:e2dd4a40a628cb5f28f6d4b0db8800b8f581b65bb380b97de22ba5ca8d72572f"}, ] [[package]] name = "sqlalchemy" -version = "2.0.41" +version = "2.0.44" description = "Database Abstraction Library" optional = false python-versions = ">=3.7" -files = [ - {file = "SQLAlchemy-2.0.41-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:6854175807af57bdb6425e47adbce7d20a4d79bbfd6f6d6519cd10bb7109a7f8"}, - {file = "SQLAlchemy-2.0.41-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05132c906066142103b83d9c250b60508af556982a385d96c4eaa9fb9720ac2b"}, - {file = "SQLAlchemy-2.0.41-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b4af17bda11e907c51d10686eda89049f9ce5669b08fbe71a29747f1e876036"}, - {file = "SQLAlchemy-2.0.41-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:c0b0e5e1b5d9f3586601048dd68f392dc0cc99a59bb5faf18aab057ce00d00b2"}, - {file = "SQLAlchemy-2.0.41-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:0b3dbf1e7e9bc95f4bac5e2fb6d3fb2f083254c3fdd20a1789af965caf2d2348"}, - {file = "SQLAlchemy-2.0.41-cp37-cp37m-win32.whl", hash = "sha256:1e3f196a0c59b0cae9a0cd332eb1a4bda4696e863f4f1cf84ab0347992c548c2"}, - {file = "SQLAlchemy-2.0.41-cp37-cp37m-win_amd64.whl", hash = "sha256:6ab60a5089a8f02009f127806f777fca82581c49e127f08413a66056bd9166dd"}, - {file = "sqlalchemy-2.0.41-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b1f09b6821406ea1f94053f346f28f8215e293344209129a9c0fcc3578598d7b"}, - {file = "sqlalchemy-2.0.41-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1936af879e3db023601196a1684d28e12f19ccf93af01bf3280a3262c4b6b4e5"}, - {file = "sqlalchemy-2.0.41-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2ac41acfc8d965fb0c464eb8f44995770239668956dc4cdf502d1b1ffe0d747"}, - {file = "sqlalchemy-2.0.41-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:81c24e0c0fde47a9723c81d5806569cddef103aebbf79dbc9fcbb617153dea30"}, - {file = "sqlalchemy-2.0.41-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:23a8825495d8b195c4aa9ff1c430c28f2c821e8c5e2d98089228af887e5d7e29"}, - {file = "sqlalchemy-2.0.41-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:60c578c45c949f909a4026b7807044e7e564adf793537fc762b2489d522f3d11"}, - {file = "sqlalchemy-2.0.41-cp310-cp310-win32.whl", hash = "sha256:118c16cd3f1b00c76d69343e38602006c9cfb9998fa4f798606d28d63f23beda"}, - {file = "sqlalchemy-2.0.41-cp310-cp310-win_amd64.whl", hash = "sha256:7492967c3386df69f80cf67efd665c0f667cee67032090fe01d7d74b0e19bb08"}, - {file = "sqlalchemy-2.0.41-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6375cd674fe82d7aa9816d1cb96ec592bac1726c11e0cafbf40eeee9a4516b5f"}, - {file = "sqlalchemy-2.0.41-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9f8c9fdd15a55d9465e590a402f42082705d66b05afc3ffd2d2eb3c6ba919560"}, - {file = "sqlalchemy-2.0.41-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32f9dc8c44acdee06c8fc6440db9eae8b4af8b01e4b1aee7bdd7241c22edff4f"}, - {file = 
"sqlalchemy-2.0.41-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90c11ceb9a1f482c752a71f203a81858625d8df5746d787a4786bca4ffdf71c6"}, - {file = "sqlalchemy-2.0.41-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:911cc493ebd60de5f285bcae0491a60b4f2a9f0f5c270edd1c4dbaef7a38fc04"}, - {file = "sqlalchemy-2.0.41-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03968a349db483936c249f4d9cd14ff2c296adfa1290b660ba6516f973139582"}, - {file = "sqlalchemy-2.0.41-cp311-cp311-win32.whl", hash = "sha256:293cd444d82b18da48c9f71cd7005844dbbd06ca19be1ccf6779154439eec0b8"}, - {file = "sqlalchemy-2.0.41-cp311-cp311-win_amd64.whl", hash = "sha256:3d3549fc3e40667ec7199033a4e40a2f669898a00a7b18a931d3efb4c7900504"}, - {file = "sqlalchemy-2.0.41-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:81f413674d85cfd0dfcd6512e10e0f33c19c21860342a4890c3a2b59479929f9"}, - {file = "sqlalchemy-2.0.41-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:598d9ebc1e796431bbd068e41e4de4dc34312b7aa3292571bb3674a0cb415dd1"}, - {file = "sqlalchemy-2.0.41-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a104c5694dfd2d864a6f91b0956eb5d5883234119cb40010115fd45a16da5e70"}, - {file = "sqlalchemy-2.0.41-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6145afea51ff0af7f2564a05fa95eb46f542919e6523729663a5d285ecb3cf5e"}, - {file = "sqlalchemy-2.0.41-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b46fa6eae1cd1c20e6e6f44e19984d438b6b2d8616d21d783d150df714f44078"}, - {file = "sqlalchemy-2.0.41-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41836fe661cc98abfae476e14ba1906220f92c4e528771a8a3ae6a151242d2ae"}, - {file = "sqlalchemy-2.0.41-cp312-cp312-win32.whl", hash = "sha256:a8808d5cf866c781150d36a3c8eb3adccfa41a8105d031bf27e92c251e3969d6"}, - {file = "sqlalchemy-2.0.41-cp312-cp312-win_amd64.whl", hash = "sha256:5b14e97886199c1f52c14629c11d90c11fbb09e9334fa7bb5f6d068d9ced0ce0"}, - {file = "sqlalchemy-2.0.41-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4eeb195cdedaf17aab6b247894ff2734dcead6c08f748e617bfe05bd5a218443"}, - {file = "sqlalchemy-2.0.41-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d4ae769b9c1c7757e4ccce94b0641bc203bbdf43ba7a2413ab2523d8d047d8dc"}, - {file = "sqlalchemy-2.0.41-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a62448526dd9ed3e3beedc93df9bb6b55a436ed1474db31a2af13b313a70a7e1"}, - {file = "sqlalchemy-2.0.41-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc56c9788617b8964ad02e8fcfeed4001c1f8ba91a9e1f31483c0dffb207002a"}, - {file = "sqlalchemy-2.0.41-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c153265408d18de4cc5ded1941dcd8315894572cddd3c58df5d5b5705b3fa28d"}, - {file = "sqlalchemy-2.0.41-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4f67766965996e63bb46cfbf2ce5355fc32d9dd3b8ad7e536a920ff9ee422e23"}, - {file = "sqlalchemy-2.0.41-cp313-cp313-win32.whl", hash = "sha256:bfc9064f6658a3d1cadeaa0ba07570b83ce6801a1314985bf98ec9b95d74e15f"}, - {file = "sqlalchemy-2.0.41-cp313-cp313-win_amd64.whl", hash = "sha256:82ca366a844eb551daff9d2e6e7a9e5e76d2612c8564f58db6c19a726869c1df"}, - {file = "sqlalchemy-2.0.41-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:90144d3b0c8b139408da50196c5cad2a6909b51b23df1f0538411cd23ffa45d3"}, - {file = "sqlalchemy-2.0.41-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:023b3ee6169969beea3bb72312e44d8b7c27c75b347942d943cf49397b7edeb5"}, - {file = 
"sqlalchemy-2.0.41-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:725875a63abf7c399d4548e686debb65cdc2549e1825437096a0af1f7e374814"}, - {file = "sqlalchemy-2.0.41-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:81965cc20848ab06583506ef54e37cf15c83c7e619df2ad16807c03100745dea"}, - {file = "sqlalchemy-2.0.41-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:dd5ec3aa6ae6e4d5b5de9357d2133c07be1aff6405b136dad753a16afb6717dd"}, - {file = "sqlalchemy-2.0.41-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:ff8e80c4c4932c10493ff97028decfdb622de69cae87e0f127a7ebe32b4069c6"}, - {file = "sqlalchemy-2.0.41-cp38-cp38-win32.whl", hash = "sha256:4d44522480e0bf34c3d63167b8cfa7289c1c54264c2950cc5fc26e7850967e45"}, - {file = "sqlalchemy-2.0.41-cp38-cp38-win_amd64.whl", hash = "sha256:81eedafa609917040d39aa9332e25881a8e7a0862495fcdf2023a9667209deda"}, - {file = "sqlalchemy-2.0.41-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9a420a91913092d1e20c86a2f5f1fc85c1a8924dbcaf5e0586df8aceb09c9cc2"}, - {file = "sqlalchemy-2.0.41-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:906e6b0d7d452e9a98e5ab8507c0da791856b2380fdee61b765632bb8698026f"}, - {file = "sqlalchemy-2.0.41-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a373a400f3e9bac95ba2a06372c4fd1412a7cee53c37fc6c05f829bf672b8769"}, - {file = "sqlalchemy-2.0.41-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:087b6b52de812741c27231b5a3586384d60c353fbd0e2f81405a814b5591dc8b"}, - {file = "sqlalchemy-2.0.41-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:34ea30ab3ec98355235972dadc497bb659cc75f8292b760394824fab9cf39826"}, - {file = "sqlalchemy-2.0.41-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:8280856dd7c6a68ab3a164b4a4b1c51f7691f6d04af4d4ca23d6ecf2261b7923"}, - {file = "sqlalchemy-2.0.41-cp39-cp39-win32.whl", hash = "sha256:b50eab9994d64f4a823ff99a0ed28a6903224ddbe7fef56a6dd865eec9243440"}, - {file = "sqlalchemy-2.0.41-cp39-cp39-win_amd64.whl", hash = "sha256:5e22575d169529ac3e0a120cf050ec9daa94b6a9597993d1702884f6954a7d71"}, - {file = "sqlalchemy-2.0.41-py3-none-any.whl", hash = "sha256:57df5dc6fdb5ed1a88a1ed2195fd31927e705cad62dedd86b46972752a80f576"}, - {file = "sqlalchemy-2.0.41.tar.gz", hash = "sha256:edba70118c4be3c2b1f90754d308d0b79c6fe2c0fdc52d8ddf603916f83f4db9"}, +groups = ["main"] +files = [ + {file = "SQLAlchemy-2.0.44-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:471733aabb2e4848d609141a9e9d56a427c0a038f4abf65dd19d7a21fd563632"}, + {file = "SQLAlchemy-2.0.44-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48bf7d383a35e668b984c805470518b635d48b95a3c57cb03f37eaa3551b5f9f"}, + {file = "SQLAlchemy-2.0.44-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bf4bb6b3d6228fcf3a71b50231199fb94d2dd2611b66d33be0578ea3e6c2726"}, + {file = "SQLAlchemy-2.0.44-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:e998cf7c29473bd077704cea3577d23123094311f59bdc4af551923b168332b1"}, + {file = "SQLAlchemy-2.0.44-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:ebac3f0b5732014a126b43c2b7567f2f0e0afea7d9119a3378bde46d3dcad88e"}, + {file = "SQLAlchemy-2.0.44-cp37-cp37m-win32.whl", hash = "sha256:3255d821ee91bdf824795e936642bbf43a4c7cedf5d1aed8d24524e66843aa74"}, + {file = "SQLAlchemy-2.0.44-cp37-cp37m-win_amd64.whl", hash = "sha256:78e6c137ba35476adb5432103ae1534f2f5295605201d946a4198a0dea4b38e7"}, + {file = "sqlalchemy-2.0.44-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:7c77f3080674fc529b1bd99489378c7f63fcb4ba7f8322b79732e0258f0ea3ce"}, + {file = "sqlalchemy-2.0.44-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4c26ef74ba842d61635b0152763d057c8d48215d5be9bb8b7604116a059e9985"}, + {file = "sqlalchemy-2.0.44-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4a172b31785e2f00780eccab00bc240ccdbfdb8345f1e6063175b3ff12ad1b0"}, + {file = "sqlalchemy-2.0.44-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9480c0740aabd8cb29c329b422fb65358049840b34aba0adf63162371d2a96e"}, + {file = "sqlalchemy-2.0.44-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:17835885016b9e4d0135720160db3095dc78c583e7b902b6be799fb21035e749"}, + {file = "sqlalchemy-2.0.44-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cbe4f85f50c656d753890f39468fcd8190c5f08282caf19219f684225bfd5fd2"}, + {file = "sqlalchemy-2.0.44-cp310-cp310-win32.whl", hash = "sha256:2fcc4901a86ed81dc76703f3b93ff881e08761c63263c46991081fd7f034b165"}, + {file = "sqlalchemy-2.0.44-cp310-cp310-win_amd64.whl", hash = "sha256:9919e77403a483ab81e3423151e8ffc9dd992c20d2603bf17e4a8161111e55f5"}, + {file = "sqlalchemy-2.0.44-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0fe3917059c7ab2ee3f35e77757062b1bea10a0b6ca633c58391e3f3c6c488dd"}, + {file = "sqlalchemy-2.0.44-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:de4387a354ff230bc979b46b2207af841dc8bf29847b6c7dbe60af186d97aefa"}, + {file = "sqlalchemy-2.0.44-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3678a0fb72c8a6a29422b2732fe423db3ce119c34421b5f9955873eb9b62c1e"}, + {file = "sqlalchemy-2.0.44-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3cf6872a23601672d61a68f390e44703442639a12ee9dd5a88bbce52a695e46e"}, + {file = "sqlalchemy-2.0.44-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:329aa42d1be9929603f406186630135be1e7a42569540577ba2c69952b7cf399"}, + {file = "sqlalchemy-2.0.44-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:70e03833faca7166e6a9927fbee7c27e6ecde436774cd0b24bbcc96353bce06b"}, + {file = "sqlalchemy-2.0.44-cp311-cp311-win32.whl", hash = "sha256:253e2f29843fb303eca6b2fc645aca91fa7aa0aa70b38b6950da92d44ff267f3"}, + {file = "sqlalchemy-2.0.44-cp311-cp311-win_amd64.whl", hash = "sha256:7a8694107eb4308a13b425ca8c0e67112f8134c846b6e1f722698708741215d5"}, + {file = "sqlalchemy-2.0.44-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:72fea91746b5890f9e5e0997f16cbf3d53550580d76355ba2d998311b17b2250"}, + {file = "sqlalchemy-2.0.44-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:585c0c852a891450edbb1eaca8648408a3cc125f18cf433941fa6babcc359e29"}, + {file = "sqlalchemy-2.0.44-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b94843a102efa9ac68a7a30cd46df3ff1ed9c658100d30a725d10d9c60a2f44"}, + {file = "sqlalchemy-2.0.44-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:119dc41e7a7defcefc57189cfa0e61b1bf9c228211aba432b53fb71ef367fda1"}, + {file = "sqlalchemy-2.0.44-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0765e318ee9179b3718c4fd7ba35c434f4dd20332fbc6857a5e8df17719c24d7"}, + {file = "sqlalchemy-2.0.44-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2e7b5b079055e02d06a4308d0481658e4f06bc7ef211567edc8f7d5dce52018d"}, + {file = "sqlalchemy-2.0.44-cp312-cp312-win32.whl", hash = "sha256:846541e58b9a81cce7dee8329f352c318de25aa2f2bbe1e31587eb1f057448b4"}, + {file = "sqlalchemy-2.0.44-cp312-cp312-win_amd64.whl", hash = 
"sha256:7cbcb47fd66ab294703e1644f78971f6f2f1126424d2b300678f419aa73c7b6e"}, + {file = "sqlalchemy-2.0.44-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ff486e183d151e51b1d694c7aa1695747599bb00b9f5f604092b54b74c64a8e1"}, + {file = "sqlalchemy-2.0.44-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0b1af8392eb27b372ddb783b317dea0f650241cea5bd29199b22235299ca2e45"}, + {file = "sqlalchemy-2.0.44-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b61188657e3a2b9ac4e8f04d6cf8e51046e28175f79464c67f2fd35bceb0976"}, + {file = "sqlalchemy-2.0.44-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b87e7b91a5d5973dda5f00cd61ef72ad75a1db73a386b62877d4875a8840959c"}, + {file = "sqlalchemy-2.0.44-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:15f3326f7f0b2bfe406ee562e17f43f36e16167af99c4c0df61db668de20002d"}, + {file = "sqlalchemy-2.0.44-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1e77faf6ff919aa8cd63f1c4e561cac1d9a454a191bb864d5dd5e545935e5a40"}, + {file = "sqlalchemy-2.0.44-cp313-cp313-win32.whl", hash = "sha256:ee51625c2d51f8baadf2829fae817ad0b66b140573939dd69284d2ba3553ae73"}, + {file = "sqlalchemy-2.0.44-cp313-cp313-win_amd64.whl", hash = "sha256:c1c80faaee1a6c3428cecf40d16a2365bcf56c424c92c2b6f0f9ad204b899e9e"}, + {file = "sqlalchemy-2.0.44-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2fc44e5965ea46909a416fff0af48a219faefd5773ab79e5f8a5fcd5d62b2667"}, + {file = "sqlalchemy-2.0.44-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:dc8b3850d2a601ca2320d081874033684e246d28e1c5e89db0864077cfc8f5a9"}, + {file = "sqlalchemy-2.0.44-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d733dec0614bb8f4bcb7c8af88172b974f685a31dc3a65cca0527e3120de5606"}, + {file = "sqlalchemy-2.0.44-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22be14009339b8bc16d6b9dc8780bacaba3402aa7581658e246114abbd2236e3"}, + {file = "sqlalchemy-2.0.44-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:357bade0e46064f88f2c3a99808233e67b0051cdddf82992379559322dfeb183"}, + {file = "sqlalchemy-2.0.44-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:4848395d932e93c1595e59a8672aa7400e8922c39bb9b0668ed99ac6fa867822"}, + {file = "sqlalchemy-2.0.44-cp38-cp38-win32.whl", hash = "sha256:2f19644f27c76f07e10603580a47278abb2a70311136a7f8fd27dc2e096b9013"}, + {file = "sqlalchemy-2.0.44-cp38-cp38-win_amd64.whl", hash = "sha256:1df4763760d1de0dfc8192cc96d8aa293eb1a44f8f7a5fbe74caf1b551905c5e"}, + {file = "sqlalchemy-2.0.44-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f7027414f2b88992877573ab780c19ecb54d3a536bef3397933573d6b5068be4"}, + {file = "sqlalchemy-2.0.44-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3fe166c7d00912e8c10d3a9a0ce105569a31a3d0db1a6e82c4e0f4bf16d5eca9"}, + {file = "sqlalchemy-2.0.44-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3caef1ff89b1caefc28f0368b3bde21a7e3e630c2eddac16abd9e47bd27cc36a"}, + {file = "sqlalchemy-2.0.44-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc2856d24afa44295735e72f3c75d6ee7fdd4336d8d3a8f3d44de7aa6b766df2"}, + {file = "sqlalchemy-2.0.44-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:11bac86b0deada30b6b5f93382712ff0e911fe8d31cb9bf46e6b149ae175eff0"}, + {file = "sqlalchemy-2.0.44-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:4d18cd0e9a0f37c9f4088e50e3839fcb69a380a0ec957408e0b57cff08ee0a26"}, + {file = "sqlalchemy-2.0.44-cp39-cp39-win32.whl", hash = 
"sha256:9e9018544ab07614d591a26c1bd4293ddf40752cc435caf69196740516af7100"}, + {file = "sqlalchemy-2.0.44-cp39-cp39-win_amd64.whl", hash = "sha256:8e0e4e66fd80f277a8c3de016a81a554e76ccf6b8d881ee0b53200305a8433f6"}, + {file = "sqlalchemy-2.0.44-py3-none-any.whl", hash = "sha256:19de7ca1246fbef9f9d1bff8f1ab25641569df226364a0e40457dc5457c54b05"}, + {file = "sqlalchemy-2.0.44.tar.gz", hash = "sha256:0ae7454e1ab1d780aee69fd2aae7d6b8670a581d8847f2d1e0f7ddfbf47e5a22"}, ] [package.dependencies] -greenlet = {version = ">=1", markers = "python_version < \"3.14\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"} +greenlet = {version = ">=1", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""} typing-extensions = ">=4.6.0" [package.extras] @@ -5530,6 +6617,7 @@ version = "0.5.3" description = "A non-validating SQL parser." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "sqlparse-0.5.3-py3-none-any.whl", hash = "sha256:cf2196ed3418f3ba5de6af7e82c694a9fbdbfecccdfc72e281548517081f16ca"}, {file = "sqlparse-0.5.3.tar.gz", hash = "sha256:09f67787f56a0b16ecdbde1bfc7f5d9c3371ca683cfeaa8e6ff60b4807ec9272"}, @@ -5545,6 +6633,7 @@ version = "0.6.3" description = "Extract data from python stack frames and tracebacks for informative displays" optional = false python-versions = "*" +groups = ["dev"] files = [ {file = "stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695"}, {file = "stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9"}, @@ -5564,6 +6653,7 @@ version = "0.46.2" description = "The little ASGI library that shines." 
optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "starlette-0.46.2-py3-none-any.whl", hash = "sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35"}, {file = "starlette-0.46.2.tar.gz", hash = "sha256:7f7361f34eed179294600af672f565727419830b54b7b084efe44bb82d2fccd5"}, @@ -5577,41 +6667,48 @@ full = ["httpx (>=0.27.0,<0.29.0)", "itsdangerous", "jinja2", "python-multipart [[package]] name = "statsmodels" -version = "0.14.4" +version = "0.14.5" description = "Statistical computations and models for Python" optional = false python-versions = ">=3.9" -files = [ - {file = "statsmodels-0.14.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7a62f1fc9086e4b7ee789a6f66b3c0fc82dd8de1edda1522d30901a0aa45e42b"}, - {file = "statsmodels-0.14.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:46ac7ddefac0c9b7b607eed1d47d11e26fe92a1bc1f4d9af48aeed4e21e87981"}, - {file = "statsmodels-0.14.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2a337b731aa365d09bb0eab6da81446c04fde6c31976b1d8e3d3a911f0f1e07b"}, - {file = "statsmodels-0.14.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:631bb52159117c5da42ba94bd94859276b68cab25dc4cac86475bc24671143bc"}, - {file = "statsmodels-0.14.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3bb2e580d382545a65f298589809af29daeb15f9da2eb252af8f79693e618abc"}, - {file = "statsmodels-0.14.4-cp310-cp310-win_amd64.whl", hash = "sha256:9729642884147ee9db67b5a06a355890663d21f76ed608a56ac2ad98b94d201a"}, - {file = "statsmodels-0.14.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5ed7e118e6e3e02d6723a079b8c97eaadeed943fa1f7f619f7148dfc7862670f"}, - {file = "statsmodels-0.14.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f5f537f7d000de4a1708c63400755152b862cd4926bb81a86568e347c19c364b"}, - {file = "statsmodels-0.14.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa74aaa26eaa5012b0a01deeaa8a777595d0835d3d6c7175f2ac65435a7324d2"}, - {file = "statsmodels-0.14.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e332c2d9b806083d1797231280602340c5c913f90d4caa0213a6a54679ce9331"}, - {file = "statsmodels-0.14.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d9c8fa28dfd75753d9cf62769ba1fecd7e73a0be187f35cc6f54076f98aa3f3f"}, - {file = "statsmodels-0.14.4-cp311-cp311-win_amd64.whl", hash = "sha256:a6087ecb0714f7c59eb24c22781491e6f1cfffb660b4740e167625ca4f052056"}, - {file = "statsmodels-0.14.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5221dba7424cf4f2561b22e9081de85f5bb871228581124a0d1b572708545199"}, - {file = "statsmodels-0.14.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:17672b30c6b98afe2b095591e32d1d66d4372f2651428e433f16a3667f19eabb"}, - {file = "statsmodels-0.14.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ab5e6312213b8cfb9dca93dd46a0f4dccb856541f91d3306227c3d92f7659245"}, - {file = "statsmodels-0.14.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4bbb150620b53133d6cd1c5d14c28a4f85701e6c781d9b689b53681effaa655f"}, - {file = "statsmodels-0.14.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bb695c2025d122a101c2aca66d2b78813c321b60d3a7c86bb8ec4467bb53b0f9"}, - {file = "statsmodels-0.14.4-cp312-cp312-win_amd64.whl", hash = "sha256:7f7917a51766b4e074da283c507a25048ad29a18e527207883d73535e0dc6184"}, - {file = "statsmodels-0.14.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = 
"sha256:b5a24f5d2c22852d807d2b42daf3a61740820b28d8381daaf59dcb7055bf1a79"}, - {file = "statsmodels-0.14.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:df4f7864606fa843d7e7c0e6af288f034a2160dba14e6ccc09020a3cf67cb092"}, - {file = "statsmodels-0.14.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91341cbde9e8bea5fb419a76e09114e221567d03f34ca26e6d67ae2c27d8fe3c"}, - {file = "statsmodels-0.14.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1322286a7bfdde2790bf72d29698a1b76c20b8423a55bdcd0d457969d0041f72"}, - {file = "statsmodels-0.14.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e31b95ac603415887c9f0d344cb523889cf779bc52d68e27e2d23c358958fec7"}, - {file = "statsmodels-0.14.4-cp313-cp313-win_amd64.whl", hash = "sha256:81030108d27aecc7995cac05aa280cf8c6025f6a6119894eef648997936c2dd0"}, - {file = "statsmodels-0.14.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4793b01b7a5f5424f5a1dbcefc614c83c7608aa2b035f087538253007c339d5d"}, - {file = "statsmodels-0.14.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d330da34f59f1653c5193f9fe3a3a258977c880746db7f155fc33713ea858db5"}, - {file = "statsmodels-0.14.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e9ddefba1d4e1107c1f20f601b0581421ea3ad9fd75ce3c2ba6a76b6dc4682c"}, - {file = "statsmodels-0.14.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f43da7957e00190104c5dd0f661bfc6dfc68b87313e3f9c4dbd5e7d222e0aeb"}, - {file = "statsmodels-0.14.4-cp39-cp39-win_amd64.whl", hash = "sha256:8286f69a5e1d0e0b366ffed5691140c83d3efc75da6dbf34a3d06e88abfaaab6"}, - {file = "statsmodels-0.14.4.tar.gz", hash = "sha256:5d69e0f39060dc72c067f9bb6e8033b6dccdb0bae101d76a7ef0bcc94e898b67"}, +groups = ["main"] +files = [ + {file = "statsmodels-0.14.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9fc2b5cdc0c95cba894849651fec1fa1511d365e3eb72b0cc75caac44077cd48"}, + {file = "statsmodels-0.14.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b8d96b0bbaeabd3a557c35cc7249baa9cfbc6dd305c32a9f2cbdd7f46c037e7f"}, + {file = "statsmodels-0.14.5-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:145bc39b2cb201efb6c83cc3f2163c269e63b0d4809801853dec6f440bd3bc37"}, + {file = "statsmodels-0.14.5-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d7c14fb2617bb819fb2532e1424e1da2b98a3419a80e95f33365a72d437d474e"}, + {file = "statsmodels-0.14.5-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:1e9742d8a5ac38a3bfc4b7f4b0681903920f20cbbf466d72b1fd642033846108"}, + {file = "statsmodels-0.14.5-cp310-cp310-win_amd64.whl", hash = "sha256:1cab9e6fce97caf4239cdb2df375806937da5d0b7ba2699b13af33a07f438464"}, + {file = "statsmodels-0.14.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4b7091a8442076c708c926de3603653a160955e80a2b6d931475b7bb8ddc02e5"}, + {file = "statsmodels-0.14.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:128872be8f3208f4446d91ea9e4261823902fc7997fee7e1a983eb62fd3b7c6e"}, + {file = "statsmodels-0.14.5-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f2ad5aee04ae7196c429df2174df232c057e478c5fa63193d01c8ec9aae04d31"}, + {file = "statsmodels-0.14.5-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f402fc793458dd6d96e099acb44cd1de1428565bf7ef3030878a8daff091f08a"}, + {file = "statsmodels-0.14.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash 
= "sha256:26c028832730aebfbfd4e7501694e1f9ad31ec8536e776716673f4e7afd4059a"}, + {file = "statsmodels-0.14.5-cp311-cp311-win_amd64.whl", hash = "sha256:ec56f771d9529cdc17ed2fb2a950d100b6e83a7c5372aae8ac5bb065c474b856"}, + {file = "statsmodels-0.14.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:37e7364a39f9aa3b51d15a208c2868b90aadb8412f868530f5cba9197cb00eaa"}, + {file = "statsmodels-0.14.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4263d7f4d0f1d5ac6eb4db22e1ee34264a14d634b9332c975c9d9109b6b46e12"}, + {file = "statsmodels-0.14.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:86224f6e36f38486e471e75759d241fe2912d8bc25ab157d54ee074c6aedbf45"}, + {file = "statsmodels-0.14.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c3dd760a6fa80cd5e0371685c697bb9c2c0e6e1f394d975e596a1e6d0bbb9372"}, + {file = "statsmodels-0.14.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6264fb00e02f858b86bd01ef2dc05055a71d4a0cc7551b9976b07b0f0e6cf24f"}, + {file = "statsmodels-0.14.5-cp312-cp312-win_amd64.whl", hash = "sha256:b2ed065bfbaf8bb214c7201656df840457c2c8c65e1689e3eb09dc7440f9c61c"}, + {file = "statsmodels-0.14.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:906263134dd1a640e55ecb01fda4a9be7b9e08558dba9e4c4943a486fdb0c9c8"}, + {file = "statsmodels-0.14.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9118f76344f77cffbb3a9cbcff8682b325be5eed54a4b3253e09da77a74263d3"}, + {file = "statsmodels-0.14.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9dc4ee159070557c9a6c000625d85f653de437772fe7086857cff68f501afe45"}, + {file = "statsmodels-0.14.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a085d47c8ef5387279a991633883d0e700de2b0acc812d7032d165888627bef"}, + {file = "statsmodels-0.14.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9f866b2ebb2904b47c342d00def83c526ef2eb1df6a9a3c94ba5fe63d0005aec"}, + {file = "statsmodels-0.14.5-cp313-cp313-win_amd64.whl", hash = "sha256:2a06bca03b7a492f88c8106103ab75f1a5ced25de90103a89f3a287518017939"}, + {file = "statsmodels-0.14.5-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:07c4dad25bbb15864a31b4917a820f6d104bdc24e5ddadcda59027390c3bed9e"}, + {file = "statsmodels-0.14.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:babb067c852e966c2c933b79dbb5d0240919d861941a2ef6c0e13321c255528d"}, + {file = "statsmodels-0.14.5-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:110194b137286173cc676d7bad0119a197778de6478fc6cbdc3b33571165ac1e"}, + {file = "statsmodels-0.14.5-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9c8a9c384a60c80731b278e7fd18764364c8817f4995b13a175d636f967823d1"}, + {file = "statsmodels-0.14.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:557df3a870a57248df744fdfcc444ecbc5bdbf1c042b8a8b5d8e3e797830dc2a"}, + {file = "statsmodels-0.14.5-cp314-cp314-win_amd64.whl", hash = "sha256:95af7a9c4689d514f4341478b891f867766f3da297f514b8c4adf08f4fa61d03"}, + {file = "statsmodels-0.14.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b23b8f646dd78ef5e8d775d879208f8dc0a73418b41c16acac37361ff9ab7738"}, + {file = "statsmodels-0.14.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4e5e26b21d2920905764fb0860957d08b5ba2fae4466ef41b1f7c53ecf9fc7fa"}, + {file = 
"statsmodels-0.14.5-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4a060c7e0841c549c8ce2825fd6687e6757e305d9c11c9a73f6c5a0ce849bb69"}, + {file = "statsmodels-0.14.5-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:56da20def5350d676388213a330fd40ed15d0e8dd0bb1b92c0e4b0f2a65d3ad2"}, + {file = "statsmodels-0.14.5-cp39-cp39-win_amd64.whl", hash = "sha256:afb37ca1d70d99b5fd876e8574ea46372298ae0f0a8b17e4cf0a9afd2373ae62"}, + {file = "statsmodels-0.14.5.tar.gz", hash = "sha256:de260e58cccfd2ceddf835b55a357233d6ca853a1aa4f90f7553a52cc71c6ddf"}, ] [package.dependencies] @@ -5623,29 +6720,28 @@ scipy = ">=1.8,<1.9.2 || >1.9.2" [package.extras] build = ["cython (>=3.0.10)"] -develop = ["colorama", "cython (>=3.0.10)", "cython (>=3.0.10,<4)", "flake8", "isort", "joblib", "matplotlib (>=3)", "pytest (>=7.3.0,<8)", "pytest-cov", "pytest-randomly", "pytest-xdist", "pywinpty", "setuptools-scm[toml] (>=8.0,<9.0)"] -docs = ["ipykernel", "jupyter-client", "matplotlib", "nbconvert", "nbformat", "numpydoc", "pandas-datareader", "sphinx"] +develop = ["colorama", "cython (>=3.0.10)", "cython (>=3.0.10,<4)", "flake8", "isort", "jinja2", "joblib", "matplotlib (>=3)", "pytest (>=7.3.0,<8)", "pytest-cov", "pytest-randomly", "pytest-xdist", "pywinpty ; os_name == \"nt\"", "setuptools_scm[toml] (>=8.0,<9.0)"] +docs = ["ipykernel", "jupyter_client", "matplotlib", "nbconvert", "nbformat", "numpydoc", "pandas-datareader", "sphinx"] [[package]] name = "stevedore" -version = "5.4.1" +version = "5.5.0" description = "Manage dynamic plugins for Python applications" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "stevedore-5.4.1-py3-none-any.whl", hash = "sha256:d10a31c7b86cba16c1f6e8d15416955fc797052351a56af15e608ad20811fcfe"}, - {file = "stevedore-5.4.1.tar.gz", hash = "sha256:3135b5ae50fe12816ef291baff420acb727fcd356106e3e9cbfa9e5985cd6f4b"}, + {file = "stevedore-5.5.0-py3-none-any.whl", hash = "sha256:18363d4d268181e8e8452e71a38cd77630f345b2ef6b4a8d5614dac5ee0d18cf"}, + {file = "stevedore-5.5.0.tar.gz", hash = "sha256:d31496a4f4df9825e1a1e4f1f74d19abb0154aff311c3b376fcc89dae8fccd73"}, ] -[package.dependencies] -pbr = ">=2.0.0" - [[package]] name = "sympy" version = "1.14.0" description = "Computer algebra system (CAS) in Python" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5"}, {file = "sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517"}, @@ -5663,6 +6759,7 @@ version = "9.1.2" description = "Retry code until it succeeds" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138"}, {file = "tenacity-9.1.2.tar.gz", hash = "sha256:1169d376c297e7de388d18b4481760d478b0e99a777cad3a9c86e556f4b697cb"}, @@ -5678,6 +6775,7 @@ version = "0.18.1" description = "Tornado websocket backend for the Xterm.js Javascript terminal emulator library." 
optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "terminado-0.18.1-py3-none-any.whl", hash = "sha256:a4468e1b37bb318f8a86514f65814e1afc977cf29b3992a4500d9dd305dcceb0"}, {file = "terminado-0.18.1.tar.gz", hash = "sha256:de09f2c4b85de4765f7714688fff57d3e75bad1f909b589fde880460c753fd2e"}, @@ -5699,6 +6797,7 @@ version = "1.3" description = "The most basic Text::Unidecode port" optional = false python-versions = "*" +groups = ["dev"] files = [ {file = "text-unidecode-1.3.tar.gz", hash = "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93"}, {file = "text_unidecode-1.3-py2.py3-none-any.whl", hash = "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8"}, @@ -5710,6 +6809,7 @@ version = "3.6.0" description = "threadpoolctl" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb"}, {file = "threadpoolctl-3.6.0.tar.gz", hash = "sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e"}, @@ -5717,42 +6817,69 @@ files = [ [[package]] name = "tiktoken" -version = "0.9.0" +version = "0.12.0" description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" optional = false python-versions = ">=3.9" -files = [ - {file = "tiktoken-0.9.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:586c16358138b96ea804c034b8acf3f5d3f0258bd2bc3b0227af4af5d622e382"}, - {file = "tiktoken-0.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d9c59ccc528c6c5dd51820b3474402f69d9a9e1d656226848ad68a8d5b2e5108"}, - {file = "tiktoken-0.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0968d5beeafbca2a72c595e8385a1a1f8af58feaebb02b227229b69ca5357fd"}, - {file = "tiktoken-0.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:92a5fb085a6a3b7350b8fc838baf493317ca0e17bd95e8642f95fc69ecfed1de"}, - {file = "tiktoken-0.9.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:15a2752dea63d93b0332fb0ddb05dd909371ededa145fe6a3242f46724fa7990"}, - {file = "tiktoken-0.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:26113fec3bd7a352e4b33dbaf1bd8948de2507e30bd95a44e2b1156647bc01b4"}, - {file = "tiktoken-0.9.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:f32cc56168eac4851109e9b5d327637f15fd662aa30dd79f964b7c39fbadd26e"}, - {file = "tiktoken-0.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:45556bc41241e5294063508caf901bf92ba52d8ef9222023f83d2483a3055348"}, - {file = "tiktoken-0.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03935988a91d6d3216e2ec7c645afbb3d870b37bcb67ada1943ec48678e7ee33"}, - {file = "tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b3d80aad8d2c6b9238fc1a5524542087c52b860b10cbf952429ffb714bc1136"}, - {file = "tiktoken-0.9.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b2a21133be05dc116b1d0372af051cd2c6aa1d2188250c9b553f9fa49301b336"}, - {file = "tiktoken-0.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:11a20e67fdf58b0e2dea7b8654a288e481bb4fc0289d3ad21291f8d0849915fb"}, - {file = "tiktoken-0.9.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:e88f121c1c22b726649ce67c089b90ddda8b9662545a8aeb03cfef15967ddd03"}, - {file = "tiktoken-0.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a6600660f2f72369acb13a57fb3e212434ed38b045fd8cc6cdd74947b4b5d210"}, - {file = 
"tiktoken-0.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:95e811743b5dfa74f4b227927ed86cbc57cad4df859cb3b643be797914e41794"}, - {file = "tiktoken-0.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99376e1370d59bcf6935c933cb9ba64adc29033b7e73f5f7569f3aad86552b22"}, - {file = "tiktoken-0.9.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:badb947c32739fb6ddde173e14885fb3de4d32ab9d8c591cbd013c22b4c31dd2"}, - {file = "tiktoken-0.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:5a62d7a25225bafed786a524c1b9f0910a1128f4232615bf3f8257a73aaa3b16"}, - {file = "tiktoken-0.9.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2b0e8e05a26eda1249e824156d537015480af7ae222ccb798e5234ae0285dbdb"}, - {file = "tiktoken-0.9.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:27d457f096f87685195eea0165a1807fae87b97b2161fe8c9b1df5bd74ca6f63"}, - {file = "tiktoken-0.9.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cf8ded49cddf825390e36dd1ad35cd49589e8161fdcb52aa25f0583e90a3e01"}, - {file = "tiktoken-0.9.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc156cb314119a8bb9748257a2eaebd5cc0753b6cb491d26694ed42fc7cb3139"}, - {file = "tiktoken-0.9.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:cd69372e8c9dd761f0ab873112aba55a0e3e506332dd9f7522ca466e817b1b7a"}, - {file = "tiktoken-0.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:5ea0edb6f83dc56d794723286215918c1cde03712cbbafa0348b33448faf5b95"}, - {file = "tiktoken-0.9.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:c6386ca815e7d96ef5b4ac61e0048cd32ca5a92d5781255e13b31381d28667dc"}, - {file = "tiktoken-0.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:75f6d5db5bc2c6274b674ceab1615c1778e6416b14705827d19b40e6355f03e0"}, - {file = "tiktoken-0.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e15b16f61e6f4625a57a36496d28dd182a8a60ec20a534c5343ba3cafa156ac7"}, - {file = "tiktoken-0.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ebcec91babf21297022882344c3f7d9eed855931466c3311b1ad6b64befb3df"}, - {file = "tiktoken-0.9.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:e5fd49e7799579240f03913447c0cdfa1129625ebd5ac440787afc4345990427"}, - {file = "tiktoken-0.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:26242ca9dc8b58e875ff4ca078b9a94d2f0813e6a535dcd2205df5d49d927cc7"}, - {file = "tiktoken-0.9.0.tar.gz", hash = "sha256:d02a5ca6a938e0490e1ff957bc48c8b078c88cb83977be1625b1fd8aac792c5d"}, +groups = ["main"] +files = [ + {file = "tiktoken-0.12.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3de02f5a491cfd179aec916eddb70331814bd6bf764075d39e21d5862e533970"}, + {file = "tiktoken-0.12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b6cfb6d9b7b54d20af21a912bfe63a2727d9cfa8fbda642fd8322c70340aad16"}, + {file = "tiktoken-0.12.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:cde24cdb1b8a08368f709124f15b36ab5524aac5fa830cc3fdce9c03d4fb8030"}, + {file = "tiktoken-0.12.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:6de0da39f605992649b9cfa6f84071e3f9ef2cec458d08c5feb1b6f0ff62e134"}, + {file = "tiktoken-0.12.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:6faa0534e0eefbcafaccb75927a4a380463a2eaa7e26000f0173b920e98b720a"}, + {file = "tiktoken-0.12.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:82991e04fc860afb933efb63957affc7ad54f83e2216fe7d319007dab1ba5892"}, + {file = "tiktoken-0.12.0-cp310-cp310-win_amd64.whl", hash = 
"sha256:6fb2995b487c2e31acf0a9e17647e3b242235a20832642bb7a9d1a181c0c1bb1"}, + {file = "tiktoken-0.12.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6e227c7f96925003487c33b1b32265fad2fbcec2b7cf4817afb76d416f40f6bb"}, + {file = "tiktoken-0.12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c06cf0fcc24c2cb2adb5e185c7082a82cba29c17575e828518c2f11a01f445aa"}, + {file = "tiktoken-0.12.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:f18f249b041851954217e9fd8e5c00b024ab2315ffda5ed77665a05fa91f42dc"}, + {file = "tiktoken-0.12.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:47a5bc270b8c3db00bb46ece01ef34ad050e364b51d406b6f9730b64ac28eded"}, + {file = "tiktoken-0.12.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:508fa71810c0efdcd1b898fda574889ee62852989f7c1667414736bcb2b9a4bd"}, + {file = "tiktoken-0.12.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a1af81a6c44f008cba48494089dd98cccb8b313f55e961a52f5b222d1e507967"}, + {file = "tiktoken-0.12.0-cp311-cp311-win_amd64.whl", hash = "sha256:3e68e3e593637b53e56f7237be560f7a394451cb8c11079755e80ae64b9e6def"}, + {file = "tiktoken-0.12.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b97f74aca0d78a1ff21b8cd9e9925714c15a9236d6ceacf5c7327c117e6e21e8"}, + {file = "tiktoken-0.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2b90f5ad190a4bb7c3eb30c5fa32e1e182ca1ca79f05e49b448438c3e225a49b"}, + {file = "tiktoken-0.12.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:65b26c7a780e2139e73acc193e5c63ac754021f160df919add909c1492c0fb37"}, + {file = "tiktoken-0.12.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:edde1ec917dfd21c1f2f8046b86348b0f54a2c0547f68149d8600859598769ad"}, + {file = "tiktoken-0.12.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:35a2f8ddd3824608b3d650a000c1ef71f730d0c56486845705a8248da00f9fe5"}, + {file = "tiktoken-0.12.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83d16643edb7fa2c99eff2ab7733508aae1eebb03d5dfc46f5565862810f24e3"}, + {file = "tiktoken-0.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffc5288f34a8bc02e1ea7047b8d041104791d2ddbf42d1e5fa07822cbffe16bd"}, + {file = "tiktoken-0.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:775c2c55de2310cc1bc9a3ad8826761cbdc87770e586fd7b6da7d4589e13dab3"}, + {file = "tiktoken-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a01b12f69052fbe4b080a2cfb867c4de12c704b56178edf1d1d7b273561db160"}, + {file = "tiktoken-0.12.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:01d99484dc93b129cd0964f9d34eee953f2737301f18b3c7257bf368d7615baa"}, + {file = "tiktoken-0.12.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:4a1a4fcd021f022bfc81904a911d3df0f6543b9e7627b51411da75ff2fe7a1be"}, + {file = "tiktoken-0.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:981a81e39812d57031efdc9ec59fa32b2a5a5524d20d4776574c4b4bd2e9014a"}, + {file = "tiktoken-0.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9baf52f84a3f42eef3ff4e754a0db79a13a27921b457ca9832cf944c6be4f8f3"}, + {file = "tiktoken-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:b8a0cd0c789a61f31bf44851defbd609e8dd1e2c8589c614cc1060940ef1f697"}, + {file = "tiktoken-0.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d5f89ea5680066b68bcb797ae85219c72916c922ef0fcdd3480c7d2315ffff16"}, + {file = "tiktoken-0.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b4e7ed1c6a7a8a60a3230965bdedba8cc58f68926b835e519341413370e0399a"}, + {file = "tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = 
"sha256:fc530a28591a2d74bce821d10b418b26a094bf33839e69042a6e86ddb7a7fb27"}, + {file = "tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:06a9f4f49884139013b138920a4c393aa6556b2f8f536345f11819389c703ebb"}, + {file = "tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:04f0e6a985d95913cabc96a741c5ffec525a2c72e9df086ff17ebe35985c800e"}, + {file = "tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0ee8f9ae00c41770b5f9b0bb1235474768884ae157de3beb5439ca0fd70f3e25"}, + {file = "tiktoken-0.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:dc2dd125a62cb2b3d858484d6c614d136b5b848976794edfb63688d539b8b93f"}, + {file = "tiktoken-0.12.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a90388128df3b3abeb2bfd1895b0681412a8d7dc644142519e6f0a97c2111646"}, + {file = "tiktoken-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:da900aa0ad52247d8794e307d6446bd3cdea8e192769b56276695d34d2c9aa88"}, + {file = "tiktoken-0.12.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:285ba9d73ea0d6171e7f9407039a290ca77efcdb026be7769dccc01d2c8d7fff"}, + {file = "tiktoken-0.12.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:d186a5c60c6a0213f04a7a802264083dea1bbde92a2d4c7069e1a56630aef830"}, + {file = "tiktoken-0.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:604831189bd05480f2b885ecd2d1986dc7686f609de48208ebbbddeea071fc0b"}, + {file = "tiktoken-0.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8f317e8530bb3a222547b85a58583238c8f74fd7a7408305f9f63246d1a0958b"}, + {file = "tiktoken-0.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:399c3dd672a6406719d84442299a490420b458c44d3ae65516302a99675888f3"}, + {file = "tiktoken-0.12.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c2c714c72bc00a38ca969dae79e8266ddec999c7ceccd603cc4f0d04ccd76365"}, + {file = "tiktoken-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cbb9a3ba275165a2cb0f9a83f5d7025afe6b9d0ab01a22b50f0e74fee2ad253e"}, + {file = "tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:dfdfaa5ffff8993a3af94d1125870b1d27aed7cb97aa7eb8c1cefdbc87dbee63"}, + {file = "tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:584c3ad3d0c74f5269906eb8a659c8bfc6144a52895d9261cdaf90a0ae5f4de0"}, + {file = "tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:54c891b416a0e36b8e2045b12b33dd66fb34a4fe7965565f1b482da50da3e86a"}, + {file = "tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5edb8743b88d5be814b1a8a8854494719080c28faaa1ccbef02e87354fe71ef0"}, + {file = "tiktoken-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f61c0aea5565ac82e2ec50a05e02a6c44734e91b51c10510b084ea1b8e633a71"}, + {file = "tiktoken-0.12.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:d51d75a5bffbf26f86554d28e78bfb921eae998edc2675650fd04c7e1f0cdc1e"}, + {file = "tiktoken-0.12.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:09eb4eae62ae7e4c62364d9ec3a57c62eea707ac9a2b2c5d6bd05de6724ea179"}, + {file = "tiktoken-0.12.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:df37684ace87d10895acb44b7f447d4700349b12197a526da0d4a4149fde074c"}, + {file = "tiktoken-0.12.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:4c9614597ac94bb294544345ad8cf30dac2129c05e2db8dc53e082f355857af7"}, + {file = "tiktoken-0.12.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:20cf97135c9a50de0b157879c3c4accbb29116bcf001283d26e073ff3b345946"}, + {file = "tiktoken-0.12.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = 
"sha256:15d875454bbaa3728be39880ddd11a5a2a9e548c29418b41e8fd8a767172b5ec"}, + {file = "tiktoken-0.12.0-cp39-cp39-win_amd64.whl", hash = "sha256:2cff3688ba3c639ebe816f8d58ffbbb0aa7433e23e08ab1cade5d175fc973fb3"}, + {file = "tiktoken-0.12.0.tar.gz", hash = "sha256:b18ba7ee2b093863978fcb14f74b3707cdc8d4d4d3836853ce7ec60772139931"}, ] [package.dependencies] @@ -5768,6 +6895,7 @@ version = "1.4.0" description = "A tiny CSS parser" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "tinycss2-1.4.0-py3-none-any.whl", hash = "sha256:3a49cf47b7675da0b15d0c6e1df8df4ebd96e9394bb905a5775adb0d884c5289"}, {file = "tinycss2-1.4.0.tar.gz", hash = "sha256:10c0972f6fc0fbee87c3edb76549357415e94548c1ae10ebccdea16fb404a9b7"}, @@ -5782,26 +6910,27 @@ test = ["pytest", "ruff"] [[package]] name = "tokenizers" -version = "0.21.1" +version = "0.21.4" description = "" optional = false python-versions = ">=3.9" -files = [ - {file = "tokenizers-0.21.1-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:e78e413e9e668ad790a29456e677d9d3aa50a9ad311a40905d6861ba7692cf41"}, - {file = "tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:cd51cd0a91ecc801633829fcd1fda9cf8682ed3477c6243b9a095539de4aecf3"}, - {file = "tokenizers-0.21.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28da6b72d4fb14ee200a1bd386ff74ade8992d7f725f2bde2c495a9a98cf4d9f"}, - {file = "tokenizers-0.21.1-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:34d8cfde551c9916cb92014e040806122295a6800914bab5865deb85623931cf"}, - {file = "tokenizers-0.21.1-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aaa852d23e125b73d283c98f007e06d4595732104b65402f46e8ef24b588d9f8"}, - {file = "tokenizers-0.21.1-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a21a15d5c8e603331b8a59548bbe113564136dc0f5ad8306dd5033459a226da0"}, - {file = "tokenizers-0.21.1-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2fdbd4c067c60a0ac7eca14b6bd18a5bebace54eb757c706b47ea93204f7a37c"}, - {file = "tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dd9a0061e403546f7377df940e866c3e678d7d4e9643d0461ea442b4f89e61a"}, - {file = "tokenizers-0.21.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:db9484aeb2e200c43b915a1a0150ea885e35f357a5a8fabf7373af333dcc8dbf"}, - {file = "tokenizers-0.21.1-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:ed248ab5279e601a30a4d67bdb897ecbe955a50f1e7bb62bd99f07dd11c2f5b6"}, - {file = "tokenizers-0.21.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:9ac78b12e541d4ce67b4dfd970e44c060a2147b9b2a21f509566d556a509c67d"}, - {file = "tokenizers-0.21.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:e5a69c1a4496b81a5ee5d2c1f3f7fbdf95e90a0196101b0ee89ed9956b8a168f"}, - {file = "tokenizers-0.21.1-cp39-abi3-win32.whl", hash = "sha256:1039a3a5734944e09de1d48761ade94e00d0fa760c0e0551151d4dd851ba63e3"}, - {file = "tokenizers-0.21.1-cp39-abi3-win_amd64.whl", hash = "sha256:0f0dcbcc9f6e13e675a66d7a5f2f225a736745ce484c1a4e07476a89ccdad382"}, - {file = "tokenizers-0.21.1.tar.gz", hash = "sha256:a1bb04dc5b448985f86ecd4b05407f5a8d97cb2c0532199b2a302a604a0165ab"}, +groups = ["main"] +files = [ + {file = "tokenizers-0.21.4-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:2ccc10a7c3bcefe0f242867dc914fc1226ee44321eb618cfe3019b5df3400133"}, + {file = "tokenizers-0.21.4-cp39-abi3-macosx_11_0_arm64.whl", hash = 
"sha256:5e2f601a8e0cd5be5cc7506b20a79112370b9b3e9cb5f13f68ab11acd6ca7d60"}, + {file = "tokenizers-0.21.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:39b376f5a1aee67b4d29032ee85511bbd1b99007ec735f7f35c8a2eb104eade5"}, + {file = "tokenizers-0.21.4-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2107ad649e2cda4488d41dfd031469e9da3fcbfd6183e74e4958fa729ffbf9c6"}, + {file = "tokenizers-0.21.4-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c73012da95afafdf235ba80047699df4384fdc481527448a078ffd00e45a7d9"}, + {file = "tokenizers-0.21.4-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f23186c40395fc390d27f519679a58023f368a0aad234af145e0f39ad1212732"}, + {file = "tokenizers-0.21.4-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cc88bb34e23a54cc42713d6d98af5f1bf79c07653d24fe984d2d695ba2c922a2"}, + {file = "tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51b7eabb104f46c1c50b486520555715457ae833d5aee9ff6ae853d1130506ff"}, + {file = "tokenizers-0.21.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:714b05b2e1af1288bd1bc56ce496c4cebb64a20d158ee802887757791191e6e2"}, + {file = "tokenizers-0.21.4-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:1340ff877ceedfa937544b7d79f5b7becf33a4cfb58f89b3b49927004ef66f78"}, + {file = "tokenizers-0.21.4-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:3c1f4317576e465ac9ef0d165b247825a2a4078bcd01cba6b54b867bdf9fdd8b"}, + {file = "tokenizers-0.21.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:c212aa4e45ec0bb5274b16b6f31dd3f1c41944025c2358faaa5782c754e84c24"}, + {file = "tokenizers-0.21.4-cp39-abi3-win32.whl", hash = "sha256:6c42a930bc5f4c47f4ea775c91de47d27910881902b0f20e4990ebe045a415d0"}, + {file = "tokenizers-0.21.4-cp39-abi3-win_amd64.whl", hash = "sha256:475d807a5c3eb72c59ad9b5fcdb254f6e17f53dfcbb9903233b0dfa9c943b597"}, + {file = "tokenizers-0.21.4.tar.gz", hash = "sha256:fa23f85fbc9a02ec5c6978da172cdcbac23498c3ca9f3645c5c68740ac007880"}, ] [package.dependencies] @@ -5814,110 +6943,148 @@ testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests", "ruff"] [[package]] name = "torch" -version = "2.7.1" +version = "2.9.1" description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" optional = false -python-versions = ">=3.9.0" -files = [ - {file = "torch-2.7.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a103b5d782af5bd119b81dbcc7ffc6fa09904c423ff8db397a1e6ea8fd71508f"}, - {file = "torch-2.7.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:fe955951bdf32d182ee8ead6c3186ad54781492bf03d547d31771a01b3d6fb7d"}, - {file = "torch-2.7.1-cp310-cp310-win_amd64.whl", hash = "sha256:885453d6fba67d9991132143bf7fa06b79b24352f4506fd4d10b309f53454162"}, - {file = "torch-2.7.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:d72acfdb86cee2a32c0ce0101606f3758f0d8bb5f8f31e7920dc2809e963aa7c"}, - {file = "torch-2.7.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:236f501f2e383f1cb861337bdf057712182f910f10aeaf509065d54d339e49b2"}, - {file = "torch-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:06eea61f859436622e78dd0cdd51dbc8f8c6d76917a9cf0555a333f9eac31ec1"}, - {file = "torch-2.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:8273145a2e0a3c6f9fd2ac36762d6ee89c26d430e612b95a99885df083b04e52"}, - {file = "torch-2.7.1-cp311-none-macosx_11_0_arm64.whl", hash = 
"sha256:aea4fc1bf433d12843eb2c6b2204861f43d8364597697074c8d38ae2507f8730"}, - {file = "torch-2.7.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:27ea1e518df4c9de73af7e8a720770f3628e7f667280bce2be7a16292697e3fa"}, - {file = "torch-2.7.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:c33360cfc2edd976c2633b3b66c769bdcbbf0e0b6550606d188431c81e7dd1fc"}, - {file = "torch-2.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:d8bf6e1856ddd1807e79dc57e54d3335f2b62e6f316ed13ed3ecfe1fc1df3d8b"}, - {file = "torch-2.7.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:787687087412c4bd68d315e39bc1223f08aae1d16a9e9771d95eabbb04ae98fb"}, - {file = "torch-2.7.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:03563603d931e70722dce0e11999d53aa80a375a3d78e6b39b9f6805ea0a8d28"}, - {file = "torch-2.7.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:d632f5417b6980f61404a125b999ca6ebd0b8b4bbdbb5fbbba44374ab619a412"}, - {file = "torch-2.7.1-cp313-cp313-win_amd64.whl", hash = "sha256:23660443e13995ee93e3d844786701ea4ca69f337027b05182f5ba053ce43b38"}, - {file = "torch-2.7.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:0da4f4dba9f65d0d203794e619fe7ca3247a55ffdcbd17ae8fb83c8b2dc9b585"}, - {file = "torch-2.7.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:e08d7e6f21a617fe38eeb46dd2213ded43f27c072e9165dc27300c9ef9570934"}, - {file = "torch-2.7.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:30207f672328a42df4f2174b8f426f354b2baa0b7cca3a0adb3d6ab5daf00dc8"}, - {file = "torch-2.7.1-cp313-cp313t-win_amd64.whl", hash = "sha256:79042feca1c634aaf6603fe6feea8c6b30dfa140a6bbc0b973e2260c7e79a22e"}, - {file = "torch-2.7.1-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:988b0cbc4333618a1056d2ebad9eb10089637b659eb645434d0809d8d937b946"}, - {file = "torch-2.7.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:e0d81e9a12764b6f3879a866607c8ae93113cbcad57ce01ebde63eb48a576369"}, - {file = "torch-2.7.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:8394833c44484547ed4a47162318337b88c97acdb3273d85ea06e03ffff44998"}, - {file = "torch-2.7.1-cp39-cp39-win_amd64.whl", hash = "sha256:df41989d9300e6e3c19ec9f56f856187a6ef060c3662fe54f4b6baf1fc90bd19"}, - {file = "torch-2.7.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:a737b5edd1c44a5c1ece2e9f3d00df9d1b3fb9541138bee56d83d38293fb6c9d"}, +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "torch-2.9.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:1cc208435f6c379f9b8fdfd5ceb5be1e3b72a6bdf1cb46c0d2812aa73472db9e"}, + {file = "torch-2.9.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:9fd35c68b3679378c11f5eb73220fdcb4e6f4592295277fbb657d31fd053237c"}, + {file = "torch-2.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:2af70e3be4a13becba4655d6cc07dcfec7ae844db6ac38d6c1dafeb245d17d65"}, + {file = "torch-2.9.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:a83b0e84cc375e3318a808d032510dde99d696a85fe9473fc8575612b63ae951"}, + {file = "torch-2.9.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:62b3fd888277946918cba4478cf849303da5359f0fb4e3bfb86b0533ba2eaf8d"}, + {file = "torch-2.9.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:d033ff0ac3f5400df862a51bdde9bad83561f3739ea0046e68f5401ebfa67c1b"}, + {file = "torch-2.9.1-cp311-cp311-win_amd64.whl", hash = "sha256:0d06b30a9207b7c3516a9e0102114024755a07045f0c1d2f2a56b1819ac06bcb"}, + {file = "torch-2.9.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:52347912d868653e1528b47cafaf79b285b98be3f4f35d5955389b1b95224475"}, + {file = 
"torch-2.9.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:da5f6f4d7f4940a173e5572791af238cb0b9e21b1aab592bd8b26da4c99f1cd6"}, + {file = "torch-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:27331cd902fb4322252657f3902adf1c4f6acad9dcad81d8df3ae14c7c4f07c4"}, + {file = "torch-2.9.1-cp312-cp312-win_amd64.whl", hash = "sha256:81a285002d7b8cfd3fdf1b98aa8df138d41f1a8334fd9ea37511517cedf43083"}, + {file = "torch-2.9.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:c0d25d1d8e531b8343bea0ed811d5d528958f1dcbd37e7245bc686273177ad7e"}, + {file = "torch-2.9.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:c29455d2b910b98738131990394da3e50eea8291dfeb4b12de71ecf1fdeb21cb"}, + {file = "torch-2.9.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:524de44cd13931208ba2c4bde9ec7741fd4ae6bfd06409a604fc32f6520c2bc9"}, + {file = "torch-2.9.1-cp313-cp313-win_amd64.whl", hash = "sha256:545844cc16b3f91e08ce3b40e9c2d77012dd33a48d505aed34b7740ed627a1b2"}, + {file = "torch-2.9.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5be4bf7496f1e3ffb1dd44b672adb1ac3f081f204c5ca81eba6442f5f634df8e"}, + {file = "torch-2.9.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:30a3e170a84894f3652434b56d59a64a2c11366b0ed5776fab33c2439396bf9a"}, + {file = "torch-2.9.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:8301a7b431e51764629208d0edaa4f9e4c33e6df0f2f90b90e261d623df6a4e2"}, + {file = "torch-2.9.1-cp313-cp313t-win_amd64.whl", hash = "sha256:2e1c42c0ae92bf803a4b2409fdfed85e30f9027a66887f5e7dcdbc014c7531db"}, + {file = "torch-2.9.1-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:2c14b3da5df416cf9cb5efab83aa3056f5b8cd8620b8fde81b4987ecab730587"}, + {file = "torch-2.9.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1edee27a7c9897f4e0b7c14cfc2f3008c571921134522d5b9b5ec4ebbc69041a"}, + {file = "torch-2.9.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:19d144d6b3e29921f1fc70503e9f2fc572cde6a5115c0c0de2f7ca8b1483e8b6"}, + {file = "torch-2.9.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:c432d04376f6d9767a9852ea0def7b47a7bbc8e7af3b16ac9cf9ce02b12851c9"}, + {file = "torch-2.9.1-cp314-cp314-win_amd64.whl", hash = "sha256:d187566a2cdc726fc80138c3cdb260970fab1c27e99f85452721f7759bbd554d"}, + {file = "torch-2.9.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cb10896a1f7fedaddbccc2017ce6ca9ecaaf990f0973bdfcf405439750118d2c"}, + {file = "torch-2.9.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:0a2bd769944991c74acf0c4ef23603b9c777fdf7637f115605a4b2d8023110c7"}, + {file = "torch-2.9.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:07c8a9660bc9414c39cac530ac83b1fb1b679d7155824144a40a54f4a47bfa73"}, + {file = "torch-2.9.1-cp314-cp314t-win_amd64.whl", hash = "sha256:c88d3299ddeb2b35dcc31753305612db485ab6f1823e37fb29451c8b2732b87e"}, ] [package.dependencies] filelock = "*" -fsspec = "*" +fsspec = ">=0.8.5" jinja2 = "*" -networkx = "*" -nvidia-cublas-cu12 = {version = "12.6.4.1", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cuda-cupti-cu12 = {version = "12.6.80", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cuda-nvrtc-cu12 = {version = "12.6.77", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cuda-runtime-cu12 = {version = "12.6.77", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cudnn-cu12 = {version = "9.5.1.17", markers = "platform_system == \"Linux\" and platform_machine == 
\"x86_64\""} -nvidia-cufft-cu12 = {version = "11.3.0.4", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cufile-cu12 = {version = "1.11.1.6", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-curand-cu12 = {version = "10.3.7.77", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cusolver-cu12 = {version = "11.7.1.2", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cusparse-cu12 = {version = "12.5.4.2", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cusparselt-cu12 = {version = "0.6.3", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-nccl-cu12 = {version = "2.26.2", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-nvjitlink-cu12 = {version = "12.6.85", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-nvtx-cu12 = {version = "12.6.77", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +networkx = ">=2.5.1" +nvidia-cublas-cu12 = {version = "12.8.4.1", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cuda-cupti-cu12 = {version = "12.8.90", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cuda-nvrtc-cu12 = {version = "12.8.93", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cuda-runtime-cu12 = {version = "12.8.90", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cudnn-cu12 = {version = "9.10.2.21", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cufft-cu12 = {version = "11.3.3.83", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cufile-cu12 = {version = "1.13.1.3", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-curand-cu12 = {version = "10.3.9.90", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cusolver-cu12 = {version = "11.7.3.90", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cusparse-cu12 = {version = "12.5.8.93", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cusparselt-cu12 = {version = "0.7.1", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-nccl-cu12 = {version = "2.27.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-nvjitlink-cu12 = {version = "12.8.93", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-nvshmem-cu12 = {version = "3.3.20", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-nvtx-cu12 = {version = "12.8.90", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} setuptools = {version = "*", markers = "python_version >= \"3.12\""} sympy = ">=1.13.3" -triton = {version = "3.3.1", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +triton = {version = "3.5.1", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} typing-extensions = ">=4.10.0" [package.extras] opt-einsum = ["opt-einsum (>=3.3)"] optree = ["optree (>=0.13.0)"] +pyyaml = ["pyyaml"] + +[[package]] +name = "torch-geometric" +version = "2.7.0" +description = "Graph Neural Network Library for PyTorch" +optional = false 
+python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "torch_geometric-2.7.0-py3-none-any.whl", hash = "sha256:6e0cd3ad824d484651ef5d308fc66c687bfcf5ba040d56d1e0fe0f81f365e292"}, + {file = "torch_geometric-2.7.0.tar.gz", hash = "sha256:f9099e4aece1a9f618c84dbaac33a77f43139736698c7e8bddf3301ef1f2e8d4"}, +] + +[package.dependencies] +aiohttp = "*" +fsspec = "*" +jinja2 = "*" +numpy = "*" +psutil = ">=5.8.0" +pyparsing = "*" +requests = "*" +tqdm = "*" +xxhash = "*" + +[package.extras] +benchmark = ["matplotlib", "networkx", "pandas", "protobuf (<4.21)", "wandb"] +dev = ["ipython", "matplotlib-inline", "pre-commit", "torch_geometric[test]"] +full = ["ase", "captum (<0.7.0)", "graphviz", "h5py", "matplotlib", "networkx", "numba (<0.60.0)", "opt_einsum", "pandas", "pynndescent", "pytorch-memlab", "rdflib", "rdkit", "scikit-image", "scikit-learn", "scipy", "statsmodels", "sympy", "tabulate", "torch_geometric[graphgym,modelhub]", "torchmetrics", "trimesh"] +graphgym = ["protobuf (<4.21)", "pytorch-lightning", "yacs"] +modelhub = ["huggingface_hub"] +rag = ["accelerate", "datasets", "pandas", "pcst_fast", "sentencepiece", "torchmetrics", "transformers"] +test = ["onnx", "onnxruntime", "onnxscript", "pytest", "pytest-cov"] [[package]] name = "tornado" -version = "6.5.1" +version = "6.5.2" description = "Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed." optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ - {file = "tornado-6.5.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:d50065ba7fd11d3bd41bcad0825227cc9a95154bad83239357094c36708001f7"}, - {file = "tornado-6.5.1-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:9e9ca370f717997cb85606d074b0e5b247282cf5e2e1611568b8821afe0342d6"}, - {file = "tornado-6.5.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b77e9dfa7ed69754a54c89d82ef746398be82f749df69c4d3abe75c4d1ff4888"}, - {file = "tornado-6.5.1-cp39-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:253b76040ee3bab8bcf7ba9feb136436a3787208717a1fb9f2c16b744fba7331"}, - {file = "tornado-6.5.1-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:308473f4cc5a76227157cdf904de33ac268af770b2c5f05ca6c1161d82fdd95e"}, - {file = "tornado-6.5.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:caec6314ce8a81cf69bd89909f4b633b9f523834dc1a352021775d45e51d9401"}, - {file = "tornado-6.5.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:13ce6e3396c24e2808774741331638ee6c2f50b114b97a55c5b442df65fd9692"}, - {file = "tornado-6.5.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5cae6145f4cdf5ab24744526cc0f55a17d76f02c98f4cff9daa08ae9a217448a"}, - {file = "tornado-6.5.1-cp39-abi3-win32.whl", hash = "sha256:e0a36e1bc684dca10b1aa75a31df8bdfed656831489bc1e6a6ebed05dc1ec365"}, - {file = "tornado-6.5.1-cp39-abi3-win_amd64.whl", hash = "sha256:908e7d64567cecd4c2b458075589a775063453aeb1d2a1853eedb806922f568b"}, - {file = "tornado-6.5.1-cp39-abi3-win_arm64.whl", hash = "sha256:02420a0eb7bf617257b9935e2b754d1b63897525d8a289c9d65690d580b4dcf7"}, - {file = "tornado-6.5.1.tar.gz", hash = "sha256:84ceece391e8eb9b2b95578db65e920d2a61070260594819589609ba9bc6308c"}, + {file = "tornado-6.5.2-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:2436822940d37cde62771cff8774f4f00b3c8024fe482e16ca8387b8a2724db6"}, + {file = "tornado-6.5.2-cp39-abi3-macosx_10_9_x86_64.whl", hash = 
"sha256:583a52c7aa94ee046854ba81d9ebb6c81ec0fd30386d96f7640c96dad45a03ef"}, + {file = "tornado-6.5.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b0fe179f28d597deab2842b86ed4060deec7388f1fd9c1b4a41adf8af058907e"}, + {file = "tornado-6.5.2-cp39-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b186e85d1e3536d69583d2298423744740986018e393d0321df7340e71898882"}, + {file = "tornado-6.5.2-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e792706668c87709709c18b353da1f7662317b563ff69f00bab83595940c7108"}, + {file = "tornado-6.5.2-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:06ceb1300fd70cb20e43b1ad8aaee0266e69e7ced38fa910ad2e03285009ce7c"}, + {file = "tornado-6.5.2-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:74db443e0f5251be86cbf37929f84d8c20c27a355dd452a5cfa2aada0d001ec4"}, + {file = "tornado-6.5.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b5e735ab2889d7ed33b32a459cac490eda71a1ba6857b0118de476ab6c366c04"}, + {file = "tornado-6.5.2-cp39-abi3-win32.whl", hash = "sha256:c6f29e94d9b37a95013bb669616352ddb82e3bfe8326fccee50583caebc8a5f0"}, + {file = "tornado-6.5.2-cp39-abi3-win_amd64.whl", hash = "sha256:e56a5af51cc30dd2cae649429af65ca2f6571da29504a07995175df14c18f35f"}, + {file = "tornado-6.5.2-cp39-abi3-win_arm64.whl", hash = "sha256:d6c33dc3672e3a1f3618eb63b7ef4683a7688e7b9e6e8f0d9aa5726360a004af"}, + {file = "tornado-6.5.2.tar.gz", hash = "sha256:ab53c8f9a0fa351e2c0741284e06c7a45da86afb544133201c5cc8578eb076a0"}, ] [[package]] name = "tox" -version = "4.26.0" +version = "4.32.0" description = "tox is a generic virtualenv management and test command line tool" optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" +groups = ["dev"] files = [ - {file = "tox-4.26.0-py3-none-any.whl", hash = "sha256:75f17aaf09face9b97bd41645028d9f722301e912be8b4c65a3f938024560224"}, - {file = "tox-4.26.0.tar.gz", hash = "sha256:a83b3b67b0159fa58e44e646505079e35a43317a62d2ae94725e0586266faeca"}, + {file = "tox-4.32.0-py3-none-any.whl", hash = "sha256:451e81dc02ba8d1ed20efd52ee409641ae4b5d5830e008af10fe8823ef1bd551"}, + {file = "tox-4.32.0.tar.gz", hash = "sha256:1ad476b5f4d3679455b89a992849ffc3367560bbc7e9495ee8a3963542e7c8ff"}, ] [package.dependencies] -cachetools = ">=5.5.1" +cachetools = ">=6.2" chardet = ">=5.2" colorama = ">=0.4.6" -filelock = ">=3.16.1" -packaging = ">=24.2" -platformdirs = ">=4.3.6" -pluggy = ">=1.5" -pyproject-api = ">=1.8" -virtualenv = ">=20.31" - -[package.extras] -test = ["devpi-process (>=1.0.2)", "pytest (>=8.3.4)", "pytest-mock (>=3.14)"] +filelock = ">=3.20" +packaging = ">=25" +platformdirs = ">=4.5" +pluggy = ">=1.6" +pyproject-api = ">=1.9.1" +virtualenv = ">=20.34" [[package]] name = "tqdm" @@ -5925,6 +7092,7 @@ version = "4.67.1" description = "Fast, Extensible Progress Meter" optional = false python-versions = ">=3.7" +groups = ["main", "dev"] files = [ {file = "tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2"}, {file = "tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2"}, @@ -5946,6 +7114,7 @@ version = "5.14.3" description = "Traitlets Python configuration system" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f"}, {file = 
"traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7"}, @@ -5957,18 +7126,20 @@ test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0, [[package]] name = "transformers" -version = "4.52.4" +version = "4.55.4" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" optional = true python-versions = ">=3.9.0" +groups = ["main"] +markers = "extra == \"transformers\" or extra == \"all\"" files = [ - {file = "transformers-4.52.4-py3-none-any.whl", hash = "sha256:203f5c19416d5877e36e88633943761719538a25d9775977a24fe77a1e5adfc7"}, - {file = "transformers-4.52.4.tar.gz", hash = "sha256:aff3764441c1adc192a08dba49740d3cbbcb72d850586075aed6bd89b98203e6"}, + {file = "transformers-4.55.4-py3-none-any.whl", hash = "sha256:df28f3849665faba4af5106f0db4510323277c4bb595055340544f7e59d06458"}, + {file = "transformers-4.55.4.tar.gz", hash = "sha256:574a30559bc273c7a4585599ff28ab6b676e96dc56ffd2025ecfce2fd0ab915d"}, ] [package.dependencies] filelock = "*" -huggingface-hub = ">=0.30.0,<1.0" +huggingface-hub = ">=0.34.0,<1.0" numpy = ">=1.17" packaging = ">=20.0" pyyaml = ">=5.1" @@ -5980,95 +7151,94 @@ tqdm = ">=4.27" [package.extras] accelerate = ["accelerate (>=0.26.0)"] -all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.4.4,<0.5)", "librosa", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.1,<2.7)", "torchaudio", "torchvision"] +all = ["Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "accelerate (>=0.26.0)", "av", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.6.1,<=0.9)", "librosa", "mistral-common[opencv] (>=1.6.3)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.21,<0.22)", "torch (>=2.1)", "torchaudio", "torchvision"] audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] benchmark = ["optimum-benchmark (>=0.3.0)"] codecarbon = ["codecarbon (>=2.8.1)"] deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"] -deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] -dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "beautifulsoup4", 
"codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.4.4,<0.5)", "libcst", "librosa", "nltk (<=3.8.1)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.1,<2.7)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] -dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.21,<0.22)", "urllib3 (<2.0.0)"] -dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "kernels (>=0.4.4,<0.5)", "libcst", "librosa", "nltk (<=3.8.1)", "num2words", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.1,<2.7)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", 
"faiss-cpu", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "optuna", "parameterized (>=0.9)", "protobuf", "psutil", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] +dev = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "accelerate (>=0.26.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.6.1,<=0.9)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.21,<0.22)", "torch (>=2.1)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)", "urllib3 (<2.0.0)"] +dev-tensorflow = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "onnxconverter-common", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "tf2onnx", "timeout-decorator", "tokenizers (>=0.21,<0.22)", "urllib3 (<2.0.0)"] +dev-torch = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "cookiecutter 
(==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "kenlm", "kernels (>=0.6.1,<=0.9)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.21,<0.22)", "torch (>=2.1)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)", "urllib3 (<2.0.0)"] flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"] flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] ftfy = ["ftfy"] -hf-xet = ["hf-xet"] -hub-kernels = ["kernels (>=0.4.4,<0.5)"] -integrations = ["kernels (>=0.4.4,<0.5)", "optuna", "ray[tune] (>=2.7.0)", "sigopt"] -ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"] +hf-xet = ["hf_xet"] +hub-kernels = ["kernels (>=0.6.1,<=0.9)"] +integrations = ["kernels (>=0.6.1,<=0.9)", "optuna", "ray[tune] (>=2.7.0)", "sigopt"] +ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)"] +mistral-common = ["mistral-common[opencv] (>=1.6.3)"] modelcreation = ["cookiecutter (==1.7.3)"] natten = ["natten (>=0.14.6,<0.15.0)"] num2words = ["num2words"] onnx = ["onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "tf2onnx"] onnxruntime = ["onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"] +open-telemetry = ["opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk"] optuna = ["optuna"] -quality = ["GitPython (<3.1.19)", "datasets (!=2.5.0)", "isort (>=5.5.4)", "libcst", "rich", "ruff (==0.11.2)", "urllib3 (<2.0.0)"] +quality = ["GitPython (<3.1.19)", "datasets (>=2.15.0)", "libcst", "pandas (<2.3.0)", "rich", "ruff (==0.11.2)", "urllib3 (<2.0.0)"] ray = ["ray[tune] (>=2.7.0)"] -retrieval = ["datasets (!=2.5.0)", "faiss-cpu"] +retrieval = ["datasets (>=2.15.0)", "faiss-cpu"] ruff = ["ruff (==0.11.2)"] sagemaker = ["sagemaker (>=2.31.0)"] sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"] -serving = ["fastapi", "pydantic", "starlette", "uvicorn"] +serving = ["accelerate (>=0.26.0)", "fastapi", "openai (>=1.98.0)", "pydantic (>=2)", "starlette", "torch (>=2.1)", "uvicorn"] sigopt = ["sigopt"] sklearn = ["scikit-learn"] speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] -testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "parameterized", "psutil", 
"pydantic", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] +testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "parameterized (>=0.9)", "psutil", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] tf = ["keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"] tf-cpu = ["keras (>2.9,<2.16)", "keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow-cpu (>2.9,<2.16)", "tensorflow-probability (<0.24)", "tensorflow-text (<2.16)", "tf2onnx"] tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] tiktoken = ["blobfile", "tiktoken"] -timm = ["timm (<=1.0.11)"] +timm = ["timm (!=1.0.18,<=1.0.19)"] tokenizers = ["tokenizers (>=0.21,<0.22)"] -torch = ["accelerate (>=0.26.0)", "torch (>=2.1,<2.7)"] +torch = ["accelerate (>=0.26.0)", "torch (>=2.1)"] torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"] -torchhub = ["filelock", "huggingface-hub (>=0.30.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch (>=2.1,<2.7)", "tqdm (>=4.27)"] +torchhub = ["filelock", "huggingface-hub (>=0.34.0,<1.0)", "importlib_metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch (>=2.1)", "tqdm (>=4.27)"] video = ["av"] vision = ["Pillow (>=10.0.1,<=15.0)"] [[package]] name = "triton" -version = "3.3.1" +version = "3.5.1" description = "A language and compiler for custom Deep Learning operations" optional = false -python-versions = "*" -files = [ - {file = "triton-3.3.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b74db445b1c562844d3cfad6e9679c72e93fdfb1a90a24052b03bb5c49d1242e"}, - {file = "triton-3.3.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b31e3aa26f8cb3cc5bf4e187bf737cbacf17311e1112b781d4a059353dfd731b"}, - {file = "triton-3.3.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9999e83aba21e1a78c1f36f21bce621b77bcaa530277a50484a7cb4a822f6e43"}, - {file = "triton-3.3.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b89d846b5a4198317fec27a5d3a609ea96b6d557ff44b56c23176546023c4240"}, - {file = "triton-3.3.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a3198adb9d78b77818a5388bff89fa72ff36f9da0bc689db2f0a651a67ce6a42"}, - {file = "triton-3.3.1-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f6139aeb04a146b0b8e0fbbd89ad1e65861c57cfed881f21d62d3cb94a36bab7"}, 
+python-versions = "<3.15,>=3.10" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" +files = [ + {file = "triton-3.5.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f63e34dcb32d7bd3a1d0195f60f30d2aee8b08a69a0424189b71017e23dfc3d2"}, + {file = "triton-3.5.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5fc53d849f879911ea13f4a877243afc513187bc7ee92d1f2c0f1ba3169e3c94"}, + {file = "triton-3.5.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:da47169e30a779bade679ce78df4810fca6d78a955843d2ddb11f226adc517dc"}, + {file = "triton-3.5.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:61413522a48add32302353fdbaaf92daaaab06f6b5e3229940d21b5207f47579"}, + {file = "triton-3.5.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:275a045b6ed670dd1bd005c3e6c2d61846c74c66f4512d6f33cc027b11de8fd4"}, + {file = "triton-3.5.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d2c6b915a03888ab931a9fd3e55ba36785e1fe70cbea0b40c6ef93b20fc85232"}, + {file = "triton-3.5.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:56765ffe12c554cd560698398b8a268db1f616c120007bfd8829d27139abd24a"}, + {file = "triton-3.5.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f3f4346b6ebbd4fad18773f5ba839114f4826037c9f2f34e0148894cd5dd3dba"}, + {file = "triton-3.5.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:02c770856f5e407d24d28ddc66e33cf026e6f4d360dcb8b2fabe6ea1fc758621"}, + {file = "triton-3.5.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0b4d2c70127fca6a23e247f9348b8adde979d2e7a20391bfbabaac6aebc7e6a8"}, + {file = "triton-3.5.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f617aa7925f9ea9968ec2e1adaf93e87864ff51549c8f04ce658f29bbdb71e2d"}, + {file = "triton-3.5.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d0637b1efb1db599a8e9dc960d53ab6e4637db7d4ab6630a0974705d77b14b60"}, + {file = "triton-3.5.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8932391d7f93698dfe5bc9bead77c47a24f97329e9f20c10786bb230a9083f56"}, + {file = "triton-3.5.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bac7f7d959ad0f48c0e97d6643a1cc0fd5786fe61cb1f83b537c6b2d54776478"}, ] -[package.dependencies] -setuptools = ">=40.8.0" - [package.extras] -build = ["cmake (>=3.20)", "lit"] +build = ["cmake (>=3.20,<4.0)", "lit"] tests = ["autopep8", "isort", "llnl-hatchet", "numpy", "pytest", "pytest-forked", "pytest-xdist", "scipy (>=1.7.1)"] tutorials = ["matplotlib", "pandas", "tabulate"] -[[package]] -name = "types-python-dateutil" -version = "2.9.0.20250516" -description = "Typing stubs for python-dateutil" -optional = false -python-versions = ">=3.9" -files = [ - {file = "types_python_dateutil-2.9.0.20250516-py3-none-any.whl", hash = "sha256:2b2b3f57f9c6a61fba26a9c0ffb9ea5681c9b83e69cd897c6b5f668d9c0cab93"}, - {file = "types_python_dateutil-2.9.0.20250516.tar.gz", hash = "sha256:13e80d6c9c47df23ad773d54b2826bd52dbbb41be87c3f339381c1700ad21ee5"}, -] - [[package]] name = "typing-extensions" -version = "4.14.0" +version = "4.15.0" description = "Backported and Experimental Type Hints for Python 3.9+" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ - {file = 
"typing_extensions-4.14.0-py3-none-any.whl", hash = "sha256:a1514509136dd0b477638fc68d6a91497af5076466ad0fa6c338e44e359944af"}, - {file = "typing_extensions-4.14.0.tar.gz", hash = "sha256:8676b788e32f02ab42d9e7c61324048ae4c6d844a399eebace3d4979d75ceef4"}, + {file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"}, + {file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"}, ] [[package]] @@ -6077,6 +7247,7 @@ version = "0.9.0" description = "Runtime inspection utilities for typing module." optional = false python-versions = "*" +groups = ["main"] files = [ {file = "typing_inspect-0.9.0-py3-none-any.whl", hash = "sha256:9ee6fc59062311ef8547596ab6b955e1b8aa46242d854bfc78f4f6b0eff35f9f"}, {file = "typing_inspect-0.9.0.tar.gz", hash = "sha256:b23fc42ff6f6ef6954e4852c1fb512cdd18dbea03134f91f856a95ccc9461f78"}, @@ -6088,13 +7259,14 @@ typing-extensions = ">=3.7.4" [[package]] name = "typing-inspection" -version = "0.4.1" +version = "0.4.2" description = "Runtime typing introspection tools" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "typing_inspection-0.4.1-py3-none-any.whl", hash = "sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51"}, - {file = "typing_inspection-0.4.1.tar.gz", hash = "sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28"}, + {file = "typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7"}, + {file = "typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464"}, ] [package.dependencies] @@ -6106,6 +7278,7 @@ version = "2025.2" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" +groups = ["main", "dev"] files = [ {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, @@ -6117,6 +7290,7 @@ version = "1.3.0" description = "RFC 6570 URI Template Processor" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "uri-template-1.3.0.tar.gz", hash = "sha256:0e00f8eb65e18c7de20d595a14336e9f337ead580c70934141624b6d1ffdacc7"}, {file = "uri_template-1.3.0-py3-none-any.whl", hash = "sha256:a44a133ea12d44a0c0f06d7d42a52d71282e77e2f937d8abd5655b8d56fc1363"}, @@ -6127,17 +7301,18 @@ dev = ["flake8", "flake8-annotations", "flake8-bandit", "flake8-bugbear", "flake [[package]] name = "urllib3" -version = "2.4.0" +version = "2.5.0" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ - {file = "urllib3-2.4.0-py3-none-any.whl", hash = "sha256:4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813"}, - {file = "urllib3-2.4.0.tar.gz", hash = "sha256:414bc6535b787febd7567804cc015fee39daab8ad86268f1310a9250697de466"}, + {file = "urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc"}, + {file = "urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760"}, ] [package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +brotli = ["brotli (>=1.0.9) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\""] h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] @@ -6148,6 +7323,7 @@ version = "0.34.3" description = "The lightning-fast ASGI server." optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "uvicorn-0.34.3-py3-none-any.whl", hash = "sha256:16246631db62bdfbf069b0645177d6e8a77ba950cfedbfd093acef9444e4d885"}, {file = "uvicorn-0.34.3.tar.gz", hash = "sha256:35919a9a979d7a59334b6b10e05d77c1d0d574c50e0fc98b8b1a0f165708b55a"}, @@ -6160,73 +7336,88 @@ h11 = ">=0.8" httptools = {version = ">=0.6.3", optional = true, markers = "extra == \"standard\""} python-dotenv = {version = ">=0.13", optional = true, markers = "extra == \"standard\""} pyyaml = {version = ">=5.1", optional = true, markers = "extra == \"standard\""} -uvloop = {version = ">=0.15.1", optional = true, markers = "(sys_platform != \"win32\" and sys_platform != \"cygwin\") and platform_python_implementation != \"PyPy\" and extra == \"standard\""} +uvloop = {version = ">=0.15.1", optional = true, markers = "sys_platform != \"win32\" and sys_platform != \"cygwin\" and platform_python_implementation != \"PyPy\" and extra == \"standard\""} watchfiles = {version = ">=0.13", optional = true, markers = "extra == \"standard\""} websockets = {version = ">=10.4", optional = true, markers = "extra == \"standard\""} [package.extras] -standard = ["colorama (>=0.4)", "httptools (>=0.6.3)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"] +standard = ["colorama (>=0.4) ; sys_platform == \"win32\"", "httptools (>=0.6.3)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.15.1) ; sys_platform != \"win32\" and sys_platform != \"cygwin\" and platform_python_implementation != \"PyPy\"", "watchfiles (>=0.13)", "websockets (>=10.4)"] [[package]] name = "uvloop" -version = "0.21.0" +version = "0.22.1" description = "Fast implementation of asyncio event loop on top of libuv" optional = true -python-versions = ">=3.8.0" -files = [ - {file = "uvloop-0.21.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ec7e6b09a6fdded42403182ab6b832b71f4edaf7f37a9a0e371a01db5f0cb45f"}, - {file = "uvloop-0.21.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:196274f2adb9689a289ad7d65700d37df0c0930fd8e4e743fa4834e850d7719d"}, - {file = "uvloop-0.21.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f38b2e090258d051d68a5b14d1da7203a3c3677321cf32a95a6f4db4dd8b6f26"}, - {file = "uvloop-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87c43e0f13022b998eb9b973b5e97200c8b90823454d4bc06ab33829e09fb9bb"}, - {file = "uvloop-0.21.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = 
"sha256:10d66943def5fcb6e7b37310eb6b5639fd2ccbc38df1177262b0640c3ca68c1f"}, - {file = "uvloop-0.21.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:67dd654b8ca23aed0a8e99010b4c34aca62f4b7fce88f39d452ed7622c94845c"}, - {file = "uvloop-0.21.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c0f3fa6200b3108919f8bdabb9a7f87f20e7097ea3c543754cabc7d717d95cf8"}, - {file = "uvloop-0.21.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0878c2640cf341b269b7e128b1a5fed890adc4455513ca710d77d5e93aa6d6a0"}, - {file = "uvloop-0.21.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9fb766bb57b7388745d8bcc53a359b116b8a04c83a2288069809d2b3466c37e"}, - {file = "uvloop-0.21.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a375441696e2eda1c43c44ccb66e04d61ceeffcd76e4929e527b7fa401b90fb"}, - {file = "uvloop-0.21.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:baa0e6291d91649c6ba4ed4b2f982f9fa165b5bbd50a9e203c416a2797bab3c6"}, - {file = "uvloop-0.21.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4509360fcc4c3bd2c70d87573ad472de40c13387f5fda8cb58350a1d7475e58d"}, - {file = "uvloop-0.21.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:359ec2c888397b9e592a889c4d72ba3d6befba8b2bb01743f72fffbde663b59c"}, - {file = "uvloop-0.21.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f7089d2dc73179ce5ac255bdf37c236a9f914b264825fdaacaded6990a7fb4c2"}, - {file = "uvloop-0.21.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:baa4dcdbd9ae0a372f2167a207cd98c9f9a1ea1188a8a526431eef2f8116cc8d"}, - {file = "uvloop-0.21.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86975dca1c773a2c9864f4c52c5a55631038e387b47eaf56210f873887b6c8dc"}, - {file = "uvloop-0.21.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:461d9ae6660fbbafedd07559c6a2e57cd553b34b0065b6550685f6653a98c1cb"}, - {file = "uvloop-0.21.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:183aef7c8730e54c9a3ee3227464daed66e37ba13040bb3f350bc2ddc040f22f"}, - {file = "uvloop-0.21.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:bfd55dfcc2a512316e65f16e503e9e450cab148ef11df4e4e679b5e8253a5281"}, - {file = "uvloop-0.21.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:787ae31ad8a2856fc4e7c095341cccc7209bd657d0e71ad0dc2ea83c4a6fa8af"}, - {file = "uvloop-0.21.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ee4d4ef48036ff6e5cfffb09dd192c7a5027153948d85b8da7ff705065bacc6"}, - {file = "uvloop-0.21.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3df876acd7ec037a3d005b3ab85a7e4110422e4d9c1571d4fc89b0fc41b6816"}, - {file = "uvloop-0.21.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bd53ecc9a0f3d87ab847503c2e1552b690362e005ab54e8a48ba97da3924c0dc"}, - {file = "uvloop-0.21.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a5c39f217ab3c663dc699c04cbd50c13813e31d917642d459fdcec07555cc553"}, - {file = "uvloop-0.21.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:17df489689befc72c39a08359efac29bbee8eee5209650d4b9f34df73d22e414"}, - {file = "uvloop-0.21.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bc09f0ff191e61c2d592a752423c767b4ebb2986daa9ed62908e2b1b9a9ae206"}, - {file = "uvloop-0.21.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0ce1b49560b1d2d8a2977e3ba4afb2414fb46b86a1b64056bc4ab929efdafbe"}, - {file = 
"uvloop-0.21.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e678ad6fe52af2c58d2ae3c73dc85524ba8abe637f134bf3564ed07f555c5e79"}, - {file = "uvloop-0.21.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:460def4412e473896ef179a1671b40c039c7012184b627898eea5072ef6f017a"}, - {file = "uvloop-0.21.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:10da8046cc4a8f12c91a1c39d1dd1585c41162a15caaef165c2174db9ef18bdc"}, - {file = "uvloop-0.21.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c097078b8031190c934ed0ebfee8cc5f9ba9642e6eb88322b9958b649750f72b"}, - {file = "uvloop-0.21.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:46923b0b5ee7fc0020bef24afe7836cb068f5050ca04caf6b487c513dc1a20b2"}, - {file = "uvloop-0.21.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:53e420a3afe22cdcf2a0f4846e377d16e718bc70103d7088a4f7623567ba5fb0"}, - {file = "uvloop-0.21.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88cb67cdbc0e483da00af0b2c3cdad4b7c61ceb1ee0f33fe00e09c81e3a6cb75"}, - {file = "uvloop-0.21.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:221f4f2a1f46032b403bf3be628011caf75428ee3cc204a22addf96f586b19fd"}, - {file = "uvloop-0.21.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:2d1f581393673ce119355d56da84fe1dd9d2bb8b3d13ce792524e1607139feff"}, - {file = "uvloop-0.21.0.tar.gz", hash = "sha256:3bf12b0fda68447806a7ad847bfa591613177275d35b6724b1ee573faa3704e3"}, +python-versions = ">=3.8.1" +groups = ["main"] +markers = "sys_platform != \"win32\" and sys_platform != \"cygwin\" and platform_python_implementation != \"PyPy\" and (extra == \"chatui\" or extra == \"all\")" +files = [ + {file = "uvloop-0.22.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ef6f0d4cc8a9fa1f6a910230cd53545d9a14479311e87e3cb225495952eb672c"}, + {file = "uvloop-0.22.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7cd375a12b71d33d46af85a3343b35d98e8116134ba404bd657b3b1d15988792"}, + {file = "uvloop-0.22.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ac33ed96229b7790eb729702751c0e93ac5bc3bcf52ae9eccbff30da09194b86"}, + {file = "uvloop-0.22.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:481c990a7abe2c6f4fc3d98781cc9426ebd7f03a9aaa7eb03d3bfc68ac2a46bd"}, + {file = "uvloop-0.22.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a592b043a47ad17911add5fbd087c76716d7c9ccc1d64ec9249ceafd735f03c2"}, + {file = "uvloop-0.22.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:1489cf791aa7b6e8c8be1c5a080bae3a672791fcb4e9e12249b05862a2ca9cec"}, + {file = "uvloop-0.22.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c60ebcd36f7b240b30788554b6f0782454826a0ed765d8430652621b5de674b9"}, + {file = "uvloop-0.22.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3b7f102bf3cb1995cfeaee9321105e8f5da76fdb104cdad8986f85461a1b7b77"}, + {file = "uvloop-0.22.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:53c85520781d84a4b8b230e24a5af5b0778efdb39142b424990ff1ef7c48ba21"}, + {file = "uvloop-0.22.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:56a2d1fae65fd82197cb8c53c367310b3eabe1bbb9fb5a04d28e3e3520e4f702"}, + {file = "uvloop-0.22.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:40631b049d5972c6755b06d0bfe8233b1bd9a8a6392d9d1c45c10b6f9e9b2733"}, + {file = 
"uvloop-0.22.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:535cc37b3a04f6cd2c1ef65fa1d370c9a35b6695df735fcff5427323f2cd5473"}, + {file = "uvloop-0.22.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:fe94b4564e865d968414598eea1a6de60adba0c040ba4ed05ac1300de402cd42"}, + {file = "uvloop-0.22.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:51eb9bd88391483410daad430813d982010f9c9c89512321f5b60e2cddbdddd6"}, + {file = "uvloop-0.22.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:700e674a166ca5778255e0e1dc4e9d79ab2acc57b9171b79e65feba7184b3370"}, + {file = "uvloop-0.22.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7b5b1ac819a3f946d3b2ee07f09149578ae76066d70b44df3fa990add49a82e4"}, + {file = "uvloop-0.22.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e047cc068570bac9866237739607d1313b9253c3051ad84738cbb095be0537b2"}, + {file = "uvloop-0.22.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:512fec6815e2dd45161054592441ef76c830eddaad55c8aa30952e6fe1ed07c0"}, + {file = "uvloop-0.22.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:561577354eb94200d75aca23fbde86ee11be36b00e52a4eaf8f50fb0c86b7705"}, + {file = "uvloop-0.22.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1cdf5192ab3e674ca26da2eada35b288d2fa49fdd0f357a19f0e7c4e7d5077c8"}, + {file = "uvloop-0.22.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6e2ea3d6190a2968f4a14a23019d3b16870dd2190cd69c8180f7c632d21de68d"}, + {file = "uvloop-0.22.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0530a5fbad9c9e4ee3f2b33b148c6a64d47bbad8000ea63704fa8260f4cf728e"}, + {file = "uvloop-0.22.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bc5ef13bbc10b5335792360623cc378d52d7e62c2de64660616478c32cd0598e"}, + {file = "uvloop-0.22.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1f38ec5e3f18c8a10ded09742f7fb8de0108796eb673f30ce7762ce1b8550cad"}, + {file = "uvloop-0.22.1-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:3879b88423ec7e97cd4eba2a443aa26ed4e59b45e6b76aabf13fe2f27023a142"}, + {file = "uvloop-0.22.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:4baa86acedf1d62115c1dc6ad1e17134476688f08c6efd8a2ab076e815665c74"}, + {file = "uvloop-0.22.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:297c27d8003520596236bdb2335e6b3f649480bd09e00d1e3a99144b691d2a35"}, + {file = "uvloop-0.22.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c1955d5a1dd43198244d47664a5858082a3239766a839b2102a269aaff7a4e25"}, + {file = "uvloop-0.22.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b31dc2fccbd42adc73bc4e7cdbae4fc5086cf378979e53ca5d0301838c5682c6"}, + {file = "uvloop-0.22.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:93f617675b2d03af4e72a5333ef89450dfaa5321303ede6e67ba9c9d26878079"}, + {file = "uvloop-0.22.1-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:37554f70528f60cad66945b885eb01f1bb514f132d92b6eeed1c90fd54ed6289"}, + {file = "uvloop-0.22.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:b76324e2dc033a0b2f435f33eb88ff9913c156ef78e153fb210e03c13da746b3"}, + {file = "uvloop-0.22.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:badb4d8e58ee08dad957002027830d5c3b06aea446a6a3744483c2b3b745345c"}, + {file = "uvloop-0.22.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b91328c72635f6f9e0282e4a57da7470c7350ab1c9f48546c0f2866205349d21"}, + {file = "uvloop-0.22.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:daf620c2995d193449393d6c62131b3fbd40a63bf7b307a1527856ace637fe88"}, + {file = "uvloop-0.22.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6cde23eeda1a25c75b2e07d39970f3374105d5eafbaab2a4482be82f272d5a5e"}, + {file = "uvloop-0.22.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:80eee091fe128e425177fbd82f8635769e2f32ec9daf6468286ec57ec0313efa"}, + {file = "uvloop-0.22.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:017bd46f9e7b78e81606329d07141d3da446f8798c6baeec124260e22c262772"}, + {file = "uvloop-0.22.1-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c3e5c6727a57cb6558592a95019e504f605d1c54eb86463ee9f7a2dbd411c820"}, + {file = "uvloop-0.22.1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:57df59d8b48feb0e613d9b1f5e57b7532e97cbaf0d61f7aa9aa32221e84bc4b6"}, + {file = "uvloop-0.22.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:55502bc2c653ed2e9692e8c55cb95b397d33f9f2911e929dc97c4d6b26d04242"}, + {file = "uvloop-0.22.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:4a968a72422a097b09042d5fa2c5c590251ad484acf910a651b4b620acd7f193"}, + {file = "uvloop-0.22.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:b45649628d816c030dba3c80f8e2689bab1c89518ed10d426036cdc47874dfc4"}, + {file = "uvloop-0.22.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ea721dd3203b809039fcc2983f14608dae82b212288b346e0bfe46ec2fab0b7c"}, + {file = "uvloop-0.22.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0ae676de143db2b2f60a9696d7eca5bb9d0dd6cc3ac3dad59a8ae7e95f9e1b54"}, + {file = "uvloop-0.22.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:17d4e97258b0172dfa107b89aa1eeba3016f4b1974ce85ca3ef6a66b35cbf659"}, + {file = "uvloop-0.22.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:05e4b5f86e621cf3927631789999e697e58f0d2d32675b67d9ca9eb0bca55743"}, + {file = "uvloop-0.22.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:286322a90bea1f9422a470d5d2ad82d38080be0a29c4dd9b3e6384320a4d11e7"}, + {file = "uvloop-0.22.1.tar.gz", hash = "sha256:6c84bae345b9147082b17371e3dd5d42775bddce91f885499017f4607fdaf39f"}, ] [package.extras] dev = ["Cython (>=3.0,<4.0)", "setuptools (>=60)"] -docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] -test = ["aiohttp (>=3.10.5)", "flake8 (>=5.0,<6.0)", "mypy (>=0.800)", "psutil", "pyOpenSSL (>=23.0.0,<23.1.0)", "pycodestyle (>=2.9.0,<2.10.0)"] +docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx_rtd_theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] +test = ["aiohttp (>=3.10.5)", "flake8 (>=6.1,<7.0)", "mypy (>=0.800)", "psutil", "pyOpenSSL (>=25.3.0,<25.4.0)", "pycodestyle (>=2.11.0,<2.12.0)"] [[package]] name = "virtualenv" -version = "20.31.2" +version = "20.35.4" description = "Virtual Python Environment builder" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ - {file = "virtualenv-20.31.2-py3-none-any.whl", hash = "sha256:36efd0d9650ee985f0cad72065001e66d49a6f24eb44d98980f630686243cf11"}, - {file = 
"virtualenv-20.31.2.tar.gz", hash = "sha256:e10c0a9d02835e592521be48b332b6caee6887f332c111aa79a09b9e79efc2af"}, + {file = "virtualenv-20.35.4-py3-none-any.whl", hash = "sha256:c21c9cede36c9753eeade68ba7d523529f228a403463376cf821eaae2b650f1b"}, + {file = "virtualenv-20.35.4.tar.gz", hash = "sha256:643d3914d73d3eeb0c552cbb12d7e82adf0e504dbf86a3182f8771a153a1971c"}, ] [package.dependencies] @@ -6236,7 +7427,7 @@ platformdirs = ">=3.9.1,<5" [package.extras] docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] -test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"] +test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8) ; platform_python_implementation == \"PyPy\" or platform_python_implementation == \"GraalVM\" or platform_python_implementation == \"CPython\" and sys_platform == \"win32\" and python_version >= \"3.13\"", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10) ; platform_python_implementation == \"CPython\""] [[package]] name = "waitress" @@ -6244,6 +7435,8 @@ version = "3.0.2" description = "Waitress WSGI server" optional = false python-versions = ">=3.9.0" +groups = ["main"] +markers = "platform_system == \"Windows\"" files = [ {file = "waitress-3.0.2-py3-none-any.whl", hash = "sha256:c56d67fd6e87c2ee598b76abdd4e96cfad1f24cacdea5078d382b1f9d7b5ed2e"}, {file = "waitress-3.0.2.tar.gz", hash = "sha256:682aaaf2af0c44ada4abfb70ded36393f0e307f4ab9456a215ce0020baefc31f"}, @@ -6255,82 +7448,122 @@ testing = ["coverage (>=7.6.0)", "pytest", "pytest-cov"] [[package]] name = "watchfiles" -version = "1.0.5" +version = "1.1.1" description = "Simple, modern and high performance file watching and code reload in python." 
optional = true python-versions = ">=3.9" -files = [ - {file = "watchfiles-1.0.5-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:5c40fe7dd9e5f81e0847b1ea64e1f5dd79dd61afbedb57759df06767ac719b40"}, - {file = "watchfiles-1.0.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8c0db396e6003d99bb2d7232c957b5f0b5634bbd1b24e381a5afcc880f7373fb"}, - {file = "watchfiles-1.0.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b551d4fb482fc57d852b4541f911ba28957d051c8776e79c3b4a51eb5e2a1b11"}, - {file = "watchfiles-1.0.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:830aa432ba5c491d52a15b51526c29e4a4b92bf4f92253787f9726fe01519487"}, - {file = "watchfiles-1.0.5-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a16512051a822a416b0d477d5f8c0e67b67c1a20d9acecb0aafa3aa4d6e7d256"}, - {file = "watchfiles-1.0.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bfe0cbc787770e52a96c6fda6726ace75be7f840cb327e1b08d7d54eadc3bc85"}, - {file = "watchfiles-1.0.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d363152c5e16b29d66cbde8fa614f9e313e6f94a8204eaab268db52231fe5358"}, - {file = "watchfiles-1.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ee32c9a9bee4d0b7bd7cbeb53cb185cf0b622ac761efaa2eba84006c3b3a614"}, - {file = "watchfiles-1.0.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:29c7fd632ccaf5517c16a5188e36f6612d6472ccf55382db6c7fe3fcccb7f59f"}, - {file = "watchfiles-1.0.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8e637810586e6fe380c8bc1b3910accd7f1d3a9a7262c8a78d4c8fb3ba6a2b3d"}, - {file = "watchfiles-1.0.5-cp310-cp310-win32.whl", hash = "sha256:cd47d063fbeabd4c6cae1d4bcaa38f0902f8dc5ed168072874ea11d0c7afc1ff"}, - {file = "watchfiles-1.0.5-cp310-cp310-win_amd64.whl", hash = "sha256:86c0df05b47a79d80351cd179893f2f9c1b1cae49d96e8b3290c7f4bd0ca0a92"}, - {file = "watchfiles-1.0.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:237f9be419e977a0f8f6b2e7b0475ababe78ff1ab06822df95d914a945eac827"}, - {file = "watchfiles-1.0.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e0da39ff917af8b27a4bdc5a97ac577552a38aac0d260a859c1517ea3dc1a7c4"}, - {file = "watchfiles-1.0.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cfcb3952350e95603f232a7a15f6c5f86c5375e46f0bd4ae70d43e3e063c13d"}, - {file = "watchfiles-1.0.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:68b2dddba7a4e6151384e252a5632efcaa9bc5d1c4b567f3cb621306b2ca9f63"}, - {file = "watchfiles-1.0.5-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:95cf944fcfc394c5f9de794ce581914900f82ff1f855326f25ebcf24d5397418"}, - {file = "watchfiles-1.0.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ecf6cd9f83d7c023b1aba15d13f705ca7b7d38675c121f3cc4a6e25bd0857ee9"}, - {file = "watchfiles-1.0.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:852de68acd6212cd6d33edf21e6f9e56e5d98c6add46f48244bd479d97c967c6"}, - {file = "watchfiles-1.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5730f3aa35e646103b53389d5bc77edfbf578ab6dab2e005142b5b80a35ef25"}, - {file = "watchfiles-1.0.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:18b3bd29954bc4abeeb4e9d9cf0b30227f0f206c86657674f544cb032296acd5"}, - {file = "watchfiles-1.0.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:ba5552a1b07c8edbf197055bc9d518b8f0d98a1c6a73a293bc0726dce068ed01"}, - {file = "watchfiles-1.0.5-cp311-cp311-win32.whl", hash = "sha256:2f1fefb2e90e89959447bc0420fddd1e76f625784340d64a2f7d5983ef9ad246"}, - {file = "watchfiles-1.0.5-cp311-cp311-win_amd64.whl", hash = "sha256:b6e76ceb1dd18c8e29c73f47d41866972e891fc4cc7ba014f487def72c1cf096"}, - {file = "watchfiles-1.0.5-cp311-cp311-win_arm64.whl", hash = "sha256:266710eb6fddc1f5e51843c70e3bebfb0f5e77cf4f27129278c70554104d19ed"}, - {file = "watchfiles-1.0.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:b5eb568c2aa6018e26da9e6c86f3ec3fd958cee7f0311b35c2630fa4217d17f2"}, - {file = "watchfiles-1.0.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0a04059f4923ce4e856b4b4e5e783a70f49d9663d22a4c3b3298165996d1377f"}, - {file = "watchfiles-1.0.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e380c89983ce6e6fe2dd1e1921b9952fb4e6da882931abd1824c092ed495dec"}, - {file = "watchfiles-1.0.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fe43139b2c0fdc4a14d4f8d5b5d967f7a2777fd3d38ecf5b1ec669b0d7e43c21"}, - {file = "watchfiles-1.0.5-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee0822ce1b8a14fe5a066f93edd20aada932acfe348bede8aa2149f1a4489512"}, - {file = "watchfiles-1.0.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a0dbcb1c2d8f2ab6e0a81c6699b236932bd264d4cef1ac475858d16c403de74d"}, - {file = "watchfiles-1.0.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a2014a2b18ad3ca53b1f6c23f8cd94a18ce930c1837bd891262c182640eb40a6"}, - {file = "watchfiles-1.0.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10f6ae86d5cb647bf58f9f655fcf577f713915a5d69057a0371bc257e2553234"}, - {file = "watchfiles-1.0.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1a7bac2bde1d661fb31f4d4e8e539e178774b76db3c2c17c4bb3e960a5de07a2"}, - {file = "watchfiles-1.0.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ab626da2fc1ac277bbf752446470b367f84b50295264d2d313e28dc4405d663"}, - {file = "watchfiles-1.0.5-cp312-cp312-win32.whl", hash = "sha256:9f4571a783914feda92018ef3901dab8caf5b029325b5fe4558c074582815249"}, - {file = "watchfiles-1.0.5-cp312-cp312-win_amd64.whl", hash = "sha256:360a398c3a19672cf93527f7e8d8b60d8275119c5d900f2e184d32483117a705"}, - {file = "watchfiles-1.0.5-cp312-cp312-win_arm64.whl", hash = "sha256:1a2902ede862969077b97523987c38db28abbe09fb19866e711485d9fbf0d417"}, - {file = "watchfiles-1.0.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:0b289572c33a0deae62daa57e44a25b99b783e5f7aed81b314232b3d3c81a11d"}, - {file = "watchfiles-1.0.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a056c2f692d65bf1e99c41045e3bdcaea3cb9e6b5a53dcaf60a5f3bd95fc9763"}, - {file = "watchfiles-1.0.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9dca99744991fc9850d18015c4f0438865414e50069670f5f7eee08340d8b40"}, - {file = "watchfiles-1.0.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:894342d61d355446d02cd3988a7326af344143eb33a2fd5d38482a92072d9563"}, - {file = "watchfiles-1.0.5-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ab44e1580924d1ffd7b3938e02716d5ad190441965138b4aa1d1f31ea0877f04"}, - {file = "watchfiles-1.0.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d6f9367b132078b2ceb8d066ff6c93a970a18c3029cea37bfd7b2d3dd2e5db8f"}, - {file = 
"watchfiles-1.0.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2e55a9b162e06e3f862fb61e399fe9f05d908d019d87bf5b496a04ef18a970a"}, - {file = "watchfiles-1.0.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0125f91f70e0732a9f8ee01e49515c35d38ba48db507a50c5bdcad9503af5827"}, - {file = "watchfiles-1.0.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:13bb21f8ba3248386337c9fa51c528868e6c34a707f729ab041c846d52a0c69a"}, - {file = "watchfiles-1.0.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:839ebd0df4a18c5b3c1b890145b5a3f5f64063c2a0d02b13c76d78fe5de34936"}, - {file = "watchfiles-1.0.5-cp313-cp313-win32.whl", hash = "sha256:4a8ec1e4e16e2d5bafc9ba82f7aaecfeec990ca7cd27e84fb6f191804ed2fcfc"}, - {file = "watchfiles-1.0.5-cp313-cp313-win_amd64.whl", hash = "sha256:f436601594f15bf406518af922a89dcaab416568edb6f65c4e5bbbad1ea45c11"}, - {file = "watchfiles-1.0.5-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:2cfb371be97d4db374cba381b9f911dd35bb5f4c58faa7b8b7106c8853e5d225"}, - {file = "watchfiles-1.0.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a3904d88955fda461ea2531fcf6ef73584ca921415d5cfa44457a225f4a42bc1"}, - {file = "watchfiles-1.0.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b7a21715fb12274a71d335cff6c71fe7f676b293d322722fe708a9ec81d91f5"}, - {file = "watchfiles-1.0.5-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:dfd6ae1c385ab481766b3c61c44aca2b3cd775f6f7c0fa93d979ddec853d29d5"}, - {file = "watchfiles-1.0.5-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b659576b950865fdad31fa491d31d37cf78b27113a7671d39f919828587b429b"}, - {file = "watchfiles-1.0.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1909e0a9cd95251b15bff4261de5dd7550885bd172e3536824bf1cf6b121e200"}, - {file = "watchfiles-1.0.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:832ccc221927c860e7286c55c9b6ebcc0265d5e072f49c7f6456c7798d2b39aa"}, - {file = "watchfiles-1.0.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85fbb6102b3296926d0c62cfc9347f6237fb9400aecd0ba6bbda94cae15f2b3b"}, - {file = "watchfiles-1.0.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:15ac96dd567ad6c71c71f7b2c658cb22b7734901546cd50a475128ab557593ca"}, - {file = "watchfiles-1.0.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4b6227351e11c57ae997d222e13f5b6f1f0700d84b8c52304e8675d33a808382"}, - {file = "watchfiles-1.0.5-cp39-cp39-win32.whl", hash = "sha256:974866e0db748ebf1eccab17862bc0f0303807ed9cda465d1324625b81293a18"}, - {file = "watchfiles-1.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:9848b21ae152fe79c10dd0197304ada8f7b586d3ebc3f27f43c506e5a52a863c"}, - {file = "watchfiles-1.0.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f59b870db1f1ae5a9ac28245707d955c8721dd6565e7f411024fa374b5362d1d"}, - {file = "watchfiles-1.0.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:9475b0093767e1475095f2aeb1d219fb9664081d403d1dff81342df8cd707034"}, - {file = "watchfiles-1.0.5-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc533aa50664ebd6c628b2f30591956519462f5d27f951ed03d6c82b2dfd9965"}, - {file = "watchfiles-1.0.5-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fed1cd825158dcaae36acce7b2db33dcbfd12b30c34317a88b8ed80f0541cc57"}, - {file = "watchfiles-1.0.5-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = 
"sha256:554389562c29c2c182e3908b149095051f81d28c2fec79ad6c8997d7d63e0009"}, - {file = "watchfiles-1.0.5-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a74add8d7727e6404d5dc4dcd7fac65d4d82f95928bbee0cf5414c900e86773e"}, - {file = "watchfiles-1.0.5-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb1489f25b051a89fae574505cc26360c8e95e227a9500182a7fe0afcc500ce0"}, - {file = "watchfiles-1.0.5-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0901429650652d3f0da90bad42bdafc1f9143ff3605633c455c999a2d786cac"}, - {file = "watchfiles-1.0.5.tar.gz", hash = "sha256:b7529b5dcc114679d43827d8c35a07c493ad6f083633d573d81c660abc5979e9"}, +groups = ["main"] +markers = "extra == \"chatui\" or extra == \"all\"" +files = [ + {file = "watchfiles-1.1.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:eef58232d32daf2ac67f42dea51a2c80f0d03379075d44a587051e63cc2e368c"}, + {file = "watchfiles-1.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:03fa0f5237118a0c5e496185cafa92878568b652a2e9a9382a5151b1a0380a43"}, + {file = "watchfiles-1.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8ca65483439f9c791897f7db49202301deb6e15fe9f8fe2fed555bf986d10c31"}, + {file = "watchfiles-1.1.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f0ab1c1af0cb38e3f598244c17919fb1a84d1629cc08355b0074b6d7f53138ac"}, + {file = "watchfiles-1.1.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3bc570d6c01c206c46deb6e935a260be44f186a2f05179f52f7fcd2be086a94d"}, + {file = "watchfiles-1.1.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e84087b432b6ac94778de547e08611266f1f8ffad28c0ee4c82e028b0fc5966d"}, + {file = "watchfiles-1.1.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:620bae625f4cb18427b1bb1a2d9426dc0dd5a5ba74c7c2cdb9de405f7b129863"}, + {file = "watchfiles-1.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:544364b2b51a9b0c7000a4b4b02f90e9423d97fbbf7e06689236443ebcad81ab"}, + {file = "watchfiles-1.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bbe1ef33d45bc71cf21364df962af171f96ecaeca06bd9e3d0b583efb12aec82"}, + {file = "watchfiles-1.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1a0bb430adb19ef49389e1ad368450193a90038b5b752f4ac089ec6942c4dff4"}, + {file = "watchfiles-1.1.1-cp310-cp310-win32.whl", hash = "sha256:3f6d37644155fb5beca5378feb8c1708d5783145f2a0f1c4d5a061a210254844"}, + {file = "watchfiles-1.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:a36d8efe0f290835fd0f33da35042a1bb5dc0e83cbc092dcf69bce442579e88e"}, + {file = "watchfiles-1.1.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:f57b396167a2565a4e8b5e56a5a1c537571733992b226f4f1197d79e94cf0ae5"}, + {file = "watchfiles-1.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:421e29339983e1bebc281fab40d812742268ad057db4aee8c4d2bce0af43b741"}, + {file = "watchfiles-1.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e43d39a741e972bab5d8100b5cdacf69db64e34eb19b6e9af162bccf63c5cc6"}, + {file = "watchfiles-1.1.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f537afb3276d12814082a2e9b242bdcf416c2e8fd9f799a737990a1dbe906e5b"}, + {file = "watchfiles-1.1.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b2cd9e04277e756a2e2d2543d65d1e2166d6fd4c9b183f8808634fda23f17b14"}, + {file = 
"watchfiles-1.1.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5f3f58818dc0b07f7d9aa7fe9eb1037aecb9700e63e1f6acfed13e9fef648f5d"}, + {file = "watchfiles-1.1.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9bb9f66367023ae783551042d31b1d7fd422e8289eedd91f26754a66f44d5cff"}, + {file = "watchfiles-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aebfd0861a83e6c3d1110b78ad54704486555246e542be3e2bb94195eabb2606"}, + {file = "watchfiles-1.1.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:5fac835b4ab3c6487b5dbad78c4b3724e26bcc468e886f8ba8cc4306f68f6701"}, + {file = "watchfiles-1.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:399600947b170270e80134ac854e21b3ccdefa11a9529a3decc1327088180f10"}, + {file = "watchfiles-1.1.1-cp311-cp311-win32.whl", hash = "sha256:de6da501c883f58ad50db3a32ad397b09ad29865b5f26f64c24d3e3281685849"}, + {file = "watchfiles-1.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:35c53bd62a0b885bf653ebf6b700d1bf05debb78ad9292cf2a942b23513dc4c4"}, + {file = "watchfiles-1.1.1-cp311-cp311-win_arm64.whl", hash = "sha256:57ca5281a8b5e27593cb7d82c2ac927ad88a96ed406aa446f6344e4328208e9e"}, + {file = "watchfiles-1.1.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:8c89f9f2f740a6b7dcc753140dd5e1ab9215966f7a3530d0c0705c83b401bd7d"}, + {file = "watchfiles-1.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bd404be08018c37350f0d6e34676bd1e2889990117a2b90070b3007f172d0610"}, + {file = "watchfiles-1.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8526e8f916bb5b9a0a777c8317c23ce65de259422bba5b31325a6fa6029d33af"}, + {file = "watchfiles-1.1.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2edc3553362b1c38d9f06242416a5d8e9fe235c204a4072e988ce2e5bb1f69f6"}, + {file = "watchfiles-1.1.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30f7da3fb3f2844259cba4720c3fc7138eb0f7b659c38f3bfa65084c7fc7abce"}, + {file = "watchfiles-1.1.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f8979280bdafff686ba5e4d8f97840f929a87ed9cdf133cbbd42f7766774d2aa"}, + {file = "watchfiles-1.1.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dcc5c24523771db3a294c77d94771abcfcb82a0e0ee8efd910c37c59ec1b31bb"}, + {file = "watchfiles-1.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db5d7ae38ff20153d542460752ff397fcf5c96090c1230803713cf3147a6803"}, + {file = "watchfiles-1.1.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:28475ddbde92df1874b6c5c8aaeb24ad5be47a11f87cde5a28ef3835932e3e94"}, + {file = "watchfiles-1.1.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:36193ed342f5b9842edd3532729a2ad55c4160ffcfa3700e0d54be496b70dd43"}, + {file = "watchfiles-1.1.1-cp312-cp312-win32.whl", hash = "sha256:859e43a1951717cc8de7f4c77674a6d389b106361585951d9e69572823f311d9"}, + {file = "watchfiles-1.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:91d4c9a823a8c987cce8fa2690923b069966dabb196dd8d137ea2cede885fde9"}, + {file = "watchfiles-1.1.1-cp312-cp312-win_arm64.whl", hash = "sha256:a625815d4a2bdca61953dbba5a39d60164451ef34c88d751f6c368c3ea73d404"}, + {file = "watchfiles-1.1.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:130e4876309e8686a5e37dba7d5e9bc77e6ed908266996ca26572437a5271e18"}, + {file = "watchfiles-1.1.1-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:5f3bde70f157f84ece3765b42b4a52c6ac1a50334903c6eaf765362f6ccca88a"}, + {file = "watchfiles-1.1.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:14e0b1fe858430fc0251737ef3824c54027bedb8c37c38114488b8e131cf8219"}, + {file = "watchfiles-1.1.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f27db948078f3823a6bb3b465180db8ebecf26dd5dae6f6180bd87383b6b4428"}, + {file = "watchfiles-1.1.1-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:059098c3a429f62fc98e8ec62b982230ef2c8df68c79e826e37b895bc359a9c0"}, + {file = "watchfiles-1.1.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bfb5862016acc9b869bb57284e6cb35fdf8e22fe59f7548858e2f971d045f150"}, + {file = "watchfiles-1.1.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:319b27255aacd9923b8a276bb14d21a5f7ff82564c744235fc5eae58d95422ae"}, + {file = "watchfiles-1.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c755367e51db90e75b19454b680903631d41f9e3607fbd941d296a020c2d752d"}, + {file = "watchfiles-1.1.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:c22c776292a23bfc7237a98f791b9ad3144b02116ff10d820829ce62dff46d0b"}, + {file = "watchfiles-1.1.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:3a476189be23c3686bc2f4321dd501cb329c0a0469e77b7b534ee10129ae6374"}, + {file = "watchfiles-1.1.1-cp313-cp313-win32.whl", hash = "sha256:bf0a91bfb5574a2f7fc223cf95eeea79abfefa404bf1ea5e339c0c1560ae99a0"}, + {file = "watchfiles-1.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:52e06553899e11e8074503c8e716d574adeeb7e68913115c4b3653c53f9bae42"}, + {file = "watchfiles-1.1.1-cp313-cp313-win_arm64.whl", hash = "sha256:ac3cc5759570cd02662b15fbcd9d917f7ecd47efe0d6b40474eafd246f91ea18"}, + {file = "watchfiles-1.1.1-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:563b116874a9a7ce6f96f87cd0b94f7faf92d08d0021e837796f0a14318ef8da"}, + {file = "watchfiles-1.1.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3ad9fe1dae4ab4212d8c91e80b832425e24f421703b5a42ef2e4a1e215aff051"}, + {file = "watchfiles-1.1.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce70f96a46b894b36eba678f153f052967a0d06d5b5a19b336ab0dbbd029f73e"}, + {file = "watchfiles-1.1.1-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cb467c999c2eff23a6417e58d75e5828716f42ed8289fe6b77a7e5a91036ca70"}, + {file = "watchfiles-1.1.1-cp313-cp313t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:836398932192dae4146c8f6f737d74baeac8b70ce14831a239bdb1ca882fc261"}, + {file = "watchfiles-1.1.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:743185e7372b7bc7c389e1badcc606931a827112fbbd37f14c537320fca08620"}, + {file = "watchfiles-1.1.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:afaeff7696e0ad9f02cbb8f56365ff4686ab205fcf9c4c5b6fdfaaa16549dd04"}, + {file = "watchfiles-1.1.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f7eb7da0eb23aa2ba036d4f616d46906013a68caf61b7fdbe42fc8b25132e77"}, + {file = "watchfiles-1.1.1-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:831a62658609f0e5c64178211c942ace999517f5770fe9436be4c2faeba0c0ef"}, + {file = "watchfiles-1.1.1-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:f9a2ae5c91cecc9edd47e041a930490c31c3afb1f5e6d71de3dc671bfaca02bf"}, + {file = "watchfiles-1.1.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = 
"sha256:d1715143123baeeaeadec0528bb7441103979a1d5f6fd0e1f915383fea7ea6d5"}, + {file = "watchfiles-1.1.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:39574d6370c4579d7f5d0ad940ce5b20db0e4117444e39b6d8f99db5676c52fd"}, + {file = "watchfiles-1.1.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7365b92c2e69ee952902e8f70f3ba6360d0d596d9299d55d7d386df84b6941fb"}, + {file = "watchfiles-1.1.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bfff9740c69c0e4ed32416f013f3c45e2ae42ccedd1167ef2d805c000b6c71a5"}, + {file = "watchfiles-1.1.1-cp314-cp314-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b27cf2eb1dda37b2089e3907d8ea92922b673c0c427886d4edc6b94d8dfe5db3"}, + {file = "watchfiles-1.1.1-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:526e86aced14a65a5b0ec50827c745597c782ff46b571dbfe46192ab9e0b3c33"}, + {file = "watchfiles-1.1.1-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:04e78dd0b6352db95507fd8cb46f39d185cf8c74e4cf1e4fbad1d3df96faf510"}, + {file = "watchfiles-1.1.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c85794a4cfa094714fb9c08d4a218375b2b95b8ed1666e8677c349906246c05"}, + {file = "watchfiles-1.1.1-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:74d5012b7630714b66be7b7b7a78855ef7ad58e8650c73afc4c076a1f480a8d6"}, + {file = "watchfiles-1.1.1-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:8fbe85cb3201c7d380d3d0b90e63d520f15d6afe217165d7f98c9c649654db81"}, + {file = "watchfiles-1.1.1-cp314-cp314-win32.whl", hash = "sha256:3fa0b59c92278b5a7800d3ee7733da9d096d4aabcfabb9a928918bd276ef9b9b"}, + {file = "watchfiles-1.1.1-cp314-cp314-win_amd64.whl", hash = "sha256:c2047d0b6cea13b3316bdbafbfa0c4228ae593d995030fda39089d36e64fc03a"}, + {file = "watchfiles-1.1.1-cp314-cp314-win_arm64.whl", hash = "sha256:842178b126593addc05acf6fce960d28bc5fae7afbaa2c6c1b3a7b9460e5be02"}, + {file = "watchfiles-1.1.1-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:88863fbbc1a7312972f1c511f202eb30866370ebb8493aef2812b9ff28156a21"}, + {file = "watchfiles-1.1.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:55c7475190662e202c08c6c0f4d9e345a29367438cf8e8037f3155e10a88d5a5"}, + {file = "watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f53fa183d53a1d7a8852277c92b967ae99c2d4dcee2bfacff8868e6e30b15f7"}, + {file = "watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6aae418a8b323732fa89721d86f39ec8f092fc2af67f4217a2b07fd3e93c6101"}, + {file = "watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f096076119da54a6080e8920cbdaac3dbee667eb91dcc5e5b78840b87415bd44"}, + {file = "watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:00485f441d183717038ed2e887a7c868154f216877653121068107b227a2f64c"}, + {file = "watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a55f3e9e493158d7bfdb60a1165035f1cf7d320914e7b7ea83fe22c6023b58fc"}, + {file = "watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c91ed27800188c2ae96d16e3149f199d62f86c7af5f5f4d2c61a3ed8cd3666c"}, + {file = "watchfiles-1.1.1-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:311ff15a0bae3714ffb603e6ba6dbfba4065ab60865d15a6ec544133bdb21099"}, + {file = "watchfiles-1.1.1-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = 
"sha256:a916a2932da8f8ab582f242c065f5c81bed3462849ca79ee357dd9551b0e9b01"}, + {file = "watchfiles-1.1.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:c882d69f6903ef6092bedfb7be973d9319940d56b8427ab9187d1ecd73438a70"}, + {file = "watchfiles-1.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d6ff426a7cb54f310d51bfe83fe9f2bbe40d540c741dc974ebc30e6aa238f52e"}, + {file = "watchfiles-1.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79ff6c6eadf2e3fc0d7786331362e6ef1e51125892c75f1004bd6b52155fb956"}, + {file = "watchfiles-1.1.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c1f5210f1b8fc91ead1283c6fd89f70e76fb07283ec738056cf34d51e9c1d62c"}, + {file = "watchfiles-1.1.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b9c4702f29ca48e023ffd9b7ff6b822acdf47cb1ff44cb490a3f1d5ec8987e9c"}, + {file = "watchfiles-1.1.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:acb08650863767cbc58bca4813b92df4d6c648459dcaa3d4155681962b2aa2d3"}, + {file = "watchfiles-1.1.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:08af70fd77eee58549cd69c25055dc344f918d992ff626068242259f98d598a2"}, + {file = "watchfiles-1.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c3631058c37e4a0ec440bf583bc53cdbd13e5661bb6f465bc1d88ee9a0a4d02"}, + {file = "watchfiles-1.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:cf57a27fb986c6243d2ee78392c503826056ffe0287e8794503b10fb51b881be"}, + {file = "watchfiles-1.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d7e7067c98040d646982daa1f37a33d3544138ea155536c2e0e63e07ff8a7e0f"}, + {file = "watchfiles-1.1.1-cp39-cp39-win32.whl", hash = "sha256:6c9c9262f454d1c4d8aaa7050121eb4f3aea197360553699520767daebf2180b"}, + {file = "watchfiles-1.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:74472234c8370669850e1c312490f6026d132ca2d396abfad8830b4f1c096957"}, + {file = "watchfiles-1.1.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:17ef139237dfced9da49fb7f2232c86ca9421f666d78c264c7ffca6601d154c3"}, + {file = "watchfiles-1.1.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:672b8adf25b1a0d35c96b5888b7b18699d27d4194bac8beeae75be4b7a3fc9b2"}, + {file = "watchfiles-1.1.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77a13aea58bc2b90173bc69f2a90de8e282648939a00a602e1dc4ee23e26b66d"}, + {file = "watchfiles-1.1.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b495de0bb386df6a12b18335a0285dda90260f51bdb505503c02bcd1ce27a8b"}, + {file = "watchfiles-1.1.1-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:db476ab59b6765134de1d4fe96a1a9c96ddf091683599be0f26147ea1b2e4b88"}, + {file = "watchfiles-1.1.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:89eef07eee5e9d1fda06e38822ad167a044153457e6fd997f8a858ab7564a336"}, + {file = "watchfiles-1.1.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce19e06cbda693e9e7686358af9cd6f5d61312ab8b00488bc36f5aabbaf77e24"}, + {file = "watchfiles-1.1.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e6f39af2eab0118338902798b5aa6664f46ff66bc0280de76fca67a7f262a49"}, + {file = "watchfiles-1.1.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:cdab464fee731e0884c35ae3588514a9bcf718d0e2c82169c1c4a85cc19c3c7f"}, + {file = "watchfiles-1.1.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = 
"sha256:3dbd8cbadd46984f802f6d479b7e3afa86c42d13e8f0f322d669d79722c8ec34"}, + {file = "watchfiles-1.1.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5524298e3827105b61951a29c3512deb9578586abf3a7c5da4a8069df247cccc"}, + {file = "watchfiles-1.1.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b943d3668d61cfa528eb949577479d3b077fd25fb83c641235437bc0b5bc60e"}, + {file = "watchfiles-1.1.1.tar.gz", hash = "sha256:a173cb5c16c4f40ab19cecf48a534c409f7ea983ab8fed0741304a1c0a31b3f2"}, ] [package.dependencies] @@ -6338,24 +7571,26 @@ anyio = ">=3.0.0" [[package]] name = "wcwidth" -version = "0.2.13" +version = "0.2.14" description = "Measures the displayed width of unicode strings in a terminal" optional = false -python-versions = "*" +python-versions = ">=3.6" +groups = ["dev"] files = [ - {file = "wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859"}, - {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"}, + {file = "wcwidth-0.2.14-py2.py3-none-any.whl", hash = "sha256:a7bb560c8aee30f9957e5f9895805edd20602f2d7f720186dfd906e82b4982e1"}, + {file = "wcwidth-0.2.14.tar.gz", hash = "sha256:4d478375d31bc5395a3c55c40ccdf3354688364cd61c4f6adacaa9215d0b3605"}, ] [[package]] name = "webcolors" -version = "24.11.1" +version = "25.10.0" description = "A library for working with the color formats defined by HTML and CSS." optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" +groups = ["dev"] files = [ - {file = "webcolors-24.11.1-py3-none-any.whl", hash = "sha256:515291393b4cdf0eb19c155749a096f779f7d909f7cceea072791cb9095b92e9"}, - {file = "webcolors-24.11.1.tar.gz", hash = "sha256:ecb3d768f32202af770477b8b65f318fa4f566c22948673a977b00d589dd80f6"}, + {file = "webcolors-25.10.0-py3-none-any.whl", hash = "sha256:032c727334856fc0b968f63daa252a1ac93d33db2f5267756623c210e57a4f1d"}, + {file = "webcolors-25.10.0.tar.gz", hash = "sha256:62abae86504f66d0f6364c2a8520de4a0c47b80c03fc3a5f1815fedbef7c19bf"}, ] [[package]] @@ -6364,6 +7599,7 @@ version = "0.5.1" description = "Character encoding aliases for legacy web content" optional = false python-versions = "*" +groups = ["dev"] files = [ {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"}, {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, @@ -6371,19 +7607,20 @@ files = [ [[package]] name = "websocket-client" -version = "1.8.0" +version = "1.9.0" description = "WebSocket client for Python with low level API options" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" +groups = ["dev"] files = [ - {file = "websocket_client-1.8.0-py3-none-any.whl", hash = "sha256:17b44cc997f5c498e809b22cdf2d9c7a9e71c02c8cc2b6c56e7c2d1239bfa526"}, - {file = "websocket_client-1.8.0.tar.gz", hash = "sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da"}, + {file = "websocket_client-1.9.0-py3-none-any.whl", hash = "sha256:af248a825037ef591efbf6ed20cc5faa03d3b47b9e5a2230a529eeee1c1fc3ef"}, + {file = "websocket_client-1.9.0.tar.gz", hash = "sha256:9e813624b6eb619999a97dc7958469217c3176312b3a16a4bd1bc7e08a46ec98"}, ] [package.extras] -docs = ["Sphinx (>=6.0)", "myst-parser (>=2.0.0)", "sphinx-rtd-theme (>=1.1.0)"] +docs = ["Sphinx (>=6.0)", "myst-parser (>=2.0.0)", "sphinx_rtd_theme (>=1.1.0)"] 
optional = ["python-socks", "wsaccel"] -test = ["websockets"] +test = ["pytest", "websockets"] [[package]] name = "websockets" @@ -6391,6 +7628,8 @@ version = "12.0" description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"chatui\" or extra == \"all\"" files = [ {file = "websockets-12.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d554236b2a2006e0ce16315c16eaa0d628dab009c33b63ea03f41c6107958374"}, {file = "websockets-12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2d225bb6886591b1746b17c0573e29804619c8f755b5598d875bb4235ea639be"}, @@ -6472,6 +7711,7 @@ version = "3.1.3" description = "The comprehensive WSGI web application library." optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "werkzeug-3.1.3-py3-none-any.whl", hash = "sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e"}, {file = "werkzeug-3.1.3.tar.gz", hash = "sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746"}, @@ -6483,100 +7723,160 @@ MarkupSafe = ">=2.1.1" [package.extras] watchdog = ["watchdog (>=2.3)"] +[[package]] +name = "woodwork" +version = "0.31.0" +description = "a data typing library for machine learning" +optional = false +python-versions = "<4,>=3.9" +groups = ["main"] +files = [ + {file = "woodwork-0.31.0-py3-none-any.whl", hash = "sha256:5cb3370553b5f466f8c8599b1bf559584dc0b798cc1f2da26bbd7029d256c6f9"}, + {file = "woodwork-0.31.0.tar.gz", hash = "sha256:6ef82af1d5b6525b02efe6417c574c810cfdcc606cb266bd0d7fb17a1d066b67"}, +] + +[package.dependencies] +importlib-resources = ">=5.10.0" +numpy = ">=1.25.0" +pandas = ">=2.0.0" +python-dateutil = ">=2.8.2" +scikit-learn = ">=1.1.0" +scipy = ">=1.10.0" + +[package.extras] +complete = ["woodwork[updater]"] +dev = ["click (>=8.1.7)", "pre-commit (>=2.20.0)", "ruff (>=0.1.6)", "woodwork[docs,test]"] +docs = ["Sphinx (==5.1.1)", "ipython (==8.4.0)", "jupyter (==1.0.0)", "myst-parser (==0.18.0)", "nbconvert (==6.5.0)", "nbsphinx (==0.8.9)", "pyarrow (>=14.0.1)", "pydata-sphinx-theme (==0.9.0)", "sphinx-copybutton (==0.5.0)", "sphinx-inline-tabs (==2022.1.2b11)"] +test = ["boto3 (>=1.34.32)", "moto[all] (>=5.0.0)", "pyarrow (>=14.0.1)", "pytest (>=7.0.1)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=2.1.0)", "smart-open (>=5.0.0)"] +updater = ["alteryx-open-src-update-checker (>=3.1.0)"] + [[package]] name = "wrapt" -version = "1.17.2" +version = "2.0.1" description = "Module for decorators, wrappers and monkey patching." 
optional = false python-versions = ">=3.8" -files = [ - {file = "wrapt-1.17.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3d57c572081fed831ad2d26fd430d565b76aa277ed1d30ff4d40670b1c0dd984"}, - {file = "wrapt-1.17.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b5e251054542ae57ac7f3fba5d10bfff615b6c2fb09abeb37d2f1463f841ae22"}, - {file = "wrapt-1.17.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:80dd7db6a7cb57ffbc279c4394246414ec99537ae81ffd702443335a61dbf3a7"}, - {file = "wrapt-1.17.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a6e821770cf99cc586d33833b2ff32faebdbe886bd6322395606cf55153246c"}, - {file = "wrapt-1.17.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b60fb58b90c6d63779cb0c0c54eeb38941bae3ecf7a73c764c52c88c2dcb9d72"}, - {file = "wrapt-1.17.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b870b5df5b71d8c3359d21be8f0d6c485fa0ebdb6477dda51a1ea54a9b558061"}, - {file = "wrapt-1.17.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4011d137b9955791f9084749cba9a367c68d50ab8d11d64c50ba1688c9b457f2"}, - {file = "wrapt-1.17.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:1473400e5b2733e58b396a04eb7f35f541e1fb976d0c0724d0223dd607e0f74c"}, - {file = "wrapt-1.17.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3cedbfa9c940fdad3e6e941db7138e26ce8aad38ab5fe9dcfadfed9db7a54e62"}, - {file = "wrapt-1.17.2-cp310-cp310-win32.whl", hash = "sha256:582530701bff1dec6779efa00c516496968edd851fba224fbd86e46cc6b73563"}, - {file = "wrapt-1.17.2-cp310-cp310-win_amd64.whl", hash = "sha256:58705da316756681ad3c9c73fd15499aa4d8c69f9fd38dc8a35e06c12468582f"}, - {file = "wrapt-1.17.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ff04ef6eec3eee8a5efef2401495967a916feaa353643defcc03fc74fe213b58"}, - {file = "wrapt-1.17.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4db983e7bca53819efdbd64590ee96c9213894272c776966ca6306b73e4affda"}, - {file = "wrapt-1.17.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9abc77a4ce4c6f2a3168ff34b1da9b0f311a8f1cfd694ec96b0603dff1c79438"}, - {file = "wrapt-1.17.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b929ac182f5ace000d459c59c2c9c33047e20e935f8e39371fa6e3b85d56f4a"}, - {file = "wrapt-1.17.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f09b286faeff3c750a879d336fb6d8713206fc97af3adc14def0cdd349df6000"}, - {file = "wrapt-1.17.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1a7ed2d9d039bd41e889f6fb9364554052ca21ce823580f6a07c4ec245c1f5d6"}, - {file = "wrapt-1.17.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:129a150f5c445165ff941fc02ee27df65940fcb8a22a61828b1853c98763a64b"}, - {file = "wrapt-1.17.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1fb5699e4464afe5c7e65fa51d4f99e0b2eadcc176e4aa33600a3df7801d6662"}, - {file = "wrapt-1.17.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9a2bce789a5ea90e51a02dfcc39e31b7f1e662bc3317979aa7e5538e3a034f72"}, - {file = "wrapt-1.17.2-cp311-cp311-win32.whl", hash = "sha256:4afd5814270fdf6380616b321fd31435a462019d834f83c8611a0ce7484c7317"}, - {file = "wrapt-1.17.2-cp311-cp311-win_amd64.whl", hash = "sha256:acc130bc0375999da18e3d19e5a86403667ac0c4042a094fefb7eec8ebac7cf3"}, - {file = "wrapt-1.17.2-cp312-cp312-macosx_10_13_universal2.whl", hash = 
"sha256:d5e2439eecc762cd85e7bd37161d4714aa03a33c5ba884e26c81559817ca0925"}, - {file = "wrapt-1.17.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fc7cb4c1c744f8c05cd5f9438a3caa6ab94ce8344e952d7c45a8ed59dd88392"}, - {file = "wrapt-1.17.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8fdbdb757d5390f7c675e558fd3186d590973244fab0c5fe63d373ade3e99d40"}, - {file = "wrapt-1.17.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb1d0dbf99411f3d871deb6faa9aabb9d4e744d67dcaaa05399af89d847a91d"}, - {file = "wrapt-1.17.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d18a4865f46b8579d44e4fe1e2bcbc6472ad83d98e22a26c963d46e4c125ef0b"}, - {file = "wrapt-1.17.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc570b5f14a79734437cb7b0500376b6b791153314986074486e0b0fa8d71d98"}, - {file = "wrapt-1.17.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6d9187b01bebc3875bac9b087948a2bccefe464a7d8f627cf6e48b1bbae30f82"}, - {file = "wrapt-1.17.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:9e8659775f1adf02eb1e6f109751268e493c73716ca5761f8acb695e52a756ae"}, - {file = "wrapt-1.17.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e8b2816ebef96d83657b56306152a93909a83f23994f4b30ad4573b00bd11bb9"}, - {file = "wrapt-1.17.2-cp312-cp312-win32.whl", hash = "sha256:468090021f391fe0056ad3e807e3d9034e0fd01adcd3bdfba977b6fdf4213ea9"}, - {file = "wrapt-1.17.2-cp312-cp312-win_amd64.whl", hash = "sha256:ec89ed91f2fa8e3f52ae53cd3cf640d6feff92ba90d62236a81e4e563ac0e991"}, - {file = "wrapt-1.17.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6ed6ffac43aecfe6d86ec5b74b06a5be33d5bb9243d055141e8cabb12aa08125"}, - {file = "wrapt-1.17.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:35621ae4c00e056adb0009f8e86e28eb4a41a4bfa8f9bfa9fca7d343fe94f998"}, - {file = "wrapt-1.17.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a604bf7a053f8362d27eb9fefd2097f82600b856d5abe996d623babd067b1ab5"}, - {file = "wrapt-1.17.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5cbabee4f083b6b4cd282f5b817a867cf0b1028c54d445b7ec7cfe6505057cf8"}, - {file = "wrapt-1.17.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:49703ce2ddc220df165bd2962f8e03b84c89fee2d65e1c24a7defff6f988f4d6"}, - {file = "wrapt-1.17.2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8112e52c5822fc4253f3901b676c55ddf288614dc7011634e2719718eaa187dc"}, - {file = "wrapt-1.17.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9fee687dce376205d9a494e9c121e27183b2a3df18037f89d69bd7b35bcf59e2"}, - {file = "wrapt-1.17.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:18983c537e04d11cf027fbb60a1e8dfd5190e2b60cc27bc0808e653e7b218d1b"}, - {file = "wrapt-1.17.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:703919b1633412ab54bcf920ab388735832fdcb9f9a00ae49387f0fe67dad504"}, - {file = "wrapt-1.17.2-cp313-cp313-win32.whl", hash = "sha256:abbb9e76177c35d4e8568e58650aa6926040d6a9f6f03435b7a522bf1c487f9a"}, - {file = "wrapt-1.17.2-cp313-cp313-win_amd64.whl", hash = "sha256:69606d7bb691b50a4240ce6b22ebb319c1cfb164e5f6569835058196e0f3a845"}, - {file = "wrapt-1.17.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:4a721d3c943dae44f8e243b380cb645a709ba5bd35d3ad27bc2ed947e9c68192"}, - {file = 
"wrapt-1.17.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:766d8bbefcb9e00c3ac3b000d9acc51f1b399513f44d77dfe0eb026ad7c9a19b"}, - {file = "wrapt-1.17.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e496a8ce2c256da1eb98bd15803a79bee00fc351f5dfb9ea82594a3f058309e0"}, - {file = "wrapt-1.17.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d615e4fe22f4ad3528448c193b218e077656ca9ccb22ce2cb20db730f8d306"}, - {file = "wrapt-1.17.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a5aaeff38654462bc4b09023918b7f21790efb807f54c000a39d41d69cf552cb"}, - {file = "wrapt-1.17.2-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a7d15bbd2bc99e92e39f49a04653062ee6085c0e18b3b7512a4f2fe91f2d681"}, - {file = "wrapt-1.17.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e3890b508a23299083e065f435a492b5435eba6e304a7114d2f919d400888cc6"}, - {file = "wrapt-1.17.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:8c8b293cd65ad716d13d8dd3624e42e5a19cc2a2f1acc74b30c2c13f15cb61a6"}, - {file = "wrapt-1.17.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4c82b8785d98cdd9fed4cac84d765d234ed3251bd6afe34cb7ac523cb93e8b4f"}, - {file = "wrapt-1.17.2-cp313-cp313t-win32.whl", hash = "sha256:13e6afb7fe71fe7485a4550a8844cc9ffbe263c0f1a1eea569bc7091d4898555"}, - {file = "wrapt-1.17.2-cp313-cp313t-win_amd64.whl", hash = "sha256:eaf675418ed6b3b31c7a989fd007fa7c3be66ce14e5c3b27336383604c9da85c"}, - {file = "wrapt-1.17.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5c803c401ea1c1c18de70a06a6f79fcc9c5acfc79133e9869e730ad7f8ad8ef9"}, - {file = "wrapt-1.17.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f917c1180fdb8623c2b75a99192f4025e412597c50b2ac870f156de8fb101119"}, - {file = "wrapt-1.17.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ecc840861360ba9d176d413a5489b9a0aff6d6303d7e733e2c4623cfa26904a6"}, - {file = "wrapt-1.17.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb87745b2e6dc56361bfde481d5a378dc314b252a98d7dd19a651a3fa58f24a9"}, - {file = "wrapt-1.17.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:58455b79ec2661c3600e65c0a716955adc2410f7383755d537584b0de41b1d8a"}, - {file = "wrapt-1.17.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4e42a40a5e164cbfdb7b386c966a588b1047558a990981ace551ed7e12ca9c2"}, - {file = "wrapt-1.17.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:91bd7d1773e64019f9288b7a5101f3ae50d3d8e6b1de7edee9c2ccc1d32f0c0a"}, - {file = "wrapt-1.17.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:bb90fb8bda722a1b9d48ac1e6c38f923ea757b3baf8ebd0c82e09c5c1a0e7a04"}, - {file = "wrapt-1.17.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:08e7ce672e35efa54c5024936e559469436f8b8096253404faeb54d2a878416f"}, - {file = "wrapt-1.17.2-cp38-cp38-win32.whl", hash = "sha256:410a92fefd2e0e10d26210e1dfb4a876ddaf8439ef60d6434f21ef8d87efc5b7"}, - {file = "wrapt-1.17.2-cp38-cp38-win_amd64.whl", hash = "sha256:95c658736ec15602da0ed73f312d410117723914a5c91a14ee4cdd72f1d790b3"}, - {file = "wrapt-1.17.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:99039fa9e6306880572915728d7f6c24a86ec57b0a83f6b2491e1d8ab0235b9a"}, - {file = "wrapt-1.17.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2696993ee1eebd20b8e4ee4356483c4cb696066ddc24bd70bcbb80fa56ff9061"}, - {file 
= "wrapt-1.17.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:612dff5db80beef9e649c6d803a8d50c409082f1fedc9dbcdfde2983b2025b82"}, - {file = "wrapt-1.17.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:62c2caa1585c82b3f7a7ab56afef7b3602021d6da34fbc1cf234ff139fed3cd9"}, - {file = "wrapt-1.17.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c958bcfd59bacc2d0249dcfe575e71da54f9dcf4a8bdf89c4cb9a68a1170d73f"}, - {file = "wrapt-1.17.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc78a84e2dfbc27afe4b2bd7c80c8db9bca75cc5b85df52bfe634596a1da846b"}, - {file = "wrapt-1.17.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:ba0f0eb61ef00ea10e00eb53a9129501f52385c44853dbd6c4ad3f403603083f"}, - {file = "wrapt-1.17.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1e1fe0e6ab7775fd842bc39e86f6dcfc4507ab0ffe206093e76d61cde37225c8"}, - {file = "wrapt-1.17.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c86563182421896d73858e08e1db93afdd2b947a70064b813d515d66549e15f9"}, - {file = "wrapt-1.17.2-cp39-cp39-win32.whl", hash = "sha256:f393cda562f79828f38a819f4788641ac7c4085f30f1ce1a68672baa686482bb"}, - {file = "wrapt-1.17.2-cp39-cp39-win_amd64.whl", hash = "sha256:36ccae62f64235cf8ddb682073a60519426fdd4725524ae38874adf72b5f2aeb"}, - {file = "wrapt-1.17.2-py3-none-any.whl", hash = "sha256:b18f2d1533a71f069c7f82d524a52599053d4c7166e9dd374ae2136b7f40f7c8"}, - {file = "wrapt-1.17.2.tar.gz", hash = "sha256:41388e9d4d1522446fe79d3213196bd9e3b301a336965b9e27ca2788ebd122f3"}, +groups = ["main"] +files = [ + {file = "wrapt-2.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:64b103acdaa53b7caf409e8d45d39a8442fe6dcfec6ba3f3d141e0cc2b5b4dbd"}, + {file = "wrapt-2.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:91bcc576260a274b169c3098e9a3519fb01f2989f6d3d386ef9cbf8653de1374"}, + {file = "wrapt-2.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ab594f346517010050126fcd822697b25a7031d815bb4fbc238ccbe568216489"}, + {file = "wrapt-2.0.1-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:36982b26f190f4d737f04a492a68accbfc6fa042c3f42326fdfbb6c5b7a20a31"}, + {file = "wrapt-2.0.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:23097ed8bc4c93b7bf36fa2113c6c733c976316ce0ee2c816f64ca06102034ef"}, + {file = "wrapt-2.0.1-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8bacfe6e001749a3b64db47bcf0341da757c95959f592823a93931a422395013"}, + {file = "wrapt-2.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8ec3303e8a81932171f455f792f8df500fc1a09f20069e5c16bd7049ab4e8e38"}, + {file = "wrapt-2.0.1-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:3f373a4ab5dbc528a94334f9fe444395b23c2f5332adab9ff4ea82f5a9e33bc1"}, + {file = "wrapt-2.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f49027b0b9503bf6c8cdc297ca55006b80c2f5dd36cecc72c6835ab6e10e8a25"}, + {file = "wrapt-2.0.1-cp310-cp310-win32.whl", hash = "sha256:8330b42d769965e96e01fa14034b28a2a7600fbf7e8f0cc90ebb36d492c993e4"}, + {file = "wrapt-2.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:1218573502a8235bb8a7ecaed12736213b22dcde9feab115fa2989d42b5ded45"}, + {file = "wrapt-2.0.1-cp310-cp310-win_arm64.whl", hash = "sha256:eda8e4ecd662d48c28bb86be9e837c13e45c58b8300e43ba3c9b4fa9900302f7"}, + {file = "wrapt-2.0.1-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:0e17283f533a0d24d6e5429a7d11f250a58d28b4ae5186f8f47853e3e70d2590"}, + {file = "wrapt-2.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:85df8d92158cb8f3965aecc27cf821461bb5f40b450b03facc5d9f0d4d6ddec6"}, + {file = "wrapt-2.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c1be685ac7700c966b8610ccc63c3187a72e33cab53526a27b2a285a662cd4f7"}, + {file = "wrapt-2.0.1-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:df0b6d3b95932809c5b3fecc18fda0f1e07452d05e2662a0b35548985f256e28"}, + {file = "wrapt-2.0.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4da7384b0e5d4cae05c97cd6f94faaf78cc8b0f791fc63af43436d98c4ab37bb"}, + {file = "wrapt-2.0.1-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:ec65a78fbd9d6f083a15d7613b2800d5663dbb6bb96003899c834beaa68b242c"}, + {file = "wrapt-2.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7de3cc939be0e1174969f943f3b44e0d79b6f9a82198133a5b7fc6cc92882f16"}, + {file = "wrapt-2.0.1-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:fb1a5b72cbd751813adc02ef01ada0b0d05d3dcbc32976ce189a1279d80ad4a2"}, + {file = "wrapt-2.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3fa272ca34332581e00bf7773e993d4f632594eb2d1b0b162a9038df0fd971dd"}, + {file = "wrapt-2.0.1-cp311-cp311-win32.whl", hash = "sha256:fc007fdf480c77301ab1afdbb6ab22a5deee8885f3b1ed7afcb7e5e84a0e27be"}, + {file = "wrapt-2.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:47434236c396d04875180171ee1f3815ca1eada05e24a1ee99546320d54d1d1b"}, + {file = "wrapt-2.0.1-cp311-cp311-win_arm64.whl", hash = "sha256:837e31620e06b16030b1d126ed78e9383815cbac914693f54926d816d35d8edf"}, + {file = "wrapt-2.0.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1fdbb34da15450f2b1d735a0e969c24bdb8d8924892380126e2a293d9902078c"}, + {file = "wrapt-2.0.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3d32794fe940b7000f0519904e247f902f0149edbe6316c710a8562fb6738841"}, + {file = "wrapt-2.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:386fb54d9cd903ee0012c09291336469eb7b244f7183d40dc3e86a16a4bace62"}, + {file = "wrapt-2.0.1-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:7b219cb2182f230676308cdcacd428fa837987b89e4b7c5c9025088b8a6c9faf"}, + {file = "wrapt-2.0.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:641e94e789b5f6b4822bb8d8ebbdfc10f4e4eae7756d648b717d980f657a9eb9"}, + {file = "wrapt-2.0.1-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fe21b118b9f58859b5ebaa4b130dee18669df4bd111daad082b7beb8799ad16b"}, + {file = "wrapt-2.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:17fb85fa4abc26a5184d93b3efd2dcc14deb4b09edcdb3535a536ad34f0b4dba"}, + {file = "wrapt-2.0.1-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:b89ef9223d665ab255ae42cc282d27d69704d94be0deffc8b9d919179a609684"}, + {file = "wrapt-2.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a453257f19c31b31ba593c30d997d6e5be39e3b5ad9148c2af5a7314061c63eb"}, + {file = "wrapt-2.0.1-cp312-cp312-win32.whl", hash = "sha256:3e271346f01e9c8b1130a6a3b0e11908049fe5be2d365a5f402778049147e7e9"}, + {file = "wrapt-2.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:2da620b31a90cdefa9cd0c2b661882329e2e19d1d7b9b920189956b76c564d75"}, + {file = "wrapt-2.0.1-cp312-cp312-win_arm64.whl", hash = 
"sha256:aea9c7224c302bc8bfc892b908537f56c430802560e827b75ecbde81b604598b"}, + {file = "wrapt-2.0.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:47b0f8bafe90f7736151f61482c583c86b0693d80f075a58701dd1549b0010a9"}, + {file = "wrapt-2.0.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:cbeb0971e13b4bd81d34169ed57a6dda017328d1a22b62fda45e1d21dd06148f"}, + {file = "wrapt-2.0.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:eb7cffe572ad0a141a7886a1d2efa5bef0bf7fe021deeea76b3ab334d2c38218"}, + {file = "wrapt-2.0.1-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c8d60527d1ecfc131426b10d93ab5d53e08a09c5fa0175f6b21b3252080c70a9"}, + {file = "wrapt-2.0.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c654eafb01afac55246053d67a4b9a984a3567c3808bb7df2f8de1c1caba2e1c"}, + {file = "wrapt-2.0.1-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:98d873ed6c8b4ee2418f7afce666751854d6d03e3c0ec2a399bb039cd2ae89db"}, + {file = "wrapt-2.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c9e850f5b7fc67af856ff054c71690d54fa940c3ef74209ad9f935b4f66a0233"}, + {file = "wrapt-2.0.1-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:e505629359cb5f751e16e30cf3f91a1d3ddb4552480c205947da415d597f7ac2"}, + {file = "wrapt-2.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:2879af909312d0baf35f08edeea918ee3af7ab57c37fe47cb6a373c9f2749c7b"}, + {file = "wrapt-2.0.1-cp313-cp313-win32.whl", hash = "sha256:d67956c676be5a24102c7407a71f4126d30de2a569a1c7871c9f3cabc94225d7"}, + {file = "wrapt-2.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:9ca66b38dd642bf90c59b6738af8070747b610115a39af2498535f62b5cdc1c3"}, + {file = "wrapt-2.0.1-cp313-cp313-win_arm64.whl", hash = "sha256:5a4939eae35db6b6cec8e7aa0e833dcca0acad8231672c26c2a9ab7a0f8ac9c8"}, + {file = "wrapt-2.0.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:a52f93d95c8d38fed0669da2ebdb0b0376e895d84596a976c15a9eb45e3eccb3"}, + {file = "wrapt-2.0.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4e54bbf554ee29fcceee24fa41c4d091398b911da6e7f5d7bffda963c9aed2e1"}, + {file = "wrapt-2.0.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:908f8c6c71557f4deaa280f55d0728c3bca0960e8c3dd5ceeeafb3c19942719d"}, + {file = "wrapt-2.0.1-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:e2f84e9af2060e3904a32cea9bb6db23ce3f91cfd90c6b426757cf7cc01c45c7"}, + {file = "wrapt-2.0.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e3612dc06b436968dfb9142c62e5dfa9eb5924f91120b3c8ff501ad878f90eb3"}, + {file = "wrapt-2.0.1-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6d2d947d266d99a1477cd005b23cbd09465276e302515e122df56bb9511aca1b"}, + {file = "wrapt-2.0.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:7d539241e87b650cbc4c3ac9f32c8d1ac8a54e510f6dca3f6ab60dcfd48c9b10"}, + {file = "wrapt-2.0.1-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:4811e15d88ee62dbf5c77f2c3ff3932b1e3ac92323ba3912f51fc4016ce81ecf"}, + {file = "wrapt-2.0.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c1c91405fcf1d501fa5d55df21e58ea49e6b879ae829f1039faaf7e5e509b41e"}, + {file = "wrapt-2.0.1-cp313-cp313t-win32.whl", hash = "sha256:e76e3f91f864e89db8b8d2a8311d57df93f01ad6bb1e9b9976d1f2e83e18315c"}, + {file = "wrapt-2.0.1-cp313-cp313t-win_amd64.whl", hash = 
"sha256:83ce30937f0ba0d28818807b303a412440c4b63e39d3d8fc036a94764b728c92"}, + {file = "wrapt-2.0.1-cp313-cp313t-win_arm64.whl", hash = "sha256:4b55cacc57e1dc2d0991dbe74c6419ffd415fb66474a02335cb10efd1aa3f84f"}, + {file = "wrapt-2.0.1-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:5e53b428f65ece6d9dad23cb87e64506392b720a0b45076c05354d27a13351a1"}, + {file = "wrapt-2.0.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:ad3ee9d0f254851c71780966eb417ef8e72117155cff04821ab9b60549694a55"}, + {file = "wrapt-2.0.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:d7b822c61ed04ee6ad64bc90d13368ad6eb094db54883b5dde2182f67a7f22c0"}, + {file = "wrapt-2.0.1-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:7164a55f5e83a9a0b031d3ffab4d4e36bbec42e7025db560f225489fa929e509"}, + {file = "wrapt-2.0.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e60690ba71a57424c8d9ff28f8d006b7ad7772c22a4af432188572cd7fa004a1"}, + {file = "wrapt-2.0.1-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3cd1a4bd9a7a619922a8557e1318232e7269b5fb69d4ba97b04d20450a6bf970"}, + {file = "wrapt-2.0.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b4c2e3d777e38e913b8ce3a6257af72fb608f86a1df471cb1d4339755d0a807c"}, + {file = "wrapt-2.0.1-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:3d366aa598d69416b5afedf1faa539fac40c1d80a42f6b236c88c73a3c8f2d41"}, + {file = "wrapt-2.0.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:c235095d6d090aa903f1db61f892fffb779c1eaeb2a50e566b52001f7a0f66ed"}, + {file = "wrapt-2.0.1-cp314-cp314-win32.whl", hash = "sha256:bfb5539005259f8127ea9c885bdc231978c06b7a980e63a8a61c8c4c979719d0"}, + {file = "wrapt-2.0.1-cp314-cp314-win_amd64.whl", hash = "sha256:4ae879acc449caa9ed43fc36ba08392b9412ee67941748d31d94e3cedb36628c"}, + {file = "wrapt-2.0.1-cp314-cp314-win_arm64.whl", hash = "sha256:8639b843c9efd84675f1e100ed9e99538ebea7297b62c4b45a7042edb84db03e"}, + {file = "wrapt-2.0.1-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:9219a1d946a9b32bb23ccae66bdb61e35c62773ce7ca6509ceea70f344656b7b"}, + {file = "wrapt-2.0.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:fa4184e74197af3adad3c889a1af95b53bb0466bced92ea99a0c014e48323eec"}, + {file = "wrapt-2.0.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c5ef2f2b8a53b7caee2f797ef166a390fef73979b15778a4a153e4b5fedce8fa"}, + {file = "wrapt-2.0.1-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:e042d653a4745be832d5aa190ff80ee4f02c34b21f4b785745eceacd0907b815"}, + {file = "wrapt-2.0.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2afa23318136709c4b23d87d543b425c399887b4057936cd20386d5b1422b6fa"}, + {file = "wrapt-2.0.1-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6c72328f668cf4c503ffcf9434c2b71fdd624345ced7941bc6693e61bbe36bef"}, + {file = "wrapt-2.0.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:3793ac154afb0e5b45d1233cb94d354ef7a983708cc3bb12563853b1d8d53747"}, + {file = "wrapt-2.0.1-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:fec0d993ecba3991645b4857837277469c8cc4c554a7e24d064d1ca291cfb81f"}, + {file = "wrapt-2.0.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:949520bccc1fa227274da7d03bf238be15389cd94e32e4297b92337df9b7a349"}, + {file = "wrapt-2.0.1-cp314-cp314t-win32.whl", hash = 
"sha256:be9e84e91d6497ba62594158d3d31ec0486c60055c49179edc51ee43d095f79c"}, + {file = "wrapt-2.0.1-cp314-cp314t-win_amd64.whl", hash = "sha256:61c4956171c7434634401db448371277d07032a81cc21c599c22953374781395"}, + {file = "wrapt-2.0.1-cp314-cp314t-win_arm64.whl", hash = "sha256:35cdbd478607036fee40273be8ed54a451f5f23121bd9d4be515158f9498f7ad"}, + {file = "wrapt-2.0.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:90897ea1cf0679763b62e79657958cd54eae5659f6360fc7d2ccc6f906342183"}, + {file = "wrapt-2.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:50844efc8cdf63b2d90cd3d62d4947a28311e6266ce5235a219d21b195b4ec2c"}, + {file = "wrapt-2.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:49989061a9977a8cbd6d20f2efa813f24bf657c6990a42967019ce779a878dbf"}, + {file = "wrapt-2.0.1-cp38-cp38-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:09c7476ab884b74dce081ad9bfd07fe5822d8600abade571cb1f66d5fc915af6"}, + {file = "wrapt-2.0.1-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d1a8a09a004ef100e614beec82862d11fc17d601092c3599afd22b1f36e4137e"}, + {file = "wrapt-2.0.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:89a82053b193837bf93c0f8a57ded6e4b6d88033a499dadff5067e912c2a41e9"}, + {file = "wrapt-2.0.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:f26f8e2ca19564e2e1fdbb6a0e47f36e0efbab1acc31e15471fad88f828c75f6"}, + {file = "wrapt-2.0.1-cp38-cp38-win32.whl", hash = "sha256:115cae4beed3542e37866469a8a1f2b9ec549b4463572b000611e9946b86e6f6"}, + {file = "wrapt-2.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:c4012a2bd37059d04f8209916aa771dfb564cccb86079072bdcd48a308b6a5c5"}, + {file = "wrapt-2.0.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:68424221a2dc00d634b54f92441914929c5ffb1c30b3b837343978343a3512a3"}, + {file = "wrapt-2.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6bd1a18f5a797fe740cb3d7a0e853a8ce6461cc62023b630caec80171a6b8097"}, + {file = "wrapt-2.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fb3a86e703868561c5cad155a15c36c716e1ab513b7065bd2ac8ed353c503333"}, + {file = "wrapt-2.0.1-cp39-cp39-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5dc1b852337c6792aa111ca8becff5bacf576bf4a0255b0f05eb749da6a1643e"}, + {file = "wrapt-2.0.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c046781d422f0830de6329fa4b16796096f28a92c8aef3850674442cdcb87b7f"}, + {file = "wrapt-2.0.1-cp39-cp39-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f73f9f7a0ebd0db139253d27e5fc8d2866ceaeef19c30ab5d69dcbe35e1a6981"}, + {file = "wrapt-2.0.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:b667189cf8efe008f55bbda321890bef628a67ab4147ebf90d182f2dadc78790"}, + {file = "wrapt-2.0.1-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:a9a83618c4f0757557c077ef71d708ddd9847ed66b7cc63416632af70d3e2308"}, + {file = "wrapt-2.0.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1e9b121e9aeb15df416c2c960b8255a49d44b4038016ee17af03975992d03931"}, + {file = "wrapt-2.0.1-cp39-cp39-win32.whl", hash = "sha256:1f186e26ea0a55f809f232e92cc8556a0977e00183c3ebda039a807a42be1494"}, + {file = "wrapt-2.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:bf4cb76f36be5de950ce13e22e7fdf462b35b04665a12b64f3ac5c1bbbcf3728"}, + {file = "wrapt-2.0.1-cp39-cp39-win_arm64.whl", hash = "sha256:d6cc985b9c8b235bd933990cdbf0f891f8e010b65a3911f7a55179cd7b0fc57b"}, + {file = "wrapt-2.0.1-py3-none-any.whl", hash = 
"sha256:4d2ce1bf1a48c5277d7969259232b57645aae5686dba1eaeade39442277afbca"}, + {file = "wrapt-2.0.1.tar.gz", hash = "sha256:9c9c635e78497cacb81e84f8b11b23e0aacac7a136e73b8e5b2109a1d9fc468f"}, ] +[package.extras] +dev = ["pytest", "setuptools"] + [[package]] name = "xgboost" version = "2.1.4" description = "XGBoost Python Package" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "xgboost-2.1.4-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl", hash = "sha256:78d88da184562deff25c820d943420342014dd55e0f4c017cc4563c2148df5ee"}, {file = "xgboost-2.1.4-py3-none-macosx_12_0_arm64.whl", hash = "sha256:523db01d4e74b05c61a985028bde88a4dd380eadc97209310621996d7d5d14a7"}, @@ -6601,117 +7901,294 @@ plotting = ["graphviz", "matplotlib"] pyspark = ["cloudpickle", "pyspark", "scikit-learn"] scikit-learn = ["scikit-learn"] +[[package]] +name = "xxhash" +version = "3.6.0" +description = "Python binding for xxHash" +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "xxhash-3.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:87ff03d7e35c61435976554477a7f4cd1704c3596a89a8300d5ce7fc83874a71"}, + {file = "xxhash-3.6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f572dfd3d0e2eb1a57511831cf6341242f5a9f8298a45862d085f5b93394a27d"}, + {file = "xxhash-3.6.0-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:89952ea539566b9fed2bbd94e589672794b4286f342254fad28b149f9615fef8"}, + {file = "xxhash-3.6.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:48e6f2ffb07a50b52465a1032c3cf1f4a5683f944acaca8a134a2f23674c2058"}, + {file = "xxhash-3.6.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b5b848ad6c16d308c3ac7ad4ba6bede80ed5df2ba8ed382f8932df63158dd4b2"}, + {file = "xxhash-3.6.0-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a034590a727b44dd8ac5914236a7b8504144447a9682586c3327e935f33ec8cc"}, + {file = "xxhash-3.6.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8a8f1972e75ebdd161d7896743122834fe87378160c20e97f8b09166213bf8cc"}, + {file = "xxhash-3.6.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ee34327b187f002a596d7b167ebc59a1b729e963ce645964bbc050d2f1b73d07"}, + {file = "xxhash-3.6.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:339f518c3c7a850dd033ab416ea25a692759dc7478a71131fe8869010d2b75e4"}, + {file = "xxhash-3.6.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:bf48889c9630542d4709192578aebbd836177c9f7a4a2778a7d6340107c65f06"}, + {file = "xxhash-3.6.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:5576b002a56207f640636056b4160a378fe36a58db73ae5c27a7ec8db35f71d4"}, + {file = "xxhash-3.6.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:af1f3278bd02814d6dedc5dec397993b549d6f16c19379721e5a1d31e132c49b"}, + {file = "xxhash-3.6.0-cp310-cp310-win32.whl", hash = "sha256:aed058764db109dc9052720da65fafe84873b05eb8b07e5e653597951af57c3b"}, + {file = "xxhash-3.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:e82da5670f2d0d98950317f82a0e4a0197150ff19a6df2ba40399c2a3b9ae5fb"}, + {file = "xxhash-3.6.0-cp310-cp310-win_arm64.whl", hash = "sha256:4a082ffff8c6ac07707fb6b671caf7c6e020c75226c561830b73d862060f281d"}, + {file = "xxhash-3.6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b47bbd8cf2d72797f3c2772eaaac0ded3d3af26481a26d7d7d41dc2d3c46b04a"}, + {file = 
"xxhash-3.6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2b6821e94346f96db75abaa6e255706fb06ebd530899ed76d32cd99f20dc52fa"}, + {file = "xxhash-3.6.0-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d0a9751f71a1a65ce3584e9cae4467651c7e70c9d31017fa57574583a4540248"}, + {file = "xxhash-3.6.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b29ee68625ab37b04c0b40c3fafdf24d2f75ccd778333cfb698f65f6c463f62"}, + {file = "xxhash-3.6.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6812c25fe0d6c36a46ccb002f40f27ac903bf18af9f6dd8f9669cb4d176ab18f"}, + {file = "xxhash-3.6.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4ccbff013972390b51a18ef1255ef5ac125c92dc9143b2d1909f59abc765540e"}, + {file = "xxhash-3.6.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:297b7fbf86c82c550e12e8fb71968b3f033d27b874276ba3624ea868c11165a8"}, + {file = "xxhash-3.6.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dea26ae1eb293db089798d3973a5fc928a18fdd97cc8801226fae705b02b14b0"}, + {file = "xxhash-3.6.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:7a0b169aafb98f4284f73635a8e93f0735f9cbde17bd5ec332480484241aaa77"}, + {file = "xxhash-3.6.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:08d45aef063a4531b785cd72de4887766d01dc8f362a515693df349fdb825e0c"}, + {file = "xxhash-3.6.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:929142361a48ee07f09121fe9e96a84950e8d4df3bb298ca5d88061969f34d7b"}, + {file = "xxhash-3.6.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:51312c768403d8540487dbbfb557454cfc55589bbde6424456951f7fcd4facb3"}, + {file = "xxhash-3.6.0-cp311-cp311-win32.whl", hash = "sha256:d1927a69feddc24c987b337ce81ac15c4720955b667fe9b588e02254b80446fd"}, + {file = "xxhash-3.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:26734cdc2d4ffe449b41d186bbeac416f704a482ed835d375a5c0cb02bc63fef"}, + {file = "xxhash-3.6.0-cp311-cp311-win_arm64.whl", hash = "sha256:d72f67ef8bf36e05f5b6c65e8524f265bd61071471cd4cf1d36743ebeeeb06b7"}, + {file = "xxhash-3.6.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:01362c4331775398e7bb34e3ab403bc9ee9f7c497bc7dee6272114055277dd3c"}, + {file = "xxhash-3.6.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b7b2df81a23f8cb99656378e72501b2cb41b1827c0f5a86f87d6b06b69f9f204"}, + {file = "xxhash-3.6.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:dc94790144e66b14f67b10ac8ed75b39ca47536bf8800eb7c24b50271ea0c490"}, + {file = "xxhash-3.6.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:93f107c673bccf0d592cdba077dedaf52fe7f42dcd7676eba1f6d6f0c3efffd2"}, + {file = "xxhash-3.6.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2aa5ee3444c25b69813663c9f8067dcfaa2e126dc55e8dddf40f4d1c25d7effa"}, + {file = "xxhash-3.6.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f7f99123f0e1194fa59cc69ad46dbae2e07becec5df50a0509a808f90a0f03f0"}, + {file = "xxhash-3.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:49e03e6fe2cac4a1bc64952dd250cf0dbc5ef4ebb7b8d96bce82e2de163c82a2"}, + {file = "xxhash-3.6.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:bd17fede52a17a4f9a7bc4472a5867cb0b160deeb431795c0e4abe158bc784e9"}, + {file = "xxhash-3.6.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:6fb5f5476bef678f69db04f2bd1efbed3030d2aba305b0fc1773645f187d6a4e"}, + {file = "xxhash-3.6.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:843b52f6d88071f87eba1631b684fcb4b2068cd2180a0224122fe4ef011a9374"}, + {file = "xxhash-3.6.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7d14a6cfaf03b1b6f5f9790f76880601ccc7896aff7ab9cd8978a939c1eb7e0d"}, + {file = "xxhash-3.6.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:418daf3db71e1413cfe211c2f9a528456936645c17f46b5204705581a45390ae"}, + {file = "xxhash-3.6.0-cp312-cp312-win32.whl", hash = "sha256:50fc255f39428a27299c20e280d6193d8b63b8ef8028995323bf834a026b4fbb"}, + {file = "xxhash-3.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:c0f2ab8c715630565ab8991b536ecded9416d615538be8ecddce43ccf26cbc7c"}, + {file = "xxhash-3.6.0-cp312-cp312-win_arm64.whl", hash = "sha256:eae5c13f3bc455a3bbb68bdc513912dc7356de7e2280363ea235f71f54064829"}, + {file = "xxhash-3.6.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:599e64ba7f67472481ceb6ee80fa3bd828fd61ba59fb11475572cc5ee52b89ec"}, + {file = "xxhash-3.6.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7d8b8aaa30fca4f16f0c84a5c8d7ddee0e25250ec2796c973775373257dde8f1"}, + {file = "xxhash-3.6.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d597acf8506d6e7101a4a44a5e428977a51c0fadbbfd3c39650cca9253f6e5a6"}, + {file = "xxhash-3.6.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:858dc935963a33bc33490128edc1c12b0c14d9c7ebaa4e387a7869ecc4f3e263"}, + {file = "xxhash-3.6.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ba284920194615cb8edf73bf52236ce2e1664ccd4a38fdb543506413529cc546"}, + {file = "xxhash-3.6.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4b54219177f6c6674d5378bd862c6aedf64725f70dd29c472eaae154df1a2e89"}, + {file = "xxhash-3.6.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:42c36dd7dbad2f5238950c377fcbf6811b1cdb1c444fab447960030cea60504d"}, + {file = "xxhash-3.6.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f22927652cba98c44639ffdc7aaf35828dccf679b10b31c4ad72a5b530a18eb7"}, + {file = "xxhash-3.6.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b45fad44d9c5c119e9c6fbf2e1c656a46dc68e280275007bbfd3d572b21426db"}, + {file = "xxhash-3.6.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:6f2580ffab1a8b68ef2b901cde7e55fa8da5e4be0977c68f78fc80f3c143de42"}, + {file = "xxhash-3.6.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:40c391dd3cd041ebc3ffe6f2c862f402e306eb571422e0aa918d8070ba31da11"}, + {file = "xxhash-3.6.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f205badabde7aafd1a31e8ca2a3e5a763107a71c397c4481d6a804eb5063d8bd"}, + {file = "xxhash-3.6.0-cp313-cp313-win32.whl", hash = "sha256:2577b276e060b73b73a53042ea5bd5203d3e6347ce0d09f98500f418a9fcf799"}, + {file = "xxhash-3.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:757320d45d2fbcce8f30c42a6b2f47862967aea7bf458b9625b4bbe7ee390392"}, + {file = "xxhash-3.6.0-cp313-cp313-win_arm64.whl", hash = "sha256:457b8f85dec5825eed7b69c11ae86834a018b8e3df5e77783c999663da2f96d6"}, + {file = "xxhash-3.6.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = 
"sha256:a42e633d75cdad6d625434e3468126c73f13f7584545a9cf34e883aa1710e702"}, + {file = "xxhash-3.6.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:568a6d743219e717b07b4e03b0a828ce593833e498c3b64752e0f5df6bfe84db"}, + {file = "xxhash-3.6.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:bec91b562d8012dae276af8025a55811b875baace6af510412a5e58e3121bc54"}, + {file = "xxhash-3.6.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:78e7f2f4c521c30ad5e786fdd6bae89d47a32672a80195467b5de0480aa97b1f"}, + {file = "xxhash-3.6.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3ed0df1b11a79856df5ffcab572cbd6b9627034c1c748c5566fa79df9048a7c5"}, + {file = "xxhash-3.6.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0e4edbfc7d420925b0dd5e792478ed393d6e75ff8fc219a6546fb446b6a417b1"}, + {file = "xxhash-3.6.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fba27a198363a7ef87f8c0f6b171ec36b674fe9053742c58dd7e3201c1ab30ee"}, + {file = "xxhash-3.6.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:794fe9145fe60191c6532fa95063765529770edcdd67b3d537793e8004cabbfd"}, + {file = "xxhash-3.6.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:6105ef7e62b5ac73a837778efc331a591d8442f8ef5c7e102376506cb4ae2729"}, + {file = "xxhash-3.6.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:f01375c0e55395b814a679b3eea205db7919ac2af213f4a6682e01220e5fe292"}, + {file = "xxhash-3.6.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:d706dca2d24d834a4661619dcacf51a75c16d65985718d6a7d73c1eeeb903ddf"}, + {file = "xxhash-3.6.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5f059d9faeacd49c0215d66f4056e1326c80503f51a1532ca336a385edadd033"}, + {file = "xxhash-3.6.0-cp313-cp313t-win32.whl", hash = "sha256:1244460adc3a9be84731d72b8e80625788e5815b68da3da8b83f78115a40a7ec"}, + {file = "xxhash-3.6.0-cp313-cp313t-win_amd64.whl", hash = "sha256:b1e420ef35c503869c4064f4a2f2b08ad6431ab7b229a05cce39d74268bca6b8"}, + {file = "xxhash-3.6.0-cp313-cp313t-win_arm64.whl", hash = "sha256:ec44b73a4220623235f67a996c862049f375df3b1052d9899f40a6382c32d746"}, + {file = "xxhash-3.6.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a40a3d35b204b7cc7643cbcf8c9976d818cb47befcfac8bbefec8038ac363f3e"}, + {file = "xxhash-3.6.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a54844be970d3fc22630b32d515e79a90d0a3ddb2644d8d7402e3c4c8da61405"}, + {file = "xxhash-3.6.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:016e9190af8f0a4e3741343777710e3d5717427f175adfdc3e72508f59e2a7f3"}, + {file = "xxhash-3.6.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4f6f72232f849eb9d0141e2ebe2677ece15adfd0fa599bc058aad83c714bb2c6"}, + {file = "xxhash-3.6.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:63275a8aba7865e44b1813d2177e0f5ea7eadad3dd063a21f7cf9afdc7054063"}, + {file = "xxhash-3.6.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3cd01fa2aa00d8b017c97eb46b9a794fbdca53fc14f845f5a328c71254b0abb7"}, + {file = "xxhash-3.6.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0226aa89035b62b6a86d3c68df4d7c1f47a342b8683da2b60cedcddb46c4d95b"}, + {file = 
"xxhash-3.6.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c6e193e9f56e4ca4923c61238cdaced324f0feac782544eb4c6d55ad5cc99ddd"}, + {file = "xxhash-3.6.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:9176dcaddf4ca963d4deb93866d739a343c01c969231dbe21680e13a5d1a5bf0"}, + {file = "xxhash-3.6.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:c1ce4009c97a752e682b897aa99aef84191077a9433eb237774689f14f8ec152"}, + {file = "xxhash-3.6.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:8cb2f4f679b01513b7adbb9b1b2f0f9cdc31b70007eaf9d59d0878809f385b11"}, + {file = "xxhash-3.6.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:653a91d7c2ab54a92c19ccf43508b6a555440b9be1bc8be553376778be7f20b5"}, + {file = "xxhash-3.6.0-cp314-cp314-win32.whl", hash = "sha256:a756fe893389483ee8c394d06b5ab765d96e68fbbfe6fde7aa17e11f5720559f"}, + {file = "xxhash-3.6.0-cp314-cp314-win_amd64.whl", hash = "sha256:39be8e4e142550ef69629c9cd71b88c90e9a5db703fecbcf265546d9536ca4ad"}, + {file = "xxhash-3.6.0-cp314-cp314-win_arm64.whl", hash = "sha256:25915e6000338999236f1eb68a02a32c3275ac338628a7eaa5a269c401995679"}, + {file = "xxhash-3.6.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c5294f596a9017ca5a3e3f8884c00b91ab2ad2933cf288f4923c3fd4346cf3d4"}, + {file = "xxhash-3.6.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1cf9dcc4ab9cff01dfbba78544297a3a01dafd60f3bde4e2bfd016cf7e4ddc67"}, + {file = "xxhash-3.6.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:01262da8798422d0685f7cef03b2bd3f4f46511b02830861df548d7def4402ad"}, + {file = "xxhash-3.6.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:51a73fb7cb3a3ead9f7a8b583ffd9b8038e277cdb8cb87cf890e88b3456afa0b"}, + {file = "xxhash-3.6.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b9c6df83594f7df8f7f708ce5ebeacfc69f72c9fbaaababf6cf4758eaada0c9b"}, + {file = "xxhash-3.6.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:627f0af069b0ea56f312fd5189001c24578868643203bca1abbc2c52d3a6f3ca"}, + {file = "xxhash-3.6.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:aa912c62f842dfd013c5f21a642c9c10cd9f4c4e943e0af83618b4a404d9091a"}, + {file = "xxhash-3.6.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:b465afd7909db30168ab62afe40b2fcf79eedc0b89a6c0ab3123515dc0df8b99"}, + {file = "xxhash-3.6.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:a881851cf38b0a70e7c4d3ce81fc7afd86fbc2a024f4cfb2a97cf49ce04b75d3"}, + {file = "xxhash-3.6.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:9b3222c686a919a0f3253cfc12bb118b8b103506612253b5baeaac10d8027cf6"}, + {file = "xxhash-3.6.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:c5aa639bc113e9286137cec8fadc20e9cd732b2cc385c0b7fa673b84fc1f2a93"}, + {file = "xxhash-3.6.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5c1343d49ac102799905e115aee590183c3921d475356cb24b4de29a4bc56518"}, + {file = "xxhash-3.6.0-cp314-cp314t-win32.whl", hash = "sha256:5851f033c3030dd95c086b4a36a2683c2ff4a799b23af60977188b057e467119"}, + {file = "xxhash-3.6.0-cp314-cp314t-win_amd64.whl", hash = "sha256:0444e7967dac37569052d2409b00a8860c2135cff05502df4da80267d384849f"}, + {file = "xxhash-3.6.0-cp314-cp314t-win_arm64.whl", hash = "sha256:bb79b1e63f6fd84ec778a4b1916dfe0a7c3fdb986c06addd5db3a0d413819d95"}, + {file = "xxhash-3.6.0-cp38-cp38-macosx_10_9_x86_64.whl", 
hash = "sha256:7dac94fad14a3d1c92affb661021e1d5cbcf3876be5f5b4d90730775ccb7ac41"}, + {file = "xxhash-3.6.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6965e0e90f1f0e6cb78da568c13d4a348eeb7f40acfd6d43690a666a459458b8"}, + {file = "xxhash-3.6.0-cp38-cp38-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:2ab89a6b80f22214b43d98693c30da66af910c04f9858dd39c8e570749593d7e"}, + {file = "xxhash-3.6.0-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4903530e866b7a9c1eadfd3fa2fbe1b97d3aed4739a80abf506eb9318561c850"}, + {file = "xxhash-3.6.0-cp38-cp38-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4da8168ae52c01ac64c511d6f4a709479da8b7a4a1d7621ed51652f93747dffa"}, + {file = "xxhash-3.6.0-cp38-cp38-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:97460eec202017f719e839a0d3551fbc0b2fcc9c6c6ffaa5af85bbd5de432788"}, + {file = "xxhash-3.6.0-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:45aae0c9df92e7fa46fbb738737324a563c727990755ec1965a6a339ea10a1df"}, + {file = "xxhash-3.6.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:0d50101e57aad86f4344ca9b32d091a2135a9d0a4396f19133426c88025b09f1"}, + {file = "xxhash-3.6.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:9085e798c163ce310d91f8aa6b325dda3c2944c93c6ce1edb314030d4167cc65"}, + {file = "xxhash-3.6.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:a87f271a33fad0e5bf3be282be55d78df3a45ae457950deb5241998790326f87"}, + {file = "xxhash-3.6.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:9e040d3e762f84500961791fa3709ffa4784d4dcd7690afc655c095e02fff05f"}, + {file = "xxhash-3.6.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:b0359391c3dad6de872fefb0cf5b69d55b0655c55ee78b1bb7a568979b2ce96b"}, + {file = "xxhash-3.6.0-cp38-cp38-win32.whl", hash = "sha256:e4ff728a2894e7f436b9e94c667b0f426b9c74b71f900cf37d5468c6b5da0536"}, + {file = "xxhash-3.6.0-cp38-cp38-win_amd64.whl", hash = "sha256:01be0c5b500c5362871fc9cfdf58c69b3e5c4f531a82229ddb9eb1eb14138004"}, + {file = "xxhash-3.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:cc604dc06027dbeb8281aeac5899c35fcfe7c77b25212833709f0bff4ce74d2a"}, + {file = "xxhash-3.6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:277175a73900ad43a8caeb8b99b9604f21fe8d7c842f2f9061a364a7e220ddb7"}, + {file = "xxhash-3.6.0-cp39-cp39-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cfbc5b91397c8c2972fdac13fb3e4ed2f7f8ccac85cd2c644887557780a9b6e2"}, + {file = "xxhash-3.6.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2762bfff264c4e73c0e507274b40634ff465e025f0eaf050897e88ec8367575d"}, + {file = "xxhash-3.6.0-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2f171a900d59d51511209f7476933c34a0c2c711078d3c80e74e0fe4f38680ec"}, + {file = "xxhash-3.6.0-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:780b90c313348f030b811efc37b0fa1431163cb8db8064cf88a7936b6ce5f222"}, + {file = "xxhash-3.6.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18b242455eccdfcd1fa4134c431a30737d2b4f045770f8fe84356b3469d4b919"}, + {file = "xxhash-3.6.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:a75ffc1bd5def584129774c158e108e5d768e10b75813f2b32650bb041066ed6"}, + {file = 
"xxhash-3.6.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1fc1ed882d1e8df932a66e2999429ba6cc4d5172914c904ab193381fba825360"}, + {file = "xxhash-3.6.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:44e342e8cc11b4e79dae5c57f2fb6360c3c20cc57d32049af8f567f5b4bcb5f4"}, + {file = "xxhash-3.6.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:c2f9ccd5c4be370939a2e17602fbc49995299203da72a3429db013d44d590e86"}, + {file = "xxhash-3.6.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:02ea4cb627c76f48cd9fb37cf7ab22bd51e57e1b519807234b473faebe526796"}, + {file = "xxhash-3.6.0-cp39-cp39-win32.whl", hash = "sha256:6551880383f0e6971dc23e512c9ccc986147ce7bfa1cd2e4b520b876c53e9f3d"}, + {file = "xxhash-3.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:7c35c4cdc65f2a29f34425c446f2f5cdcd0e3c34158931e1cc927ece925ab802"}, + {file = "xxhash-3.6.0-cp39-cp39-win_arm64.whl", hash = "sha256:ffc578717a347baf25be8397cb10d2528802d24f94cfc005c0e44fef44b5cdd6"}, + {file = "xxhash-3.6.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0f7b7e2ec26c1666ad5fc9dbfa426a6a3367ceaf79db5dd76264659d509d73b0"}, + {file = "xxhash-3.6.0-pp311-pypy311_pp73-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5dc1e14d14fa0f5789ec29a7062004b5933964bb9b02aae6622b8f530dc40296"}, + {file = "xxhash-3.6.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:881b47fc47e051b37d94d13e7455131054b56749b91b508b0907eb07900d1c13"}, + {file = "xxhash-3.6.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c6dc31591899f5e5666f04cc2e529e69b4072827085c1ef15294d91a004bc1bd"}, + {file = "xxhash-3.6.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:15e0dac10eb9309508bfc41f7f9deaa7755c69e35af835db9cb10751adebc35d"}, + {file = "xxhash-3.6.0.tar.gz", hash = "sha256:f0162a78b13a0d7617b2845b90c763339d1f1d82bb04a4b07f4ab535cc5e05d6"}, +] + [[package]] name = "yarl" -version = "1.20.0" +version = "1.22.0" description = "Yet another URL library" optional = false python-versions = ">=3.9" -files = [ - {file = "yarl-1.20.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f1f6670b9ae3daedb325fa55fbe31c22c8228f6e0b513772c2e1c623caa6ab22"}, - {file = "yarl-1.20.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:85a231fa250dfa3308f3c7896cc007a47bc76e9e8e8595c20b7426cac4884c62"}, - {file = "yarl-1.20.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a06701b647c9939d7019acdfa7ebbfbb78ba6aa05985bb195ad716ea759a569"}, - {file = "yarl-1.20.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7595498d085becc8fb9203aa314b136ab0516c7abd97e7d74f7bb4eb95042abe"}, - {file = "yarl-1.20.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:af5607159085dcdb055d5678fc2d34949bd75ae6ea6b4381e784bbab1c3aa195"}, - {file = "yarl-1.20.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:95b50910e496567434cb77a577493c26bce0f31c8a305135f3bda6a2483b8e10"}, - {file = "yarl-1.20.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b594113a301ad537766b4e16a5a6750fcbb1497dcc1bc8a4daae889e6402a634"}, - {file = "yarl-1.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:083ce0393ea173cd37834eb84df15b6853b555d20c52703e21fbababa8c129d2"}, - {file = "yarl-1.20.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:4f1a350a652bbbe12f666109fbddfdf049b3ff43696d18c9ab1531fbba1c977a"}, - {file = "yarl-1.20.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:fb0caeac4a164aadce342f1597297ec0ce261ec4532bbc5a9ca8da5622f53867"}, - {file = "yarl-1.20.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:d88cc43e923f324203f6ec14434fa33b85c06d18d59c167a0637164863b8e995"}, - {file = "yarl-1.20.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e52d6ed9ea8fd3abf4031325dc714aed5afcbfa19ee4a89898d663c9976eb487"}, - {file = "yarl-1.20.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:ce360ae48a5e9961d0c730cf891d40698a82804e85f6e74658fb175207a77cb2"}, - {file = "yarl-1.20.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:06d06c9d5b5bc3eb56542ceeba6658d31f54cf401e8468512447834856fb0e61"}, - {file = "yarl-1.20.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c27d98f4e5c4060582f44e58309c1e55134880558f1add7a87c1bc36ecfade19"}, - {file = "yarl-1.20.0-cp310-cp310-win32.whl", hash = "sha256:f4d3fa9b9f013f7050326e165c3279e22850d02ae544ace285674cb6174b5d6d"}, - {file = "yarl-1.20.0-cp310-cp310-win_amd64.whl", hash = "sha256:bc906b636239631d42eb8a07df8359905da02704a868983265603887ed68c076"}, - {file = "yarl-1.20.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:fdb5204d17cb32b2de2d1e21c7461cabfacf17f3645e4b9039f210c5d3378bf3"}, - {file = "yarl-1.20.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:eaddd7804d8e77d67c28d154ae5fab203163bd0998769569861258e525039d2a"}, - {file = "yarl-1.20.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:634b7ba6b4a85cf67e9df7c13a7fb2e44fa37b5d34501038d174a63eaac25ee2"}, - {file = "yarl-1.20.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6d409e321e4addf7d97ee84162538c7258e53792eb7c6defd0c33647d754172e"}, - {file = "yarl-1.20.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:ea52f7328a36960ba3231c6677380fa67811b414798a6e071c7085c57b6d20a9"}, - {file = "yarl-1.20.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c8703517b924463994c344dcdf99a2d5ce9eca2b6882bb640aa555fb5efc706a"}, - {file = "yarl-1.20.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:077989b09ffd2f48fb2d8f6a86c5fef02f63ffe6b1dd4824c76de7bb01e4f2e2"}, - {file = "yarl-1.20.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0acfaf1da020253f3533526e8b7dd212838fdc4109959a2c53cafc6db611bff2"}, - {file = "yarl-1.20.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b4230ac0b97ec5eeb91d96b324d66060a43fd0d2a9b603e3327ed65f084e41f8"}, - {file = "yarl-1.20.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0a6a1e6ae21cdd84011c24c78d7a126425148b24d437b5702328e4ba640a8902"}, - {file = "yarl-1.20.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:86de313371ec04dd2531f30bc41a5a1a96f25a02823558ee0f2af0beaa7ca791"}, - {file = "yarl-1.20.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:dd59c9dd58ae16eaa0f48c3d0cbe6be8ab4dc7247c3ff7db678edecbaf59327f"}, - {file = "yarl-1.20.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a0bc5e05f457b7c1994cc29e83b58f540b76234ba6b9648a4971ddc7f6aa52da"}, - {file = "yarl-1.20.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:c9471ca18e6aeb0e03276b5e9b27b14a54c052d370a9c0c04a68cefbd1455eb4"}, - {file = "yarl-1.20.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:40ed574b4df723583a26c04b298b283ff171bcc387bc34c2683235e2487a65a5"}, - {file = "yarl-1.20.0-cp311-cp311-win32.whl", hash = "sha256:db243357c6c2bf3cd7e17080034ade668d54ce304d820c2a58514a4e51d0cfd6"}, - {file = "yarl-1.20.0-cp311-cp311-win_amd64.whl", hash = "sha256:8c12cd754d9dbd14204c328915e23b0c361b88f3cffd124129955e60a4fbfcfb"}, - {file = "yarl-1.20.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e06b9f6cdd772f9b665e5ba8161968e11e403774114420737f7884b5bd7bdf6f"}, - {file = "yarl-1.20.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b9ae2fbe54d859b3ade40290f60fe40e7f969d83d482e84d2c31b9bff03e359e"}, - {file = "yarl-1.20.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6d12b8945250d80c67688602c891237994d203d42427cb14e36d1a732eda480e"}, - {file = "yarl-1.20.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:087e9731884621b162a3e06dc0d2d626e1542a617f65ba7cc7aeab279d55ad33"}, - {file = "yarl-1.20.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:69df35468b66c1a6e6556248e6443ef0ec5f11a7a4428cf1f6281f1879220f58"}, - {file = "yarl-1.20.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b2992fe29002fd0d4cbaea9428b09af9b8686a9024c840b8a2b8f4ea4abc16f"}, - {file = "yarl-1.20.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4c903e0b42aab48abfbac668b5a9d7b6938e721a6341751331bcd7553de2dcae"}, - {file = "yarl-1.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf099e2432131093cc611623e0b0bcc399b8cddd9a91eded8bfb50402ec35018"}, - {file = "yarl-1.20.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a7f62f5dc70a6c763bec9ebf922be52aa22863d9496a9a30124d65b489ea672"}, - {file = "yarl-1.20.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:54ac15a8b60382b2bcefd9a289ee26dc0920cf59b05368c9b2b72450751c6eb8"}, - {file = "yarl-1.20.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:25b3bc0763a7aca16a0f1b5e8ef0f23829df11fb539a1b70476dcab28bd83da7"}, - {file = "yarl-1.20.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b2586e36dc070fc8fad6270f93242124df68b379c3a251af534030a4a33ef594"}, - {file = "yarl-1.20.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:866349da9d8c5290cfefb7fcc47721e94de3f315433613e01b435473be63daa6"}, - {file = "yarl-1.20.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:33bb660b390a0554d41f8ebec5cd4475502d84104b27e9b42f5321c5192bfcd1"}, - {file = "yarl-1.20.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:737e9f171e5a07031cbee5e9180f6ce21a6c599b9d4b2c24d35df20a52fabf4b"}, - {file = "yarl-1.20.0-cp312-cp312-win32.whl", hash = "sha256:839de4c574169b6598d47ad61534e6981979ca2c820ccb77bf70f4311dd2cc64"}, - {file = "yarl-1.20.0-cp312-cp312-win_amd64.whl", hash = "sha256:3d7dbbe44b443b0c4aa0971cb07dcb2c2060e4a9bf8d1301140a33a93c98e18c"}, - {file = "yarl-1.20.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2137810a20b933b1b1b7e5cf06a64c3ed3b4747b0e5d79c9447c00db0e2f752f"}, - {file = "yarl-1.20.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:447c5eadd750db8389804030d15f43d30435ed47af1313303ed82a62388176d3"}, - {file = "yarl-1.20.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:42fbe577272c203528d402eec8bf4b2d14fd49ecfec92272334270b850e9cd7d"}, - {file = "yarl-1.20.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:18e321617de4ab170226cd15006a565d0fa0d908f11f724a2c9142d6b2812ab0"}, - {file = "yarl-1.20.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4345f58719825bba29895011e8e3b545e6e00257abb984f9f27fe923afca2501"}, - {file = "yarl-1.20.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5d9b980d7234614bc4674468ab173ed77d678349c860c3af83b1fffb6a837ddc"}, - {file = "yarl-1.20.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:af4baa8a445977831cbaa91a9a84cc09debb10bc8391f128da2f7bd070fc351d"}, - {file = "yarl-1.20.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:123393db7420e71d6ce40d24885a9e65eb1edefc7a5228db2d62bcab3386a5c0"}, - {file = "yarl-1.20.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ab47acc9332f3de1b39e9b702d9c916af7f02656b2a86a474d9db4e53ef8fd7a"}, - {file = "yarl-1.20.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4a34c52ed158f89876cba9c600b2c964dfc1ca52ba7b3ab6deb722d1d8be6df2"}, - {file = "yarl-1.20.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:04d8cfb12714158abf2618f792c77bc5c3d8c5f37353e79509608be4f18705c9"}, - {file = "yarl-1.20.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:7dc63ad0d541c38b6ae2255aaa794434293964677d5c1ec5d0116b0e308031f5"}, - {file = "yarl-1.20.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f9d02b591a64e4e6ca18c5e3d925f11b559c763b950184a64cf47d74d7e41877"}, - {file = "yarl-1.20.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:95fc9876f917cac7f757df80a5dda9de59d423568460fe75d128c813b9af558e"}, - {file = "yarl-1.20.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:bb769ae5760cd1c6a712135ee7915f9d43f11d9ef769cb3f75a23e398a92d384"}, - {file = "yarl-1.20.0-cp313-cp313-win32.whl", hash = "sha256:70e0c580a0292c7414a1cead1e076c9786f685c1fc4757573d2967689b370e62"}, - {file = "yarl-1.20.0-cp313-cp313-win_amd64.whl", hash = "sha256:4c43030e4b0af775a85be1fa0433119b1565673266a70bf87ef68a9d5ba3174c"}, - {file = "yarl-1.20.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b6c4c3d0d6a0ae9b281e492b1465c72de433b782e6b5001c8e7249e085b69051"}, - {file = "yarl-1.20.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:8681700f4e4df891eafa4f69a439a6e7d480d64e52bf460918f58e443bd3da7d"}, - {file = "yarl-1.20.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:84aeb556cb06c00652dbf87c17838eb6d92cfd317799a8092cee0e570ee11229"}, - {file = "yarl-1.20.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f166eafa78810ddb383e930d62e623d288fb04ec566d1b4790099ae0f31485f1"}, - {file = "yarl-1.20.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:5d3d6d14754aefc7a458261027a562f024d4f6b8a798adb472277f675857b1eb"}, - {file = "yarl-1.20.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2a8f64df8ed5d04c51260dbae3cc82e5649834eebea9eadfd829837b8093eb00"}, - {file = "yarl-1.20.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4d9949eaf05b4d30e93e4034a7790634bbb41b8be2d07edd26754f2e38e491de"}, - {file = "yarl-1.20.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c366b254082d21cc4f08f522ac201d0d83a8b8447ab562732931d31d80eb2a5"}, - {file = "yarl-1.20.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:91bc450c80a2e9685b10e34e41aef3d44ddf99b3a498717938926d05ca493f6a"}, - {file = "yarl-1.20.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9c2aa4387de4bc3a5fe158080757748d16567119bef215bec643716b4fbf53f9"}, - {file = "yarl-1.20.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:d2cbca6760a541189cf87ee54ff891e1d9ea6406079c66341008f7ef6ab61145"}, - {file = "yarl-1.20.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:798a5074e656f06b9fad1a162be5a32da45237ce19d07884d0b67a0aa9d5fdda"}, - {file = "yarl-1.20.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:f106e75c454288472dbe615accef8248c686958c2e7dd3b8d8ee2669770d020f"}, - {file = "yarl-1.20.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:3b60a86551669c23dc5445010534d2c5d8a4e012163218fc9114e857c0586fdd"}, - {file = "yarl-1.20.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:3e429857e341d5e8e15806118e0294f8073ba9c4580637e59ab7b238afca836f"}, - {file = "yarl-1.20.0-cp313-cp313t-win32.whl", hash = "sha256:65a4053580fe88a63e8e4056b427224cd01edfb5f951498bfefca4052f0ce0ac"}, - {file = "yarl-1.20.0-cp313-cp313t-win_amd64.whl", hash = "sha256:53b2da3a6ca0a541c1ae799c349788d480e5144cac47dba0266c7cb6c76151fe"}, - {file = "yarl-1.20.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:119bca25e63a7725b0c9d20ac67ca6d98fa40e5a894bd5d4686010ff73397914"}, - {file = "yarl-1.20.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:35d20fb919546995f1d8c9e41f485febd266f60e55383090010f272aca93edcc"}, - {file = "yarl-1.20.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:484e7a08f72683c0f160270566b4395ea5412b4359772b98659921411d32ad26"}, - {file = "yarl-1.20.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8d8a3d54a090e0fff5837cd3cc305dd8a07d3435a088ddb1f65e33b322f66a94"}, - {file = "yarl-1.20.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:f0cf05ae2d3d87a8c9022f3885ac6dea2b751aefd66a4f200e408a61ae9b7f0d"}, - {file = "yarl-1.20.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a884b8974729e3899d9287df46f015ce53f7282d8d3340fa0ed57536b440621c"}, - {file = "yarl-1.20.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f8d8aa8dd89ffb9a831fedbcb27d00ffd9f4842107d52dc9d57e64cb34073d5c"}, - {file = "yarl-1.20.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b4e88d6c3c8672f45a30867817e4537df1bbc6f882a91581faf1f6d9f0f1b5a"}, - {file = "yarl-1.20.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bdb77efde644d6f1ad27be8a5d67c10b7f769804fff7a966ccb1da5a4de4b656"}, - {file = "yarl-1.20.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4ba5e59f14bfe8d261a654278a0f6364feef64a794bd456a8c9e823071e5061c"}, - {file = "yarl-1.20.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:d0bf955b96ea44ad914bc792c26a0edcd71b4668b93cbcd60f5b0aeaaed06c64"}, - {file = "yarl-1.20.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:27359776bc359ee6eaefe40cb19060238f31228799e43ebd3884e9c589e63b20"}, - {file = "yarl-1.20.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:04d9c7a1dc0a26efb33e1acb56c8849bd57a693b85f44774356c92d610369efa"}, - {file = "yarl-1.20.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:faa709b66ae0e24c8e5134033187a972d849d87ed0a12a0366bedcc6b5dc14a5"}, - {file = "yarl-1.20.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:44869ee8538208fe5d9342ed62c11cc6a7a1af1b3d0bb79bb795101b6e77f6e0"}, - {file = 
"yarl-1.20.0-cp39-cp39-win32.whl", hash = "sha256:b7fa0cb9fd27ffb1211cde944b41f5c67ab1c13a13ebafe470b1e206b8459da8"}, - {file = "yarl-1.20.0-cp39-cp39-win_amd64.whl", hash = "sha256:d4fad6e5189c847820288286732075f213eabf81be4d08d6cc309912e62be5b7"}, - {file = "yarl-1.20.0-py3-none-any.whl", hash = "sha256:5d0fe6af927a47a230f31e6004621fd0959eaa915fc62acfafa67ff7229a3124"}, - {file = "yarl-1.20.0.tar.gz", hash = "sha256:686d51e51ee5dfe62dec86e4866ee0e9ed66df700d55c828a615640adc885307"}, +groups = ["main"] +files = [ + {file = "yarl-1.22.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c7bd6683587567e5a49ee6e336e0612bec8329be1b7d4c8af5687dcdeb67ee1e"}, + {file = "yarl-1.22.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5cdac20da754f3a723cceea5b3448e1a2074866406adeb4ef35b469d089adb8f"}, + {file = "yarl-1.22.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:07a524d84df0c10f41e3ee918846e1974aba4ec017f990dc735aad487a0bdfdf"}, + {file = "yarl-1.22.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e1b329cb8146d7b736677a2440e422eadd775d1806a81db2d4cded80a48efc1a"}, + {file = "yarl-1.22.0-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:75976c6945d85dbb9ee6308cd7ff7b1fb9409380c82d6119bd778d8fcfe2931c"}, + {file = "yarl-1.22.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:80ddf7a5f8c86cb3eb4bc9028b07bbbf1f08a96c5c0bc1244be5e8fefcb94147"}, + {file = "yarl-1.22.0-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d332fc2e3c94dad927f2112395772a4e4fedbcf8f80efc21ed7cdfae4d574fdb"}, + {file = "yarl-1.22.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0cf71bf877efeac18b38d3930594c0948c82b64547c1cf420ba48722fe5509f6"}, + {file = "yarl-1.22.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:663e1cadaddae26be034a6ab6072449a8426ddb03d500f43daf952b74553bba0"}, + {file = "yarl-1.22.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:6dcbb0829c671f305be48a7227918cfcd11276c2d637a8033a99a02b67bf9eda"}, + {file = "yarl-1.22.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:f0d97c18dfd9a9af4490631905a3f131a8e4c9e80a39353919e2cfed8f00aedc"}, + {file = "yarl-1.22.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:437840083abe022c978470b942ff832c3940b2ad3734d424b7eaffcd07f76737"}, + {file = "yarl-1.22.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a899cbd98dce6f5d8de1aad31cb712ec0a530abc0a86bd6edaa47c1090138467"}, + {file = "yarl-1.22.0-cp310-cp310-win32.whl", hash = "sha256:595697f68bd1f0c1c159fcb97b661fc9c3f5db46498043555d04805430e79bea"}, + {file = "yarl-1.22.0-cp310-cp310-win_amd64.whl", hash = "sha256:cb95a9b1adaa48e41815a55ae740cfda005758104049a640a398120bf02515ca"}, + {file = "yarl-1.22.0-cp310-cp310-win_arm64.whl", hash = "sha256:b85b982afde6df99ecc996990d4ad7ccbdbb70e2a4ba4de0aecde5922ba98a0b"}, + {file = "yarl-1.22.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:1ab72135b1f2db3fed3997d7e7dc1b80573c67138023852b6efb336a5eae6511"}, + {file = "yarl-1.22.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:669930400e375570189492dc8d8341301578e8493aec04aebc20d4717f899dd6"}, + {file = "yarl-1.22.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:792a2af6d58177ef7c19cbf0097aba92ca1b9cb3ffdd9c7470e156c8f9b5e028"}, + {file = 
"yarl-1.22.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3ea66b1c11c9150f1372f69afb6b8116f2dd7286f38e14ea71a44eee9ec51b9d"}, + {file = "yarl-1.22.0-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3e2daa88dc91870215961e96a039ec73e4937da13cf77ce17f9cad0c18df3503"}, + {file = "yarl-1.22.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ba440ae430c00eee41509353628600212112cd5018d5def7e9b05ea7ac34eb65"}, + {file = "yarl-1.22.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:e6438cc8f23a9c1478633d216b16104a586b9761db62bfacb6425bac0a36679e"}, + {file = "yarl-1.22.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4c52a6e78aef5cf47a98ef8e934755abf53953379b7d53e68b15ff4420e6683d"}, + {file = "yarl-1.22.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3b06bcadaac49c70f4c88af4ffcfbe3dc155aab3163e75777818092478bcbbe7"}, + {file = "yarl-1.22.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:6944b2dc72c4d7f7052683487e3677456050ff77fcf5e6204e98caf785ad1967"}, + {file = "yarl-1.22.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:d5372ca1df0f91a86b047d1277c2aaf1edb32d78bbcefffc81b40ffd18f027ed"}, + {file = "yarl-1.22.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:51af598701f5299012b8416486b40fceef8c26fc87dc6d7d1f6fc30609ea0aa6"}, + {file = "yarl-1.22.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b266bd01fedeffeeac01a79ae181719ff848a5a13ce10075adbefc8f1daee70e"}, + {file = "yarl-1.22.0-cp311-cp311-win32.whl", hash = "sha256:a9b1ba5610a4e20f655258d5a1fdc7ebe3d837bb0e45b581398b99eb98b1f5ca"}, + {file = "yarl-1.22.0-cp311-cp311-win_amd64.whl", hash = "sha256:078278b9b0b11568937d9509b589ee83ef98ed6d561dfe2020e24a9fd08eaa2b"}, + {file = "yarl-1.22.0-cp311-cp311-win_arm64.whl", hash = "sha256:b6a6f620cfe13ccec221fa312139135166e47ae169f8253f72a0abc0dae94376"}, + {file = "yarl-1.22.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e340382d1afa5d32b892b3ff062436d592ec3d692aeea3bef3a5cfe11bbf8c6f"}, + {file = "yarl-1.22.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f1e09112a2c31ffe8d80be1b0988fa6a18c5d5cad92a9ffbb1c04c91bfe52ad2"}, + {file = "yarl-1.22.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:939fe60db294c786f6b7c2d2e121576628468f65453d86b0fe36cb52f987bd74"}, + {file = "yarl-1.22.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e1651bf8e0398574646744c1885a41198eba53dc8a9312b954073f845c90a8df"}, + {file = "yarl-1.22.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b8a0588521a26bf92a57a1705b77b8b59044cdceccac7151bd8d229e66b8dedb"}, + {file = "yarl-1.22.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:42188e6a615c1a75bcaa6e150c3fe8f3e8680471a6b10150c5f7e83f47cc34d2"}, + {file = "yarl-1.22.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f6d2cb59377d99718913ad9a151030d6f83ef420a2b8f521d94609ecc106ee82"}, + {file = "yarl-1.22.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:50678a3b71c751d58d7908edc96d332af328839eea883bb554a43f539101277a"}, + {file = "yarl-1.22.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:1e8fbaa7cec507aa24ea27a01456e8dd4b6fab829059b69844bd348f2d467124"}, + {file = "yarl-1.22.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:433885ab5431bc3d3d4f2f9bd15bfa1614c522b0f1405d62c4f926ccd69d04fa"}, + {file = "yarl-1.22.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:b790b39c7e9a4192dc2e201a282109ed2985a1ddbd5ac08dc56d0e121400a8f7"}, + {file = "yarl-1.22.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:31f0b53913220599446872d757257be5898019c85e7971599065bc55065dc99d"}, + {file = "yarl-1.22.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a49370e8f711daec68d09b821a34e1167792ee2d24d405cbc2387be4f158b520"}, + {file = "yarl-1.22.0-cp312-cp312-win32.whl", hash = "sha256:70dfd4f241c04bd9239d53b17f11e6ab672b9f1420364af63e8531198e3f5fe8"}, + {file = "yarl-1.22.0-cp312-cp312-win_amd64.whl", hash = "sha256:8884d8b332a5e9b88e23f60bb166890009429391864c685e17bd73a9eda9105c"}, + {file = "yarl-1.22.0-cp312-cp312-win_arm64.whl", hash = "sha256:ea70f61a47f3cc93bdf8b2f368ed359ef02a01ca6393916bc8ff877427181e74"}, + {file = "yarl-1.22.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8dee9c25c74997f6a750cd317b8ca63545169c098faee42c84aa5e506c819b53"}, + {file = "yarl-1.22.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:01e73b85a5434f89fc4fe27dcda2aff08ddf35e4d47bbbea3bdcd25321af538a"}, + {file = "yarl-1.22.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:22965c2af250d20c873cdbee8ff958fb809940aeb2e74ba5f20aaf6b7ac8c70c"}, + {file = "yarl-1.22.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b4f15793aa49793ec8d1c708ab7f9eded1aa72edc5174cae703651555ed1b601"}, + {file = "yarl-1.22.0-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e5542339dcf2747135c5c85f68680353d5cb9ffd741c0f2e8d832d054d41f35a"}, + {file = "yarl-1.22.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:5c401e05ad47a75869c3ab3e35137f8468b846770587e70d71e11de797d113df"}, + {file = "yarl-1.22.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:243dda95d901c733f5b59214d28b0120893d91777cb8aa043e6ef059d3cddfe2"}, + {file = "yarl-1.22.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bec03d0d388060058f5d291a813f21c011041938a441c593374da6077fe21b1b"}, + {file = "yarl-1.22.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b0748275abb8c1e1e09301ee3cf90c8a99678a4e92e4373705f2a2570d581273"}, + {file = "yarl-1.22.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:47fdb18187e2a4e18fda2c25c05d8251a9e4a521edaed757fef033e7d8498d9a"}, + {file = "yarl-1.22.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:c7044802eec4524fde550afc28edda0dd5784c4c45f0be151a2d3ba017daca7d"}, + {file = "yarl-1.22.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:139718f35149ff544caba20fce6e8a2f71f1e39b92c700d8438a0b1d2a631a02"}, + {file = "yarl-1.22.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e1b51bebd221006d3d2f95fbe124b22b247136647ae5dcc8c7acafba66e5ee67"}, + {file = "yarl-1.22.0-cp313-cp313-win32.whl", hash = "sha256:d3e32536234a95f513bd374e93d717cf6b2231a791758de6c509e3653f234c95"}, + {file = "yarl-1.22.0-cp313-cp313-win_amd64.whl", hash = "sha256:47743b82b76d89a1d20b83e60d5c20314cbd5ba2befc9cda8f28300c4a08ed4d"}, + {file = "yarl-1.22.0-cp313-cp313-win_arm64.whl", hash = 
"sha256:5d0fcda9608875f7d052eff120c7a5da474a6796fe4d83e152e0e4d42f6d1a9b"}, + {file = "yarl-1.22.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:719ae08b6972befcba4310e49edb1161a88cdd331e3a694b84466bd938a6ab10"}, + {file = "yarl-1.22.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:47d8a5c446df1c4db9d21b49619ffdba90e77c89ec6e283f453856c74b50b9e3"}, + {file = "yarl-1.22.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:cfebc0ac8333520d2d0423cbbe43ae43c8838862ddb898f5ca68565e395516e9"}, + {file = "yarl-1.22.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4398557cbf484207df000309235979c79c4356518fd5c99158c7d38203c4da4f"}, + {file = "yarl-1.22.0-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:2ca6fd72a8cd803be290d42f2dec5cdcd5299eeb93c2d929bf060ad9efaf5de0"}, + {file = "yarl-1.22.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ca1f59c4e1ab6e72f0a23c13fca5430f889634166be85dbf1013683e49e3278e"}, + {file = "yarl-1.22.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6c5010a52015e7c70f86eb967db0f37f3c8bd503a695a49f8d45700144667708"}, + {file = "yarl-1.22.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9d7672ecf7557476642c88497c2f8d8542f8e36596e928e9bcba0e42e1e7d71f"}, + {file = "yarl-1.22.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:3b7c88eeef021579d600e50363e0b6ee4f7f6f728cd3486b9d0f3ee7b946398d"}, + {file = "yarl-1.22.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:f4afb5c34f2c6fecdcc182dfcfc6af6cccf1aa923eed4d6a12e9d96904e1a0d8"}, + {file = "yarl-1.22.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:59c189e3e99a59cf8d83cbb31d4db02d66cda5a1a4374e8a012b51255341abf5"}, + {file = "yarl-1.22.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:5a3bf7f62a289fa90f1990422dc8dff5a458469ea71d1624585ec3a4c8d6960f"}, + {file = "yarl-1.22.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:de6b9a04c606978fdfe72666fa216ffcf2d1a9f6a381058d4378f8d7b1e5de62"}, + {file = "yarl-1.22.0-cp313-cp313t-win32.whl", hash = "sha256:1834bb90991cc2999f10f97f5f01317f99b143284766d197e43cd5b45eb18d03"}, + {file = "yarl-1.22.0-cp313-cp313t-win_amd64.whl", hash = "sha256:ff86011bd159a9d2dfc89c34cfd8aff12875980e3bd6a39ff097887520e60249"}, + {file = "yarl-1.22.0-cp313-cp313t-win_arm64.whl", hash = "sha256:7861058d0582b847bc4e3a4a4c46828a410bca738673f35a29ba3ca5db0b473b"}, + {file = "yarl-1.22.0-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:34b36c2c57124530884d89d50ed2c1478697ad7473efd59cfd479945c95650e4"}, + {file = "yarl-1.22.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:0dd9a702591ca2e543631c2a017e4a547e38a5c0f29eece37d9097e04a7ac683"}, + {file = "yarl-1.22.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:594fcab1032e2d2cc3321bb2e51271e7cd2b516c7d9aee780ece81b07ff8244b"}, + {file = "yarl-1.22.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f3d7a87a78d46a2e3d5b72587ac14b4c16952dd0887dbb051451eceac774411e"}, + {file = "yarl-1.22.0-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:852863707010316c973162e703bddabec35e8757e67fcb8ad58829de1ebc8590"}, + {file = "yarl-1.22.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = 
"sha256:131a085a53bfe839a477c0845acf21efc77457ba2bcf5899618136d64f3303a2"}, + {file = "yarl-1.22.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:078a8aefd263f4d4f923a9677b942b445a2be970ca24548a8102689a3a8ab8da"}, + {file = "yarl-1.22.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bca03b91c323036913993ff5c738d0842fc9c60c4648e5c8d98331526df89784"}, + {file = "yarl-1.22.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:68986a61557d37bb90d3051a45b91fa3d5c516d177dfc6dd6f2f436a07ff2b6b"}, + {file = "yarl-1.22.0-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:4792b262d585ff0dff6bcb787f8492e40698443ec982a3568c2096433660c694"}, + {file = "yarl-1.22.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:ebd4549b108d732dba1d4ace67614b9545b21ece30937a63a65dd34efa19732d"}, + {file = "yarl-1.22.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:f87ac53513d22240c7d59203f25cc3beac1e574c6cd681bbfd321987b69f95fd"}, + {file = "yarl-1.22.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:22b029f2881599e2f1b06f8f1db2ee63bd309e2293ba2d566e008ba12778b8da"}, + {file = "yarl-1.22.0-cp314-cp314-win32.whl", hash = "sha256:6a635ea45ba4ea8238463b4f7d0e721bad669f80878b7bfd1f89266e2ae63da2"}, + {file = "yarl-1.22.0-cp314-cp314-win_amd64.whl", hash = "sha256:0d6e6885777af0f110b0e5d7e5dda8b704efed3894da26220b7f3d887b839a79"}, + {file = "yarl-1.22.0-cp314-cp314-win_arm64.whl", hash = "sha256:8218f4e98d3c10d683584cb40f0424f4b9fd6e95610232dd75e13743b070ee33"}, + {file = "yarl-1.22.0-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:45c2842ff0e0d1b35a6bf1cd6c690939dacb617a70827f715232b2e0494d55d1"}, + {file = "yarl-1.22.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:d947071e6ebcf2e2bee8fce76e10faca8f7a14808ca36a910263acaacef08eca"}, + {file = "yarl-1.22.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:334b8721303e61b00019474cc103bdac3d7b1f65e91f0bfedeec2d56dfe74b53"}, + {file = "yarl-1.22.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1e7ce67c34138a058fd092f67d07a72b8e31ff0c9236e751957465a24b28910c"}, + {file = "yarl-1.22.0-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:d77e1b2c6d04711478cb1c4ab90db07f1609ccf06a287d5607fcd90dc9863acf"}, + {file = "yarl-1.22.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c4647674b6150d2cae088fc07de2738a84b8bcedebef29802cf0b0a82ab6face"}, + {file = "yarl-1.22.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:efb07073be061c8f79d03d04139a80ba33cbd390ca8f0297aae9cce6411e4c6b"}, + {file = "yarl-1.22.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e51ac5435758ba97ad69617e13233da53908beccc6cfcd6c34bbed8dcbede486"}, + {file = "yarl-1.22.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:33e32a0dd0c8205efa8e83d04fc9f19313772b78522d1bdc7d9aed706bfd6138"}, + {file = "yarl-1.22.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:bf4a21e58b9cde0e401e683ebd00f6ed30a06d14e93f7c8fd059f8b6e8f87b6a"}, + {file = "yarl-1.22.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:e4b582bab49ac33c8deb97e058cd67c2c50dac0dd134874106d9c774fd272529"}, + {file = "yarl-1.22.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:0b5bcc1a9c4839e7e30b7b30dd47fe5e7e44fb7054ec29b5bb8d526aa1041093"}, + 
{file = "yarl-1.22.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:c0232bce2170103ec23c454e54a57008a9a72b5d1c3105dc2496750da8cfa47c"}, + {file = "yarl-1.22.0-cp314-cp314t-win32.whl", hash = "sha256:8009b3173bcd637be650922ac455946197d858b3630b6d8787aa9e5c4564533e"}, + {file = "yarl-1.22.0-cp314-cp314t-win_amd64.whl", hash = "sha256:9fb17ea16e972c63d25d4a97f016d235c78dd2344820eb35bc034bc32012ee27"}, + {file = "yarl-1.22.0-cp314-cp314t-win_arm64.whl", hash = "sha256:9f6d73c1436b934e3f01df1e1b21ff765cd1d28c77dfb9ace207f746d4610ee1"}, + {file = "yarl-1.22.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:3aa27acb6de7a23785d81557577491f6c38a5209a254d1191519d07d8fe51748"}, + {file = "yarl-1.22.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:af74f05666a5e531289cb1cc9c883d1de2088b8e5b4de48004e5ca8a830ac859"}, + {file = "yarl-1.22.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:62441e55958977b8167b2709c164c91a6363e25da322d87ae6dd9c6019ceecf9"}, + {file = "yarl-1.22.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b580e71cac3f8113d3135888770903eaf2f507e9421e5697d6ee6d8cd1c7f054"}, + {file = "yarl-1.22.0-cp39-cp39-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e81fda2fb4a07eda1a2252b216aa0df23ebcd4d584894e9612e80999a78fd95b"}, + {file = "yarl-1.22.0-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:99b6fc1d55782461b78221e95fc357b47ad98b041e8e20f47c1411d0aacddc60"}, + {file = "yarl-1.22.0-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:088e4e08f033db4be2ccd1f34cf29fe994772fb54cfe004bbf54db320af56890"}, + {file = "yarl-1.22.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2e4e1f6f0b4da23e61188676e3ed027ef0baa833a2e633c29ff8530800edccba"}, + {file = "yarl-1.22.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:84fc3ec96fce86ce5aa305eb4aa9358279d1aa644b71fab7b8ed33fe3ba1a7ca"}, + {file = "yarl-1.22.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:5dbeefd6ca588b33576a01b0ad58aa934bc1b41ef89dee505bf2932b22ddffba"}, + {file = "yarl-1.22.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:14291620375b1060613f4aab9ebf21850058b6b1b438f386cc814813d901c60b"}, + {file = "yarl-1.22.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:a4fcfc8eb2c34148c118dfa02e6427ca278bfd0f3df7c5f99e33d2c0e81eae3e"}, + {file = "yarl-1.22.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:029866bde8d7b0878b9c160e72305bbf0a7342bcd20b9999381704ae03308dc8"}, + {file = "yarl-1.22.0-cp39-cp39-win32.whl", hash = "sha256:4dcc74149ccc8bba31ce1944acee24813e93cfdee2acda3c172df844948ddf7b"}, + {file = "yarl-1.22.0-cp39-cp39-win_amd64.whl", hash = "sha256:10619d9fdee46d20edc49d3479e2f8269d0779f1b031e6f7c2aa1c76be04b7ed"}, + {file = "yarl-1.22.0-cp39-cp39-win_arm64.whl", hash = "sha256:dd7afd3f8b0bfb4e0d9fc3c31bfe8a4ec7debe124cfd90619305def3c8ca8cd2"}, + {file = "yarl-1.22.0-py3-none-any.whl", hash = "sha256:1380560bdba02b6b6c90de54133c81c9f2a453dee9912fe58c1dcced1edb7cff"}, + {file = "yarl-1.22.0.tar.gz", hash = "sha256:bebf8557577d4401ba8bd9ff33906f1376c877aa78d1fe216ad01b4d6745af71"}, ] [package.dependencies] @@ -6721,21 +8198,22 @@ propcache = ">=0.2.1" [[package]] name = "zipp" -version = "3.22.0" +version = "3.23.0" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ - {file = 
"zipp-3.22.0-py3-none-any.whl", hash = "sha256:fe208f65f2aca48b81f9e6fd8cf7b8b32c26375266b009b413d45306b6148343"}, - {file = "zipp-3.22.0.tar.gz", hash = "sha256:dd2f28c3ce4bc67507bfd3781d21b7bb2be31103b51a4553ad7d90b84e57ace5"}, + {file = "zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e"}, + {file = "zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166"}, ] [package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] enabler = ["pytest-enabler (>=2.2)"] -test = ["big-O", "importlib_resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more_itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] +test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more_itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] type = ["pytest-mypy"] [extras] @@ -6744,6 +8222,6 @@ chatui = ["fastapi", "uvicorn", "websockets"] transformers = ["accelerate", "safetensors", "tokenizers", "transformers"] [metadata] -lock-version = "2.0" +lock-version = "2.1" python-versions = ">=3.11,<3.13" -content-hash = "46f431b72bf1852e5bd0f17a3f9b65b47a2c53b0cfdc900eceb7323683f5a282" +content-hash = "b2f8d7f28d8b81a7e7dd2128b7264a4d91d3fa1ddef9d33b0ffd54e52dbae0f2" diff --git a/pyproject.toml b/pyproject.toml index 47974ed1..9d919e23 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ plexe = "plexe.main:main" [tool.poetry.dependencies] python = ">=3.11,<3.13" +python-dotenv = "^1.2.0" pandas = ">=1.5.0, <=2.2.0" imbalanced-learn = "^0.12.4" pydantic = "^2.9.2" @@ -51,14 +52,38 @@ mlflow = "^3.0.0rc2" ray = "^2.9.0" rich = "^13.7.1" torch = "^2.3.0" -smolagents = "^1.14.0" deprecated = "^1.2.18" +python-multipart = "^0.0.16" +psycopg2-binary = "^2.9.9" +featuretools = "^1.31.0" +sqlalchemy = "^2.0.0" +torch-geometric = "^2.5.0" +pytorch-frame = "^0.3.0" +pooch = "^1.8.0" +duckdb = "^1.3.0" +mcp = { version = "^1.25.0", extras = ["all"] } +scholarly = "^1.7.11" +kaggle = "^1.6.17" + + +# LangGraph and LangChain dependencies +langgraph = "^0.2.0" +langchain = "^0.3.0" +langchain-core = "^0.3.0" +langchain-openai = "^0.2.0" +langchain-anthropic = "^0.2.0" +langchain-google-genai = "^2.0.0" + +# External API clients +requests = "^2.32.0" +openml = "^0.14.0" # Deep learning dependencies transformers = { version = "^4.50.0", optional = true } tokenizers = { version = "^0.21.1", optional = true } accelerate = { version = "0.24.1", optional = true } safetensors = { version = "^0.4.1", optional = true } +sentence-transformers = { version = "^3.0.0", optional = true } # Local web serving dependencies for chat UI fastapi = { version = "^0.115.12", optional = true } @@ -66,13 +91,14 @@ uvicorn = {extras = ["standard"], version = "^0.34.2", optional = true } websockets = { version = "^12.0", optional = true } [tool.poetry.extras] -transformers = ["transformers", "tokenizers", "accelerate", "safetensors"] +transformers = ["transformers", "tokenizers", "accelerate", "safetensors", "sentence-transformers"] chatui = ["fastapi", "uvicorn", "websockets"] all = [ "transformers", "tokenizers", "accelerate", "safetensors", + "sentence-transformers", "fastapi", "uvicorn", "websockets" diff --git a/resources/backed-by-yc.png 
b/resources/backed-by-yc.png deleted file mode 100644 index 4199597e..00000000 Binary files a/resources/backed-by-yc.png and /dev/null differ diff --git a/resources/demo-thumbnail.png b/resources/demo-thumbnail.png deleted file mode 100644 index e139ca63..00000000 Binary files a/resources/demo-thumbnail.png and /dev/null differ diff --git a/scripts/.python-version b/scripts/.python-version new file mode 100644 index 00000000..c8cfe395 --- /dev/null +++ b/scripts/.python-version @@ -0,0 +1 @@ +3.10 diff --git a/plexe/core/entities/__init__.py b/scripts/README.md similarity index 100% rename from plexe/core/entities/__init__.py rename to scripts/README.md diff --git a/scripts/download_relbench_csv.py b/scripts/download_relbench_csv.py new file mode 100644 index 00000000..0b9e9119 --- /dev/null +++ b/scripts/download_relbench_csv.py @@ -0,0 +1,28 @@ +import os +import pandas as pd +from relbench.datasets import get_dataset + +# Configure paths to match your bash import script +OUTPUT_DIR = "./data/database" +DATASET_NAME = "rel-f1"  # e.g. rel-stack, rel-amazon, rel-arxiv + +def export_to_csv(dataset_name, output_dir): +    os.makedirs(output_dir, exist_ok=True) +    print(f"Downloading and processing {dataset_name}...") + +    # Download the dataset (it is cached automatically) +    dataset = get_dataset(name=dataset_name, download=True) +    db = dataset.make_db() + +    print(f"Exporting tables to {output_dir}...") +    for table_name, table in db.table_dict.items(): +        df = table.df + +        # Note: RelBench may contain complex object columns; convert them to strings if needed +        # Postgres COPY requires well-formed CSV +        file_path = os.path.join(output_dir, f"{table_name}.csv") +        df.to_csv(file_path, index=False, header=True) +        print(f"-> Saved {table_name}.csv ({len(df)} rows)") + +if __name__ == "__main__": +    export_to_csv(DATASET_NAME, OUTPUT_DIR) \ No newline at end of file diff --git a/scripts/dumpcode.py b/scripts/dumpcode.py deleted file mode 100644 index f2d8bc3a..00000000 --- a/scripts/dumpcode.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -This script collects all code files from the project directory and writes it to a single output file. - -The purpose of this script is to enable easily passing the entire codebase as context to a language model -with large context window, such as the Google Gemini models.
-""" - -from pathlib import Path - -# === Config === -EXTENSIONS = {".py", ".md", ".jinja", ".yaml"} -TARGET_DIRS = {"plexe"} -ROOT_FILES = {"README.md"} # Loose files to include from root -OUTPUT_FILE = "plexe-full-codebase.txt" - - -def collect_files(base: Path): - files = [] - for target in TARGET_DIRS: - target_path = base / target - if target_path.is_dir(): - files.extend(f for f in target_path.rglob("*") if f.suffix in EXTENSIONS and f.is_file()) - # Include specified root files if they exist and match extension filter - files.extend(base / f for f in ROOT_FILES if (base / f).is_file() and (base / f).suffix in EXTENSIONS) - return files - - -def format_entry(rel_path: Path, content: str) -> str: - return f"## {rel_path}\n```\n{content}```\n\n\n" - - -def main(): - base = Path.cwd() - files = collect_files(base) - - with open(OUTPUT_FILE, "w", encoding="utf-8") as out: - out.write(f"# Full Codebase for {Path.cwd().name}\n\n") - for file in files: - rel_path = file.relative_to(base) - content = file.read_text(encoding="utf-8") - out.write(format_entry(rel_path, content)) - - -if __name__ == "__main__": - main() diff --git a/scripts/generate_relbench_sql.py b/scripts/generate_relbench_sql.py new file mode 100644 index 00000000..ddc3740c --- /dev/null +++ b/scripts/generate_relbench_sql.py @@ -0,0 +1,378 @@ +#!/usr/bin/env python3 +""" +Generate SQL DDL and export CSV from RelBench datasets +Support any RelBench dataset: rel-f1, rel-amazon, rel-hm, rel-stack, etc. +""" + +import os +import sys +import csv +import argparse +import pandas as pd +from relbench.datasets import get_dataset +from pathlib import Path + + +def pandas_dtype_to_sql(dtype, col_name): + """Convert pandas dtype to PostgreSQL type""" + if pd.api.types.is_integer_dtype(dtype): + return "INTEGER" + elif pd.api.types.is_float_dtype(dtype): + return "FLOAT" + elif pd.api.types.is_bool_dtype(dtype): + return "BOOLEAN" + elif pd.api.types.is_datetime64_any_dtype(dtype): + return "TIMESTAMP" + else: + return "TEXT" + + +def get_primary_key_column(table): + """Get primary key column from table""" + if hasattr(table, 'pkey_col') and table.pkey_col: + return table.pkey_col + return None + + +def generate_create_table_sql(table_name, table): + """Generate SQL CREATE TABLE statement for PostgreSQL""" + sql = f"CREATE TABLE {table_name} (\n" + + columns = [] + for col_name in table.df.columns: + dtype = table.df[col_name].dtype + sql_type = pandas_dtype_to_sql(dtype, col_name) + col_def = f" {col_name} {sql_type}" + columns.append(col_def) + + sql += ",\n".join(columns) + + pkey = get_primary_key_column(table) + if pkey: + sql += f",\n PRIMARY KEY ({pkey})" + + sql += "\n);" + return sql + + +def generate_temp_table_sql(table_name, table): + """Generate temporary table with all columns as TEXT""" + sql = f"CREATE TEMP TABLE temp_{table_name} (\n" + columns = [f" {col} TEXT" for col in table.df.columns] + sql += ",\n".join(columns) + sql += "\n);" + return sql + + +def generate_insert_sql(table_name, table): + """Generate INSERT statement with type conversion""" + columns = [] + conversions = [] + + for col_name in table.df.columns: + dtype = table.df[col_name].dtype + columns.append(col_name) + + if pd.api.types.is_integer_dtype(dtype): + conversions.append(f" NULLIF({col_name}, '')::INTEGER") + elif pd.api.types.is_float_dtype(dtype): + conversions.append(f" NULLIF({col_name}, '')::FLOAT") + elif pd.api.types.is_bool_dtype(dtype): + conversions.append(f" NULLIF({col_name}, '')::BOOLEAN") + elif 
pd.api.types.is_datetime64_any_dtype(dtype): + conversions.append(f" NULLIF({col_name}, '')::TIMESTAMP") + else: + conversions.append(f" NULLIF({col_name}, '')") + + sql = f"INSERT INTO {table_name}\nSELECT \n" + sql += ",\n".join(conversions) + sql += f"\nFROM temp_{table_name};" + return sql + + +def generate_foreign_keys_sql(db): + """Generate ALTER TABLE statements for foreign keys""" + fk_statements = [] + + for table_name, table in db.table_dict.items(): + if hasattr(table, 'fkey_col_to_pkey_table'): + for fkey_col, ref_table in table.fkey_col_to_pkey_table.items(): + if ref_table in db.table_dict: + ref_pkey = get_primary_key_column(db.table_dict[ref_table]) + if ref_pkey: + fk_name = f"fk_{table_name}_{fkey_col}" + stmt = f"ALTER TABLE {table_name} ADD CONSTRAINT {fk_name}\n" + stmt += f" FOREIGN KEY ({fkey_col}) REFERENCES {ref_table}({ref_pkey});" + fk_statements.append(stmt) + + return fk_statements + + +def generate_indexes_sql(db): + """Generate indexes for foreign keys and important columns""" + index_statements = [] + + for table_name, table in db.table_dict.items(): + if hasattr(table, 'fkey_col_to_pkey_table'): + for fkey_col in table.fkey_col_to_pkey_table.keys(): + idx_name = f"idx_{table_name}_{fkey_col}" + stmt = f"CREATE INDEX {idx_name} ON {table_name}({fkey_col}) WHERE {fkey_col} IS NOT NULL;" + index_statements.append(stmt) + + if hasattr(table, 'time_col') and table.time_col: + idx_name = f"idx_{table_name}_{table.time_col}" + stmt = f"CREATE INDEX {idx_name} ON {table_name}({table.time_col});" + index_statements.append(stmt) + + return index_statements + + +def generate_complete_sql(db, dataset_name): + """Generate complete SQL import script""" + sql_parts = [] + + sql_parts.append(f"-- RelBench {dataset_name.upper()} Database Schema") + sql_parts.append("-- Auto-generated from RelBench dataset") + sql_parts.append(f"-- Total tables: {len(db.table_dict)}") + sql_parts.append("") + + sql_parts.append("\\echo '========================================='") + sql_parts.append("\\echo 'Step 1: Drop existing tables'") + sql_parts.append("\\echo '========================================='") + sql_parts.append("") + + for table_name in reversed(list(db.table_dict.keys())): + sql_parts.append(f"DROP TABLE IF EXISTS {table_name} CASCADE;") + sql_parts.append("") + sql_parts.append("\\echo 'Tables dropped'") + sql_parts.append("\\echo ''") + sql_parts.append("") + + sql_parts.append("\\echo '========================================='") + sql_parts.append("\\echo 'Step 2: Create tables'") + sql_parts.append("\\echo '========================================='") + sql_parts.append("") + + for table_name, table in db.table_dict.items(): + sql_parts.append(generate_create_table_sql(table_name, table)) + sql_parts.append("") + + sql_parts.append("\\echo 'Tables created'") + sql_parts.append("\\echo ''") + sql_parts.append("") + + sql_parts.append("\\echo '========================================='") + sql_parts.append("\\echo 'Step 3: Create temp tables for import'") + sql_parts.append("\\echo '========================================='") + sql_parts.append("") + + for table_name, table in db.table_dict.items(): + sql_parts.append(generate_temp_table_sql(table_name, table)) + sql_parts.append("") + + sql_parts.append("\\echo 'Temp tables created'") + sql_parts.append("\\echo ''") + sql_parts.append("") + + sql_parts.append("\\echo '========================================='") + sql_parts.append("\\echo 'Step 4: Import CSV into temp tables'") + sql_parts.append("\\echo 
'========================================='") + sql_parts.append("") + + for table_name in db.table_dict.keys(): + sql_parts.append(f"\\echo ' Importing {table_name}...'") + sql_parts.append(f"\\copy temp_{table_name} FROM '/tmp/{table_name}.csv' WITH (FORMAT CSV, HEADER, DELIMITER ',', QUOTE '\"');") + sql_parts.append("") + + sql_parts.append("\\echo 'CSV imported to temp tables'") + sql_parts.append("\\echo ''") + sql_parts.append("") + + sql_parts.append("\\echo '========================================='") + sql_parts.append("\\echo 'Step 5: Transfer data with type conversion'") + sql_parts.append("\\echo '========================================='") + sql_parts.append("") + + for table_name, table in db.table_dict.items(): + sql_parts.append(f"\\echo ' Processing {table_name}...'") + sql_parts.append(generate_insert_sql(table_name, table)) + sql_parts.append("") + + sql_parts.append("\\echo 'Data transferred with NULL handling'") + sql_parts.append("\\echo ''") + sql_parts.append("") + + sql_parts.append("\\echo '========================================='") + sql_parts.append("\\echo 'Step 6: Add Foreign Keys'") + sql_parts.append("\\echo '========================================='") + sql_parts.append("") + + fk_statements = generate_foreign_keys_sql(db) + for stmt in fk_statements: + sql_parts.append(stmt) + sql_parts.append("") + + sql_parts.append("\\echo 'Foreign keys added'") + sql_parts.append("\\echo ''") + sql_parts.append("") + + sql_parts.append("\\echo '========================================='") + sql_parts.append("\\echo 'Step 7: Create Indexes'") + sql_parts.append("\\echo '========================================='") + sql_parts.append("") + + index_statements = generate_indexes_sql(db) + for stmt in index_statements: + sql_parts.append(stmt) + sql_parts.append("") + + sql_parts.append("\\echo 'Indexes created'") + sql_parts.append("\\echo ''") + sql_parts.append("") + + sql_parts.append("\\echo '========================================='") + sql_parts.append("\\echo 'Summary'") + sql_parts.append("\\echo '========================================='") + sql_parts.append("") + sql_parts.append("SELECT ") + sql_parts.append(" table_name,") + sql_parts.append(" to_char(record_count, 'FM999,999,999') as records") + sql_parts.append("FROM (") + + union_parts = [] + for table_name in db.table_dict.keys(): + union_parts.append(f" SELECT '{table_name}' as table_name, COUNT(*) as record_count FROM {table_name}") + + sql_parts.append("\n UNION ALL\n".join(union_parts)) + sql_parts.append(") t") + sql_parts.append("ORDER BY record_count DESC;") + sql_parts.append("") + + sql_parts.append("\\echo ''") + sql_parts.append("\\echo '========================================='") + sql_parts.append("\\echo 'IMPORT COMPLETE'") + sql_parts.append("\\echo '========================================='") + + return "\n".join(sql_parts) + + +def main(): + parser = argparse.ArgumentParser( + description='Generate SQL DDL and export CSV from RelBench datasets', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=''' +Examples: + %(prog)s rel-f1 + %(prog)s rel-amazon --output-dir ./amazon_data + %(prog)s rel-hm --no-download + +Supported datasets: + rel-f1, rel-amazon, rel-hm, rel-stack, rel-trial, + rel-event, rel-avito, rel-salt, rel-arxiv, rel-ratebeer + ''' + ) + + parser.add_argument('dataset', type=str, + help='Dataset name (e.g., rel-f1, rel-amazon)') + parser.add_argument('--output-dir', type=str, default=None, + help='Output directory for CSV and SQL files (default: 
./{dataset}_data)') + parser.add_argument('--no-download', action='store_true', + help='Skip download if dataset already exists in cache') + + args = parser.parse_args() + + dataset_name = args.dataset + + if args.output_dir: + output_dir = Path(args.output_dir) + else: + dataset_short = dataset_name.replace('rel-', '') + output_dir = Path(f"./{dataset_short}_data") + + print("=" * 60) + print(f"RelBench {dataset_name.upper()} - SQL Generation") + print("=" * 60) + print() + + print(f"Downloading {dataset_name} dataset...") + try: + dataset = get_dataset(dataset_name, download=not args.no_download) + db = dataset.get_db() + except Exception as e: + print(f"Error loading dataset: {e}") + print(f"\nMake sure '{dataset_name}' is a valid RelBench dataset.") + sys.exit(1) + + print(f"Dataset loaded: {len(db.table_dict)} tables") + print() + + output_dir.mkdir(exist_ok=True) + + print("Exporting CSV files...") + for table_name, table in db.table_dict.items(): + csv_path = output_dir / f"{table_name}.csv" + # Convert any list/array columns to strings and clean problematic characters + df_export = table.df.copy() + for col in df_export.columns: + if df_export[col].dtype == 'object': + # Convert all object types to clean strings + def clean_value(x): + import numpy as np + # Handle None + if x is None: + return '' + # Handle numpy arrays and lists first (before pd.isna which fails on arrays) + if isinstance(x, (list, dict, tuple, np.ndarray)): + return str(x) + # Now safe to check isna for scalar values + try: + if pd.isna(x): + return '' + except (ValueError, TypeError): + pass + # Convert to string and remove problematic characters + s = str(x) + # Replace newlines and carriage returns with space + s = s.replace('\n', ' ').replace('\r', ' ') + return s + df_export[col] = df_export[col].apply(clean_value) + # Use QUOTE_MINIMAL with escapechar to handle special characters properly + df_export.to_csv(csv_path, index=False, quoting=csv.QUOTE_MINIMAL, doublequote=True) + print(f" {table_name}.csv ({len(table.df):,} rows)") + print() + + print("Generating SQL DDL...") + sql_content = generate_complete_sql(db, dataset_name) + + dataset_short = dataset_name.replace('rel-', '') + sql_file = output_dir / f"import_{dataset_short}.sql" + sql_file.write_text(sql_content) + + print(f"SQL script generated: {sql_file}") + print() + + print("Dataset Statistics:") + print(f" Tables: {len(db.table_dict)}") + total_rows = 0 + for table_name, table in db.table_dict.items(): + pkey = get_primary_key_column(table) + fkeys = len(table.fkey_col_to_pkey_table) if hasattr(table, 'fkey_col_to_pkey_table') else 0 + rows = len(table.df) + total_rows += rows + print(f" - {table_name:30s} {rows:10,} rows, {len(table.df.columns):2} cols, PK: {pkey}, FKs: {fkeys}") + print(f"\n Total rows: {total_rows:,}") + print() + + print("=" * 60) + print("Generation complete") + print("=" * 60) + print() + print("Output directory:", output_dir) + print("SQL script:", sql_file) + print() + print("Next steps:") + print(f" ./import_relbench.sh {dataset_name}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/import_relbench.sh b/scripts/import_relbench.sh new file mode 100644 index 00000000..c2fc2a66 --- /dev/null +++ b/scripts/import_relbench.sh @@ -0,0 +1,1249 @@ +#!/bin/bash +set -e + +CONTAINER_NAME="plexe-clone-postgres-1" +DB_USER="mlflow" +DB_PASSWORD="mlflow" # Thêm password + +# Thiết lập file log +LOG_DIR="./logs" +mkdir -p "$LOG_DIR" +LOG_FILE="$LOG_DIR/import_$(date +%Y%m%d_%H%M%S).log" + +# 
Cấu hình sample size +SAMPLE_SIZE=5000 # Mục tiêu số bản ghi tổng +MAX_SEED_RECORDS=5000 # Số bản ghi seed ban đầu từ bảng chính + +# Hàm log - ghi cả ra console và file +log() { + local message="[$(date '+%Y-%m-%d %H:%M:%S')] $1" + echo "$message" | tee -a "$LOG_FILE" +} + +log_error() { + local message="[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: $1" + echo "$message" | tee -a "$LOG_FILE" >&2 +} + +log_step() { + local message="[$(date '+%Y-%m-%d %H:%M:%S')] STEP: $1" + echo "$message" | tee -a "$LOG_FILE" +} + +print_header() { + local header="=========================================" + echo "" | tee -a "$LOG_FILE" + echo "$header" | tee -a "$LOG_FILE" + echo "$1" | tee -a "$LOG_FILE" + echo "$header" | tee -a "$LOG_FILE" + echo "" | tee -a "$LOG_FILE" +} + +print_error() { + log_error "$1" +} + +print_info() { + log "INFO: $1" +} + +cleanup() { + log_step "Starting cleanup process" + + if [ "$KEEP_DATA" != "true" ]; then + print_header "Cleanup" + + print_info "Removing CSV files from container..." + if docker exec $CONTAINER_NAME bash -c "rm -f /tmp/*.csv /tmp/import_*.sql /tmp/sample_*.sql" 2>/dev/null; then + log "CSV files removed from container successfully" + else + log "Failed to remove CSV files from container (may not exist)" + fi + + if [ -d "$DATA_DIR" ]; then + print_info "Removing local data directory: $DATA_DIR" + rm -rf "$DATA_DIR" + log "Data directory removed: $DATA_DIR" + fi + + if [ "$REMOVE_RELBENCH_CACHE" = "true" ]; then + RELBENCH_CACHE="$HOME/.cache/relbench" + if [ -d "$RELBENCH_CACHE" ]; then + print_info "Removing RelBench cache: $RELBENCH_CACHE" + rm -rf "$RELBENCH_CACHE" + log "RelBench cache removed" + fi + fi + + echo "Cleanup complete" | tee -a "$LOG_FILE" + else + print_info "Keeping data files (KEEP_DATA=true)" + fi + + log_step "Cleanup completed" +} + +trap 'log_error "Script failed on line $LINENO with exit code $?"; cleanup; exit 1' ERR +trap 'cleanup' EXIT + +show_usage() { + cat << EOF +Usage: $0 [options] + +Arguments: + dataset-name RelBench dataset name (e.g., rel-f1, rel-amazon, rel-hm) + +Options: + --db-name NAME Database name (default: derived from dataset name) + --db-password PASS Database password (default: mlflow) + --output-dir DIR Output directory for data files + --sample-size N Target total records to import (default: 3000) + --seed-records N Initial seed records from main table (default: 500) + --keep-data Keep CSV and SQL files after import + --remove-cache Remove RelBench cache after import + --help Show this help message + +Examples: + $0 rel-f1 + $0 rel-amazon --db-name amazon --sample-size 5000 + $0 rel-hm --keep-data --seed-records 1000 + $0 rel-stack --db-name stackoverflow --output-dir ./stack_data + +Supported datasets: + rel-f1, rel-amazon, rel-hm, rel-stack, rel-trial, + rel-event, rel-avito, rel-salt, rel-arxiv, rel-ratebeer + +Logs are saved to: $LOG_DIR/ + +EOF + exit 0 +} + +# Bắt đầu logging +log "==========================================" +log "Script started: $0" +log "Arguments: $*" +log "==========================================" + +if [ $# -eq 0 ]; then + show_usage +fi + +DATASET_NAME="" +DB_NAME="" +OUTPUT_DIR="" +KEEP_DATA="false" +REMOVE_RELBENCH_CACHE="false" + +log_step "Parsing command line arguments" + +while [ $# -gt 0 ]; do + case "$1" in + --db-name) + DB_NAME="$2" + log "Setting DB_NAME=$DB_NAME" + shift 2 + ;; + --db-password) + DB_PASSWORD="$2" + log "Setting DB_PASSWORD=[HIDDEN]" + shift 2 + ;; + --output-dir) + OUTPUT_DIR="$2" + log "Setting OUTPUT_DIR=$OUTPUT_DIR" + shift 2 + ;; + --sample-size) 
+ SAMPLE_SIZE="$2" + log "Setting SAMPLE_SIZE=$SAMPLE_SIZE" + shift 2 + ;; + --seed-records) + MAX_SEED_RECORDS="$2" + log "Setting MAX_SEED_RECORDS=$MAX_SEED_RECORDS" + shift 2 + ;; + --keep-data) + KEEP_DATA="true" + log "Setting KEEP_DATA=true" + shift + ;; + --remove-cache) + REMOVE_RELBENCH_CACHE="true" + log "Setting REMOVE_RELBENCH_CACHE=true" + shift + ;; + --help) + show_usage + ;; + -*) + print_error "Unknown option: $1" + show_usage + ;; + *) + if [ -z "$DATASET_NAME" ]; then + DATASET_NAME="$1" + log "Setting DATASET_NAME=$DATASET_NAME" + else + print_error "Multiple dataset names provided" + show_usage + fi + shift + ;; + esac +done + +if [ -z "$DATASET_NAME" ]; then + print_error "Dataset name is required" + show_usage +fi + +DATASET_SHORT=$(echo "$DATASET_NAME" | sed 's/^rel-//') +log "DATASET_SHORT=$DATASET_SHORT" + +if [ -z "$DB_NAME" ]; then + DB_NAME="$DATASET_SHORT" + log "DB_NAME set to default: $DB_NAME" +fi + +if [ -z "$OUTPUT_DIR" ]; then + OUTPUT_DIR="./${DATASET_SHORT}_data" + log "OUTPUT_DIR set to default: $OUTPUT_DIR" +fi + +DATA_DIR="$OUTPUT_DIR" +SQL_FILE="$DATA_DIR/import_${DATASET_SHORT}.sql" +SAMPLE_SQL_FILE="$DATA_DIR/sample_${DATASET_SHORT}.sql" + +print_header "RelBench Partial Import - $DATASET_NAME" + +echo "Configuration:" | tee -a "$LOG_FILE" +echo " Dataset: $DATASET_NAME" | tee -a "$LOG_FILE" +echo " Database: $DB_NAME" | tee -a "$LOG_FILE" +echo " Container: $CONTAINER_NAME" | tee -a "$LOG_FILE" +echo " Data dir: $DATA_DIR" | tee -a "$LOG_FILE" +echo " Sample size: $SAMPLE_SIZE records" | tee -a "$LOG_FILE" +echo " Seed records: $MAX_SEED_RECORDS" | tee -a "$LOG_FILE" +echo " Log file: $LOG_FILE" | tee -a "$LOG_FILE" +echo "" | tee -a "$LOG_FILE" + +log_step "Checking Python installation" +if ! command -v python3 &> /dev/null; then + print_error "Python 3 is not installed" + exit 1 +fi +log "Python 3 is available: $(python3 --version)" + +log_step "Checking Python dependencies" +print_info "Checking Python dependencies..." + +if ! python3 -c "import relbench" 2>/dev/null; then + print_info "Installing relbench..." + if pip install relbench >> "$LOG_FILE" 2>&1; then + log "relbench installed successfully" + else + log_error "Failed to install relbench" + exit 1 + fi +else + log "relbench is already installed" +fi + +if ! python3 -c "import pandas" 2>/dev/null; then + print_info "Installing pandas..." + if pip install pandas >> "$LOG_FILE" 2>&1; then + log "pandas installed successfully" + else + log_error "Failed to install pandas" + exit 1 + fi +else + log "pandas is already installed" +fi +echo "Python dependencies OK" | tee -a "$LOG_FILE" + +print_header "Step 1: Verify PostgreSQL Container" +log_step "Verifying PostgreSQL container" +print_info "Checking container: $CONTAINER_NAME" + +if ! docker ps | grep -q $CONTAINER_NAME; then + print_error "Container '$CONTAINER_NAME' is not running" + print_info "Start it with: docker start $CONTAINER_NAME" + exit 1 +fi +log "Container $CONTAINER_NAME is running" + +print_header "Step 2: Prepare Database" +log_step "Preparing database" +print_info "Checking database: $DB_NAME" + +if ! docker exec $CONTAINER_NAME psql -U $DB_USER -lqt | cut -d \| -f 1 | grep -qw $DB_NAME; then + print_info "Database '$DB_NAME' does not exist, creating..." 
+ if docker exec $CONTAINER_NAME psql -U $DB_USER -d postgres -c "CREATE DATABASE $DB_NAME;" >> "$LOG_FILE" 2>&1; then + log "Database '$DB_NAME' created successfully" + else + log_error "Failed to create database '$DB_NAME'" + exit 1 + fi +else + log "Database '$DB_NAME' already exists" + print_info "Existing tables will be dropped and recreated" +fi + +print_header "Step 3: Download Dataset and Generate SQL" +log_step "Downloading dataset and generating SQL" +print_info "Running Python script to download $DATASET_NAME dataset..." + +if [ ! -f "generate_relbench_sql.py" ]; then + print_error "generate_relbench_sql.py not found in current directory" + exit 1 +fi + +log "Executing: python3 generate_relbench_sql.py $DATASET_NAME --output-dir $DATA_DIR" +if python3 generate_relbench_sql.py "$DATASET_NAME" --output-dir "$DATA_DIR" >> "$LOG_FILE" 2>&1; then + log "Python script executed successfully" +else + log_error "Python script failed with exit code $?" + exit 1 +fi + +if [ ! -d "$DATA_DIR" ]; then + print_error "Data directory not created, Python script may have failed" + exit 1 +fi + +if [ ! -f "$SQL_FILE" ]; then + print_error "SQL script not generated, Python script may have failed" + exit 1 +fi + +log "Dataset downloaded and SQL generated successfully" + +CSV_COUNT=$(ls -1 $DATA_DIR/*.csv 2>/dev/null | wc -l) +print_info "Found $CSV_COUNT CSV files" +log "CSV files count: $CSV_COUNT" + +print_header "Step 4: Create Schema and Load Full Data to Temporary Database" +log_step "Creating temporary database for sampling" + +TEMP_DB="${DB_NAME}_temp" +print_info "Creating temporary database: $TEMP_DB" + +docker exec $CONTAINER_NAME psql -U $DB_USER -d postgres -c "DROP DATABASE IF EXISTS $TEMP_DB;" >> "$LOG_FILE" 2>&1 +if docker exec $CONTAINER_NAME psql -U $DB_USER -d postgres -c "CREATE DATABASE $TEMP_DB;" >> "$LOG_FILE" 2>&1; then + log "Temporary database created: $TEMP_DB" +else + log_error "Failed to create temporary database" + exit 1 +fi + +print_info "Copying CSV files to container..." +for csv_file in $DATA_DIR/*.csv; do + if [ -f "$csv_file" ]; then + filename=$(basename "$csv_file") + if docker cp "$csv_file" $CONTAINER_NAME:/tmp/ >> "$LOG_FILE" 2>&1; then + log "Copied $filename to container" + else + log_error "Failed to copy $filename" + exit 1 + fi + fi +done + +print_info "Copying SQL script to container..." +if docker cp "$SQL_FILE" $CONTAINER_NAME:/tmp/ >> "$LOG_FILE" 2>&1; then + log "SQL script copied to container successfully" +else + log_error "Failed to copy SQL script to container" + exit 1 +fi + +print_info "Loading full data into temporary database..." +if docker exec -i $CONTAINER_NAME psql -U $DB_USER -d $TEMP_DB -f /tmp/import_${DATASET_SHORT}.sql >> "$LOG_FILE" 2>&1; then + log "Full data loaded into temporary database" +else + log_error "Failed to load data into temporary database" + exit 1 +fi + +print_header "Step 5: Analyze Schema and Generate Sample" +log_step "Analyzing database schema and relationships" + +print_info "Generating intelligent sample with foreign key relationships..." 
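+# The heredoc below writes create_sample.py. That script reads the temporary
+# database's schema and foreign keys, picks a central (fact) table, samples seed
+# rows from it, then walks the FK graph breadth-first in both directions to build
+# a referentially consistent subset and emit INSERT statements for the target DB.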
+ +# Tạo script Python để phân tích và tạo sample +cat > "$DATA_DIR/create_sample.py" << 'PYTHON_SCRIPT' +#!/usr/bin/env python3 +import sys +import psycopg2 +from collections import defaultdict, deque +import json + +def get_table_info(conn): + """Lấy thông tin về các bảng và số lượng bản ghi""" + cur = conn.cursor() + cur.execute(""" + SELECT schemaname, relname, n_live_tup + FROM pg_stat_user_tables + WHERE schemaname = 'public' + ORDER BY n_live_tup DESC + """) + tables = {} + for row in cur.fetchall(): + tables[row[1]] = int(row[2]) # row[1] is relname, row[2] is n_live_tup + cur.close() + return tables + +def format_ids_for_sql(ids): + """Format list of IDs for SQL IN clause""" + formatted = [] + for id_val in ids: + if id_val is None: + continue + elif isinstance(id_val, str): + escaped = id_val.replace("'", "''") + formatted.append(f"'{escaped}'") + elif isinstance(id_val, (int, float)): + formatted.append(str(id_val)) + else: + escaped = str(id_val).replace("'", "''") + formatted.append(f"'{escaped}'") + return ','.join(formatted) if formatted else "''" + +def get_foreign_keys(conn): + """Lấy thông tin về foreign keys""" + cur = conn.cursor() + cur.execute(""" + SELECT + tc.table_name as from_table, + kcu.column_name as from_column, + ccu.table_name AS to_table, + ccu.column_name AS to_column + FROM information_schema.table_constraints AS tc + JOIN information_schema.key_column_usage AS kcu + ON tc.constraint_name = kcu.constraint_name + AND tc.table_schema = kcu.table_schema + JOIN information_schema.constraint_column_usage AS ccu + ON ccu.constraint_name = tc.constraint_name + AND ccu.table_schema = tc.table_schema + WHERE tc.constraint_type = 'FOREIGN KEY' + AND tc.table_schema = 'public' + """) + + fks = defaultdict(list) + reverse_fks = defaultdict(list) + + for row in cur.fetchall(): + from_table, from_col, to_table, to_col = row + fks[from_table].append({ + 'from_col': from_col, + 'to_table': to_table, + 'to_col': to_col + }) + reverse_fks[to_table].append({ + 'to_col': to_col, + 'from_table': from_table, + 'from_col': from_col + }) + + cur.close() + return dict(fks), dict(reverse_fks) + +def find_central_table(tables, fks, reverse_fks, pk_columns): + """Find central table - prioritize fact tables with multiple FKs for better sampling""" + import math + scores = {} + + print("\nAnalyzing table importance for sampling...") + for table in tables: + # Điểm = số FK đi ra + số FK đi vào + outgoing = len(fks.get(table, [])) + incoming = len(reverse_fks.get(table, [])) + + # PRIORITIZE FACT TABLES (tables with 2+ FKs) - these connect dimensions + # Example: review table connects customer + product + if outgoing >= 2: + fk_score = 10000 + (outgoing * 1000) # Huge bonus for fact tables + else: + fk_score = outgoing * 100 + + # Add points for being referenced by other tables + fk_score += incoming * 50 + + # Prefer tables with primary key (easier to track) + has_pk = pk_columns.get(table, 'ctid') != 'ctid' + pk_bonus = 500 if has_pk else 0 + + # Consider table size - prefer tables with data + size_score = math.log10(tables.get(table, 0) + 1) * 10 if tables.get(table, 0) > 0 else 0 + + total_score = fk_score + pk_bonus + size_score + scores[table] = total_score + + print(f" {table:20s}: FKs={outgoing}, reverse={incoming}, PK={has_pk}, rows={tables.get(table, 0):,}, score={total_score:.0f}") + + # Sắp xếp theo điểm + sorted_tables = sorted(scores.items(), key=lambda x: x[1], reverse=True) + + if sorted_tables: + selected = sorted_tables[0][0] + print(f"\nSelected central table: 
{selected} (score={sorted_tables[0][1]:.0f})") + return selected + return list(tables.keys())[0] if tables else None + +def create_sample_sql(conn, db_name, target_db, sample_size, seed_records): + """Tạo SQL script để sample dữ liệu có liên kết""" + + print("Analyzing database schema...") + tables = get_table_info(conn) + fks, reverse_fks = get_foreign_keys(conn) + + print(f"\nFound {len(tables)} tables:") + for table, count in sorted(tables.items(), key=lambda x: x[1], reverse=True): + print(f" - {table}: {count:,} records") + + print(f"\nFound {sum(len(v) for v in fks.values())} foreign key relationships") + + # Get primary keys for all tables FIRST (needed for central table selection) + cur = conn.cursor() + pk_columns = {} + for table in tables: + cur.execute(f""" + SELECT a.attname + FROM pg_index i + JOIN pg_attribute a ON a.attrelid = i.indrelid AND a.attnum = ANY(i.indkey) + WHERE i.indrelid = '{table}'::regclass AND i.indisprimary + """) + pk_result = cur.fetchone() + if pk_result: + pk_columns[table] = pk_result[0] + else: + # Fallback: use ctid (system column) for tables without primary key + pk_columns[table] = 'ctid' + + central_table = find_central_table(tables, fks, reverse_fks, pk_columns) + print(f"\nCentral table identified: {central_table} (PK: {pk_columns.get(central_table, 'none')})") + + # Tạo temporary tables để lưu IDs đã chọn + sampled_ids = defaultdict(set) + visited_tables = set() + + cur = conn.cursor() + + # Bước 1: Lấy seed records từ bảng trung tâm + print(f"\nStep 1: Sampling {seed_records} seed records from {central_table}...") + + pk_col = pk_columns[central_table] + + # Sample random records - for tables without PK, use ctid + if pk_col == 'ctid': + # For tables without PK (fact tables), sample and immediately get referenced dimension records + cur.execute(f"SELECT column_name FROM information_schema.columns WHERE table_name = '{central_table}' ORDER BY ordinal_position") + all_columns = [row[0] for row in cur.fetchall()] + columns_str = ', '.join(all_columns) + + # Use TABLESAMPLE for better performance on large tables + cur.execute(f""" + SELECT {columns_str} FROM {central_table} + TABLESAMPLE SYSTEM (1) -- Sample ~1% of pages + LIMIT {seed_records} + """) + # Store tuples of all column values + seed_ids = [row for row in cur.fetchall()] + sampled_ids[central_table] = set(seed_ids) + print(f" Sampled {len(seed_ids)} records from {central_table} (no PK - using full row)") + + # CRITICAL: Extract FK values to get referenced dimension records + col_name_to_idx = {col: idx for idx, col in enumerate(all_columns)} + + for fk in fks.get(central_table, []): + to_table = fk['to_table'] + from_col = fk['from_col'] + to_col = fk['to_col'] + + # Extract FK values from sampled rows + fk_col_idx = col_name_to_idx.get(from_col) + if fk_col_idx is not None: + fk_values = set() + for row in seed_ids: + if row[fk_col_idx] is not None: + fk_values.add(row[fk_col_idx]) + + if fk_values: + sampled_ids[to_table] = fk_values + visited_tables.add(to_table) + total_records += len(fk_values) + print(f" -> Collected {len(fk_values)} {to_table} records (via {from_col})") + + total_records = len(seed_ids) + else: + cur.execute(f""" + SELECT {pk_col} FROM {central_table} + TABLESAMPLE SYSTEM (1) + LIMIT {seed_records} + """) + seed_ids = [row[0] for row in cur.fetchall()] + sampled_ids[central_table] = set(seed_ids) + print(f" Sampled {len(seed_ids)} records from {central_table}") + total_records = len(seed_ids) + + # Bước 2: BFS để lấy các bản ghi liên quan + queue = 
deque([(central_table, seed_ids)]) + visited_tables.add(central_table) + total_records = len(seed_ids) + + print(f"\nStep 2: Cascading through foreign key relationships...") + iteration = 0 + + while queue and total_records < sample_size: + iteration += 1 + current_table, current_ids = queue.popleft() + + if not current_ids: + continue + + print(f"\n Iteration {iteration}: Processing {current_table} ({len(current_ids)} IDs)") + + # Lấy bảng cha (through FK) + for fk in fks.get(current_table, []): + to_table = fk['to_table'] + from_col = fk['from_col'] + to_col = fk['to_col'] + + if total_records >= sample_size: + break + + # Lấy parent IDs - sử dụng subquery an toàn hơn + ids_str = format_ids_for_sql(current_ids) + if not ids_str or ids_str == "''": + continue + + try: + cur.execute(f""" + SELECT DISTINCT {to_col} + FROM {to_table} + WHERE {to_col} IS NOT NULL + AND {to_col} IN ( + SELECT DISTINCT {from_col} + FROM {current_table} + WHERE {from_col} IS NOT NULL + AND {from_col} IN ({ids_str}) + ) + """) + + parent_ids = [row[0] for row in cur.fetchall()] + new_ids = set(parent_ids) - sampled_ids[to_table] + + if new_ids: + sampled_ids[to_table].update(new_ids) + total_records += len(new_ids) + print(f" -> {to_table}: +{len(new_ids)} records (total: {len(sampled_ids[to_table])})") + + if to_table not in visited_tables: + queue.append((to_table, list(new_ids))) + visited_tables.add(to_table) + except Exception as e: + print(f" -> {to_table}: Error - {e}") + conn.rollback() # Rollback transaction on error + continue + + # Lấy bảng con (reverse FK) - giới hạn số lượng để tránh quá nhiều + for rfk in reverse_fks.get(current_table, []): + from_table = rfk['from_table'] + from_col = rfk['from_col'] + to_col = rfk['to_col'] + + if total_records >= sample_size: + break + + # Lấy child IDs với giới hạn + ids_str = format_ids_for_sql(current_ids) + if not ids_str or ids_str == "''": + continue + + limit = min(500, sample_size - total_records) # Giới hạn mỗi lần lấy 500 records + + try: + child_pk_col = pk_columns.get(from_table) + + # For tables without PK, we can't track by ID - skip them in reverse FK traversal + # They will be sampled based on FK constraints later + if child_pk_col == 'ctid': + print(f" <- {from_table}: Skipped (no PK - will sample by FK constraint)") + continue + + cur.execute(f""" + SELECT DISTINCT {child_pk_col} + FROM {from_table} + WHERE {from_col} IS NOT NULL + AND {from_col} IN ({ids_str}) + LIMIT {limit} + """) + + child_ids = [row[0] for row in cur.fetchall()] + new_ids = set(child_ids) - sampled_ids[from_table] + + if new_ids: + sampled_ids[from_table].update(new_ids) + total_records += len(new_ids) + print(f" <- {from_table}: +{len(new_ids)} records (total: {len(sampled_ids[from_table])})") + + if from_table not in visited_tables: + queue.append((from_table, list(new_ids))) + visited_tables.add(from_table) + except Exception as e: + print(f" <- {from_table}: Error - {e}") + conn.rollback() # Rollback transaction on error + continue + + # Bước 2.3: Sample tables without primary keys based on FK relationships + print(f"\nStep 2.3: Sampling tables without primary keys...") + for table in tables: + if pk_columns.get(table) != 'ctid': + continue # Skip tables with primary keys + + if table in sampled_ids and sampled_ids[table]: + continue # Already sampled + + # Sample records from this table based on foreign key relationships + if table in fks and fks[table]: + print(f" Processing {table} (no PK)...") + + # Build WHERE clause based on all FKs + where_conditions = [] + for 
fk in fks[table]: + to_table = fk['to_table'] + from_col = fk['from_col'] + + if to_table in sampled_ids and sampled_ids[to_table]: + ids_str = format_ids_for_sql(list(sampled_ids[to_table])) + if ids_str and ids_str != "''": + where_conditions.append(f"{from_col} IN ({ids_str})") + + if where_conditions: + limit = min(1000, sample_size - total_records) + where_clause = " OR ".join(where_conditions) + + try: + # Get all columns for this table + cur.execute(f"SELECT column_name FROM information_schema.columns WHERE table_name = '{table}' ORDER BY ordinal_position") + all_columns = [row[0] for row in cur.fetchall()] + columns_str = ', '.join(all_columns) + + cur.execute(f""" + SELECT {columns_str} + FROM {table} + WHERE {where_clause} + LIMIT {limit} + """) + + rows = cur.fetchall() + if rows: + # Store as tuples (can't use IDs for tables without PK) + sampled_ids[table] = set(rows) + total_records += len(rows) + print(f" {table}: +{len(rows)} records (total: {len(rows)})") + + # IMPORTANT: Collect referenced entities from these rows + # Map column names to indices + col_name_to_idx = {col: idx for idx, col in enumerate(all_columns)} + + for fk in fks[table]: + to_table = fk['to_table'] + from_col = fk['from_col'] + to_col = fk['to_col'] + + # Skip if target table doesn't have PK (can't collect them) + if pk_columns.get(to_table) == 'ctid': + continue + + # Extract foreign key values from sampled rows + fk_col_idx = col_name_to_idx.get(from_col) + if fk_col_idx is not None: + fk_values = set() + for row in rows: + if row[fk_col_idx] is not None: + fk_values.add(row[fk_col_idx]) + + # Add these to sampled_ids for the referenced table + if fk_values: + new_fk_values = fk_values - sampled_ids.get(to_table, set()) + if new_fk_values: + if to_table not in sampled_ids: + sampled_ids[to_table] = set() + sampled_ids[to_table].update(new_fk_values) + total_records += len(new_fk_values) + print(f" -> {to_table}: +{len(new_fk_values)} records (from {from_col})") + except Exception as e: + print(f" {table}: Error - {e}") + conn.rollback() # Rollback transaction on error + + # Bước 2.5: Validate và ensure FK integrity by adding missing referenced records + print(f"\nStep 2.5: Ensuring foreign key integrity...") + added_count = 0 + max_iterations = 10 + + for iteration in range(max_iterations): + added_this_iteration = 0 + + for table in list(sampled_ids.keys()): + # Skip tables without primary keys - can't validate them the same way + if pk_columns.get(table) == 'ctid': + continue + + if table not in fks or not fks[table]: + continue + + ids_list = list(sampled_ids[table]) + + # Check each FK constraint and ADD missing referenced records + for fk in fks[table]: + to_table = fk['to_table'] + from_col = fk['from_col'] + to_col = fk['to_col'] + + # Skip if target table has no primary key + if pk_columns.get(to_table) == 'ctid': + continue + + # Get FK values for sampled records + ids_str = format_ids_for_sql(ids_list) + if not ids_str or ids_str == "''": + continue + + try: + # Find FK values that are NOT in the sampled set + cur.execute(f""" + SELECT DISTINCT t.{from_col} + FROM {table} t + WHERE t.{pk_columns[table]} IN ({ids_str}) + AND t.{from_col} IS NOT NULL + """) + + fk_values = set(row[0] for row in cur.fetchall()) + + # Find which ones are missing from sampled_ids + existing_ids = sampled_ids.get(to_table, set()) + missing_ids = fk_values - existing_ids + + if missing_ids: + # Add these missing IDs to the sample + if to_table not in sampled_ids: + sampled_ids[to_table] = set() + 
sampled_ids[to_table].update(missing_ids) + added_this_iteration += len(missing_ids) + added_count += len(missing_ids) + print(f" {table}.{from_col} -> {to_table}: +{len(missing_ids)} referenced records") + + except Exception as e: + print(f" Warning: Could not check FK {table}.{from_col} -> {to_table}: {e}") + conn.rollback() + continue + + if added_this_iteration == 0: + print(f" Validation complete after {iteration + 1} iteration(s)") + break + + if added_count > 0: + print(f"\nTotal FK-referenced records added: {added_count}") + total_records = sum(len(ids) for ids in sampled_ids.values()) + print(f"Clean sample size: {total_records} records") + else: + print(f" No orphaned records found - data integrity is perfect!") + + # Bước 3: Tạo SQL script + print(f"\nStep 3: Generating SQL script...") + sql_lines = [] + sql_lines.append(f"-- Sample data from {db_name}") + sql_lines.append(f"-- Total records to import: {total_records}") + sql_lines.append(f"-- Generated at: {import_timestamp}") + sql_lines.append("") + + # Tạo bảng theo thứ tự dependency + table_order = [] + remaining = set(sampled_ids.keys()) + + while remaining: + added = False + for table in list(remaining): + # Check if all dependencies are satisfied + deps = set(fk['to_table'] for fk in fks.get(table, [])) + if deps.issubset(set(table_order)): + table_order.append(table) + remaining.remove(table) + added = True + + if not added and remaining: + # Break circular dependency + table_order.append(remaining.pop()) + + print(f"\nTable processing order: {' -> '.join(table_order)}") + + for table in table_order: + ids = sampled_ids[table] + if not ids: + continue + + print(f"\nProcessing {table}: {len(ids)} records") + + # Lấy primary key + cur.execute(f""" + SELECT a.attname + FROM pg_index i + JOIN pg_attribute a ON a.attrelid = i.indrelid AND a.attnum = ANY(i.indkey) + WHERE i.indrelid = '{table}'::regclass AND i.indisprimary + """) + pk_result = cur.fetchone() + if pk_result: + pk_col = pk_result[0] + else: + cur.execute(f"SELECT column_name FROM information_schema.columns WHERE table_name = '{table}' LIMIT 1") + pk_col = cur.fetchone()[0] + + # Lấy danh sách các cột + cur.execute(f""" + SELECT column_name, data_type + FROM information_schema.columns + WHERE table_name = '{table}' + AND table_schema = 'public' + ORDER BY ordinal_position + """) + columns = cur.fetchall() + col_names = [col[0] for col in columns] + + sql_lines.append(f"-- Table: {table} ({len(ids)} records)") + + pk_col = pk_columns.get(table) + + # For tables without primary key, data is already stored as full rows + if pk_col == 'ctid': + # Lấy danh sách các cột + cur.execute(f""" + SELECT column_name, data_type + FROM information_schema.columns + WHERE table_name = '{table}' + AND table_schema = 'public' + ORDER BY ordinal_position + """) + columns = cur.fetchall() + col_names = [col[0] for col in columns] + + # ids already contains full row tuples + rows = list(ids) + else: + # For tables with primary key, query by PK + # Lấy danh sách các cột + cur.execute(f""" + SELECT column_name, data_type + FROM information_schema.columns + WHERE table_name = '{table}' + AND table_schema = 'public' + ORDER BY ordinal_position + """) + columns = cur.fetchall() + col_names = [col[0] for col in columns] + + # Lấy dữ liệu thực tế từ database + ids_str = format_ids_for_sql(list(ids)) + if not ids_str or ids_str == "''": + print(f" Skipping {table} - no valid IDs") + continue + + query = f""" + SELECT {', '.join(col_names)} + FROM {table} + WHERE {pk_col} IN ({ids_str}) + """ 
+ + try: + cur.execute(query) + rows = cur.fetchall() + except Exception as e: + print(f" Error querying {table}: {e}") + continue + + # Tạo INSERT statements + if rows: + sql_lines.append(f"INSERT INTO {table} ({', '.join(col_names)}) VALUES") + for i, row in enumerate(rows): + # Format values + values = [] + for val in row: + if val is None: + values.append('NULL') + elif isinstance(val, str): + # Escape single quotes + escaped = val.replace("'", "''") + values.append(f"'{escaped}'") + elif isinstance(val, (int, float)): + values.append(str(val)) + elif isinstance(val, bool): + values.append('TRUE' if val else 'FALSE') + else: + # For other types (date, timestamp, etc.), convert to string + escaped = str(val).replace("'", "''") + values.append(f"'{escaped}'") + + if i < len(rows) - 1: + sql_lines.append(f" ({', '.join(values)}),") + else: + sql_lines.append(f" ({', '.join(values)});") + + sql_lines.append("") + + cur.close() + + print(f"\n{'='*50}") + print(f"Sample Summary:") + print(f"{'='*50}") + for table in table_order: + if table in sampled_ids: + print(f" {table:30} {len(sampled_ids[table]):>10,} records") + print(f"{'='*50}") + print(f" {'TOTAL':30} {sum(len(ids) for ids in sampled_ids.values()):>10,} records") + print(f"{'='*50}") + + return '\n'.join(sql_lines), sampled_ids + +if __name__ == '__main__': + import datetime + import_timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + + db_name = sys.argv[1] + target_db = sys.argv[2] + sample_size = int(sys.argv[3]) + seed_records = int(sys.argv[4]) + output_file = sys.argv[5] + pg_port = int(sys.argv[6]) if len(sys.argv) > 6 else 5432 + pg_password = sys.argv[7] if len(sys.argv) > 7 else '' + + print(f"Connecting to PostgreSQL on localhost:{pg_port}...") + conn = psycopg2.connect( + host='localhost', + database=db_name, + user='mlflow', + password=pg_password, + port=pg_port + ) + print("Connected successfully!") + print() + + sql_script, sampled_ids = create_sample_sql(conn, db_name, target_db, sample_size, seed_records) + + with open(output_file, 'w') as f: + f.write(sql_script) + + # Save summary as JSON + summary = { + 'total_records': sum(len(ids) for ids in sampled_ids.values()), + 'tables': {table: len(ids) for table, ids in sampled_ids.items()}, + 'timestamp': import_timestamp + } + + summary_file = output_file.replace('.sql', '_summary.json') + with open(summary_file, 'w') as f: + json.dump(summary, f, indent=2) + + print(f"\nSQL script saved to: {output_file}") + print(f"Summary saved to: {summary_file}") + + conn.close() +PYTHON_SCRIPT + +chmod +x "$DATA_DIR/create_sample.py" + +# Cài đặt psycopg2 trên host nếu chưa có +print_info "Checking psycopg2 on host machine..." +if ! python3 -c "import psycopg2" 2>/dev/null; then + print_info "Installing psycopg2-binary on host..." + if pip install psycopg2-binary >> "$LOG_FILE" 2>&1; then + log "psycopg2-binary installed successfully" + else + log_error "Failed to install psycopg2-binary" + exit 1 + fi +else + log "psycopg2 is already installed" +fi + +# Lấy port của PostgreSQL container +print_info "Getting PostgreSQL container port..." +PG_PORT=$(docker port $CONTAINER_NAME 5432 | cut -d: -f2) +if [ -z "$PG_PORT" ]; then + log_error "Could not determine PostgreSQL port" + exit 1 +fi +log "PostgreSQL port: $PG_PORT" + +# Chạy script Python trên host +print_info "Running intelligent sampling algorithm..." 
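+# The sampler runs on the host and connects to Postgres through the container's
+# published port, which is why psycopg2-binary is installed on the host above.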
+log "Command: python3 $DATA_DIR/create_sample.py $TEMP_DB $DB_NAME $SAMPLE_SIZE $MAX_SEED_RECORDS $SAMPLE_SQL_FILE $PG_PORT $DB_PASSWORD" + +if python3 "$DATA_DIR/create_sample.py" \ + "$TEMP_DB" "$DB_NAME" "$SAMPLE_SIZE" "$MAX_SEED_RECORDS" "$SAMPLE_SQL_FILE" "$PG_PORT" "$DB_PASSWORD" \ + 2>&1 | tee -a "$LOG_FILE"; then + log "Sampling completed successfully" +else + log_error "Python sampling script failed with exit code $?" + log_error "Check if psycopg2 is installed: pip install psycopg2-binary" + exit 1 +fi + +# Kiểm tra file output +if [ ! -f "$SAMPLE_SQL_FILE" ]; then + log_error "Sample SQL file was not created by Python script" + exit 1 +fi + +if [ ! -s "$SAMPLE_SQL_FILE" ]; then + log_error "Sample SQL file is empty" + exit 1 +fi + +log "Sample SQL file created: $SAMPLE_SQL_FILE ($(wc -l < "$SAMPLE_SQL_FILE") lines)" + +# Kiểm tra summary file +if [ -f "$DATA_DIR/sample_summary.json" ]; then + log "Sample summary created: $DATA_DIR/sample_summary.json" + cat "$DATA_DIR/sample_summary.json" | tee -a "$LOG_FILE" +else + log "Warning: Sample summary file not created" +fi + +print_header "Step 6: Import Sampled Data" +log_step "Importing sampled data into target database" + +# Kiểm tra file SQL đã được tạo chưa +if [ ! -f "$SAMPLE_SQL_FILE" ]; then + log_error "Sample SQL file was not created: $SAMPLE_SQL_FILE" + log_error "Python sampling script may have failed. Check the log above." + exit 1 +fi + +log "Sample SQL file found: $SAMPLE_SQL_FILE ($(wc -l < "$SAMPLE_SQL_FILE") lines)" + +# Drop và recreate database để đảm bảo clean state +print_info "Recreating target database for clean import..." +docker exec $CONTAINER_NAME psql -U $DB_USER -d postgres -c "DROP DATABASE IF EXISTS $DB_NAME;" >> "$LOG_FILE" 2>&1 +if docker exec $CONTAINER_NAME psql -U $DB_USER -d postgres -c "CREATE DATABASE $DB_NAME;" >> "$LOG_FILE" 2>&1; then + log "Target database recreated: $DB_NAME" +else + log_error "Failed to recreate target database" + exit 1 +fi + +print_info "Creating schema in target database..." +# Tạo schema từ SQL file gốc nhưng loại bỏ foreign keys và indexes +# Sử dụng sed để tách phần CREATE TABLE +cat > "$DATA_DIR/schema_only.sql" << 'SQL_HEADER' +-- Schema creation script (tables only, no foreign keys) +-- Generated by import_relbench.sh +SQL_HEADER + +# Extract only CREATE TABLE statements (multi-line) +awk '/^CREATE TABLE/,/;$/' "$SQL_FILE" >> "$DATA_DIR/schema_only.sql" + +docker cp "$DATA_DIR/schema_only.sql" $CONTAINER_NAME:/tmp/ + +if docker exec $CONTAINER_NAME psql -U $DB_USER -d $DB_NAME -f /tmp/schema_only.sql >> "$LOG_FILE" 2>&1; then + log "Schema created successfully (without foreign keys)" +else + log_error "Failed to create schema" + exit 1 +fi + +print_info "Importing sampled data..." + +# Kiểm tra file SQL có tồn tại không +if [ ! -f "$SAMPLE_SQL_FILE" ]; then + log_error "Sample SQL file not found: $SAMPLE_SQL_FILE" + exit 1 +fi + +# Kiểm tra file có rỗng không +if [ ! -s "$SAMPLE_SQL_FILE" ]; then + log_error "Sample SQL file is empty: $SAMPLE_SQL_FILE" + exit 1 +fi + +log "Sample SQL file size: $(wc -l < "$SAMPLE_SQL_FILE") lines" + +# Copy SQL file vào container +if docker cp "$SAMPLE_SQL_FILE" $CONTAINER_NAME:/tmp/sample_${DATASET_SHORT}.sql >> "$LOG_FILE" 2>&1; then + log "SQL file copied to container" +else + log_error "Failed to copy SQL file to container" + exit 1 +fi + +# Import với error handling chi tiết +print_info "Executing SQL import..." 
+if docker exec $CONTAINER_NAME psql -U $DB_USER -d $DB_NAME -f /tmp/sample_${DATASET_SHORT}.sql 2>&1 | tee -a "$LOG_FILE"; then + log "SQL import completed successfully" +else + log_error "SQL import failed. Check log file for details: $LOG_FILE" + log_error "Last 20 lines of SQL file:" + tail -20 "$SAMPLE_SQL_FILE" | tee -a "$LOG_FILE" + exit 1 +fi + +# Thêm foreign keys sau khi import data +print_info "Adding foreign key constraints..." +# Extract ALTER TABLE ADD CONSTRAINT statements with their FOREIGN KEY lines (2 lines each) +grep -A 1 "^ALTER TABLE.*ADD CONSTRAINT" "$SQL_FILE" | grep -v "^--$" > "$DATA_DIR/foreign_keys.sql" || true +if [ -s "$DATA_DIR/foreign_keys.sql" ]; then + docker cp "$DATA_DIR/foreign_keys.sql" $CONTAINER_NAME:/tmp/ + if docker exec $CONTAINER_NAME psql -U $DB_USER -d $DB_NAME -f /tmp/foreign_keys.sql >> "$LOG_FILE" 2>&1; then + log "Foreign key constraints added successfully" + else + log "Warning: Some foreign key constraints failed (expected with partial data)" + fi +else + log "No foreign key constraints to add" +fi + +# Xóa temporary database +print_info "Cleaning up temporary database..." +docker exec $CONTAINER_NAME psql -U $DB_USER -d postgres -c "DROP DATABASE IF EXISTS $TEMP_DB;" >> "$LOG_FILE" 2>&1 +log "Temporary database dropped" + +print_header "Step 7: Verify Import" +log_step "Verifying sampled data import" + +print_info "Actual record counts in database:" +echo "" | tee -a "$LOG_FILE" +echo "=========================================" | tee -a "$LOG_FILE" +echo "IMPORTED RECORDS BY TABLE" | tee -a "$LOG_FILE" +echo "=========================================" | tee -a "$LOG_FILE" + +# Query để lấy số lượng records trong mỗi bảng +TOTAL_COUNT=0 +docker exec $CONTAINER_NAME psql -U $DB_USER -d $DB_NAME -t -c " +SELECT + relname || ':' || n_live_tup as info +FROM pg_stat_user_tables +WHERE schemaname = 'public' +ORDER BY n_live_tup DESC; +" | while IFS=: read -r table count; do + if [ ! -z "$table" ] && [ ! 
-z "$count" ]; then + # Trim whitespace + table=$(echo "$table" | xargs) + count=$(echo "$count" | xargs) + printf " %-30s %10s records\n" "$table" "$count" | tee -a "$LOG_FILE" + TOTAL_COUNT=$((TOTAL_COUNT + count)) + fi +done + +echo "=========================================" | tee -a "$LOG_FILE" + +# Get total count from database +TOTAL_RECORDS=$(docker exec $CONTAINER_NAME psql -U $DB_USER -d $DB_NAME -t -c " +SELECT SUM(n_live_tup) FROM pg_stat_user_tables WHERE schemaname = 'public'; +" | xargs) + +printf " %-30s %10s records\n" "TOTAL" "$TOTAL_RECORDS" | tee -a "$LOG_FILE" +echo "=========================================" | tee -a "$LOG_FILE" +echo "" | tee -a "$LOG_FILE" + +log "Import verification completed - Total: $TOTAL_RECORDS records imported" \ No newline at end of file diff --git a/scripts/main.py b/scripts/main.py new file mode 100644 index 00000000..00f55181 --- /dev/null +++ b/scripts/main.py @@ -0,0 +1,6 @@ +def main(): + print("Hello from scripts!") + + +if __name__ == "__main__": + main() diff --git a/scripts/pyproject.toml b/scripts/pyproject.toml new file mode 100644 index 00000000..5913644f --- /dev/null +++ b/scripts/pyproject.toml @@ -0,0 +1,7 @@ +[project] +name = "scripts" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.10" +dependencies = [] diff --git a/scripts/uv.lock b/scripts/uv.lock new file mode 100644 index 00000000..a5cd35a6 --- /dev/null +++ b/scripts/uv.lock @@ -0,0 +1,8 @@ +version = 1 +revision = 3 +requires-python = ">=3.10" + +[[package]] +name = "scripts" +version = "0.1.0" +source = { virtual = "." } diff --git a/test_training_table.py b/test_training_table.py new file mode 100644 index 00000000..edec959c --- /dev/null +++ b/test_training_table.py @@ -0,0 +1,21 @@ +import os, sys +import numpy as np +import pandas as pd +from typing import Optional +import duckdb +from plexe.relbench.base import Database, Table, EntityTask, TaskType, Dataset +from plexe.relbench.metrics import accuracy, f1, roc_auc, average_precision + +from workdir.rel_f1_driver_dnf.dataset import GenDataset +from workdir.rel_f1_driver_dnf.task import GenTask + +from plexe.relbench.tasks.f1 import DriverDNFTask + +csv_dir = '/home/ta/kl/plexe-clone/workdir/rel_f1_driver_dnf/csv_files' +dataset = GenDataset(csv_dir=csv_dir) +gen_task = GenTask(dataset) +root_task = DriverDNFTask(dataset) + +db = dataset.get_db() +print("Training Table from GenTask:", gen_task.get_table("train")) +print("Training Table from DriverDNFTask:", root_task.get_table("train")) \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md deleted file mode 100644 index 91724ee6..00000000 --- a/tests/benchmark/README.md +++ /dev/null @@ -1,80 +0,0 @@ -# MLE-Bench Runner for Plexe - -This script benchmarks `plexe` on the `mle-bench` test suite released by OpenAI. It automates the process of running the Plexe library on Kaggle competitions and evaluating the results using the MLE-Bench framework. See the [mle bench repository](https://github.com/openai/mle-bench) or the [mle-bench paper](https://openai.com/index/mle-bench/) for more information. - -## Prerequisites - -To run the benchmark, you need to complete these steps: - -1. Clone the `plexe` repository: `git clone https://github.com/plexe-ai/plexe.git` -2. Install `git lfs` on your machine ([installation instructions](https://git-lfs.com/)) -3. 
Create a Kaggle account, create an API key, and save it in `~/.kaggle/kaggle.json` ([Kaggle API instructions](https://www.kaggle.com/docs/api)) -4. Install Python `3.11.0` or later -5. Install poetry: `pip install poetry` -6. Set up the project: `poetry install` in the project root directory -7. Configure API keys for your LLM provider as described in the main README.md: - ``` - # For OpenAI - export OPENAI_API_KEY= - # For Anthropic - export ANTHROPIC_API_KEY= - # For Gemini - export GEMINI_API_KEY= - ``` - -## Usage - -Run the benchmark with: - -```bash -poetry run python tests/benchmark/mle_bench.py -``` - -When you run the script for the first time, you will be prompted to: -1. Specify a directory where the `mle-bench` repository will be cloned -2. Set LLM provider details (provider, max iterations, timeout) - -The script will then: -1. Clone the `mle-bench` repository -2. Download the Kaggle datasets (by default, only the spaceship-titanic challenge) -3. Run Plexe on the datasets -4. Generate predictions and submission files -5. Evaluate the results using MLE-Bench - -### Command Line Options - -- `--config PATH`: Specify a custom config file path (default: mle-bench-config.yaml) -- `--rebuild`: Force re-clone the MLE-bench repository and regenerate the config file - -## Configuration - -The first time you run the script, it creates a file called `mle-bench-config.yaml` with the following structure: - -```yaml -repo_url: https://github.com/openai/mle-bench.git -repo_dir: /path/to/mle-bench -datasets: - - spaceship-titanic -provider: openai/gpt-4o -max_iterations: 3 -timeout: 3600 -``` - -Configuration options: - -- `repo_url`: The GitHub repository URL for MLE-bench -- `repo_dir`: Local directory to clone MLE-bench into -- `datasets`: List of Kaggle competitions to run (for initial testing, only spaceship-titanic is used) -- `provider`: LLM provider to use with Plexe (format: "provider/model") -- `max_iterations`: Maximum number of model solutions to explore -- `timeout`: Maximum time in seconds for model building - -The available datasets match the competitions in the [MLE-bench competitions directory](https://github.com/openai/mle-bench/tree/main/mlebench/competitions). - -## Output - -After running the benchmark, the results are stored in: -- `workdir/{dataset_name}/`: Model outputs and submissions for each dataset -- `grades/`: Evaluation results from MLE-bench scoring - -You can find submission CSV files and saved models in the `workdir/{dataset_name}/` directory. 
\ No newline at end of file diff --git a/tests/benchmark/__init__.py b/tests/benchmark/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/benchmark/mle-bench-config.yaml.jinja b/tests/benchmark/mle-bench-config.yaml.jinja deleted file mode 100644 index 21dc9521..00000000 --- a/tests/benchmark/mle-bench-config.yaml.jinja +++ /dev/null @@ -1,88 +0,0 @@ -repo_url: https://github.com/openai/mle-bench.git -repo_dir: {{ repo_dir }} -provider: {{ provider }} -max_iterations: {{ max_iterations }} -timeout: {{ timeout }} -datasets: -{# - 3d-object-detection-for-autonomous-vehicles#} -{# - AI4Code#} -{# - aerial-cactus-identification#} -{# - alaska2-image-steganalysis#} -{# - aptos2019-blindness-detection#} -{# - billion-word-imputation#} -{# - bms-molecular-translation#} -{# - cassava-leaf-disease-classification#} -{# - cdiscount-image-classification-challenge#} -{# - chaii-hindi-and-tamil-question-answering#} -{# - champs-scalar-coupling#} -{# - denoising-dirty-documents#} -{# - detecting-insults-in-social-commentary#} -{# - dog-breed-identification#} -{# - dogs-vs-cats-redux-kernels-edition#} -{# - facebook-recruiting-iii-keyword-extraction#} -{# - freesound-audio-tagging-2019#} -{# - google-quest-challenge#} -{# - google-research-identify-contrails-reduce-global-warming#} -{# - h-and-m-personalized-fashion-recommendations#} -{# - herbarium-2020-fgvc7#} -{# - herbarium-2021-fgvc8#} -{# - herbarium-2022-fgvc9#} -{# - histopathologic-cancer-detection#} -{# - hms-harmful-brain-activity-classification#} -{# - hotel-id-2021-fgvc8#} -{# - hubmap-kidney-segmentation#} -{# - icecube-neutrinos-in-deep-ice#} -{# - imet-2020-fgvc7#} -{# - inaturalist-2019-fgvc6#} -{# - invasive-species-monitoring#} -{# - iwildcam-2019-fgvc6#} -{# - iwildcam-2020-fgvc7#} -{# - jigsaw-toxic-comment-classification-challenge#} -{# - jigsaw-unintended-bias-in-toxicity-classification#} -{# - kuzushiji-recognition#} -{# - leaf-classification#} -{# - learning-agency-lab-automated-essay-scoring-2#} -{# - lmsys-chatbot-arena#} -{# - ml2021spring-hw2#} -{# - mlsp-2013-birds#} -{# - movie-review-sentiment-analysis-kernels-only#} -{# - multi-modal-gesture-recognition#} -{# - new-york-city-taxi-fare-prediction#} -{# - nfl-player-contact-detection#} -{# - nomad2018-predict-transparent-conductors#} -{# - osic-pulmonary-fibrosis-progression#} -{# - paddy-disease-classification#} -{# - petfinder-pawpularity-score#} -{# - plant-pathology-2020-fgvc7#} -{# - plant-pathology-2021-fgvc8#} -{# - plant-seedlings-classification#} -{# - playground-series-s3e18#} -{# - predict-volcanic-eruptions-ingv-oe#} -{# - random-acts-of-pizza#} -{# - ranzcr-clip-catheter-line-classification#} -{# - rsna-2022-cervical-spine-fracture-detection#} -{# - rsna-breast-cancer-detection#} -{# - rsna-miccai-brain-tumor-radiogenomic-classification#} -{# - seti-breakthrough-listen#} -{# - siim-covid19-detection#} -{# - siim-isic-melanoma-classification#} -{# - smartphone-decimeter-2022#} - - spaceship-titanic -{# - spooky-author-identification#} -{# - stanford-covid-vaccine#} -{# - statoil-iceberg-classifier-challenge#} -{# - tabular-playground-series-dec-2021#} - - tabular-playground-series-may-2022 -{# - tensorflow-speech-recognition-challenge#} -{# - tensorflow2-question-answering#} -{# - text-normalization-challenge-english-language#} -{# - text-normalization-challenge-russian-language#} -{# - tgs-salt-identification-challenge#} -{# - the-icml-2013-whale-challenge-right-whale-redux#} -{# - tweet-sentiment-extraction#} -{# - 
us-patent-phrase-to-phrase-matching#} -{# - uw-madison-gi-tract-image-segmentation#} -{# - ventilator-pressure-prediction#} -{# - vesuvius-challenge-ink-detection#} -{# - vinbigdata-chest-xray-abnormalities-detection#} -{# - whale-categorization-playground#} \ No newline at end of file diff --git a/tests/benchmark/mle_bench.py b/tests/benchmark/mle_bench.py deleted file mode 100644 index bfad12ef..00000000 --- a/tests/benchmark/mle_bench.py +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env python3 -""" -This script automates the setup, execution, and grading process for the "mle-bench" framework using plexe. - -Usage: - python mle_bench.py --config CONFIG_PATH --rebuild - -Description: - The script clones and sets up "mle-bench", prepares datasets, reads a configuration file - to determine the tests to run, executes models using plexe, and grades their performance. The - --rebuild flag forces the script to re-clone the "mle-bench" repository and reinstall dependencies. - -Ensure that your environment has the required permissions and Kaggle API credentials configured. -""" - -import argparse -import sys - -# Import the main runner class from the mlebench package -from mlebench.core.runner import MLEBenchRunner - - -def main(cli_args): - """Main entry point for the script""" - runner = MLEBenchRunner() - runner.setup(cli_args) - runner.run() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run and grade an agent on the MLE-bench framework.") - parser.add_argument( - "--config", type=str, required=False, default="mle-bench-config.yaml", help="Path to the configuration file." - ) - parser.add_argument( - "--rebuild", action="store_true", help="Force re-clone the MLE-bench repository and reinstall dependencies." - ) - - # Parse arguments and run main - args = parser.parse_args() - - try: - main(args) - except KeyboardInterrupt: - print("\n⚠️ Process interrupted by user.") - sys.exit(1) - except Exception as e: - print(f"❌ An unexpected error occurred: {e}") - import traceback - - traceback.print_exc() - sys.exit(1) diff --git a/tests/benchmark/mlebench/__init__.py b/tests/benchmark/mlebench/__init__.py deleted file mode 100644 index f25afee3..00000000 --- a/tests/benchmark/mlebench/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""MLE Bench runner package for Plexe.""" diff --git a/tests/benchmark/mlebench/core/__init__.py b/tests/benchmark/mlebench/core/__init__.py deleted file mode 100644 index a1849a56..00000000 --- a/tests/benchmark/mlebench/core/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Core modules for MLE Bench runner.""" diff --git a/tests/benchmark/mlebench/core/config.py b/tests/benchmark/mlebench/core/config.py deleted file mode 100644 index 2bad6f2b..00000000 --- a/tests/benchmark/mlebench/core/config.py +++ /dev/null @@ -1,98 +0,0 @@ -"""Configuration management for MLE Bench runner.""" - -import os -import sys -from pathlib import Path -import yaml -from jinja2 import Environment, Template, meta - - -class ConfigManager: - """Class to handle configuration loading and generation""" - - @staticmethod - def load_config(config_path): - """Load configuration from YAML file""" - print(f"🔍 Loading test configuration from {config_path}...") - if not os.path.exists(config_path): - print(f"❌ Config file not found at: {config_path}") - sys.exit(1) - try: - with open(config_path, "r") as config_file: - config = yaml.safe_load(config_file) - print("✅ Configuration loaded successfully.") - return config - except yaml.YAMLError as e: - print(f"❌ Error parsing config file: 
{e}") - sys.exit(1) - - @staticmethod - def ensure_config_exists(rebuild: bool = False): - """Check if `mle-bench-config.yaml` exists, and if not, generate it from `mle-bench-config.yaml.jinja`""" - config_path = Path("mle-bench-config.yaml") - if config_path.exists() and not rebuild: - print("✅ Configuration file 'mle-bench-config.yaml' already exists.") - return - - # Get the script directory for finding the template - script_dir = Path(__file__).parent.parent.parent.absolute() - template_path = script_dir / "mle-bench-config.yaml.jinja" - - if not template_path.exists(): - print(f"❌ Template file '{template_path}' not found. Cannot proceed.") - sys.exit(1) - - if rebuild: - print(f"🔄 Rebuilding 'mle-bench-config.yaml' from '{template_path}'...") - else: - print(f"📝 'mle-bench-config.yaml' not found. Generating it from '{template_path}'...") - - # Load the template - with open(template_path, "r") as template_file: - template_content = template_file.read() - - env = Environment() - ast = env.parse(template_content) - template = Template(template_content) - - print(f"📝 Template loaded from {template_path}") - - # Set default values and gather user inputs for template variables - variables = { - "repo_dir": str(Path.home() / "mle-bench"), - "provider": "openai/gpt-4o", - "max_iterations": "3", - "timeout": "3600", - } - - # Allow user to override defaults - for var in meta.find_undeclared_variables(ast): - if not var.startswith("_"): - if var in variables: - prompt = f"💡 Provide a value for '{var}' (default: {variables[var]}): " - else: - prompt = f"💡 Provide a value for '{var}': " - - try: - user_input = input(prompt) - if user_input.strip(): # Only update if user provided a non-empty value - variables[var] = user_input - except EOFError: - print(f"Using default value for '{var}': {variables.get(var, '')}") - - # Render and write the config file - config_content = template.render(**variables) - - # Parse the rendered config - config_yaml = yaml.safe_load(config_content) - - # Add plexe configurations to the config - config_yaml["provider"] = variables["provider"] - config_yaml["max_iterations"] = int(variables["max_iterations"]) - config_yaml["timeout"] = int(variables["timeout"]) - - # Write the updated config - with open(config_path, "w") as config_file: - yaml.dump(config_yaml, config_file, default_flow_style=False) - - print("✅ 'mle-bench-config.yaml' generated successfully.") diff --git a/tests/benchmark/mlebench/core/models.py b/tests/benchmark/mlebench/core/models.py deleted file mode 100644 index 98d8eeee..00000000 --- a/tests/benchmark/mlebench/core/models.py +++ /dev/null @@ -1,23 +0,0 @@ -"""Data models for MLE Bench runner.""" - -from dataclasses import dataclass -from typing import Optional - - -@dataclass -class TestResult: - """Structured class to store test results""" - - name: str - success: bool - submission_path: Optional[str] = None - model_path: Optional[str] = None - failure_reason: Optional[str] = None - - -@dataclass -class SubmissionInfo: - """Structured class to store submission information""" - - competition_id: str - submission_path: str diff --git a/tests/benchmark/mlebench/core/runner.py b/tests/benchmark/mlebench/core/runner.py deleted file mode 100644 index 8c1f6414..00000000 --- a/tests/benchmark/mlebench/core/runner.py +++ /dev/null @@ -1,64 +0,0 @@ -"""Main runner class for MLE Bench benchmark.""" - -import os -import sys -from pathlib import Path - -from mlebench.core.config import ConfigManager -from mlebench.core.validator import EnvironmentValidator 
-from mlebench.runners.setup import MLEBenchSetup -from mlebench.runners.test_runner import TestRunner -from mlebench.runners.grader import MLEBenchGrader - - -class MLEBenchRunner: - """Main class to run the MLE-bench benchmarking framework""" - - def __init__(self): - self.config = None - self.workdir = None - - def setup(self, cli_args): - """Set up the MLE-bench environment""" - print("🚀 Starting the MLE-bench Runner with Plexe...") - - # Get the absolute path for the config file - config_path = Path(cli_args.config).absolute() - print(f"📄 Using configuration file: {config_path}") - - # Create workdir if it doesn't exist - self.workdir = Path(os.getcwd()) / "workdir" - self.workdir.mkdir(exist_ok=True) - print(f"📁 Using working directory: {self.workdir}") - - # Check if LLM API key is set - if not EnvironmentValidator.check_llm_api_keys(): - print("❌ Required LLM API key environment variables not set. Please set them before running.") - print("❌ See the README.md file for instructions on configuring API keys.") - sys.exit(1) - - # Ensure that the configuration file exists, then load it - ConfigManager.ensure_config_exists(cli_args.rebuild) - self.config = ConfigManager.load_config(config_path) - - # Ensure Kaggle credentials are set up - EnvironmentValidator.ensure_kaggle_credentials() - - # Set up MLE-bench and prepare datasets - MLEBenchSetup.setup_mle_bench(self.config, cli_args.rebuild) - MLEBenchSetup.prepare_datasets(self.config) - - def run(self): - """Run the tests and evaluate the results""" - # Run tests - test_runner = TestRunner(self.config) - submissions = test_runner.run_tests() - - # Grade agent if there are submissions - if submissions: - grades_dir = MLEBenchGrader.grade_agent(submissions) - print(f"📊 Benchmark results saved to: {grades_dir}") - else: - print("❌ No submissions were generated. Cannot grade the agent.") - - print("✅ Script completed. Thank you for using the MLE-bench Runner!") diff --git a/tests/benchmark/mlebench/core/validator.py b/tests/benchmark/mlebench/core/validator.py deleted file mode 100644 index a442141a..00000000 --- a/tests/benchmark/mlebench/core/validator.py +++ /dev/null @@ -1,40 +0,0 @@ -"""Environment validation for MLE Bench runner.""" - -import os -import sys - - -class EnvironmentValidator: - """Class to validate environment setup""" - - @staticmethod - def ensure_kaggle_credentials(): - """Ensure that Kaggle API credentials are set up""" - print("🔑 Checking Kaggle API credentials...") - if not os.path.exists(os.path.expanduser("~/.kaggle/kaggle.json")): - print( - "❌ Kaggle API credentials not found. Please save 'kaggle.json' to '~/.kaggle/' following " - "the instructions at https://www.kaggle.com/docs/api." - ) - sys.exit(1) - print("✅ Kaggle API credentials found.") - - @staticmethod - def check_llm_api_keys(): - """Check if required LLM API key environment variables are set""" - # Check for common LLM provider API keys - api_keys = {"OpenAI": "OPENAI_API_KEY", "Anthropic": "ANTHROPIC_API_KEY", "Gemini": "GEMINI_API_KEY"} - - keys_found = False - for provider, env_var in api_keys.items(): - if os.environ.get(env_var): - print(f"✅ {provider} API key found ({env_var})") - keys_found = True - - if not keys_found: - print("❌ No LLM API keys found. 
Please set one of the following environment variables:") - for provider, env_var in api_keys.items(): - print(f"❌ - {env_var} (for {provider})") - return False - - return True diff --git a/tests/benchmark/mlebench/runners/__init__.py b/tests/benchmark/mlebench/runners/__init__.py deleted file mode 100644 index d7878e7d..00000000 --- a/tests/benchmark/mlebench/runners/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Test runner implementations for MLE Bench runner.""" diff --git a/tests/benchmark/mlebench/runners/grader.py b/tests/benchmark/mlebench/runners/grader.py deleted file mode 100644 index 489d195b..00000000 --- a/tests/benchmark/mlebench/runners/grader.py +++ /dev/null @@ -1,44 +0,0 @@ -"""Grader for MLE Bench benchmark results.""" - -import json -from pathlib import Path -from typing import List - -from mlebench.core.models import SubmissionInfo -from mlebench.utils.command import CommandRunner - - -class MLEBenchGrader: - """Class to handle grading of model submissions""" - - @staticmethod - def grade_agent(submissions: List[SubmissionInfo]): - """Grade the agent's performance based on the test results""" - print("📊 Grading the agent's performance...") - - # Get current working directory - original_cwd = Path.cwd() - - # Write the list of dicts to a JSONL file - submissions_file = original_cwd / "submissions.jsonl" - with open(submissions_file, "w") as f: - for submission in submissions: - f.write( - json.dumps( - {"competition_id": submission.competition_id, "submission_path": submission.submission_path} - ) - + "\n" - ) - - # Create grades directory if it doesn't exist - grades_dir = original_cwd / "grades" - grades_dir.mkdir(exist_ok=True) - - CommandRunner.run( - ["mlebench", "grade", "--submission", str(submissions_file), "--output-dir", str(grades_dir)], - "Failed to grade the agent.", - "Agent graded successfully.", - ) - print(f"🏆 Agent grading completed for {len(submissions)} tests.") - - return grades_dir diff --git a/tests/benchmark/mlebench/runners/setup.py b/tests/benchmark/mlebench/runners/setup.py deleted file mode 100644 index aaaa012c..00000000 --- a/tests/benchmark/mlebench/runners/setup.py +++ /dev/null @@ -1,103 +0,0 @@ -"""Setup and preparation for MLE Bench runner.""" - -import os -import sys -import shutil - -from mlebench.utils.command import CommandRunner, working_directory - - -class MLEBenchSetup: - """Class to handle setup of MLE-bench framework""" - - @staticmethod - def setup_mle_bench(config, rebuild: bool = False): - """Set up the MLE-bench framework""" - print("🔧 Setting up 'mle-bench' framework...") - - # First, ensure kaggle package is properly installed - print("📦 Checking kaggle package version...") - CommandRunner.run( - [sys.executable, "-m", "pip", "show", "kaggle"], - "Failed to check kaggle package version.", - "Kaggle package version checked successfully.", - ) - - repo_dir = config.get("repo_dir") - repo_url = config.get("repo_url") - - if os.path.exists(repo_dir) and not rebuild: - print(f"📂 '{repo_dir}' repository already exists. Skipping setup step.") - return - else: - if rebuild: - print("🔄 Rebuilding 'mle-bench' repository...") - if os.path.exists(repo_dir): - if os.access(repo_dir, os.W_OK): - print(f"Removing '{repo_dir}'...") - shutil.rmtree(repo_dir) - print(f"Removed '{repo_dir}' successfully.") - else: - print(f"⚠️ No write permission for '{repo_dir}'. 
Attempting to change permissions...") - os.chmod(repo_dir, 0o700) # Grant read, write, and execute permissions to the owner - if os.access(repo_dir, os.W_OK): - print(f"Permissions changed. Removing '{repo_dir}'...") - shutil.rmtree(repo_dir) - print(f"Removed '{repo_dir}' successfully.") - else: - print(f"❌ Failed to change permissions for '{repo_dir}'. Cannot remove the directory.") - sys.exit(1) - else: - print(f"Directory '{repo_dir}' not found. Skipping removal.") - print(f"🔍 Cloning '{repo_url}' into '{repo_dir}'...") - CommandRunner.run( - ["git", "clone", repo_url, repo_dir], - f"Failed to clone '{repo_url}'.", - f"'{repo_url}' cloned successfully into '{repo_dir}'.", - ) - - # Install MLE-bench using pip - with working_directory(repo_dir): - print("🔍 Skipping Git LFS setup for testing...") - CommandRunner.run( - ["git", "lfs", "install"], "Failed to install Git LFS.", "Git LFS installed successfully." - ) - CommandRunner.run( - ["git", "lfs", "fetch", "--all"], - "Failed to fetch large files with Git LFS.", - "Fetched all large files using Git LFS.", - ) - CommandRunner.run( - ["git", "lfs", "pull"], - "Failed to pull large files with Git LFS.", - "Pulled all large files using Git LFS.", - ) - - print("🔍 Installing 'mle-bench' and dependencies...") - CommandRunner.run( - [sys.executable, "-m", "pip", "install", "-e", "."], - "Failed to install 'mle-bench'.", - "'mle-bench' installed successfully.", - ) - - @staticmethod - def prepare_datasets(config): - """Prepare datasets listed in the config file""" - print("📦 Preparing datasets for 'mle-bench'...") - - repo_dir = config.get("repo_dir") - datasets = config.get("datasets", []) - print(f"📂 Datasets to prepare: {datasets}") - - if not datasets: - print("⚠️ No datasets listed in 'mle-bench-config.yaml'. 
Skipping dataset preparation.") - return - - with working_directory(repo_dir): - for dataset in datasets: - print(f"📂 Preparing dataset: {dataset}") - CommandRunner.run( - ["mlebench", "prepare", "-c", dataset, "--skip-verification"], - f"Failed to prepare dataset: {dataset}", - f"Dataset '{dataset}' prepared successfully.", - ) diff --git a/tests/benchmark/mlebench/runners/test_runner.py b/tests/benchmark/mlebench/runners/test_runner.py deleted file mode 100644 index 5bd38e9a..00000000 --- a/tests/benchmark/mlebench/runners/test_runner.py +++ /dev/null @@ -1,245 +0,0 @@ -"""Test runner for MLE Bench benchmarks.""" - -import os -import time -import warnings -from pathlib import Path -from typing import List - -import pandas as pd -import platformdirs -import plexe -from tqdm import tqdm - -from mlebench.core.models import TestResult, SubmissionInfo -from mlebench.utils.error import ErrorHandler - -# Sklearn often throws warnings when used in plexe -warnings.filterwarnings("ignore", category=UserWarning, module="sklearn") - - -class TestRunner: - """Class to run tests using plexe""" - - def __init__(self, config): - self.config = config - self.provider = config.get("provider", "openai/gpt-4o") - self.max_iterations = config.get("max_iterations", 3) - self.timeout = config.get("timeout", 3600) # Default 1 hour timeout - self.workdir = Path(os.getcwd()) / "workdir" - self.workdir.mkdir(exist_ok=True) - self.mle_bench_data_dir = Path(platformdirs.user_cache_dir()) / "mle-bench" / "data" - - print(f"🔧 Using provider: {self.provider}, max_iterations: {self.max_iterations}, timeout: {self.timeout}s") - - def verify_test_files(self, test_name) -> bool: - """Verify that all required files for a test exist""" - data_dir = self.mle_bench_data_dir / test_name / "prepared" / "public" - required_files = [ - data_dir / "train.csv", - data_dir / "test.csv", - data_dir / "description.md", - data_dir / "sample_submission.csv", - ] - - for file_path in required_files: - if not file_path.exists(): - print(f"❌ Required file not found: {file_path}") - return False - - return True - - def prepare_test(self, test_name): - """Prepare test data and create output directory""" - data_dir = self.mle_bench_data_dir / test_name / "prepared" / "public" - - # Read task description - with open(data_dir / "description.md", "r") as f: - task_description = f.read() - - # Create output directory for this test - output_dir = self.workdir / test_name - output_dir.mkdir(exist_ok=True) - - # Load datasets - print(f"📊 Loading datasets for {test_name}...") - train_data = pd.read_csv(data_dir / "train.csv") - test_data = pd.read_csv(data_dir / "test.csv") - sample_submission = pd.read_csv(data_dir / "sample_submission.csv") - - test_data_info = { - "train_data": train_data, - "test_data": test_data, - "sample_submission": sample_submission, - "task_description": task_description, - "output_dir": output_dir, - } - - return test_data_info - - def build_model(self, test_name, test_data_info): - """Build a model using plexe""" - print(f"🤖 Creating model for {test_name}...") - model = plexe.Model( - intent=test_data_info["task_description"], - ) - - # Build the model - print(f"🏗️ Building model for {test_name}...") - start_time = time.time() - try: - model.build( - datasets=[test_data_info["train_data"]], - provider=self.provider, - max_iterations=self.max_iterations, - timeout=self.timeout, - ) - build_time = time.time() - start_time - print(f"✅ Model built successfully in {build_time:.2f} seconds") - return model - except Exception as 
e: - ErrorHandler.handle_error("model building", test_name, e) - return None - - def validate_predictions(self, predictions, expected_columns): - """Validate prediction data has required columns and format""" - missing_cols = [col for col in expected_columns if col not in predictions.columns] - if missing_cols: - raise ValueError(f"Predictions missing required columns: {missing_cols}") - return True - - def generate_predictions(self, model, test_name, test_data_info): - """Generate predictions using the model""" - print(f"🔮 Generating predictions for {test_name}...") - - test_data = test_data_info["test_data"] - sample_submission = test_data_info["sample_submission"] - output_dir = test_data_info["output_dir"] - - # Determine columns that need to be in submission file from the sample submission - submission_columns = list(sample_submission.columns) - - print(f"🎯 Target columns: {submission_columns}") - - # Create submission file path - submission_path = output_dir / "submission.csv" - - try: - # Process each row in test data - prediction_results = [] - - print(f"📊 Processing {len(test_data)} test records...") - - # Use tqdm for progress tracking - for idx, (_, row) in enumerate( - tqdm(test_data.iterrows(), total=len(test_data), desc=f"Generating predictions for {test_name}") - ): - try: - # Convert row to dictionary - row_dict = row.to_dict() - - # Make prediction for this row - row_prediction = model.predict(row_dict) - - # Concatenate row_dict with row_prediction, then keep only submissions columns - row_prediction = {**row_dict, **row_prediction} - row_prediction = {k: v for k, v in row_prediction.items() if k in submission_columns} - - prediction_results.append(row_prediction) - - except Exception as e: - print(f"⚠️ Error predicting row {idx}: {e}") - # Add empty prediction to maintain row count - empty_prediction = {col: None for col in submission_columns} - prediction_results.append(empty_prediction) - - # Create a DataFrame from all the prediction results - all_predictions_df = pd.DataFrame(prediction_results) - - # Validate predictions have required columns - self.validate_predictions(all_predictions_df, sample_submission.columns) - - # Save the prediction results - all_predictions_df.to_csv(submission_path, index=False) - print(f"✅ Predictions generated and submission file created at {submission_path}") - return submission_path - - except Exception as e: - ErrorHandler.handle_error("prediction generation", test_name, e) - return None - - def save_model(self, model, test_name, output_dir): - """Save model for future reference""" - model_save_path = output_dir / f"{test_name}_model.tar.gz" - try: - plexe.save_model(model, str(model_save_path)) - print(f"✅ Model saved to {model_save_path}") - return model_save_path - except Exception as e: - print(f"⚠️ Failed to save model (non-critical): {e}") - return None - - def run_tests(self) -> List[SubmissionInfo]: - """Run tests from the configuration file using plexe""" - print("🏁 Starting test execution with plexe...") - test_results = [] - submissions = [] - - for test_name in self.config.get("datasets", []): - try: - print(f"🔍 Running test: {test_name}") - - # Check if required files exist - if not self.verify_test_files(test_name): - test_results.append(TestResult(name=test_name, success=False, failure_reason="missing files")) - continue - - # Prepare test data - test_data_info = self.prepare_test(test_name) - - # Build model - model = self.build_model(test_name, test_data_info) - - if model: - # Generate predictions - submission_path 
= self.generate_predictions(model, test_name, test_data_info) - - if submission_path: - # Save model - model_path = self.save_model(model, test_name, test_data_info["output_dir"]) - - # Record successful test - test_results.append( - TestResult( - name=test_name, - success=True, - submission_path=str(submission_path), - model_path=str(model_path) if model_path else None, - ) - ) - - # Add to submissions for grading - submissions.append( - SubmissionInfo(competition_id=test_name, submission_path=str(submission_path)) - ) - else: - test_results.append( - TestResult(name=test_name, success=False, failure_reason="prediction failed") - ) - else: - test_results.append(TestResult(name=test_name, success=False, failure_reason="model build failed")) - - except Exception as e: - ErrorHandler.handle_error("test execution", test_name, e) - test_results.append( - TestResult(name=test_name, success=False, failure_reason=f"general error: {str(e)}") - ) - - # Report failed tests - failed_tests = [test for test in test_results if not test.success] - if failed_tests: - print(f"⚠️ {len(failed_tests)} tests failed:") - for test in failed_tests: - print(f"⚠️ - {test.name}: {test.failure_reason}") - - return submissions diff --git a/tests/benchmark/mlebench/utils/__init__.py b/tests/benchmark/mlebench/utils/__init__.py deleted file mode 100644 index 63d7fda7..00000000 --- a/tests/benchmark/mlebench/utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Utility functions and classes for MLE Bench runner.""" diff --git a/tests/benchmark/mlebench/utils/command.py b/tests/benchmark/mlebench/utils/command.py deleted file mode 100644 index eff803fc..00000000 --- a/tests/benchmark/mlebench/utils/command.py +++ /dev/null @@ -1,42 +0,0 @@ -"""Command execution utilities for MLE Bench runner.""" - -import subprocess -import sys -from contextlib import contextmanager -import os -from pathlib import Path - - -class CommandRunner: - """Class to handle command execution and error handling""" - - @staticmethod - def run(command, error_message, success_message=None): - """Run a shell command and handle errors""" - try: - subprocess.run(command, check=True, text=True) - if success_message: - print(f"✅ {success_message}") - except subprocess.CalledProcessError as e: - print(f"❌ {error_message}") - print(f"❌ Error details: {e}") - sys.exit(1) - except FileNotFoundError as e: - print(f"❌ {str(e)}") - print(f"❌ Command not found: {' '.join(command)}") - print( - "❌ This usually means that the required tool is not installed or not in the PATH. " - "Please install the required dependencies and try again." 
- ) - sys.exit(1) - - -@contextmanager -def working_directory(path): - """Context manager for changing the current working directory""" - prev_cwd = Path.cwd() - os.chdir(path) - try: - yield - finally: - os.chdir(prev_cwd) diff --git a/tests/benchmark/mlebench/utils/error.py b/tests/benchmark/mlebench/utils/error.py deleted file mode 100644 index e9037fc7..00000000 --- a/tests/benchmark/mlebench/utils/error.py +++ /dev/null @@ -1,18 +0,0 @@ -"""Error handling utilities for MLE Bench runner.""" - -import sys -import traceback - - -class ErrorHandler: - """Class to handle and format errors""" - - @staticmethod - def handle_error(operation, context, error, exit_on_failure=False): - """Handle exceptions with consistent formatting""" - print(f"❌ Error during {operation}: {error}") - if context: - print(f"❌ Context: {context}") - print(traceback.format_exc()) - if exit_on_failure: - sys.exit(1) diff --git a/tests/fixtures/legacy_models/model_v0_18_3.tar.gz b/tests/fixtures/legacy_models/model_v0_18_3.tar.gz deleted file mode 100644 index 7270c997..00000000 Binary files a/tests/fixtures/legacy_models/model_v0_18_3.tar.gz and /dev/null differ diff --git a/tests/fixtures/legacy_models/model_v0_23_2.tar.gz b/tests/fixtures/legacy_models/model_v0_23_2.tar.gz deleted file mode 100644 index e12dfa7c..00000000 Binary files a/tests/fixtures/legacy_models/model_v0_23_2.tar.gz and /dev/null differ diff --git a/tests/integration/README.md b/tests/integration/README.md deleted file mode 100644 index 9ad06245..00000000 --- a/tests/integration/README.md +++ /dev/null @@ -1,65 +0,0 @@ -# Plexe Integration Tests - - -## 1. Overview -This directory contains integration tests validating the end-to-end functionality of Plexe across various ML tasks. - - -## 2. Test Suite -### 2.1 Classification -- **Binary Classification** (`test_binary_classification.py`): Heart disease prediction; tests model lifecycle. -- **Multiclass Classification** (`test_multiclass_classification.py`): Sentiment analysis; tests synthetic data generation. - -### 2.2 Regression & Forecasting -- **Regression** (`test_regression.py`): House price prediction; tests schema specification and inference. -- **Time Series Forecasting** (`test_time_series.py`): Sales prediction; tests multi-feature temporal forecasting. - -### 2.3 Other ML Tasks -- **Recommendation** (`test_recommendation.py`): Product recommendations; tests list-based output and cross-selling. -- **Customer Churn** (`test_customer_churn.py`): Churn prediction; tests probability outputs and schema validation. -- **Schema Validation** (`test_schema_validation.py`): Validates complex schemas using Pydantic. - - -## 3. Execution -### 3.1 Prerequisites -- Plexe installed with development dependencies. -- `OPENAI_API_KEY` set as an environment variable. - -### 3.2 Running Tests - -#### 3.2.1 Single Test -```bash -poetry run pytest tests/integration/test_.py -v -``` -Example: -```bash -poetry run pytest tests/integration/test_binary_classification.py -v -``` - -#### 3.2.2 All Tests -```bash -poetry run pytest tests/integration/ -v -``` - -#### 3.2.3 Filtering Tests -```bash -poetry run pytest -k -v -``` -Example: -```bash -poetry run pytest tests/integration/test_binary_classification.py::test_heart_disease_classification -v -``` - -### 3.3 Optimization Tips -- Run tests individually to reduce execution time. -- Set `max_iterations` to 2-3 and timeouts to ~10 minutes. -- Use small synthetic datasets (~30-60 samples). - - -## 4. 
Adding New Tests -- Scope tests to a single model type or feature. -- Ensure runtime-generated synthetic data. -- Validate model training, inference, saving/loading. -- Confirm schema compliance and expected outputs. -- Use `openai/gpt-4o` for all tests. - diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/integration/test_binary_classification.py b/tests/integration/test_binary_classification.py deleted file mode 100644 index a0cfd216..00000000 --- a/tests/integration/test_binary_classification.py +++ /dev/null @@ -1,128 +0,0 @@ -"""Integration test for binary classification models using plexe. - -This test covers: -1. Creating a binary classification model for heart disease prediction -2. Building the model with synthetic data -3. Making predictions with the model -4. Saving and loading the model -""" - -import os -import pytest -from pathlib import Path -from pydantic import create_model -import plexe -from tests.utils.utils import generate_heart_data, verify_prediction, cleanup_files, verify_model_description - - -@pytest.fixture -def heart_data(): - """Generate synthetic heart disease data for testing.""" - return generate_heart_data(n_samples=30) - - -@pytest.fixture -def heart_input_schema(): - """Define the input schema for heart disease prediction.""" - return create_model( - "HeartDiseaseInput", - **{ - "age": int, - "gender": int, - "cp": int, - "trtbps": int, - "chol": int, - "fbs": int, - "restecg": int, - "thalachh": int, - "exng": int, - "oldpeak": float, - "slp": int, - "caa": int, - "thall": int, - }, - ) - - -@pytest.fixture -def heart_output_schema(): - """Define the output schema for heart disease prediction.""" - return create_model("HeartDiseaseOutput", **{"output": int}) - - -@pytest.fixture -def model_dir(tmpdir): - """Create and manage a temporary directory for model files.""" - model_path = Path(tmpdir) / "models" - model_path.mkdir(exist_ok=True) - return model_path - - -@pytest.fixture(autouse=True) -def run_around_tests(model_dir): - """Set up and tear down for each test.""" - cleanup_files(model_dir) - os.environ["MODEL_PATH"] = str(model_dir) - yield - # Teardown - cleanup_files(model_dir) - - -def test_heart_disease_classification(heart_data, heart_input_schema, heart_output_schema): - """Test binary classification for heart disease prediction.""" - # Create a model for heart disease prediction - model = plexe.Model( - intent="Predict whether a patient is likely to have heart disease based on their medical features", - input_schema=heart_input_schema, - output_schema=heart_output_schema, - ) - - # Build the model with minimal data and iterations for faster testing - model.build( - datasets=[heart_data], - provider="openai/gpt-4o", - max_iterations=3, # Minimum iterations for reliable model generation - timeout=300, # 5 minute timeout - run_timeout=150, - ) - - # Test a sample prediction - test_input = { - "age": 61, - "gender": 1, - "cp": 3, - "trtbps": 145, - "chol": 233, - "fbs": 1, - "restecg": 0, - "thalachh": 150, - "exng": 0, - "oldpeak": 2.3, - "slp": 0, - "caa": 0, - "thall": 1, - } - # Try prediction with a dictionary directly - prediction = model.predict(test_input) - - # Verify the prediction - verify_prediction(prediction, heart_output_schema) - assert prediction["output"] in [0, 1], "Binary classification output should be 0 or 1" - - # Verify model description - description = model.describe() - verify_model_description(description) - - # Test model 
saving - model_path = plexe.save_model(model, "heart_disease_model.tar.gz") - assert Path(model_path).exists(), f"Model file {model_path} not created" - - # Test model loading - loaded_model = plexe.load_model(model_path) - # Use dictionary for prediction with the loaded model - loaded_prediction = loaded_model.predict(test_input) - - # Verify the loaded model's prediction - verify_prediction(loaded_prediction, heart_output_schema) - assert loaded_prediction["output"] in [0, 1], "Binary classification output should be 0 or 1" - assert loaded_prediction == prediction, "Loaded model prediction should match original model prediction" diff --git a/tests/integration/test_customer_churn.py b/tests/integration/test_customer_churn.py deleted file mode 100644 index 39d03e5e..00000000 --- a/tests/integration/test_customer_churn.py +++ /dev/null @@ -1,155 +0,0 @@ -"""Integration test for customer churn prediction models using plexe. - -This test covers: -1. Creating a model for predicting customer churn -2. Building the model with synthetic data -3. Making predictions with the model -4. Validating predictions with probability output -5. Using schema inference -""" - -import os -from pathlib import Path - -import pytest -from pydantic import create_model, Field - -import plexe -from tests.utils.utils import generate_customer_churn_data, verify_prediction, cleanup_files, verify_model_description - - -@pytest.fixture -def churn_data(): - """Generate synthetic customer churn data for testing.""" - return generate_customer_churn_data(n_samples=30) - - -@pytest.fixture -def churn_input_schema(): - """Define the input schema for churn prediction with field validations.""" - return create_model( - "ChurnInput", - **{ - "tenure": (int, Field(ge=0, description="Number of months the customer has been with the company")), - "monthly_charges": (float, Field(ge=0, description="Monthly charges in dollars")), - "total_charges": (float, Field(ge=0, description="Total charges in dollars")), - "contract_type": (int, Field(ge=0, le=2, description="0=month-to-month, 1=one year, 2=two year")), - "payment_method": ( - int, - Field(ge=0, le=3, description="0=electronic check, 1=mailed check, 2=bank transfer, 3=credit card"), - ), - "tech_support": (int, Field(ge=0, le=1, description="0=no, 1=yes")), - "online_backup": (int, Field(ge=0, le=1, description="0=no, 1=yes")), - "online_security": (int, Field(ge=0, le=1, description="0=no, 1=yes")), - }, - ) - - -@pytest.fixture -def churn_output_schema(): - """Define the output schema for churn prediction with probability.""" - return create_model( - "ChurnOutput", - **{ - "churn_probability": (float, Field(ge=0, le=1, description="Probability of customer churning (0-1)")), - "churn": (int, Field(ge=0, le=1, description="Binary churn prediction (0=no, 1=yes)")), - }, - ) - - -@pytest.fixture -def model_dir(tmpdir): - """Create and manage a temporary directory for model files.""" - model_path = Path(tmpdir) / "models" - model_path.mkdir(exist_ok=True) - return model_path - - -@pytest.fixture(autouse=True) -def run_around_tests(model_dir): - """Set up and tear down for each test.""" - cleanup_files(model_dir) - os.environ["MODEL_PATH"] = str(model_dir) - yield - # Teardown - cleanup_files(model_dir) - - -def test_customer_churn_prediction(churn_data, churn_input_schema, churn_output_schema): - """Test customer churn prediction with probability output.""" - # Create a model for churn prediction - model = plexe.Model( - intent=""" - Predict the probability that a customer will churn 
(leave the company) based on their - service usage and contract details. Return both the probability of churning (0-1) and - a binary prediction (0=will not churn, 1=will churn) using a threshold of 0.5. - """, - input_schema=churn_input_schema, - output_schema=churn_output_schema, - ) - - # Build the model with minimal iterations for faster testing - model.build( - datasets=[churn_data], - provider="openai/gpt-4o", - max_iterations=4, # Minimum iterations for reliable model generation - timeout=300, # 5 minute timeout - run_timeout=150, - ) - - # Test sample predictions - test_inputs = [ - # High risk customer (month-to-month contract, high monthly charge, low tenure) - { - "tenure": 3, - "monthly_charges": 120.00, - "total_charges": 360.00, - "contract_type": 0, - "payment_method": 0, - "tech_support": 0, - "online_backup": 0, - "online_security": 0, - }, - # Low risk customer (two-year contract, moderate monthly charge, high tenure) - { - "tenure": 60, - "monthly_charges": 90.00, - "total_charges": 5400.00, - "contract_type": 2, - "payment_method": 3, - "tech_support": 1, - "online_backup": 1, - "online_security": 1, - }, - ] - - for test_input in test_inputs: - prediction = model.predict(test_input) - - # Verify the prediction structure - verify_prediction(prediction, churn_output_schema) - - # Verify probability constraints - assert 0 <= prediction["churn_probability"] <= 1, "Churn probability should be between 0 and 1" - - # Verify binary prediction - assert prediction["churn"] in [0, 1], "Binary churn prediction should be 0 or 1" - - # Verify consistency between probability and binary prediction - if prediction["churn_probability"] > 0.5: - assert prediction["churn"] == 1, "Binary prediction should be 1 if probability > 0.5" - else: - assert prediction["churn"] == 0, "Binary prediction should be 0 if probability <= 0.5" - - # Verify model description - description = model.describe() - verify_model_description(description) - - # Test model saving and loading - model_path = plexe.save_model(model, "churn_model.tar.gz") - loaded_model = plexe.load_model(model_path) - - # Verify loaded model predictions - for test_input in test_inputs: - loaded_prediction = loaded_model.predict(test_input) - verify_prediction(loaded_prediction, churn_output_schema) diff --git a/tests/integration/test_model_description.py b/tests/integration/test_model_description.py deleted file mode 100644 index 42816f19..00000000 --- a/tests/integration/test_model_description.py +++ /dev/null @@ -1,215 +0,0 @@ -"""Integration test for model description functionality in plexe. - -This test covers: -1. Creating a simple model for iris flower classification -2. Building the model with synthetic data -3. Generating model descriptions in different formats (dict, json, text, markdown) -4. 
Verifying the content of the model descriptions -""" - -import json -import os -import pytest -from pathlib import Path -from pydantic import create_model - -import pandas as pd -import numpy as np -import plexe -from tests.utils.utils import cleanup_files - - -@pytest.fixture -def iris_data(): - """Generate synthetic iris data for testing.""" - # Create a simple dataset similar to the iris dataset - np.random.seed(42) - n_samples = 30 - - data = { - "sepal_length": np.random.uniform(4.5, 7.5, n_samples), - "sepal_width": np.random.uniform(2.0, 4.5, n_samples), - "petal_length": np.random.uniform(1.0, 6.5, n_samples), - "petal_width": np.random.uniform(0.1, 2.5, n_samples), - "species": ["setosa"] * n_samples, - } - - # Generate target based on petal length and width - # Simplistic rule: if petal_length > 3.0, it's likely virginica or versicolor - - for i in range(n_samples): - if data["petal_length"][i] < 2.0: - data["species"][i] = "setosa" - elif data["petal_length"][i] < 5.0: - data["species"][i] = "versicolor" - else: - data["species"][i] = "virginica" - - return pd.DataFrame(data) - - -@pytest.fixture -def iris_input_schema(): - """Define the input schema for iris classification.""" - return create_model( - "IrisInput", - **{ - "sepal_length": float, - "sepal_width": float, - "petal_length": float, - "petal_width": float, - }, - ) - - -@pytest.fixture -def iris_output_schema(): - """Define the output schema for iris classification.""" - return create_model("IrisOutput", **{"species": str}) - - -@pytest.fixture -def model_dir(tmpdir): - """Create and manage a temporary directory for model files.""" - model_path = Path(tmpdir) / "models" - model_path.mkdir(exist_ok=True) - return model_path - - -@pytest.fixture(autouse=True) -def run_around_tests(model_dir): - """Set up and tear down for each test.""" - cleanup_files(model_dir) - os.environ["MODEL_PATH"] = str(model_dir) - yield - # Teardown - cleanup_files(model_dir) - - -def verify_description_format(description, format_type): - """Verify that a description has the expected format and content.""" - if format_type == "dict": - assert isinstance(description, dict) - assert "id" in description - assert "intent" in description - assert "schemas" in description - assert "implementation" in description - assert "performance" in description - assert "code" in description - elif format_type == "json": - assert isinstance(description, str) - # Try to parse the JSON string - try: - json_dict = json.loads(description) - assert isinstance(json_dict, dict) - assert "id" in json_dict - assert "intent" in json_dict - except json.JSONDecodeError: - pytest.fail("Description is not valid JSON") - elif format_type == "text": - assert isinstance(description, str) - assert "Model:" in description - assert "Intent:" in description - assert "Input Schema:" in description - assert "Output Schema:" in description - elif format_type == "markdown": - assert isinstance(description, str) - assert "# Model:" in description - assert "**Intent:**" in description - assert "## Input Schema" in description - assert "## Output Schema" in description - - -def test_model_description(iris_data, iris_input_schema, iris_output_schema): - """Test model description generation in various formats and content verification.""" - - # Create a model for iris species classification - model = plexe.Model( - intent="Classify iris flowers into species based on their sepal and petal measurements", - input_schema=iris_input_schema, - output_schema=iris_output_schema, - ) - - # Build the 
model with minimal iterations for faster testing - model.build( - datasets=[iris_data], - provider="openai/gpt-4o", - max_iterations=2, # Minimum iterations for quick testing - timeout=180, # 3 minute timeout - run_timeout=150, - ) - - # Test that the model is in the ready state - assert model.state.value == "ready", "Model should be in ready state after building" - - # PART 1: Test description object and its format methods - - # Get the model description object - desc = model.describe() - - # Test the object has the expected attributes and methods - assert hasattr(desc, "id") - assert hasattr(desc, "intent") - assert hasattr(desc, "schemas") - assert hasattr(desc, "to_dict") - assert hasattr(desc, "to_json") - assert hasattr(desc, "as_text") - assert hasattr(desc, "as_markdown") - - # Test dictionary format - dict_desc = desc.to_dict() - verify_description_format(dict_desc, "dict") - - # Test JSON format - json_desc = desc.to_json() - verify_description_format(json_desc, "json") - - # Test text format - text_desc = desc.as_text() - verify_description_format(text_desc, "text") - - # Test markdown format - md_desc = desc.as_markdown() - verify_description_format(md_desc, "markdown") - - # Ensure output is always visible, even when tests pass - print("\n\n=== MODEL DESCRIPTION IN JSON FORMAT ===\n") - print(json_desc) - - print("\n\n=== MODEL DESCRIPTION IN TEXT FORMAT ===\n") - print(text_desc) - - print("\n\n=== MODEL DESCRIPTION IN MARKDOWN FORMAT ===\n") - print(md_desc) - - # PART 2: Verify description content details - - # Verify basic content - assert "id" in dict_desc - assert dict_desc["intent"] == "Classify iris flowers into species based on their sepal and petal measurements" - - # Verify schema information - assert "schemas" in dict_desc - assert "input" in dict_desc["schemas"] - assert "output" in dict_desc["schemas"] - - # Verify input schema has the expected fields - input_schema = dict_desc["schemas"]["input"] - for field in ["sepal_length", "sepal_width", "petal_length", "petal_width"]: - assert field in input_schema - - # Verify output schema has species field - output_schema = dict_desc["schemas"]["output"] - assert any(field.lower() == "species" for field in output_schema.keys()) - - # Verify implementation info - assert "implementation" in dict_desc - assert "artifacts" in dict_desc["implementation"] - - # Verify code sections - assert "code" in dict_desc - assert "prediction" in dict_desc["code"] - - # Verify performance metrics - assert "performance" in dict_desc - assert "metrics" in dict_desc["performance"] diff --git a/tests/integration/test_multiclass_classification.py b/tests/integration/test_multiclass_classification.py deleted file mode 100644 index 3f458f59..00000000 --- a/tests/integration/test_multiclass_classification.py +++ /dev/null @@ -1,92 +0,0 @@ -"""Integration test for multiclass classification models using plexe. - -This test covers: -1. Creating a multiclass classification model for sentiment analysis -2. Building the model with synthetic data -3. Making predictions with the model -4. 
Testing dataset generation capabilities -""" - -import os -import pytest -from pathlib import Path -from pydantic import create_model -import plexe -from tests.utils.utils import verify_prediction, cleanup_files, verify_model_description, generate_sentiment_data - - -@pytest.fixture -def sentiment_data(): - """Generate synthetic sentiment data for testing.""" - return generate_sentiment_data(n_samples=30) - - -@pytest.fixture -def sentiment_input_schema(): - """Define the input schema for sentiment analysis.""" - return create_model("SentimentInput", **{"text": str}) - - -@pytest.fixture -def sentiment_output_schema(): - """Define the output schema for sentiment analysis.""" - return create_model("SentimentOutput", **{"sentiment": str}) - - -@pytest.fixture -def model_dir(tmpdir): - """Create and manage a temporary directory for model files.""" - model_path = Path(tmpdir) / "models" - model_path.mkdir(exist_ok=True) - return model_path - - -@pytest.fixture(autouse=True) -def run_around_tests(model_dir): - """Set up and tear down for each test.""" - cleanup_files(model_dir) - os.environ["MODEL_PATH"] = str(model_dir) - yield - # Teardown - cleanup_files(model_dir) - - -def test_multiclass_classification(sentiment_data, sentiment_input_schema, sentiment_output_schema): - """Test multiclass classification for sentiment analysis.""" - # Create a model for sentiment analysis - model = plexe.Model( - intent="Classify text sentiment into positive, negative, or neutral categories", - input_schema=sentiment_input_schema, - output_schema=sentiment_output_schema, - ) - - # Build the model with minimal data and iterations for faster testing - model.build( - datasets=[sentiment_data], - provider="openai/gpt-4o", - max_iterations=3, # Minimum iterations for reliable model generation - timeout=300, # 5 minute timeout - run_timeout=150, - ) - - # Test sample predictions - test_inputs = [ - {"text": "This product exceeded my expectations! The quality is amazing."}, - {"text": "Very disappointed with this purchase. Would not recommend."}, - {"text": "The product is okay, nothing special but works as expected."}, - ] - - for test_input in test_inputs: - prediction = model.predict(test_input) - - # Verify the prediction - verify_prediction(prediction, sentiment_output_schema) - assert prediction["sentiment"] in [ - "positive", - "negative", - "neutral", - ], f"Prediction should be one of ['positive', 'negative', 'neutral'], got {prediction['sentiment']}" - - # Verify model description - description = model.describe() - verify_model_description(description) diff --git a/tests/integration/test_ray_integration.py b/tests/integration/test_ray_integration.py deleted file mode 100644 index 1b20942e..00000000 --- a/tests/integration/test_ray_integration.py +++ /dev/null @@ -1,77 +0,0 @@ -""" -Integration test for Ray-based distributed training. 
-""" - -import pytest -import pandas as pd -import numpy as np -from plexe.models import Model - - -@pytest.fixture -def sample_dataset(): - """Create a simple synthetic dataset for testing.""" - # Create a sample regression dataset - np.random.seed(42) - n_samples = 1000 - X = np.random.randn(n_samples, 5) - y = 2 + 3 * X[:, 0] + 0.5 * X[:, 1] - X[:, 2] + np.random.randn(n_samples) * 0.1 - - # Create a DataFrame with feature and target columns - df = pd.DataFrame(data=np.column_stack([X, y]), columns=[f"feature_{i}" for i in range(5)] + ["target"]) - return df - - -def test_model_with_ray(sample_dataset): - """Test building a model with Ray-based distributed execution.""" - # Skip this test if no API key is available - import os - - if not os.environ.get("OPENAI_API_KEY"): - pytest.skip("OpenAI API key not available") - - # Ray is already initialized in the RayExecutor when needed - - # Create a model with distributed=True - model = Model(intent="Predict the target variable given 5 numerical features", distributed=True) - - # Set a short timeout for testing - model.build( - datasets=[sample_dataset], - provider="openai/gpt-4o-mini", - timeout=300, # 5 minutes max - run_timeout=60, # 1 minute per run - ) - - # Test a prediction - input_data = {f"feature_{i}": 0.5 for i in range(5)} - prediction = model.predict(input_data) - - # Verify that prediction has expected structure - assert prediction is not None - assert "target" in prediction - - # Verify that Ray was used in training - assert model.distributed - - # Verify model built successfully - assert model.metric is not None - - # Get executor classes - from plexe.tools.execution import _get_executor_class - from plexe.internal.models.execution.ray_executor import RayExecutor - - # Verify model has the distributed flag set - assert model.distributed, "Model should have distributed=True" - - # Verify the factory would select RayExecutor when distributed=True - executor_class = _get_executor_class(distributed=True) - assert executor_class == RayExecutor, "Factory should return RayExecutor when distributed=True" - - # The logs show Ray is being used, but the flag might not be set when checked - # Let's just print the status for diagnostics but not fail the test on it - print(f"Ray executor was used: {RayExecutor._ray_was_used}") - - # Instead, verify our factory returns the right executor when asked - # The logs confirm Ray is actually used - assert _get_executor_class(distributed=True) == RayExecutor diff --git a/tests/integration/test_recommendation.py b/tests/integration/test_recommendation.py deleted file mode 100644 index 6400695f..00000000 --- a/tests/integration/test_recommendation.py +++ /dev/null @@ -1,115 +0,0 @@ -"""Integration test for recommendation models using plexe. - -This test covers: -1. Creating a recommendation model for product cross-selling -2. Building the model with synthetic data -3. Making predictions with the model -4. 
Validating the returned recommendations -""" - -import os -import pytest -from pathlib import Path -from pydantic import create_model -import plexe -from tests.utils.utils import ( - generate_product_recommendation_data, - verify_prediction, - cleanup_files, - verify_model_description, -) - - -@pytest.fixture -def product_data(): - """Generate synthetic product recommendation data for testing.""" - return generate_product_recommendation_data(n_samples=60) # Gives ~20 orders with ~3 items each - - -@pytest.fixture -def recommendation_input_schema(): - """Define the input schema for product recommendation.""" - return create_model("ProductInput", **{"style": str}) - - -@pytest.fixture -def recommendation_output_schema(): - """Define the output schema for product recommendation.""" - return create_model("ProductOutput", **{"recommended_styles": list}) - - -@pytest.fixture -def model_dir(tmpdir): - """Create and manage a temporary directory for model files.""" - model_path = Path(tmpdir) / "models" - model_path.mkdir(exist_ok=True) - return model_path - - -@pytest.fixture(autouse=True) -def run_around_tests(model_dir): - """Set up and tear down for each test.""" - cleanup_files(model_dir) - os.environ["MODEL_PATH"] = str(model_dir) - yield - # Teardown - cleanup_files(model_dir) - - -def test_product_recommendation(product_data, recommendation_input_schema, recommendation_output_schema): - """Test recommendation model for suggesting related products.""" - # Create a model for product recommendations - model = plexe.Model( - intent=""" - Given a product style code, recommend up to 3 other product styles that are frequently - purchased together with it based on transaction history. Use the order_id to identify - products purchased in the same transaction. 
- """, - input_schema=recommendation_input_schema, - output_schema=recommendation_output_schema, - ) - - # Build the model with minimal iterations for faster testing - model.build( - datasets=[product_data], - provider="openai/gpt-4o", - max_iterations=3, # Minimum iterations for reliable model generation - timeout=300, # 5 minute timeout - run_timeout=150, - ) - - # Get a sample style to test with (first style in the dataset) - test_style = product_data["style"].iloc[0] - - # Test a sample prediction - test_input = {"style": test_style} - prediction = model.predict(test_input) - - # Verify the prediction - verify_prediction(prediction, recommendation_output_schema) - - # Check that the recommendations are a list - assert isinstance(prediction["recommended_styles"], list), "Recommendations should be a list" - - # Check that we have at most 3 recommendations - assert len(prediction["recommended_styles"]) <= 3, "Should have at most 3 recommendations" - - # Check that the recommended items are not the input item - for style in prediction["recommended_styles"]: - assert style != test_style, "Recommendations should not include the input item" - - # Verify model description - description = model.describe() - verify_model_description(description) - - # Test model saving - model_path = plexe.save_model(model, "recommendation_model.tar.gz") - assert Path(model_path).exists(), f"Model file {model_path} not created" - - # Test model loading - loaded_model = plexe.load_model(model_path) - loaded_prediction = loaded_model.predict(test_input) - - # Verify the loaded model's prediction - verify_prediction(loaded_prediction, recommendation_output_schema) - assert isinstance(loaded_prediction["recommended_styles"], list), "Recommendations should be a list" diff --git a/tests/integration/test_regression.py b/tests/integration/test_regression.py deleted file mode 100644 index 6d7b6c9a..00000000 --- a/tests/integration/test_regression.py +++ /dev/null @@ -1,115 +0,0 @@ -"""Integration test for regression models using plexe. - -This test covers: -1. Creating a regression model for house price prediction -2. Building the model with synthetic data -3. Making predictions with the model -4. 
Testing schema inference -""" - -import os -import pytest -from pathlib import Path -from pydantic import create_model -import plexe -from tests.utils.utils import generate_house_prices_data, verify_prediction, cleanup_files, verify_model_description - - -@pytest.fixture -def house_data(): - """Generate synthetic house price data for testing.""" - return generate_house_prices_data(n_samples=30) - - -@pytest.fixture -def house_input_schema(): - """Define the input schema for house price prediction.""" - return create_model( - "HousePriceInput", - **{ - "area": int, - "bedrooms": int, - "bathrooms": int, - "stories": int, - "garage": int, - "garden": int, - "fenced": int, - "age": int, - }, - ) - - -@pytest.fixture -def house_output_schema(): - """Define the output schema for house price prediction.""" - return create_model("HousePriceOutput", **{"price": float}) - - -@pytest.fixture -def model_dir(tmpdir): - """Create and manage a temporary directory for model files.""" - model_path = Path(tmpdir) / "models" - model_path.mkdir(exist_ok=True) - return model_path - - -@pytest.fixture(autouse=True) -def run_around_tests(model_dir): - """Set up and tear down for each test.""" - cleanup_files(model_dir) - os.environ["MODEL_PATH"] = str(model_dir) - yield - # Teardown - cleanup_files(model_dir) - - -def test_house_price_regression(house_data, house_input_schema, house_output_schema): - """Test regression for house price prediction.""" - # Create a model for house price prediction - model = plexe.Model( - intent="Predict the price of a house based on its features", - input_schema=house_input_schema, - output_schema=house_output_schema, - ) - - # Build the model with minimal data and iterations for faster testing - model.build( - datasets=[house_data], - provider="openai/gpt-4o", - max_iterations=3, # Minimum iterations for reliable model generation - timeout=300, # 5 minute timeout - run_timeout=150, - ) - - # Test a sample prediction - test_input = { - "area": 2500, - "bedrooms": 4, - "bathrooms": 2, - "stories": 2, - "garage": 1, - "garden": 1, - "fenced": 1, - "age": 5, - } - prediction = model.predict(test_input) - - # Verify the prediction - verify_prediction(prediction, house_output_schema) - assert isinstance(prediction["price"], (int, float)), "House price should be numeric" - assert prediction["price"] > 0, "House price should be positive" - - # Verify model description - description = model.describe() - verify_model_description(description) - - # Test model saving and loading - model_path = plexe.save_model(model, "house_price_model.tar.gz") - loaded_model = plexe.load_model(model_path) - loaded_prediction = loaded_model.predict(test_input) - - # Verify the loaded model's prediction - verify_prediction(loaded_prediction, house_output_schema) - assert isinstance(loaded_prediction["price"], (int, float)), "House price should be numeric" - assert loaded_prediction["price"] > 0, "House price should be positive" - assert loaded_prediction == prediction, "Loaded model prediction should match original model prediction" diff --git a/tests/integration/test_schema_validation.py b/tests/integration/test_schema_validation.py deleted file mode 100644 index 99e49107..00000000 --- a/tests/integration/test_schema_validation.py +++ /dev/null @@ -1,174 +0,0 @@ -"""Integration test for schema validation in plexe. - -This test covers: -1. Creating models with fields that have validation requirements -2. Testing that invalid inputs fail properly -3. 
Testing that valid inputs pass properly -""" - -import os -import pytest -from pathlib import Path -from pydantic import create_model, Field -import plexe -from tests.utils.utils import generate_house_prices_data, verify_prediction, cleanup_files - - -@pytest.fixture -def house_data(): - """Generate synthetic house price data for testing.""" - return generate_house_prices_data(n_samples=30) - - -@pytest.fixture -def house_data_copy(house_data): - """Create a copy of the house data to avoid mutation issues.""" - return house_data.copy() - - -@pytest.fixture -def validated_input_schema(): - """Define the input schema for house price prediction with validation.""" - return create_model( - "ValidatedHouseInput", - **{ - "area": (int, Field(description="Square feet (500-10000)")), - "bedrooms": (int, Field(description="Number of bedrooms (1-10)")), - "bathrooms": (int, Field(description="Number of bathrooms (1-7)")), - "stories": (int, Field(description="Number of stories (1-4)")), - "garage": (int, Field(description="Garage capacity in cars (0-3)")), - "garden": (int, Field(description="Has garden (0=no, 1=yes)")), - "fenced": (int, Field(description="Has fenced yard (0=no, 1=yes)")), - "age": (int, Field(description="Age of house in years (0-100)")), - }, - ) - - -@pytest.fixture -def validated_output_schema(): - """Define the output schema for house price prediction with validation.""" - return create_model( - "ValidatedHouseOutput", - **{"price": (float, Field(ge=50, le=5000, description="House price in thousands of dollars (50-5000)"))}, - ) - - -@pytest.fixture -def model_dir(tmpdir): - """Create and manage a temporary directory for model files.""" - model_path = Path(tmpdir) / "models" - model_path.mkdir(exist_ok=True) - return model_path - - -@pytest.fixture(autouse=True) -def run_around_tests(model_dir): - """Set up and tear down for each test.""" - cleanup_files(model_dir) - os.environ["MODEL_PATH"] = str(model_dir) - yield - # Teardown - cleanup_files(model_dir) - - -def test_input_validation(house_data_copy, validated_input_schema, validated_output_schema): - """Test validation of input schema.""" - # Create a model with validated input schema - model = plexe.Model( - intent="Predict the price of a house based on its features, with input validation", - input_schema=validated_input_schema, - output_schema=validated_output_schema, - ) - - # Build the model - model.build( - datasets=[house_data_copy], - provider="openai/gpt-4o", - max_iterations=3, # Minimum iterations for reliable model generation - timeout=300, # 5 minute timeout - run_timeout=150, - ) - - # Valid input should work - valid_input = { - "area": 2500, - "bedrooms": 4, - "bathrooms": 2, - "stories": 2, - "garage": 1, - "garden": 1, - "fenced": 1, - "age": 5, - } - prediction = model.predict(valid_input) - verify_prediction(prediction, validated_output_schema) - - # Invalid inputs should raise validation errors - invalid_inputs = [ - { - "area": 300, - "stories": 2, - "garage": 1, - "garden": 1, - "fenced": 1, - "age": 5, - }, # Missing features - { - "area": "not-a-number", - "bedrooms": None, - "bathrooms": "two", - "stories": False, - "garage": 1, - "garden": 1, - "fenced": 1, - "age": 5, - }, # Invalid types - ] - - for invalid_input in invalid_inputs: - with pytest.raises(Exception): - # This should raise a validation error when the model validates the input - # against the schema before prediction - model.predict(invalid_input, validate_input=True) - - -def test_output_validation(house_data_copy, 
validated_input_schema): - """Test validation of output schema.""" - # Create an output schema with strict range validation - strict_output_schema = create_model( - "StrictHouseOutput", - **{"price": (float, Field(ge=500, le=600, description="House price in thousands of dollars (500-600)"))}, - ) - - # Create a model with standard input but strictly bounded output - model = plexe.Model( - intent="Predict the price of a house based on its features, ensuring predictions are between 500-600k", - input_schema=validated_input_schema, - output_schema=strict_output_schema, - ) - - # Build the model - model.build( - datasets=[house_data_copy], - provider="openai/gpt-4o", - max_iterations=3, # Minimum iterations for reliable model generation - timeout=300, - run_timeout=150, - ) - - # Test a sample prediction - test_input = { - "area": 2500, - "bedrooms": 4, - "bathrooms": 2, - "stories": 2, - "garage": 1, - "garden": 1, - "fenced": 1, - "age": 5, - } - - prediction = model.predict(test_input) - - # Verify the prediction meets the strict output schema - verify_prediction(prediction, strict_output_schema) diff --git a/tests/integration/test_time_series.py b/tests/integration/test_time_series.py deleted file mode 100644 index b793d381..00000000 --- a/tests/integration/test_time_series.py +++ /dev/null @@ -1,120 +0,0 @@ -"""Integration test for time series forecasting models using plexe. - -This test covers: -1. Creating a time series forecasting model for sales prediction -2. Building the model with synthetic time series data -3. Making predictions with the model for future time periods -""" - -import os -import pytest -from pathlib import Path -from pydantic import create_model -import plexe -from tests.utils.utils import generate_time_series_data, verify_prediction, cleanup_files, verify_model_description - - -@pytest.fixture -def sales_data(): - """Generate synthetic time series data for testing.""" - return generate_time_series_data(n_samples=60) - - -@pytest.fixture -def sales_data_copy(sales_data): - """Create a copy of the sales data to avoid mutation issues.""" - return sales_data.copy() - - -@pytest.fixture -def sales_input_schema(): - """Define the input schema for sales forecasting.""" - return create_model( - "SalesInput", - **{ - "date": str, - "promo": int, - "holiday": int, - "day_of_week": int, - }, - ) - - -@pytest.fixture -def sales_output_schema(): - """Define the output schema for sales forecasting.""" - return create_model("SalesOutput", **{"sales": float}) - - -@pytest.fixture -def model_dir(tmpdir): - """Create and manage a temporary directory for model files.""" - model_path = Path(tmpdir) / "models" - model_path.mkdir(exist_ok=True) - return model_path - - -@pytest.fixture(autouse=True) -def run_around_tests(model_dir): - """Set up and tear down for each test.""" - cleanup_files(model_dir) - os.environ["MODEL_PATH"] = str(model_dir) - yield - # Teardown - cleanup_files(model_dir) - - -def test_time_series_forecasting(sales_data_copy, sales_input_schema, sales_output_schema): - """Test time series forecasting for sales prediction.""" - # Ensure date is in string format for the model input - sales_data_copy["date"] = sales_data_copy["date"].dt.strftime("%Y-%m-%d") - - # Create a model for sales forecasting - model = plexe.Model( - intent="Predict daily sales based on the date, promotions, holidays, and day of the week", - input_schema=sales_input_schema, - output_schema=sales_output_schema, - ) - - # Build the model - model.build( - datasets=[sales_data_copy], - 
provider="openai/gpt-4o", - max_iterations=4, # Minimum iterations for reliable model generation - timeout=400, # 5 minute timeout - run_timeout=150, - ) - - # Test prediction for a future date - future_date = "2023-03-01" # A date after the training data - test_input = { - "date": future_date, - "promo": 1, - "holiday": 0, - "day_of_week": 2, # Wednesday - } - prediction = model.predict(test_input) - - # Verify the prediction - verify_prediction(prediction, sales_output_schema) - assert isinstance(prediction["sales"], (int, float)), "Sales should be numeric" - assert prediction["sales"] > 0, "Sales should be positive" - - # Test another date with different features - future_date_2 = "2023-03-04" # Saturday - test_input_2 = { - "date": future_date_2, - "promo": 0, - "holiday": 1, - "day_of_week": 5, # Saturday - } - prediction_2 = model.predict(test_input_2) - - # Verify the prediction - verify_prediction(prediction_2, sales_output_schema) - assert isinstance(prediction_2["sales"], (int, float)), "Sales should be numeric" - assert prediction_2["sales"] > 0, "Sales should be positive" - - # Verify model description - description = model.describe() - verify_model_description(description) diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/internal/__init__.py b/tests/unit/internal/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/internal/common/datasets/__init__.py b/tests/unit/internal/common/datasets/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/internal/common/datasets/test_adapter.py b/tests/unit/internal/common/datasets/test_adapter.py deleted file mode 100644 index 777d58ec..00000000 --- a/tests/unit/internal/common/datasets/test_adapter.py +++ /dev/null @@ -1,118 +0,0 @@ -""" -Tests for the DatasetAdapter class. - -This module verifies: -1. Conversion of different data formats to appropriate dataset types -2. Auto-detection of dataset types -3. Feature extraction functionality -4. 
Error handling for unsupported dataset types -""" - -import pytest -import pandas as pd - -from plexe.internal.common.datasets.adapter import DatasetAdapter -from plexe.internal.common.datasets.interface import Dataset, DatasetStructure -from plexe.internal.common.datasets.tabular import TabularDataset - - -class MockDataset(Dataset): - """Mock dataset implementation for testing.""" - - def __init__(self, features=None): - self._features = features or ["feature1", "feature2"] - - def split(self, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, stratify_column=None, random_state=None): - return self, self, self - - def sample(self, n=None, frac=None, replace=False, random_state=None): - return self - - def to_bytes(self): - return b"mock_dataset" - - @classmethod - def from_bytes(cls, data): - return cls() - - @property - def structure(self): - return DatasetStructure(modality="other", features=self._features, details={"mock": True}) - - def __len__(self): - return 10 - - def __getitem__(self, idx): - return {"item": idx} - - -def test_adapter_coerce_pandas(): - """Test that DatasetAdapter.coerce handles pandas DataFrames.""" - # Create test data - df = pd.DataFrame({"feature1": [1, 2, 3], "feature2": ["a", "b", "c"], "target": [0, 1, 0]}) - - # Coerce to dataset - result = DatasetAdapter.coerce(df) - - # Check that the result is a TabularDataset - assert isinstance(result, TabularDataset) - assert len(result) == 3 - - # Check that the data was preserved - pd.testing.assert_frame_equal(result.to_pandas(), df) - - -def test_adapter_coerce_dataset(): - """Test that DatasetAdapter.coerce passes through Dataset instances.""" - # Create test data - df = pd.DataFrame({"feature1": [1, 2, 3], "feature2": ["a", "b", "c"], "target": [0, 1, 0]}) - - # Create TabularDataset - dataset = TabularDataset(df) - - # Coerce the dataset - result = DatasetAdapter.coerce(dataset) - - # Check that the result is the same TabularDataset - assert result is dataset - - -def test_adapter_auto_detect(): - """Test the auto_detect functionality.""" - # Test with pandas DataFrame - df = pd.DataFrame({"a": [1, 2, 3]}) - assert DatasetAdapter.auto_detect(df) == "tabular" - - # Test with unsupported type - assert DatasetAdapter.auto_detect("not a dataset") is None - - -def test_adapter_coerce_unsupported(): - """Test error handling for unsupported dataset types.""" - # Try to coerce an unsupported type - with pytest.raises(ValueError): - DatasetAdapter.coerce("not a dataset") - - -def test_adapter_features(): - """Test the features extraction functionality.""" - # Create test datasets - df1 = pd.DataFrame({"feature1": [1, 2, 3], "feature2": ["a", "b", "c"]}) - - df2 = pd.DataFrame({"feature3": [4, 5, 6], "feature4": ["d", "e", "f"]}) - - dataset1 = TabularDataset(df1) - dataset2 = TabularDataset(df2) - - # Create dataset dictionary - datasets = {"dataset1": dataset1, "dataset2": dataset2} - - # Extract features - features = DatasetAdapter.features(datasets) - - # Check that features were correctly extracted - assert len(features) == 4 - assert "dataset1.feature1" in features - assert "dataset1.feature2" in features - assert "dataset2.feature3" in features - assert "dataset2.feature4" in features diff --git a/tests/unit/internal/common/datasets/test_interface.py b/tests/unit/internal/common/datasets/test_interface.py deleted file mode 100644 index 9e6d239f..00000000 --- a/tests/unit/internal/common/datasets/test_interface.py +++ /dev/null @@ -1,136 +0,0 @@ -""" -Tests for the dataset interface. - -This module verifies: -1. 
Creation and usage of DatasetStructure -2. Basic attributes and operations of DatasetStructure -3. Abstract methods in Dataset must be implemented by subclasses -4. Error handling when trying to instantiate Dataset directly -""" - -import pytest -from plexe.internal.common.datasets.interface import Dataset, DatasetStructure - - -def test_dataset_structure_creation(): - """Test creation of a DatasetStructure with valid parameters.""" - # Create a simple structure - structure = DatasetStructure( - modality="table", features=["feature1", "feature2", "target"], details={"num_rows": 10, "num_columns": 3} - ) - - # Check attributes - assert structure.modality == "table" - assert structure.features == ["feature1", "feature2", "target"] - assert structure.details["num_rows"] == 10 - assert structure.details["num_columns"] == 3 - - -def test_dataset_structure_tensor_modality(): - """Test creation of a DatasetStructure with tensor modality.""" - # Create a tensor structure - structure = DatasetStructure( - modality="tensor", features=["pixel_values"], details={"shape": [32, 32, 3], "dtype": "float32"} - ) - - # Check attributes - assert structure.modality == "tensor" - assert structure.features == ["pixel_values"] - assert structure.details["shape"] == [32, 32, 3] - assert structure.details["dtype"] == "float32" - - -def test_dataset_structure_other_modality(): - """Test creation of a DatasetStructure with 'other' modality.""" - # Create a structure with 'other' modality - structure = DatasetStructure( - modality="other", features=["custom_data"], details={"type": "custom", "format": "specialized"} - ) - - # Check attributes - assert structure.modality == "other" - assert structure.features == ["custom_data"] - assert structure.details["type"] == "custom" - assert structure.details["format"] == "specialized" - - -# Note: Python doesn't enforce Literal type annotations at runtime, -# so we're not testing invalid modality values since they would pass -# in a regular Python environment. In a strictly typed environment -# or with runtime type checking, this would raise an error. 
- - -class MinimalDataset(Dataset): - """Minimal implementation of Dataset for testing.""" - - def split(self, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, stratify_column=None, random_state=None): - return self, self, self - - def sample(self, n=None, frac=None, replace=False, random_state=None): - return self - - def to_bytes(self): - return b"minimal_dataset" - - @classmethod - def from_bytes(cls, data): - return cls() - - @property - def structure(self): - return DatasetStructure(modality="other", features=["dummy"], details={}) - - def __len__(self): - return 0 - - def __getitem__(self, index): - return None - - -class IncompleteDataset(Dataset): - """Dataset implementation missing required methods.""" - - def split(self, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, stratify_column=None, random_state=None): - return self, self, self - - # Missing other required methods - - -def test_dataset_instantiation(): - """Test that Dataset can't be instantiated directly.""" - with pytest.raises(TypeError): - Dataset() # Should raise TypeError: Can't instantiate abstract class - - -def test_minimal_dataset(): - """Test that a minimal implementation of Dataset can be instantiated.""" - dataset = MinimalDataset() - assert isinstance(dataset, Dataset) - - # Test basic functionality - train, val, test = dataset.split() - assert isinstance(train, MinimalDataset) - assert isinstance(val, MinimalDataset) - assert isinstance(test, MinimalDataset) - - sample = dataset.sample(n=5) - assert isinstance(sample, MinimalDataset) - - data_bytes = dataset.to_bytes() - assert data_bytes == b"minimal_dataset" - - restored = MinimalDataset.from_bytes(data_bytes) - assert isinstance(restored, MinimalDataset) - - assert len(dataset) == 0 - assert dataset[0] is None - - structure = dataset.structure - assert structure.modality == "other" - assert structure.features == ["dummy"] - - -def test_incomplete_dataset(): - """Test that a Dataset implementation missing required methods raises errors.""" - with pytest.raises(TypeError): - IncompleteDataset() # Should raise TypeError for missing abstract methods diff --git a/tests/unit/internal/common/datasets/test_tabular.py b/tests/unit/internal/common/datasets/test_tabular.py deleted file mode 100644 index cb19b567..00000000 --- a/tests/unit/internal/common/datasets/test_tabular.py +++ /dev/null @@ -1,338 +0,0 @@ -""" -Tests for the TabularDataset implementation. - -This module verifies: -1. Creation of TabularDatasets from pandas DataFrames -2. Dataset splitting into train/val/test sets -3. Dataset sampling functionality -4. Serialization and deserialization -5. Dataset structure metadata -6. Conversion to pandas and numpy formats -7. 
Basic operations like __len__ and __getitem__ -""" - -import pytest -import pandas as pd -import numpy as np - -from plexe.internal.common.datasets.tabular import TabularDataset -from plexe.internal.common.datasets.interface import DatasetStructure -from plexe.internal.common.utils.dataset_storage import write_dataset_to_file, read_dataset_from_file - - -def test_tabular_dataset_creation(): - """Test that TabularDataset can be created from pandas DataFrame.""" - # Create test data - df = pd.DataFrame({"feature1": [1, 2, 3, 4, 5], "feature2": ["a", "b", "c", "d", "e"], "target": [0, 1, 0, 1, 0]}) - - # Create TabularDataset - dataset = TabularDataset(df) - - # Check that the dataset was created correctly - assert len(dataset) == 5 - assert isinstance(dataset.to_pandas(), pd.DataFrame) - assert isinstance(dataset.to_numpy(), np.ndarray) - pd.testing.assert_frame_equal(dataset.to_pandas(), df) - - -def test_tabular_dataset_validation(): - """Test validation of input data types.""" - # Valid input - df = pd.DataFrame({"a": [1, 2, 3]}) - TabularDataset(df) # Should work - - # Invalid input - with pytest.raises(ValueError): - TabularDataset("not a dataframe") - - with pytest.raises(ValueError): - TabularDataset([1, 2, 3]) - - -def test_tabular_dataset_split_standard(): - """Test standard train/val/test split with default ratios.""" - # Create test data - df = pd.DataFrame( - {"feature1": range(100), "feature2": [f"val_{i}" for i in range(100)], "target": [i % 2 for i in range(100)]} - ) - - # Create TabularDataset - dataset = TabularDataset(df) - - # Split dataset with default ratios (0.7, 0.15, 0.15) - train, val, test = dataset.split() - - # Check that each split is a TabularDataset - assert isinstance(train, TabularDataset) - assert isinstance(val, TabularDataset) - assert isinstance(test, TabularDataset) - - # Check split sizes - assert len(train) == 70 - assert len(val) == 15 - assert len(test) == 15 - - # Check total size - assert len(train) + len(val) + len(test) == 100 - - -def test_tabular_dataset_split_custom_ratios(): - """Test train/val/test split with custom ratios.""" - df = pd.DataFrame( - {"feature1": range(100), "feature2": [f"val_{i}" for i in range(100)], "target": [i % 2 for i in range(100)]} - ) - - dataset = TabularDataset(df) - - # Split with custom ratios - train, val, test = dataset.split(train_ratio=0.8, val_ratio=0.1, test_ratio=0.1) - - # Check split sizes - assert len(train) == 80 - assert len(val) == 10 - assert len(test) == 10 - - # Check total size - assert len(train) + len(val) + len(test) == 100 - - -def test_tabular_dataset_split_stratified(): - """Test stratified splitting.""" - df = pd.DataFrame( - { - "feature1": range(100), - "feature2": [f"val_{i}" for i in range(100)], - "target": [i % 2 for i in range(100)], # 50/50 split of 0s and 1s - } - ) - - dataset = TabularDataset(df) - - # Split with stratification - train, val, test = dataset.split(stratify_column="target") - - # Check that class proportions are maintained - assert abs(train.to_pandas()["target"].mean() - 0.5) < 0.1 - assert abs(val.to_pandas()["target"].mean() - 0.5) < 0.1 - assert abs(test.to_pandas()["target"].mean() - 0.5) < 0.1 - - -def test_tabular_dataset_split_reproducibility(): - """Test that splits are reproducible with same random state.""" - df = pd.DataFrame( - {"feature1": range(100), "feature2": [f"val_{i}" for i in range(100)], "target": [i % 2 for i in range(100)]} - ) - - dataset = TabularDataset(df) - - # Create two splits with the same random state - train1, val1, test1 = 
dataset.split(random_state=42) - train2, val2, test2 = dataset.split(random_state=42) - - # Check that the splits are identical - pd.testing.assert_frame_equal(train1.to_pandas(), train2.to_pandas()) - pd.testing.assert_frame_equal(val1.to_pandas(), val2.to_pandas()) - pd.testing.assert_frame_equal(test1.to_pandas(), test2.to_pandas()) - - -def test_tabular_dataset_split_edge_cases(): - """Test edge cases for splitting.""" - df = pd.DataFrame( - {"feature1": range(100), "feature2": [f"val_{i}" for i in range(100)], "target": [i % 2 for i in range(100)]} - ) - - dataset = TabularDataset(df) - - # All data to train - train, val, test = dataset.split(train_ratio=1.0, val_ratio=0.0, test_ratio=0.0) - assert len(train) == 100 - assert len(val) == 0 - assert len(test) == 0 - - # No validation set - train, val, test = dataset.split(train_ratio=0.8, val_ratio=0.0, test_ratio=0.2) - assert len(train) == 80 - assert len(val) == 0 - assert len(test) == 20 - - # No test set - train, val, test = dataset.split(train_ratio=0.8, val_ratio=0.2, test_ratio=0.0) - assert len(train) == 80 - assert len(val) == 20 - assert len(test) == 0 - - # Invalid ratios - with pytest.raises(ValueError): - dataset.split(train_ratio=0.8, val_ratio=0.3, test_ratio=0.2) # Sum > 1 - - -def test_tabular_dataset_sample(): - """Test sampling functionality.""" - # Create test data - df = pd.DataFrame( - {"feature1": range(100), "feature2": [f"val_{i}" for i in range(100)], "target": [i % 2 for i in range(100)]} - ) - - # Create TabularDataset - dataset = TabularDataset(df) - - # Sample by count - sampled = dataset.sample(n=10, random_state=42) - assert len(sampled) == 10 - assert isinstance(sampled, TabularDataset) - - # Sample by fraction - sampled = dataset.sample(frac=0.1, random_state=42) - assert len(sampled) == 10 - assert isinstance(sampled, TabularDataset) - - # Sample with replacement - sampled = dataset.sample(n=120, replace=True, random_state=42) - assert len(sampled) == 120 - assert isinstance(sampled, TabularDataset) - - # Check reproducibility - sampled1 = dataset.sample(n=10, random_state=42) - sampled2 = dataset.sample(n=10, random_state=42) - pd.testing.assert_frame_equal(sampled1.to_pandas(), sampled2.to_pandas()) - - -def test_tabular_dataset_serialization(): - """Test that TabularDataset can be serialized and deserialized.""" - # Create test data - df = pd.DataFrame( - {"feature1": range(10), "feature2": [f"val_{i}" for i in range(10)], "target": [i % 2 for i in range(10)]} - ) - - # Create TabularDataset - dataset = TabularDataset(df) - - # Serialize to bytes - data_bytes = dataset.to_bytes() - assert isinstance(data_bytes, bytes) - assert len(data_bytes) > 0 - - # Deserialize from bytes - deserialized = TabularDataset.from_bytes(data_bytes) - - # Check that the deserialized dataset matches the original - assert len(deserialized) == len(dataset) - pd.testing.assert_frame_equal(deserialized.to_pandas(), dataset.to_pandas()) - - -def test_tabular_dataset_serialization_error_handling(): - """Test error handling during serialization/deserialization.""" - # Create a TabularDataset - dataset = TabularDataset(pd.DataFrame({"a": [1, 2, 3]})) - - # Test serialization (should succeed) - dataset.to_bytes() - - # Test deserialization with invalid data - with pytest.raises(RuntimeError): - TabularDataset.from_bytes(b"invalid data") - - -def test_tabular_dataset_structure(): - """Test structure property.""" - # Create test data - df = pd.DataFrame({"feature1": [1, 2, 3, 4, 5], "feature2": ["a", "b", "c", "d", "e"], "target": 
[0, 1, 0, 1, 0]}) - - # Create TabularDataset - dataset = TabularDataset(df) - structure = dataset.structure - - # Check structure type and fields - assert isinstance(structure, DatasetStructure) - assert structure.modality == "table" - assert set(structure.features) == {"feature1", "feature2", "target"} - - # Check details - assert structure.details["num_rows"] == 5 - assert structure.details["num_columns"] == 3 - assert set(structure.details["column_names"]) == {"feature1", "feature2", "target"} - assert isinstance(structure.details["column_types"], dict) - - -def test_tabular_dataset_file_storage(tmp_path): - """Test that TabularDataset can be stored to and loaded from a file.""" - # Create test data - df = pd.DataFrame( - {"feature1": range(10), "feature2": [f"val_{i}" for i in range(10)], "target": [i % 2 for i in range(10)]} - ) - - # Create TabularDataset - dataset = TabularDataset(df) - - # Write to file - file_path = tmp_path / "test_dataset.bin" - write_dataset_to_file(dataset, str(file_path)) - - # Check that the file exists - assert file_path.exists() - - # Read from file - loaded = read_dataset_from_file(TabularDataset, str(file_path)) - - # Check that the loaded dataset matches the original - assert len(loaded) == len(dataset) - pd.testing.assert_frame_equal(loaded.to_pandas(), dataset.to_pandas()) - - -def test_tabular_dataset_conversion(): - """Test conversion to pandas and numpy.""" - # Create test data - df = pd.DataFrame({"feature1": [1, 2, 3, 4, 5], "feature2": ["a", "b", "c", "d", "e"], "target": [0, 1, 0, 1, 0]}) - - # Create TabularDataset - dataset = TabularDataset(df) - - # Test pandas conversion - out_df = dataset.to_pandas() - assert isinstance(out_df, pd.DataFrame) - pd.testing.assert_frame_equal(out_df, df) - - # Test numpy conversion - arr = dataset.to_numpy() - assert isinstance(arr, np.ndarray) - np.testing.assert_array_equal(arr, df.to_numpy()) - - -def test_tabular_dataset_getitem(): - """Test __getitem__ functionality.""" - # Create test data - df = pd.DataFrame( - {"feature1": range(10), "feature2": [f"val_{i}" for i in range(10)], "target": [i % 2 for i in range(10)]} - ) - - # Create TabularDataset - dataset = TabularDataset(df) - - # Single item access - item = dataset[0] - assert item.feature1 == 0 - assert item.feature2 == "val_0" - assert item.target == 0 - - # Slice access - slice_items = dataset[1:4] - assert isinstance(slice_items, pd.Series) or isinstance(slice_items, pd.DataFrame) - - if isinstance(slice_items, pd.DataFrame): - assert len(slice_items) == 3 - else: - # If it returns a Series, check that the values match - expected_series = df.iloc[1:4] - if isinstance(expected_series, pd.Series): - pd.testing.assert_series_equal(slice_items, expected_series) - - -def test_tabular_dataset_len(): - """Test __len__ functionality.""" - # Create datasets of different sizes - dataset1 = TabularDataset(pd.DataFrame({"a": range(5)})) - dataset2 = TabularDataset(pd.DataFrame({"a": range(10)})) - - # Check lengths - assert len(dataset1) == 5 - assert len(dataset2) == 10 diff --git a/tests/unit/internal/common/utils/__init__.py b/tests/unit/internal/common/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/internal/common/utils/test_dataset_storage.py b/tests/unit/internal/common/utils/test_dataset_storage.py deleted file mode 100644 index 8e6647e0..00000000 --- a/tests/unit/internal/common/utils/test_dataset_storage.py +++ /dev/null @@ -1,111 +0,0 @@ -""" -Tests for the dataset storage utilities. 
- -This module verifies: -1. Writing datasets to files and reading them back -2. Storing datasets in shared memory and retrieving them -3. Error handling for invalid file paths and data -""" - -import os -import pytest - -from plexe.internal.common.datasets.tabular import TabularDataset -from plexe.internal.common.utils.dataset_storage import ( - write_dataset_to_file, - read_dataset_from_file, - dataset_to_shared_memory, - dataset_from_shared_memory, -) -import pandas as pd - - -def test_write_and_read_file(tmp_path): - """Test writing a dataset to a file and reading it back.""" - # Create a test dataset - df = pd.DataFrame({"feature1": [1, 2, 3], "feature2": ["a", "b", "c"], "target": [0, 1, 0]}) - dataset = TabularDataset(df) - - # Write to file - file_path = tmp_path / "test_dataset.bin" - write_dataset_to_file(dataset, str(file_path)) - - # Check that file exists - assert file_path.exists() - assert os.path.getsize(str(file_path)) > 0 - - # Read from file - loaded_dataset = read_dataset_from_file(TabularDataset, str(file_path)) - - # Check that loaded dataset matches original - assert isinstance(loaded_dataset, TabularDataset) - assert len(loaded_dataset) == len(dataset) - pd.testing.assert_frame_equal(loaded_dataset.to_pandas(), dataset.to_pandas()) - - -def test_file_storage_error_handling(tmp_path): - """Test error handling for file storage operations.""" - # Test with non-existent file - non_existent_path = tmp_path / "non_existent.bin" - with pytest.raises(FileNotFoundError): - read_dataset_from_file(TabularDataset, str(non_existent_path)) - - # Test with invalid file content - invalid_path = tmp_path / "invalid.bin" - with open(str(invalid_path), "wb") as f: - f.write(b"invalid data") - - with pytest.raises(RuntimeError): - read_dataset_from_file(TabularDataset, str(invalid_path)) - - # Test with invalid directory path - invalid_dir = tmp_path / "non_existent_dir" / "dataset.bin" - - # Create a dataset - df = pd.DataFrame({"a": [1, 2, 3]}) - dataset = TabularDataset(df) - - with pytest.raises(FileNotFoundError): - write_dataset_to_file(dataset, str(invalid_dir)) - - -@pytest.mark.skipif( - not hasattr(__import__("multiprocessing", fromlist=["shared_memory"]), "shared_memory"), - reason="Shared memory requires Python 3.8+", -) -def test_shared_memory_error_handling(): - """Test error handling in shared memory functions.""" - # Mock an ImportError for dataset_to_shared_memory - import builtins - - original_import = builtins.__import__ - - def mock_import(name, *args, **kwargs): - if name == "multiprocessing" or name == "multiprocessing.shared_memory": - raise ImportError("Mocked import error") - return original_import(name, *args, **kwargs) - - # Test with ImportError - try: - builtins.__import__ = mock_import - - # Test dataset_to_shared_memory - dataset = TabularDataset(pd.DataFrame({"a": [1, 2, 3]})) - with pytest.raises(ImportError): - dataset_to_shared_memory(dataset, "test_segment") - - # Test dataset_from_shared_memory - with pytest.raises(ImportError): - dataset_from_shared_memory(TabularDataset, "test_segment") - finally: - builtins.__import__ = original_import - - # Restore original import function - builtins.__import__ = original_import - - # Test with non-existent shared memory segment - try: - with pytest.raises((FileNotFoundError, ValueError)): - dataset_from_shared_memory(TabularDataset, "non_existent_segment_name") - except ImportError: - pytest.skip("shared_memory not available") diff --git a/tests/unit/internal/datasets/__init__.py 
b/tests/unit/internal/datasets/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/internal/datasets/core/__init__.py b/tests/unit/internal/datasets/core/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/internal/datasets/core/generation/__init__.py b/tests/unit/internal/datasets/core/generation/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/internal/datasets/core/generation/utils/__init__.py b/tests/unit/internal/datasets/core/generation/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/internal/models/__init__.py b/tests/unit/internal/models/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/internal/models/callbacks/__init__.py b/tests/unit/internal/models/callbacks/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/internal/models/callbacks/test_mlflow.py b/tests/unit/internal/models/callbacks/test_mlflow.py deleted file mode 100644 index a24e4c88..00000000 --- a/tests/unit/internal/models/callbacks/test_mlflow.py +++ /dev/null @@ -1,258 +0,0 @@ -""" -Unit tests for the MLFlowCallback class. - -These tests validate the functionality of the MLFlowCallback, which is responsible -for logging model building metrics, parameters, and artifacts to MLFlow. -""" - -from unittest.mock import MagicMock, patch -import pytest -from pydantic import BaseModel - -from plexe.callbacks import BuildStateInfo -from plexe.internal.models.callbacks.mlflow import MLFlowCallback -from plexe.internal.models.entities.artifact import Artifact -from plexe.internal.models.entities.metric import Metric, MetricComparator, ComparisonMethod -from plexe.core.entities.solution import Solution - - -@pytest.fixture -def setup_env(): - """Set up common test environment.""" - # Create metric - metric = Metric(name="accuracy", value=0.95, comparator=MetricComparator(ComparisonMethod.HIGHER_IS_BETTER)) - - # Create node with the metric - node = Solution( - plan="Train a random forest model", - performance=metric, - execution_time=10.5, - model_artifacts=[Artifact.from_path("/path/to/artifact.pkl")], - ) - - # Create input/output schemas - class InputSchema(BaseModel): - feature1: float - feature2: str - - class OutputSchema(BaseModel): - prediction: float - - # Create a mock dataset - mock_dataset = MagicMock() - mock_dataset.to_pandas.return_value = MagicMock() - - # Create build info for different stages - build_info = BuildStateInfo( - intent="Predict house prices", - provider="openai/gpt-4o-mini", - input_schema=InputSchema, - output_schema=OutputSchema, - run_timeout=300, - max_iterations=10, - timeout=3600, - datasets={"train": mock_dataset}, - ) - - # Create iteration info with node - iteration_info = BuildStateInfo( - intent="Predict house prices", - provider="openai/gpt-4o-mini", - input_schema=InputSchema, - output_schema=OutputSchema, - run_timeout=300, - max_iterations=10, - timeout=3600, - iteration=1, - node=node, - datasets={"train": mock_dataset}, - ) - - return { - "metric": metric, - "node": node, - "input_schema": InputSchema, - "output_schema": OutputSchema, - "mock_dataset": mock_dataset, - "build_info": build_info, - "iteration_info": iteration_info, - } - - -def test_callback_initialization(): - """Test that the MLFlowCallback can be initialized properly.""" - with patch("mlflow.set_tracking_uri") as mock_set_tracking_uri: - with patch("mlflow.active_run", return_value=None): # No 
active run - with patch("mlflow.get_experiment_by_name", return_value=None): # Experiment doesn't exist - with patch("mlflow.create_experiment", return_value="test-experiment-id"): - with patch("mlflow.set_experiment"): - callback = MLFlowCallback( - tracking_uri="http://localhost:5000", experiment_name="test-experiment" - ) - - # Verify tracking URI was set - mock_set_tracking_uri.assert_called_once_with("http://localhost:5000") - - # Verify default values - assert callback.experiment_name == "test-experiment" - assert callback.experiment_id == "test-experiment-id" - - -@patch("mlflow.set_tracking_uri") -@patch("mlflow.get_experiment_by_name") -@patch("mlflow.create_experiment", return_value="initial-experiment-id") -def test_build_start(mock_create_experiment, mock_get_experiment, _, setup_env): - """Test on_build_start callback.""" - # Set up mocks - during initialization, experiment should be found - mock_experiment = MagicMock() - mock_experiment.experiment_id = "test-experiment-id" - mock_get_experiment.return_value = mock_experiment - - # Initialize callback with active_run patched - with patch("mlflow.active_run", return_value=None): - with patch("mlflow.set_experiment"): - callback = MLFlowCallback(tracking_uri="http://localhost:5000", experiment_name="test-experiment") - - # During initialization, get_experiment_by_name was called once (experiment exists, so no create_experiment) - mock_get_experiment.assert_called_once_with("test-experiment") - mock_create_experiment.assert_not_called() - - # Reset mocks for on_build_start testing - mock_get_experiment.reset_mock() - mock_create_experiment.reset_mock() - - # Mock mlflow methods for on_build_start - with patch("mlflow.active_run", return_value=None): - with patch("mlflow.set_experiment"): - with patch("mlflow.start_run") as mock_start_run: - with patch("mlflow.log_params"): - with patch("mlflow.set_tags"): - mock_run = MagicMock() - mock_run.info.run_id = "parent-run-id" - mock_start_run.return_value = mock_run - - # Call on_build_start - callback.on_build_start(setup_env["build_info"]) - - # Since experiment_id is already set, _get_or_create_experiment should not call get_experiment_by_name again - mock_get_experiment.assert_not_called() - mock_create_experiment.assert_not_called() - - # Experiment ID should remain the same from initialization - assert callback.experiment_id == "test-experiment-id" - - -@patch("mlflow.set_tracking_uri") -@patch("mlflow.get_experiment_by_name") -@patch("mlflow.create_experiment") -@patch("mlflow.set_experiment") -@patch("mlflow.active_run", return_value=None) -def test_build_start_new_experiment( - mock_active_run, mock_set_experiment, mock_create_experiment, mock_get_experiment, mock_set_tracking_uri, setup_env -): - """Test on_build_start with a new experiment.""" - # Set up mocks for a new experiment - experiment doesn't exist during initialization - mock_get_experiment.return_value = None - mock_create_experiment.return_value = "init-experiment-id" - - # Initialize callback - callback = MLFlowCallback(tracking_uri="http://localhost:5000", experiment_name="new-experiment") - - # During initialization: get_experiment_by_name was called, then create_experiment - mock_get_experiment.assert_called_once_with("new-experiment") - mock_create_experiment.assert_called_once_with("new-experiment") - assert callback.experiment_id == "init-experiment-id" - - # Reset mocks for on_build_start testing - mock_get_experiment.reset_mock() - mock_create_experiment.reset_mock() - mock_set_experiment.reset_mock() - 
- # Mock mlflow methods for on_build_start - with patch("mlflow.start_run") as mock_start_run: - with patch("mlflow.log_params"): - with patch("mlflow.set_tags"): - mock_run = MagicMock() - mock_run.info.run_id = "parent-run-id" - mock_start_run.return_value = mock_run - - # Call on_build_start - callback.on_build_start(setup_env["build_info"]) - - # Since experiment_id is already set, _get_or_create_experiment should not call these again - mock_get_experiment.assert_not_called() - mock_create_experiment.assert_not_called() - - # Experiment ID should remain the same from initialization - assert callback.experiment_id == "init-experiment-id" - - -def test_build_end(setup_env): - """Test on_build_end callback.""" - # Initialize callback with all necessary patches - with patch("mlflow.active_run", return_value=None): - with patch("mlflow.get_experiment_by_name") as mock_get_exp: - with patch("mlflow.create_experiment", return_value="test-experiment-id"): - with patch("mlflow.set_experiment"): - # Mock experiment exists during initialization - mock_experiment = MagicMock() - mock_experiment.experiment_id = "test-experiment-id" - mock_get_exp.return_value = mock_experiment - - callback = MLFlowCallback(tracking_uri="http://localhost:5000", experiment_name="test-experiment") - callback.parent_run_id = "parent-run-id" # Set parent run ID for testing - - # Mock all MLflow methods for on_build_end - mock_run = MagicMock() - mock_run.info.run_id = "parent-run-id" - - with patch("mlflow.active_run", return_value=mock_run): - with patch("mlflow.set_experiment"): - with patch("mlflow.start_run", return_value=mock_run): - with patch("mlflow.log_artifact"): - with patch("mlflow.log_metric"): - with patch("mlflow.set_tag"): - with patch("mlflow.end_run") as mock_end_run: - # Call on_build_end - callback.on_build_end(setup_env["build_info"]) - - # Verify end_run was called once - mock_end_run.assert_called_once() - - -def test_log_metric(setup_env): - """Test _log_metric helper method.""" - # Initialize callback with all necessary patches - with patch("mlflow.active_run", return_value=None): - with patch("mlflow.get_experiment_by_name") as mock_get_exp: - with patch("mlflow.create_experiment", return_value="test-experiment-id"): - with patch("mlflow.set_experiment"): - # Mock experiment exists during initialization - mock_experiment = MagicMock() - mock_experiment.experiment_id = "test-experiment-id" - mock_get_exp.return_value = mock_experiment - - callback = MLFlowCallback(tracking_uri="http://localhost:5000", experiment_name="test-experiment") - - # Mock active_run to True when _log_metric is called - with patch("mlflow.active_run", return_value=MagicMock()): - # Mock the log_metric method - with patch("mlflow.log_metric") as mock_log_metric: - # Call _log_metric - callback._log_metric(setup_env["metric"], prefix="test_", step=1) - - # Verify metric was logged - mock_log_metric.assert_called_once() - - # Just check the first arg (metric name) and validate that other args exist - args = mock_log_metric.call_args[0] - assert "accuracy" in args[0] # Name contains "accuracy" - assert args[1] == 0.95 # Value is correct - - # Check that step was passed as a kwarg - kwargs = mock_log_metric.call_args[1] - assert kwargs.get("step") == 1 - - -if __name__ == "__main__": - pytest.main() diff --git a/tests/unit/internal/models/entities/__init__.py b/tests/unit/internal/models/entities/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/internal/models/entities/test_metric.py 
b/tests/unit/internal/models/entities/test_metric.py deleted file mode 100644 index 60b1dac5..00000000 --- a/tests/unit/internal/models/entities/test_metric.py +++ /dev/null @@ -1,138 +0,0 @@ -""" -Module: test_metric_class - -This module contains unit tests for the `Metric` and `MetricComparator` classes, ensuring their functionality -and robustness. The tests cover: - -- Comparison methods (`HIGHER_IS_BETTER`, `LOWER_IS_BETTER`, `TARGET_IS_BETTER`). -- Handling of edge cases like floating-point precision and boundary values. -- Validation logic for metrics (e.g., checking valid/invalid states). -- Compatibility and error handling for metrics with different names or comparison methods. -- System-level behaviours, such as sorting collections of metrics. - -Dependencies: - - pytest: For running the test suite. - - metric_class: The module containing the `Metric` and `MetricComparator` class implementations. - -Example: - pytest test_metric_class.py -""" - -import pytest -from plexe.internal.models.entities.metric import Metric, MetricComparator, ComparisonMethod - - -def test_comparator_higher_is_better(): - comparator = MetricComparator(ComparisonMethod.HIGHER_IS_BETTER) - assert comparator.compare(0.8, 0.9) == 1 # 0.9 is better - assert comparator.compare(0.9, 0.8) == -1 # 0.9 is better - assert comparator.compare(0.8, 0.8) == 0 # Equal - - -def test_comparator_lower_is_better(): - comparator = MetricComparator(ComparisonMethod.LOWER_IS_BETTER) - assert comparator.compare(0.8, 0.9) == -1 # 0.8 is better - assert comparator.compare(0.9, 0.8) == 1 # 0.8 is better - assert comparator.compare(0.8, 0.8) == 0 # Equal - - -def test_comparator_target_is_better(): - comparator = MetricComparator(ComparisonMethod.TARGET_IS_BETTER, target=1.0) - assert comparator.compare(0.9, 1.1) == 0 # Both are equally close to the target - assert comparator.compare(1.0, 1.2) == -1 # 1.0 is closer to the target - assert comparator.compare(0.8, 1.0) == 1 # 1.0 is closer to the target - - -def test_comparator_invalid_target(): - with pytest.raises(ValueError, match="requires a target value"): - MetricComparator(ComparisonMethod.TARGET_IS_BETTER) - - -def test_comparator_floating_point_precision(): - comparator = MetricComparator(ComparisonMethod.HIGHER_IS_BETTER) - assert comparator.compare(1.0000001, 1.0000002) == 1 - assert comparator.compare(1.0000002, 1.0000001) == -1 - - -def test_metric_higher_is_better(): - comparator = MetricComparator(ComparisonMethod.HIGHER_IS_BETTER) - metric1 = Metric(name="accuracy", value=0.8, comparator=comparator) - metric2 = Metric(name="accuracy", value=0.9, comparator=comparator) - assert metric1 < metric2 - assert metric2 > metric1 - assert metric1 != metric2 - - -def test_metric_lower_is_better(): - comparator = MetricComparator(ComparisonMethod.LOWER_IS_BETTER) - metric1 = Metric(name="loss", value=0.8, comparator=comparator) - metric2 = Metric(name="loss", value=0.6, comparator=comparator) - assert metric1 < metric2 # metric1 is "lower" because it's worse - assert metric2 > metric1 - assert metric1 != metric2 - - -def test_metric_target_is_better(): - comparator = MetricComparator(ComparisonMethod.TARGET_IS_BETTER, target=1.0) - metric1 = Metric(name="value", value=0.9, comparator=comparator) - metric2 = Metric(name="value", value=1.1, comparator=comparator) - assert metric1 == metric2 # Both are equally close to the target - - -def test_metric_different_names(): - comparator = MetricComparator(ComparisonMethod.HIGHER_IS_BETTER) - metric1 = Metric(name="accuracy", value=0.8, 
comparator=comparator) - metric2 = Metric(name="loss", value=0.9, comparator=comparator) - with pytest.raises(ValueError, match="Cannot compare metrics with different names"): - metric1 > metric2 - - -def test_metric_invalid_comparison(): - comparator1 = MetricComparator(ComparisonMethod.HIGHER_IS_BETTER) - comparator2 = MetricComparator(ComparisonMethod.LOWER_IS_BETTER) - comparator3 = MetricComparator(ComparisonMethod.TARGET_IS_BETTER, target=1.0) - comparator4 = MetricComparator(ComparisonMethod.TARGET_IS_BETTER, target=2.0) - metric1 = Metric(name="accuracy", value=0.8, comparator=comparator1) - metric2 = Metric(name="accuracy", value=0.9, comparator=comparator2) - metric3 = Metric(name="accuracy", value=1.0, comparator=comparator3) - metric4 = Metric(name="accuracy", value=1.1, comparator=comparator4) - with pytest.raises(ValueError, match="Cannot compare metrics with different comparison methods"): - metric1 > metric2 - with pytest.raises(ValueError, match="Cannot compare 'TARGET_IS_BETTER' metrics with different target values"): - metric3 > metric4 - - -def test_metric_is_valid(): - comparator = MetricComparator(ComparisonMethod.HIGHER_IS_BETTER) - metric = Metric(name="accuracy", value=float("nan"), comparator=comparator) - assert not metric.is_valid - - metric = Metric(name="accuracy", value=0.8, comparator=comparator) - assert metric.is_valid - - -def test_metric_repr_and_str(): - comparator = MetricComparator(ComparisonMethod.HIGHER_IS_BETTER) - metric = Metric(name="accuracy", value=0.8, comparator=comparator) - assert repr(metric) == "Metric(name='accuracy', value=0.8, comparison=HIGHER_IS_BETTER)" - assert str(metric) == "Metric accuracy ↑ 0.8" - - -def test_metric_transitivity(): - comparator = MetricComparator(ComparisonMethod.HIGHER_IS_BETTER) - metric1 = Metric(name="accuracy", value=0.8, comparator=comparator) - metric2 = Metric(name="accuracy", value=0.9, comparator=comparator) - metric3 = Metric(name="accuracy", value=1.0, comparator=comparator) - assert metric1 < metric2 < metric3 - assert metric3 > metric2 > metric1 - - -def test_metric_collection_sorting(): - comparator = MetricComparator(ComparisonMethod.HIGHER_IS_BETTER) - metrics = [ - Metric(name="accuracy", value=0.8, comparator=comparator), - Metric(name="accuracy", value=0.6, comparator=comparator), - Metric(name="accuracy", value=0.9, comparator=comparator), - ] - metrics.sort(reverse=True) - assert [m.value for m in metrics] == [0.9, 0.8, 0.6] diff --git a/tests/unit/internal/models/execution/__init__.py b/tests/unit/internal/models/execution/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/internal/models/execution/test_factory.py b/tests/unit/internal/models/execution/test_factory.py deleted file mode 100644 index 1fa7892c..00000000 --- a/tests/unit/internal/models/execution/test_factory.py +++ /dev/null @@ -1,44 +0,0 @@ -"""Test the executor factory.""" - -import pytest -from unittest.mock import patch - -import importlib - -from plexe.tools.execution import _get_executor_class -from plexe.internal.models.execution.process_executor import ProcessExecutor - - -def test_get_executor_class_non_distributed(): - """Test that ProcessExecutor is returned when distributed=False.""" - executor_class = _get_executor_class(distributed=False) - assert executor_class == ProcessExecutor - - -def test_get_executor_class_distributed(): - """Test that RayExecutor is returned when distributed=True and Ray is available.""" - # Check if Ray is available - ray_available = 
importlib.util.find_spec("ray") is not None - - if ray_available: - executor_class = _get_executor_class(distributed=True) - from plexe.internal.models.execution.ray_executor import RayExecutor - - assert executor_class == RayExecutor - else: - pytest.skip("Ray not available, skipping test") - - -def test_get_executor_class_distributed_ray_not_available(): - """Test that ProcessExecutor is returned as fallback when Ray is not available.""" - # Use a mock to simulate Ray not being available - with patch( - "builtins.__import__", - side_effect=lambda name, *args, **kwargs: ( - ModuleNotFoundError("No module named 'ray'") - if name == "plexe.internal.models.execution.ray_executor" - else importlib.import_module(name) - ), - ): - executor_class = _get_executor_class(distributed=True) - assert executor_class == ProcessExecutor diff --git a/tests/unit/internal/models/execution/test_process_executor.py b/tests/unit/internal/models/execution/test_process_executor.py deleted file mode 100644 index d684d9b3..00000000 --- a/tests/unit/internal/models/execution/test_process_executor.py +++ /dev/null @@ -1,113 +0,0 @@ -""" -Unit tests for the ProcessExecutor class and its associated components. - -These tests validate the functionality of the following: -- RedirectQueue: Ensures that stdout and stderr redirection to a queue behaves as expected. -- ProcessExecutor: Tests execution of Python code in an isolated process, including handling of: - - Successful execution. - - Timeouts. - - Exceptions raised during execution. - - Dataset handling and working directory creation. - -The tests use pytest as the test runner and employ mocking to isolate external dependencies. -""" - -import os -import shutil -import subprocess -import sys -from pathlib import Path -from unittest.mock import MagicMock, patch - -import pandas as pd -import pyarrow -import pytest - -from plexe.internal.common.datasets.tabular import TabularDataset -from plexe.internal.models.execution.executor import ExecutionResult -from plexe.internal.models.execution.process_executor import ProcessExecutor - - -class TestProcessExecutor: - def setup_method(self): - self.execution_id = "test_execution" - self.code = "print('Hello, World!')" - self.working_dir = Path(os.getcwd()) / self.execution_id - self.timeout = 5 - self.datasets = {"training_data": TabularDataset(pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}))} - self.process_executor = ProcessExecutor( - execution_id=self.execution_id, - code=self.code, - working_dir=Path(os.getcwd()), - datasets=self.datasets, - timeout=self.timeout, - code_execution_file_name="run.py", - ) - - def teardown_method(self): - if self.working_dir.exists(): - shutil.rmtree(self.working_dir, ignore_errors=True) - - def test_constructor_creates_working_directory(self): - assert self.working_dir.exists() - - @patch("pyarrow.parquet.write_table") - def test_run_successful_execution(self, mock_write_table): - mock_process = MagicMock() - mock_process.communicate.return_value = ("Performance: 0.5", "") - mock_process.returncode = 0 - - with patch("subprocess.Popen", return_value=mock_process) as mock_popen: - result = self.process_executor.run() - - dataset_file = self.working_dir / "training_data.parquet" - mock_write_table.assert_called_once_with( - pyarrow.Table.from_pandas(self.datasets["training_data"].to_pandas()), dataset_file - ) - mock_popen.assert_called_once_with( - [sys.executable, str(self.working_dir / "run.py")], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - cwd=str(self.working_dir), - 
text=True, - ) - assert isinstance(result, ExecutionResult) - assert result.exception is None - assert "Performance: 0.5" in result.term_out - - @patch("subprocess.Popen") - def test_run_timeout(self, mock_popen): - mock_process = MagicMock() - mock_process.communicate.side_effect = subprocess.TimeoutExpired(cmd="test", timeout=self.timeout) - - with patch("subprocess.Popen", return_value=mock_process): - result = self.process_executor.run() - - assert isinstance(result, ExecutionResult) - assert isinstance(result.exception, TimeoutError) - assert result.exec_time == self.timeout - - @patch("subprocess.Popen") - def test_run_exception(self, mock_popen): - mock_process = MagicMock() - mock_process.communicate.return_value = ("", "RuntimeError: Something went wrong") - mock_process.returncode = 1 - - with patch("subprocess.Popen", return_value=mock_process): - result = self.process_executor.run() - - assert isinstance(result, ExecutionResult) - assert isinstance(result.exception, RuntimeError) - assert "Something went wrong" in str(result.exception) - - @patch("pyarrow.parquet.write_table") - def test_dataset_written_to_file(self, mock_write_table): - self.process_executor.run() - dataset_file = self.working_dir / "training_data.parquet" - mock_write_table.assert_called_once_with( - pyarrow.Table.from_pandas(self.datasets["training_data"].to_pandas()), dataset_file - ) - - -if __name__ == "__main__": - pytest.main() diff --git a/tests/unit/internal/models/validation/__init__.py b/tests/unit/internal/models/validation/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/internal/models/validation/primitives/__init__.py b/tests/unit/internal/models/validation/primitives/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/internal/models/validation/primitives/test_syntax.py b/tests/unit/internal/models/validation/primitives/test_syntax.py deleted file mode 100644 index 942349ed..00000000 --- a/tests/unit/internal/models/validation/primitives/test_syntax.py +++ /dev/null @@ -1,97 +0,0 @@ -import pytest -from plexe.internal.models.validation.primitives.syntax import SyntaxValidator - - -@pytest.fixture -def syntax_validator(): - """ - Fixture to provide an instance of SyntaxValidator. - - :return: An instance of SyntaxValidator. - """ - return SyntaxValidator() - - -def test_valid_code(syntax_validator): - """Test that the validate method correctly identifies valid Python code.""" - valid_code = "def add(a, b):\n" " return a + b\n\n" "result = add(2, 3)\n" - result = syntax_validator.validate(valid_code) - assert result.passed is True - assert result.message == "Syntax is valid." - assert result.exception is None - - -def test_invalid_code(syntax_validator): - """Test that the validate method correctly identifies invalid Python code.""" - invalid_code = "def add(a, b):\n" " return a + b\n" "result = add(2, 3\n" # Missing closing parenthesis - result = syntax_validator.validate(invalid_code) - assert result.passed is False - assert "Syntax is not valid" in result.message - assert "line 3" in result.message # Ensures the line number is reported - assert result.exception is not None - assert isinstance(result.exception, SyntaxError) - - -def test_empty_code(syntax_validator): - """Test that the validate method handles empty code correctly.""" - empty_code = "" - result = syntax_validator.validate(empty_code) - assert result.passed is True - assert result.message == "Syntax is valid." 
- assert result.exception is None - - -def test_code_with_comments(syntax_validator): - """Test that the validate method handles code containing only comments.""" - comment_code = "# This is a comment\n" "# Another comment line\n" - result = syntax_validator.validate(comment_code) - assert result.passed is True - assert result.message == "Syntax is valid." - assert result.exception is None - - -def test_code_with_syntax_warning(syntax_validator): - """Test that the validate method handles code with a warning but no syntax error.""" - warning_code = "x = 1 # Variable assigned but not used" - result = syntax_validator.validate(warning_code) - assert result.passed is True - assert result.message == "Syntax is valid." - assert result.exception is None - - -def test_code_with_non_ascii_characters(syntax_validator): - """Test that the validate method handles code with non-ASCII characters.""" - non_ascii_code = "def greet():\n" ' return "Hello, \u4f60\u597d!"' # Includes Chinese characters for "Hello" - result = syntax_validator.validate(non_ascii_code) - assert result.passed is True - assert result.message == "Syntax is valid." - assert result.exception is None - - -def test_code_with_indentation_error(syntax_validator): - """Test that the validate method correctly identifies indentation errors.""" - indentation_error_code = "def add(a, b):\n" "return a + b" # Missing indentation for the return statement - result = syntax_validator.validate(indentation_error_code) - assert result.passed is False - assert "Syntax is not valid" in result.message - assert "line 2" in result.message # Ensures the line number is reported - assert result.exception is not None - assert isinstance(result.exception, SyntaxError) - - -def test_code_with_nested_functions(syntax_validator): - """Test that the validate method handles code with nested functions.""" - nested_function_code = "def outer():\n" " def inner():\n" ' return "Hello"\n' " return inner()\n" - result = syntax_validator.validate(nested_function_code) - assert result.passed is True - assert result.message == "Syntax is valid." - assert result.exception is None - - -def test_code_with_large_input(syntax_validator): - """Test that the validate method handles a large amount of valid code.""" - large_code = "\n".join([f"def func{i}():\n return {i}" for i in range(1000)]) - result = syntax_validator.validate(large_code) - assert result.passed is True - assert result.message == "Syntax is valid." 
- assert result.exception is None diff --git a/tests/unit/test_datasets.py b/tests/unit/test_datasets.py deleted file mode 100644 index 0912ab19..00000000 --- a/tests/unit/test_datasets.py +++ /dev/null @@ -1,95 +0,0 @@ -from unittest.mock import patch - -import numpy as np -import pandas as pd -import pytest - -from plexe import DatasetGenerator - - -@pytest.fixture -def sample_schema(): - """Test schema for house price prediction""" - return { - "input_schema": {"square_feet": float, "bedrooms": int, "location": str}, - "output_schema": {"price": float}, - } - - -@pytest.fixture -def mock_generated_data(): - """Mock data generation output""" - return pd.DataFrame( - { - "square_feet": np.random.uniform(1000, 3000, 50), - "bedrooms": np.random.randint(2, 6, 50), - "location": np.random.choice(["suburban", "urban", "rural"], 50), - "price": np.random.uniform(200000, 600000, 50), - } - ) - - -class TestDataGeneration: - """Test suite for data generation with comprehensive mocking""" - - @pytest.fixture(autouse=True) - def setup_mocks(self): - """Setup all required mocks for the test class""" - # Mock the data generation function - self.mock_generate_data = patch("plexe.datasets.DataGenerator.generate", return_value=pd.DataFrame()).start() - - yield - - # Stop all mocks after the test - patch.stopall() - - def test_basic_generation(self, sample_schema, mock_generated_data): - """Test basic data generation""" - self.mock_generate_data.return_value = mock_generated_data - - dataset = DatasetGenerator( - description="House features and prices, each row is a house", - schema={**sample_schema["input_schema"], **sample_schema["output_schema"]}, - provider="openai/gpt-4o", - ) - dataset.generate(50) - - # Verify generate_data was called with correct parameters - self.mock_generate_data.assert_called_once() - call_args = self.mock_generate_data.call_args[0] - assert isinstance(call_args[0], int) - assert call_args[0] == 50 - - # Verify generated data was added to the dataset - assert dataset._data is not None - assert len(dataset._data) == 50 - - def test_data_augmentation(self, sample_schema, mock_generated_data): - """Test data augmentation with existing dataset""" - self.mock_generate_data.return_value = mock_generated_data - - existing_data = pd.DataFrame( - { - "square_feet": [1000, 1500, 2000], - "bedrooms": [2, 3, 4], - "location": ["A", "B", "C"], - "price": [200000, 300000, 400000], - } - ) - - dataset = DatasetGenerator( - description="House features and prices, each row is a house", - schema={**sample_schema["input_schema"], **sample_schema["output_schema"]}, - provider="openai/gpt-4o", - data=existing_data.copy(), - ) - dataset.generate(50) - - # Verify generate_data was called with correct parameters - self.mock_generate_data.assert_called_once() - call_args = self.mock_generate_data.call_args[0] - assert isinstance(call_args[0], int) - assert call_args[0] == 50 - - # Verify final dataset includes both original and synthetic data - assert len(dataset._data) == len(existing_data) + 50 diff --git a/tests/unit/test_fileio.py b/tests/unit/test_fileio.py deleted file mode 100644 index 80a07f3f..00000000 --- a/tests/unit/test_fileio.py +++ /dev/null @@ -1,65 +0,0 @@ -""" -Unit tests for plexe.fileio module, including backwards compatibility testing. 
-""" - -from pathlib import Path -from typing import Any - -import pytest - -import plexe.fileio as fileio - - -def _validate_model(model: Any) -> None: - """Helper function to validate the loaded model.""" - # Basic validation - model should load successfully - assert model is not None, "Model should not be None" - assert hasattr(model, "intent"), "Model should have an 'intent' attribute" - assert hasattr(model, "input_schema"), "Model should have an 'input_schema' attribute" - assert hasattr(model, "output_schema"), "Model should have an 'output_schema' attribute" - assert hasattr(model, "predictor"), "Model should have a 'predictor' attribute" - assert isinstance(model.intent, str), "Intent should be of type str" - assert hasattr(model, "predict"), "Model should have a 'predict' method" - assert callable(model.predict), "Model's 'predict' should be callable" - # Schema validation - assert model.input_schema is not None - assert model.output_schema is not None - # Model should be in READY state if it was saved as a complete model - from plexe.core.state import ModelState - - assert model.state == ModelState.READY - - -class TestFileIO: - """Test cases for fileio module functionality.""" - - def test_load_model_backwards_compatibility_v0_18_3(self): - """Test loading a model bundle from v0.18.3 for backwards compatibility.""" - fixture_path = Path(__file__).parent.parent / "fixtures/legacy_models/model_v0_18_3.tar.gz" - - if not fixture_path.exists(): - pytest.skip(f"Legacy model fixture not found: {fixture_path}") - - # Load the legacy model - model = fileio.load_model(fixture_path) - - _validate_model(model) - - def test_load_model_backwards_compatibility_v0_23_2(self): - """Test loading a model bundle from v0.23.2 for backwards compatibility.""" - fixture_path = Path(__file__).parent.parent / "fixtures/legacy_models/model_v0_23_2.tar.gz" - - if not fixture_path.exists(): - pytest.skip(f"Legacy model fixture not found: {fixture_path}") - - # Load the legacy model - model = fileio.load_model(fixture_path) - - _validate_model(model) - - def test_load_model_file_not_found(self): - """Test that load_model raises appropriate error for missing files.""" - non_existent_path = Path("non_existent_model.tar.gz") - - with pytest.raises(ValueError, match="Failed to load model"): - fileio.load_model(non_existent_path) diff --git a/tests/utils/utils.py b/tests/utils/utils.py deleted file mode 100644 index 62f62f70..00000000 --- a/tests/utils/utils.py +++ /dev/null @@ -1,445 +0,0 @@ -# tests/utils/utils.py -import numpy as np -import pandas as pd -import shutil -from pathlib import Path - -from plexe.internal.models.entities.description import ModelDescription - - -def generate_heart_data(n_samples=30, random_seed=42): - """Generate synthetic heart disease data for testing. 
- - The data follows the structure: - - age: int (25-80) - - gender: int (0=female, 1=male) - - cp: int (chest pain type, 0-3) - - trtbps: int (resting blood pressure, 90-200) - - chol: int (cholesterol, 120-400) - - fbs: int (fasting blood sugar > 120 mg/dl, 0-1) - - restecg: int (resting ECG results, 0-2) - - thalachh: int (maximum heart rate achieved, 70-220) - - exng: int (exercise induced angina, 0-1) - - oldpeak: float (ST depression induced by exercise, 0-6.0) - - slp: int (slope of peak exercise ST segment, 0-2) - - caa: int (number of major vessels, 0-4) - - thall: int (thalassemia, 0-3) - - output: int (presence of heart disease, 0-1) - """ - np.random.seed(random_seed) - - # Generate features - data = { - "age": np.random.randint(25, 80, n_samples), - "gender": np.random.randint(0, 2, n_samples), - "cp": np.random.randint(0, 4, n_samples), - "trtbps": np.random.randint(90, 200, n_samples), - "chol": np.random.randint(120, 400, n_samples), - "fbs": np.random.randint(0, 2, n_samples), - "restecg": np.random.randint(0, 3, n_samples), - "thalachh": np.random.randint(70, 220, n_samples), - "exng": np.random.randint(0, 2, n_samples), - "oldpeak": np.round(np.random.uniform(0, 6, n_samples), 1), - "slp": np.random.randint(0, 3, n_samples), - "caa": np.random.randint(0, 5, n_samples), - "thall": np.random.randint(0, 4, n_samples), - } - - # Generate target based on risk factors - risk_factors = ( - (data["age"] > 60).astype(int) * 2 # Age over 60 is high risk - + data["gender"] # Being male slightly increases risk - + (data["cp"] > 1).astype(int) * 2 # Higher chest pain types increase risk - + (data["trtbps"] > 140).astype(int) # High blood pressure - + (data["chol"] > 250).astype(int) # High cholesterol - + data["fbs"] # High fasting blood sugar - + (data["thalachh"] < 120).astype(int) * 2 # Low max heart rate - + data["exng"] * 2 # Exercise-induced angina - + (data["oldpeak"] > 2).astype(int) * 2 # High ST depression - + data["caa"] # Number of major vessels - ) - - # Convert risk factors to binary output (threshold chosen to get roughly balanced classes) - data["output"] = (risk_factors > 8).astype(int) - - return pd.DataFrame(data) - - -def generate_house_prices_data(n_samples=30, random_seed=42): - """Generate synthetic house price data for regression testing. 
- - The data follows the structure: - - area: int (square feet, 800-5000) - - bedrooms: int (1-6) - - bathrooms: int (1-5) - - stories: int (1-4) - - garage: int (0-3 cars) - - garden: int (0=no, 1=yes) - - fenced: int (0=no, 1=yes) - - age: int (years, 0-100) - - price: float (house price in thousands, 100-2000) - """ - np.random.seed(random_seed) - - # Generate features - data = { - "area": np.random.randint(800, 5000, n_samples), - "bedrooms": np.random.randint(1, 7, n_samples), - "bathrooms": np.random.randint(1, 6, n_samples), - "stories": np.random.randint(1, 5, n_samples), - "garage": np.random.randint(0, 4, n_samples), - "garden": np.random.randint(0, 2, n_samples), - "fenced": np.random.randint(0, 2, n_samples), - "age": np.random.randint(0, 101, n_samples), - } - - # Generate price based on features - # Base price - price = 100 + np.random.normal(0, 20, n_samples) - - # Add impact of features - price += data["area"] * 0.2 # Larger area increases price - price += data["bedrooms"] * 25 # More bedrooms increase price - price += data["bathrooms"] * 35 # More bathrooms increase price - price += data["stories"] * 30 # More stories increase price - price += data["garage"] * 40 # Garage increases price - price += data["garden"] * 50 # Garden increases price - price += data["fenced"] * 25 # Fenced yard increases price - price -= data["age"] * 1.5 # Older houses decrease in price - - # Add some noise - price += np.random.normal(0, 50, n_samples) - - # Ensure reasonable price range - price = np.clip(price, 100, 2000) - data["price"] = np.round(price, 2) - - return pd.DataFrame(data) - - -def generate_customer_churn_data(n_samples=30, random_seed=42): - """Generate synthetic customer churn data for classification testing. - - The data follows the structure: - - tenure: int (months, 0-100) - - monthly_charges: float (dollars, 20-150) - - total_charges: float (dollars, 0-10000) - - contract_type: int (0=month-to-month, 1=one year, 2=two year) - - payment_method: int (0=electronic check, 1=mailed check, 2=bank transfer, 3=credit card) - - tech_support: int (0=no, 1=yes) - - online_backup: int (0=no, 1=yes) - - online_security: int (0=no, 1=yes) - - churn: int (0=no, 1=yes) - """ - np.random.seed(random_seed) - - # Generate features - data = { - "tenure": np.random.randint(1, 101, n_samples), - "monthly_charges": np.round(np.random.uniform(20, 150, n_samples), 2), - "contract_type": np.random.randint(0, 3, n_samples), - "payment_method": np.random.randint(0, 4, n_samples), - "tech_support": np.random.randint(0, 2, n_samples), - "online_backup": np.random.randint(0, 2, n_samples), - "online_security": np.random.randint(0, 2, n_samples), - } - - # Calculate total charges based on tenure and monthly charges - # Add some variance to simulate different starting points and promotional offers - variance_factor = np.random.uniform(0.8, 1.2, n_samples) - data["total_charges"] = np.round(data["tenure"] * data["monthly_charges"] * variance_factor, 2) - - # Generate churn based on risk factors - risk_factors = ( - (data["tenure"] < 12).astype(int) * 3 # Low tenure is high risk - + (data["monthly_charges"] > 100).astype(int) * 2 # High monthly charges - + (data["contract_type"] == 0).astype(int) * 3 # Month-to-month contracts are higher risk - + (data["payment_method"] == 0).astype(int) * 2 # Electronic check is higher risk - - data["tech_support"] * 1 # Having tech support reduces risk - - data["online_backup"] * 1 # Having online backup reduces risk - - data["online_security"] * 1 # Having online security 
reduces risk - ) - - # Convert risk factors to binary churn (threshold chosen to get roughly balanced classes) - data["churn"] = (risk_factors > 5).astype(int) - data["churn_probability"] = np.round(np.clip(risk_factors / 10, 0, 1), 2) # Probability of churn - - return pd.DataFrame(data) - - -def generate_sentiment_data(n_samples=20, random_seed=42): - """Generate synthetic sentiment analysis data for text classification testing. - - The data follows the structure: - - text: str (review text) - - sentiment: str (positive, negative, or neutral) - """ - np.random.seed(random_seed) - - positive_texts = [ - "This product exceeded my expectations!", - "Great service and amazing quality", - "I absolutely love this product", - "Best purchase I've ever made", - "Fantastic experience overall", - "The customer service was exceptional", - "Very satisfied with my purchase", - "Would definitely recommend to friends", - "Works perfectly for what I need", - "Very happy with the quality and speed of delivery", - ] - - negative_texts = [ - "Very disappointed with the quality", - "Would not recommend this", - "Poor service and slow delivery", - "Product broke after first use", - "Waste of money", - "Customer service was terrible", - "Not as described in the listing", - "Overpriced for what you get", - "Save your money and buy something else", - "Regret making this purchase", - ] - - neutral_texts = [ - "Product was ok, nothing special", - "Meets basic expectations", - "Delivery was on time", - "Average quality for the price", - "Some features work well, others don't", - "Neither impressed nor disappointed", - "Does the job but could be better", - "Might purchase again, still deciding", - "Mixed feelings about this product", - "Not bad, not great", - ] - - # Create balanced dataset with roughly equal numbers of each sentiment - data = [] - categories = ["positive", "negative", "neutral"] - text_sources = [positive_texts, negative_texts, neutral_texts] - - for i in range(n_samples): - cat_idx = i % 3 - text_idx = np.random.randint(0, len(text_sources[cat_idx])) - data.append({"text": text_sources[cat_idx][text_idx], "sentiment": categories[cat_idx]}) - - # Shuffle the data - np.random.shuffle(data) - - return pd.DataFrame(data) - - -def generate_product_recommendation_data(n_samples=30, random_seed=42): - """Generate synthetic product recommendation data. 
- - The data follows the structure: - - order_id: str (unique order identifier) - - style: str (product style code) - - category: str (product category) - - customer_id: str (customer identifier) - """ - np.random.seed(random_seed) - - # Define product categories and styles - categories = ["Clothing", "Electronics", "Home", "Beauty", "Books"] - - # Generate unique style codes for each category - styles_by_category = {} - for category in categories: - prefix = category[:3].upper() - styles_by_category[category] = [f"{prefix}{100 + i}" for i in range(10)] - - # Flatten all styles - all_styles = [style for styles in styles_by_category.values() for style in styles] - - # Create customer IDs - customer_ids = [f"CUST{1000 + i}" for i in range(10)] - - # Generate purchase patterns - # Each customer has preferences for certain categories and styles - customer_preferences = {} - for cust_id in customer_ids: - # Pick 2-3 preferred categories - preferred_cats = np.random.choice(categories, size=np.random.randint(2, 4), replace=False) - # For each preferred category, pick 2-3 preferred styles - preferred_styles = [] - for cat in preferred_cats: - preferred_styles.extend( - np.random.choice(styles_by_category[cat], size=np.random.randint(2, 4), replace=False) - ) - customer_preferences[cust_id] = preferred_styles - - # Generate orders - data = [] - order_ids = [] - - # Create order IDs - for i in range(n_samples // 3): # Each order will have multiple items - order_ids.append(f"ORD{10000 + i}") - - # Generate order data - for order_id in order_ids: - # Pick a random customer - customer_id = np.random.choice(customer_ids) - # Decide number of items in this order (2-5) - n_items = np.random.randint(2, 6) - - # 70% chance the customer buys from their preferences - if np.random.random() < 0.7: - # Pick from their preferred styles - styles = np.random.choice( - customer_preferences[customer_id], - size=min(n_items, len(customer_preferences[customer_id])), - replace=False, - ) - else: - # Pick random styles - styles = np.random.choice(all_styles, size=n_items, replace=False) - - # Add items to data - for style in styles: - # Find category of this style - for cat, cat_styles in styles_by_category.items(): - if style in cat_styles: - category = cat - break - - data.append({"order_id": order_id, "style": style, "category": category, "customer_id": customer_id}) - - return pd.DataFrame(data) - - -def generate_time_series_data(n_samples=60, random_seed=42): - """Generate synthetic time series data for forecasting testing. 
- - The data follows the structure: - - date: date (time index) - - sales: float (sales amount) - - promo: int (0=no promotion, 1=promotion running) - - holiday: int (0=no holiday, 1=holiday) - - day_of_week: int (0-6, Monday=0) - """ - np.random.seed(random_seed) - - # Generate dates - start_date = pd.to_datetime("2023-01-01") - dates = [start_date + pd.Timedelta(days=i) for i in range(n_samples)] - - # Generate features - day_of_week = [date.weekday() for date in dates] - - # Holidays (random 7% of days are holidays) - holiday = np.zeros(n_samples) - holiday_indices = np.random.choice(range(n_samples), size=int(n_samples * 0.07), replace=False) - holiday[holiday_indices] = 1 - - # Promotions (random 20% of days have promotions) - promo = np.zeros(n_samples) - promo_indices = np.random.choice(range(n_samples), size=int(n_samples * 0.2), replace=False) - promo[promo_indices] = 1 - - # Generate sales with trend, seasonality, and effects of promotions and holidays - # Base trend (slightly increasing) - trend = np.linspace(0, 20, n_samples) - - # Weekly seasonality (higher on weekends) - seasonality = np.array([5 if dow >= 5 else 0 for dow in day_of_week]) - - # Promotion and holiday effects - promo_effect = promo * 25 - holiday_effect = holiday * 15 - - # Combine effects - sales = 100 + trend + seasonality + promo_effect + holiday_effect - - # Add noise - sales += np.random.normal(0, 10, n_samples) - - # Ensure sales are positive - sales = np.maximum(0, sales) - - # Create DataFrame - data = pd.DataFrame( - { - "date": dates, - "sales": np.round(sales, 2), - "promo": promo.astype(int), - "holiday": holiday.astype(int), - "day_of_week": day_of_week, - } - ) - - return data - - -def verify_prediction(prediction, expected_schema=None): - """Verify that a prediction matches expected format.""" - assert isinstance(prediction, dict), "Prediction should be a dictionary" - assert len(prediction) > 0, "Prediction should not be empty" - - if expected_schema: - schema_keys = getattr(expected_schema, "model_fields", None) - if schema_keys is not None: - schema_keys = set(schema_keys.keys()) - else: - schema_keys = set(expected_schema.keys()) - - assert ( - set(prediction.keys()) == schema_keys - ), f"Prediction keys {prediction.keys()} don't match schema keys {schema_keys}" - - # Check first value to ensure it's of expected type - output_value = list(prediction.values())[0] - if isinstance(output_value, list): - # If the output is a list, check that it's not empty - assert len(output_value) > 0, "Prediction list should not be empty" - else: - # If output is not a list, check that it's one of the expected types - assert isinstance( - output_value, (int, float, str) - ), f"Prediction value should be numeric, string, or list, got {type(output_value)}" - - -def verify_model_description(description): - """Verify that a model description contains expected fields.""" - assert isinstance(description, ModelDescription), "Model description should be a 'ModelDescription' object" - required_fields = ["intent", "schemas", "code"] - for field in required_fields: - assert hasattr(description, field), f"Model description missing required field: {field}" - - -def cleanup_files(model_dir=None): - """Clean up any files created during tests.""" - files_to_clean = [ - "plexe.log", - "*.pmb", - "*.tar.gz", - ] - - # Clean up files in current directory - for pattern in files_to_clean: - try: - for file in Path(".").glob(pattern): - if file.is_file(): - file.unlink() - except Exception as e: - print(f"Failed to clean up 
{pattern}: {e}") - - # Clean up files in model directory - if model_dir is not None and Path(model_dir).exists(): - try: - # Use rmtree to recursively remove directory and contents - shutil.rmtree(model_dir, ignore_errors=True) - except Exception as e: - print(f"Failed to clean up {model_dir}: {e}") - # If rmtree fails, try to at least clean individual files - for file in Path(model_dir).glob("*"): - try: - if file.is_file(): - file.unlink(missing_ok=True) - elif file.is_dir(): - shutil.rmtree(file, ignore_errors=True) - except Exception as e: - print(f"Failed to clean up {file}: {e}")