php-llm-agent/lib/actions/wikipedia.action.php

<?php
/**
 * ACTIONS: Wikipedia Integration
 * ------------------------------
 * @wikiSummary <Title>      → returns the lead/summary paragraph
 * @wikiFullArticle <Title>  → returns the full article as plain text
 *
 * Examples:
 *   @wikiSummary Finland
 *   @wikiFullArticle Bitcoin
 */
// --- Register actions --------------------------------------------------------
registerAction(
    (new Action())
        ->setFunctionName("wikiSummary")
        ->setArgumentsTypes(["string"]) // title
);
registerAction(
    (new Action())
        ->setFunctionName("wikiFullArticle")
        ->setArgumentsTypes(["string"]) // title
);
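// How a registered action is exercised (a sketch; assumes the framework passes
// the text after the action name as the single string argument):
//   user message: "@wikiSummary Finland"       →  wikiSummary("Finland")
//   user message: "@wikiFullArticle New-York"  →  wikiFullArticle("New-York")
// Hyphens in multi-word titles are mapped back to spaces inside each action.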
// --- Helpers ---------------------------------------------------------------
/**
 * Basic HTTP GET via cURL.
 *
 * @return array [int $httpCode, ?string $body, ?string $err]
 */
function http_get(string $url, int $timeout = 6): array {
    $ch = curl_init($url);
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_TIMEOUT        => $timeout,
        CURLOPT_CONNECTTIMEOUT => 4,
        CURLOPT_USERAGENT      => "LLM-Action-Demo/1.0 (+https://example.com)",
    ]);
    $body = curl_exec($ch);
    $err  = curl_error($ch) ?: null;
    $code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);
    // curl_exec() returns false on failure; normalize to null to match the docblock.
    return [$code, $body === false ? null : $body, $err];
}
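/*
 * Quick smoke test for http_get (a sketch; run manually from the CLI, not part
 * of the action flow):
 *
 *   [$code, $body, $err] = http_get("https://en.wikipedia.org/api/rest_v1/page/summary/Finland");
 *   // expected on success: $code → 200, $err → null, $body → JSON with an "extract" field
 */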
/** Safely pick the first (and only) page object from a MediaWiki Action API response. */
function mw_first_page(array $json): ?array {
    if (!isset($json['query']['pages']) || !is_array($json['query']['pages'])) return null;
    foreach ($json['query']['pages'] as $page) {
        return $page; // pages are keyed by pageid; we only requested one title
    }
    return null;
}
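/*
 * Shape this helper expects (with format=json the Action API keys pages by
 * pageid, so the single requested page sits under an unknown numeric key;
 * pageids below are illustrative):
 *
 *   {"query":{"pages":{"12345":{"pageid":12345,"title":"Finland","extract":"..."}}}}
 *
 * Missing titles come back under the key "-1" with a "missing" marker:
 *
 *   {"query":{"pages":{"-1":{"title":"No such page","missing":""}}}}
 */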
// --- Actions ---------------------------------------------------------------
/**
 * @param string $title Wikipedia article title
 * @return string Lead/summary paragraph, or a human-readable error message
 */
function wikiSummary(string $title): string {
    $title   = str_replace("-", " ", $title); // action arguments arrive hyphenated; restore spaces
    $encoded = rawurlencode($title);
    $url     = "https://en.wikipedia.org/api/rest_v1/page/summary/{$encoded}";
    [$code, $body, $err] = http_get($url);
    if ($err) return "Error fetching summary for '{$title}': {$err}";
    if ($code < 200 || $code >= 300 || !$body) return "HTTP {$code}: Failed to fetch summary for '{$title}'.";
    $data = json_decode($body, true);
    if (isset($data['extract']) && is_string($data['extract']) && $data['extract'] !== '') {
        return $data['extract'];
    }
    // Common "not found" / disambiguation handling
    if (!empty($data['type']) && $data['type'] === 'disambiguation') {
        return "'{$title}' is a disambiguation page. Try a more specific title.";
    }
    return "No summary found for '{$title}'.";
}
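/*
 * Example REST summary response handled above (fields abridged; illustrative):
 *
 *   {"type":"standard","title":"Finland","extract":"Finland, officially the
 *    Republic of Finland, is a Nordic country ..."}
 *
 * Disambiguation pages return "type":"disambiguation", which the branch above
 * turns into a "try a more specific title" hint.
 */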
/**
 * Returns the full article as plain text (sections + paragraphs).
 * Uses the MediaWiki Action API with plaintext extracts.
 *
 * @param string $title
 * @return string
 */
function wikiFullArticle(string $title): string {
    $title   = str_replace("-", " ", $title); // action arguments arrive hyphenated; restore spaces
    $encoded = rawurlencode($title);
    $url = "https://en.wikipedia.org/w/api.php"
         . "?action=query"
         . "&prop=extracts"
         . "&explaintext=1"
         . "&exsectionformat=plain"
         . "&format=json"
         . "&redirects=1"
         . "&titles={$encoded}";
    [$code, $body, $err] = http_get($url, 12);
    if ($err) return "Error fetching article for '{$title}': {$err}";
    if ($code < 200 || $code >= 300 || !$body) return "HTTP {$code}: Failed to fetch article for '{$title}'.";
    // json_decode() can return null (or a scalar) on malformed input; guard
    // before handing it to mw_first_page(), which requires an array.
    $json = json_decode($body, true);
    $page = is_array($json) ? mw_first_page($json) : null;
    if (!$page || isset($page['missing'])) {
        return "No article found for '{$title}'.";
    }
    if (!isset($page['extract']) || trim($page['extract']) === '') {
        return "Article exists but has no plain-text extract for '{$title}'.";
    }
    // Trim extremely long responses to keep them LLM-friendly.
    $maxChars = 40000; // adjust for your pipeline
    $text = $page['extract'];
    if (mb_strlen($text, 'UTF-8') > $maxChars) {
        $text = mb_substr($text, 0, $maxChars, 'UTF-8') . "\n\n[Truncated]";
    }
    return $text;
}
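/*
 * Direct invocation examples (bypassing the agent, e.g. for manual testing):
 *
 *   echo wikiSummary("Finland"), PHP_EOL;
 *   echo mb_substr(wikiFullArticle("Bitcoin"), 0, 300), PHP_EOL; // first 300 chars
 */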