Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 58 additions & 1 deletion cmd/slack-bot/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (

userv1 "github.com/openshift/api/user/v1"

"github.com/openshift/ci-tools/pkg/chaibot"
"github.com/openshift/ci-tools/pkg/jira"
"github.com/openshift/ci-tools/pkg/pagerdutyutil"
eventhandler "github.com/openshift/ci-tools/pkg/slack/events"
Expand Down Expand Up @@ -66,6 +67,9 @@ type options struct {
requireWorkflowsInForum bool
supportRequestChannelID string
supportRequestThreshold int

enableTriage bool
triageConfigPath string
}

func (o *options) Validate() error {
Expand Down Expand Up @@ -117,6 +121,8 @@ func gatherOptions(fs *flag.FlagSet, args ...string) options {
fs.BoolVar(&o.requireWorkflowsInForum, "require-workflows-in-forum", true, "Require the use of workflows in the designated forum channel")
fs.StringVar(&o.supportRequestChannelID, "support-request-channel-id", "CBN38N3MW", "Channel ID where support request mode watches long threads (defaults to #forum-ocp-testplatform)")
fs.IntVar(&o.supportRequestThreshold, "support-request-threshold", 12, "Create a support-request Jira when a thread has more than this many messages (total count includes the root message)")
fs.BoolVar(&o.enableTriage, "enable-triage", false, "Enable Chaibot automatic test failure triage")
fs.StringVar(&o.triageConfigPath, "triage-config-path", "", "Path to triage configuration file")
Comment thread
coderabbitai[bot] marked this conversation as resolved.

if err := fs.Parse(args); err != nil {
logrus.WithError(err).Fatal("Could not parse args.")
Expand Down Expand Up @@ -198,6 +204,57 @@ func main() {
}
}

// Initialize Chaibot if enabled
var chaibotAnalyzer *chaibot.Analyzer
var chaibotChannels []string
if o.enableTriage {
// Fail fast if required config is missing
if o.triageConfigPath == "" {
logrus.Fatal("--enable-triage requires --triage-config-path to be set")
}

mcpURL := os.Getenv("SHIP_HELP_MCP_URL")
mcpToken := os.Getenv("SHIP_HELP_MCP_TOKEN")

// Fail fast if required env vars are missing
if mcpURL == "" || mcpToken == "" {
logrus.Fatal("--enable-triage requires both SHIP_HELP_MCP_URL and SHIP_HELP_MCP_TOKEN environment variables")
}

// triageConfig mirrors the YAML document read from --triage-config-path.
// It is decoded with yaml.Unmarshal right after the file is read.
type triageConfig struct {
// Enabled is decoded from the `enabled` key.
// NOTE(review): the visible initialization code gates only on the
// --enable-triage flag and never reads this field — confirm whether
// `enabled: false` in the file is meant to switch triage off.
Enabled bool `yaml:"enabled"`
// MonitoredChannels lists the Slack channels to watch; every ChannelID is
// appended to chaibotChannels and handed to the event router.
MonitoredChannels []struct {
// Name is not read by the visible code; presumably a human-readable
// label for the channel — verify against the config file.
Name string `yaml:"name"`
// ChannelID is the value collected into chaibotChannels.
ChannelID string `yaml:"channel_id"`
} `yaml:"monitored_channels"`
// Analysis holds the settings for the failure analyzer.
Analysis struct {
// AIProvider is only emitted in the startup log line in the visible code.
AIProvider string `yaml:"ai_provider"`
// PromptTemplate is passed as the third argument to chaibot.NewAnalyzer.
PromptTemplate string `yaml:"prompt_template"`
} `yaml:"analysis"`
}

configData, err := os.ReadFile(o.triageConfigPath)
if err != nil {
logrus.WithError(err).Fatal("Failed to read triage config")
}

var cfg triageConfig
if err := yaml.Unmarshal(configData, &cfg); err != nil {
logrus.WithError(err).Fatal("Failed to parse triage config")
}

chaibotAnalyzer = chaibot.NewAnalyzer(mcpURL, mcpToken, cfg.Analysis.PromptTemplate)

for _, ch := range cfg.MonitoredChannels {
chaibotChannels = append(chaibotChannels, ch.ChannelID)
}

logrus.WithFields(logrus.Fields{
"channels": len(chaibotChannels),
"provider": cfg.Analysis.AIProvider,
}).Info("Chaibot triage enabled")
}

metrics.ExposeMetrics("slack-bot", config.PushGateway{}, o.instrumentationOptions.MetricsPort)
simplifier := simplifypath.NewSimplifier(l("", // shadow element mimicing the root
l(""), // for black-box health checks
Expand All @@ -215,7 +272,7 @@ func main() {
// handle the root to allow for a simple uptime probe
mux.Handle("/", handler(http.HandlerFunc(func(writer http.ResponseWriter, request *http.Request) { writer.WriteHeader(http.StatusOK) })))
mux.Handle("/slack/interactive-endpoint", handler(handleInteraction(secret.GetTokenGenerator(o.slackSigningSecretPath), interactionrouter.ForModals(issueFiler, slackClient))))
mux.Handle("/slack/events-endpoint", handler(handleEvent(secret.GetTokenGenerator(o.slackSigningSecretPath), eventrouter.ForEvents(slackClient, issueFiler, kubeClient, configAgent.Config, gcsClient, keywordsConfig, o.helpdeskAlias, o.forumChannelId, o.reviewRequestWorkflowID, o.namespace, o.supportRequestChannelID, o.supportRequestThreshold, o.requireWorkflowsInForum))))
mux.Handle("/slack/events-endpoint", handler(handleEvent(secret.GetTokenGenerator(o.slackSigningSecretPath), eventrouter.ForEvents(slackClient, issueFiler, kubeClient, configAgent.Config, gcsClient, keywordsConfig, o.helpdeskAlias, o.forumChannelId, o.reviewRequestWorkflowID, o.namespace, o.supportRequestChannelID, o.supportRequestThreshold, o.requireWorkflowsInForum, chaibotAnalyzer, chaibotChannels))))
server := &http.Server{Addr: ":" + strconv.Itoa(o.port), Handler: mux}

health.ServeReady()
Expand Down
182 changes: 182 additions & 0 deletions pkg/chaibot/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
# Chaibot - Ship-Help MCP Integration

This package provides test failure analysis using Chai Bot (ship-help MCP) for automatic Slack triage.

## Overview

Chaibot monitors Slack channels for Prow CI job failure URLs and automatically posts analysis using the Chai Bot service via ship-help MCP.

## Files in This PR

- `pkg/chaibot/analyzer.go` - Ship-help MCP client implementation
- `pkg/chaibot/analyzer_test.go` - Unit tests
- `pkg/slack/events/chaibot/handler.go` - Slack event handler (monitors for Prow URLs)
- `pkg/slack/events/chaibot/handler_test.go` - Event handler tests
- `pkg/slack/events/router/router.go` - Updated to register Chaibot handler
- `cmd/slack-bot/main.go` - Updated with Chaibot initialization

**This PR provides the complete implementation.** The integration is ready to use once deployed.

## How It Works

### 1. Event Handler Pattern (Already Implemented)

Chaibot uses the existing event handler pattern in `openshift/ci-tools`:

**Implementation files:**
- `pkg/slack/events/chaibot/handler.go` - Monitors Slack messages for Prow URLs
- Registered in `pkg/slack/events/router/router.go`
- Initialized in `cmd/slack-bot/main.go`

**What the handler does:**
1. Monitors configured Slack channels (e.g., `#opp-discussion`)
2. Detects Prow CI job URLs in messages
3. Calls `analyzer.AnalyzeFailure()` asynchronously
4. Posts analysis results in a thread

### 2. Initialization in cmd/slack-bot/main.go

**Already implemented in this PR:**

```go
// Command-line flags (added)
--enable-triage // Enable Chaibot
--triage-config-path // Path to triage-config.yaml

// Initialization (added to main())
if o.enableTriage {
	// Startup aborts if --triage-config-path, SHIP_HELP_MCP_URL,
	// or SHIP_HELP_MCP_TOKEN is missing.
	mcpURL := os.Getenv("SHIP_HELP_MCP_URL")
	mcpToken := os.Getenv("SHIP_HELP_MCP_TOKEN")

	// Create analyzer; the prompt template is read from the triage config file
	chaibotAnalyzer = chaibot.NewAnalyzer(mcpURL, mcpToken, cfg.Analysis.PromptTemplate)

	// Handler is registered in router.ForEvents()
}
```

### 3. Event Router Registration

**Already implemented in pkg/slack/events/router/router.go:**

```go
func ForEvents(client *slack.Client, ..., chaibotAnalyzer *chaibot.Analyzer, chaibotChannels []string) {
// ... existing handlers ...

if chaibotAnalyzer != nil && len(chaibotChannels) > 0 {
handlers = append(handlers, chaibothandler.Handler(client, chaibotAnalyzer, chaibotChannels))
}
}
```

## NOT in This PR (Requires openshift/release Configuration)

The following configuration files are in **openshift/release#80559**, not this PR:

- `core-services/ci-chat-bot/triage-config.yaml` - Chaibot configuration
- `clusters/app.ci/ci-chat-bot/chaibot-configmap.yaml` - Kubernetes ConfigMap
- `clusters/app.ci/ci-chat-bot/ci-chat-bot.yaml` - Deployment with environment variables
- `core-services/ci-secret-bootstrap/chaibot-secret-config.yaml` - Ship-help token secret

## Usage

Once both PRs are merged and deployed:

1. **Post a Prow URL in a monitored channel:**
```
Job failed: https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-stolostron-policy-collection-main-ocp4.22-interop-opp-aws/2066255424226594816
```

2. **Chaibot responds in a thread within 30-60 seconds** with:
- Which step(s) failed
- Root cause analysis (product bug, test issue, or infrastructure)
- Related Jira tickets
- Pass rate history
- Recommended fixes

## Configuration

**Deployment configuration is in openshift/release#80559:**

- `core-services/ci-chat-bot/triage-config.yaml` - Main config:
- Monitored channels (e.g., `#opp-discussion`)
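- Ship-help MCP endpoint (note: the bot itself reads the endpoint from the `SHIP_HELP_MCP_URL` environment variable, not from this file)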
- Analysis prompt template
- Rate limiting settings

- `clusters/app.ci/ci-chat-bot/ci-chat-bot.yaml` - Deployment:
- Environment variables: `SHIP_HELP_MCP_URL`, `SHIP_HELP_MCP_TOKEN`
- ConfigMap mount: `/etc/triage-config/triage-config.yaml`

## How to Enable Chaibot

Chaibot is activated by **command-line flags**; setting the environment variables alone does not enable it, but they are required once it is enabled:

**Command-line flags (required):**
- `--enable-triage` - Enable Chaibot functionality
- `--triage-config-path=/etc/triage-config/triage-config.yaml` - Path to config file

**Environment variables (required):**
- `SHIP_HELP_MCP_URL` - Ship-help MCP endpoint
- `SHIP_HELP_MCP_TOKEN` - Authentication token (from Kubernetes secret)

**Example deployment command:**
```yaml
# In clusters/app.ci/ci-chat-bot/ci-chat-bot.yaml
args:
- --enable-triage
- --triage-config-path=/etc/triage-config/triage-config.yaml
env:
- name: SHIP_HELP_MCP_URL
value: "https://ship-help-mcp-continuous-release-tooling--ship-help-bot.apps.gpc.ocp-hub.prod.psi.redhat.com/personas/ocp_ai_helpdesk/mcp"
- name: SHIP_HELP_MCP_TOKEN
valueFrom:
secretKeyRef:
name: cluster-secrets-chaibot-ship-help
key: ship-help-token
```

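**Without `--enable-triage`, Chaibot will NOT activate** - even if the environment variables are set. With `--enable-triage`, the bot exits at startup if `--triage-config-path`, `SHIP_HELP_MCP_URL`, or `SHIP_HELP_MCP_TOKEN` is missing.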

## Related PRs

- **This PR (openshift/ci-tools#5251)** - Chaibot implementation (analyzer, handler, router, main.go)
- **openshift/release#80559** - Configuration and deployment (config files, secrets, ConfigMaps)
- Based on `/analyze-failure` skill by MPEX Integrity team
- Alternative to PR openshift/release#80476 (OpenAI approach)

## Architecture

```
User posts Prow URL in Slack
        ↓
Slack Event API → ci-chat-bot deployment
        ↓
pkg/slack/events/chaibot/handler.go
  - Detects Prow URL
  - Extracts job URL
        ↓
pkg/chaibot/analyzer.go
  - Calls ship-help MCP (ask_persona tool)
  - Sends prompt with job URL
        ↓
Ship-Help MCP (ocp_ai_helpdesk persona)
  - Searches Jira, Sippy, Prow logs
  - Analyzes failure
  - Returns comprehensive analysis
        ↓
pkg/chaibot/analyzer.go
  - Formats response as Slack Block Kit
        ↓
Slack API
  - Posts analysis in thread
```

## Cost Comparison

| Solution | Cost | Data Sources |
|----------|------|--------------|
| **Chaibot (ship-help MCP)** | $0/month | 9+ sources (Jira, Sippy, Prow, GitHub, etc.) |
| OpenAI GPT-4o (PR #80476) | ~$90/month (~$1,080/year) | 3 sources (limited context) |

**Chaibot uses internal Red Hat infrastructure** - no external API costs.
Loading