Compare commits

...

8 Commits

Author SHA1 Message Date
Kota-Yamaguchi
b2e94e49cd
Merge c03f8f55ed into 5ff02b469f 2024-11-15 00:13:48 +09:00
Kota-Yamaguchi
c03f8f55ed Add gitignore 2024-11-10 22:35:56 +09:00
Kota-Yamaguchi
a42a176798 Update bin 2024-11-10 22:33:55 +09:00
Kota-Yamaguchi
e768497a23 docs: add README for code generator evaluator 2024-11-10 22:06:23 +09:00
Kota-Yamaguchi
36ec1acf7f Add executables for multi-platform build:
- Windows (.exe)
- macOS (.app)
- Linux (binary)
2024-11-10 22:00:11 +09:00
Kota-Yamaguchi
6524cfca99 add golang dependencies 2024-11-10 21:59:19 +09:00
Kota-Yamaguchi
6d2f6b6372 add testcases 2024-11-10 21:58:57 +09:00
Kota-Yamaguchi
d5afda533f Add evaluation metrics for code generator 2024-11-10 21:58:47 +09:00
20 changed files with 748 additions and 0 deletions

View File

@ -0,0 +1,7 @@
# MODEL_PROVIDER=anthropic
# MODEL_NAME=claude-3-5-sonnet-20241022
MODEL_PROVIDER=openai
MODEL_NAME=gpt-4o-mini
CODE_EXECUTION_ENDPOINT=http://127.0.0.1:8194
CODE_EXECUTION_API_KEY=dify-sandbox
CONSOLE_API_URL=http://127.0.0.1:5001

2
evaluate/code-generator/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
.env
.env.local

View File

@ -0,0 +1,68 @@
# Code Generator Evaluator
## Getting Started
1. Move to the evaluator directory
```bash
cd dify/evaluate/code-generator
```
2. Set up your `.env` file with required variables
```bash
cp .env.example .env
```
3. Add your test cases to `testdata/testcases.json`
4. Execute the evaluator
```bash
# For Linux
./bin/evaluate-code-linux
# For macOS (Intel)
./bin/evaluate-code-mac
# For macOS (Apple Silicon)
./bin/evaluate-code-mac-arm64
# For Windows
./bin/evaluate-code.exe
```
## Build Instructions
### 1. Prepare Build Script
First, grant execution permissions to the build script:
```bash
chmod +x build.sh
```
### 2. Prerequisites
- Go 1.23 or higher (the module's `go.mod` declares `go 1.23.0`)
- Properly configured `GOPATH`
### 3. Build Process
Run the cross-platform build with the following command:
```bash
./build.sh
```
## Running the Evaluator
Execute the Code Generator evaluation on your platform using:
```bash
# For Linux
./bin/evaluate-code-linux
# For macOS (Intel)
./bin/evaluate-code-mac
# For macOS (Apple Silicon)
./bin/evaluate-code-mac-arm64
# For Windows
./bin/evaluate-code.exe
```

View File

@ -0,0 +1,62 @@
package auth
import (
"bytes"
"encoding/json"
"fmt"
"net/http"
"os"
"time"
)
// LoginRequest is the JSON payload that Login posts to the console login
// endpoint.
type LoginRequest struct {
// Email is serialized as the "email" field.
Email string `json:"email"`
// Password is serialized as the "password" field.
Password string `json:"password"`
}
// LoginResponse mirrors the JSON body that Login decodes from the console
// login endpoint.
type LoginResponse struct {
// Result is compared against "success" by Login to decide whether the
// login worked.
Result string `json:"result"`
// Data carries the issued tokens; Login returns only AccessToken.
Data struct {
AccessToken string `json:"access_token"`
RefreshToken string `json:"refresh_token"`
} `json:"data"`
}
// Login authenticates against the console API and returns the access token.
//
// The base URL is read from the CONSOLE_API_URL environment variable and the
// credentials are posted as JSON to <base>/console/api/login with a 10 second
// client timeout.
//
// An error is returned when the variable is unset, the request cannot be
// built or sent, the server answers with a non-200 status, the body is not
// valid JSON, the "result" field is not "success", or the response carries no
// access token.
func Login(email, password string) (string, error) {
	baseURL := os.Getenv("CONSOLE_API_URL")
	if baseURL == "" {
		// Fail early with a clear message instead of an opaque
		// "unsupported protocol scheme" transport error.
		return "", fmt.Errorf("CONSOLE_API_URL is not set")
	}

	loginJSON, err := json.Marshal(LoginRequest{Email: email, Password: password})
	if err != nil {
		return "", fmt.Errorf("failed to convert to JSON: %w", err)
	}

	loginReq, err := http.NewRequest(http.MethodPost, baseURL+"/console/api/login", bytes.NewReader(loginJSON))
	if err != nil {
		return "", fmt.Errorf("failed to create request: %w", err)
	}
	loginReq.Header.Set("Content-Type", "application/json")

	client := &http.Client{Timeout: 10 * time.Second}
	loginResp, err := client.Do(loginReq)
	if err != nil {
		return "", fmt.Errorf("failed to send request: %w", err)
	}
	defer loginResp.Body.Close()

	// Check the status before decoding so an error payload is never mistaken
	// for a login result.
	if loginResp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("login failed: unexpected status %s", loginResp.Status)
	}

	var loginResult LoginResponse
	if err := json.NewDecoder(loginResp.Body).Decode(&loginResult); err != nil {
		return "", fmt.Errorf("failed to decode response: %w", err)
	}
	if loginResult.Result != "success" {
		return "", fmt.Errorf("login failed: result is %q", loginResult.Result)
	}
	if loginResult.Data.AccessToken == "" {
		return "", fmt.Errorf("login failed: response contains no access token")
	}
	return loginResult.Data.AccessToken, nil
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,17 @@
#!/bin/bash
# Cross-compiles the evaluator in ./cmd/ for Linux, macOS (Intel and Apple
# Silicon) and Windows, writing the binaries to ./bin.

# Stop at the first failing command so a broken build never reaches the
# "Build complete!" message and the script exits non-zero.
set -euo pipefail

mkdir -p bin

echo "Building for Linux (amd64)..."
GOOS=linux GOARCH=amd64 go build -o bin/evaluate-code-linux ./cmd/

echo "Building for macOS (amd64)..."
GOOS=darwin GOARCH=amd64 go build -o bin/evaluate-code-mac ./cmd/

echo "Building for macOS (arm64)..."
GOOS=darwin GOARCH=arm64 go build -o bin/evaluate-code-mac-arm64 ./cmd/

echo "Building for Windows (amd64)..."
GOOS=windows GOARCH=amd64 go build -o bin/evaluate-code.exe ./cmd/

echo "Build complete! Binaries are in the bin directory."

View File

@ -0,0 +1,121 @@
package main
import (
"evaluate/auth"
"evaluate/coderuntime"
"evaluate/testdata"
"fmt"
"os"
"strings"
"syscall"
"github.com/joho/godotenv"
"golang.org/x/term"
)
// main runs the code-generator evaluation end to end.
//
// It loads ./.env, prompts for console credentials, logs in, loads the test
// cases from ./testdata/testcases.json, and for each case asks the console API
// to generate code, runs that code in the sandbox and compares the
// whitespace-normalized output with the case's ground truth. A summary is
// printed at the end. Any setup failure is reported on stdout and ends the run.
func main() {
	if err := godotenv.Load("./.env"); err != nil {
		fmt.Printf("Failed to load .env file: %v\n", err)
		return
	}

	fmt.Print("Please enter your email address: ")
	var email string
	if _, err := fmt.Scanln(&email); err != nil {
		fmt.Printf("Failed to read email address: %v\n", err)
		return
	}

	fmt.Print("Please enter your password: ")
	password, err := term.ReadPassword(int(syscall.Stdin))
	if err != nil {
		fmt.Printf("\nFailed to read password: %v\n", err)
		return
	}
	fmt.Println()

	// The login error must be checked here: continuing with an empty token
	// would make every test case fail with a misleading error.
	accessToken, err := auth.Login(email, string(password))
	if err != nil {
		fmt.Printf("Failed to log in: %v\n", err)
		return
	}

	testCases, err := testdata.LoadTestCases("./testdata/testcases.json")
	if err != nil {
		fmt.Printf("Failed to load test cases: %v\n", err)
		return
	}

	metrics := testdata.NewTestMetrics()
	modelProvider := os.Getenv("MODEL_PROVIDER")
	modelName := os.Getenv("MODEL_NAME")

	fmt.Println("━━━━━━━━━━━━━━━━━━━━━━━━━━━")
	fmt.Printf("📱 Model Provider: %s\n", modelProvider)
	fmt.Printf("🤖 Model Name: %s\n", modelName)
	fmt.Println("━━━━━━━━━━━━━━━━━━━━━━━━━━━")

	// Spaces and newlines are stripped from both sides before comparison so
	// JSON formatting differences do not count as mismatches.
	normalizer := strings.NewReplacer(" ", "", "\n", "")

	for _, tc := range testCases {
		fmt.Printf("\nExecuting test case: %s\n", tc.Name)

		codegenRequest := coderuntime.GenerateCodeRequest{
			Instruction:  tc.Instruction,
			CodeLanguage: tc.CodeLanguage,
			NoVariable:   false,
			ModelConfig: coderuntime.ModelConfig{
				Provider: modelProvider,
				Name:     modelName,
				Mode:     "chat",
				CompletionParams: coderuntime.CompletionParams{
					Temperature:      0.7,
					MaxTokens:        0,
					TopP:             0,
					Echo:             false,
					Stop:             []string{},
					PresencePenalty:  0,
					FrequencyPenalty: 0,
				},
			},
		}

		generatedCode, err := coderuntime.GenerateCode(
			codegenRequest,
			coderuntime.AccessToken{Value: accessToken},
		)
		// A response can decode cleanly and still carry an error message.
		if err == nil && generatedCode.Error != "" {
			err = fmt.Errorf("code generation failed: %s", generatedCode.Error)
		}
		if err != nil {
			metrics.AddResult(testdata.TestResult{
				TestCase: tc,
				Success:  false,
				Error:    err,
			})
			continue
		}

		// ExecuteCode selects its runner by "python3", not "python".
		language := generatedCode.Language
		if language == "python" {
			language += "3"
		}

		request := coderuntime.SandboxRequest{
			Language:      language,
			Code:          generatedCode.Code,
			EnableNetwork: true,
		}

		// ExecuteCode reports failures both through its error return
		// (setup/transport) and through result.Error (execution); honour both.
		result, err := coderuntime.ExecuteCode(request, tc.Inputs)
		if err == nil {
			err = result.Error
		}
		if err != nil {
			metrics.AddResult(testdata.TestResult{
				TestCase: tc,
				Success:  false,
				Error:    err,
			})
			continue
		}

		normalizedResult := normalizer.Replace(result.Body)
		normalizedTruth := normalizer.Replace(tc.GroundTruth)
		metrics.AddResult(testdata.TestResult{
			TestCase:    tc,
			Success:     normalizedResult == normalizedTruth,
			ActualValue: result.Body,
		})
	}

	metrics.Finish()
	metrics.PrintSummary()
}

View File

@ -0,0 +1,142 @@
package coderuntime
import (
"bytes"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"strings"
"time"
)
// SandboxRequest is the JSON body that ExecuteCode posts to the sandbox's
// /v1/sandbox/run endpoint.
type SandboxRequest struct {
// Language selects the runner template in ExecuteCode ("python3" or
// "javascript").
Language string `json:"language"`
// Code is the source to run; ExecuteCode sends the wrapped runner script
// in this field.
Code string `json:"code"`
// Preload is omitted from the JSON when empty.
Preload string `json:"preload,omitempty"`
EnableNetwork bool `json:"enable_network"`
}
// ExecutionResult is what ExecuteCode reports for one sandbox run.
type ExecutionResult struct {
// StatusCode is the HTTP status of the sandbox response.
StatusCode int
// Body is the raw response body, replaced by the pretty-printed extracted
// result when extraction succeeds.
Body string
// Error is set when the sandbox reports an execution error or the result
// markers cannot be processed.
Error error
}
// ExtractResult pulls the JSON object wrapped between the first and the last
// "<<RESULT>>" marker out of response and returns it re-indented.
//
// It returns an error when fewer than two markers are present, when the text
// between them is not a JSON object, or when re-encoding fails.
func ExtractResult(response string) (string, error) {
	const resultTag = "<<RESULT>>"

	openIdx := strings.Index(response, resultTag)
	closeIdx := strings.LastIndex(response, resultTag)
	// With no marker both indexes are -1; with a single marker they are equal.
	// In either case there is no payload to slice out, and slicing anyway
	// would panic with an inverted range.
	if openIdx == -1 || closeIdx < openIdx+len(resultTag) {
		return "", fmt.Errorf("invalid result format")
	}
	jsonStr := response[openIdx+len(resultTag) : closeIdx]

	var result map[string]interface{}
	if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
		return "", fmt.Errorf("failed to parse JSON: %w", err)
	}

	// Format output
	prettyJSON, err := json.MarshalIndent(result, "", " ")
	if err != nil {
		return "", fmt.Errorf("failed to format JSON: %w", err)
	}
	return string(prettyJSON), nil
}
// ExecuteCode wraps request.Code in the language-specific runner script,
// posts it to the sandbox at CODE_EXECUTION_ENDPOINT + "/v1/sandbox/run"
// (authenticated with CODE_EXECUTION_API_KEY) and reports the outcome.
//
// The error return covers setup and transport problems: missing environment
// variables, an unsupported language ("python3" and "javascript" are
// accepted), template/JSON/request failures and an unparsable response.
// Problems reported by the sandbox itself are returned in the result's Error
// field with a nil error: a non-200 status, a non-zero response code, an
// execution error, or output without a valid <<RESULT>> payload. On success
// Body holds the pretty-printed result; otherwise it holds the raw response
// body.
func ExecuteCode(request SandboxRequest, inputs map[string]interface{}) (ExecutionResult, error) {
	apiKey := os.Getenv("CODE_EXECUTION_API_KEY")
	endpoint := os.Getenv("CODE_EXECUTION_ENDPOINT")
	if apiKey == "" || endpoint == "" {
		// The message reads "the required environment variables are not set".
		fmt.Println("必要な環境変数が設定されていません")
		return ExecutionResult{}, fmt.Errorf("missing required environment variables")
	}

	var transformer TemplateTransformer
	switch request.Language {
	case "python3":
		transformer = NewPython3TemplateTransformer()
	case "javascript":
		transformer = NewJavaScriptTemplateTransformer()
	default:
		return ExecutionResult{}, fmt.Errorf("unsupported language: %s", request.Language)
	}

	finalCode, preload, err := transformer.TransformCaller(request.Code, inputs)
	if err != nil {
		return ExecutionResult{}, fmt.Errorf("failed to transform code: %w", err)
	}

	jsonData, err := json.Marshal(SandboxRequest{
		Language:      request.Language,
		Code:          finalCode,
		Preload:       preload,
		EnableNetwork: request.EnableNetwork,
	})
	if err != nil {
		return ExecutionResult{}, fmt.Errorf("failed to convert to JSON: %w", err)
	}

	req, err := http.NewRequest(http.MethodPost, endpoint+"/v1/sandbox/run", bytes.NewReader(jsonData))
	if err != nil {
		return ExecutionResult{}, fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("X-Api-Key", apiKey)

	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return ExecutionResult{}, fmt.Errorf("failed to send request: %w", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return ExecutionResult{}, fmt.Errorf("failed to read response: %w", err)
	}

	result := ExecutionResult{
		StatusCode: resp.StatusCode,
		Body:       string(body),
	}

	// A non-200 answer must not be treated as a successful run whose raw body
	// then gets compared with the expected output.
	if resp.StatusCode != http.StatusOK {
		result.Error = fmt.Errorf("sandbox returned status %d: %s", resp.StatusCode, body)
		return result, nil
	}

	var response struct {
		Code    int    `json:"code"`
		Message string `json:"message"`
		Data    struct {
			Error  string `json:"error"`
			Stdout string `json:"stdout"`
		} `json:"data"`
	}
	if err := json.Unmarshal(body, &response); err != nil {
		return result, fmt.Errorf("failed to parse response: %w", err)
	}

	switch {
	case response.Code != 0:
		// NOTE(review): assumes the sandbox uses code 0 for success — confirm
		// against the sandbox API.
		result.Error = fmt.Errorf("sandbox error %d: %s", response.Code, response.Message)
	case response.Data.Error != "":
		result.Error = fmt.Errorf("execution error: %s", response.Data.Error)
	default:
		prettyResult, err := ExtractResult(response.Data.Stdout)
		if err != nil {
			result.Error = fmt.Errorf("failed to process result: %w", err)
		} else {
			result.Body = prettyResult
		}
	}
	return result, nil
}

View File

@ -0,0 +1,73 @@
package coderuntime
import (
"bytes"
"encoding/json"
"fmt"
"net/http"
"os"
)
// GenerateCode asks the console API to generate code for request.
//
// The request is posted as JSON to
// CONSOLE_API_URL + "/console/api/rule-code-generate" with accessToken.Value
// as the bearer token, and the JSON response is decoded into a
// GenerateCodeResponse.
//
// It returns an error when CONSOLE_API_URL is unset, the request cannot be
// encoded, built or sent, the server answers with a non-200 status, or the
// body cannot be decoded. The returned pointer is nil whenever the error is
// non-nil.
func GenerateCode(request GenerateCodeRequest, accessToken AccessToken) (*GenerateCodeResponse, error) {
	baseURL := os.Getenv("CONSOLE_API_URL")
	if baseURL == "" {
		return nil, fmt.Errorf("CONSOLE_API_URL is not set")
	}

	jsonData, err := json.Marshal(request)
	if err != nil {
		return nil, fmt.Errorf("JSON encoding error: %w", err)
	}

	req, err := http.NewRequest(http.MethodPost, baseURL+"/console/api/rule-code-generate", bytes.NewReader(jsonData))
	if err != nil {
		return nil, fmt.Errorf("request creation error: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+accessToken.Value)

	// No client timeout is configured here, so the call waits as long as the
	// server takes to answer.
	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("request sending error: %w", err)
	}
	defer resp.Body.Close()

	// Without this check an error payload would decode into an empty response
	// and be returned as if generation had succeeded.
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected response status: %s", resp.Status)
	}

	var response GenerateCodeResponse
	if err := json.NewDecoder(resp.Body).Decode(&response); err != nil {
		return nil, fmt.Errorf("response decoding error: %w", err)
	}
	return &response, nil
}
// GenerateCodeRequest is the JSON body that GenerateCode posts to the
// rule-code-generate endpoint.
type GenerateCodeRequest struct {
// Instruction is serialized as "instruction".
Instruction string `json:"instruction"`
// CodeLanguage is serialized as "code_language".
CodeLanguage string `json:"code_language"`
// NoVariable is serialized as "no_variable"; its effect on the server is
// not visible here.
NoVariable bool `json:"no_variable"`
// ModelConfig is serialized as "model_config".
ModelConfig ModelConfig `json:"model_config"`
}
// AccessToken wraps the token string that GenerateCode sends in the
// "Authorization: Bearer ..." header.
type AccessToken struct {
Value string
}
// GenerateCodeResponse is the JSON body that GenerateCode decodes from the
// rule-code-generate endpoint.
type GenerateCodeResponse struct {
// Code is decoded from the "code" field.
Code string `json:"code"`
// Error is decoded from the "error" field.
// NOTE(review): presumably non-empty when generation failed — confirm
// against the API.
Error string `json:"error"`
// Language is decoded from the "language" field.
Language string `json:"language"`
}
// ModelConfig is the "model_config" part of a GenerateCodeRequest. Fields are
// serialized under the JSON names given in their tags.
type ModelConfig struct {
Provider string `json:"provider"`
Name string `json:"name"`
Mode string `json:"mode"`
CompletionParams CompletionParams `json:"completion_params"`
}
// CompletionParams is the "completion_params" part of a ModelConfig. Every
// field is always serialized (no omitempty), so zero values are sent as-is.
type CompletionParams struct {
Temperature float64 `json:"temperature"`
MaxTokens int `json:"max_tokens"`
TopP float64 `json:"top_p"`
Echo bool `json:"echo"`
// Stop is encoded as JSON null when nil and as [] when empty.
Stop []string `json:"stop"`
PresencePenalty float64 `json:"presence_penalty"`
FrequencyPenalty float64 `json:"frequency_penalty"`
}

View File

@ -0,0 +1,28 @@
package coderuntime
// JavaScriptTemplateTransformer embeds *BaseTemplateTransformer and supplies
// the JavaScript runner script through its own GetRunnerScript method.
type JavaScriptTemplateTransformer struct {
*BaseTemplateTransformer
}
// NewJavaScriptTemplateTransformer returns a JavaScript transformer whose
// embedded base is created with the transformer itself as its argument.
func NewJavaScriptTemplateTransformer() *JavaScriptTemplateTransformer {
	js := new(JavaScriptTemplateTransformer)
	js.BaseTemplateTransformer = NewBaseTemplateTransformer(js)
	return js
}
// GetRunnerScript returns the JavaScript runner template. The template holds
// the {{code}} placeholder, decodes the base64 {{inputs}} placeholder into an
// object, calls main with it and logs the JSON output between <<RESULT>>
// markers.
func (j *JavaScriptTemplateTransformer) GetRunnerScript() string {
	const runner = `
// declare main function
{{code}}
// decode and prepare input object
const inputs_obj = JSON.parse(Buffer.from('{{inputs}}', 'base64').toString('utf-8'))
// execute main function
const output_obj = main(inputs_obj)
// convert output to json and print
const output_json = JSON.stringify(output_obj, null, 4)
const result = '<<RESULT>>' + output_json + '<<RESULT>>'
console.log(result)
`
	return runner
}

View File

@ -0,0 +1,32 @@
package coderuntime
// Python3TemplateTransformer embeds *BaseTemplateTransformer and supplies the
// Python 3 runner script through its own GetRunnerScript method.
type Python3TemplateTransformer struct {
*BaseTemplateTransformer
}
// NewPython3TemplateTransformer returns a Python 3 transformer whose embedded
// base is created with the transformer itself as its argument.
func NewPython3TemplateTransformer() *Python3TemplateTransformer {
	py := new(Python3TemplateTransformer)
	py.BaseTemplateTransformer = NewBaseTemplateTransformer(py)
	return py
}
// GetRunnerScript returns the Python 3 runner template. The template holds
// the {{code}} placeholder, decodes the base64 {{inputs}} placeholder into a
// dict, calls main with it as keyword arguments and prints the JSON output
// between <<RESULT>> markers.
func (p *Python3TemplateTransformer) GetRunnerScript() string {
	const runner = `
# declare main function
{{code}}
import json
from base64 import b64decode
# decode and prepare input dict
inputs_obj = json.loads(b64decode('{{inputs}}').decode('utf-8'))
# execute main function
output_obj = main(**inputs_obj)
# convert output to json and print
output_json = json.dumps(output_obj, indent=4)
result = f'''<<RESULT>>{output_json}<<RESULT>>'''
print(result)
`
	return runner
}

View File

@ -0,0 +1,63 @@
package coderuntime
import (
"bytes"
"encoding/base64"
"encoding/json"
"strings"
)
// TemplateTransformer turns user code plus its inputs into a script that can
// be sent to the sandbox.
type TemplateTransformer interface {
// TransformCaller returns the runner script with code and inputs
// substituted, the preload script, and any error.
TransformCaller(code string, inputs map[string]interface{}) (string, string, error)
// GetRunnerScript returns the language-specific runner template.
GetRunnerScript() string
// GetPreloadScript returns the script passed to the sandbox as preload.
GetPreloadScript() string
}
// BaseTemplateTransformer holds the placeholder strings and the substitution
// logic shared by the language-specific transformers that embed it.
type BaseTemplateTransformer struct {
// CodePlaceholder is replaced with the user code in the runner script.
CodePlaceholder string
// InputsPlaceholder is replaced with the base64-encoded JSON inputs.
InputsPlaceholder string
// ResultTag is set by NewBaseTemplateTransformer; TransformCaller does not
// read it.
ResultTag string
// transformer is the value whose GetRunnerScript is used by
// TransformCaller.
transformer TemplateTransformer
}
// NewBaseTemplateTransformer builds a base transformer with the default
// placeholders ("{{code}}", "{{inputs}}"), the "<<RESULT>>" tag, and t as the
// wrapped transformer.
func NewBaseTemplateTransformer(t TemplateTransformer) *BaseTemplateTransformer {
	base := new(BaseTemplateTransformer)
	base.CodePlaceholder = "{{code}}"
	base.InputsPlaceholder = "{{inputs}}"
	base.ResultTag = "<<RESULT>>"
	base.transformer = t
	return base
}
// GetRunnerScript returns an empty runner script; embedding types define
// their own GetRunnerScript to supply a real template.
func (t *BaseTemplateTransformer) GetRunnerScript() string {
	var script string
	return script
}
// GetPreloadScript returns an empty preload script.
func (t *BaseTemplateTransformer) GetPreloadScript() string {
	var script string
	return script
}
// TransformCaller builds the script sent to the sandbox.
//
// inputs is JSON-encoded without HTML escaping, base64-encoded and substituted
// for InputsPlaceholder in the wrapped transformer's runner script; code is
// substituted for CodePlaceholder first. The second return value is the
// wrapped transformer's preload script. An error is returned only when inputs
// cannot be JSON-encoded.
func (t *BaseTemplateTransformer) TransformCaller(code string, inputs map[string]interface{}) (string, string, error) {
	var buf bytes.Buffer
	encoder := json.NewEncoder(&buf)
	// Keep characters such as <, > and & literal instead of \u003c-style escapes.
	encoder.SetEscapeHTML(false)
	if err := encoder.Encode(inputs); err != nil {
		return "", "", err
	}
	// Encode appends a newline; remove it before base64-encoding.
	inputsJSON := bytes.TrimSpace(buf.Bytes())
	inputsBase64 := base64.StdEncoding.EncodeToString(inputsJSON)

	runnerScript := t.transformer.GetRunnerScript()
	runnerScript = strings.ReplaceAll(runnerScript, t.CodePlaceholder, code)
	runnerScript = strings.ReplaceAll(runnerScript, t.InputsPlaceholder, inputsBase64)

	// Ask the wrapped transformer, as for the runner script, so a type that
	// defines its own GetPreloadScript is honoured.
	preloadScript := t.transformer.GetPreloadScript()
	return runnerScript, preloadScript, nil
}

View File

@ -0,0 +1,10 @@
module evaluate
go 1.23.0
require (
github.com/joho/godotenv v1.5.1
golang.org/x/term v0.26.0
)
require golang.org/x/sys v0.27.0 // indirect

View File

@ -0,0 +1,6 @@
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s=
golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.26.0 h1:WEQa6V3Gja/BhNxg540hBip/kkaYtRg3cxg4oXSw4AU=
golang.org/x/term v0.26.0/go.mod h1:Si5m1o57C5nBNQo5z1iq+XDijt21BDBDp2bK0QI8e3E=

View File

@ -0,0 +1,62 @@
package testdata
import (
"fmt"
"time"
)
// TestMetrics accumulates the outcome of an evaluation run.
type TestMetrics struct {
// TotalTests, SuccessfulTests and FailedTests are counters maintained by
// AddResult.
TotalTests int
SuccessfulTests int
FailedTests int
// StartTime is set by NewTestMetrics and EndTime by Finish.
StartTime time.Time
EndTime time.Time
// Results holds every result in the order it was added.
Results []TestResult
}
// NewTestMetrics returns metrics with zeroed counters, an empty non-nil
// Results slice and StartTime set to the current time.
func NewTestMetrics() *TestMetrics {
	metrics := new(TestMetrics)
	metrics.StartTime = time.Now()
	metrics.Results = []TestResult{}
	return metrics
}
// AddResult appends result to Results and updates the total and the
// success/failure counters according to result.Success.
func (m *TestMetrics) AddResult(result TestResult) {
	m.Results = append(m.Results, result)
	m.TotalTests += 1

	if !result.Success {
		m.FailedTests += 1
		return
	}
	m.SuccessfulTests += 1
}
// Finish records the current time as EndTime.
func (m *TestMetrics) Finish() {
	finishedAt := time.Now()
	m.EndTime = finishedAt
}
// PrintSummary writes the per-test results followed by the totals, accuracy
// and elapsed time to stdout.
//
// Accuracy is reported as 0 when no tests were recorded. If Finish has not
// been called, the elapsed time is measured up to now.
func (m *TestMetrics) PrintSummary() {
	endTime := m.EndTime
	if endTime.IsZero() {
		// Without this, a missing Finish call would print a huge negative
		// duration.
		endTime = time.Now()
	}
	duration := endTime.Sub(m.StartTime)

	// Guard the division: 0/0 on floats yields NaN and would print "NaN%".
	var accuracy float64
	if m.TotalTests > 0 {
		accuracy = float64(m.SuccessfulTests) / float64(m.TotalTests) * 100
	}

	fmt.Printf("\n=== Detailed Results ===\n")
	for _, result := range m.Results {
		switch {
		case result.Success:
			fmt.Printf("✅ %s\n", result.TestCase.Name)
		case result.Error != nil:
			fmt.Printf("❌ %s\n", result.TestCase.Name)
			fmt.Printf(" Error: %v\n", result.Error)
		default:
			fmt.Printf("❌ %s\n", result.TestCase.Name)
			fmt.Printf(" Expected: %s\n Actual: %s\n",
				result.TestCase.GroundTruth, result.ActualValue)
		}
	}

	fmt.Printf("\n=== Test Execution Summary ===\n")
	fmt.Printf("Total Tests: %d\n", m.TotalTests)
	fmt.Printf("Successful: %d\n", m.SuccessfulTests)
	fmt.Printf("Failed: %d\n", m.FailedTests)
	fmt.Printf("Accuracy: %.2f%%\n", accuracy)
	fmt.Printf("Execution Time: %.2f seconds\n", duration.Seconds())
}

View File

@ -0,0 +1,35 @@
package testdata
import (
"encoding/json"
"os"
)
// TestCase is one entry of the JSON test-case file read by LoadTestCases.
type TestCase struct {
// Name is decoded from "name".
Name string `json:"name"`
// Inputs is decoded from the "inputs" object.
Inputs map[string]interface{} `json:"inputs"`
// Instruction is decoded from "instruction".
Instruction string `json:"instruction"`
// CodeLanguage is decoded from "code_language".
CodeLanguage string `json:"code_language"`
// GroundTruth is decoded from "ground_truth".
GroundTruth string `json:"ground_truth"`
}
// TestResult records the outcome of running one TestCase.
type TestResult struct {
// TestCase is the case this result belongs to.
TestCase TestCase
// Success reports whether the case passed.
Success bool
// ActualValue is the produced output.
ActualValue string
// Error holds the failure cause, if any.
Error error
}
// LoadTestCases reads the file at filePath and decodes it as a JSON array of
// TestCase values. The read or decode error is returned unchanged, together
// with a nil slice.
func LoadTestCases(filePath string) ([]TestCase, error) {
	raw, readErr := os.ReadFile(filePath)
	if readErr != nil {
		return nil, readErr
	}

	var cases []TestCase
	decodeErr := json.Unmarshal(raw, &cases)
	if decodeErr != nil {
		return nil, decodeErr
	}
	return cases, nil
}

View File

@ -0,0 +1,20 @@
[
{
"name": "Positive Number Check",
"inputs": {
"x": 10
},
"instruction": "if x > 0: return 'positive'",
"code_language": "python",
"ground_truth": "{\"result\": \"positive\"}"
},
{
"name": "Negative Number Check",
"inputs": {
"x": -5
},
"instruction": "if x > 0: return 'positive' else: return 'negative'",
"code_language": "python",
"ground_truth": "{\"result\": \"negative\"}"
}
]