From 8ead10c82e88b8f598fcb9b71cfae6fbe7978260 Mon Sep 17 00:00:00 2001
From: Moritz Marquardt <git@momar.de>
Date: Fri, 19 Mar 2021 20:30:08 +0100
Subject: [PATCH] Implement SEO optimizations and improve error handling and
 branch detection

---
 handler.go | 162 ++++++++++++++++++++++++++++++++++++-----------------
 main.go    |  20 -------
 2 files changed, 111 insertions(+), 71 deletions(-)

diff --git a/handler.go b/handler.go
index be78499..adf5c25 100644
--- a/handler.go
+++ b/handler.go
@@ -63,16 +63,58 @@ func handler(ctx *fasthttp.RequestCtx) {
 		TryIndexPages:      true,
 	}
 
+	// tryBranch checks if a branch exists and populates the target variables. If canonicalLink is non-empty, it will
+	// also disallow search indexing and add a canonical Link header ("%b" and "%p" are replaced by branch and path).
+	var tryBranch = func(repo string, branch string, path []string, canonicalLink string) bool {
+		if repo == "" {
+			return false
+		}
+		fmt.Printf("Trying branch: %s/%s/%s with path %v\n", targetOwner, repo, branch, path)
+
+		// Percent-decode the branch name; fall back to the raw value if decoding fails or yields an empty string
+		unescapedBranch, err := url.PathUnescape(branch)
+		if err != nil || unescapedBranch == "" {
+			unescapedBranch = branch
+		}
+		// Look up the decoded branch; a zero timestamp is treated as "branch not found"
+		targetBranch, targetOptions.BranchTimestamp = getBranchTimestamp(targetOwner, repo, unescapedBranch)
+		fmt.Printf("Branch %s has timestamp %v\n", targetBranch, targetOptions.BranchTimestamp)
+		if targetOptions.BranchTimestamp == (time.Time{}) {
+			// Branch doesn't exist
+			return false
+		}
+
+		// Branch exists, use it
+		targetRepo = repo
+		targetPath = strings.Trim(strings.Join(path, "/"), "/")
+
+		if canonicalLink != "" {
+			// Hide from search engines & add canonical link; the Link target must be wrapped in <> (RFC 8288)
+			ctx.Response.Header.Set("X-Robots-Tag", "noarchive, noindex")
+			ctx.Response.Header.Set("Link",
+				"<"+strings.NewReplacer("%b", targetBranch, "%p", targetPath).Replace(canonicalLink)+
+					">; rel=\"canonical\"",
+			)
+		}
+		return true
+	}
+
+	// tryUpstream hands the current target variables to upstream, and shows an error page if upstream returns false.
+	var tryUpstream = func() {
+		// On failure, the error page reuses the status code that is currently set on the response
+		if !upstream(ctx, targetOwner, targetRepo, targetBranch, targetPath, targetOptions) {
+			returnErrorPage(ctx, ctx.Response.StatusCode())
+		}
+	}
+
 	if RawDomain != nil && bytes.Equal(ctx.Request.Host(), RawDomain) {
 		// Serve raw content from RawDomain
 
-		// TODO: add canonical link and "X-Robots-Tag: noarchive, noindex"
-
 		targetOptions.TryIndexPages = false
 		targetOptions.ForbiddenMimeTypes["text/html"] = struct{}{}
 		targetOptions.DefaultMimeType = "text/plain; charset=utf-8"
 
-		pathElements := strings.SplitN(string(bytes.Trim(ctx.Request.URI().Path(), "/")), "/", 4)
+		pathElements := strings.Split(string(bytes.Trim(ctx.Request.URI().Path(), "/")), "/")
 		if len(pathElements) < 2 {
 			// https://{RawDomain}/{owner}/{repo}[/@{branch}]/{path} is required
 			ctx.Redirect(RawInfoPage, fasthttp.StatusTemporaryRedirect)
@@ -80,46 +122,74 @@ func handler(ctx *fasthttp.RequestCtx) {
 		}
 		targetOwner = pathElements[0]
 		targetRepo = pathElements[1]
-		if len(pathElements) > 3 {
-			targetPath = strings.Trim(pathElements[2]+"/"+pathElements[3], "/")
-		} else if len(pathElements) > 2 {
-			targetPath = pathElements[2]
-		}
 
 		// raw.codeberg.page/example/myrepo/@main/index.html
-		if len(pathElements) > 3 && strings.HasPrefix(pathElements[2], "@") {
-			branch, _ := url.PathUnescape(pathElements[2][1:])
-			if branch == "" {
-				branch = pathElements[2][1:]
-			}
-			// Check if the branch exists, otherwise treat it as a file path
-			targetBranch, targetOptions.BranchTimestamp = getBranchTimestamp(targetOwner, targetRepo, branch)
-			if targetOptions.BranchTimestamp != (time.Time{}) {
-				targetPath = strings.Trim(pathElements[3], "/") // branch exists, use it
-			} else {
-				targetBranch = "" // branch doesn't exist, use default branch
+		if len(pathElements) > 2 && strings.HasPrefix(pathElements[2], "@") {
+			if tryBranch(targetRepo, pathElements[2][1:], pathElements[3:],
+				string(GiteaRoot)+"/"+targetOwner+"/"+targetRepo+"/src/branch/%b/%p",
+			) {
+				tryUpstream()
+				return
 			}
+			returnErrorPage(ctx, fasthttp.StatusFailedDependency)
+			return
+		} else {
+			tryBranch(targetRepo, "", pathElements[2:],
+				string(GiteaRoot)+"/"+targetOwner+"/"+targetRepo+"/src/branch/%b/%p",
+			)
+			tryUpstream()
+			return
 		}
 
 	} else if bytes.HasSuffix(ctx.Request.Host(), MainDomainSuffix) {
 		// Serve pages from subdomains of MainDomainSuffix
 
-		// TODO: add @branch syntax with "X-Robots-Tag: noarchive, noindex"
-
-		pathElements := strings.SplitN(string(bytes.Trim(ctx.Request.URI().Path(), "/")), "/", 2)
+		pathElements := strings.Split(string(bytes.Trim(ctx.Request.URI().Path(), "/")), "/")
 		targetOwner = string(bytes.TrimSuffix(ctx.Request.Host(), MainDomainSuffix))
 		targetRepo = pathElements[0]
-		if len(pathElements) > 1 {
-			targetPath = strings.Trim(pathElements[1], "/")
+		targetPath = strings.Trim(strings.Join(pathElements[1:], "/"), "/")
+
+		// Check if the first directory is a repo with the second directory as a branch
+		// example.codeberg.page/myrepo/@main/index.html
+		if len(pathElements) > 1 && strings.HasPrefix(pathElements[1], "@") {
+			if tryBranch(pathElements[0], pathElements[1][1:], pathElements[2:],
+				"/"+pathElements[0]+"/%p",
+			) {
+				tryUpstream()
+			} else {
+				returnErrorPage(ctx, fasthttp.StatusFailedDependency)
+			}
+			return
+		}
+
+		// Check if the first directory is a branch for the "pages" repo
+		// example.codeberg.page/@main/index.html
+		if strings.HasPrefix(pathElements[0], "@") {
+			if tryBranch("pages", pathElements[0][1:], pathElements[1:], "/%p") {
+				tryUpstream()
+			} else {
+				returnErrorPage(ctx, fasthttp.StatusFailedDependency)
+			}
+			return
 		}
 
 		// Check if the first directory is a repo with a "pages" branch
-		targetBranch, targetOptions.BranchTimestamp = getBranchTimestamp(targetOwner, targetRepo, "pages")
-		if targetOptions.BranchTimestamp == (time.Time{}) {
-			targetRepo = "pages"
-			targetBranch = ""
-			targetPath = strings.Trim(pathElements[0]+"/"+targetPath, "/")
+		// example.codeberg.page/myrepo/index.html
+		if tryBranch(pathElements[0], "pages", pathElements[1:], "") {
+			tryUpstream()
+			return
 		}
+
+		// Try to use the "pages" repo on its default branch
+		// example.codeberg.page/index.html
+		if tryBranch("pages", "", pathElements, "") {
+			tryUpstream()
+			return
+		}
+
+		// Couldn't find a valid repo/branch
+		returnErrorPage(ctx, fasthttp.StatusFailedDependency)
+		return
 	} else {
 		// Serve pages from external domains
 
@@ -129,23 +199,6 @@ func handler(ctx *fasthttp.RequestCtx) {
 			return
 		}
 	}
-
-	// Check if a username can't exist because it's reserved (we'd risk to hit a Gitea route in that case)
-	if _, ok := ReservedUsernames[targetOwner]; ok {
-		returnErrorPage(ctx, fasthttp.StatusForbidden)
-		return
-	}
-
-	// Check for blob path
-	if strings.HasPrefix(targetPath, "blob/") {
-		returnErrorPage(ctx, fasthttp.StatusForbidden)
-		return
-	}
-
-	// Try to request the file from the Gitea API
-	if !upstream(ctx, targetOwner, targetRepo, targetBranch, targetPath, targetOptions) {
-		returnErrorPage(ctx, ctx.Response.StatusCode())
-	}
 }
 
 // returnErrorPage sets the response status code and writes NotFoundPage to the response body, with "%status" replaced
@@ -153,7 +206,7 @@ func handler(ctx *fasthttp.RequestCtx) {
 func returnErrorPage(ctx *fasthttp.RequestCtx, code int) {
 	ctx.Response.SetStatusCode(code)
 	ctx.Response.Header.SetContentType("text/html; charset=utf-8")
-	ctx.Response.SetBody(bytes.ReplaceAll(NotFoundPage, []byte("%status"), []byte(strconv.Itoa(code) + " " + fasthttp.StatusMessage(code))))
+	ctx.Response.SetBody(bytes.ReplaceAll(NotFoundPage, []byte("%status"), []byte(strconv.Itoa(code)+" "+fasthttp.StatusMessage(code))))
 }
 
 // getBranchTimestamp finds the default branch (if branch is "") and returns the last modification time of the branch
@@ -163,8 +216,9 @@ func getBranchTimestamp(owner, repo, branch string) (branchWithFallback string,
 	branchWithFallback = branch
 	if branch == "" {
 		var body = make([]byte, 0)
-		status, body, err := fasthttp.GetTimeout(body, string(GiteaRoot)+"/api/v1/repos/"+owner+"/"+repo, 10*time.Second)
+		status, body, err := fasthttp.GetTimeout(body, string(GiteaRoot)+"/api/v1/repos/"+url.PathEscape(owner)+"/"+url.PathEscape(repo), 10*time.Second)
 		if err != nil || status != 200 {
+			fmt.Printf("Default branch request to Gitea API failed with status code %d and error %s\n", status, err)
 			branchWithFallback = ""
 			return
 		}
@@ -173,8 +227,9 @@ func getBranchTimestamp(owner, repo, branch string) (branchWithFallback string,
 	}
 
 	var body = make([]byte, 0)
-	status, body, err := fasthttp.GetTimeout(body, string(GiteaRoot)+"/api/v1/repos/"+owner+"/"+repo+"/branches/"+branch, 10*time.Second)
+	status, body, err := fasthttp.GetTimeout(body, string(GiteaRoot)+"/api/v1/repos/"+url.PathEscape(owner)+"/"+url.PathEscape(repo)+"/branches/"+url.PathEscape(branch), 10*time.Second)
 	if err != nil || status != 200 {
+		fmt.Printf("Branch info request to Gitea API failed with status code %d and error %s\n", status, err)
 		branchWithFallback = ""
 		return
 	}
@@ -196,12 +251,17 @@ func upstream(ctx *fasthttp.RequestCtx, targetOwner string, targetRepo string, t
 
 	// Handle repositories with no/broken pages setup
 	if options.BranchTimestamp == (time.Time{}) || targetBranch == "" {
-		ctx.Response.SetStatusCode(fasthttp.StatusNotFound)
+		ctx.Response.SetStatusCode(fasthttp.StatusFailedDependency)
 		ctx.Response.Header.SetContentType("text/html; charset=utf-8")
 		ctx.Response.SetBody(bytes.ReplaceAll(NotFoundPage, []byte("%status"), []byte("pages not set up for this repo")))
 		return true
 	}
 
+	if targetOwner == "" || targetRepo == "" || targetBranch == "" {
+		returnErrorPage(ctx, fasthttp.StatusBadRequest)
+		return true
+	}
+
 	// Check if the browser has a cached version
 	if ifModifiedSince, err := time.Parse(time.RFC1123, string(ctx.Request.Header.Peek("If-Modified-Since"))); err == nil {
 		if !ifModifiedSince.Before(options.BranchTimestamp) {
@@ -212,7 +272,7 @@ func upstream(ctx *fasthttp.RequestCtx, targetOwner string, targetRepo string, t
 
 	// Make a GET request to the upstream URL
 	req := fasthttp.AcquireRequest()
-	req.SetRequestURI(string(GiteaRoot) + "/api/v1/repos/" + targetOwner + "/" + targetRepo + "/raw/" + targetBranch + "/" + targetPath)
+	req.SetRequestURI(string(GiteaRoot) + "/api/v1/repos/" + url.PathEscape(targetOwner) + "/" + url.PathEscape(targetRepo) + "/raw/" + url.PathEscape(targetBranch) + "/" + url.PathEscape(targetPath))
 	res := fasthttp.AcquireResponse()
 	err := fasthttp.DoTimeout(req, res, 10*time.Second)
 
diff --git a/main.go b/main.go
index ae4ac7b..a7081a4 100644
--- a/main.go
+++ b/main.go
@@ -20,7 +20,6 @@ import (
 	"fmt"
 	"net"
 	"os"
-	"strings"
 	"time"
 
 	_ "embed"
@@ -67,14 +66,6 @@ var IndexPages = []string{
 	"index.html",
 }
 
-// ReservedUsernames specifies the usernames that are reserved by Gitea and thus may not be used as owner names.
-// The contents are taken from https://github.com/go-gitea/gitea/blob/master/models/user.go#L783; reserved names with
-// dots are removed as they are forbidden for Codeberg Pages anyways.
-var ReservedUsernames = createLookupMapFromWords(`
-	admin api assets attachments avatars captcha commits debug error explore ghost help install issues less login metrics milestones new notifications org plugins pulls raw repo search stars template user
-	
-`)
-
 // main sets up and starts the web server.
 func main() {
 	// Make sure MainDomain has a trailing dot, and GiteaRoot has no trailing slash
@@ -122,14 +113,3 @@ func envOr(env string, or string) string {
 	}
 	return or
 }
-
-func createLookupMapFromWords(input string) map[string]struct{} {
-	var res = map[string]struct{}{}
-	input = strings.NewReplacer("\t", " ", "\n", " ", "\r", " ").Replace(input)
-	for _, word := range strings.Split(input, " ") {
-		if len(word) > 0 {
-			res[word] = struct{}{}
-		}
-	}
-	return res
-}