From 85059aa46ef0b04c76646c8a56a7860f5007f58a Mon Sep 17 00:00:00 2001 From: crapStone Date: Thu, 21 Nov 2024 23:26:30 +0000 Subject: [PATCH] cache pages (#403) taken from https://codeberg.org/Codeberg/pages-server/pulls/301 Co-authored-by: Moritz Marquardt Reviewed-on: https://codeberg.org/Codeberg/pages-server/pulls/403 Co-authored-by: crapStone Co-committed-by: crapStone --- server/gitea/cache.go | 61 +++++++------ server/gitea/client.go | 171 ++++++++++++++++++------------------ server/upstream/header.go | 3 - server/upstream/upstream.go | 2 +- 4 files changed, 120 insertions(+), 117 deletions(-) diff --git a/server/gitea/cache.go b/server/gitea/cache.go index cfb7c2a..c560a19 100644 --- a/server/gitea/cache.go +++ b/server/gitea/cache.go @@ -2,6 +2,7 @@ package gitea import ( "bytes" + "encoding/json" "fmt" "io" "net/http" @@ -34,23 +35,18 @@ const ( ) type FileResponse struct { - Exists bool - IsSymlink bool - ETag string - - // uncompressed MIME type - MimeType string - - // raw MIME type (if compressed, type of compression) - RawMime string - Body []byte + Exists bool `json:"exists"` + IsSymlink bool `json:"isSymlink"` + ETag string `json:"eTag"` + MimeType string `json:"mimeType"` + Body []byte `json:"-"` // saved separately } func (f FileResponse) IsEmpty() bool { return len(f.Body) == 0 } -func (f FileResponse) createHttpResponse(cacheKey string, decompress bool) (header http.Header, statusCode int) { +func (f FileResponse) createHttpResponse(cacheKey string) (header http.Header, statusCode int) { header = make(http.Header) if f.Exists { @@ -63,12 +59,7 @@ func (f FileResponse) createHttpResponse(cacheKey string, decompress bool) (head header.Set(giteaObjectTypeHeader, objTypeSymlink) } header.Set(ETagHeader, f.ETag) - - if decompress { - header.Set(ContentTypeHeader, f.MimeType) - } else { - header.Set(ContentTypeHeader, f.RawMime) - } + header.Set(ContentTypeHeader, f.MimeType) header.Set(ContentLengthHeader, fmt.Sprintf("%d", len(f.Body))) header.Set(PagesCacheIndicatorHeader, "true") @@ -77,9 +68,9 @@ func (f FileResponse) createHttpResponse(cacheKey string, decompress bool) (head } type BranchTimestamp struct { - Branch string - Timestamp time.Time - notFound bool + NotFound bool `json:"notFound"` + Branch string `json:"branch,omitempty"` + Timestamp time.Time `json:"timestamp,omitempty"` } type writeCacheReader struct { @@ -89,32 +80,46 @@ type writeCacheReader struct { cacheKey string cache cache.ICache hasError bool + doNotCache bool + complete bool } func (t *writeCacheReader) Read(p []byte) (n int, err error) { log.Trace().Msgf("[cache] read %q", t.cacheKey) n, err = t.originalReader.Read(p) + if err == io.EOF { + t.complete = true + } if err != nil && err != io.EOF { log.Trace().Err(err).Msgf("[cache] original reader for %q has returned an error", t.cacheKey) t.hasError = true } else if n > 0 { - _, _ = t.buffer.Write(p[:n]) + if t.buffer.Len()+n > int(fileCacheSizeLimit) { + t.doNotCache = true + t.buffer.Reset() + } else { + _, _ = t.buffer.Write(p[:n]) + } } return } func (t *writeCacheReader) Close() error { - doWrite := !t.hasError + doWrite := !t.hasError && !t.doNotCache && t.complete fc := *t.fileResponse fc.Body = t.buffer.Bytes() - if fc.IsEmpty() { - log.Trace().Msg("[cache] file response is empty") - doWrite = false - } if doWrite { - err := t.cache.Set(t.cacheKey, fc, fileCacheTimeout) + jsonToCache, err := json.Marshal(fc) if err != nil { - log.Trace().Err(err).Msgf("[cache] writer for %q has returned an error", t.cacheKey) + log.Trace().Err(err).Msgf("[cache] marshaling json for %q has returned an error", t.cacheKey+"|Metadata") + } + err = t.cache.Set(t.cacheKey+"|Metadata", jsonToCache, fileCacheTimeout) + if err != nil { + log.Trace().Err(err).Msgf("[cache] writer for %q has returned an error", t.cacheKey+"|Metadata") + } + err = t.cache.Set(t.cacheKey+"|Body", fc.Body, fileCacheTimeout) + if err != nil { + log.Trace().Err(err).Msgf("[cache] writer for %q has returned an error", t.cacheKey+"|Body") } } log.Trace().Msgf("cacheReader for %q saved=%t closed", t.cacheKey, doWrite) diff --git a/server/gitea/client.go b/server/gitea/client.go index 9777043..ea57c14 100644 --- a/server/gitea/client.go +++ b/server/gitea/client.go @@ -2,6 +2,7 @@ package gitea import ( "bytes" + "encoding/json" "errors" "fmt" "io" @@ -39,10 +40,9 @@ const ( objTypeSymlink = "symlink" // std - ETagHeader = "ETag" - ContentTypeHeader = "Content-Type" - ContentLengthHeader = "Content-Length" - ContentEncodingHeader = "Content-Encoding" + ETagHeader = "ETag" + ContentTypeHeader = "Content-Type" + ContentLengthHeader = "Content-Length" ) type Client struct { @@ -104,7 +104,7 @@ func (client *Client) ContentWebLink(targetOwner, targetRepo, branch, resource s } func (client *Client) GiteaRawContent(targetOwner, targetRepo, ref, resource string) ([]byte, error) { - reader, _, _, err := client.ServeRawContent(targetOwner, targetRepo, ref, resource, false) + reader, _, _, err := client.ServeRawContent(targetOwner, targetRepo, ref, resource) if err != nil { return nil, err } @@ -112,27 +112,42 @@ func (client *Client) GiteaRawContent(targetOwner, targetRepo, ref, resource str return io.ReadAll(reader) } -func (client *Client) ServeRawContent(targetOwner, targetRepo, ref, resource string, decompress bool) (io.ReadCloser, http.Header, int, error) { +func (client *Client) ServeRawContent(targetOwner, targetRepo, ref, resource string) (io.ReadCloser, http.Header, int, error) { cacheKey := fmt.Sprintf("%s/%s/%s|%s|%s", rawContentCacheKeyPrefix, targetOwner, targetRepo, ref, resource) log := log.With().Str("cache_key", cacheKey).Logger() log.Trace().Msg("try file in cache") // handle if cache entry exist - if cache, ok := client.responseCache.Get(cacheKey); ok { - cache := cache.(FileResponse) - cachedHeader, cachedStatusCode := cache.createHttpResponse(cacheKey, decompress) - // TODO: check against some timestamp mismatch?!? + if cacheMetadata, ok := client.responseCache.Get(cacheKey + "|Metadata"); ok { + var cache FileResponse + err := json.Unmarshal(cacheMetadata.([]byte), &cache) + if err != nil { + log.Error().Err(err).Msgf("[cache] failed to unmarshal metadata for: %s", cacheKey) + return nil, nil, http.StatusNotFound, err + } + + if !cache.Exists { + return nil, nil, http.StatusNotFound, ErrorNotFound + } + + body, ok := client.responseCache.Get(cacheKey + "|Body") + if !ok { + log.Error().Msgf("[cache] failed to get body for: %s", cacheKey) + return nil, nil, http.StatusNotFound, ErrorNotFound + } + cache.Body = body.([]byte) + + cachedHeader, cachedStatusCode := cache.createHttpResponse(cacheKey) if cache.Exists { - log.Debug().Msg("[cache] exists") if cache.IsSymlink { linkDest := string(cache.Body) log.Debug().Msgf("[cache] follow symlink from %q to %q", resource, linkDest) - return client.ServeRawContent(targetOwner, targetRepo, ref, linkDest, decompress) - } else if !cache.IsEmpty() { + return client.ServeRawContent(targetOwner, targetRepo, ref, linkDest) + } else { log.Debug().Msgf("[cache] return %d bytes", len(cache.Body)) return io.NopCloser(bytes.NewReader(cache.Body)), cachedHeader, cachedStatusCode, nil - } else if cache.IsEmpty() { - log.Debug().Msg("[cache] is empty") } + } else { + return nil, nil, http.StatusNotFound, ErrorNotFound } } log.Trace().Msg("file not in cache") @@ -166,41 +181,40 @@ func (client *Client) ServeRawContent(targetOwner, targetRepo, ref, resource str ETag: resp.Header.Get(ETagHeader), } log.Trace().Msgf("file response has %d bytes", len(fileResponse.Body)) - if err := client.responseCache.Set(cacheKey, fileResponse, fileCacheTimeout); err != nil { + jsonToCache, err := json.Marshal(fileResponse) + if err != nil { + log.Error().Err(err).Msgf("[cache] marshaling json metadata for %q has returned an error", cacheKey) + } + if err := client.responseCache.Set(cacheKey+"|Metadata", jsonToCache, fileCacheTimeout); err != nil { + log.Error().Err(err).Msg("[cache] error on cache write") + } + if err := client.responseCache.Set(cacheKey+"|Body", fileResponse.Body, fileCacheTimeout); err != nil { log.Error().Err(err).Msg("[cache] error on cache write") } log.Debug().Msgf("follow symlink from %q to %q", resource, linkDest) - return client.ServeRawContent(targetOwner, targetRepo, ref, linkDest, decompress) + return client.ServeRawContent(targetOwner, targetRepo, ref, linkDest) } } // now we are sure it's content so set the MIME type - mimeType, rawType := client.getMimeTypeByExtension(resource) - if decompress { - resp.Response.Header.Set(ContentTypeHeader, mimeType) - } else { - resp.Response.Header.Set(ContentTypeHeader, rawType) - } - - if !shouldRespBeSavedToCache(resp.Response) { - return reader, resp.Response.Header, resp.StatusCode, err - } + mimeType := client.getMimeTypeByExtension(resource) + resp.Response.Header.Set(ContentTypeHeader, mimeType) // now we write to cache and respond at the same time fileResp := FileResponse{ Exists: true, ETag: resp.Header.Get(ETagHeader), MimeType: mimeType, - RawMime: rawType, } return fileResp.CreateCacheReader(reader, client.responseCache, cacheKey), resp.Response.Header, resp.StatusCode, nil case http.StatusNotFound: - if err := client.responseCache.Set(cacheKey, FileResponse{ - Exists: false, - ETag: resp.Header.Get(ETagHeader), - }, fileCacheTimeout); err != nil { + jsonToCache, err := json.Marshal(FileResponse{ETag: resp.Header.Get(ETagHeader)}) + if err != nil { + log.Error().Err(err).Msgf("[cache] marshaling json metadata for %q has returned an error", cacheKey) + } + if err := client.responseCache.Set(cacheKey+"|Metadata", jsonToCache, fileCacheTimeout); err != nil { log.Error().Err(err).Msg("[cache] error on cache write") } @@ -215,21 +229,36 @@ func (client *Client) ServeRawContent(targetOwner, targetRepo, ref, resource str func (client *Client) GiteaGetRepoBranchTimestamp(repoOwner, repoName, branchName string) (*BranchTimestamp, error) { cacheKey := fmt.Sprintf("%s/%s/%s/%s", branchTimestampCacheKeyPrefix, repoOwner, repoName, branchName) - if stamp, ok := client.responseCache.Get(cacheKey); ok && stamp != nil { - branchTimeStamp := stamp.(*BranchTimestamp) - if branchTimeStamp.notFound { - log.Trace().Msgf("[cache] use branch %q not found", branchName) + if stampRaw, ok := client.responseCache.Get(cacheKey); ok { + var stamp BranchTimestamp + err := json.Unmarshal(stampRaw.([]byte), &stamp) + if err != nil { + log.Error().Err(err).Bytes("stamp", stampRaw.([]byte)).Msgf("[cache] failed to unmarshal timestamp for: %s", cacheKey) return &BranchTimestamp{}, ErrorNotFound } - log.Trace().Msgf("[cache] use branch %q exist", branchName) - return branchTimeStamp, nil + + if stamp.NotFound { + log.Trace().Msgf("[cache] branch %q does not exist", branchName) + + return &BranchTimestamp{}, ErrorNotFound + } else { + log.Trace().Msgf("[cache] use branch %q exist", branchName) + // This comes from the refactoring of the caching library. + // The branch as reported by the API was stored in the cache, and I'm not sure if there are + // situations where it differs from the name in the request, hence this is left here. + return &stamp, nil + } } branch, resp, err := client.sdkClient.GetRepoBranch(repoOwner, repoName, branchName) if err != nil { if resp != nil && resp.StatusCode == http.StatusNotFound { log.Trace().Msgf("[cache] set cache branch %q not found", branchName) - if err := client.responseCache.Set(cacheKey, &BranchTimestamp{Branch: branchName, notFound: true}, branchExistenceCacheTimeout); err != nil { + jsonToCache, err := json.Marshal(BranchTimestamp{NotFound: true}) + if err != nil { + log.Error().Err(err).Msgf("[cache] marshaling empty timestamp for '%s' has returned an error", cacheKey) + } + if err := client.responseCache.Set(cacheKey, jsonToCache, branchExistenceCacheTimeout); err != nil { log.Error().Err(err).Msg("[cache] error on cache write") } return &BranchTimestamp{}, ErrorNotFound @@ -246,7 +275,11 @@ func (client *Client) GiteaGetRepoBranchTimestamp(repoOwner, repoName, branchNam } log.Trace().Msgf("set cache branch [%s] exist", branchName) - if err := client.responseCache.Set(cacheKey, stamp, branchExistenceCacheTimeout); err != nil { + jsonToCache, err := json.Marshal(stamp) + if err != nil { + log.Error().Err(err).Msgf("[cache] marshaling timestamp for %q has returned an error", cacheKey) + } + if err := client.responseCache.Set(cacheKey, jsonToCache, branchExistenceCacheTimeout); err != nil { log.Error().Err(err).Msg("[cache] error on cache write") } return stamp, nil @@ -255,8 +288,8 @@ func (client *Client) GiteaGetRepoBranchTimestamp(repoOwner, repoName, branchNam func (client *Client) GiteaGetRepoDefaultBranch(repoOwner, repoName string) (string, error) { cacheKey := fmt.Sprintf("%s/%s/%s", defaultBranchCacheKeyPrefix, repoOwner, repoName) - if branch, ok := client.responseCache.Get(cacheKey); ok && branch != nil { - return branch.(string), nil + if branch, ok := client.responseCache.Get(cacheKey); ok { + return string(branch.([]byte)), nil } repo, resp, err := client.sdkClient.GetRepo(repoOwner, repoName) @@ -268,7 +301,7 @@ func (client *Client) GiteaGetRepoDefaultBranch(repoOwner, repoName string) (str } branch := repo.DefaultBranch - if err := client.responseCache.Set(cacheKey, branch, defaultBranchCacheTimeout); err != nil { + if err := client.responseCache.Set(cacheKey, []byte(branch), defaultBranchCacheTimeout); err != nil { log.Error().Err(err).Msg("[cache] error on cache write") } return branch, nil @@ -277,13 +310,14 @@ func (client *Client) GiteaGetRepoDefaultBranch(repoOwner, repoName string) (str func (client *Client) GiteaCheckIfOwnerExists(owner string) (bool, error) { cacheKey := fmt.Sprintf("%s/%s", ownerExistenceKeyPrefix, owner) - if exist, ok := client.responseCache.Get(cacheKey); ok && exist != nil { - return exist.(bool), nil + if existRaw, ok := client.responseCache.Get(cacheKey); ok && existRaw != nil { + exist, err := strconv.ParseBool(existRaw.(string)) + return exist, err } _, resp, err := client.sdkClient.GetUserInfo(owner) if resp.StatusCode == http.StatusOK && err == nil { - if err := client.responseCache.Set(cacheKey, true, ownerExistenceCacheTimeout); err != nil { + if err := client.responseCache.Set(cacheKey, []byte("true"), ownerExistenceCacheTimeout); err != nil { log.Error().Err(err).Msg("[cache] error on cache write") } return true, nil @@ -293,59 +327,26 @@ func (client *Client) GiteaCheckIfOwnerExists(owner string) (bool, error) { _, resp, err = client.sdkClient.GetOrg(owner) if resp.StatusCode == http.StatusOK && err == nil { - if err := client.responseCache.Set(cacheKey, true, ownerExistenceCacheTimeout); err != nil { + if err := client.responseCache.Set(cacheKey, []byte("true"), ownerExistenceCacheTimeout); err != nil { log.Error().Err(err).Msg("[cache] error on cache write") } return true, nil } else if resp.StatusCode != http.StatusNotFound { return false, err } - if err := client.responseCache.Set(cacheKey, false, ownerExistenceCacheTimeout); err != nil { + if err := client.responseCache.Set(cacheKey, []byte("false"), ownerExistenceCacheTimeout); err != nil { log.Error().Err(err).Msg("[cache] error on cache write") } return false, nil } -func (client *Client) extToMime(ext string) string { - mimeType := mime.TypeByExtension(ext) +func (client *Client) getMimeTypeByExtension(resource string) string { + mimeType := mime.TypeByExtension(path.Ext(resource)) mimeTypeSplit := strings.SplitN(mimeType, ";", 2) if client.forbiddenMimeTypes[mimeTypeSplit[0]] || mimeType == "" { mimeType = client.defaultMimeType } + log.Trace().Msgf("probe mime of %q is %q", resource, mimeType) + return mimeType } - -func (client *Client) getMimeTypeByExtension(resource string) (mimeType, rawType string) { - rawExt := path.Ext(resource) - innerExt := rawExt - switch rawExt { - case ".gz", ".br", ".zst": - innerExt = path.Ext(resource[:len(resource)-len(rawExt)]) - } - rawType = client.extToMime(rawExt) - mimeType = rawType - if innerExt != rawExt { - mimeType = client.extToMime(innerExt) - } - log.Trace().Msgf("probe mime of %q is (%q / raw %q)", resource, mimeType, rawType) - return mimeType, rawType -} - -func shouldRespBeSavedToCache(resp *http.Response) bool { - if resp == nil { - return false - } - - contentLengthRaw := resp.Header.Get(ContentLengthHeader) - if contentLengthRaw == "" { - return false - } - - contentLength, err := strconv.ParseInt(contentLengthRaw, 10, 64) - if err != nil { - log.Error().Err(err).Msg("could not parse content length") - } - - // if content to big or could not be determined we not cache it - return contentLength > 0 && contentLength < fileCacheSizeLimit -} diff --git a/server/upstream/header.go b/server/upstream/header.go index 3a218a1..7b85df1 100644 --- a/server/upstream/header.go +++ b/server/upstream/header.go @@ -24,8 +24,5 @@ func (o *Options) setHeader(ctx *context.Context, header http.Header) { } else { ctx.RespWriter.Header().Set(gitea.ContentTypeHeader, mime) } - if encoding := header.Get(gitea.ContentEncodingHeader); encoding != "" && encoding != "identity" { - ctx.RespWriter.Header().Set(gitea.ContentEncodingHeader, encoding) - } ctx.RespWriter.Header().Set(headerLastModified, o.BranchTimestamp.In(time.UTC).Format(http.TimeFormat)) } diff --git a/server/upstream/upstream.go b/server/upstream/upstream.go index 98137ba..2f1751b 100644 --- a/server/upstream/upstream.go +++ b/server/upstream/upstream.go @@ -182,7 +182,7 @@ func (o *Options) Upstream(ctx *context.Context, giteaClient *gitea.Client, redi // add extension for encoding path := o.TargetPath + allowedEncodings[encoding] - reader, header, statusCode, err = giteaClient.ServeRawContent(o.TargetOwner, o.TargetRepo, o.TargetBranch, path, true) + reader, header, statusCode, err = giteaClient.ServeRawContent(o.TargetOwner, o.TargetRepo, o.TargetBranch, path) if statusCode == 404 { continue }