From 3be16a9277b8dccc5838023ed0ae5b6dc1867bd5 Mon Sep 17 00:00:00 2001 From: SoXX Date: Wed, 15 Nov 2023 20:28:44 +0000 Subject: [PATCH] new_scrape_algorithm_#5 (#20) First implementation of the new algorithm that got proposed in issue #5 Reviewed-on: https://git.dragse.it/anthrove/e621-to-graph/pulls/20 Reviewed-by: Lennard Brinkhaus Reviewed-by: daskadse Co-authored-by: SoXX Co-committed-by: SoXX --- internal/database/neo4j/impl.go | 4 + internal/database/neo4j/post.go | 5 +- internal/database/neo4j/relationship.go | 2 +- internal/database/neo4j/source.go | 5 +- internal/database/neo4j/tag.go | 7 +- internal/database/neo4j/user.go | 34 +++- internal/service/manager.go | 208 ++++++++++++++---------- pkg/logic/database.go | 1 + 8 files changed, 163 insertions(+), 103 deletions(-) diff --git a/internal/database/neo4j/impl.go b/internal/database/neo4j/impl.go index 3189ef3..2dfa46a 100644 --- a/internal/database/neo4j/impl.go +++ b/internal/database/neo4j/impl.go @@ -20,6 +20,10 @@ func NewNeo4JConnection(neo4jDebug bool) logic.GraphConnection { } } +func (c *neo4jConnection) GetUserFavoriteCount(ctx context.Context, userID model.UserID) (int64, error) { + return GetUserFavoritesCount(ctx, c.driver, userID) +} + func (c *neo4jConnection) CheckUserToPostLink(ctx context.Context, e621PostID model.PostID, e621UserID model.UserID) (bool, error) { return CheckUserToPostLink(ctx, c.driver, e621PostID, e621UserID) } diff --git a/internal/database/neo4j/post.go b/internal/database/neo4j/post.go index 9cd81aa..d285e29 100644 --- a/internal/database/neo4j/post.go +++ b/internal/database/neo4j/post.go @@ -8,9 +8,8 @@ import ( func CreatePostNode(ctx context.Context, driver neo4j.DriverWithContext, postID model.PostID) error { query := ` - MERGE (u:e621Post {e621PostID: $postID}) - RETURN u -` + MERGE (u:e621Post {e621PostID: $postID}); + ` params := map[string]any{ "postID": postID, } diff --git a/internal/database/neo4j/relationship.go b/internal/database/neo4j/relationship.go index 317a0bc..9e607a2 100644 --- a/internal/database/neo4j/relationship.go +++ b/internal/database/neo4j/relationship.go @@ -11,7 +11,7 @@ func EstablishPostTagLink(ctx context.Context, driver neo4j.DriverWithContext, e query := ` MATCH (p:e621Post {e621PostID: $e621PostID}) MATCH (t:e621Tag {e621Tag: $e621Tag}) - MERGE (p)-[:HAS_TAG]->(t) + MERGE (p)-[:HAS_TAG]->(t); ` params := map[string]interface{}{ "e621PostID": e621PostID, diff --git a/internal/database/neo4j/source.go b/internal/database/neo4j/source.go index bbddac2..2f6360c 100644 --- a/internal/database/neo4j/source.go +++ b/internal/database/neo4j/source.go @@ -7,9 +7,8 @@ import ( func CreateSourceNode(ctx context.Context, driver neo4j.DriverWithContext, URL string) error { query := ` - MERGE (u:Source {URL: $url}) - RETURN u -` + MERGE (u:Source {URL: $url}); + ` params := map[string]any{ "url": URL, } diff --git a/internal/database/neo4j/tag.go b/internal/database/neo4j/tag.go index 7a98aec..aab8d9f 100644 --- a/internal/database/neo4j/tag.go +++ b/internal/database/neo4j/tag.go @@ -8,9 +8,8 @@ import ( func CreateTagNode(ctx context.Context, driver neo4j.DriverWithContext, name string, tagType string) error { query := ` - MERGE (u:e621Tag {e621Tag: $name, e621TagType: $tagType}) - RETURN u -` + MERGE (u:e621Tag {e621Tag: $name, e621TagType: $tagType}); + ` params := map[string]interface{}{ "name": name, "tagType": tagType, @@ -30,7 +29,7 @@ func GetTagNodeByName(ctx context.Context, driver neo4j.DriverWithContext, name query := ` MATCH (u:e621Tag {e621Tag: $name}) - RETURN u.e621Tag as e621Tag, u.e621TagType as e621TagType + RETURN u.e621Tag AS e621Tag, u.e621TagType AS e621TagType; ` params := map[string]interface{}{ diff --git a/internal/database/neo4j/user.go b/internal/database/neo4j/user.go index b57847c..6481d45 100644 --- a/internal/database/neo4j/user.go +++ b/internal/database/neo4j/user.go @@ -8,8 +8,7 @@ import ( func CreateUserNode(ctx context.Context, driver neo4j.DriverWithContext, user model.User) error { query := ` - MERGE (u:e621User {e621ID: $id, e621Username: $name}) - RETURN u + MERGE (u:e621User {e621ID: $id, e621Username: $name}); ` params := map[string]interface{}{ "id": user.ID, @@ -22,3 +21,34 @@ func CreateUserNode(ctx context.Context, driver neo4j.DriverWithContext, user mo } return nil } + +func GetUserFavoritesCount(ctx context.Context, driver neo4j.DriverWithContext, userID model.UserID) (int64, error) { + var userFavoriteCount int64 + + query := ` + MATCH (:e621User {e621ID: $userID})-[:IS_FAVORITE]->(:e621Post) + RETURN count(*) AS numberOfFavoritedPosts; + ` + params := map[string]interface{}{ + "userID": userID, + } + + result, err := neo4j.ExecuteQuery(ctx, driver, query, params, neo4j.EagerResultTransformer) + if err != nil { + return 0, err + } + + if len(result.Records) == 0 { + // no matches -> user does not exist, return count 0 + return userFavoriteCount, err + } + + record := result.Records[0] + + userFavoriteCount, _, err = neo4j.GetRecordValue[int64](record, "numberOfFavoritedPosts") + if err != nil { + return userFavoriteCount, err + } + + return userFavoriteCount, nil +} diff --git a/internal/service/manager.go b/internal/service/manager.go index 5e98d16..632d11d 100644 --- a/internal/service/manager.go +++ b/internal/service/manager.go @@ -13,9 +13,10 @@ import ( func ScrapeUser(ctx context.Context, graphConnection logic.GraphConnection, client *e621.Client, username string) error { var err error + scrapeTime := time.Now() + e621User, err := client.GetUserByName(username).Execute() if err != nil { - log.Info(err) return err } @@ -28,107 +29,134 @@ func ScrapeUser(ctx context.Context, graphConnection logic.GraphConnection, clie return nil } - log.WithFields(log.Fields{ - "e621_username": e621User.Name, - "e621_user_id": e621User.ID, - }).Info("service: processing user") - err = graphConnection.UploadUser(ctx, e621User) if err != nil { - log.Fatal(err) + return err + } + + currentDBFavCount, err := graphConnection.GetUserFavoriteCount(ctx, e621User.ID) + if err != nil { + return err + } + + favoriteBuilder, err := client.GetFavoritesForUser(e621User.Name) + if err != nil { + return err + } + + if currentDBFavCount > e621User.FavoriteCount { + //TODO: IMPLEMENT USER MARKED FOR DELETED FAVS + log.WithFields(log.Fields{ + "e621_username": e621User.Name, + "e621_user_id": e621User.ID, + "e621_current_db_favorite_count": currentDBFavCount, + "e621_user_favorite_count": e621User.FavoriteCount, + }).Debug("service: user has favorites deleted") + } + + var pageIndex = 1 + for currentDBFavCount < e621User.FavoriteCount { + + favorites, err := favoriteBuilder.Page(pageIndex).Execute() + if err != nil { + return err + } + + if len(favorites) <= 0 { + return nil + } + + for _, favorite := range favorites { + + if currentDBFavCount == e621User.FavoriteCount { + break + } + + isFaved, err := graphConnection.CheckUserToPostLink(ctx, favorite.ID, e621User.ID) + if err != nil { + return err + } + + if !isFaved { + err = uploadDataToDB(ctx, graphConnection, favorite, e621User) + if err != nil { + return err + } + currentDBFavCount++ + } + + } + pageIndex++ } log.WithFields(log.Fields{ "e621_username": e621User.Name, "e621_user_id": e621User.ID, - }).Info("service: start processing favorites") - start := time.Now() - - e621FavoritesBuilder := client.GetFavoritesBuilder().SetUserID(e621User.ID) - e621Favorites, err := client.GetAllFavoritesForUser(e621FavoritesBuilder) - - // Uploads all Tags, Posts as Nodes to Neo4j - for i, post := range e621Favorites { - if exists, err := graphConnection.CheckUserToPostLink(ctx, post.ID, e621User.ID); err == nil && exists { - log.WithFields(log.Fields{ - "e621_username": e621User.Name, - "e621_user_id": e621User.ID, - "last_post_id": post.ID, - }).Info("service: no new favorites found") - break - } else if err != nil { - return err - } - - start = time.Now() - err = uploadNodes(ctx, graphConnection, post) - if err != nil { - return err - } - log.WithFields(log.Fields{ - "e621_username": e621User.Name, - "e621_user_id": e621User.ID, - "post_number": i, - "post_amount": len(e621Favorites), - "post_id": post.ID, - "upload_time": time.Since(start), - }).Debug("service: uploading post") - - start := time.Now() - err = uploadPostToUserRelationship(ctx, graphConnection, post, e621User) - if err != nil { - log.Fatal(err) - return err - } - - err = uploadSourceTagRelationship(ctx, graphConnection, post) - if err != nil { - log.Fatal(err) - return err - } - - err = uploadGeneralTagRelationship(ctx, graphConnection, post) - if err != nil { - log.Fatal(err) - return err - } - - err = uploadCharacterTagtRelationship(ctx, graphConnection, post) - if err != nil { - log.Fatal(err) - return err - } - - err = uploadCopyrightTagRelationship(ctx, graphConnection, post) - if err != nil { - log.Fatal(err) - return err - } - - err = uploadArtistTagRelationship(ctx, graphConnection, post) - if err != nil { - log.Fatal(err) - return err - } - log.WithFields(log.Fields{ - "e621_username": e621User.Name, - "e621_user_id": e621User.ID, - "post_number": i, - "post_amount": len(e621Favorites), - "post_id": post.ID, - "upload_time": time.Since(start), - }).Debug("service: making relationship") - } - log.WithFields(log.Fields{ - "e621_username": e621User.Name, - "e621_user_id": e621User.ID, - "post_amount": len(e621Favorites), - "scrape_time": time.Since(start), + "post_amount": e621User.FavoriteCount, + "scrape_time": time.Since(scrapeTime), }).Info("service: finished processing favorites") return nil } +func uploadDataToDB(ctx context.Context, graphConnection logic.GraphConnection, favorite model.Post, e621User model.User) error { + start := time.Now() + err := uploadNodes(ctx, graphConnection, favorite) + if err != nil { + return err + } + log.WithFields(log.Fields{ + "e621_username": e621User.Name, + "e621_user_id": e621User.ID, + "post_id": favorite.ID, + "upload_time": time.Since(start), + }).Debug("service: uploaded post") + + start = time.Now() + err = uploadPostToUserRelationship(ctx, graphConnection, favorite, e621User) + if err != nil { + log.Fatal(err) + return err + } + + err = uploadSourceTagRelationship(ctx, graphConnection, favorite) + if err != nil { + log.Fatal(err) + return err + } + + err = uploadGeneralTagRelationship(ctx, graphConnection, favorite) + if err != nil { + log.Fatal(err) + return err + } + + err = uploadCharacterTagtRelationship(ctx, graphConnection, favorite) + if err != nil { + log.Fatal(err) + return err + } + + err = uploadCopyrightTagRelationship(ctx, graphConnection, favorite) + if err != nil { + log.Fatal(err) + return err + } + + err = uploadArtistTagRelationship(ctx, graphConnection, favorite) + if err != nil { + log.Fatal(err) + return err + } + log.WithFields(log.Fields{ + "e621_username": e621User.Name, + "e621_user_id": e621User.ID, + "post_id": favorite.ID, + "upload_time": time.Since(start), + }).Debug("service: made relationship") + return nil +} + // uploadNodes uploads the post to the database and creates the nodes func uploadNodes(ctx context.Context, graphConnection logic.GraphConnection, post model.Post) error { diff --git a/pkg/logic/database.go b/pkg/logic/database.go index 09e384c..a035378 100644 --- a/pkg/logic/database.go +++ b/pkg/logic/database.go @@ -15,4 +15,5 @@ type GraphConnection interface { EstablishPostToSourceLink(ctx context.Context, e621PostID model.PostID, sourceURL string) error EstablishUserToPostLink(ctx context.Context, e621PostID model.PostID, e621UserID model.UserID) error CheckUserToPostLink(ctx context.Context, e621PostID model.PostID, e621UserID model.UserID) (bool, error) + GetUserFavoriteCount(ctx context.Context, userID model.UserID) (int64, error) }